Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_inode.c | 5
-rw-r--r--  fs/Kconfig | 6
-rw-r--r--  fs/Kconfig.binfmt | 4
-rw-r--r--  fs/Makefile | 5
-rw-r--r--  fs/adfs/Kconfig | 1
-rw-r--r--  fs/adfs/super.c | 8
-rw-r--r--  fs/affs/file.c | 4
-rw-r--r--  fs/affs/inode.c | 2
-rw-r--r--  fs/affs/super.c | 18
-rw-r--r--  fs/afs/dir.c | 2
-rw-r--r--  fs/afs/flock.c | 5
-rw-r--r--  fs/afs/mntpt.c | 1
-rw-r--r--  fs/afs/super.c | 5
-rw-r--r--  fs/afs/write.c | 19
-rw-r--r--  fs/aio.c | 24
-rw-r--r--  fs/anon_inodes.c | 6
-rw-r--r--  fs/autofs/Kconfig | 1
-rw-r--r--  fs/autofs/root.c | 2
-rw-r--r--  fs/autofs4/dev-ioctl.c | 1
-rw-r--r--  fs/autofs4/inode.c | 1
-rw-r--r--  fs/autofs4/root.c | 14
-rw-r--r--  fs/bfs/dir.c | 2
-rw-r--r--  fs/bfs/inode.c | 5
-rw-r--r--  fs/binfmt_aout.c | 4
-rw-r--r--  fs/binfmt_elf.c | 2
-rw-r--r--  fs/binfmt_misc.c | 4
-rw-r--r--  fs/block_dev.c | 36
-rw-r--r--  fs/btrfs/disk-io.c | 19
-rw-r--r--  fs/btrfs/extent-tree.c | 3
-rw-r--r--  fs/btrfs/inode.c | 4
-rw-r--r--  fs/btrfs/super.c | 1
-rw-r--r--  fs/btrfs/volumes.c | 4
-rw-r--r--  fs/btrfs/volumes.h | 1
-rw-r--r--  fs/buffer.c | 36
-rw-r--r--  fs/cachefiles/daemon.c | 1
-rw-r--r--  fs/ceph/Kconfig | 15
-rw-r--r--  fs/ceph/Makefile | 11
-rw-r--r--  fs/ceph/README | 20
-rw-r--r--  fs/ceph/addr.c | 81
-rw-r--r--  fs/ceph/armor.c | 103
-rw-r--r--  fs/ceph/auth.c | 259
-rw-r--r--  fs/ceph/auth.h | 92
-rw-r--r--  fs/ceph/auth_none.c | 131
-rw-r--r--  fs/ceph/auth_none.h | 30
-rw-r--r--  fs/ceph/auth_x.c | 687
-rw-r--r--  fs/ceph/auth_x.h | 49
-rw-r--r--  fs/ceph/auth_x_protocol.h | 90
-rw-r--r--  fs/ceph/buffer.c | 65
-rw-r--r--  fs/ceph/buffer.h | 39
-rw-r--r--  fs/ceph/caps.c | 108
-rw-r--r--  fs/ceph/ceph_debug.h | 37
-rw-r--r--  fs/ceph/ceph_frag.c | 3
-rw-r--r--  fs/ceph/ceph_frag.h | 109
-rw-r--r--  fs/ceph/ceph_fs.c | 72
-rw-r--r--  fs/ceph/ceph_fs.h | 728
-rw-r--r--  fs/ceph/ceph_hash.c | 118
-rw-r--r--  fs/ceph/ceph_hash.h | 13
-rw-r--r--  fs/ceph/crush/crush.c | 151
-rw-r--r--  fs/ceph/crush/crush.h | 180
-rw-r--r--  fs/ceph/crush/hash.c | 149
-rw-r--r--  fs/ceph/crush/hash.h | 17
-rw-r--r--  fs/ceph/crush/mapper.c | 609
-rw-r--r--  fs/ceph/crush/mapper.h | 20
-rw-r--r--  fs/ceph/crypto.c | 412
-rw-r--r--  fs/ceph/crypto.h | 48
-rw-r--r--  fs/ceph/debugfs.c | 406
-rw-r--r--  fs/ceph/decode.h | 196
-rw-r--r--  fs/ceph/dir.c | 107
-rw-r--r--  fs/ceph/export.c | 26
-rw-r--r--  fs/ceph/file.c | 209
-rw-r--r--  fs/ceph/inode.c | 30
-rw-r--r--  fs/ceph/ioctl.c | 77
-rw-r--r--  fs/ceph/ioctl.h | 4
-rw-r--r--  fs/ceph/locks.c | 23
-rw-r--r--  fs/ceph/mds_client.c | 129
-rw-r--r--  fs/ceph/mds_client.h | 20
-rw-r--r--  fs/ceph/mdsmap.c | 11
-rw-r--r--  fs/ceph/mdsmap.h | 62
-rw-r--r--  fs/ceph/messenger.c | 2277
-rw-r--r--  fs/ceph/messenger.h | 253
-rw-r--r--  fs/ceph/mon_client.c | 1018
-rw-r--r--  fs/ceph/mon_client.h | 121
-rw-r--r--  fs/ceph/msgpool.c | 64
-rw-r--r--  fs/ceph/msgpool.h | 25
-rw-r--r--  fs/ceph/msgr.h | 175
-rw-r--r--  fs/ceph/osd_client.c | 1539
-rw-r--r--  fs/ceph/osd_client.h | 167
-rw-r--r--  fs/ceph/osdmap.c | 1110
-rw-r--r--  fs/ceph/osdmap.h | 128
-rw-r--r--  fs/ceph/pagelist.c | 55
-rw-r--r--  fs/ceph/pagelist.h | 54
-rw-r--r--  fs/ceph/rados.h | 405
-rw-r--r--  fs/ceph/snap.c | 102
-rw-r--r--  fs/ceph/strings.c (renamed from fs/ceph/ceph_strings.c) | 82
-rw-r--r--  fs/ceph/super.c | 1154
-rw-r--r--  fs/ceph/super.h | 405
-rw-r--r--  fs/ceph/types.h | 29
-rw-r--r--  fs/ceph/xattr.c | 18
-rw-r--r--  fs/char_dev.c | 5
-rw-r--r--  fs/cifs/README | 5
-rw-r--r--  fs/cifs/cifs_debug.c | 12
-rw-r--r--  fs/cifs/cifs_debug.h | 2
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 24
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 13
-rw-r--r--  fs/cifs/cifsacl.c | 46
-rw-r--r--  fs/cifs/cifsencrypt.c | 214
-rw-r--r--  fs/cifs/cifsfs.c | 105
-rw-r--r--  fs/cifs/cifsfs.h | 10
-rw-r--r--  fs/cifs/cifsglob.h | 91
-rw-r--r--  fs/cifs/cifspdu.h | 1
-rw-r--r--  fs/cifs/cifsproto.h | 22
-rw-r--r--  fs/cifs/cifssmb.c | 79
-rw-r--r--  fs/cifs/cn_cifs.h | 37
-rw-r--r--  fs/cifs/connect.c | 534
-rw-r--r--  fs/cifs/dir.c | 212
-rw-r--r--  fs/cifs/file.c | 783
-rw-r--r--  fs/cifs/fscache.c | 13
-rw-r--r--  fs/cifs/inode.c | 239
-rw-r--r--  fs/cifs/ioctl.c | 17
-rw-r--r--  fs/cifs/link.c | 372
-rw-r--r--  fs/cifs/misc.c | 32
-rw-r--r--  fs/cifs/ntlmssp.h | 15
-rw-r--r--  fs/cifs/readdir.c | 79
-rw-r--r--  fs/cifs/sess.c | 167
-rw-r--r--  fs/cifs/transport.c | 6
-rw-r--r--  fs/cifs/xattr.c | 60
-rw-r--r--  fs/coda/cache.c | 17
-rw-r--r--  fs/coda/cnode.c | 19
-rw-r--r--  fs/coda/dir.c | 157
-rw-r--r--  fs/coda/file.c | 31
-rw-r--r--  fs/coda/inode.c | 57
-rw-r--r--  fs/coda/pioctl.c | 23
-rw-r--r--  fs/coda/psdev.c | 42
-rw-r--r--  fs/coda/symlink.c | 3
-rw-r--r--  fs/coda/upcall.c | 89
-rw-r--r--  fs/compat.c | 4
-rw-r--r--  fs/compat_ioctl.c | 70
-rw-r--r--  fs/configfs/inode.c | 1
-rw-r--r--  fs/dcache.c | 277
-rw-r--r--  fs/debugfs/file.c | 3
-rw-r--r--  fs/debugfs/inode.c | 1
-rw-r--r--  fs/direct-io.c | 2
-rw-r--r--  fs/dlm/debug_fs.c | 3
-rw-r--r--  fs/dlm/lock.c | 3
-rw-r--r--  fs/dlm/plock.c | 3
-rw-r--r--  fs/dlm/user.c | 3
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 2
-rw-r--r--  fs/ecryptfs/file.c | 4
-rw-r--r--  fs/ecryptfs/miscdev.c | 1
-rw-r--r--  fs/eventfd.c | 1
-rw-r--r--  fs/eventpoll.c | 38
-rw-r--r--  fs/exec.c | 213
-rw-r--r--  fs/exofs/dir.c | 4
-rw-r--r--  fs/exofs/file.c | 6
-rw-r--r--  fs/exofs/inode.c | 84
-rw-r--r--  fs/exofs/ios.c | 10
-rw-r--r--  fs/exofs/namei.c | 2
-rw-r--r--  fs/exportfs/expfs.c | 17
-rw-r--r--  fs/ext2/dir.c | 2
-rw-r--r--  fs/ext2/ext2.h | 1
-rw-r--r--  fs/ext2/inode.c | 15
-rw-r--r--  fs/ext2/namei.c | 2
-rw-r--r--  fs/ext2/super.c | 10
-rw-r--r--  fs/ext2/xattr.c | 2
-rw-r--r--  fs/ext3/fsync.c | 3
-rw-r--r--  fs/ext3/inode.c | 4
-rw-r--r--  fs/ext3/namei.c | 2
-rw-r--r--  fs/ext3/super.c | 17
-rw-r--r--  fs/ext4/fsync.c | 5
-rw-r--r--  fs/ext4/inode.c | 11
-rw-r--r--  fs/ext4/mballoc.c | 3
-rw-r--r--  fs/ext4/namei.c | 2
-rw-r--r--  fs/ext4/super.c | 19
-rw-r--r--  fs/fat/fatent.c | 3
-rw-r--r--  fs/fat/inode.c | 5
-rw-r--r--  fs/fat/misc.c | 5
-rw-r--r--  fs/fat/namei_msdos.c | 6
-rw-r--r--  fs/fat/namei_vfat.c | 6
-rw-r--r--  fs/fcntl.c | 62
-rw-r--r--  fs/fifo.c | 1
-rw-r--r--  fs/file_table.c | 17
-rw-r--r--  fs/freevxfs/vxfs_inode.c | 1
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 14
-rw-r--r--  fs/freevxfs/vxfs_super.c | 7
-rw-r--r--  fs/fs-writeback.c | 100
-rw-r--r--  fs/fuse/control.c | 5
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/fuse/dev.c | 21
-rw-r--r--  fs/gfs2/Kconfig | 2
-rw-r--r--  fs/gfs2/aops.c | 27
-rw-r--r--  fs/gfs2/bmap.c | 255
-rw-r--r--  fs/gfs2/bmap.h | 20
-rw-r--r--  fs/gfs2/dentry.c | 2
-rw-r--r--  fs/gfs2/dir.c | 31
-rw-r--r--  fs/gfs2/dir.h | 34
-rw-r--r--  fs/gfs2/export.c | 9
-rw-r--r--  fs/gfs2/file.c | 10
-rw-r--r--  fs/gfs2/glock.c | 23
-rw-r--r--  fs/gfs2/glock.h | 2
-rw-r--r--  fs/gfs2/glops.c | 6
-rw-r--r--  fs/gfs2/incore.h | 8
-rw-r--r--  fs/gfs2/inode.c | 9
-rw-r--r--  fs/gfs2/inode.h | 15
-rw-r--r--  fs/gfs2/lock_dlm.c | 4
-rw-r--r--  fs/gfs2/log.c | 19
-rw-r--r--  fs/gfs2/main.c | 6
-rw-r--r--  fs/gfs2/meta_io.c | 2
-rw-r--r--  fs/gfs2/ops_fstype.c | 80
-rw-r--r--  fs/gfs2/ops_inode.c | 328
-rw-r--r--  fs/gfs2/quota.c | 16
-rw-r--r--  fs/gfs2/recovery.c | 15
-rw-r--r--  fs/gfs2/rgrp.c | 56
-rw-r--r--  fs/gfs2/rgrp.h | 8
-rw-r--r--  fs/gfs2/super.c | 27
-rw-r--r--  fs/gfs2/sys.c | 22
-rw-r--r--  fs/gfs2/trace_gfs2.h | 3
-rw-r--r--  fs/gfs2/trans.h | 9
-rw-r--r--  fs/gfs2/xattr.c | 2
-rw-r--r--  fs/hfs/bfind.c | 4
-rw-r--r--  fs/hfs/btree.c | 2
-rw-r--r--  fs/hfs/btree.h | 2
-rw-r--r--  fs/hfs/hfs_fs.h | 13
-rw-r--r--  fs/hfs/inode.c | 2
-rw-r--r--  fs/hfs/mdb.c | 4
-rw-r--r--  fs/hfs/super.c | 7
-rw-r--r--  fs/hfsplus/bfind.c | 17
-rw-r--r--  fs/hfsplus/bitmap.c | 20
-rw-r--r--  fs/hfsplus/brec.c | 29
-rw-r--r--  fs/hfsplus/btree.c | 67
-rw-r--r--  fs/hfsplus/catalog.c | 50
-rw-r--r--  fs/hfsplus/dir.c | 203
-rw-r--r--  fs/hfsplus/extents.c | 223
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 85
-rw-r--r--  fs/hfsplus/hfsplus_raw.h | 3
-rw-r--r--  fs/hfsplus/inode.c | 185
-rw-r--r--  fs/hfsplus/ioctl.c | 153
-rw-r--r--  fs/hfsplus/options.c | 10
-rw-r--r--  fs/hfsplus/part_tbl.c | 5
-rw-r--r--  fs/hfsplus/super.c | 310
-rw-r--r--  fs/hfsplus/unicode.c | 16
-rw-r--r--  fs/hfsplus/wrapper.c | 40
-rw-r--r--  fs/hostfs/hostfs.h | 10
-rw-r--r--  fs/hostfs/hostfs_kern.c | 2
-rw-r--r--  fs/hostfs/hostfs_user.c | 14
-rw-r--r--  fs/hpfs/Kconfig | 1
-rw-r--r--  fs/hpfs/super.c | 8
-rw-r--r--  fs/hppfs/hppfs.c | 1
-rw-r--r--  fs/hugetlbfs/inode.c | 17
-rw-r--r--  fs/inode.c | 527
-rw-r--r--  fs/internal.h | 7
-rw-r--r--  fs/isofs/dir.c | 6
-rw-r--r--  fs/isofs/inode.c | 66
-rw-r--r--  fs/isofs/isofs.h | 1
-rw-r--r--  fs/isofs/namei.c | 8
-rw-r--r--  fs/isofs/rock.c | 10
-rw-r--r--  fs/jbd/commit.c | 32
-rw-r--r--  fs/jbd2/checkpoint.c | 3
-rw-r--r--  fs/jbd2/commit.c | 76
-rw-r--r--  fs/jbd2/journal.c | 4
-rw-r--r--  fs/jffs2/dir.c | 4
-rw-r--r--  fs/jffs2/fs.c | 4
-rw-r--r--  fs/jffs2/super.c | 9
-rw-r--r--  fs/jfs/jfs_imap.c | 2
-rw-r--r--  fs/jfs/jfs_logmgr.c | 6
-rw-r--r--  fs/jfs/jfs_mount.c | 4
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 2
-rw-r--r--  fs/jfs/namei.c | 2
-rw-r--r--  fs/jfs/super.c | 23
-rw-r--r--  fs/libfs.c | 37
-rw-r--r--  fs/lockd/clntlock.c | 15
-rw-r--r--  fs/lockd/clntproc.c | 13
-rw-r--r--  fs/lockd/host.c | 1
-rw-r--r--  fs/lockd/mon.c | 1
-rw-r--r--  fs/lockd/svc.c | 13
-rw-r--r--  fs/lockd/svc4proc.c | 2
-rw-r--r--  fs/lockd/svclock.c | 37
-rw-r--r--  fs/lockd/svcproc.c | 2
-rw-r--r--  fs/lockd/svcsubs.c | 9
-rw-r--r--  fs/locks.c | 182
-rw-r--r--  fs/logfs/dir.c | 3
-rw-r--r--  fs/minix/namei.c | 2
-rw-r--r--  fs/namei.c | 16
-rw-r--r--  fs/namespace.c | 4
-rw-r--r--  fs/ncpfs/dir.c | 221
-rw-r--r--  fs/ncpfs/file.c | 25
-rw-r--r--  fs/ncpfs/inode.c | 55
-rw-r--r--  fs/ncpfs/ioctl.c | 470
-rw-r--r--  fs/ncpfs/ncplib_kernel.c | 101
-rw-r--r--  fs/ncpfs/ncplib_kernel.h | 15
-rw-r--r--  fs/ncpfs/ncpsign_kernel.c | 10
-rw-r--r--  fs/ncpfs/sock.c | 1
-rw-r--r--  fs/nfs/Kconfig | 19
-rw-r--r--  fs/nfs/Makefile | 4
-rw-r--r--  fs/nfs/callback.c | 4
-rw-r--r--  fs/nfs/callback_proc.c | 8
-rw-r--r--  fs/nfs/client.c | 28
-rw-r--r--  fs/nfs/delegation.c | 10
-rw-r--r--  fs/nfs/dir.c | 1015
-rw-r--r--  fs/nfs/dns_resolve.c | 6
-rw-r--r--  fs/nfs/file.c | 86
-rw-r--r--  fs/nfs/getroot.c | 3
-rw-r--r--  fs/nfs/idmap.c | 211
-rw-r--r--  fs/nfs/inode.c | 39
-rw-r--r--  fs/nfs/internal.h | 12
-rw-r--r--  fs/nfs/mount_clnt.c | 4
-rw-r--r--  fs/nfs/nfs2xdr.c | 107
-rw-r--r--  fs/nfs/nfs3proc.c | 62
-rw-r--r--  fs/nfs/nfs3xdr.c | 196
-rw-r--r--  fs/nfs/nfs4_fs.h | 4
-rw-r--r--  fs/nfs/nfs4filelayout.c | 280
-rw-r--r--  fs/nfs/nfs4filelayout.h | 94
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 448
-rw-r--r--  fs/nfs/nfs4proc.c | 497
-rw-r--r--  fs/nfs/nfs4state.c | 52
-rw-r--r--  fs/nfs/nfs4xdr.c | 700
-rw-r--r--  fs/nfs/nfsroot.c | 568
-rw-r--r--  fs/nfs/pnfs.c | 783
-rw-r--r--  fs/nfs/pnfs.h | 189
-rw-r--r--  fs/nfs/proc.c | 35
-rw-r--r--  fs/nfs/read.c | 4
-rw-r--r--  fs/nfs/super.c | 72
-rw-r--r--  fs/nfs/sysctl.c | 2
-rw-r--r--  fs/nfs/unlink.c | 259
-rw-r--r--  fs/nfs/write.c | 22
-rw-r--r--  fs/nfsd/Kconfig | 12
-rw-r--r--  fs/nfsd/export.c | 73
-rw-r--r--  fs/nfsd/nfs4callback.c | 245
-rw-r--r--  fs/nfsd/nfs4idmap.c | 105
-rw-r--r--  fs/nfsd/nfs4proc.c | 7
-rw-r--r--  fs/nfsd/nfs4state.c | 525
-rw-r--r--  fs/nfsd/nfs4xdr.c | 18
-rw-r--r--  fs/nfsd/nfsctl.c | 27
-rw-r--r--  fs/nfsd/nfsd.h | 2
-rw-r--r--  fs/nfsd/nfsfh.h | 2
-rw-r--r--  fs/nfsd/nfssvc.c | 5
-rw-r--r--  fs/nfsd/state.h | 52
-rw-r--r--  fs/nfsd/vfs.c | 16
-rw-r--r--  fs/nilfs2/Makefile | 2
-rw-r--r--  fs/nilfs2/bmap.c | 22
-rw-r--r--  fs/nilfs2/bmap.h | 10
-rw-r--r--  fs/nilfs2/btnode.c | 17
-rw-r--r--  fs/nilfs2/cpfile.c | 72
-rw-r--r--  fs/nilfs2/cpfile.h | 4
-rw-r--r--  fs/nilfs2/dat.c | 92
-rw-r--r--  fs/nilfs2/dat.h | 4
-rw-r--r--  fs/nilfs2/export.h | 17
-rw-r--r--  fs/nilfs2/gcdat.c | 87
-rw-r--r--  fs/nilfs2/gcinode.c | 134
-rw-r--r--  fs/nilfs2/ifile.c | 51
-rw-r--r--  fs/nilfs2/ifile.h | 4
-rw-r--r--  fs/nilfs2/inode.c | 167
-rw-r--r--  fs/nilfs2/ioctl.c | 24
-rw-r--r--  fs/nilfs2/mdt.c | 313
-rw-r--r--  fs/nilfs2/mdt.h | 32
-rw-r--r--  fs/nilfs2/namei.c | 141
-rw-r--r--  fs/nilfs2/nilfs.h | 38
-rw-r--r--  fs/nilfs2/page.c | 55
-rw-r--r--  fs/nilfs2/page.h | 6
-rw-r--r--  fs/nilfs2/recovery.c | 19
-rw-r--r--  fs/nilfs2/sb.h | 10
-rw-r--r--  fs/nilfs2/segbuf.c | 3
-rw-r--r--  fs/nilfs2/segment.c | 104
-rw-r--r--  fs/nilfs2/segment.h | 10
-rw-r--r--  fs/nilfs2/sufile.c | 77
-rw-r--r--  fs/nilfs2/sufile.h | 6
-rw-r--r--  fs/nilfs2/super.c | 629
-rw-r--r--  fs/nilfs2/the_nilfs.c | 346
-rw-r--r--  fs/nilfs2/the_nilfs.h | 101
-rw-r--r--  fs/no-block.c | 1
-rw-r--r--  fs/notify/Kconfig | 2
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 1
-rw-r--r--  fs/notify/fsnotify.c | 33
-rw-r--r--  fs/notify/inode_mark.c | 2
-rw-r--r--  fs/notify/inotify/inotify_user.c | 1
-rw-r--r--  fs/ntfs/super.c | 43
-rw-r--r--  fs/ocfs2/acl.c | 3
-rw-r--r--  fs/ocfs2/aops.c | 28
-rw-r--r--  fs/ocfs2/aops.h | 6
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 532
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 4
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 3
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 5
-rw-r--r--  fs/ocfs2/cluster/ocfs2_nodemanager.h | 6
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 7
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 2
-rw-r--r--  fs/ocfs2/dcache.c | 33
-rw-r--r--  fs/ocfs2/dcache.h | 1
-rw-r--r--  fs/ocfs2/dir.c | 24
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 30
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 21
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 401
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 40
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 3
-rw-r--r--  fs/ocfs2/dlmglue.c | 8
-rw-r--r--  fs/ocfs2/dlmglue.h | 1
-rw-r--r--  fs/ocfs2/file.c | 85
-rw-r--r--  fs/ocfs2/inode.c | 1
-rw-r--r--  fs/ocfs2/inode.h | 12
-rw-r--r--  fs/ocfs2/ioctl.c | 356
-rw-r--r--  fs/ocfs2/journal.c | 9
-rw-r--r--  fs/ocfs2/journal.h | 3
-rw-r--r--  fs/ocfs2/mmap.c | 7
-rw-r--r--  fs/ocfs2/namei.c | 5
-rw-r--r--  fs/ocfs2/ocfs2.h | 63
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 83
-rw-r--r--  fs/ocfs2/ocfs2_ioctl.h | 103
-rw-r--r--  fs/ocfs2/refcounttree.c | 48
-rw-r--r--  fs/ocfs2/refcounttree.h | 7
-rw-r--r--  fs/ocfs2/reservations.c | 22
-rw-r--r--  fs/ocfs2/slot_map.c | 2
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 2
-rw-r--r--  fs/ocfs2/stack_user.c | 4
-rw-r--r--  fs/ocfs2/suballoc.c | 20
-rw-r--r--  fs/ocfs2/super.c | 170
-rw-r--r--  fs/ocfs2/symlink.c | 2
-rw-r--r--  fs/ocfs2/sysfile.c | 60
-rw-r--r--  fs/ocfs2/xattr.c | 6
-rw-r--r--  fs/partitions/check.c | 42
-rw-r--r--  fs/partitions/check.h | 3
-rw-r--r--  fs/partitions/efi.c | 25
-rw-r--r--  fs/partitions/ldm.c | 2
-rw-r--r--  fs/partitions/ldm.h | 2
-rw-r--r--  fs/pipe.c | 4
-rw-r--r--  fs/proc/Kconfig | 4
-rw-r--r--  fs/proc/base.c | 123
-rw-r--r--  fs/proc/proc_sysctl.c | 3
-rw-r--r--  fs/proc/root.c | 1
-rw-r--r--  fs/proc/softirqs.c | 4
-rw-r--r--  fs/proc/stat.c | 14
-rw-r--r--  fs/proc/task_mmu.c | 11
-rw-r--r--  fs/proc/vmcore.c | 2
-rw-r--r--  fs/qnx4/dir.c | 4
-rw-r--r--  fs/qnx4/inode.c | 6
-rw-r--r--  fs/qnx4/namei.c | 4
-rw-r--r--  fs/ramfs/inode.c | 1
-rw-r--r--  fs/read_write.c | 33
-rw-r--r--  fs/reiserfs/Kconfig | 6
-rw-r--r--  fs/reiserfs/README | 2
-rw-r--r--  fs/reiserfs/file.c | 3
-rw-r--r--  fs/reiserfs/inode.c | 26
-rw-r--r--  fs/reiserfs/ioctl.c | 13
-rw-r--r--  fs/reiserfs/journal.c | 106
-rw-r--r--  fs/reiserfs/namei.c | 2
-rw-r--r--  fs/reiserfs/xattr.c | 7
-rw-r--r--  fs/romfs/super.c | 1
-rw-r--r--  fs/select.c | 6
-rw-r--r--  fs/seq_file.c | 8
-rw-r--r--  fs/signalfd.c | 11
-rw-r--r--  fs/smbfs/Kconfig | 1
-rw-r--r--  fs/smbfs/dir.c | 16
-rw-r--r--  fs/smbfs/inode.c | 6
-rw-r--r--  fs/smbfs/proc.c | 10
-rw-r--r--  fs/squashfs/dir.c | 3
-rw-r--r--  fs/squashfs/super.c | 5
-rw-r--r--  fs/super.c | 8
-rw-r--r--  fs/sysfs/bin.c | 68
-rw-r--r--  fs/sysfs/group.c | 59
-rw-r--r--  fs/sysv/namei.c | 2
-rw-r--r--  fs/timerfd.c | 1
-rw-r--r--  fs/ubifs/commit.c | 4
-rw-r--r--  fs/ubifs/debug.c | 157
-rw-r--r--  fs/ubifs/debug.h | 4
-rw-r--r--  fs/ubifs/dir.c | 2
-rw-r--r--  fs/ubifs/file.c | 7
-rw-r--r--  fs/ubifs/gc.c | 82
-rw-r--r--  fs/ubifs/io.c | 20
-rw-r--r--  fs/ubifs/journal.c | 3
-rw-r--r--  fs/ubifs/key.h | 14
-rw-r--r--  fs/ubifs/log.c | 6
-rw-r--r--  fs/ubifs/lpt.c | 7
-rw-r--r--  fs/ubifs/lpt_commit.c | 3
-rw-r--r--  fs/ubifs/master.c | 3
-rw-r--r--  fs/ubifs/misc.h | 9
-rw-r--r--  fs/ubifs/recovery.c | 11
-rw-r--r--  fs/ubifs/replay.c | 20
-rw-r--r--  fs/ubifs/sb.c | 9
-rw-r--r--  fs/ubifs/scan.c | 6
-rw-r--r--  fs/ubifs/shrinker.c | 2
-rw-r--r--  fs/ubifs/super.c | 80
-rw-r--r--  fs/ubifs/tnc.c | 5
-rw-r--r--  fs/ubifs/ubifs.h | 23
-rw-r--r--  fs/udf/Kconfig | 1
-rw-r--r--  fs/udf/namei.c | 2
-rw-r--r--  fs/udf/super.c | 8
-rw-r--r--  fs/ufs/Kconfig | 1
-rw-r--r--  fs/ufs/namei.c | 2
-rw-r--r--  fs/ufs/super.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 3
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 238
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 81
-rw-r--r--  fs/xfs/linux-2.6/xfs_cred.h | 28
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c | 31
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.h | 23
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 19
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.h | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 45
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 29
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 432
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_version.h | 29
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 164
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 221
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 16
-rw-r--r--  fs/xfs/xfs_ag.h | 9
-rw-r--r--  fs/xfs/xfs_alloc.c | 4
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 33
-rw-r--r--  fs/xfs/xfs_attr.c | 37
-rw-r--r--  fs/xfs/xfs_bmap.c | 44
-rw-r--r--  fs/xfs/xfs_bmap.h | 9
-rw-r--r--  fs/xfs/xfs_btree.c | 56
-rw-r--r--  fs/xfs/xfs_btree.h | 14
-rw-r--r--  fs/xfs/xfs_buf_item.c | 7
-rw-r--r--  fs/xfs/xfs_da_btree.c | 2
-rw-r--r--  fs/xfs/xfs_dinode.h | 5
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 2
-rw-r--r--  fs/xfs/xfs_fs.h | 7
-rw-r--r--  fs/xfs/xfs_fsops.c | 14
-rw-r--r--  fs/xfs/xfs_ialloc.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c | 33
-rw-r--r--  fs/xfs/xfs_iget.c | 4
-rw-r--r--  fs/xfs/xfs_inode.c | 17
-rw-r--r--  fs/xfs/xfs_inode.h | 32
-rw-r--r--  fs/xfs/xfs_inode_item.c | 9
-rw-r--r--  fs/xfs/xfs_itable.c | 3
-rw-r--r--  fs/xfs/xfs_log.c | 18
-rw-r--r--  fs/xfs/xfs_log_cil.c | 244
-rw-r--r--  fs/xfs/xfs_log_priv.h | 37
-rw-r--r--  fs/xfs/xfs_log_recover.c | 25
-rw-r--r--  fs/xfs/xfs_mount.c | 308
-rw-r--r--  fs/xfs/xfs_mount.h | 9
-rw-r--r--  fs/xfs/xfs_refcache.h | 52
-rw-r--r--  fs/xfs/xfs_rename.c | 14
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 29
-rw-r--r--  fs/xfs/xfs_sb.h | 10
-rw-r--r--  fs/xfs/xfs_trans.c | 91
-rw-r--r--  fs/xfs/xfs_trans.h | 3
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 2
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 30
-rw-r--r--  fs/xfs/xfs_types.h | 2
-rw-r--r--  fs/xfs/xfs_utils.c | 9
-rw-r--r--  fs/xfs/xfs_utils.h | 3
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 65
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 6
549 files changed, 17086 insertions, 23080 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9e670d527646..ef5905f7c8a3 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1789,9 +1789,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 		kfree(st);
 	} else {
 		/* Caching disabled. No need to get upto date stat info.
-		 * This dentry will be released immediately. So, just i_count++
+		 * This dentry will be released immediately. So, just hold the
+		 * inode
 		 */
-		atomic_inc(&old_dentry->d_inode->i_count);
+		ihold(old_dentry->d_inode);
 	}
 
 	dentry->d_op = old_dentry->d_op;
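
The hunk above shows the conversion that recurs throughout this merge: open-coded
atomic_inc(&inode->i_count) calls give way to the new ihold() helper. For reference, a
minimal sketch of what ihold() amounts to (the helper lands in fs/inode.c in this same
series; the exact WARN_ON condition is quoted from memory and may differ):

	void ihold(struct inode *inode)
	{
		/* the caller must already hold a reference */
		WARN_ON(atomic_inc_return(&inode->i_count) < 2);
	}

Beyond hiding the refcount representation, the WARN_ON captures the contract every
converted call site relies on: the inode is already referenced, so the count can never
legitimately step from zero to one here.
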
diff --git a/fs/Kconfig b/fs/Kconfig
index 3d185308ec88..97673c955484 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,6 +47,9 @@ source "fs/nilfs2/Kconfig"
 
 endif # BLOCK
 
+config EXPORTFS
+	tristate
+
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EMBEDDED
 	default y
@@ -221,9 +224,6 @@ config LOCKD_V4
 	depends on FILE_LOCKING
 	default y
 
-config EXPORTFS
-	tristate
-
 config NFS_ACL_SUPPORT
 	tristate
 	select FS_POSIX_ACL
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bb4cc5b8abc8..79e2ca7973b7 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC
 
 config CORE_DUMP_DEFAULT_ELF_HEADERS
 	bool "Write ELF core dumps with partial segments"
-	default n
+	default y
 	depends on BINFMT_ELF && ELF_CORE
 	help
 	  ELF core dump files describe each memory mapping of the crashed
@@ -60,7 +60,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS
 	  inherited. See Documentation/filesystems/proc.txt for details.
 
 	  This config option changes the default setting of coredump_filter
-	  seen at boot time. If unsure, say N.
+	  seen at boot time. If unsure, say Y.
 
 config BINFMT_FLAT
 	bool "Kernel support for flat binaries"
diff --git a/fs/Makefile b/fs/Makefile
index e6ec1d309b1d..26956fcec917 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -29,10 +29,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
 obj-$(CONFIG_AIO) += aio.o
 obj-$(CONFIG_FILE_LOCKING) += locks.o
 obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
-
-nfsd-$(CONFIG_NFSD) := nfsctl.o
-obj-y += $(nfsd-y) $(nfsd-m)
-
+obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o
 obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
 obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
 obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a74605..1dd5f34b3cf2 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,7 @@
 config ADFS_FS
 	tristate "ADFS file system support (EXPERIMENTAL)"
 	depends on BLOCK && EXPERIMENTAL
+	depends on BKL # need to fix
 	help
 	  The Acorn Disc Filing System is the standard file system of the
 	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4a3af7075c1d..d9803f73236f 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -352,11 +352,15 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct adfs_sb_info *asb;
 	struct inode *root;
 
+	lock_kernel();
+
 	sb->s_flags |= MS_NODIRATIME;
 
 	asb = kzalloc(sizeof(*asb), GFP_KERNEL);
-	if (!asb)
+	if (!asb) {
+		unlock_kernel();
 		return -ENOMEM;
+	}
 	sb->s_fs_info = asb;
 
 	/* set default options */
@@ -474,6 +478,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto error;
 	} else
 		sb->s_root->d_op = &adfs_dentry_operations;
+	unlock_kernel();
 	return 0;
 
 error_free_bh:
@@ -481,6 +486,7 @@ error_free_bh:
 error:
 	sb->s_fs_info = NULL;
 	kfree(asb);
+	unlock_kernel();
 	return -EINVAL;
 }
 
diff --git a/fs/affs/file.c b/fs/affs/file.c
index c4a9875bd1a6..0a90dcd46de2 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -894,9 +894,9 @@ affs_truncate(struct inode *inode)
 	if (AFFS_SB(sb)->s_flags & SF_OFS) {
 		struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
 		u32 tmp;
-		if (IS_ERR(ext_bh)) {
+		if (IS_ERR(bh)) {
 			affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)",
-				     ext, PTR_ERR(ext_bh));
+				     ext, PTR_ERR(bh));
 			return;
 		}
 		tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3a0fdec175ba..5d828903ac69 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -388,7 +388,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
 		affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
 		mark_buffer_dirty_inode(inode_bh, inode);
 		inode->i_nlink = 2;
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 	}
 	affs_fix_checksum(sb, bh);
 	mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 33c4e7eef470..fa4fbe1e238a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,7 +16,6 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include "affs.h"
 
@@ -46,8 +45,6 @@ affs_put_super(struct super_block *sb)
 	struct affs_sb_info *sbi = AFFS_SB(sb);
 	pr_debug("AFFS: put_super()\n");
 
-	lock_kernel();
-
 	if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt)
 		affs_commit_super(sb, 1, 1);
 
@@ -56,8 +53,6 @@ affs_put_super(struct super_block *sb)
 	affs_brelse(sbi->s_root_bh);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static void
@@ -109,8 +104,8 @@ static void init_once(void *foo)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
-	init_MUTEX(&ei->i_link_lock);
-	init_MUTEX(&ei->i_ext_lock);
+	sema_init(&ei->i_link_lock, 1);
+	sema_init(&ei->i_ext_lock, 1);
 	inode_init_once(&ei->vfs_inode);
 }
 
@@ -302,6 +297,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
 	sb->s_fs_info = sbi;
 	mutex_init(&sbi->s_bmlock);
 	spin_lock_init(&sbi->symlink_lock);
@@ -527,7 +523,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 		kfree(new_opts);
 		return -EINVAL;
 	}
-	lock_kernel();
+
 	replace_mount_options(sb, new_opts);
 
 	sbi->s_flags = mount_flags;
@@ -543,17 +539,15 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	memcpy(sbi->s_volume, volume, 32);
 	spin_unlock(&sbi->symlink_lock);
 
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
-		unlock_kernel();
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
-	}
+
 	if (*flags & MS_RDONLY) {
 		affs_write_super(sb);
 		affs_free_bitmap(sb);
 	} else
 		res = affs_init_bitmap(sb, flags);
 
-	unlock_kernel();
 	return res;
 }
 
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 0d38c09bd55e..5439e1bc9a86 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1045,7 +1045,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
 	if (ret < 0)
 		goto link_error;
 
-	atomic_inc(&vnode->vfs_inode.i_count);
+	ihold(&vnode->vfs_inode);
 	d_instantiate(dentry, &vnode->vfs_inode);
 	key_put(key);
 	_leave(" = 0");
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 0931bc1325eb..757d664575dd 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/smp_lock.h>
 #include "internal.h"
 
 #define AFS_LOCK_GRANTED	0
@@ -274,7 +273,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 
 	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 
-	lock_kernel();
+	lock_flocks();
 
 	/* make sure we've got a callback on this file and that our view of the
 	 * data version is up to date */
@@ -421,7 +420,7 @@ given_lock:
 	afs_vnode_fetch_status(vnode, NULL, key);
 
 error:
-	unlock_kernel();
+	unlock_flocks();
 	_leave(" = %d", ret);
 	return ret;
 
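
Note that in flock.c the BKL is not simply dropped, as in most other hunks, but
replaced by lock_flocks(). A hedged sketch of that helper (it is defined in fs/locks.c;
the spinlock name is quoted from memory):

	static DEFINE_SPINLOCK(file_lock_lock);

	void lock_flocks(void)
	{
		/* serializes manipulation of the file-lock lists */
		spin_lock(&file_lock_lock);
	}

	void unlock_flocks(void)
	{
		spin_unlock(&file_lock_lock);
	}

The indirection keeps a single global guard for file-lock state while the rest of the
code sheds the BKL.
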
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6d552686c498..6153417caf57 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -29,6 +29,7 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work);
 
 const struct file_operations afs_mntpt_file_operations = {
 	.open		= afs_mntpt_open,
+	.llseek		= noop_llseek,
 };
 
 const struct inode_operations afs_mntpt_inode_operations = {
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 77e1e5a61154..eacf76d98ae0 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -19,7 +19,6 @@
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/parser.h>
@@ -453,12 +452,8 @@ static void afs_put_super(struct super_block *sb)
 
 	_enter("");
 
-	lock_kernel();
-
 	afs_put_volume(as->volume);
 
-	unlock_kernel();
-
 	_leave("");
 }
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 722743b152d8..15690bb1d3b5 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -438,7 +438,6 @@ no_more:
  */
 int afs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = page->mapping->backing_dev_info;
 	struct afs_writeback *wb;
 	int ret;
 
@@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc)
 	}
 
 	wbc->nr_to_write -= ret;
-	if (wbc->nonblocking && bdi_write_congested(bdi))
-		wbc->encountered_congestion = 1;
 
 	_leave(" = 0");
 	return 0;
@@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping,
 				 struct writeback_control *wbc,
 				 pgoff_t index, pgoff_t end, pgoff_t *_next)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct afs_writeback *wb;
 	struct page *page;
 	int ret, n;
@@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping,
 
 		wbc->nr_to_write -= ret;
 
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
-			wbc->encountered_congestion = 1;
-			break;
-		}
-
 		cond_resched();
 	} while (index < end && wbc->nr_to_write > 0);
 
@@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping,
 int afs_writepages(struct address_space *mapping,
 		   struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	pgoff_t start, end, next;
 	int ret;
 
 	_enter("");
 
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		_leave(" = 0 [congest]");
-		return 0;
-	}
-
 	if (wbc->range_cyclic) {
 		start = mapping->writeback_index;
 		end = -1;
 		ret = afs_writepages_region(mapping, wbc, start, end, &next);
-		if (start > 0 && wbc->nr_to_write > 0 && ret == 0 &&
-		    !(wbc->nonblocking && wbc->encountered_congestion))
+		if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
 			ret = afs_writepages_region(mapping, wbc, 0, start,
 						    &next);
 		mapping->writeback_index = next;
diff --git a/fs/aio.c b/fs/aio.c
index 1320b2a05fb2..8c8f6c5b6d79 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -712,8 +712,16 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
 	 */
 	ret = retry(iocb);
 
-	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED)
+	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
+		/*
+		 * There's no easy way to restart the syscall since other AIO's
+		 * may be already running. Just fail this IO with EINTR.
+		 */
+		if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+			     ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
+			ret = -EINTR;
 		aio_complete(iocb, ret, 0);
+	}
 out:
 	spin_lock_irq(&ctx->ctx_lock);
 
@@ -1535,7 +1543,19 @@ static void aio_batch_add(struct address_space *mapping,
 	}
 
 	abe = mempool_alloc(abe_pool, GFP_KERNEL);
-	BUG_ON(!igrab(mapping->host));
+
+	/*
+	 * we should be using igrab here, but
+	 * we don't want to hammer on the global
+	 * inode spinlock just to take an extra
+	 * reference on a file that we must already
+	 * have a reference to.
+	 *
+	 * When we're called, we always have a reference
+	 * on the file, so we must always have a reference
+	 * on the inode, so ihold() is safe here.
+	 */
+	ihold(mapping->host);
 	abe->mapping = mapping;
 	hlist_add_head(&abe->list, &batch_hash[bucket]);
 	return;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda83..5365527ca43f 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -111,10 +111,9 @@ struct file *anon_inode_getfile(const char *name,
 	path.mnt = mntget(anon_inode_mnt);
 	/*
 	 * We know the anon_inode inode count is always greater than zero,
-	 * so we can avoid doing an igrab() and we can use an open-coded
-	 * atomic_inc().
+	 * so ihold() is safe.
 	 */
-	atomic_inc(&anon_inode_inode->i_count);
+	ihold(anon_inode_inode);
 
 	path.dentry->d_op = &anon_inodefs_dentry_operations;
 	d_instantiate(path.dentry, anon_inode_inode);
@@ -194,6 +193,7 @@ static struct inode *anon_inode_mkinode(void)
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	inode->i_ino = get_next_ino();
 	inode->i_fop = &anon_inode_fops;
 
 	inode->i_mapping->a_ops = &anon_aops;
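
anon_inodes is one of several pseudo filesystems in this merge that now set
inode->i_ino = get_next_ino() explicitly, because new_inode() stops handing out inode
numbers on its own. A sketch of the allocator this series adds to fs/inode.c (batch
size and variable names quoted from memory):

	#define LAST_INO_BATCH 1024
	static DEFINE_PER_CPU(unsigned int, last_ino);

	unsigned int get_next_ino(void)
	{
		unsigned int *p = &get_cpu_var(last_ino);
		unsigned int res = *p;

	#ifdef CONFIG_SMP
		if (unlikely((res & (LAST_INO_BATCH - 1)) == 0)) {
			static atomic_t shared_last_ino;
			int next = atomic_add_return(LAST_INO_BATCH,
						     &shared_last_ino);

			res = next - LAST_INO_BATCH;
		}
	#endif

		*p = ++res;
		put_cpu_var(last_ino);
		return res;
	}

Each CPU reserves a batch of numbers from the shared atomic counter and then hands
them out locally, so creating an inode normally touches no shared cacheline.
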
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
index 5f3bea90911e..480e210c83ab 100644
--- a/fs/autofs/Kconfig
+++ b/fs/autofs/Kconfig
@@ -1,5 +1,6 @@
 config AUTOFS_FS
 	tristate "Kernel automounter support"
+	depends on BKL # unfixable, just use autofs4
 	help
 	  The automounter is a tool to automatically mount remote file systems
 	  on demand. This implementation is partially kernel-based to reduce
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 11b1ea786d00..0c4ca81aeaeb 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -27,7 +27,9 @@ static int autofs_root_unlink(struct inode *,struct dentry *);
 static int autofs_root_rmdir(struct inode *,struct dentry *);
 static int autofs_root_mkdir(struct inode *,struct dentry *,int);
 static long autofs_root_ioctl(struct file *,unsigned int,unsigned long);
+#ifdef CONFIG_COMPAT
 static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+#endif
 
 const struct file_operations autofs_root_operations = {
 	.llseek		= generic_file_llseek,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ba4a38b9c22f..eff9a419469a 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -724,6 +724,7 @@ static const struct file_operations _dev_ioctl_fops = {
 	.unlocked_ioctl	= autofs_dev_ioctl,
 	.compat_ioctl	= autofs_dev_ioctl_compat,
 	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 821b2b955dac..ac87e49fa706 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -398,6 +398,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 		inode->i_gid = sb->s_root->d_inode->i_gid;
 	}
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_ino = get_next_ino();
 
 	if (S_ISDIR(inf->mode)) {
 		inode->i_nlink = 2;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index cb1bd38dc08c..d5c1401f0031 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -19,7 +19,7 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/compat.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 
 #include "autofs_i.h"
 
@@ -28,7 +28,9 @@ static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
 static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
 static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+#ifdef CONFIG_COMPAT
 static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+#endif
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -978,15 +980,17 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 	}
 }
 
+static DEFINE_MUTEX(autofs4_ioctl_mutex);
+
 static long autofs4_root_ioctl(struct file *filp,
 			       unsigned int cmd, unsigned long arg)
 {
 	long ret;
 	struct inode *inode = filp->f_dentry->d_inode;
 
-	lock_kernel();
+	mutex_lock(&autofs4_ioctl_mutex);
 	ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
-	unlock_kernel();
+	mutex_unlock(&autofs4_ioctl_mutex);
 
 	return ret;
 }
@@ -998,13 +1002,13 @@ static long autofs4_root_compat_ioctl(struct file *filp,
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret;
 
-	lock_kernel();
+	mutex_lock(&autofs4_ioctl_mutex);
 	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
 		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
 	else
 		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
 			(unsigned long)compat_ptr(arg));
-	unlock_kernel();
+	mutex_unlock(&autofs4_ioctl_mutex);
 
 	return ret;
 }
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d967e052b779..685ecff3ab31 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
 	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	d_instantiate(new, inode);
 	mutex_unlock(&info->bfs_lock);
 	return 0;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index c4daf0f5fc02..883e77acd5a8 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -12,7 +12,6 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
@@ -215,14 +214,10 @@ static void bfs_put_super(struct super_block *s)
 	if (!info)
 		return;
 
-	lock_kernel();
-
 	mutex_destroy(&info->bfs_lock);
 	kfree(info->si_imap);
 	kfree(info);
 	s->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f96eff04e11a..a6395bdb26ae 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm)
 		if (!dump_write(file, dump_start, dump_size))
 			goto end_coredump;
 	}
-/* Finally dump the task struct. Not be used by gdb, but could be useful */
-	set_fs(KERNEL_DS);
-	if (!dump_write(file, current, sizeof(*current)))
-		goto end_coredump;
 end_coredump:
 	set_fs(fs);
 	return has_dumped;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 535e763ab1a6..6884e198e0c7 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -800,7 +800,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		 * default mmap base, as well as whatever program they
 		 * might try to exec. This is because the brk will
 		 * follow the loader, and is not movable. */
-#ifdef CONFIG_X86
+#if defined(CONFIG_X86) || defined(CONFIG_ARM)
 		load_bias = 0;
 #else
 		load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index fd0cc0bf9a40..29990f0eee0c 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -495,6 +495,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 	struct inode * inode = new_inode(sb);
 
 	if (inode) {
+		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_atime = inode->i_mtime = inode->i_ctime =
 			current_fs_time(inode->i_sb);
@@ -576,6 +577,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 static const struct file_operations bm_entry_operations = {
 	.read		= bm_entry_read,
 	.write		= bm_entry_write,
+	.llseek		= default_llseek,
 };
 
 /* /register */
@@ -643,6 +645,7 @@ out:
 
 static const struct file_operations bm_register_operations = {
 	.write		= bm_register_write,
+	.llseek		= noop_llseek,
 };
 
 /* /status */
@@ -680,6 +683,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
 static const struct file_operations bm_status_operations = {
 	.read		= bm_status_read,
 	.write		= bm_status_write,
+	.llseek		= default_llseek,
 };
 
 /* Superblock handling */
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 50e8c8582faa..dea3b628a6ce 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -48,6 +48,21 @@ inline struct block_device *I_BDEV(struct inode *inode)
 
 EXPORT_SYMBOL(I_BDEV);
 
+/*
+ * move the inode from it's current bdi to the a new bdi. if the inode is dirty
+ * we need to move it onto the dirty list of @dst so that the inode is always
+ * on the right list.
+ */
+static void bdev_inode_switch_bdi(struct inode *inode,
+			struct backing_dev_info *dst)
+{
+	spin_lock(&inode_lock);
+	inode->i_data.backing_dev_info = dst;
+	if (inode->i_state & I_DIRTY)
+		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+	spin_unlock(&inode_lock);
+}
+
 static sector_t max_block(struct block_device *bdev)
 {
 	sector_t retval = ~((sector_t)0);
@@ -370,7 +385,7 @@ int blkdev_fsync(struct file *filp, int datasync)
 	 */
 	mutex_unlock(&bd_inode->i_mutex);
 
-	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
+	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 
@@ -550,7 +565,7 @@ EXPORT_SYMBOL(bdget);
  */
 struct block_device *bdgrab(struct block_device *bdev)
 {
-	atomic_inc(&bdev->bd_inode->i_count);
+	ihold(bdev->bd_inode);
 	return bdev;
 }
 
@@ -580,7 +595,7 @@ static struct block_device *bd_acquire(struct inode *inode)
 	spin_lock(&bdev_lock);
 	bdev = inode->i_bdev;
 	if (bdev) {
-		atomic_inc(&bdev->bd_inode->i_count);
+		ihold(bdev->bd_inode);
 		spin_unlock(&bdev_lock);
 		return bdev;
 	}
@@ -591,12 +606,12 @@ static struct block_device *bd_acquire(struct inode *inode)
 		spin_lock(&bdev_lock);
 		if (!inode->i_bdev) {
 			/*
-			 * We take an additional bd_inode->i_count for inode,
+			 * We take an additional reference to bd_inode,
 			 * and it's released in clear_inode() of inode.
 			 * So, we can access it via ->i_mapping always
 			 * without igrab().
 			 */
-			atomic_inc(&bdev->bd_inode->i_count);
+			ihold(bdev->bd_inode);
 			inode->i_bdev = bdev;
 			inode->i_mapping = bdev->bd_inode->i_mapping;
 			list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -1390,7 +1405,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			bdi = blk_get_backing_dev_info(bdev);
 			if (bdi == NULL)
 				bdi = &default_backing_dev_info;
-			bdev->bd_inode->i_data.backing_dev_info = bdi;
+			bdev_inode_switch_bdi(bdev->bd_inode, bdi);
 		}
 		if (bdev->bd_invalidated)
 			rescan_partitions(disk, bdev);
@@ -1405,8 +1420,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		if (ret)
 			goto out_clear;
 		bdev->bd_contains = whole;
-		bdev->bd_inode->i_data.backing_dev_info =
-			whole->bd_inode->i_data.backing_dev_info;
+		bdev_inode_switch_bdi(bdev->bd_inode,
+			whole->bd_inode->i_data.backing_dev_info);
 		bdev->bd_part = disk_get_part(disk, partno);
 		if (!(disk->flags & GENHD_FL_UP) ||
 		    !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1439,7 +1454,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk_put_part(bdev->bd_part);
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
-	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;
@@ -1533,7 +1548,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
-		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+		bdev_inode_switch_bdi(bdev->bd_inode,
+					&default_backing_dev_info);
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 64f10082f048..5e789f4a3ed0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2063,7 +2063,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+		if (printk_ratelimit()) {
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
 					bdevname(bh->b_bdev, b));
@@ -2200,21 +2200,10 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh->b_end_io = btrfs_end_buffer_write_sync;
 		}
 
-		if (i == last_barrier && do_barriers && device->barriers) {
-			ret = submit_bh(WRITE_BARRIER, bh);
-			if (ret == -EOPNOTSUPP) {
-				printk("btrfs: disabling barriers on dev %s\n",
-				       device->name);
-				set_buffer_uptodate(bh);
-				device->barriers = 0;
-				/* one reference for submit_bh */
-				get_bh(bh);
-				lock_buffer(bh);
-				ret = submit_bh(WRITE_SYNC, bh);
-			}
-		} else {
+		if (i == last_barrier && do_barriers)
+			ret = submit_bh(WRITE_FLUSH_FUA, bh);
+		else
 			ret = submit_bh(WRITE_SYNC, bh);
-		}
 
 		if (ret)
 			errors++;
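
The whole -EOPNOTSUPP fallback above can go because the new block-layer FLUSH/FUA
interface never fails that way: unsupported bits are silently dropped by the block
layer rather than bounced back to the filesystem. Roughly, WRITE_FLUSH_FUA is a
composition of request flags along these lines (quoted from memory from the
2.6.37-era include/linux/fs.h, so the exact bits may differ):

	#define WRITE_FLUSH_FUA	(WRITE | REQ_SYNC | REQ_NOIDLE | \
				 REQ_FLUSH | REQ_FUA)

REQ_FLUSH asks for the device cache to be flushed before the write, and REQ_FUA asks
for the write itself to reach stable media, which together give the ordering btrfs
previously tried to get from WRITE_BARRIER.
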
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32d094002a57..0b81ecdb101c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1695,8 +1695,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c03864406af3..64f99cf69ce0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3849,7 +3849,7 @@ again:
 	p = &root->inode_tree.rb_node;
 	parent = NULL;
 
-	if (hlist_unhashed(&inode->i_hash))
+	if (inode_unhashed(inode))
 		return;
 
 	spin_lock(&root->inode_lock);
@@ -4758,7 +4758,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	btrfs_set_trans_block_group(trans, dir);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	err = btrfs_add_nondir(trans, dentry, inode, 1, index);
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1776dbd8dc98..144f8a5730f5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -815,6 +815,7 @@ static const struct file_operations btrfs_ctl_fops = {
 	.unlocked_ioctl	= btrfs_control_ioctl,
 	.compat_ioctl	= btrfs_control_ioctl,
 	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice btrfs_misc = {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd318ff280b2..e25e46a8b4e2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -398,7 +398,6 @@ static noinline int device_list_add(const char *path,
 		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, disk_super->dev_item.uuid,
 		       BTRFS_UUID_SIZE);
-		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		device->name = kstrdup(path, GFP_NOFS);
 		if (!device->name) {
@@ -462,7 +461,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 		device->devid = orig_dev->devid;
 		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
-		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		INIT_LIST_HEAD(&device->dev_list);
 		INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -1489,7 +1487,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1489 trans = btrfs_start_transaction(root, 0); 1487 trans = btrfs_start_transaction(root, 0);
1490 lock_chunks(root); 1488 lock_chunks(root);
1491 1489
1492 device->barriers = 1;
1493 device->writeable = 1; 1490 device->writeable = 1;
1494 device->work.func = pending_bios_fn; 1491 device->work.func = pending_bios_fn;
1495 generate_random_uuid(device->uuid); 1492 generate_random_uuid(device->uuid);
@@ -3084,7 +3081,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3084 return NULL; 3081 return NULL;
3085 list_add(&device->dev_list, 3082 list_add(&device->dev_list,
3086 &fs_devices->devices); 3083 &fs_devices->devices);
3087 device->barriers = 1;
3088 device->dev_root = root->fs_info->dev_root; 3084 device->dev_root = root->fs_info->dev_root;
3089 device->devid = devid; 3085 device->devid = devid;
3090 device->work.func = pending_bios_fn; 3086 device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2ea..2b638b6e4eea 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -42,7 +42,6 @@ struct btrfs_device {
42 int running_pending; 42 int running_pending;
43 u64 generation; 43 u64 generation;
44 44
45 int barriers;
46 int writeable; 45 int writeable;
47 int in_fs_metadata; 46 int in_fs_metadata;
48 47
diff --git a/fs/buffer.c b/fs/buffer.c
index 3e7dca279d1c..5930e382959b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -156,7 +156,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
156 if (uptodate) { 156 if (uptodate) {
157 set_buffer_uptodate(bh); 157 set_buffer_uptodate(bh);
158 } else { 158 } else {
159 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { 159 if (!quiet_error(bh)) {
160 buffer_io_error(bh); 160 buffer_io_error(bh);
161 printk(KERN_WARNING "lost page write due to " 161 printk(KERN_WARNING "lost page write due to "
162 "I/O error on %s\n", 162 "I/O error on %s\n",
@@ -905,7 +905,6 @@ try_again:
905 905
906 bh->b_state = 0; 906 bh->b_state = 0;
907 atomic_set(&bh->b_count, 0); 907 atomic_set(&bh->b_count, 0);
908 bh->b_private = NULL;
909 bh->b_size = size; 908 bh->b_size = size;
910 909
911 /* Link the buffer to its page */ 910 /* Link the buffer to its page */
@@ -1706,7 +1705,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1706 * and kswapd activity, but those code paths have their own 1705 * and kswapd activity, but those code paths have their own
1707 * higher-level throttling. 1706 * higher-level throttling.
1708 */ 1707 */
1709 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1708 if (wbc->sync_mode != WB_SYNC_NONE) {
1710 lock_buffer(bh); 1709 lock_buffer(bh);
1711 } else if (!trylock_buffer(bh)) { 1710 } else if (!trylock_buffer(bh)) {
1712 redirty_page_for_writepage(wbc, page); 1711 redirty_page_for_writepage(wbc, page);
@@ -1834,9 +1833,11 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1834} 1833}
1835EXPORT_SYMBOL(page_zero_new_buffers); 1834EXPORT_SYMBOL(page_zero_new_buffers);
1836 1835
1837int block_prepare_write(struct page *page, unsigned from, unsigned to, 1836int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1838 get_block_t *get_block) 1837 get_block_t *get_block)
1839{ 1838{
1839 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1840 unsigned to = from + len;
1840 struct inode *inode = page->mapping->host; 1841 struct inode *inode = page->mapping->host;
1841 unsigned block_start, block_end; 1842 unsigned block_start, block_end;
1842 sector_t block; 1843 sector_t block;
@@ -1916,7 +1917,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
1916 } 1917 }
1917 return err; 1918 return err;
1918} 1919}
1919EXPORT_SYMBOL(block_prepare_write); 1920EXPORT_SYMBOL(__block_write_begin);
1920 1921
1921static int __block_commit_write(struct inode *inode, struct page *page, 1922static int __block_commit_write(struct inode *inode, struct page *page,
1922 unsigned from, unsigned to) 1923 unsigned from, unsigned to)
@@ -1953,15 +1954,6 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1953 return 0; 1954 return 0;
1954} 1955}
1955 1956
1956int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1957 get_block_t *get_block)
1958{
1959 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
1960
1961 return block_prepare_write(page, start, start + len, get_block);
1962}
1963EXPORT_SYMBOL(__block_write_begin);
1964
1965/* 1957/*
1966 * block_write_begin takes care of the basic task of block allocation and 1958 * block_write_begin takes care of the basic task of block allocation and
1967 * bringing partial write blocks uptodate first. 1959 * bringing partial write blocks uptodate first.
@@ -2379,7 +2371,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2379 else 2371 else
2380 end = PAGE_CACHE_SIZE; 2372 end = PAGE_CACHE_SIZE;
2381 2373
2382 ret = block_prepare_write(page, 0, end, get_block); 2374 ret = __block_write_begin(page, 0, end, get_block);
2383 if (!ret) 2375 if (!ret)
2384 ret = block_commit_write(page, 0, end); 2376 ret = block_commit_write(page, 0, end);
2385 2377
@@ -2466,11 +2458,10 @@ int nobh_write_begin(struct address_space *mapping,
2466 *fsdata = NULL; 2458 *fsdata = NULL;
2467 2459
2468 if (page_has_buffers(page)) { 2460 if (page_has_buffers(page)) {
2469 unlock_page(page); 2461 ret = __block_write_begin(page, pos, len, get_block);
2470 page_cache_release(page); 2462 if (unlikely(ret))
2471 *pagep = NULL; 2463 goto out_release;
2472 return block_write_begin(mapping, pos, len, flags, pagep, 2464 return ret;
2473 get_block);
2474 } 2465 }
2475 2466
2476 if (PageMappedToDisk(page)) 2467 if (PageMappedToDisk(page))
@@ -2891,7 +2882,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2891 2882
2892 if (err == -EOPNOTSUPP) { 2883 if (err == -EOPNOTSUPP) {
2893 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 2884 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2894 set_bit(BH_Eopnotsupp, &bh->b_state);
2895 } 2885 }
2896 2886
2897 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) 2887 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
@@ -3031,10 +3021,6 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3031 bh->b_end_io = end_buffer_write_sync; 3021 bh->b_end_io = end_buffer_write_sync;
3032 ret = submit_bh(rw, bh); 3022 ret = submit_bh(rw, bh);
3033 wait_on_buffer(bh); 3023 wait_on_buffer(bh);
3034 if (buffer_eopnotsupp(bh)) {
3035 clear_buffer_eopnotsupp(bh);
3036 ret = -EOPNOTSUPP;
3037 }
3038 if (!ret && !buffer_uptodate(bh)) 3024 if (!ret && !buffer_uptodate(bh))
3039 ret = -EIO; 3025 ret = -EIO;
3040 } else { 3026 } else {
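The buffer.c changes fold block_prepare_write() into __block_write_begin(): callers stop passing page-relative (from, to) offsets and instead pass an absolute position and length, from which the page offsets are derived internally. A hedged conversion sketch (the caller name is illustrative):

/* sketch: an old (from, to) caller converted to (pos, len) */
static int prepare_page_sketch(struct page *page, loff_t pos, unsigned len,
			       get_block_t *get_block)
{
	/* before: block_prepare_write(page, from, from + len, get_block) */
	return __block_write_begin(page, pos, len, get_block);
}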
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 727caedcdd92..0a1467b15516 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -55,6 +55,7 @@ const struct file_operations cachefiles_daemon_fops = {
55 .read = cachefiles_daemon_read, 55 .read = cachefiles_daemon_read,
56 .write = cachefiles_daemon_write, 56 .write = cachefiles_daemon_write,
57 .poll = cachefiles_daemon_poll, 57 .poll = cachefiles_daemon_poll,
58 .llseek = noop_llseek,
58}; 59};
59 60
60struct cachefiles_daemon_cmd { 61struct cachefiles_daemon_cmd {
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index bc87b9c1d27e..9eb134ea6eb2 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,8 +1,11 @@
1config CEPH_FS 1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)" 2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select CEPH_LIB
4 select LIBCRC32C 5 select LIBCRC32C
5 select CRYPTO_AES 6 select CRYPTO_AES
7 select CRYPTO
8 default n
6 help 9 help
7 Choose Y or M here to include support for mounting the 10 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely 11 experimental Ceph distributed file system. Ceph is an extremely
@@ -13,15 +16,3 @@ config CEPH_FS
13 16
14 If unsure, say N. 17 If unsure, say N.
15 18
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
22 line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 278e1172600d..9e6c4f2e8ff1 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
8 8
9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \ 11 mds_client.o mdsmap.o strings.o ceph_frag.o \
12 mds_client.o mdsmap.o \ 12 debugfs.o
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20 13
21else 14else
22#Otherwise we were called directly from the command 15#Otherwise we were called directly from the command
diff --git a/fs/ceph/README b/fs/ceph/README
deleted file mode 100644
index 18352fab37c0..000000000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 4cfce1ee31fa..e9c874abc9e1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
@@ -10,7 +10,8 @@
10#include <linux/task_io_accounting_ops.h> 10#include <linux/task_io_accounting_ops.h>
11 11
12#include "super.h" 12#include "super.h"
13#include "osd_client.h" 13#include "mds_client.h"
14#include <linux/ceph/osd_client.h>
14 15
15/* 16/*
16 * Ceph address space ops. 17 * Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
193{ 194{
194 struct inode *inode = filp->f_dentry->d_inode; 195 struct inode *inode = filp->f_dentry->d_inode;
195 struct ceph_inode_info *ci = ceph_inode(inode); 196 struct ceph_inode_info *ci = ceph_inode(inode);
196 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 197 struct ceph_osd_client *osdc =
198 &ceph_inode_to_client(inode)->client->osdc;
197 int err = 0; 199 int err = 0;
198 u64 len = PAGE_CACHE_SIZE; 200 u64 len = PAGE_CACHE_SIZE;
199 201
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
265{ 267{
266 struct inode *inode = file->f_dentry->d_inode; 268 struct inode *inode = file->f_dentry->d_inode;
267 struct ceph_inode_info *ci = ceph_inode(inode); 269 struct ceph_inode_info *ci = ceph_inode(inode);
268 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 270 struct ceph_osd_client *osdc =
271 &ceph_inode_to_client(inode)->client->osdc;
269 int rc = 0; 272 int rc = 0;
270 struct page **pages; 273 struct page **pages;
271 loff_t offset; 274 loff_t offset;
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
365{ 368{
366 struct inode *inode; 369 struct inode *inode;
367 struct ceph_inode_info *ci; 370 struct ceph_inode_info *ci;
368 struct ceph_client *client; 371 struct ceph_fs_client *fsc;
369 struct ceph_osd_client *osdc; 372 struct ceph_osd_client *osdc;
370 loff_t page_off = page->index << PAGE_CACHE_SHIFT; 373 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
371 int len = PAGE_CACHE_SIZE; 374 int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
383 } 386 }
384 inode = page->mapping->host; 387 inode = page->mapping->host;
385 ci = ceph_inode(inode); 388 ci = ceph_inode(inode);
386 client = ceph_inode_to_client(inode); 389 fsc = ceph_inode_to_client(inode);
387 osdc = &client->osdc; 390 osdc = &fsc->client->osdc;
388 391
389 /* verify this is a writeable snap context */ 392 /* verify this is a writeable snap context */
390 snapc = (void *)page->private; 393 snapc = (void *)page->private;
@@ -411,13 +414,13 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
411 if (i_size < page_off + len) 414 if (i_size < page_off + len)
412 len = i_size - page_off; 415 len = i_size - page_off;
413 416
414 dout("writepage %p page %p index %lu on %llu~%u\n", 417 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
415 inode, page, page->index, page_off, len); 418 inode, page, page->index, page_off, len, snapc);
416 419
417 writeback_stat = atomic_long_inc_return(&client->writeback_count); 420 writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
418 if (writeback_stat > 421 if (writeback_stat >
419 CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) 422 CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
420 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); 423 set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
421 424
422 set_page_writeback(page); 425 set_page_writeback(page);
423 err = ceph_osdc_writepages(osdc, ceph_vino(inode), 426 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
496 struct address_space *mapping = inode->i_mapping; 499 struct address_space *mapping = inode->i_mapping;
497 __s32 rc = -EIO; 500 __s32 rc = -EIO;
498 u64 bytes = 0; 501 u64 bytes = 0;
499 struct ceph_client *client = ceph_inode_to_client(inode); 502 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
500 long writeback_stat; 503 long writeback_stat;
501 unsigned issued = ceph_caps_issued(ci); 504 unsigned issued = ceph_caps_issued(ci);
502 505
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
529 WARN_ON(!PageUptodate(page)); 532 WARN_ON(!PageUptodate(page));
530 533
531 writeback_stat = 534 writeback_stat =
532 atomic_long_dec_return(&client->writeback_count); 535 atomic_long_dec_return(&fsc->writeback_count);
533 if (writeback_stat < 536 if (writeback_stat <
534 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) 537 CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
535 clear_bdi_congested(&client->backing_dev_info, 538 clear_bdi_congested(&fsc->backing_dev_info,
536 BLK_RW_ASYNC); 539 BLK_RW_ASYNC);
537 540
538 ceph_put_snap_context((void *)page->private); 541 ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
569 * mempool. we avoid the mempool if we can because req->r_num_pages 572 * mempool. we avoid the mempool if we can because req->r_num_pages
570 * may be less than the maximum write size. 573 * may be less than the maximum write size.
571 */ 574 */
572static void alloc_page_vec(struct ceph_client *client, 575static void alloc_page_vec(struct ceph_fs_client *fsc,
573 struct ceph_osd_request *req) 576 struct ceph_osd_request *req)
574{ 577{
575 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, 578 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
576 GFP_NOFS); 579 GFP_NOFS);
577 if (!req->r_pages) { 580 if (!req->r_pages) {
578 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); 581 req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
579 req->r_pages_from_pool = 1; 582 req->r_pages_from_pool = 1;
580 WARN_ON(!req->r_pages); 583 WARN_ON(!req->r_pages);
581 } 584 }
@@ -588,9 +591,8 @@ static int ceph_writepages_start(struct address_space *mapping,
588 struct writeback_control *wbc) 591 struct writeback_control *wbc)
589{ 592{
590 struct inode *inode = mapping->host; 593 struct inode *inode = mapping->host;
591 struct backing_dev_info *bdi = mapping->backing_dev_info;
592 struct ceph_inode_info *ci = ceph_inode(inode); 594 struct ceph_inode_info *ci = ceph_inode(inode);
593 struct ceph_client *client; 595 struct ceph_fs_client *fsc;
594 pgoff_t index, start, end; 596 pgoff_t index, start, end;
595 int range_whole = 0; 597 int range_whole = 0;
596 int should_loop = 1; 598 int should_loop = 1;
@@ -617,26 +619,19 @@ static int ceph_writepages_start(struct address_space *mapping,
617 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 619 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
618 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 620 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
619 621
620 client = ceph_inode_to_client(inode); 622 fsc = ceph_inode_to_client(inode);
621 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { 623 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
622 pr_warning("writepage_start %p on forced umount\n", inode); 624 pr_warning("writepage_start %p on forced umount\n", inode);
623 return -EIO; /* we're in a forced umount, don't write! */ 625 return -EIO; /* we're in a forced umount, don't write! */
624 } 626 }
625 if (client->mount_args->wsize && client->mount_args->wsize < wsize) 627 if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
626 wsize = client->mount_args->wsize; 628 wsize = fsc->mount_options->wsize;
627 if (wsize < PAGE_CACHE_SIZE) 629 if (wsize < PAGE_CACHE_SIZE)
628 wsize = PAGE_CACHE_SIZE; 630 wsize = PAGE_CACHE_SIZE;
629 max_pages_ever = wsize >> PAGE_CACHE_SHIFT; 631 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
630 632
631 pagevec_init(&pvec, 0); 633 pagevec_init(&pvec, 0);
632 634
633 /* ?? */
634 if (wbc->nonblocking && bdi_write_congested(bdi)) {
635 dout(" writepages congested\n");
636 wbc->encountered_congestion = 1;
637 goto out_final;
638 }
639
640 /* where to start/end? */ 635 /* where to start/end? */
641 if (wbc->range_cyclic) { 636 if (wbc->range_cyclic) {
642 start = mapping->writeback_index; /* Start from prev offset */ 637 start = mapping->writeback_index; /* Start from prev offset */
@@ -766,9 +761,10 @@ get_more_pages:
766 /* ok */ 761 /* ok */
767 if (locked_pages == 0) { 762 if (locked_pages == 0) {
768 /* prepare async write request */ 763 /* prepare async write request */
769 offset = page->index << PAGE_CACHE_SHIFT; 764 offset = (unsigned long long)page->index
765 << PAGE_CACHE_SHIFT;
770 len = wsize; 766 len = wsize;
771 req = ceph_osdc_new_request(&client->osdc, 767 req = ceph_osdc_new_request(&fsc->client->osdc,
772 &ci->i_layout, 768 &ci->i_layout,
773 ceph_vino(inode), 769 ceph_vino(inode),
774 offset, &len, 770 offset, &len,
@@ -781,7 +777,7 @@ get_more_pages:
781 &inode->i_mtime, true, 1); 777 &inode->i_mtime, true, 1);
782 max_pages = req->r_num_pages; 778 max_pages = req->r_num_pages;
783 779
784 alloc_page_vec(client, req); 780 alloc_page_vec(fsc, req);
785 req->r_callback = writepages_finish; 781 req->r_callback = writepages_finish;
786 req->r_inode = inode; 782 req->r_inode = inode;
787 } 783 }
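The cast added in the hunk above fixes a 32-bit truncation: page->index is an unsigned long, so without widening first, the left shift wraps for file offsets at or beyond 4 GB. An illustrative sketch:

/* sketch: widen before shifting, not after */
static u64 page_offset_sketch(struct page *page)
{
	/* wrong on 32-bit for offsets >= 4GB:
	 *   u64 off = page->index << PAGE_CACHE_SHIFT;  (shift wraps first)
	 */
	return (u64)page->index << PAGE_CACHE_SHIFT;
}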
@@ -793,10 +789,10 @@ get_more_pages:
793 inode, page, page->index); 789 inode, page, page->index);
794 790
795 writeback_stat = 791 writeback_stat =
796 atomic_long_inc_return(&client->writeback_count); 792 atomic_long_inc_return(&fsc->writeback_count);
797 if (writeback_stat > CONGESTION_ON_THRESH( 793 if (writeback_stat > CONGESTION_ON_THRESH(
798 client->mount_args->congestion_kb)) { 794 fsc->mount_options->congestion_kb)) {
799 set_bdi_congested(&client->backing_dev_info, 795 set_bdi_congested(&fsc->backing_dev_info,
800 BLK_RW_ASYNC); 796 BLK_RW_ASYNC);
801 } 797 }
802 798
@@ -845,7 +841,7 @@ get_more_pages:
845 op->payload_len = cpu_to_le32(len); 841 op->payload_len = cpu_to_le32(len);
846 req->r_request->hdr.data_len = cpu_to_le32(len); 842 req->r_request->hdr.data_len = cpu_to_le32(len);
847 843
848 ceph_osdc_start_request(&client->osdc, req, true); 844 ceph_osdc_start_request(&fsc->client->osdc, req, true);
849 req = NULL; 845 req = NULL;
850 846
851 /* continue? */ 847 /* continue? */
@@ -881,7 +877,6 @@ out:
881 rc = 0; /* vfs expects us to return 0 */ 877 rc = 0; /* vfs expects us to return 0 */
882 ceph_put_snap_context(snapc); 878 ceph_put_snap_context(snapc);
883 dout("writepages done, rc = %d\n", rc); 879 dout("writepages done, rc = %d\n", rc);
884out_final:
885 return rc; 880 return rc;
886} 881}
887 882
@@ -914,7 +909,7 @@ static int ceph_update_writeable_page(struct file *file,
914{ 909{
915 struct inode *inode = file->f_dentry->d_inode; 910 struct inode *inode = file->f_dentry->d_inode;
916 struct ceph_inode_info *ci = ceph_inode(inode); 911 struct ceph_inode_info *ci = ceph_inode(inode);
917 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 912 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
918 loff_t page_off = pos & PAGE_CACHE_MASK; 913 loff_t page_off = pos & PAGE_CACHE_MASK;
919 int pos_in_page = pos & ~PAGE_CACHE_MASK; 914 int pos_in_page = pos & ~PAGE_CACHE_MASK;
920 int end_in_page = pos_in_page + len; 915 int end_in_page = pos_in_page + len;
@@ -1052,8 +1047,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1052 struct page *page, void *fsdata) 1047 struct page *page, void *fsdata)
1053{ 1048{
1054 struct inode *inode = file->f_dentry->d_inode; 1049 struct inode *inode = file->f_dentry->d_inode;
1055 struct ceph_client *client = ceph_inode_to_client(inode); 1050 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1056 struct ceph_mds_client *mdsc = &client->mdsc; 1051 struct ceph_mds_client *mdsc = fsc->mdsc;
1057 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1052 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1058 int check_cap = 0; 1053 int check_cap = 0;
1059 1054
@@ -1122,7 +1117,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1122{ 1117{
1123 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1118 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1124 struct page *page = vmf->page; 1119 struct page *page = vmf->page;
1125 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 1120 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1126 loff_t off = page->index << PAGE_CACHE_SHIFT; 1121 loff_t off = page->index << PAGE_CACHE_SHIFT;
1127 loff_t size, len; 1122 loff_t size, len;
1128 int ret; 1123 int ret;
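Throughout addr.c the old ceph_client is split in two: the filesystem keeps a ceph_fs_client that wraps a shared libceph core, so the OSD client is now reached via fsc->client->osdc and the MDS client via fsc->mdsc. An abbreviated, hypothetical layout matching the accessors used in the diff (real structs carry many more fields):

/* sketch: the split behind the file-wide rename (fields abbreviated) */
struct ceph_client_sketch {
	struct ceph_osd_client osdc;       /* generic core, now in net/ceph */
};

struct ceph_fs_client_sketch {
	struct ceph_client_sketch *client; /* shared libceph client */
	struct ceph_mds_client    *mdsc;   /* fs-specific MDS state */
};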
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
deleted file mode 100644
index eb2a666b0be7..000000000000
--- a/fs/ceph/armor.c
+++ /dev/null
@@ -1,103 +0,0 @@
1
2#include <linux/errno.h>
3
4int ceph_armor(char *dst, const char *src, const char *end);
5int ceph_unarmor(char *dst, const char *src, const char *end);
6
7/*
8 * base64 encode/decode.
9 */
10
11static const char *pem_key =
12 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
13
14static int encode_bits(int c)
15{
16 return pem_key[c];
17}
18
19static int decode_bits(char c)
20{
21 if (c >= 'A' && c <= 'Z')
22 return c - 'A';
23 if (c >= 'a' && c <= 'z')
24 return c - 'a' + 26;
25 if (c >= '0' && c <= '9')
26 return c - '0' + 52;
27 if (c == '+')
28 return 62;
29 if (c == '/')
30 return 63;
31 if (c == '=')
32 return 0; /* just non-negative, please */
33 return -EINVAL;
34}
35
36int ceph_armor(char *dst, const char *src, const char *end)
37{
38 int olen = 0;
39 int line = 0;
40
41 while (src < end) {
42 unsigned char a, b, c;
43
44 a = *src++;
45 *dst++ = encode_bits(a >> 2);
46 if (src < end) {
47 b = *src++;
48 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
49 if (src < end) {
50 c = *src++;
51 *dst++ = encode_bits(((b & 15) << 2) |
52 (c >> 6));
53 *dst++ = encode_bits(c & 63);
54 } else {
55 *dst++ = encode_bits((b & 15) << 2);
56 *dst++ = '=';
57 }
58 } else {
59 *dst++ = encode_bits(((a & 3) << 4));
60 *dst++ = '=';
61 *dst++ = '=';
62 }
63 olen += 4;
64 line += 4;
65 if (line == 64) {
66 line = 0;
67 *(dst++) = '\n';
68 olen++;
69 }
70 }
71 return olen;
72}
73
74int ceph_unarmor(char *dst, const char *src, const char *end)
75{
76 int olen = 0;
77
78 while (src < end) {
79 int a, b, c, d;
80
81 if (src < end && src[0] == '\n')
82 src++;
83 if (src + 4 > end)
84 return -EINVAL;
85 a = decode_bits(src[0]);
86 b = decode_bits(src[1]);
87 c = decode_bits(src[2]);
88 d = decode_bits(src[3]);
89 if (a < 0 || b < 0 || c < 0 || d < 0)
90 return -EINVAL;
91
92 *dst++ = (a << 2) | (b >> 4);
93 if (src[2] == '=')
94 return olen + 1;
95 *dst++ = ((b & 15) << 4) | (c >> 2);
96 if (src[3] == '=')
97 return olen + 2;
98 *dst++ = ((c & 3) << 6) | d;
99 olen += 3;
100 src += 4;
101 }
102 return olen;
103}
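A hedged usage sketch for the relocated armor helpers above: ceph_armor() base64-encodes [src, end) into dst and returns the encoded length; ceph_unarmor() reverses it and may return -EINVAL on malformed input. Buffer sizing and names below are illustrative:

/* sketch: round-tripping a small buffer through armor/unarmor */
static void armor_roundtrip_sketch(void)
{
	static const char secret[] = "example key bytes";
	char enc[64], dec[sizeof(secret)];
	int elen, dlen;

	elen = ceph_armor(enc, secret, secret + sizeof(secret) - 1);
	dlen = ceph_unarmor(dec, enc, enc + elen);
	/* on success, dlen == sizeof(secret) - 1 and dec matches secret */
}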
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
deleted file mode 100644
index 6d2e30600627..000000000000
--- a/fs/ceph/auth.c
+++ /dev/null
@@ -1,259 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/err.h>
5#include <linux/slab.h>
6
7#include "types.h"
8#include "auth_none.h"
9#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12
13#include "messenger.h"
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
135
136static int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
154 return ret;
155 }
156 dout(" built request %d bytes\n", ret);
157 ceph_encode_32(&p, ret);
158 return p + ret - msg_buf;
159}
160
161/*
162 * Handle auth message from monitor.
163 */
164int ceph_handle_auth_reply(struct ceph_auth_client *ac,
165 void *buf, size_t len,
166 void *reply_buf, size_t reply_len)
167{
168 void *p = buf;
169 void *end = buf + len;
170 int protocol;
171 s32 result;
172 u64 global_id;
173 void *payload, *payload_end;
174 int payload_len;
175 char *result_msg;
176 int result_msg_len;
177 int ret = -EINVAL;
178
179 dout("handle_auth_reply %p %p\n", p, end);
180 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
181 protocol = ceph_decode_32(&p);
182 result = ceph_decode_32(&p);
183 global_id = ceph_decode_64(&p);
184 payload_len = ceph_decode_32(&p);
185 payload = p;
186 p += payload_len;
187 ceph_decode_need(&p, end, sizeof(u32), bad);
188 result_msg_len = ceph_decode_32(&p);
189 result_msg = p;
190 p += result_msg_len;
191 if (p != end)
192 goto bad;
193
194 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
195 result_msg, global_id, payload_len);
196
197 payload_end = payload + payload_len;
198
199 if (global_id && ac->global_id != global_id) {
200 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
201 ac->global_id = global_id;
202 }
203
204 if (ac->negotiating) {
205 /* server does not support our protocols? */
206 if (!protocol && result < 0) {
207 ret = result;
208 goto out;
209 }
210 /* set up (new) protocol handler? */
211 if (ac->protocol && ac->protocol != protocol) {
212 ac->ops->destroy(ac);
213 ac->protocol = 0;
214 ac->ops = NULL;
215 }
216 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) {
219 pr_err("error %d on auth protocol %d init\n",
220 ret, protocol);
221 goto out;
222 }
223 }
224
225 ac->negotiating = false;
226 }
227
228 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) {
232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret;
234 }
235 return 0;
236
237bad:
238 pr_err("failed to decode auth msg\n");
239out:
240 return ret;
241}
242
243int ceph_build_auth(struct ceph_auth_client *ac,
244 void *msg_buf, size_t msg_len)
245{
246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops);
249 if (ac->ops->should_authenticate(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0;
252}
253
254int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
255{
256 if (!ac->ops)
257 return 0;
258 return ac->ops->is_authenticated(ac);
259}
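A hedged sketch of how a caller drives the handshake implemented above: ceph_handle_auth_reply() returns a positive length when it built a follow-up request in the reply buffer, zero when the current round succeeded, and a negative errno on failure. The wrapper name and the -EAGAIN policy below are illustrative:

/* sketch: one round of the monitor auth handshake */
static int auth_round_sketch(struct ceph_auth_client *ac,
			     void *reply, size_t reply_len,
			     void *out, size_t out_len)
{
	int ret = ceph_handle_auth_reply(ac, reply, reply_len, out, out_len);

	if (ret < 0)
		return ret;     /* decode or protocol error */
	if (ret > 0)
		return ret;     /* 'out' holds the next request to send */
	/* round done; sketch policy: retry until fully authenticated */
	return ceph_auth_is_authenticated(ac) ? 0 : -EAGAIN;
}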
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
deleted file mode 100644
index d38a2fb4a137..000000000000
--- a/fs/ceph/auth.h
+++ /dev/null
@@ -1,92 +0,0 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 const char *name;
19
20 /*
21 * true if we are authenticated and can connect to
22 * services.
23 */
24 int (*is_authenticated)(struct ceph_auth_client *ac);
25
26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
33 * build requests and process replies during monitor
34 * handshake. if handle_reply returns -EAGAIN, we build
35 * another request.
36 */
37 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
38 int (*handle_reply)(struct ceph_auth_client *ac, int result,
39 void *buf, void *end);
40
41 /*
42 * Create authorizer for connecting to a service, and verify
43 * the response to authenticate the service.
44 */
45 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
46 struct ceph_authorizer **a,
47 void **buf, size_t *len,
48 void **reply_buf, size_t *reply_len);
49 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
50 struct ceph_authorizer *a, size_t len);
51 void (*destroy_authorizer)(struct ceph_auth_client *ac,
52 struct ceph_authorizer *a);
53 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
54 int peer_type);
55
56 /* reset when we (re)connect to a monitor */
57 void (*reset)(struct ceph_auth_client *ac);
58
59 void (*destroy)(struct ceph_auth_client *ac);
60};
61
62struct ceph_auth_client {
63 u32 protocol; /* CEPH_AUTH_* */
64 void *private; /* for use by protocol implementation */
65 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
66
67 bool negotiating; /* true if negotiating protocol */
68 const char *name; /* entity name */
69 u64 global_id; /* our unique id in system */
70 const char *secret; /* our secret key */
71 unsigned want_keys; /* which services we want */
72};
73
74extern struct ceph_auth_client *ceph_auth_init(const char *name,
75 const char *secret);
76extern void ceph_auth_destroy(struct ceph_auth_client *ac);
77
78extern void ceph_auth_reset(struct ceph_auth_client *ac);
79
80extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
81 void *buf, size_t len);
82extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
83 void *buf, size_t len,
84 void *reply_buf, size_t reply_len);
85extern int ceph_entity_name_encode(const char *name, void **p, void *end);
86
87extern int ceph_build_auth(struct ceph_auth_client *ac,
88 void *msg_buf, size_t msg_len);
89
90extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
91
92#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
deleted file mode 100644
index ad1dc21286c7..000000000000
--- a/fs/ceph/auth_none.c
+++ /dev/null
@@ -1,131 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
41/*
42 * the generic auth code decodes the global_id, and we carry no actual
43 * authentication state, so nothing happens here.
44 */
45static int handle_reply(struct ceph_auth_client *ac, int result,
46 void *buf, void *end)
47{
48 struct ceph_auth_none_info *xi = ac->private;
49
50 xi->starting = false;
51 return result;
52}
53
54/*
55 * build an 'authorizer' with our entity_name and global_id. we can
56 * reuse a single static copy since it is identical for all services
57 * we connect to.
58 */
59static int ceph_auth_none_create_authorizer(
60 struct ceph_auth_client *ac, int peer_type,
61 struct ceph_authorizer **a,
62 void **buf, size_t *len,
63 void **reply_buf, size_t *reply_len)
64{
65 struct ceph_auth_none_info *ai = ac->private;
66 struct ceph_none_authorizer *au = &ai->au;
67 void *p, *end;
68 int ret;
69
70 if (!ai->built_authorizer) {
71 p = au->buf;
72 end = p + sizeof(au->buf);
73 ceph_encode_8(&p, 1);
74 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
75 if (ret < 0)
76 goto bad;
77 ceph_decode_need(&p, end, sizeof(u64), bad2);
78 ceph_encode_64(&p, ac->global_id);
79 au->buf_len = p - (void *)au->buf;
80 ai->built_authorizer = true;
81 dout("built authorizer len %d\n", au->buf_len);
82 }
83
84 *a = (struct ceph_authorizer *)au;
85 *buf = au->buf;
86 *len = au->buf_len;
87 *reply_buf = au->reply_buf;
88 *reply_len = sizeof(au->reply_buf);
89 return 0;
90
91bad2:
92 ret = -ERANGE;
93bad:
94 return ret;
95}
96
97static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
98 struct ceph_authorizer *a)
99{
100 /* nothing to do */
101}
102
103static const struct ceph_auth_client_ops ceph_auth_none_ops = {
104 .name = "none",
105 .reset = reset,
106 .destroy = destroy,
107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
109 .handle_reply = handle_reply,
110 .create_authorizer = ceph_auth_none_create_authorizer,
111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
112};
113
114int ceph_auth_none_init(struct ceph_auth_client *ac)
115{
116 struct ceph_auth_none_info *xi;
117
118 dout("ceph_auth_none_init %p\n", ac);
119 xi = kzalloc(sizeof(*xi), GFP_NOFS);
120 if (!xi)
121 return -ENOMEM;
122
123 xi->starting = true;
124 xi->built_authorizer = false;
125
126 ac->protocol = CEPH_AUTH_NONE;
127 ac->private = xi;
128 ac->ops = &ceph_auth_none_ops;
129 return 0;
130}
131
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
deleted file mode 100644
index 8164df1a08be..000000000000
--- a/fs/ceph/auth_none.h
+++ /dev/null
@@ -1,30 +0,0 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include <linux/slab.h>
5
6#include "auth.h"
7
8/*
9 * null security mode.
10 *
11 * we use a single static authorizer that simply encodes our entity name
12 * and global id.
13 */
14
15struct ceph_none_authorizer {
16 char buf[128];
17 int buf_len;
18 char reply_buf[0];
19};
20
21struct ceph_auth_none_info {
22 bool starting;
23 bool built_authorizer;
24 struct ceph_none_authorizer au; /* we only need one; it's static */
25};
26
27extern int ceph_auth_none_init(struct ceph_auth_client *ac);
28
29#endif
30
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
deleted file mode 100644
index a2d002cbdec2..000000000000
--- a/fs/ceph/auth_x.c
+++ /dev/null
@@ -1,687 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15#define TEMP_TICKET_BUF_LEN 256
16
17static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
18
19static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
20{
21 struct ceph_x_info *xi = ac->private;
22 int need;
23
24 ceph_x_validate_tickets(ac, &need);
25 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
26 ac->want_keys, need, xi->have_keys);
27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28}
29
30static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
31{
32 struct ceph_x_info *xi = ac->private;
33 int need;
34
35 ceph_x_validate_tickets(ac, &need);
36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
37 ac->want_keys, need, xi->have_keys);
38 return need != 0;
39}
40
41static int ceph_x_encrypt_buflen(int ilen)
42{
43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
44 sizeof(u32);
45}
46
47static int ceph_x_encrypt(struct ceph_crypto_key *secret,
48 void *ibuf, int ilen, void *obuf, size_t olen)
49{
50 struct ceph_x_encrypt_header head = {
51 .struct_v = 1,
52 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
53 };
54 size_t len = olen - sizeof(u32);
55 int ret;
56
57 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
58 &head, sizeof(head), ibuf, ilen);
59 if (ret)
60 return ret;
61 ceph_encode_32(&obuf, len);
62 return len + sizeof(u32);
63}
64
65static int ceph_x_decrypt(struct ceph_crypto_key *secret,
66 void **p, void *end, void *obuf, size_t olen)
67{
68 struct ceph_x_encrypt_header head;
69 size_t head_len = sizeof(head);
70 int len, ret;
71
72 len = ceph_decode_32(p);
73 if (*p + len > end)
74 return -EINVAL;
75
76 dout("ceph_x_decrypt len %d\n", len);
77 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
78 *p, len);
79 if (ret)
80 return ret;
81 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
82 return -EPERM;
83 *p += len;
84 return olen;
85}
86
87/*
88 * get existing (or insert new) ticket handler
89 */
90static struct ceph_x_ticket_handler *
91get_ticket_handler(struct ceph_auth_client *ac, int service)
92{
93 struct ceph_x_ticket_handler *th;
94 struct ceph_x_info *xi = ac->private;
95 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
96
97 while (*p) {
98 parent = *p;
99 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
100 if (service < th->service)
101 p = &(*p)->rb_left;
102 else if (service > th->service)
103 p = &(*p)->rb_right;
104 else
105 return th;
106 }
107
108 /* add it */
109 th = kzalloc(sizeof(*th), GFP_NOFS);
110 if (!th)
111 return ERR_PTR(-ENOMEM);
112 th->service = service;
113 rb_link_node(&th->node, parent, p);
114 rb_insert_color(&th->node, &xi->ticket_handlers);
115 return th;
116}
117
118static void remove_ticket_handler(struct ceph_auth_client *ac,
119 struct ceph_x_ticket_handler *th)
120{
121 struct ceph_x_info *xi = ac->private;
122
123 dout("remove_ticket_handler %p %d\n", th, th->service);
124 rb_erase(&th->node, &xi->ticket_handlers);
125 ceph_crypto_key_destroy(&th->session_key);
126 if (th->ticket_blob)
127 ceph_buffer_put(th->ticket_blob);
128 kfree(th);
129}
130
131static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
132 struct ceph_crypto_key *secret,
133 void *buf, void *end)
134{
135 struct ceph_x_info *xi = ac->private;
136 int num;
137 void *p = buf;
138 int ret;
139 char *dbuf;
140 char *ticket_buf;
141 u8 reply_struct_v;
142
143 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
144 if (!dbuf)
145 return -ENOMEM;
146
147 ret = -ENOMEM;
148 ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
149 if (!ticket_buf)
150 goto out_dbuf;
151
152 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
153 reply_struct_v = ceph_decode_8(&p);
154 if (reply_struct_v != 1)
155 goto bad;
156 num = ceph_decode_32(&p);
157 dout("%d tickets\n", num);
158 while (num--) {
159 int type;
160 u8 tkt_struct_v, blob_struct_v;
161 struct ceph_x_ticket_handler *th;
162 void *dp, *dend;
163 int dlen;
164 char is_enc;
165 struct timespec validity;
166 struct ceph_crypto_key old_key;
167 void *tp, *tpend;
168 struct ceph_timespec new_validity;
169 struct ceph_crypto_key new_session_key;
170 struct ceph_buffer *new_ticket_blob;
171 unsigned long new_expires, new_renew_after;
172 u64 new_secret_id;
173
174 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
175
176 type = ceph_decode_32(&p);
177 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
178
179 tkt_struct_v = ceph_decode_8(&p);
180 if (tkt_struct_v != 1)
181 goto bad;
182
183 th = get_ticket_handler(ac, type);
184 if (IS_ERR(th)) {
185 ret = PTR_ERR(th);
186 goto out;
187 }
188
189 /* blob for me */
190 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
191 TEMP_TICKET_BUF_LEN);
192 if (dlen <= 0) {
193 ret = dlen;
194 goto out;
195 }
196 dout(" decrypted %d bytes\n", dlen);
197 dend = dbuf + dlen;
198 dp = dbuf;
199
200 tkt_struct_v = ceph_decode_8(&dp);
201 if (tkt_struct_v != 1)
202 goto bad;
203
204 memcpy(&old_key, &th->session_key, sizeof(old_key));
205 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
206 if (ret)
207 goto out;
208
209 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
210 ceph_decode_timespec(&validity, &new_validity);
211 new_expires = get_seconds() + validity.tv_sec;
212 new_renew_after = new_expires - (validity.tv_sec / 4);
213 dout(" expires=%lu renew_after=%lu\n", new_expires,
214 new_renew_after);
215
216 /* ticket blob for service */
217 ceph_decode_8_safe(&p, end, is_enc, bad);
218 tp = ticket_buf;
219 if (is_enc) {
220 /* encrypted */
221 dout(" encrypted ticket\n");
222 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
223 TEMP_TICKET_BUF_LEN);
224 if (dlen < 0) {
225 ret = dlen;
226 goto out;
227 }
228 dlen = ceph_decode_32(&tp);
229 } else {
230 /* unencrypted */
231 ceph_decode_32_safe(&p, end, dlen, bad);
232 ceph_decode_need(&p, end, dlen, bad);
233 ceph_decode_copy(&p, ticket_buf, dlen);
234 }
235 tpend = tp + dlen;
236 dout(" ticket blob is %d bytes\n", dlen);
237 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
238 blob_struct_v = ceph_decode_8(&tp);
239 new_secret_id = ceph_decode_64(&tp);
240 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
241 if (ret)
242 goto out;
243
244 /* all is well, update our ticket */
245 ceph_crypto_key_destroy(&th->session_key);
246 if (th->ticket_blob)
247 ceph_buffer_put(th->ticket_blob);
248 th->session_key = new_session_key;
249 th->ticket_blob = new_ticket_blob;
250 th->validity = new_validity;
251 th->secret_id = new_secret_id;
252 th->expires = new_expires;
253 th->renew_after = new_renew_after;
254 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
255 type, ceph_entity_type_name(type), th->secret_id,
256 (int)th->ticket_blob->vec.iov_len);
257 xi->have_keys |= th->service;
258 }
259
260 ret = 0;
261out:
262 kfree(ticket_buf);
263out_dbuf:
264 kfree(dbuf);
265 return ret;
266
267bad:
268 ret = -EINVAL;
269 goto out;
270}
271
272static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
273 struct ceph_x_ticket_handler *th,
274 struct ceph_x_authorizer *au)
275{
276 int maxlen;
277 struct ceph_x_authorize_a *msg_a;
278 struct ceph_x_authorize_b msg_b;
279 void *p, *end;
280 int ret;
281 int ticket_blob_len =
282 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
283
284 dout("build_authorizer for %s %p\n",
285 ceph_entity_type_name(th->service), au);
286
287 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
288 ceph_x_encrypt_buflen(ticket_blob_len);
289 dout(" need len %d\n", maxlen);
290 if (au->buf && au->buf->alloc_len < maxlen) {
291 ceph_buffer_put(au->buf);
292 au->buf = NULL;
293 }
294 if (!au->buf) {
295 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
296 if (!au->buf)
297 return -ENOMEM;
298 }
299 au->service = th->service;
300
301 msg_a = au->buf->vec.iov_base;
302 msg_a->struct_v = 1;
303 msg_a->global_id = cpu_to_le64(ac->global_id);
304 msg_a->service_id = cpu_to_le32(th->service);
305 msg_a->ticket_blob.struct_v = 1;
306 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
307 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
308 if (ticket_blob_len) {
309 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
310 th->ticket_blob->vec.iov_len);
311 }
312 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
313 le64_to_cpu(msg_a->ticket_blob.secret_id));
314
315 p = msg_a + 1;
316 p += ticket_blob_len;
317 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
318
319 get_random_bytes(&au->nonce, sizeof(au->nonce));
320 msg_b.struct_v = 1;
321 msg_b.nonce = cpu_to_le64(au->nonce);
322 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
323 p, end - p);
324 if (ret < 0)
325 goto out_buf;
326 p += ret;
327 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
328 dout(" built authorizer nonce %llx len %d\n", au->nonce,
329 (int)au->buf->vec.iov_len);
330 BUG_ON(au->buf->vec.iov_len > maxlen);
331 return 0;
332
333out_buf:
334 ceph_buffer_put(au->buf);
335 au->buf = NULL;
336 return ret;
337}
338
339static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
340 void **p, void *end)
341{
342 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
343 ceph_encode_8(p, 1);
344 ceph_encode_64(p, th->secret_id);
345 if (th->ticket_blob) {
346 const char *buf = th->ticket_blob->vec.iov_base;
347 u32 len = th->ticket_blob->vec.iov_len;
348
349 ceph_encode_32_safe(p, end, len, bad);
350 ceph_encode_copy_safe(p, end, buf, len, bad);
351 } else {
352 ceph_encode_32_safe(p, end, 0, bad);
353 }
354
355 return 0;
356bad:
357 return -ERANGE;
358}
359
360static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
361{
362 int want = ac->want_keys;
363 struct ceph_x_info *xi = ac->private;
364 int service;
365
366 *pneed = ac->want_keys & ~(xi->have_keys);
367
368 for (service = 1; service <= want; service <<= 1) {
369 struct ceph_x_ticket_handler *th;
370
371 if (!(ac->want_keys & service))
372 continue;
373
374 if (*pneed & service)
375 continue;
376
377 th = get_ticket_handler(ac, service);
378
379 if (IS_ERR(th)) {
380 *pneed |= service;
381 continue;
382 }
383
384 if (get_seconds() >= th->renew_after)
385 *pneed |= service;
386 if (get_seconds() >= th->expires)
387 xi->have_keys &= ~service;
388 }
389}
390
391
392static int ceph_x_build_request(struct ceph_auth_client *ac,
393 void *buf, void *end)
394{
395 struct ceph_x_info *xi = ac->private;
396 int need;
397 struct ceph_x_request_header *head = buf;
398 int ret;
399 struct ceph_x_ticket_handler *th =
400 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
401
402 if (IS_ERR(th))
403 return PTR_ERR(th);
404
405 ceph_x_validate_tickets(ac, &need);
406
407 dout("build_request want %x have %x need %x\n",
408 ac->want_keys, xi->have_keys, need);
409
410 if (need & CEPH_ENTITY_TYPE_AUTH) {
411 struct ceph_x_authenticate *auth = (void *)(head + 1);
412 void *p = auth + 1;
413 struct ceph_x_challenge_blob tmp;
414 char tmp_enc[40];
415 u64 *u;
416
417 if (p > end)
418 return -ERANGE;
419
420 dout(" get_auth_session_key\n");
421 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
422
423 /* encrypt and hash */
424 get_random_bytes(&auth->client_challenge, sizeof(u64));
425 tmp.client_challenge = auth->client_challenge;
426 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
427 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
428 tmp_enc, sizeof(tmp_enc));
429 if (ret < 0)
430 return ret;
431
432 auth->struct_v = 1;
433 auth->key = 0;
434 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
435 auth->key ^= *(__le64 *)u;
436 dout(" server_challenge %llx client_challenge %llx key %llx\n",
437 xi->server_challenge, le64_to_cpu(auth->client_challenge),
438 le64_to_cpu(auth->key));
439
440 /* now encode the old ticket if it exists */
441 ret = ceph_x_encode_ticket(th, &p, end);
442 if (ret < 0)
443 return ret;
444
445 return p - buf;
446 }
447
448 if (need) {
449 void *p = head + 1;
450 struct ceph_x_service_ticket_request *req;
451
452 if (p > end)
453 return -ERANGE;
454 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
455
456 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
457 if (ret)
458 return ret;
459 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
460 xi->auth_authorizer.buf->vec.iov_len);
461
462 req = p;
463 req->keys = cpu_to_le32(need);
464 p += sizeof(*req);
465 return p - buf;
466 }
467
468 return 0;
469}
470
471static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
472 void *buf, void *end)
473{
474 struct ceph_x_info *xi = ac->private;
475 struct ceph_x_reply_header *head = buf;
476 struct ceph_x_ticket_handler *th;
477 int len = end - buf;
478 int op;
479 int ret;
480
481 if (result)
482 return result; /* XXX hmm? */
483
484 if (xi->starting) {
485 /* it's a hello */
486 struct ceph_x_server_challenge *sc = buf;
487
488 if (len != sizeof(*sc))
489 return -EINVAL;
490 xi->server_challenge = le64_to_cpu(sc->server_challenge);
491 dout("handle_reply got server challenge %llx\n",
492 xi->server_challenge);
493 xi->starting = false;
494 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
495 return -EAGAIN;
496 }
497
498 op = le16_to_cpu(head->op);
499 result = le32_to_cpu(head->result);
500 dout("handle_reply op %d result %d\n", op, result);
501 switch (op) {
502 case CEPHX_GET_AUTH_SESSION_KEY:
503 /* verify auth key */
504 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
505 buf + sizeof(*head), end);
506 break;
507
508 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
509 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
510 if (IS_ERR(th))
511 return PTR_ERR(th);
512 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
513 buf + sizeof(*head), end);
514 break;
515
516 default:
517 return -EINVAL;
518 }
519 if (ret)
520 return ret;
521 if (ac->want_keys == xi->have_keys)
522 return 0;
523 return -EAGAIN;
524}
525
526static int ceph_x_create_authorizer(
527 struct ceph_auth_client *ac, int peer_type,
528 struct ceph_authorizer **a,
529 void **buf, size_t *len,
530 void **reply_buf, size_t *reply_len)
531{
532 struct ceph_x_authorizer *au;
533 struct ceph_x_ticket_handler *th;
534 int ret;
535
536 th = get_ticket_handler(ac, peer_type);
537 if (IS_ERR(th))
538 return PTR_ERR(th);
539
540 au = kzalloc(sizeof(*au), GFP_NOFS);
541 if (!au)
542 return -ENOMEM;
543
544 ret = ceph_x_build_authorizer(ac, th, au);
545 if (ret) {
546 kfree(au);
547 return ret;
548 }
549
550 *a = (struct ceph_authorizer *)au;
551 *buf = au->buf->vec.iov_base;
552 *len = au->buf->vec.iov_len;
553 *reply_buf = au->reply_buf;
554 *reply_len = sizeof(au->reply_buf);
555 return 0;
556}
557
558static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
559 struct ceph_authorizer *a, size_t len)
560{
561 struct ceph_x_authorizer *au = (void *)a;
562 struct ceph_x_ticket_handler *th;
563 int ret = 0;
564 struct ceph_x_authorize_reply reply;
565 void *p = au->reply_buf;
566 void *end = p + sizeof(au->reply_buf);
567
568 th = get_ticket_handler(ac, au->service);
569 if (IS_ERR(th))
570 return PTR_ERR(th);
571 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
572 if (ret < 0)
573 return ret;
574 if (ret != sizeof(reply))
575 return -EPERM;
576
577 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
578 ret = -EPERM;
579 else
580 ret = 0;
581 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
582 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
583 return ret;
584}
585
586static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
587 struct ceph_authorizer *a)
588{
589 struct ceph_x_authorizer *au = (void *)a;
590
591 ceph_buffer_put(au->buf);
592 kfree(au);
593}
594
595
596static void ceph_x_reset(struct ceph_auth_client *ac)
597{
598 struct ceph_x_info *xi = ac->private;
599
600 dout("reset\n");
601 xi->starting = true;
602 xi->server_challenge = 0;
603}
604
605static void ceph_x_destroy(struct ceph_auth_client *ac)
606{
607 struct ceph_x_info *xi = ac->private;
608 struct rb_node *p;
609
610 dout("ceph_x_destroy %p\n", ac);
611 ceph_crypto_key_destroy(&xi->secret);
612
613 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
614 struct ceph_x_ticket_handler *th =
615 rb_entry(p, struct ceph_x_ticket_handler, node);
616 remove_ticket_handler(ac, th);
617 }
618
619 if (xi->auth_authorizer.buf)
620 ceph_buffer_put(xi->auth_authorizer.buf);
621
622 kfree(ac->private);
623 ac->private = NULL;
624}
625
626static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
627 int peer_type)
628{
629 struct ceph_x_ticket_handler *th;
630
631 th = get_ticket_handler(ac, peer_type);
632 if (!IS_ERR(th))
633 remove_ticket_handler(ac, th);
634}
635
636
637static const struct ceph_auth_client_ops ceph_x_ops = {
638 .name = "x",
639 .is_authenticated = ceph_x_is_authenticated,
640 .should_authenticate = ceph_x_should_authenticate,
641 .build_request = ceph_x_build_request,
642 .handle_reply = ceph_x_handle_reply,
643 .create_authorizer = ceph_x_create_authorizer,
644 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
645 .destroy_authorizer = ceph_x_destroy_authorizer,
646 .invalidate_authorizer = ceph_x_invalidate_authorizer,
647 .reset = ceph_x_reset,
648 .destroy = ceph_x_destroy,
649};
650
651
652int ceph_x_init(struct ceph_auth_client *ac)
653{
654 struct ceph_x_info *xi;
655 int ret;
656
657 dout("ceph_x_init %p\n", ac);
658 ret = -ENOMEM;
659 xi = kzalloc(sizeof(*xi), GFP_NOFS);
660 if (!xi)
661 goto out;
662
663 ret = -EINVAL;
664 if (!ac->secret) {
665 pr_err("no secret set (for auth_x protocol)\n");
666 goto out_nomem;
667 }
668
669 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
670 if (ret)
671 goto out_nomem;
672
673 xi->starting = true;
674 xi->ticket_handlers = RB_ROOT;
675
676 ac->protocol = CEPH_AUTH_CEPHX;
677 ac->private = xi;
678 ac->ops = &ceph_x_ops;
679 return 0;
680
681out_nomem:
682 kfree(xi);
683out:
684 return ret;
685}
686
687
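The authorize path above rests on a simple freshness check: ceph_x_build_authorizer() seals a random nonce inside the encrypted ceph_x_authorize_b blob, and ceph_x_verify_authorizer_reply() accepts only a reply whose decrypted payload carries nonce + 1. A minimal standalone sketch of that compare (check_authorizer_reply() is a hypothetical stand-in for the decrypt-then-compare done above; host byte order for brevity):

#include <stdint.h>

/* mirrors struct ceph_x_authorize_reply, host byte order */
struct authorize_reply {
	uint8_t  struct_v;
	uint64_t nonce_plus_one;
};

/* server proves it could decrypt our blob by echoing nonce + 1 */
static int check_authorizer_reply(uint64_t sent_nonce,
				  const struct authorize_reply *reply)
{
	return (reply->nonce_plus_one == sent_nonce + 1) ? 0 : -1 /* -EPERM above */;
}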
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
deleted file mode 100644
index ff6f8180e681..000000000000
--- a/fs/ceph/auth_x.h
+++ /dev/null
@@ -1,49 +0,0 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
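struct ceph_x_ticket_handler above carries jiffies-based renew_after and expires stamps for each per-service ticket. A sketch of how a caller could consult them (hypothetical helpers, not part of this header; the real checks live in the ticket-handling code):

#include <linux/jiffies.h>

/* usable until it hard-expires */
static inline bool ticket_usable(const struct ceph_x_ticket_handler *th)
{
	return time_before(jiffies, th->expires);
}

/* but start asking for a fresh ticket once renew_after has passed */
static inline bool ticket_wants_renewal(const struct ceph_x_ticket_handler *th)
{
	return time_after_eq(jiffies, th->renew_after);
}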
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
deleted file mode 100644
index 671d30576c4f..000000000000
--- a/fs/ceph/auth_x_protocol.h
+++ /dev/null
@@ -1,90 +0,0 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
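Every encrypted cephx payload is prefixed by a ceph_x_encrypt_header, so after decryption the receiver can sanity-check CEPHX_ENC_MAGIC before trusting what follows. A small userspace-style sketch of that check (it assumes a little-endian host so the __le64 compare degenerates to a plain compare):

#include <stdint.h>

#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull

struct enc_header {
	uint8_t  struct_v;
	uint64_t magic;		/* __le64 on the wire */
} __attribute__ ((packed));

/* nonzero if the decrypted buffer starts with a valid bundle header */
static int enc_magic_ok(const struct enc_header *h)
{
	return h->magic == CEPHX_ENC_MAGIC;
}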
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
deleted file mode 100644
index cd39f17021de..000000000000
--- a/fs/ceph/buffer.c
+++ /dev/null
@@ -1,65 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
51{
52 size_t len;
53
54 ceph_decode_need(p, end, sizeof(u32), bad);
55 len = ceph_decode_32(p);
56 dout("decode_buffer len %d\n", (int)len);
57 ceph_decode_need(p, end, len, bad);
58 *b = ceph_buffer_new(len, GFP_NOFS);
59 if (!*b)
60 return -ENOMEM;
61 ceph_decode_copy(p, (*b)->vec.iov_base, len);
62 return 0;
63bad:
64 return -EINVAL;
65}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
deleted file mode 100644
index 58d19014068f..000000000000
--- a/fs/ceph/buffer.h
+++ /dev/null
@@ -1,39 +0,0 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
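The header above defines the whole ceph_buffer lifecycle: ceph_buffer_new() hands back one reference, ceph_buffer_get() takes another, and the final ceph_buffer_put() drops into ceph_buffer_release(), which frees with kfree() or vfree() depending on is_vmalloc. A kernel-style usage sketch (fill_and_drop() is a hypothetical caller):

/* hypothetical caller showing the refcount contract */
static int fill_and_drop(const void *data, size_t len)
{
	struct ceph_buffer *b = ceph_buffer_new(len, GFP_NOFS);	/* ref = 1 */

	if (!b)
		return -ENOMEM;
	memcpy(b->vec.iov_base, data, len);
	/* a consumer that keeps the buffer takes its own ceph_buffer_get(b) */
	ceph_buffer_put(b);	/* last put ends up in ceph_buffer_release() */
	return 0;
}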
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a2069b6680ae..98ab13e2b71d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/fs.h>
 #include <linux/kernel.h>
@@ -9,8 +9,9 @@
 #include <linux/writeback.h>
 
 #include "super.h"
-#include "decode.h"
-#include "messenger.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>
 
 /*
  * Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
 	spin_unlock(&mdsc->caps_list_lock);
 }
 
-void ceph_reservation_status(struct ceph_client *client,
+void ceph_reservation_status(struct ceph_fs_client *fsc,
 			     int *total, int *avail, int *used, int *reserved,
 			     int *min)
 {
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 
 	if (total)
 		*total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 			       struct ceph_inode_info *ci)
 {
-	struct ceph_mount_args *ma = mdsc->client->mount_args;
+	struct ceph_mount_options *ma = mdsc->fsc->mount_options;
 
 	ci->i_hold_caps_min = round_jiffies(jiffies +
 					    ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
 		 unsigned seq, unsigned mseq, u64 realmino, int flags,
 		 struct ceph_cap_reservation *caps_reservation)
 {
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_cap *new_cap = NULL;
 	struct ceph_cap *cap;
@@ -814,7 +815,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
 		used |= CEPH_CAP_PIN;
 	if (ci->i_rd_ref)
 		used |= CEPH_CAP_FILE_RD;
-	if (ci->i_rdcache_ref || ci->i_rdcache_gen)
+	if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
 		used |= CEPH_CAP_FILE_CACHE;
 	if (ci->i_wr_ref)
 		used |= CEPH_CAP_FILE_WR;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
 	struct ceph_mds_client *mdsc =
-		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	int removed = 0;
 
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1195,10 +1196,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
  * asynchronously back to the MDS once sync writes complete and dirty
  * data is written out.
  *
+ * Unless @again is true, skip cap_snaps that were already sent to
+ * the MDS (i.e., during this session).
+ *
  * Called under i_lock.  Takes s_mutex as needed.
  */
 void __ceph_flush_snaps(struct ceph_inode_info *ci,
-			struct ceph_mds_session **psession)
+			struct ceph_mds_session **psession,
+			int again)
 		__releases(ci->vfs_inode->i_lock)
 		__acquires(ci->vfs_inode->i_lock)
 {
@@ -1206,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
 	int mds;
 	struct ceph_cap_snap *capsnap;
 	u32 mseq;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
 						    session->s_mutex */
 	u64 next_follows = 0;  /* keep track of how far we've gotten through the
@@ -1227,7 +1232,7 @@ retry:
 		 * pages to be written out.
 		 */
 		if (capsnap->dirty_pages || capsnap->writing)
-			continue;
+			break;
 
 		/*
 		 * if cap writeback already occurred, we should have dropped
@@ -1240,6 +1245,13 @@ retry:
 			dout("no auth cap (migrating?), doing nothing\n");
 			goto out;
 		}
+
+		/* only flush each capsnap once */
+		if (!again && !list_empty(&capsnap->flushing_item)) {
+			dout("already flushed %p, skipping\n", capsnap);
+			continue;
+		}
+
 		mds = ci->i_auth_cap->session->s_mds;
 		mseq = ci->i_auth_cap->mseq;
 
@@ -1276,8 +1288,8 @@ retry:
 			      &session->s_cap_snaps_flushing);
 		spin_unlock(&inode->i_lock);
 
-		dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
-		     inode, capsnap, next_follows, capsnap->size);
+		dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
+		     inode, capsnap, capsnap->follows, capsnap->flush_tid);
 		send_cap_msg(session, ceph_vino(inode).ino, 0,
 			     CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
 			     capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
@@ -1314,7 +1326,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 	struct inode *inode = &ci->vfs_inode;
 
 	spin_lock(&inode->i_lock);
-	__ceph_flush_snaps(ci, NULL);
+	__ceph_flush_snaps(ci, NULL, 0);
 	spin_unlock(&inode->i_lock);
 }
 
@@ -1325,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
 	struct ceph_mds_client *mdsc =
-		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	int was = ci->i_dirty_caps;
 	int dirty = 0;
@@ -1367,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
 				struct ceph_mds_session *session)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int flushing;
 
@@ -1405,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
 /*
  * try to invalidate mapping pages without blocking.
  */
-static int mapping_is_empty(struct address_space *mapping)
-{
-	struct page *page = find_get_page(mapping, 0);
-
-	if (!page)
-		return 1;
-
-	put_page(page);
-	return 0;
-}
-
 static int try_nonblocking_invalidate(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1425,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
 	invalidate_mapping_pages(&inode->i_data, 0, -1);
 	spin_lock(&inode->i_lock);
 
-	if (mapping_is_empty(&inode->i_data) &&
+	if (inode->i_data.nrpages == 0 &&
 	    invalidating_gen == ci->i_rdcache_gen) {
 		/* success. */
 		dout("try_nonblocking_invalidate %p success\n", inode);
@@ -1451,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		     struct ceph_mds_session *session)
 {
-	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
-	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
 	int file_wanted, used;
@@ -1477,7 +1478,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 
 	/* flush snaps first time around only */
 	if (!list_empty(&ci->i_cap_snaps))
-		__ceph_flush_snaps(ci, &session);
+		__ceph_flush_snaps(ci, &session, 0);
 	goto retry_locked;
 retry:
 	spin_lock(&inode->i_lock);
@@ -1522,7 +1523,7 @@ retry_locked:
 	 */
 	if ((!is_delayed || mdsc->stopping) &&
 	    ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
-	    ci->i_rdcache_gen &&                     /* may have cached pages */
+	    inode->i_data.nrpages &&                 /* have cached pages */
 	    (file_wanted == 0 ||                     /* no open files */
 	     (revoking & (CEPH_CAP_FILE_CACHE|
 			  CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
@@ -1695,7 +1696,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
 			  unsigned *flush_tid)
 {
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int unlock_session = session ? 0 : 1;
 	int flushing = 0;
@@ -1861,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 			   caps_are_flushed(inode, flush_tid));
 	} else {
 		struct ceph_mds_client *mdsc =
-			&ceph_sb_to_client(inode->i_sb)->mdsc;
+			ceph_sb_to_client(inode->i_sb)->mdsc;
 
 		spin_lock(&inode->i_lock);
 		if (__ceph_caps_dirty(ci))
@@ -1894,7 +1895,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
 		if (cap && cap->session == session) {
 			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
 			     cap, capsnap);
-			__ceph_flush_snaps(ci, &session);
+			__ceph_flush_snaps(ci, &session, 1);
 		} else {
 			pr_err("%p auth cap %p not mds%d ???\n", inode,
 			       cap, session->s_mds);
@@ -2272,7 +2273,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
-	int seq = le32_to_cpu(grant->seq);
+	unsigned seq = le32_to_cpu(grant->seq);
+	unsigned issue_seq = le32_to_cpu(grant->issue_seq);
 	int newcaps = le32_to_cpu(grant->caps);
 	int issued, implemented, used, wanted, dirty;
 	u64 size = le64_to_cpu(grant->size);
@@ -2284,8 +2286,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	int revoked_rdcache = 0;
 	int queue_invalidate = 0;
 
-	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
-	     inode, cap, mds, seq, ceph_cap_string(newcaps));
+	dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
+	     inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
 	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
 	     inode->i_size);
 
@@ -2381,6 +2383,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	}
 
 	cap->seq = seq;
+	cap->issue_seq = issue_seq;
 
 	/* file layout may have changed */
 	ci->i_layout = grant->layout;
@@ -2452,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 	__releases(inode->i_lock)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	unsigned seq = le32_to_cpu(m->seq);
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
@@ -2700,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		      struct ceph_msg *msg)
 {
 	struct ceph_mds_client *mdsc = session->s_mdsc;
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	struct inode *inode;
 	struct ceph_cap *cap;
 	struct ceph_mds_caps *h;
@@ -2763,15 +2766,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		if (op == CEPH_CAP_OP_IMPORT)
 			__queue_cap_release(session, vino.ino, cap_id,
 					    mseq, seq);
-
-		/*
-		 * send any full release message to try to move things
-		 * along for the mds (who clearly thinks we still have this
-		 * cap).
-		 */
-		ceph_add_cap_releases(mdsc, session);
-		ceph_send_cap_releases(mdsc, session);
-		goto done;
+		goto flush_cap_releases;
 	}
 
 	/* these will work even if we don't have a cap yet */
@@ -2799,7 +2794,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		dout(" no cap on %p ino %llx.%llx from mds%d\n",
 		     inode, ceph_ino(inode), ceph_snap(inode), mds);
 		spin_unlock(&inode->i_lock);
-		goto done;
+		goto flush_cap_releases;
 	}
 
 	/* note that each of these drops i_lock for us */
@@ -2823,6 +2818,17 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		       ceph_cap_op_name(op));
 	}
 
+	goto done;
+
+flush_cap_releases:
+	/*
+	 * send any full release message to try to move things
+	 * along for the mds (who clearly thinks we still have this
+	 * cap).
+	 */
+	ceph_add_cap_releases(mdsc, session);
+	ceph_send_cap_releases(mdsc, session);
+
 done:
 	mutex_unlock(&session->s_mutex);
 done_unlocked:
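A recurring simplification in this caps.c hunk set is dropping the open-coded mapping_is_empty() probe in favor of reading i_data.nrpages, the cached-page count the VFS already maintains for every address_space. The first helper below is the one the patch removes; the second shows the field read that replaces it inline:

/* before: probe for page 0 and infer emptiness indirectly */
static int mapping_empty_old(struct address_space *mapping)
{
	struct page *page = find_get_page(mapping, 0);

	if (!page)
		return 1;
	put_page(page);
	return 0;
}

/* after: the VFS already counts cached pages for us */
static int mapping_empty_new(struct address_space *mapping)
{
	return mapping->nrpages == 0;
}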
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
deleted file mode 100644
index 1818c2305610..000000000000
--- a/fs/ceph/ceph_debug.h
+++ /dev/null
@@ -1,37 +0,0 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
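Either way dout() keeps printf-style semantics, so call sites do not change when CONFIG_CEPH_FS_PRETTYDEBUG is toggled. A hypothetical call site:

/*
 * Hypothetical call site.  With PRETTYDEBUG plus DEBUG or
 * CONFIG_DYNAMIC_DEBUG this prints with a "    inode.c:123 : " prefix;
 * without PRETTYDEBUG it is a plain pr_debug(); otherwise it only
 * type-checks its arguments and compiles away.
 */
dout("fill_inode %p mode 0%o\n", inode, inode->i_mode);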
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index ab6cf35c4091..bdce8b1fbd06 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
 /*
  * Ceph 'frag' type
  */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 
 int ceph_frag_compare(__u32 a, __u32 b)
 {
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
deleted file mode 100644
index 5babb8e95352..000000000000
--- a/fs/ceph/ceph_frag.h
+++ /dev/null
@@ -1,109 +0,0 @@
1#ifndef FS_CEPH_FRAG_H
2#define FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
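A concrete example makes the encoding easier to follow. With bits = 1 and the value kept in the top of the 24-bit field, ceph_frag_make(1, 0x800000) names the upper half of the number space, and the child helpers split it further. The values below were computed by hand from the inline functions above:

__u32 half  = ceph_frag_make(1, 0x800000);	/* 0x01800000: upper half */
__u32 left  = ceph_frag_left_child(half);	/* 0x02800000: [0x800000, 0xc00000) */
__u32 right = ceph_frag_right_child(half);	/* 0x02c00000: [0xc00000, 0xffffff] */

/* 0x900000 lies in the upper half, and within its left child: */
BUG_ON(!ceph_frag_contains_value(half, 0x900000));
BUG_ON(!ceph_frag_contains_value(left, 0x900000));
BUG_ON(ceph_frag_contains_value(right, 0x900000));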
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
deleted file mode 100644
index 3ac6cc7c1156..000000000000
--- a/fs/ceph/ceph_fs.c
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
28
29
30int ceph_flags_to_mode(int flags)
31{
32 int mode;
33
34#ifdef O_DIRECTORY /* fixme */
35 if ((flags & O_DIRECTORY) == O_DIRECTORY)
36 return CEPH_FILE_MODE_PIN;
37#endif
38 if ((flags & O_APPEND) == O_APPEND)
39 flags |= O_WRONLY;
40
41 if ((flags & O_ACCMODE) == O_RDWR)
42 mode = CEPH_FILE_MODE_RDWR;
43 else if ((flags & O_ACCMODE) == O_WRONLY)
44 mode = CEPH_FILE_MODE_WR;
45 else
46 mode = CEPH_FILE_MODE_RD;
47
48#ifdef O_LAZY
49 if (flags & O_LAZY)
50 mode |= CEPH_FILE_MODE_LAZY;
51#endif
52
53 return mode;
54}
55
56int ceph_caps_for_mode(int mode)
57{
58 int caps = CEPH_CAP_PIN;
59
60 if (mode & CEPH_FILE_MODE_RD)
61 caps |= CEPH_CAP_FILE_SHARED |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
63 if (mode & CEPH_FILE_MODE_WR)
64 caps |= CEPH_CAP_FILE_EXCL |
65 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
66 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
67 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
68 if (mode & CEPH_FILE_MODE_LAZY)
69 caps |= CEPH_CAP_FILE_LAZYIO;
70
71 return caps;
72}
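The two helpers compose: open(2) flags pick a CEPH_FILE_MODE_*, and the mode picks the capability bits the client will request from the MDS. A sketch using only the functions above:

int mode = ceph_flags_to_mode(O_RDWR);	/* -> CEPH_FILE_MODE_RDWR */
int caps = ceph_caps_for_mode(mode);
/*
 * caps now holds CEPH_CAP_PIN, the read side (FILE_SHARED, FILE_RD,
 * FILE_CACHE) and the write side (FILE_EXCL, FILE_WR, FILE_BUFFER,
 * plus the AUTH_* and XATTR_* shared/excl bits).
 */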
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
deleted file mode 100644
index d5619ac86711..000000000000
--- a/fs/ceph/ceph_fs.h
+++ /dev/null
@@ -1,728 +0,0 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef CEPH_FS_H
13#define CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * subprotocol versions. when specific messages types or high-level
20 * protocols change, bump the affected components. we keep rev
21 * internal cluster protocols separately from the public,
22 * client-facing protocol.
23 */
24#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
25#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
26#define CEPH_MON_PROTOCOL 5 /* cluster internal */
27#define CEPH_OSDC_PROTOCOL 24 /* server/client */
28#define CEPH_MDSC_PROTOCOL 32 /* server/client */
29#define CEPH_MONC_PROTOCOL 15 /* server/client */
30
31
32#define CEPH_INO_ROOT 1
33#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
34
35/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
36#define CEPH_MAX_MON 31
37
38
39/*
40 * feature bits
41 */
42#define CEPH_FEATURE_UID (1<<0)
43#define CEPH_FEATURE_NOSRCADDR (1<<1)
44#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
45#define CEPH_FEATURE_FLOCK (1<<3)
46
47
48/*
49 * ceph_file_layout - describe data layout for a file/inode
50 */
51struct ceph_file_layout {
52 /* file -> object mapping */
53 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
54 of page size. */
55 __le32 fl_stripe_count; /* over this many objects */
56 __le32 fl_object_size; /* until objects are this big, then move to
57 new objects */
58 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
59
60 /* pg -> disk layout */
61 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
62
63 /* object -> pg layout */
64 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
65 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
66} __attribute__ ((packed));
67
68#define CEPH_MIN_STRIPE_UNIT 65536
69
70int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
71
72
73/* crypto algorithms */
74#define CEPH_CRYPTO_NONE 0x0
75#define CEPH_CRYPTO_AES 0x1
76
77#define CEPH_AES_IV "cephsageyudagreg"
78
79/* security/authentication protocols */
80#define CEPH_AUTH_UNKNOWN 0x0
81#define CEPH_AUTH_NONE 0x1
82#define CEPH_AUTH_CEPHX 0x2
83
84#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
85
86
87/*********************************************
88 * message layer
89 */
90
91/*
92 * message types
93 */
94
95/* misc */
96#define CEPH_MSG_SHUTDOWN 1
97#define CEPH_MSG_PING 2
98
99/* client <-> monitor */
100#define CEPH_MSG_MON_MAP 4
101#define CEPH_MSG_MON_GET_MAP 5
102#define CEPH_MSG_STATFS 13
103#define CEPH_MSG_STATFS_REPLY 14
104#define CEPH_MSG_MON_SUBSCRIBE 15
105#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
106#define CEPH_MSG_AUTH 17
107#define CEPH_MSG_AUTH_REPLY 18
108
109/* client <-> mds */
110#define CEPH_MSG_MDS_MAP 21
111
112#define CEPH_MSG_CLIENT_SESSION 22
113#define CEPH_MSG_CLIENT_RECONNECT 23
114
115#define CEPH_MSG_CLIENT_REQUEST 24
116#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
117#define CEPH_MSG_CLIENT_REPLY 26
118#define CEPH_MSG_CLIENT_CAPS 0x310
119#define CEPH_MSG_CLIENT_LEASE 0x311
120#define CEPH_MSG_CLIENT_SNAP 0x312
121#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
122
123/* pool ops */
124#define CEPH_MSG_POOLOP_REPLY 48
125#define CEPH_MSG_POOLOP 49
126
127
128/* osd */
129#define CEPH_MSG_OSD_MAP 41
130#define CEPH_MSG_OSD_OP 42
131#define CEPH_MSG_OSD_OPREPLY 43
132
133/* pool operations */
134enum {
135 POOL_OP_CREATE = 0x01,
136 POOL_OP_DELETE = 0x02,
137 POOL_OP_AUID_CHANGE = 0x03,
138 POOL_OP_CREATE_SNAP = 0x11,
139 POOL_OP_DELETE_SNAP = 0x12,
140 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
141 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
142};
143
144struct ceph_mon_request_header {
145 __le64 have_version;
146 __le16 session_mon;
147 __le64 session_mon_tid;
148} __attribute__ ((packed));
149
150struct ceph_mon_statfs {
151 struct ceph_mon_request_header monhdr;
152 struct ceph_fsid fsid;
153} __attribute__ ((packed));
154
155struct ceph_statfs {
156 __le64 kb, kb_used, kb_avail;
157 __le64 num_objects;
158} __attribute__ ((packed));
159
160struct ceph_mon_statfs_reply {
161 struct ceph_fsid fsid;
162 __le64 version;
163 struct ceph_statfs st;
164} __attribute__ ((packed));
165
166const char *ceph_pool_op_name(int op);
167
168struct ceph_mon_poolop {
169 struct ceph_mon_request_header monhdr;
170 struct ceph_fsid fsid;
171 __le32 pool;
172 __le32 op;
173 __le64 auid;
174 __le64 snapid;
175 __le32 name_len;
176} __attribute__ ((packed));
177
178struct ceph_mon_poolop_reply {
179 struct ceph_mon_request_header monhdr;
180 struct ceph_fsid fsid;
181 __le32 reply_code;
182 __le32 epoch;
183 char has_data;
184 char data[0];
185} __attribute__ ((packed));
186
187struct ceph_mon_unmanaged_snap {
188 __le64 snapid;
189} __attribute__ ((packed));
190
191struct ceph_osd_getmap {
192 struct ceph_mon_request_header monhdr;
193 struct ceph_fsid fsid;
194 __le32 start;
195} __attribute__ ((packed));
196
197struct ceph_mds_getmap {
198 struct ceph_mon_request_header monhdr;
199 struct ceph_fsid fsid;
200} __attribute__ ((packed));
201
202struct ceph_client_mount {
203 struct ceph_mon_request_header monhdr;
204} __attribute__ ((packed));
205
206struct ceph_mon_subscribe_item {
207 __le64 have_version; __le64 have;
208 __u8 onetime;
209} __attribute__ ((packed));
210
211struct ceph_mon_subscribe_ack {
212 __le32 duration; /* seconds */
213 struct ceph_fsid fsid;
214} __attribute__ ((packed));
215
216/*
217 * mds states
218 * > 0 -> in
219 * <= 0 -> out
220 */
221#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
222#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
223 empty log. */
224#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
225#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
226#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
227#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
228#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
229
230#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
231#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
232 operations (import, rename, etc.) */
233#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
234#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
235#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
236#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
237#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
238
239extern const char *ceph_mds_state_name(int s);
240
241
242/*
243 * metadata lock types.
244 * - these are bitmasks.. we can compose them
245 * - they also define the lock ordering by the MDS
246 * - a few of these are internal to the mds
247 */
248#define CEPH_LOCK_DVERSION 1
249#define CEPH_LOCK_DN 2
250#define CEPH_LOCK_ISNAP 16
251#define CEPH_LOCK_IVERSION 32 /* mds internal */
252#define CEPH_LOCK_IFILE 64
253#define CEPH_LOCK_IAUTH 128
254#define CEPH_LOCK_ILINK 256
255#define CEPH_LOCK_IDFT 512 /* dir frag tree */
256#define CEPH_LOCK_INEST 1024 /* mds internal */
257#define CEPH_LOCK_IXATTR 2048
258#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
259#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
260
261/* client_session ops */
262enum {
263 CEPH_SESSION_REQUEST_OPEN,
264 CEPH_SESSION_OPEN,
265 CEPH_SESSION_REQUEST_CLOSE,
266 CEPH_SESSION_CLOSE,
267 CEPH_SESSION_REQUEST_RENEWCAPS,
268 CEPH_SESSION_RENEWCAPS,
269 CEPH_SESSION_STALE,
270 CEPH_SESSION_RECALL_STATE,
271};
272
273extern const char *ceph_session_op_name(int op);
274
275struct ceph_mds_session_head {
276 __le32 op;
277 __le64 seq;
278 struct ceph_timespec stamp;
279 __le32 max_caps, max_leases;
280} __attribute__ ((packed));
281
282/* client_request */
283/*
284 * metadata ops.
285 * & 0x001000 -> write op
286 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
287 * & 0x100000 -> use weird ino/path trace
288 */
289#define CEPH_MDS_OP_WRITE 0x001000
290enum {
291 CEPH_MDS_OP_LOOKUP = 0x00100,
292 CEPH_MDS_OP_GETATTR = 0x00101,
293 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
294 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
295
296 CEPH_MDS_OP_SETXATTR = 0x01105,
297 CEPH_MDS_OP_RMXATTR = 0x01106,
298 CEPH_MDS_OP_SETLAYOUT = 0x01107,
299 CEPH_MDS_OP_SETATTR = 0x01108,
300 CEPH_MDS_OP_SETFILELOCK= 0x01109,
301 CEPH_MDS_OP_GETFILELOCK= 0x00110,
302
303 CEPH_MDS_OP_MKNOD = 0x01201,
304 CEPH_MDS_OP_LINK = 0x01202,
305 CEPH_MDS_OP_UNLINK = 0x01203,
306 CEPH_MDS_OP_RENAME = 0x01204,
307 CEPH_MDS_OP_MKDIR = 0x01220,
308 CEPH_MDS_OP_RMDIR = 0x01221,
309 CEPH_MDS_OP_SYMLINK = 0x01222,
310
311 CEPH_MDS_OP_CREATE = 0x01301,
312 CEPH_MDS_OP_OPEN = 0x00302,
313 CEPH_MDS_OP_READDIR = 0x00305,
314
315 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
316 CEPH_MDS_OP_MKSNAP = 0x01400,
317 CEPH_MDS_OP_RMSNAP = 0x01401,
318 CEPH_MDS_OP_LSSNAP = 0x00402,
319};
320
321extern const char *ceph_mds_op_name(int op);
322
323
324#define CEPH_SETATTR_MODE 1
325#define CEPH_SETATTR_UID 2
326#define CEPH_SETATTR_GID 4
327#define CEPH_SETATTR_MTIME 8
328#define CEPH_SETATTR_ATIME 16
329#define CEPH_SETATTR_SIZE 32
330#define CEPH_SETATTR_CTIME 64
331
332union ceph_mds_request_args {
333 struct {
334 __le32 mask; /* CEPH_CAP_* */
335 } __attribute__ ((packed)) getattr;
336 struct {
337 __le32 mode;
338 __le32 uid;
339 __le32 gid;
340 struct ceph_timespec mtime;
341 struct ceph_timespec atime;
342 __le64 size, old_size; /* old_size needed by truncate */
343 __le32 mask; /* CEPH_SETATTR_* */
344 } __attribute__ ((packed)) setattr;
345 struct {
346 __le32 frag; /* which dir fragment */
347 __le32 max_entries; /* how many dentries to grab */
348 __le32 max_bytes;
349 } __attribute__ ((packed)) readdir;
350 struct {
351 __le32 mode;
352 __le32 rdev;
353 } __attribute__ ((packed)) mknod;
354 struct {
355 __le32 mode;
356 } __attribute__ ((packed)) mkdir;
357 struct {
358 __le32 flags;
359 __le32 mode;
360 __le32 stripe_unit; /* layout for newly created file */
361 __le32 stripe_count; /* ... */
362 __le32 object_size;
363 __le32 file_replication;
364 __le32 preferred;
365 } __attribute__ ((packed)) open;
366 struct {
367 __le32 flags;
368 } __attribute__ ((packed)) setxattr;
369 struct {
370 struct ceph_file_layout layout;
371 } __attribute__ ((packed)) setlayout;
372 struct {
373 __u8 rule; /* currently fcntl or flock */
374 __u8 type; /* shared, exclusive, remove */
375 __le64 pid; /* process id requesting the lock */
376 __le64 pid_namespace;
377 __le64 start; /* initial location to lock */
378 __le64 length; /* num bytes to lock from start */
379 __u8 wait; /* will caller wait for lock to become available? */
380 } __attribute__ ((packed)) filelock_change;
381} __attribute__ ((packed));
382
383#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
384#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
385
386struct ceph_mds_request_head {
387 __le64 oldest_client_tid;
388 __le32 mdsmap_epoch; /* on client */
389 __le32 flags; /* CEPH_MDS_FLAG_* */
390 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
391 __le16 num_releases; /* # include cap/lease release records */
392 __le32 op; /* mds op code */
393 __le32 caller_uid, caller_gid;
394 __le64 ino; /* use this ino for openc, mkdir, mknod,
395 etc. (if replaying) */
396 union ceph_mds_request_args args;
397} __attribute__ ((packed));
398
399/* cap/lease release record */
400struct ceph_mds_request_release {
401 __le64 ino, cap_id; /* ino and unique cap id */
402 __le32 caps, wanted; /* new issued, wanted */
403 __le32 seq, issue_seq, mseq;
404 __le32 dname_seq; /* if releasing a dentry lease, a */
405 __le32 dname_len; /* string follows. */
406} __attribute__ ((packed));
407
408/* client reply */
409struct ceph_mds_reply_head {
410 __le32 op;
411 __le32 result;
412 __le32 mdsmap_epoch;
413 __u8 safe; /* true if committed to disk */
414 __u8 is_dentry, is_target; /* true if dentry, target inode records
415 are included with reply */
416} __attribute__ ((packed));
417
418/* one for each node split */
419struct ceph_frag_tree_split {
420 __le32 frag; /* this frag splits... */
421 __le32 by; /* ...by this many bits */
422} __attribute__ ((packed));
423
424struct ceph_frag_tree_head {
425 __le32 nsplits; /* num ceph_frag_tree_split records */
426 struct ceph_frag_tree_split splits[];
427} __attribute__ ((packed));
428
429/* capability issue, for bundling with mds reply */
430struct ceph_mds_reply_cap {
431 __le32 caps, wanted; /* caps issued, wanted */
432 __le64 cap_id;
433 __le32 seq, mseq;
434 __le64 realm; /* snap realm */
435 __u8 flags; /* CEPH_CAP_FLAG_* */
436} __attribute__ ((packed));
437
438#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
439
440/* inode record, for bundling with mds reply */
441struct ceph_mds_reply_inode {
442 __le64 ino;
443 __le64 snapid;
444 __le32 rdev;
445 __le64 version; /* inode version */
446 __le64 xattr_version; /* version for xattr blob */
447 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
448 struct ceph_file_layout layout;
449 struct ceph_timespec ctime, mtime, atime;
450 __le32 time_warp_seq;
451 __le64 size, max_size, truncate_size;
452 __le32 truncate_seq;
453 __le32 mode, uid, gid;
454 __le32 nlink;
455 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
456 struct ceph_timespec rctime;
457 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
458} __attribute__ ((packed));
459/* followed by frag array, then symlink string, then xattr blob */
460
461/* reply_lease follows dname, and reply_inode */
462struct ceph_mds_reply_lease {
463 __le16 mask; /* lease type(s) */
464 __le32 duration_ms; /* lease duration */
465 __le32 seq;
466} __attribute__ ((packed));
467
468struct ceph_mds_reply_dirfrag {
469 __le32 frag; /* fragment */
470 __le32 auth; /* auth mds, if this is a delegation point */
471 __le32 ndist; /* number of mds' this is replicated on */
472 __le32 dist[];
473} __attribute__ ((packed));
474
475#define CEPH_LOCK_FCNTL 1
476#define CEPH_LOCK_FLOCK 2
477
478#define CEPH_LOCK_SHARED 1
479#define CEPH_LOCK_EXCL 2
480#define CEPH_LOCK_UNLOCK 4
481
482struct ceph_filelock {
483 __le64 start;/* file offset to start lock at */
484 __le64 length; /* num bytes to lock; 0 for all following start */
485 __le64 client; /* which client holds the lock */
486 __le64 pid; /* process id holding the lock on the client */
487 __le64 pid_namespace;
488 __u8 type; /* shared lock, exclusive lock, or unlock */
489} __attribute__ ((packed));
490
491
492/* file access modes */
493#define CEPH_FILE_MODE_PIN 0
494#define CEPH_FILE_MODE_RD 1
495#define CEPH_FILE_MODE_WR 2
496#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
497#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
498#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
499
500int ceph_flags_to_mode(int flags);
501
502
503/* capability bits */
504#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
505
506/* generic cap bits */
507#define CEPH_CAP_GSHARED 1 /* client can read */
508#define CEPH_CAP_GEXCL 2 /* client can read and update */
509#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
510#define CEPH_CAP_GRD 8 /* (file) client can read */
511#define CEPH_CAP_GWR 16 /* (file) client can write */
512#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
513#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
514#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
515
516/* per-lock shift */
517#define CEPH_CAP_SAUTH 2
518#define CEPH_CAP_SLINK 4
519#define CEPH_CAP_SXATTR 6
520#define CEPH_CAP_SFILE 8
521#define CEPH_CAP_SFLOCK 20
522
523#define CEPH_CAP_BITS 22
524
525/* composed values */
526#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
527#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
528#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
529#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
530#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
531#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
532#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
533#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
534#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
535#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
536#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
537#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
538#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
539#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
540#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
541#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
542#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
543
544
545/* cap masks (for getattr) */
546#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
547#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
548#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
549#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
550#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
551#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
552#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
553#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
554#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
555#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
556#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
557#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
558#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
559 CEPH_CAP_AUTH_SHARED | \
560 CEPH_CAP_LINK_SHARED | \
561 CEPH_CAP_FILE_SHARED | \
562 CEPH_CAP_XATTR_SHARED)
563
564#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
565 CEPH_CAP_LINK_SHARED | \
566 CEPH_CAP_XATTR_SHARED | \
567 CEPH_CAP_FILE_SHARED)
568#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
569 CEPH_CAP_FILE_CACHE)
570
571#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
572 CEPH_CAP_LINK_EXCL | \
573 CEPH_CAP_XATTR_EXCL | \
574 CEPH_CAP_FILE_EXCL)
575#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
576 CEPH_CAP_FILE_EXCL)
577#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
578#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
579 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
580 CEPH_CAP_PIN)
581
582#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
583 CEPH_LOCK_IXATTR)
584
585int ceph_caps_for_mode(int mode);
586
587enum {
588 CEPH_CAP_OP_GRANT, /* mds->client grant */
589 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
590 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
591 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
592 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
593 CEPH_CAP_OP_UPDATE, /* client->mds update */
594 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
595 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
596 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
597 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
598 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
599 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
600 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
601};
602
603extern const char *ceph_cap_op_name(int op);
604
605/*
606 * caps message, used for capability callbacks, acks, requests, etc.
607 */
608struct ceph_mds_caps {
609 __le32 op; /* CEPH_CAP_OP_* */
610 __le64 ino, realm;
611 __le64 cap_id;
612 __le32 seq, issue_seq;
613 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
614 __le32 migrate_seq;
615 __le64 snap_follows;
616 __le32 snap_trace_len;
617
618 /* authlock */
619 __le32 uid, gid, mode;
620
621 /* linklock */
622 __le32 nlink;
623
624 /* xattrlock */
625 __le32 xattr_len;
626 __le64 xattr_version;
627
628 /* filelock */
629 __le64 size, max_size, truncate_size;
630 __le32 truncate_seq;
631 struct ceph_timespec mtime, atime, ctime;
632 struct ceph_file_layout layout;
633 __le32 time_warp_seq;
634} __attribute__ ((packed));
635
636/* cap release msg head */
637struct ceph_mds_cap_release {
638 __le32 num; /* number of cap_items that follow */
639} __attribute__ ((packed));
640
641struct ceph_mds_cap_item {
642 __le64 ino;
643 __le64 cap_id;
644 __le32 migrate_seq, seq;
645} __attribute__ ((packed));
646
647#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
648#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
649#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
650#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
651
652extern const char *ceph_lease_op_name(int o);
653
654/* lease msg header */
655struct ceph_mds_lease {
656 __u8 action; /* CEPH_MDS_LEASE_* */
657 __le16 mask; /* which lease */
658 __le64 ino;
659 __le64 first, last; /* snap range */
660 __le32 seq;
661 __le32 duration_ms; /* duration of renewal */
662} __attribute__ ((packed));
663/* followed by a __le32+string for dname */
664
665/* client reconnect */
666struct ceph_mds_cap_reconnect {
667 __le64 cap_id;
668 __le32 wanted;
669 __le32 issued;
670 __le64 snaprealm;
671 __le64 pathbase; /* base ino for our path to this ino */
672 __le32 flock_len; /* size of flock state blob, if any */
673} __attribute__ ((packed));
674/* followed by flock blob */
675
676struct ceph_mds_cap_reconnect_v1 {
677 __le64 cap_id;
678 __le32 wanted;
679 __le32 issued;
680 __le64 size;
681 struct ceph_timespec mtime, atime;
682 __le64 snaprealm;
683 __le64 pathbase; /* base ino for our path to this ino */
684} __attribute__ ((packed));
685
686struct ceph_mds_snaprealm_reconnect {
687 __le64 ino; /* snap realm base */
688 __le64 seq; /* snap seq for this snap realm */
689 __le64 parent; /* parent realm */
690} __attribute__ ((packed));
691
692/*
693 * snaps
694 */
695enum {
696 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
697 CEPH_SNAP_OP_CREATE,
698 CEPH_SNAP_OP_DESTROY,
699 CEPH_SNAP_OP_SPLIT,
700};
701
702extern const char *ceph_snap_op_name(int o);
703
704/* snap msg header */
705struct ceph_mds_snap_head {
706 __le32 op; /* CEPH_SNAP_OP_* */
707 __le64 split; /* ino to split off, if any */
708 __le32 num_split_inos; /* # inos belonging to new child realm */
709 __le32 num_split_realms; /* # child realms under new child realm */
710 __le32 trace_len; /* size of snap trace blob */
711} __attribute__ ((packed));
712/* followed by split ino list, then split realms, then the trace blob */
713
714/*
715 * encode info about a snaprealm, as viewed by a client
716 */
717struct ceph_mds_snap_realm {
718 __le64 ino; /* ino */
719 __le64 created; /* snap: when created */
720 __le64 parent; /* ino: parent realm */
721 __le64 parent_since; /* snap: same parent since */
722 __le64 seq; /* snap: version */
723 __le32 num_snaps;
724 __le32 num_prior_parent_snaps;
725} __attribute__ ((packed));
726/* followed by my snap list, then prior parent snap list */
727
728#endif
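The composed capability constants are nothing more than the generic CEPH_CAP_G* bits shifted into a per-lock slot, which is also why CEPH_CAP_BITS is 22 (the flock slot starts at 20 and holds two generic bits). A quick standalone sanity check of a few values, computed from the defines above (not kernel code):

#include <assert.h>

int main(void)
{
	assert((1  << 8)  == 0x0100);	/* CEPH_CAP_FILE_SHARED = GSHARED << SFILE */
	assert((16 << 8)  == 0x1000);	/* CEPH_CAP_FILE_WR     = GWR     << SFILE */
	assert((2  << 2)  == 0x0008);	/* CEPH_CAP_AUTH_EXCL   = GEXCL   << SAUTH */
	assert((2  << 20) == 0x200000);	/* CEPH_CAP_FLOCK_EXCL  = GEXCL   << SFLOCK */
	return 0;
}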
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
deleted file mode 100644
index bd570015d147..000000000000
--- a/fs/ceph/ceph_hash.c
+++ /dev/null
@@ -1,118 +0,0 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
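
For illustration (an editorial sketch, not part of the patch), the dispatch above as seen from a caller; the constants come from ceph_hash.h, shown next:

    const char *name = "foo";
    unsigned h;

    h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, 3); /* Jenkins mix */
    h = ceph_str_hash(CEPH_STR_HASH_LINUX, name, 3);    /* dcache-style */
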
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
deleted file mode 100644
index d099c3f90236..000000000000
--- a/fs/ceph/ceph_hash.h
+++ /dev/null
@@ -1,13 +0,0 @@
1#ifndef FS_CEPH_HASH_H
2#define FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
deleted file mode 100644
index fabd302e5779..000000000000
--- a/fs/ceph/crush/crush.c
+++ /dev/null
@@ -1,151 +0,0 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
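
An editorial sketch of how the id convention maintained by crush_calc_parents() above is consumed: devices have non-negative ids, buckets have negative ids indexed via -1-id:

    static int crush_parent_of(struct crush_map *map, int id)
    {
            if (id >= 0)                           /* device */
                    return map->device_parents[id];
            return map->bucket_parents[-1-id];     /* bucket */
    }
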
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
deleted file mode 100644
index 97e435b191f4..000000000000
--- a/fs/ceph/crush/crush.h
+++ /dev/null
@@ -1,180 +0,0 @@
1#ifndef CEPH_CRUSH_CRUSH_H
2#define CEPH_CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of a sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers identify the parent bucket of a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
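
A note on the fixed-point weights above (editorial sketch): values are scaled so that 0x10000 represents 1.0, which is also the threshold is_out() in mapper.c uses for a fully-in device:

    __u32 full = 0x10000;               /* weight 1.0 */
    __u32 half = 0x10000 >> 1;          /* weight 0.5 */
    __u32 w25  = 2 * 0x10000 + 0x8000;  /* weight 2.5 */
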
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
deleted file mode 100644
index 5873aed694bf..000000000000
--- a/fs/ceph/crush/hash.c
+++ /dev/null
@@ -1,149 +0,0 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
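
For illustration (editorial, not part of the patch), the typical call shape, as made by the mapper below when seeding a bucket permutation; 'x', 'bucket_id', and 'size' are hypothetical:

    __u32 v = crush_hash32_3(CRUSH_HASH_RJENKINS1, x, bucket_id, 0);
    unsigned s = v % size;   /* pick a slot within the bucket */
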
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
deleted file mode 100644
index 91e884230d5d..000000000000
--- a/fs/ceph/crush/hash.h
+++ /dev/null
@@ -1,17 +0,0 @@
1#ifndef CEPH_CRUSH_HASH_H
2#define CEPH_CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
deleted file mode 100644
index a4eec133258e..000000000000
--- a/fs/ceph/crush/mapper.c
+++ /dev/null
@@ -1,609 +0,0 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" (failed, fully offloaded)
263 * of the cluster
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308
309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
310 bucket->id, x, outpos, numrep);
311
312 for (rep = outpos; rep < numrep; rep++) {
313 /* keep trying until we get a non-out, non-colliding item */
314 ftotal = 0;
315 skip_rep = 0;
316 do {
317 retry_descent = 0;
318 in = bucket; /* initial bucket */
319
320 /* choose through intervening buckets */
321 flocal = 0;
322 do {
323 collide = 0;
324 retry_bucket = 0;
325 r = rep;
326 if (in->alg == CRUSH_BUCKET_UNIFORM) {
327 /* be careful */
328 if (firstn || numrep >= in->size)
329 /* r' = r + f_total */
330 r += ftotal;
331 else if (in->size % numrep == 0)
332 /* r'=r+(n+1)*f_local */
333 r += (numrep+1) *
334 (flocal+ftotal);
335 else
336 /* r' = r + n*f_local */
337 r += numrep * (flocal+ftotal);
338 } else {
339 if (firstn)
340 /* r' = r + f_total */
341 r += ftotal;
342 else
343 /* r' = r + n*f_local */
344 r += numrep * (flocal+ftotal);
345 }
346
347 /* bucket choose */
348 if (in->size == 0) {
349 reject = 1;
350 goto reject;
351 }
352 if (flocal >= (in->size>>1) &&
353 flocal > orig_tries)
354 item = bucket_perm_choose(in, x, r);
355 else
356 item = crush_bucket_choose(in, x, r);
357 BUG_ON(item >= map->max_devices);
358
359 /* desired type? */
360 if (item < 0)
361 itemtype = map->buckets[-1-item]->type;
362 else
363 itemtype = 0;
364 dprintk(" item %d type %d\n", item, itemtype);
365
366 /* keep going? */
367 if (itemtype != type) {
368 BUG_ON(item >= 0 ||
369 (-1-item) >= map->max_buckets);
370 in = map->buckets[-1-item];
371 retry_bucket = 1;
372 continue;
373 }
374
375 /* collision? */
376 for (i = 0; i < outpos; i++) {
377 if (out[i] == item) {
378 collide = 1;
379 break;
380 }
381 }
382
383 reject = 0;
384 if (recurse_to_leaf) {
385 if (item < 0) {
386 if (crush_choose(map,
387 map->buckets[-1-item],
388 weight,
389 x, outpos+1, 0,
390 out2, outpos,
391 firstn, 0,
392 NULL) <= outpos)
393 /* didn't get leaf */
394 reject = 1;
395 } else {
396 /* we already have a leaf! */
397 out2[outpos] = item;
398 }
399 }
400
401 if (!reject) {
402 /* out? */
403 if (itemtype == 0)
404 reject = is_out(map, weight,
405 item, x);
406 else
407 reject = 0;
408 }
409
410reject:
411 if (reject || collide) {
412 ftotal++;
413 flocal++;
414
415 if (collide && flocal < 3)
416 /* retry locally a few times */
417 retry_bucket = 1;
418 else if (flocal < in->size + orig_tries)
419 /* exhaustive bucket search */
420 retry_bucket = 1;
421 else if (ftotal < 20)
422 /* then retry descent */
423 retry_descent = 1;
424 else
425 /* else give up */
426 skip_rep = 1;
427 dprintk(" reject %d collide %d "
428 "ftotal %d flocal %d\n",
429 reject, collide, ftotal,
430 flocal);
431 }
432 } while (retry_bucket);
433 } while (retry_descent);
434
435 if (skip_rep) {
436 dprintk("skip rep\n");
437 continue;
438 }
439
440 dprintk("CHOOSE got %d\n", item);
441 out[outpos] = item;
442 outpos++;
443 }
444
445 dprintk("CHOOSE returns %d\n", outpos);
446 return outpos;
447}
448
449
450/**
451 * crush_do_rule - calculate a mapping with the given input and rule
452 * @map: the crush_map
453 * @ruleno: the rule id
454 * @x: hash input
455 * @result: pointer to result vector
456 * @result_max: maximum result size
457 * @force: force initial replica choice; -1 for none
458 */
459int crush_do_rule(struct crush_map *map,
460 int ruleno, int x, int *result, int result_max,
461 int force, __u32 *weight)
462{
463 int result_len;
464 int force_context[CRUSH_MAX_DEPTH];
465 int force_pos = -1;
466 int a[CRUSH_MAX_SET];
467 int b[CRUSH_MAX_SET];
468 int c[CRUSH_MAX_SET];
469 int recurse_to_leaf;
470 int *w;
471 int wsize = 0;
472 int *o;
473 int osize;
474 int *tmp;
475 struct crush_rule *rule;
476 int step;
477 int i, j;
478 int numrep;
479 int firstn;
480 int rc = -1;
481
482 BUG_ON(ruleno >= map->max_rules);
483
484 rule = map->rules[ruleno];
485 result_len = 0;
486 w = a;
487 o = b;
488
489 /*
490 * determine hierarchical context of force, if any. note
491 * that this may or may not correspond to the specific types
492 * referenced by the crush rule.
493 */
494 if (force >= 0) {
495 if (force >= map->max_devices ||
496 map->device_parents[force] == 0) {
497 /*dprintk("CRUSH: forcefed device dne\n");*/
498 rc = -1; /* force fed device dne */
499 goto out;
500 }
501 if (!is_out(map, weight, force, x)) {
502 while (1) {
503 force_context[++force_pos] = force;
504 if (force >= 0)
505 force = map->device_parents[force];
506 else
507 force = map->bucket_parents[-1-force];
508 if (force == 0)
509 break;
510 }
511 }
512 }
513
514 for (step = 0; step < rule->len; step++) {
515 firstn = 0;
516 switch (rule->steps[step].op) {
517 case CRUSH_RULE_TAKE:
518 w[0] = rule->steps[step].arg1;
519 if (force_pos >= 0) {
520 BUG_ON(force_context[force_pos] != w[0]);
521 force_pos--;
522 }
523 wsize = 1;
524 break;
525
526 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
527 case CRUSH_RULE_CHOOSE_FIRSTN:
528 firstn = 1;
529 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
530 case CRUSH_RULE_CHOOSE_INDEP:
531 BUG_ON(wsize == 0);
532
533 recurse_to_leaf =
534 rule->steps[step].op ==
535 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
536 rule->steps[step].op ==
537 CRUSH_RULE_CHOOSE_LEAF_INDEP;
538
539 /* reset output */
540 osize = 0;
541
542 for (i = 0; i < wsize; i++) {
543 /*
544 * see CRUSH_CHOOSE_N, CRUSH_CHOOSE_N_MINUS macros.
545 * basically, numrep <= 0 means relative to
546 * the provided result_max
547 */
548 numrep = rule->steps[step].arg1;
549 if (numrep <= 0) {
550 numrep += result_max;
551 if (numrep <= 0)
552 continue;
553 }
554 j = 0;
555 if (osize == 0 && force_pos >= 0) {
556 /* skip any intermediate types */
557 while (force_pos &&
558 force_context[force_pos] < 0 &&
559 rule->steps[step].arg2 !=
560 map->buckets[-1 -
561 force_context[force_pos]]->type)
562 force_pos--;
563 o[osize] = force_context[force_pos];
564 if (recurse_to_leaf)
565 c[osize] = force_context[0];
566 j++;
567 force_pos--;
568 }
569 osize += crush_choose(map,
570 map->buckets[-1-w[i]],
571 weight,
572 x, numrep,
573 rule->steps[step].arg2,
574 o+osize, j,
575 firstn,
576 recurse_to_leaf, c+osize);
577 }
578
579 if (recurse_to_leaf)
580 /* copy final _leaf_ values to output set */
581 memcpy(o, c, osize*sizeof(*o));
582
583 /* swap o and w arrays */
584 tmp = o;
585 o = w;
586 w = tmp;
587 wsize = osize;
588 break;
589
590
591 case CRUSH_RULE_EMIT:
592 for (i = 0; i < wsize && result_len < result_max; i++) {
593 result[result_len] = w[i];
594 result_len++;
595 }
596 wsize = 0;
597 break;
598
599 default:
600 BUG_ON(1);
601 }
602 }
603 rc = result_len;
604
605out:
606 return rc;
607}
608
609
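
To show how the two entry points above combine, an editorial sketch (not part of the patch); 'map', 'weights', 'x', 'ruleset', 'type', and 'size' are hypothetical inputs:

    int result[CRUSH_MAX_SET];
    int ruleno, n = 0;

    ruleno = crush_find_rule(map, ruleset, type, size);
    if (ruleno >= 0)
            n = crush_do_rule(map, ruleno, x, result, size,
                              -1 /* no forced item */, weights);
    /* result[0..n-1] now holds the chosen device ids */
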
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
deleted file mode 100644
index c46b99c18bb0..000000000000
--- a/fs/ceph/crush/mapper.h
+++ /dev/null
@@ -1,20 +0,0 @@
1#ifndef CEPH_CRUSH_MAPPER_H
2#define CEPH_CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
deleted file mode 100644
index a3e627f63293..000000000000
--- a/fs/ceph/crypto.c
+++ /dev/null
@@ -1,412 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
79
80static int ceph_aes_encrypt(const void *key, int key_len,
81 void *dst, size_t *dst_len,
82 const void *src, size_t src_len)
83{
84 struct scatterlist sg_in[2], sg_out[1];
85 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
86 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
87 int ret;
88 void *iv;
89 int ivsize;
90 size_t zero_padding = (0x10 - (src_len & 0x0f));
91 char pad[16];
92
93 if (IS_ERR(tfm))
94 return PTR_ERR(tfm);
95
96 memset(pad, zero_padding, zero_padding);
97
98 *dst_len = src_len + zero_padding;
99
100 crypto_blkcipher_setkey((void *)tfm, key, key_len);
101 sg_init_table(sg_in, 2);
102 sg_set_buf(&sg_in[0], src, src_len);
103 sg_set_buf(&sg_in[1], pad, zero_padding);
104 sg_init_table(sg_out, 1);
105 sg_set_buf(sg_out, dst, *dst_len);
106 iv = crypto_blkcipher_crt(tfm)->iv;
107 ivsize = crypto_blkcipher_ivsize(tfm);
108
109 memcpy(iv, aes_iv, ivsize);
110 /*
111 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
112 key, key_len, 1);
113 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
114 src, src_len, 1);
115 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
116 pad, zero_padding, 1);
117 */
118 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
119 src_len + zero_padding);
120 crypto_free_blkcipher(tfm);
121 if (ret < 0)
122 pr_err("ceph_aes_crypt failed %d\n", ret);
123 /*
124 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
125 dst, *dst_len, 1);
126 */
127 return 0;
128}
129
130static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
131 size_t *dst_len,
132 const void *src1, size_t src1_len,
133 const void *src2, size_t src2_len)
134{
135 struct scatterlist sg_in[3], sg_out[1];
136 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
137 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
138 int ret;
139 void *iv;
140 int ivsize;
141 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
142 char pad[16];
143
144 if (IS_ERR(tfm))
145 return PTR_ERR(tfm);
146
147 memset(pad, zero_padding, zero_padding);
148
149 *dst_len = src1_len + src2_len + zero_padding;
150
151 crypto_blkcipher_setkey((void *)tfm, key, key_len);
152 sg_init_table(sg_in, 3);
153 sg_set_buf(&sg_in[0], src1, src1_len);
154 sg_set_buf(&sg_in[1], src2, src2_len);
155 sg_set_buf(&sg_in[2], pad, zero_padding);
156 sg_init_table(sg_out, 1);
157 sg_set_buf(sg_out, dst, *dst_len);
158 iv = crypto_blkcipher_crt(tfm)->iv;
159 ivsize = crypto_blkcipher_ivsize(tfm);
160
161 memcpy(iv, aes_iv, ivsize);
162 /*
163 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
164 key, key_len, 1);
165 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
166 src1, src1_len, 1);
167 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
168 src2, src2_len, 1);
169 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
170 pad, zero_padding, 1);
171 */
172 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
173 src1_len + src2_len + zero_padding);
174 crypto_free_blkcipher(tfm);
175 if (ret < 0)
176 pr_err("ceph_aes_crypt2 failed %d\n", ret);
177 /*
178 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
179 dst, *dst_len, 1);
180 */
181 return 0;
182}
183
184static int ceph_aes_decrypt(const void *key, int key_len,
185 void *dst, size_t *dst_len,
186 const void *src, size_t src_len)
187{
188 struct scatterlist sg_in[1], sg_out[2];
189 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
190 struct blkcipher_desc desc = { .tfm = tfm };
191 char pad[16];
192 void *iv;
193 int ivsize;
194 int ret;
195 int last_byte;
196
197 if (IS_ERR(tfm))
198 return PTR_ERR(tfm);
199
200 crypto_blkcipher_setkey((void *)tfm, key, key_len);
201 sg_init_table(sg_in, 1);
202 sg_init_table(sg_out, 2);
203 sg_set_buf(sg_in, src, src_len);
204 sg_set_buf(&sg_out[0], dst, *dst_len);
205 sg_set_buf(&sg_out[1], pad, sizeof(pad));
206
207 iv = crypto_blkcipher_crt(tfm)->iv;
208 ivsize = crypto_blkcipher_ivsize(tfm);
209
210 memcpy(iv, aes_iv, ivsize);
211
212 /*
213 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
214 key, key_len, 1);
215 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
216 src, src_len, 1);
217 */
218
219 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
220 crypto_free_blkcipher(tfm);
221 if (ret < 0) {
222 pr_err("ceph_aes_decrypt failed %d\n", ret);
223 return ret;
224 }
225
226 if (src_len <= *dst_len)
227 last_byte = ((char *)dst)[src_len - 1];
228 else
229 last_byte = pad[src_len - *dst_len - 1];
230 if (last_byte <= 16 && src_len >= last_byte) {
231 *dst_len = src_len - last_byte;
232 } else {
233 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
234 last_byte, (int)src_len);
235 return -EPERM; /* bad padding */
236 }
237 /*
238 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
239 dst, *dst_len, 1);
240 */
241 return 0;
242}
243
244static int ceph_aes_decrypt2(const void *key, int key_len,
245 void *dst1, size_t *dst1_len,
246 void *dst2, size_t *dst2_len,
247 const void *src, size_t src_len)
248{
249 struct scatterlist sg_in[1], sg_out[3];
250 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
251 struct blkcipher_desc desc = { .tfm = tfm };
252 char pad[16];
253 void *iv;
254 int ivsize;
255 int ret;
256 int last_byte;
257
258 if (IS_ERR(tfm))
259 return PTR_ERR(tfm);
260
261 sg_init_table(sg_in, 1);
262 sg_set_buf(sg_in, src, src_len);
263 sg_init_table(sg_out, 3);
264 sg_set_buf(&sg_out[0], dst1, *dst1_len);
265 sg_set_buf(&sg_out[1], dst2, *dst2_len);
266 sg_set_buf(&sg_out[2], pad, sizeof(pad));
267
268 crypto_blkcipher_setkey((void *)tfm, key, key_len);
269 iv = crypto_blkcipher_crt(tfm)->iv;
270 ivsize = crypto_blkcipher_ivsize(tfm);
271
272 memcpy(iv, aes_iv, ivsize);
273
274 /*
275 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
276 key, key_len, 1);
277 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
278 src, src_len, 1);
279 */
280
281 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
282 crypto_free_blkcipher(tfm);
283 if (ret < 0) {
284 pr_err("ceph_aes_decrypt failed %d\n", ret);
285 return ret;
286 }
287
288 if (src_len <= *dst1_len)
289 last_byte = ((char *)dst1)[src_len - 1];
290 else if (src_len <= *dst1_len + *dst2_len)
291 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
292 else
293 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
294 if (last_byte <= 16 && src_len >= last_byte) {
295 src_len -= last_byte;
296 } else {
297 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
298 last_byte, (int)src_len);
299 return -EPERM; /* bad padding */
300 }
301
302 if (src_len < *dst1_len) {
303 *dst1_len = src_len;
304 *dst2_len = 0;
305 } else {
306 *dst2_len = src_len - *dst1_len;
307 }
308 /*
309 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
310 dst1, *dst1_len, 1);
311 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
312 dst2, *dst2_len, 1);
313 */
314
315 return 0;
316}
317
318
319int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
320 const void *src, size_t src_len)
321{
322 switch (secret->type) {
323 case CEPH_CRYPTO_NONE:
324 if (*dst_len < src_len)
325 return -ERANGE;
326 memcpy(dst, src, src_len);
327 *dst_len = src_len;
328 return 0;
329
330 case CEPH_CRYPTO_AES:
331 return ceph_aes_decrypt(secret->key, secret->len, dst,
332 dst_len, src, src_len);
333
334 default:
335 return -EINVAL;
336 }
337}
338
339int ceph_decrypt2(struct ceph_crypto_key *secret,
340 void *dst1, size_t *dst1_len,
341 void *dst2, size_t *dst2_len,
342 const void *src, size_t src_len)
343{
344 size_t t;
345
346 switch (secret->type) {
347 case CEPH_CRYPTO_NONE:
348 if (*dst1_len + *dst2_len < src_len)
349 return -ERANGE;
350 t = min(*dst1_len, src_len);
351 memcpy(dst1, src, t);
352 *dst1_len = t;
353 src += t;
354 src_len -= t;
355 if (src_len) {
356 t = min(*dst2_len, src_len);
357 memcpy(dst2, src, t);
358 *dst2_len = t;
359 }
360 return 0;
361
362 case CEPH_CRYPTO_AES:
363 return ceph_aes_decrypt2(secret->key, secret->len,
364 dst1, dst1_len, dst2, dst2_len,
365 src, src_len);
366
367 default:
368 return -EINVAL;
369 }
370}
371
372int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
373 const void *src, size_t src_len)
374{
375 switch (secret->type) {
376 case CEPH_CRYPTO_NONE:
377 if (*dst_len < src_len)
378 return -ERANGE;
379 memcpy(dst, src, src_len);
380 *dst_len = src_len;
381 return 0;
382
383 case CEPH_CRYPTO_AES:
384 return ceph_aes_encrypt(secret->key, secret->len, dst,
385 dst_len, src, src_len);
386
387 default:
388 return -EINVAL;
389 }
390}
391
392int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
393 const void *src1, size_t src1_len,
394 const void *src2, size_t src2_len)
395{
396 switch (secret->type) {
397 case CEPH_CRYPTO_NONE:
398 if (*dst_len < src1_len + src2_len)
399 return -ERANGE;
400 memcpy(dst, src1, src1_len);
401 memcpy(dst + src1_len, src2, src2_len);
402 *dst_len = src1_len + src2_len;
403 return 0;
404
405 case CEPH_CRYPTO_AES:
406 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
407 src1, src1_len, src2, src2_len);
408
409 default:
410 return -EINVAL;
411 }
412}
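
An editorial sketch of a round trip through the helpers above; 'secret' is a hypothetical struct ceph_crypto_key *. With CEPH_CRYPTO_AES the ciphertext is padded to a 16-byte boundary, so the destination needs up to src_len + 16 bytes:

    char ct[32], pt[32];
    size_t ct_len = sizeof(ct), pt_len = sizeof(pt);
    int ret;

    ret = ceph_encrypt(secret, ct, &ct_len, "secret!", 7);
    if (!ret)
            ret = ceph_decrypt(secret, pt, &pt_len, ct, ct_len);
    /* on success pt_len == 7 and pt holds the original bytes */
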
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
deleted file mode 100644
index bdf38607323c..000000000000
--- a/fs/ceph/crypto.h
+++ /dev/null
@@ -1,48 +0,0 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const char *src, const char *end);
46extern int ceph_unarmor(char *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6fd8b20a8611..7ae1b3d55b58 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
@@ -7,143 +7,49 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
 #include "super.h"
-#include "mds_client.h"
-#include "mon_client.h"
-#include "auth.h"
 
 #ifdef CONFIG_DEBUG_FS
 
-/*
- * Implement /sys/kernel/debug/ceph fun
- *
- * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
- *      .../osdmap      - current osdmap
- *      .../mdsmap      - current mdsmap
- *      .../monmap      - current monmap
- *      .../osdc        - active osd requests
- *      .../mdsc        - active mds requests
- *      .../monc        - mon client state
- *      .../dentry_lru  - dump contents of dentry lru
- *      .../caps        - expose cap (reservation) stats
- *      .../bdi         - symlink to ../../bdi/something
- */
-
-static struct dentry *ceph_debugfs_dir;
-
-static int monmap_show(struct seq_file *s, void *p)
-{
-        int i;
-        struct ceph_client *client = s->private;
-
-        if (client->monc.monmap == NULL)
-                return 0;
-
-        seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
-        for (i = 0; i < client->monc.monmap->num_mon; i++) {
-                struct ceph_entity_inst *inst =
-                        &client->monc.monmap->mon_inst[i];
-
-                seq_printf(s, "\t%s%lld\t%s\n",
-                           ENTITY_NAME(inst->name),
-                           pr_addr(&inst->addr.in_addr));
-        }
-        return 0;
-}
+#include "mds_client.h"
 
 static int mdsmap_show(struct seq_file *s, void *p)
 {
         int i;
-        struct ceph_client *client = s->private;
+        struct ceph_fs_client *fsc = s->private;
 
-        if (client->mdsc.mdsmap == NULL)
+        if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
                 return 0;
-        seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
-        seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+        seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
+        seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
         seq_printf(s, "session_timeout %d\n",
-                   client->mdsc.mdsmap->m_session_timeout);
+                   fsc->mdsc->mdsmap->m_session_timeout);
         seq_printf(s, "session_autoclose %d\n",
-                   client->mdsc.mdsmap->m_session_autoclose);
-        for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+                   fsc->mdsc->mdsmap->m_session_autoclose);
+        for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
                 struct ceph_entity_addr *addr =
-                        &client->mdsc.mdsmap->m_info[i].addr;
-                int state = client->mdsc.mdsmap->m_info[i].state;
+                        &fsc->mdsc->mdsmap->m_info[i].addr;
+                int state = fsc->mdsc->mdsmap->m_info[i].state;
 
-                seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+                seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+                           ceph_pr_addr(&addr->in_addr),
                            ceph_mds_state_name(state));
         }
         return 0;
 }
 
-static int osdmap_show(struct seq_file *s, void *p)
-{
-        int i;
-        struct ceph_client *client = s->private;
-        struct rb_node *n;
-
-        if (client->osdc.osdmap == NULL)
-                return 0;
-        seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
-        seq_printf(s, "flags%s%s\n",
-                   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
-                   " NEARFULL" : "",
-                   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
-                   " FULL" : "");
-        for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
-                struct ceph_pg_pool_info *pool =
-                        rb_entry(n, struct ceph_pg_pool_info, node);
-                seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
-                           pool->id, pool->v.pg_num, pool->pg_num_mask,
-                           pool->v.lpg_num, pool->lpg_num_mask);
-        }
-        for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
-                struct ceph_entity_addr *addr =
-                        &client->osdc.osdmap->osd_addr[i];
-                int state = client->osdc.osdmap->osd_state[i];
-                char sb[64];
-
-                seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
-                           i, pr_addr(&addr->in_addr),
-                           ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
-                           ceph_osdmap_state_str(sb, sizeof(sb), state));
-        }
-        return 0;
-}
-
-static int monc_show(struct seq_file *s, void *p)
-{
-        struct ceph_client *client = s->private;
-        struct ceph_mon_generic_request *req;
-        struct ceph_mon_client *monc = &client->monc;
-        struct rb_node *rp;
-
-        mutex_lock(&monc->mutex);
-
-        if (monc->have_mdsmap)
-                seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
-        if (monc->have_osdmap)
-                seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
-        if (monc->want_next_osdmap)
-                seq_printf(s, "want next osdmap\n");
-
-        for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
-                __u16 op;
-                req = rb_entry(rp, struct ceph_mon_generic_request, node);
-                op = le16_to_cpu(req->request->hdr.type);
-                if (op == CEPH_MSG_STATFS)
-                        seq_printf(s, "%lld statfs\n", req->tid);
-                else
-                        seq_printf(s, "%lld unknown\n", req->tid);
-        }
-
-        mutex_unlock(&monc->mutex);
-        return 0;
-}
-
+/*
+ * mdsc debugfs
+ */
 static int mdsc_show(struct seq_file *s, void *p)
 {
-        struct ceph_client *client = s->private;
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_fs_client *fsc = s->private;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         struct rb_node *rp;
         int pathlen;
@@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p)
 	return 0;
 }
 
-static int osdc_show(struct seq_file *s, void *pp)
-{
-        struct ceph_client *client = s->private;
-        struct ceph_osd_client *osdc = &client->osdc;
-        struct rb_node *p;
-
-        mutex_lock(&osdc->request_mutex);
-        for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-                struct ceph_osd_request *req;
-                struct ceph_osd_request_head *head;
-                struct ceph_osd_op *op;
-                int num_ops;
-                int opcode, olen;
-                int i;
-
-                req = rb_entry(p, struct ceph_osd_request, r_node);
-
-                seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
-                           req->r_osd ? req->r_osd->o_osd : -1,
-                           le32_to_cpu(req->r_pgid.pool),
-                           le16_to_cpu(req->r_pgid.ps));
-
-                head = req->r_request->front.iov_base;
-                op = (void *)(head + 1);
-
-                num_ops = le16_to_cpu(head->num_ops);
-                olen = le32_to_cpu(head->object_len);
-                seq_printf(s, "%.*s", olen,
-                           (const char *)(head->ops + num_ops));
-
-                if (req->r_reassert_version.epoch)
-                        seq_printf(s, "\t%u'%llu",
-                                   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
-                                   le64_to_cpu(req->r_reassert_version.version));
-                else
-                        seq_printf(s, "\t");
-
-                for (i = 0; i < num_ops; i++) {
-                        opcode = le16_to_cpu(op->op);
-                        seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
-                        op++;
-                }
-
-                seq_printf(s, "\n");
-        }
-        mutex_unlock(&osdc->request_mutex);
-        return 0;
-}
-
 static int caps_show(struct seq_file *s, void *p)
 {
-        struct ceph_client *client = s->private;
+        struct ceph_fs_client *fsc = s->private;
         int total, avail, used, reserved, min;
 
-        ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
+        ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
         seq_printf(s, "total\t\t%d\n"
                    "avail\t\t%d\n"
                    "used\t\t%d\n"
@@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p)
 
 static int dentry_lru_show(struct seq_file *s, void *ptr)
 {
-        struct ceph_client *client = s->private;
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_fs_client *fsc = s->private;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_dentry_info *di;
 
         spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
295 return 0; 152 return 0;
296} 153}
297 154
298#define DEFINE_SHOW_FUNC(name) \ 155CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
299static int name##_open(struct inode *inode, struct file *file) \ 156CEPH_DEFINE_SHOW_FUNC(mdsc_show)
300{ \ 157CEPH_DEFINE_SHOW_FUNC(caps_show)
301 struct seq_file *sf; \ 158CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
302 int ret; \ 159
303 \
304 ret = single_open(file, name, NULL); \
305 sf = file->private_data; \
306 sf->private = inode->i_private; \
307 return ret; \
308} \
309 \
310static const struct file_operations name##_fops = { \
311 .open = name##_open, \
312 .read = seq_read, \
313 .llseek = seq_lseek, \
314 .release = single_release, \
315};
316
317DEFINE_SHOW_FUNC(monmap_show)
318DEFINE_SHOW_FUNC(mdsmap_show)
319DEFINE_SHOW_FUNC(osdmap_show)
320DEFINE_SHOW_FUNC(monc_show)
321DEFINE_SHOW_FUNC(mdsc_show)
322DEFINE_SHOW_FUNC(osdc_show)
323DEFINE_SHOW_FUNC(dentry_lru_show)
324DEFINE_SHOW_FUNC(caps_show)
325 160
161/*
162 * debugfs
163 */
326static int congestion_kb_set(void *data, u64 val) 164static int congestion_kb_set(void *data, u64 val)
327{ 165{
328 struct ceph_client *client = (struct ceph_client *)data; 166 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
329
330 if (client)
331 client->mount_args->congestion_kb = (int)val;
332 167
168 fsc->mount_options->congestion_kb = (int)val;
333 return 0; 169 return 0;
334} 170}
335 171
336static int congestion_kb_get(void *data, u64 *val) 172static int congestion_kb_get(void *data, u64 *val)
337{ 173{
338 struct ceph_client *client = (struct ceph_client *)data; 174 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
339
340 if (client)
341 *val = (u64)client->mount_args->congestion_kb;
342 175
176 *val = (u64)fsc->mount_options->congestion_kb;
343 return 0; 177 return 0;
344} 178}
345 179
346
347DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, 180DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
348 congestion_kb_set, "%llu\n"); 181 congestion_kb_set, "%llu\n");
349 182
350int __init ceph_debugfs_init(void)
351{
352 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
353 if (!ceph_debugfs_dir)
354 return -ENOMEM;
355 return 0;
356}
357 183
358void ceph_debugfs_cleanup(void) 184void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
359{ 185{
360 debugfs_remove(ceph_debugfs_dir); 186 dout("ceph_fs_debugfs_cleanup\n");
187 debugfs_remove(fsc->debugfs_bdi);
188 debugfs_remove(fsc->debugfs_congestion_kb);
189 debugfs_remove(fsc->debugfs_mdsmap);
190 debugfs_remove(fsc->debugfs_caps);
191 debugfs_remove(fsc->debugfs_mdsc);
192 debugfs_remove(fsc->debugfs_dentry_lru);
361} 193}
362 194
363int ceph_debugfs_client_init(struct ceph_client *client) 195int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
364{ 196{
365 int ret = 0; 197 char name[100];
366 char name[80]; 198 int err = -ENOMEM;
367
368 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
369 client->monc.auth->global_id);
370 199
371 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); 200 dout("ceph_fs_debugfs_init\n");
372 if (!client->debugfs_dir) 201 fsc->debugfs_congestion_kb =
373 goto out; 202 debugfs_create_file("writeback_congestion_kb",
374 203 0600,
375 client->monc.debugfs_file = debugfs_create_file("monc", 204 fsc->client->debugfs_dir,
376 0600, 205 fsc,
377 client->debugfs_dir, 206 &congestion_kb_fops);
378 client, 207 if (!fsc->debugfs_congestion_kb)
379 &monc_show_fops);
380 if (!client->monc.debugfs_file)
381 goto out; 208 goto out;
382 209
383 client->mdsc.debugfs_file = debugfs_create_file("mdsc", 210 dout("a\n");
384 0600,
385 client->debugfs_dir,
386 client,
387 &mdsc_show_fops);
388 if (!client->mdsc.debugfs_file)
389 goto out;
390 211
391 client->osdc.debugfs_file = debugfs_create_file("osdc", 212 snprintf(name, sizeof(name), "../../bdi/%s",
392 0600, 213 dev_name(fsc->backing_dev_info.dev));
393 client->debugfs_dir, 214 fsc->debugfs_bdi =
394 client, 215 debugfs_create_symlink("bdi",
395 &osdc_show_fops); 216 fsc->client->debugfs_dir,
396 if (!client->osdc.debugfs_file) 217 name);
218 if (!fsc->debugfs_bdi)
397 goto out; 219 goto out;
398 220
399 client->debugfs_monmap = debugfs_create_file("monmap", 221 dout("b\n");
222 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
400 0600, 223 0600,
401 client->debugfs_dir, 224 fsc->client->debugfs_dir,
402 client, 225 fsc,
403 &monmap_show_fops);
404 if (!client->debugfs_monmap)
405 goto out;
406
407 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
408 0600,
409 client->debugfs_dir,
410 client,
411 &mdsmap_show_fops); 226 &mdsmap_show_fops);
412 if (!client->debugfs_mdsmap) 227 if (!fsc->debugfs_mdsmap)
413 goto out;
414
415 client->debugfs_osdmap = debugfs_create_file("osdmap",
416 0600,
417 client->debugfs_dir,
418 client,
419 &osdmap_show_fops);
420 if (!client->debugfs_osdmap)
421 goto out; 228 goto out;
422 229
423 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru", 230 dout("ca\n");
424 0600, 231 fsc->debugfs_mdsc = debugfs_create_file("mdsc",
425 client->debugfs_dir, 232 0600,
426 client, 233 fsc->client->debugfs_dir,
427 &dentry_lru_show_fops); 234 fsc,
428 if (!client->debugfs_dentry_lru) 235 &mdsc_show_fops);
236 if (!fsc->debugfs_mdsc)
429 goto out; 237 goto out;
430 238
431 client->debugfs_caps = debugfs_create_file("caps", 239 dout("da\n");
240 fsc->debugfs_caps = debugfs_create_file("caps",
432 0400, 241 0400,
433 client->debugfs_dir, 242 fsc->client->debugfs_dir,
434 client, 243 fsc,
435 &caps_show_fops); 244 &caps_show_fops);
436 if (!client->debugfs_caps) 245 if (!fsc->debugfs_caps)
437 goto out; 246 goto out;
438 247
439 client->debugfs_congestion_kb = 248 dout("ea\n");
440 debugfs_create_file("writeback_congestion_kb", 249 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
441 0600, 250 0600,
442 client->debugfs_dir, 251 fsc->client->debugfs_dir,
443 client, 252 fsc,
444 &congestion_kb_fops); 253 &dentry_lru_show_fops);
445 if (!client->debugfs_congestion_kb) 254 if (!fsc->debugfs_dentry_lru)
446 goto out; 255 goto out;
447 256
448 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
449 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
450 name);
451
452 return 0; 257 return 0;
453 258
454out: 259out:
455 ceph_debugfs_client_cleanup(client); 260 ceph_fs_debugfs_cleanup(fsc);
456 return ret; 261 return err;
457} 262}
458 263
459void ceph_debugfs_client_cleanup(struct ceph_client *client)
460{
461 debugfs_remove(client->debugfs_bdi);
462 debugfs_remove(client->debugfs_caps);
463 debugfs_remove(client->debugfs_dentry_lru);
464 debugfs_remove(client->debugfs_osdmap);
465 debugfs_remove(client->debugfs_mdsmap);
466 debugfs_remove(client->debugfs_monmap);
467 debugfs_remove(client->osdc.debugfs_file);
468 debugfs_remove(client->mdsc.debugfs_file);
469 debugfs_remove(client->monc.debugfs_file);
470 debugfs_remove(client->debugfs_congestion_kb);
471 debugfs_remove(client->debugfs_dir);
472}
473 264
474#else /* CONFIG_DEBUG_FS */ 265#else /* CONFIG_DEBUG_FS */
475 266
476int __init ceph_debugfs_init(void) 267int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
477{
478 return 0;
479}
480
481void ceph_debugfs_cleanup(void)
482{
483}
484
485int ceph_debugfs_client_init(struct ceph_client *client)
486{ 268{
487 return 0; 269 return 0;
488} 270}
489 271
490void ceph_debugfs_client_cleanup(struct ceph_client *client) 272void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
491{ 273{
492} 274}
493 275
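The per-fs-client debugfs setup above follows the usual create-everything-or-unwind pattern: each debugfs_create_*() return value is checked, and the cleanup helper is safe to run against a partially initialized struct because debugfs_remove() ignores NULL dentries. A minimal sketch of the same shape, using hypothetical my_client/stat names rather than the ceph structures:

#include <linux/debugfs.h>
#include <linux/errno.h>
#include <linux/types.h>

/* hypothetical client carrying its debugfs dentries */
struct my_client {
	struct dentry *debugfs_dir;
	struct dentry *debugfs_stats;
	u64 stat_val;
};

static int stat_get(void *data, u64 *val)
{
	*val = ((struct my_client *)data)->stat_val;
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");

static void my_debugfs_cleanup(struct my_client *c)
{
	/* debugfs_remove(NULL) is a no-op, so partial setup is fine */
	debugfs_remove(c->debugfs_stats);
	debugfs_remove(c->debugfs_dir);
}

static int my_debugfs_init(struct my_client *c)
{
	c->debugfs_dir = debugfs_create_dir("my_client", NULL);
	if (!c->debugfs_dir)
		goto out;
	c->debugfs_stats = debugfs_create_file("stats", 0400,
					       c->debugfs_dir, c, &stat_fops);
	if (!c->debugfs_stats)
		goto out;
	return 0;
out:
	my_debugfs_cleanup(c);
	return -ENOMEM;
}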
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
deleted file mode 100644
index 3d25415afe63..000000000000
--- a/fs/ceph/decode.h
+++ /dev/null
@@ -1,196 +0,0 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
80
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 __be16 ss_family = htons(a->in_addr.ss_family);
103 a->in_addr.ss_family = *(__u16 *)&ss_family;
104}
105static inline void ceph_decode_addr(struct ceph_entity_addr *a)
106{
107 __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
108 a->in_addr.ss_family = ntohs(ss_family);
109 WARN_ON(a->in_addr.ss_family == 512);
110}
111
112/*
113 * encoders
114 */
115static inline void ceph_encode_64(void **p, u64 v)
116{
117 put_unaligned_le64(v, (__le64 *)*p);
118 *p += sizeof(u64);
119}
120static inline void ceph_encode_32(void **p, u32 v)
121{
122 put_unaligned_le32(v, (__le32 *)*p);
123 *p += sizeof(u32);
124}
125static inline void ceph_encode_16(void **p, u16 v)
126{
127 put_unaligned_le16(v, (__le16 *)*p);
128 *p += sizeof(u16);
129}
130static inline void ceph_encode_8(void **p, u8 v)
131{
132 *(u8 *)*p = v;
133 (*p)++;
134}
135static inline void ceph_encode_copy(void **p, const void *s, int len)
136{
137 memcpy(*p, s, len);
138 *p += len;
139}
140
141/*
142 * filepath, string encoders
143 */
144static inline void ceph_encode_filepath(void **p, void *end,
145 u64 ino, const char *path)
146{
147 u32 len = path ? strlen(path) : 0;
148 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
149 ceph_encode_8(p, 1);
150 ceph_encode_64(p, ino);
151 ceph_encode_32(p, len);
152 if (len)
153 memcpy(*p, path, len);
154 *p += len;
155}
156
157static inline void ceph_encode_string(void **p, void *end,
158 const char *s, u32 len)
159{
160 BUG_ON(*p + sizeof(len) + len > end);
161 ceph_encode_32(p, len);
162 if (len)
163 memcpy(*p, s, len);
164 *p += len;
165}
166
167#define ceph_encode_need(p, end, n, bad) \
168 do { \
169 if (unlikely(*(p) + (n) > (end))) \
170 goto bad; \
171 } while (0)
172
173#define ceph_encode_64_safe(p, end, v, bad) \
174 do { \
175 ceph_encode_need(p, end, sizeof(u64), bad); \
176 ceph_encode_64(p, v); \
177 } while (0)
178#define ceph_encode_32_safe(p, end, v, bad) \
179 do { \
180 ceph_encode_need(p, end, sizeof(u32), bad); \
181 ceph_encode_32(p, v); \
182 } while (0)
183#define ceph_encode_16_safe(p, end, v, bad) \
184 do { \
185 ceph_encode_need(p, end, sizeof(u16), bad); \
186 ceph_encode_16(p, v); \
187 } while (0)
188
189#define ceph_encode_copy_safe(p, end, pv, n, bad) \
190 do { \
191 ceph_encode_need(p, end, n, bad); \
192 ceph_encode_copy(p, pv, n); \
193 } while (0)
194
195
196#endif
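These helpers implement a small cursor API: *p advances through the buffer as values are consumed, and the _safe variants bounds-check against end before each read. A short sketch of decoding a 32-bit length-prefixed string with them (parse_name and the -ERANGE policy are illustrative, not from the ceph code; the macros above are assumed in scope):

static int parse_name(void **p, void *end, char *out, size_t outlen)
{
	u32 len;

	ceph_decode_32_safe(p, end, len, bad);  /* length prefix */
	ceph_decode_need(p, end, len, bad);     /* whole payload present? */
	if (len >= outlen)
		goto bad;
	ceph_decode_copy(p, out, len);
	out[len] = '\0';
	return 0;
bad:
	return -ERANGE;
}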
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 6e4f43ff23ec..e0a2dc6fcafc 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/spinlock.h> 3#include <linux/spinlock.h>
4#include <linux/fs_struct.h> 4#include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8 8
9#include "super.h" 9#include "super.h"
10#include "mds_client.h"
10 11
11/* 12/*
12 * Directory operations: readdir, lookup, create, link, unlink, 13 * Directory operations: readdir, lookup, create, link, unlink,
@@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p)
94 */ 95 */
95static int __dcache_readdir(struct file *filp, 96static int __dcache_readdir(struct file *filp,
96 void *dirent, filldir_t filldir) 97 void *dirent, filldir_t filldir)
97 __releases(inode->i_lock)
98 __acquires(inode->i_lock)
99{ 98{
100 struct inode *inode = filp->f_dentry->d_inode;
101 struct ceph_file_info *fi = filp->private_data; 99 struct ceph_file_info *fi = filp->private_data;
102 struct dentry *parent = filp->f_dentry; 100 struct dentry *parent = filp->f_dentry;
103 struct inode *dir = parent->d_inode; 101 struct inode *dir = parent->d_inode;
@@ -153,7 +151,6 @@ more:
153 151
154 atomic_inc(&dentry->d_count); 152 atomic_inc(&dentry->d_count);
155 spin_unlock(&dcache_lock); 153 spin_unlock(&dcache_lock);
156 spin_unlock(&inode->i_lock);
157 154
158 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 155 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
159 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 156 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -171,35 +168,30 @@ more:
171 } else { 168 } else {
172 dput(last); 169 dput(last);
173 } 170 }
174 last = NULL;
175 } 171 }
176
177 spin_lock(&inode->i_lock);
178 spin_lock(&dcache_lock);
179
180 last = dentry; 172 last = dentry;
181 173
182 if (err < 0) 174 if (err < 0)
183 goto out_unlock; 175 goto out;
184 176
185 p = p->prev;
186 filp->f_pos++; 177 filp->f_pos++;
187 178
188 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ 179 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
189 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE)) 180 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
190 goto more; 181 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
191 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); 182 err = -EAGAIN;
192 err = -EAGAIN; 183 goto out;
184 }
185
186 spin_lock(&dcache_lock);
187 p = p->prev; /* advance to next dentry */
188 goto more;
193 189
194out_unlock: 190out_unlock:
195 spin_unlock(&dcache_lock); 191 spin_unlock(&dcache_lock);
196 192out:
197 if (last) { 193 if (last)
198 spin_unlock(&inode->i_lock);
199 dput(last); 194 dput(last);
200 spin_lock(&inode->i_lock);
201 }
202
203 return err; 195 return err;
204} 196}
205 197
@@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
227 struct ceph_file_info *fi = filp->private_data; 219 struct ceph_file_info *fi = filp->private_data;
228 struct inode *inode = filp->f_dentry->d_inode; 220 struct inode *inode = filp->f_dentry->d_inode;
229 struct ceph_inode_info *ci = ceph_inode(inode); 221 struct ceph_inode_info *ci = ceph_inode(inode);
230 struct ceph_client *client = ceph_inode_to_client(inode); 222 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
231 struct ceph_mds_client *mdsc = &client->mdsc; 223 struct ceph_mds_client *mdsc = fsc->mdsc;
232 unsigned frag = fpos_frag(filp->f_pos); 224 unsigned frag = fpos_frag(filp->f_pos);
233 int off = fpos_off(filp->f_pos); 225 int off = fpos_off(filp->f_pos);
234 int err; 226 int err;
235 u32 ftype; 227 u32 ftype;
236 struct ceph_mds_reply_info_parsed *rinfo; 228 struct ceph_mds_reply_info_parsed *rinfo;
237 const int max_entries = client->mount_args->max_readdir; 229 const int max_entries = fsc->mount_options->max_readdir;
238 const int max_bytes = client->mount_args->max_readdir_bytes; 230 const int max_bytes = fsc->mount_options->max_readdir_bytes;
239 231
240 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 232 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
241 if (fi->at_end) 233 if (fi->at_end)
@@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
267 /* can we use the dcache? */ 259 /* can we use the dcache? */
268 spin_lock(&inode->i_lock); 260 spin_lock(&inode->i_lock);
269 if ((filp->f_pos == 2 || fi->dentry) && 261 if ((filp->f_pos == 2 || fi->dentry) &&
270 !ceph_test_opt(client, NOASYNCREADDIR) && 262 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
271 ceph_snap(inode) != CEPH_SNAPDIR && 263 ceph_snap(inode) != CEPH_SNAPDIR &&
272 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
273 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 spin_unlock(&inode->i_lock);
274 err = __dcache_readdir(filp, dirent, filldir); 267 err = __dcache_readdir(filp, dirent, filldir);
275 if (err != -EAGAIN) { 268 if (err != -EAGAIN)
276 spin_unlock(&inode->i_lock);
277 return err; 269 return err;
278 } 270 } else {
271 spin_unlock(&inode->i_lock);
279 } 272 }
280 spin_unlock(&inode->i_lock);
281 if (fi->dentry) { 273 if (fi->dentry) {
282 err = note_last_dentry(fi, fi->dentry->d_name.name, 274 err = note_last_dentry(fi, fi->dentry->d_name.name,
283 fi->dentry->d_name.len); 275 fi->dentry->d_name.len);
@@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
487struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 479struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
488 struct dentry *dentry, int err) 480 struct dentry *dentry, int err)
489{ 481{
490 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 482 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
491 struct inode *parent = dentry->d_parent->d_inode; 483 struct inode *parent = dentry->d_parent->d_inode;
492 484
493 /* .snap dir? */ 485 /* .snap dir? */
494 if (err == -ENOENT && 486 if (err == -ENOENT &&
495 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
496 strcmp(dentry->d_name.name, 487 strcmp(dentry->d_name.name,
497 client->mount_args->snapdir_name) == 0) { 488 fsc->mount_options->snapdir_name) == 0) {
498 struct inode *inode = ceph_get_snapdir(parent); 489 struct inode *inode = ceph_get_snapdir(parent);
499 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", 490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
500 dentry, dentry->d_name.len, dentry->d_name.name, inode); 491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
539static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, 530static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
540 struct nameidata *nd) 531 struct nameidata *nd)
541{ 532{
542 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 533 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
543 struct ceph_mds_client *mdsc = &client->mdsc; 534 struct ceph_mds_client *mdsc = fsc->mdsc;
544 struct ceph_mds_request *req; 535 struct ceph_mds_request *req;
545 int op; 536 int op;
546 int err; 537 int err;
@@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
572 spin_lock(&dir->i_lock); 563 spin_lock(&dir->i_lock);
573 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); 564 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
574 if (strncmp(dentry->d_name.name, 565 if (strncmp(dentry->d_name.name,
575 client->mount_args->snapdir_name, 566 fsc->mount_options->snapdir_name,
576 dentry->d_name.len) && 567 dentry->d_name.len) &&
577 !is_root_ceph_dentry(dir, dentry) && 568 !is_root_ceph_dentry(dir, dentry) &&
578 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 569 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
629static int ceph_mknod(struct inode *dir, struct dentry *dentry, 620static int ceph_mknod(struct inode *dir, struct dentry *dentry,
630 int mode, dev_t rdev) 621 int mode, dev_t rdev)
631{ 622{
632 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 623 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
633 struct ceph_mds_client *mdsc = &client->mdsc; 624 struct ceph_mds_client *mdsc = fsc->mdsc;
634 struct ceph_mds_request *req; 625 struct ceph_mds_request *req;
635 int err; 626 int err;
636 627
@@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
685static int ceph_symlink(struct inode *dir, struct dentry *dentry, 676static int ceph_symlink(struct inode *dir, struct dentry *dentry,
686 const char *dest) 677 const char *dest)
687{ 678{
688 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 679 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
689 struct ceph_mds_client *mdsc = &client->mdsc; 680 struct ceph_mds_client *mdsc = fsc->mdsc;
690 struct ceph_mds_request *req; 681 struct ceph_mds_request *req;
691 int err; 682 int err;
692 683
@@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
716 707
717static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) 708static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
718{ 709{
719 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 710 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
720 struct ceph_mds_client *mdsc = &client->mdsc; 711 struct ceph_mds_client *mdsc = fsc->mdsc;
721 struct ceph_mds_request *req; 712 struct ceph_mds_request *req;
722 int err = -EROFS; 713 int err = -EROFS;
723 int op; 714 int op;
@@ -758,8 +749,8 @@ out:
758static int ceph_link(struct dentry *old_dentry, struct inode *dir, 749static int ceph_link(struct dentry *old_dentry, struct inode *dir,
759 struct dentry *dentry) 750 struct dentry *dentry)
760{ 751{
761 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 752 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
762 struct ceph_mds_client *mdsc = &client->mdsc; 753 struct ceph_mds_client *mdsc = fsc->mdsc;
763 struct ceph_mds_request *req; 754 struct ceph_mds_request *req;
764 int err; 755 int err;
765 756
@@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode)
813 */ 804 */
814static int ceph_unlink(struct inode *dir, struct dentry *dentry) 805static int ceph_unlink(struct inode *dir, struct dentry *dentry)
815{ 806{
816 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 807 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
817 struct ceph_mds_client *mdsc = &client->mdsc; 808 struct ceph_mds_client *mdsc = fsc->mdsc;
818 struct inode *inode = dentry->d_inode; 809 struct inode *inode = dentry->d_inode;
819 struct ceph_mds_request *req; 810 struct ceph_mds_request *req;
820 int err = -EROFS; 811 int err = -EROFS;
@@ -854,8 +845,8 @@ out:
854static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, 845static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
855 struct inode *new_dir, struct dentry *new_dentry) 846 struct inode *new_dir, struct dentry *new_dentry)
856{ 847{
857 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb); 848 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
858 struct ceph_mds_client *mdsc = &client->mdsc; 849 struct ceph_mds_client *mdsc = fsc->mdsc;
859 struct ceph_mds_request *req; 850 struct ceph_mds_request *req;
860 int err; 851 int err;
861 852
@@ -1021,11 +1012,15 @@ out_touch:
1021static void ceph_dentry_release(struct dentry *dentry) 1012static void ceph_dentry_release(struct dentry *dentry)
1022{ 1013{
1023 struct ceph_dentry_info *di = ceph_dentry(dentry); 1014 struct ceph_dentry_info *di = ceph_dentry(dentry);
1024 struct inode *parent_inode = dentry->d_parent->d_inode; 1015 struct inode *parent_inode = NULL;
1025 u64 snapid = ceph_snap(parent_inode); 1016 u64 snapid = CEPH_NOSNAP;
1026 1017
1018 if (!IS_ROOT(dentry)) {
1019 parent_inode = dentry->d_parent->d_inode;
1020 if (parent_inode)
1021 snapid = ceph_snap(parent_inode);
1022 }
1027 dout("dentry_release %p parent %p\n", dentry, parent_inode); 1023 dout("dentry_release %p parent %p\n", dentry, parent_inode);
1028
1029 if (parent_inode && snapid != CEPH_SNAPDIR) { 1024 if (parent_inode && snapid != CEPH_SNAPDIR) {
1030 struct ceph_inode_info *ci = ceph_inode(parent_inode); 1025 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1031 1026
@@ -1072,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1072 struct ceph_inode_info *ci = ceph_inode(inode); 1067 struct ceph_inode_info *ci = ceph_inode(inode);
1073 int left; 1068 int left;
1074 1069
1075 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1070 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1076 return -EISDIR; 1071 return -EISDIR;
1077 1072
1078 if (!cf->dir_info) { 1073 if (!cf->dir_info) {
@@ -1173,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
1173 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1168 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1174 dn->d_name.len, dn->d_name.name); 1169 dn->d_name.len, dn->d_name.name);
1175 if (di) { 1170 if (di) {
1176 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1171 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1177 spin_lock(&mdsc->dentry_lru_lock); 1172 spin_lock(&mdsc->dentry_lru_lock);
1178 list_add_tail(&di->lru, &mdsc->dentry_lru); 1173 list_add_tail(&di->lru, &mdsc->dentry_lru);
1179 mdsc->num_dentry++; 1174 mdsc->num_dentry++;
@@ -1189,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1189 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, 1184 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1190 dn->d_name.len, dn->d_name.name, di->offset); 1185 dn->d_name.len, dn->d_name.name, di->offset);
1191 if (di) { 1186 if (di) {
1192 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1187 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1193 spin_lock(&mdsc->dentry_lru_lock); 1188 spin_lock(&mdsc->dentry_lru_lock);
1194 list_move_tail(&di->lru, &mdsc->dentry_lru); 1189 list_move_tail(&di->lru, &mdsc->dentry_lru);
1195 spin_unlock(&mdsc->dentry_lru_lock); 1190 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1204,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
1204 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1199 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1205 dn->d_name.len, dn->d_name.name); 1200 dn->d_name.len, dn->d_name.name);
1206 if (di) { 1201 if (di) {
1207 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1202 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1208 spin_lock(&mdsc->dentry_lru_lock); 1203 spin_lock(&mdsc->dentry_lru_lock);
1209 list_del_init(&di->lru); 1204 list_del_init(&di->lru);
1210 mdsc->num_dentry--; 1205 mdsc->num_dentry--;
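Most of the churn in this file is mechanical: ceph_sb_to_client() now yields a struct ceph_fs_client, which points at a shared libceph ceph_client and at a separately allocated MDS client, so &client->mdsc becomes fsc->mdsc everywhere. Schematically (the *_sk names are illustrative stand-ins for the real types in super.h):

struct ceph_client_sk { int dummy; };     /* shared cluster connection */
struct ceph_mds_client_sk { int dummy; }; /* now allocated separately */

struct ceph_fs_client_sk {
	struct ceph_client_sk *client;
	struct ceph_mds_client_sk *mdsc;  /* pointer; was an embedded member */
};

static struct ceph_mds_client_sk *
sb_to_mdsc(struct ceph_fs_client_sk *fsc)
{
	return fsc->mdsc;  /* old code used &client->mdsc */
}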
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 4480cb1c63e7..2297d9426992 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/exportfs.h> 3#include <linux/exportfs.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <asm/unaligned.h> 5#include <asm/unaligned.h>
6 6
7#include "super.h" 7#include "super.h"
8#include "mds_client.h"
8 9
9/* 10/*
10 * NFS export support 11 * NFS export support
@@ -42,32 +43,37 @@ struct ceph_nfs_confh {
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, 43static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable) 44 int connectable)
44{ 45{
46 int type;
45 struct ceph_nfs_fh *fh = (void *)rawfh; 47 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh; 48 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent; 49 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode; 50 struct inode *inode = dentry->d_inode;
49 int type; 51 int connected_handle_length = sizeof(*cfh)/4;
52 int handle_length = sizeof(*fh)/4;
50 53
51 /* don't re-export snaps */ 54 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP) 55 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL; 56 return -EINVAL;
54 57
55 if (*max_len >= sizeof(*cfh)) { 58 if (*max_len >= connected_handle_length) {
56 dout("encode_fh %p connectable\n", dentry); 59 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode); 60 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode); 61 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash; 62 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh); 63 *max_len = connected_handle_length;
61 type = 2; 64 type = 2;
62 } else if (*max_len > sizeof(*fh)) { 65 } else if (*max_len >= handle_length) {
63 if (connectable) 66 if (connectable) {
64 return -ENOSPC; 67 *max_len = connected_handle_length;
68 return 255;
69 }
65 dout("encode_fh %p\n", dentry); 70 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode); 71 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh); 72 *max_len = handle_length;
68 type = 1; 73 type = 1;
69 } else { 74 } else {
70 return -ENOSPC; 75 *max_len = handle_length;
76 return 255;
71 } 77 }
72 return type; 78 return type;
73} 79}
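The reworked ceph_encode_fh() follows the exportfs convention that *max_len is measured in 32-bit words, and that an undersized buffer is reported by writing back the required length and returning 255 rather than -ENOSPC. A minimal sketch of that contract with a hypothetical one-field handle:

struct my_fh { u64 ino; };              /* hypothetical handle layout */

static int my_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
			int connectable)
{
	int handle_length = sizeof(struct my_fh) / 4;  /* in u32 words */

	if (*max_len < handle_length) {
		*max_len = handle_length;  /* tell the caller what it needs */
		return 255;
	}
	((struct my_fh *)rawfh)->ino = dentry->d_inode->i_ino;
	*max_len = handle_length;
	return 1;                          /* hypothetical handle type */
}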
@@ -115,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
115static struct dentry *__cfh_to_dentry(struct super_block *sb, 121static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh) 122 struct ceph_nfs_confh *cfh)
117{ 123{
118 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; 124 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
119 struct inode *inode; 125 struct inode *inode;
120 struct dentry *dentry; 126 struct dentry *dentry;
121 struct ceph_vino vino; 127 struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8c044a4f0457..e77c28cf3690 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/file.h> 6#include <linux/file.h>
@@ -38,8 +39,8 @@
38static struct ceph_mds_request * 39static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode) 40prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{ 41{
41 struct ceph_client *client = ceph_sb_to_client(sb); 42 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc; 43 struct ceph_mds_client *mdsc = fsc->mdsc;
43 struct ceph_mds_request *req; 44 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS; 45 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; 46 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
117int ceph_open(struct inode *inode, struct file *file) 118int ceph_open(struct inode *inode, struct file *file)
118{ 119{
119 struct ceph_inode_info *ci = ceph_inode(inode); 120 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 121 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc; 122 struct ceph_mds_client *mdsc = fsc->mdsc;
122 struct ceph_mds_request *req; 123 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data; 124 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 125 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode, 217 struct nameidata *nd, int mode,
217 int locked_dir) 218 int locked_dir)
218{ 219{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 220 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc; 221 struct ceph_mds_client *mdsc = fsc->mdsc;
221 struct file *file = nd->intent.open.file; 222 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); 223 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req; 224 struct ceph_mds_request *req;
@@ -270,163 +271,6 @@ int ceph_release(struct inode *inode, struct file *file)
270} 271}
271 272
272/* 273/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
318 * allocate a vector of new pages
319 */
320static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
365
366/*
367 * copy data from a page vector into a user pointer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
427
428
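The helpers removed here move into the shared libceph module as ceph_get_direct_page_vector(), ceph_put_page_vector(), ceph_release_page_vector(), ceph_copy_user_to_page_vector() and friends, as the call-site changes below show. A sketch of the typical buffered round trip, assuming the allocator keeps the shape of the ceph_alloc_page_vector() deleted above and that the declarations land in <linux/ceph/libceph.h>:

#include <linux/ceph/libceph.h>

static int stage_user_data(const char __user *data, size_t len, loff_t off)
{
	int num_pages = calc_pages_for(off, len);  /* as used below */
	struct page **pages;
	int ret;

	pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = ceph_copy_user_to_page_vector(pages, data, off, len);
	if (ret < 0)
		goto out;
	/* ... hand the pages to an OSD request here ... */
out:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}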
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over 274 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.) 275 * objects we stripe over. (That's not atomic, but good enough for now.)
432 * 276 *
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode,
438 struct page **pages, int num_pages, 282 struct page **pages, int num_pages,
439 int *checkeof) 283 int *checkeof)
440{ 284{
441 struct ceph_client *client = ceph_inode_to_client(inode); 285 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode); 286 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len; 287 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 288 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode,
459 303
460more: 304more:
461 this_len = left; 305 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), 306 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len, 307 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq, 308 ci->i_truncate_seq,
465 ci->i_truncate_size, 309 ci->i_truncate_size,
@@ -477,8 +321,8 @@ more:
477 321
478 if (read < pos - off) { 322 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos); 323 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read, 324 ceph_zero_page_vector_range(page_off + read,
481 pos - off - read, pages); 325 pos - off - read, pages);
482 } 326 }
483 pos += ret; 327 pos += ret;
484 read = pos - off; 328 read = pos - off;
@@ -495,8 +339,8 @@ more:
495 /* was original extent fully inside i_size? */ 339 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) { 340 if (pos + left <= inode->i_size) {
497 dout("zero tail\n"); 341 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read, 342 ceph_zero_page_vector_range(page_off + read, len - read,
499 pages); 343 pages);
500 read = len; 344 read = len;
501 goto out; 345 goto out;
502 } 346 }
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 375 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532 376
533 if (file->f_flags & O_DIRECT) { 377 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len); 378 pages = ceph_get_direct_page_vector(data, num_pages, off, len);
535 379
536 /* 380 /*
537 * flush any page cache pages in this range. this 381 * flush any page cache pages in this range. this
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof); 396 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553 397
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 398 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret); 399 ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0) 400 if (ret >= 0)
557 *poff = off + ret; 401 *poff = off + ret;
558 402
559done: 403done:
560 if (file->f_flags & O_DIRECT) 404 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages); 405 ceph_put_page_vector(pages, num_pages);
562 else 406 else
563 ceph_release_page_vector(pages, num_pages); 407 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret); 408 dout("sync_read result %d\n", ret);
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
594{ 438{
595 struct inode *inode = file->f_dentry->d_inode; 439 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode); 440 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode); 441 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req; 442 struct ceph_osd_request *req;
599 struct page **pages; 443 struct page **pages;
600 int num_pages; 444 int num_pages;
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
642 */ 486 */
643more: 487more:
644 len = left; 488 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, 489 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len, 490 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags, 491 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context, 492 ci->i_snap_realm->cached_context,
@@ -655,7 +499,7 @@ more:
655 num_pages = calc_pages_for(pos, len); 499 num_pages = calc_pages_for(pos, len);
656 500
657 if (file->f_flags & O_DIRECT) { 501 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len); 502 pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) { 503 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages); 504 ret = PTR_ERR(pages);
661 goto out; 505 goto out;
@@ -673,7 +517,7 @@ more:
673 ret = PTR_ERR(pages); 517 ret = PTR_ERR(pages);
674 goto out; 518 goto out;
675 } 519 }
676 ret = copy_user_to_page_vector(pages, data, pos, len); 520 ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
677 if (ret < 0) { 521 if (ret < 0) {
678 ceph_release_page_vector(pages, num_pages); 522 ceph_release_page_vector(pages, num_pages);
679 goto out; 523 goto out;
@@ -689,7 +533,7 @@ more:
689 req->r_num_pages = num_pages; 533 req->r_num_pages = num_pages;
690 req->r_inode = inode; 534 req->r_inode = inode;
691 535
692 ret = ceph_osdc_start_request(&client->osdc, req, false); 536 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
693 if (!ret) { 537 if (!ret) {
694 if (req->r_safe_callback) { 538 if (req->r_safe_callback) {
695 /* 539 /*
@@ -697,15 +541,15 @@ more:
697 * start_request so that a tid has been assigned. 541 * start_request so that a tid has been assigned.
698 */ 542 */
699 spin_lock(&ci->i_unsafe_lock); 543 spin_lock(&ci->i_unsafe_lock);
700 list_add(&ci->i_unsafe_writes, &req->r_unsafe_item); 544 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
701 spin_unlock(&ci->i_unsafe_lock); 545 spin_unlock(&ci->i_unsafe_lock);
702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 546 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
703 } 547 }
704 ret = ceph_osdc_wait_request(&client->osdc, req); 548 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
705 } 549 }
706 550
707 if (file->f_flags & O_DIRECT) 551 if (file->f_flags & O_DIRECT)
708 put_page_vector(pages, num_pages); 552 ceph_put_page_vector(pages, num_pages);
709 else if (file->f_flags & O_SYNC) 553 else if (file->f_flags & O_SYNC)
710 ceph_release_page_vector(pages, num_pages); 554 ceph_release_page_vector(pages, num_pages);
711 555
@@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
814 struct ceph_file_info *fi = file->private_data; 658 struct ceph_file_info *fi = file->private_data;
815 struct inode *inode = file->f_dentry->d_inode; 659 struct inode *inode = file->f_dentry->d_inode;
816 struct ceph_inode_info *ci = ceph_inode(inode); 660 struct ceph_inode_info *ci = ceph_inode(inode);
817 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 661 struct ceph_osd_client *osdc =
662 &ceph_sb_to_client(inode->i_sb)->client->osdc;
818 loff_t endoff = pos + iov->iov_len; 663 loff_t endoff = pos + iov->iov_len;
819 int want, got = 0; 664 int want, got = 0;
820 int ret, err; 665 int ret, err;
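One substantive fix above is easy to miss: the unsafe-write tracking used list_add(&ci->i_unsafe_writes, &req->r_unsafe_item), with the head and the new entry swapped. list_add(new, head) inserts new right after head, so the old call spliced the inode's list head into the request. A tiny illustration of the correct orientation (struct names hypothetical):

#include <linux/list.h>

struct unsafe_write {                  /* hypothetical element */
	struct list_head item;
};

static void track_unsafe_write(struct list_head *inode_list,
			       struct unsafe_write *w)
{
	/* list_add(new, head): new entry first, list head second */
	list_add(&w->item, inode_list);
}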
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e7cca414da03..1d6a45b5a04c 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
@@ -13,7 +13,8 @@
13#include <linux/pagevec.h> 13#include <linux/pagevec.h>
14 14
15#include "super.h" 15#include "super.h"
16#include "decode.h" 16#include "mds_client.h"
17#include <linux/ceph/decode.h>
17 18
18/* 19/*
19 * Ceph inode operations 20 * Ceph inode operations
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 385 */
385 if (ci->i_snap_realm) { 386 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 387 struct ceph_mds_client *mdsc =
387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 388 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 389 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 390
390 dout(" dropping residual ref to snap realm %p\n", realm); 391 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode,
685 } 686 }
686 687
687 /* it may be better to set st_size in getattr instead? */ 688 /* it may be better to set st_size in getattr instead? */
688 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) 689 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
689 inode->i_size = ci->i_rbytes; 690 inode->i_size = ci->i_rbytes;
690 break; 691 break;
691 default: 692 default:
@@ -845,7 +846,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
845 * the caller) if we fail. 846 * the caller) if we fail.
846 */ 847 */
847static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, 848static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
848 bool *prehash) 849 bool *prehash, bool set_offset)
849{ 850{
850 struct dentry *realdn; 851 struct dentry *realdn;
851 852
@@ -877,7 +878,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
877 } 878 }
878 if ((!prehash || *prehash) && d_unhashed(dn)) 879 if ((!prehash || *prehash) && d_unhashed(dn))
879 d_rehash(dn); 880 d_rehash(dn);
880 ceph_set_dentry_offset(dn); 881 if (set_offset)
882 ceph_set_dentry_offset(dn);
881out: 883out:
882 return dn; 884 return dn;
883} 885}
@@ -900,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
900 struct inode *in = NULL; 902 struct inode *in = NULL;
901 struct ceph_mds_reply_inode *ininfo; 903 struct ceph_mds_reply_inode *ininfo;
902 struct ceph_vino vino; 904 struct ceph_vino vino;
903 struct ceph_client *client = ceph_sb_to_client(sb); 905 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
904 int i = 0; 906 int i = 0;
905 int err = 0; 907 int err = 0;
906 908
@@ -964,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
964 */ 966 */
965 if (rinfo->head->is_dentry && !req->r_aborted && 967 if (rinfo->head->is_dentry && !req->r_aborted &&
966 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, 968 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
967 client->mount_args->snapdir_name, 969 fsc->mount_options->snapdir_name,
968 req->r_dentry->d_name.len))) { 970 req->r_dentry->d_name.len))) {
969 /* 971 /*
970 * lookup link rename : null -> possibly existing inode 972 * lookup link rename : null -> possibly existing inode
@@ -1062,7 +1064,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1062 d_delete(dn); 1064 d_delete(dn);
1063 goto done; 1065 goto done;
1064 } 1066 }
1065 dn = splice_dentry(dn, in, &have_lease); 1067 dn = splice_dentry(dn, in, &have_lease, true);
1066 if (IS_ERR(dn)) { 1068 if (IS_ERR(dn)) {
1067 err = PTR_ERR(dn); 1069 err = PTR_ERR(dn);
1068 goto done; 1070 goto done;
@@ -1105,7 +1107,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1105 goto done; 1107 goto done;
1106 } 1108 }
1107 dout(" linking snapped dir %p to dn %p\n", in, dn); 1109 dout(" linking snapped dir %p to dn %p\n", in, dn);
1108 dn = splice_dentry(dn, in, NULL); 1110 dn = splice_dentry(dn, in, NULL, true);
1109 if (IS_ERR(dn)) { 1111 if (IS_ERR(dn)) {
1110 err = PTR_ERR(dn); 1112 err = PTR_ERR(dn);
1111 goto done; 1113 goto done;
@@ -1237,7 +1239,7 @@ retry_lookup:
1237 err = PTR_ERR(in); 1239 err = PTR_ERR(in);
1238 goto out; 1240 goto out;
1239 } 1241 }
1240 dn = splice_dentry(dn, in, NULL); 1242 dn = splice_dentry(dn, in, NULL, false);
1241 if (IS_ERR(dn)) 1243 if (IS_ERR(dn))
1242 dn = NULL; 1244 dn = NULL;
1243 } 1245 }
@@ -1532,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1532 struct inode *parent_inode = dentry->d_parent->d_inode; 1534 struct inode *parent_inode = dentry->d_parent->d_inode;
1533 const unsigned int ia_valid = attr->ia_valid; 1535 const unsigned int ia_valid = attr->ia_valid;
1534 struct ceph_mds_request *req; 1536 struct ceph_mds_request *req;
1535 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; 1537 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
1536 int issued; 1538 int issued;
1537 int release = 0, dirtied = 0; 1539 int release = 0, dirtied = 0;
1538 int mask = 0; 1540 int mask = 0;
@@ -1727,8 +1729,8 @@ out:
1727 */ 1729 */
1728int ceph_do_getattr(struct inode *inode, int mask) 1730int ceph_do_getattr(struct inode *inode, int mask)
1729{ 1731{
1730 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 1732 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
1731 struct ceph_mds_client *mdsc = &client->mdsc; 1733 struct ceph_mds_client *mdsc = fsc->mdsc;
1732 struct ceph_mds_request *req; 1734 struct ceph_mds_request *req;
1733 int err; 1735 int err;
1734 1736
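splice_dentry() gains a set_offset flag so that readdir prepopulation (the splice_dentry(dn, in, NULL, false) call above) can skip ceph_set_dentry_offset() for dentries whose offsets it assigns itself. Roughly, and with set_dentry_offset() as a hypothetical stand-in for the real helper:

#include <linux/dcache.h>

static struct dentry *splice_sketch(struct dentry *dn, struct inode *in,
				    bool *prehash, bool set_offset)
{
	struct dentry *realdn = d_materialise_unique(dn, in);

	if (IS_ERR(realdn))
		return realdn;           /* caller handles the error */
	if (realdn) {
		dput(dn);                /* an existing alias won */
		dn = realdn;
	}
	if ((!prehash || *prehash) && d_unhashed(dn))
		d_rehash(dn);
	if (set_offset)
		set_dentry_offset(dn);   /* hypothetical helper */
	return dn;
}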
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 76e307d2aba1..8888c9ba68db 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
1#include <linux/in.h> 1#include <linux/in.h>
2 2
3#include "ioctl.h"
4#include "super.h" 3#include "super.h"
5#include "ceph_debug.h" 4#include "mds_client.h"
5#include <linux/ceph/ceph_debug.h>
6
7#include "ioctl.h"
6 8
7 9
8/* 10/*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{ 39{
38 struct inode *inode = file->f_dentry->d_inode; 40 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 41 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 42 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req; 43 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l; 44 struct ceph_ioctl_layout l;
43 int err, i; 45 int err, i;
@@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
90} 92}
91 93
92/* 94/*
 95 * Set a layout policy on a directory inode. All items in the tree
 96 * rooted at this inode will inherit this layout on creation
 97 * (it does not apply retroactively), unless a subdirectory has
 98 * its own layout policy.
99 */
100static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
101{
102 struct inode *inode = file->f_dentry->d_inode;
103 struct ceph_mds_request *req;
104 struct ceph_ioctl_layout l;
105 int err, i;
106 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
107
108 /* copy and validate */
109 if (copy_from_user(&l, arg, sizeof(l)))
110 return -EFAULT;
111
112 if ((l.object_size & ~PAGE_MASK) ||
113 (l.stripe_unit & ~PAGE_MASK) ||
114 !l.stripe_unit ||
115 (l.object_size &&
116 (unsigned)l.object_size % (unsigned)l.stripe_unit))
117 return -EINVAL;
118
119 /* make sure it's a valid data pool */
120 if (l.data_pool > 0) {
121 mutex_lock(&mdsc->mutex);
122 err = -EINVAL;
123 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
124 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
125 err = 0;
126 break;
127 }
128 mutex_unlock(&mdsc->mutex);
129 if (err)
130 return err;
131 }
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
134 USE_AUTH_MDS);
135
136 if (IS_ERR(req))
137 return PTR_ERR(req);
138 req->r_inode = igrab(inode);
139
140 req->r_args.setlayout.layout.fl_stripe_unit =
141 cpu_to_le32(l.stripe_unit);
142 req->r_args.setlayout.layout.fl_stripe_count =
143 cpu_to_le32(l.stripe_count);
144 req->r_args.setlayout.layout.fl_object_size =
145 cpu_to_le32(l.object_size);
146 req->r_args.setlayout.layout.fl_pg_pool =
147 cpu_to_le32(l.data_pool);
148 req->r_args.setlayout.layout.fl_pg_preferred =
149 cpu_to_le32(l.preferred_osd);
150
151 err = ceph_mdsc_do_request(mdsc, inode, req);
152 ceph_mdsc_put_request(req);
153 return err;
154}
155
156/*
93 * Return object name, size/offset information, and location (OSD 157 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset. 158 * number, network address) for a given file offset.
95 */ 159 */
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 162 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 163 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 164 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 165 struct ceph_osd_client *osdc =
166 &ceph_sb_to_client(inode->i_sb)->client->osdc;
102 u64 len = 1, olen; 167 u64 len = 1, olen;
103 u64 tmp; 168 u64 tmp;
104 struct ceph_object_layout ol; 169 struct ceph_object_layout ol;
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
174 case CEPH_IOC_SET_LAYOUT: 239 case CEPH_IOC_SET_LAYOUT:
175 return ceph_ioctl_set_layout(file, (void __user *)arg); 240 return ceph_ioctl_set_layout(file, (void __user *)arg);
176 241
242 case CEPH_IOC_SET_LAYOUT_POLICY:
243 return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
244
177 case CEPH_IOC_GET_DATALOC: 245 case CEPH_IOC_GET_DATALOC:
178 return ceph_ioctl_get_dataloc(file, (void __user *)arg); 246 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
179 247
180 case CEPH_IOC_LAZYIO: 248 case CEPH_IOC_LAZYIO:
181 return ceph_ioctl_lazyio(file); 249 return ceph_ioctl_lazyio(file);
182 } 250 }
251
183 return -ENOTTY; 252 return -ENOTTY;
184} 253}
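From userspace the new policy ioctl is issued on a directory fd; per the validation above, stripe_unit and object_size must be page-aligned and object_size a multiple of stripe_unit. A hypothetical caller follows; the struct and macro are re-declared here only for illustration (field order assumed from the setters above), and real users should include the ceph ioctl header defined next:

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>

struct ceph_ioctl_layout {               /* mirrors fs/ceph/ioctl.h */
	uint64_t stripe_unit, stripe_count, object_size;
	uint64_t data_pool;
	int64_t preferred_osd;
};
#define CEPH_IOCTL_MAGIC	0x98
#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
					struct ceph_ioctl_layout)

int set_dir_layout_policy(const char *path)
{
	struct ceph_ioctl_layout l = {
		.stripe_unit  = 4194304,  /* page-aligned */
		.stripe_count = 1,
		.object_size  = 4194304,  /* multiple of stripe_unit */
		.data_pool    = 1,        /* validated against the mdsmap */
	};
	int ret, fd = open(path, O_RDONLY | O_DIRECTORY);

	if (fd < 0)
		return -1;
	ret = ioctl(fd, CEPH_IOC_SET_LAYOUT_POLICY, &l);
	close(fd);
	return ret;
}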
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 88451a3b6857..a6ce54e94eb5 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
4#include <linux/ioctl.h> 4#include <linux/ioctl.h>
5#include <linux/types.h> 5#include <linux/types.h>
6 6
7#define CEPH_IOCTL_MAGIC 0x97 7#define CEPH_IOCTL_MAGIC 0x98
8 8
9/* just use u64 to align sanely on all archs */ 9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout { 10struct ceph_ioctl_layout {
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
17 struct ceph_ioctl_layout) 17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ 18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout) 19 struct ceph_ioctl_layout)
20#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
21 struct ceph_ioctl_layout)
20 22
21/* 23/*
22 * Extract identity, address of the OSD and object storing a given 24 * Extract identity, address of the OSD and object storing a given
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ff4e753aae92..40abde93c345 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,11 +1,11 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/namei.h> 4#include <linux/namei.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "mds_client.h" 7#include "mds_client.h"
8#include "pagelist.h" 8#include <linux/ceph/pagelist.h>
9 9
10/** 10/**
11 * Implement fcntl and flock locking functions. 11 * Implement fcntl and flock locking functions.
@@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
16{ 16{
17 struct inode *inode = file->f_dentry->d_inode; 17 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc = 18 struct ceph_mds_client *mdsc =
19 &ceph_sb_to_client(inode->i_sb)->mdsc; 19 ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req; 20 struct ceph_mds_request *req;
21 int err; 21 int err;
22 22
@@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
181 * Encode the flock and fcntl locks for the given inode into the pagelist. 181 * Encode the flock and fcntl locks for the given inode into the pagelist.
182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks, 182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
183 * sequential flock locks. 183 * sequential flock locks.
184 * Must be called with BLK already held, and the lock numbers should have 184 * Must be called with lock_flocks() already held.
185 * been gathered under the same lock holding window. 185 * If we encounter more of a specific lock type than expected,
 186 * we return -ENOSPC.
186 */ 187 */
187int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, 188int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
188 int num_fcntl_locks, int num_flock_locks) 189 int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
190 struct file_lock *lock; 191 struct file_lock *lock;
191 struct ceph_filelock cephlock; 192 struct ceph_filelock cephlock;
192 int err = 0; 193 int err = 0;
194 int seen_fcntl = 0;
195 int seen_flock = 0;
193 196
194 dout("encoding %d flock and %d fcntl locks", num_flock_locks, 197 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
195 num_fcntl_locks); 198 num_fcntl_locks);
@@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
198 goto fail; 201 goto fail;
199 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 202 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
200 if (lock->fl_flags & FL_POSIX) { 203 if (lock->fl_flags & FL_POSIX) {
204 ++seen_fcntl;
205 if (seen_fcntl > num_fcntl_locks) {
206 err = -ENOSPC;
207 goto fail;
208 }
201 err = lock_to_ceph_filelock(lock, &cephlock); 209 err = lock_to_ceph_filelock(lock, &cephlock);
202 if (err) 210 if (err)
203 goto fail; 211 goto fail;
@@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
213 goto fail; 221 goto fail;
214 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 222 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
215 if (lock->fl_flags & FL_FLOCK) { 223 if (lock->fl_flags & FL_FLOCK) {
224 ++seen_flock;
225 if (seen_flock > num_flock_locks) {
226 err = -ENOSPC;
227 goto fail;
228 }
216 err = lock_to_ceph_filelock(lock, &cephlock); 229 err = lock_to_ceph_filelock(lock, &cephlock);
217 if (err) 230 if (err)
218 goto fail; 231 goto fail;
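The new seen_fcntl/seen_flock guards matter because the counts are taken in a separate locking window from the encode: the buffer is sized (which may sleep) between counting and encoding, so locks can be added in the meantime. Returning -ENOSPC lets the caller retry from the count, as encode_caps_cb() in mds_client.c does below. A rough sketch of that caller-side loop (sizing simplified; the real code also reserves room for the two count words):

static int encode_inode_locks(struct inode *inode,
			      struct ceph_pagelist *pagelist)
{
	int num_fcntl, num_flock;
	int err;

	do {
		lock_flocks();
		ceph_count_locks(inode, &num_fcntl, &num_flock);
		unlock_flocks();

		/* size the buffer outside the lock; may sleep */
		err = ceph_pagelist_reserve(pagelist,
				(num_fcntl + num_flock) *
				sizeof(struct ceph_filelock));
		if (err)
			return err;

		lock_flocks();
		err = ceph_encode_locks(inode, pagelist,
					num_fcntl, num_flock);
		unlock_flocks();
	} while (err == -ENOSPC);  /* locks changed while unlocked */

	return err;
}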
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f091b1351786..3142b15940c2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,17 +1,21 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/fs.h>
3#include <linux/wait.h> 4#include <linux/wait.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
6#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
7 10
8#include "mds_client.h"
9#include "mon_client.h"
10#include "super.h" 11#include "super.h"
11#include "messenger.h" 12#include "mds_client.h"
12#include "decode.h" 13
13#include "auth.h" 14#include <linux/ceph/messenger.h>
14#include "pagelist.h" 15#include <linux/ceph/decode.h>
16#include <linux/ceph/pagelist.h>
17#include <linux/ceph/auth.h>
18#include <linux/ceph/debugfs.h>
15 19
16/* 20/*
17 * A cluster of MDS (metadata server) daemons is responsible for 21 * A cluster of MDS (metadata server) daemons is responsible for
@@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
286 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); 290 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
287 if (atomic_dec_and_test(&s->s_ref)) { 291 if (atomic_dec_and_test(&s->s_ref)) {
288 if (s->s_authorizer) 292 if (s->s_authorizer)
289 s->s_mdsc->client->monc.auth->ops->destroy_authorizer( 293 s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
290 s->s_mdsc->client->monc.auth, s->s_authorizer); 294 s->s_mdsc->fsc->client->monc.auth,
295 s->s_authorizer);
291 kfree(s); 296 kfree(s);
292 } 297 }
293} 298}
@@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
344 s->s_seq = 0; 349 s->s_seq = 0;
345 mutex_init(&s->s_mutex); 350 mutex_init(&s->s_mutex);
346 351
347 ceph_con_init(mdsc->client->msgr, &s->s_con); 352 ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
348 s->s_con.private = s; 353 s->s_con.private = s;
349 s->s_con.ops = &mds_con_ops; 354 s->s_con.ops = &mds_con_ops;
350 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; 355 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
599 } else if (req->r_dentry) { 604 } else if (req->r_dentry) {
600 struct inode *dir = req->r_dentry->d_parent->d_inode; 605 struct inode *dir = req->r_dentry->d_parent->d_inode;
601 606
602 if (dir->i_sb != mdsc->client->sb) { 607 if (dir->i_sb != mdsc->fsc->sb) {
603 /* not this fs! */ 608 /* not this fs! */
604 inode = req->r_dentry->d_inode; 609 inode = req->r_dentry->d_inode;
605 } else if (ceph_snap(dir) != CEPH_NOSNAP) { 610 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
884 __ceph_remove_cap(cap); 889 __ceph_remove_cap(cap);
885 if (!__ceph_is_any_real_caps(ci)) { 890 if (!__ceph_is_any_real_caps(ci)) {
886 struct ceph_mds_client *mdsc = 891 struct ceph_mds_client *mdsc =
887 &ceph_sb_to_client(inode->i_sb)->mdsc; 892 ceph_sb_to_client(inode->i_sb)->mdsc;
888 893
889 spin_lock(&mdsc->cap_dirty_lock); 894 spin_lock(&mdsc->cap_dirty_lock);
890 if (!list_empty(&ci->i_dirty_item)) { 895 if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1146 struct ceph_msg *msg, *partial = NULL; 1151 struct ceph_msg *msg, *partial = NULL;
1147 struct ceph_mds_cap_release *head; 1152 struct ceph_mds_cap_release *head;
1148 int err = -ENOMEM; 1153 int err = -ENOMEM;
1149 int extra = mdsc->client->mount_args->cap_release_safety; 1154 int extra = mdsc->fsc->mount_options->cap_release_safety;
1150 int num; 1155 int num;
1151 1156
1152 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, 1157 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2085 2090
2086 /* insert trace into our cache */ 2091 /* insert trace into our cache */
2087 mutex_lock(&req->r_fill_mutex); 2092 mutex_lock(&req->r_fill_mutex);
2088 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 2093 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2089 if (err == 0) { 2094 if (err == 0) {
2090 if (result == 0 && rinfo->dir_nr) 2095 if (result == 0 && rinfo->dir_nr)
2091 ceph_readdir_prepopulate(req, req->r_session); 2096 ceph_readdir_prepopulate(req, req->r_session);
@@ -2361,19 +2366,37 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2361 2366
2362 if (recon_state->flock) { 2367 if (recon_state->flock) {
2363 int num_fcntl_locks, num_flock_locks; 2368 int num_fcntl_locks, num_flock_locks;
2364 2369 struct ceph_pagelist_cursor trunc_point;
2365 lock_kernel(); 2370
2366 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2371 ceph_pagelist_set_cursor(pagelist, &trunc_point);
2367 rec.v2.flock_len = (2*sizeof(u32) + 2372 do {
2368 (num_fcntl_locks+num_flock_locks) * 2373 lock_flocks();
2369 sizeof(struct ceph_filelock)); 2374 ceph_count_locks(inode, &num_fcntl_locks,
2370 2375 &num_flock_locks);
2376 rec.v2.flock_len = (2*sizeof(u32) +
2377 (num_fcntl_locks+num_flock_locks) *
2378 sizeof(struct ceph_filelock));
2379 unlock_flocks();
2380
2381 /* pre-alloc pagelist */
2382 ceph_pagelist_truncate(pagelist, &trunc_point);
2383 err = ceph_pagelist_append(pagelist, &rec, reclen);
2384 if (!err)
2385 err = ceph_pagelist_reserve(pagelist,
2386 rec.v2.flock_len);
2387
2388 /* encode locks */
2389 if (!err) {
2390 lock_flocks();
2391 err = ceph_encode_locks(inode,
2392 pagelist,
2393 num_fcntl_locks,
2394 num_flock_locks);
2395 unlock_flocks();
2396 }
2397 } while (err == -ENOSPC);
2398 } else {
2371 err = ceph_pagelist_append(pagelist, &rec, reclen); 2399 err = ceph_pagelist_append(pagelist, &rec, reclen);
2372 if (!err)
2373 err = ceph_encode_locks(inode, pagelist,
2374 num_fcntl_locks,
2375 num_flock_locks);
2376 unlock_kernel();
2377 } 2400 }
2378 2401
2379out_free: 2402out_free:
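This hunk drops the big kernel lock in favour of lock_flocks(), which opens a window: the pagelist reservation can sleep, so the lock count taken under the spinlock may be stale by the time the locks are encoded. The new do/while handles that by saving a pagelist cursor, truncating back to it, and retrying whenever ceph_encode_locks() reports -ENOSPC. A runnable user-space sketch of that snapshot-and-retry shape (the counters and the simulated race are illustrative):

	#include <errno.h>
	#include <stdio.h>

	static int nlocks = 3;	/* pretend lock-list length */
	static int raced = 1;	/* one lock will appear mid-flight */

	static int count_locks(void) { return nlocks; }

	static int encode(int budget)
	{
		return nlocks > budget ? -ENOSPC : 0;
	}

	int main(void)
	{
		int err;

		do {
			int budget = count_locks();	/* snapshot under the lock */
			if (raced) {		/* a lock sneaks in while the */
				nlocks++;	/* reservation sleeps */
				raced = 0;
			}
			/* in the kernel: truncate pagelist, reserve, encode */
			err = encode(budget);
		} while (err == -ENOSPC);
		printf("encoded %d locks\n", nlocks);
		return 0;
	}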
@@ -2611,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2611 struct ceph_mds_session *session, 2634 struct ceph_mds_session *session,
2612 struct ceph_msg *msg) 2635 struct ceph_msg *msg)
2613{ 2636{
2614 struct super_block *sb = mdsc->client->sb; 2637 struct super_block *sb = mdsc->fsc->sb;
2615 struct inode *inode; 2638 struct inode *inode;
2616 struct ceph_inode_info *ci; 2639 struct ceph_inode_info *ci;
2617 struct dentry *parent, *dentry; 2640 struct dentry *parent, *dentry;
@@ -2889,10 +2912,16 @@ static void delayed_work(struct work_struct *work)
2889 schedule_delayed(mdsc); 2912 schedule_delayed(mdsc);
2890} 2913}
2891 2914
2915int ceph_mdsc_init(struct ceph_fs_client *fsc)
2892 2916
2893int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2894{ 2917{
2895 mdsc->client = client; 2918 struct ceph_mds_client *mdsc;
2919
2920 mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
2921 if (!mdsc)
2922 return -ENOMEM;
2923 mdsc->fsc = fsc;
2924 fsc->mdsc = mdsc;
2896 mutex_init(&mdsc->mutex); 2925 mutex_init(&mdsc->mutex);
2897 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2926 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2898 if (mdsc->mdsmap == NULL) 2927 if (mdsc->mdsmap == NULL)
@@ -2925,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2925 INIT_LIST_HEAD(&mdsc->dentry_lru); 2954 INIT_LIST_HEAD(&mdsc->dentry_lru);
2926 2955
2927 ceph_caps_init(mdsc); 2956 ceph_caps_init(mdsc);
2928 ceph_adjust_min_caps(mdsc, client->min_caps); 2957 ceph_adjust_min_caps(mdsc, fsc->min_caps);
2929 2958
2930 return 0; 2959 return 0;
2931} 2960}
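Note the ownership change here: ceph_mdsc_init() used to initialize a ceph_mds_client embedded in the caller's structure, but now allocates it with kzalloc() and links it to the ceph_fs_client in both directions; teardown moves to the new ceph_mdsc_destroy() added further down, which stops the client and then frees it. A hedged sketch of how a mount path would pair the two (the caller names are hypothetical):

	/* hypothetical mount/umount pairing; error paths trimmed */
	static int example_mount(struct ceph_fs_client *fsc)
	{
		int err = ceph_mdsc_init(fsc);	/* kzallocs fsc->mdsc, links both ways */
		if (err)
			return err;
		/* ... use fsc->mdsc ... */
		return 0;
	}

	static void example_umount(struct ceph_fs_client *fsc)
	{
		ceph_mdsc_destroy(fsc);		/* stops, unlinks, kfree()s */
	}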
@@ -2937,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2937static void wait_requests(struct ceph_mds_client *mdsc) 2966static void wait_requests(struct ceph_mds_client *mdsc)
2938{ 2967{
2939 struct ceph_mds_request *req; 2968 struct ceph_mds_request *req;
2940 struct ceph_client *client = mdsc->client; 2969 struct ceph_fs_client *fsc = mdsc->fsc;
2941 2970
2942 mutex_lock(&mdsc->mutex); 2971 mutex_lock(&mdsc->mutex);
2943 if (__get_oldest_req(mdsc)) { 2972 if (__get_oldest_req(mdsc)) {
@@ -2945,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
2945 2974
2946 dout("wait_requests waiting for requests\n"); 2975 dout("wait_requests waiting for requests\n");
2947 wait_for_completion_timeout(&mdsc->safe_umount_waiters, 2976 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2948 client->mount_args->mount_timeout * HZ); 2977 fsc->client->options->mount_timeout * HZ);
2949 2978
2950 /* tear down remaining requests */ 2979 /* tear down remaining requests */
2951 mutex_lock(&mdsc->mutex); 2980 mutex_lock(&mdsc->mutex);
@@ -3028,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3028{ 3057{
3029 u64 want_tid, want_flush; 3058 u64 want_tid, want_flush;
3030 3059
3031 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) 3060 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3032 return; 3061 return;
3033 3062
3034 dout("sync\n"); 3063 dout("sync\n");
@@ -3051,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
3051{ 3080{
3052 int i, n = 0; 3081 int i, n = 0;
3053 3082
3054 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) 3083 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3055 return true; 3084 return true;
3056 3085
3057 mutex_lock(&mdsc->mutex); 3086 mutex_lock(&mdsc->mutex);
@@ -3069,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3069{ 3098{
3070 struct ceph_mds_session *session; 3099 struct ceph_mds_session *session;
3071 int i; 3100 int i;
3072 struct ceph_client *client = mdsc->client; 3101 struct ceph_fs_client *fsc = mdsc->fsc;
3073 unsigned long timeout = client->mount_args->mount_timeout * HZ; 3102 unsigned long timeout = fsc->client->options->mount_timeout * HZ;
3074 3103
3075 dout("close_sessions\n"); 3104 dout("close_sessions\n");
3076 3105
@@ -3117,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3117 dout("stopped\n"); 3146 dout("stopped\n");
3118} 3147}
3119 3148
3120void ceph_mdsc_stop(struct ceph_mds_client *mdsc) 3149static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3121{ 3150{
3122 dout("stop\n"); 3151 dout("stop\n");
3123 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ 3152 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3127,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3127 ceph_caps_finalize(mdsc); 3156 ceph_caps_finalize(mdsc);
3128} 3157}
3129 3158
3159void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3160{
3161 struct ceph_mds_client *mdsc = fsc->mdsc;
3162
3163 ceph_mdsc_stop(mdsc);
3164 fsc->mdsc = NULL;
3165 kfree(mdsc);
3166}
3167
3130 3168
3131/* 3169/*
3132 * handle mds map update. 3170 * handle mds map update.
@@ -3143,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3143 3181
3144 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); 3182 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
3145 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 3183 ceph_decode_copy(&p, &fsid, sizeof(fsid));
3146 if (ceph_check_fsid(mdsc->client, &fsid) < 0) 3184 if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
3147 return; 3185 return;
3148 epoch = ceph_decode_32(&p); 3186 epoch = ceph_decode_32(&p);
3149 maplen = ceph_decode_32(&p); 3187 maplen = ceph_decode_32(&p);
3150 dout("handle_map epoch %u len %d\n", epoch, (int)maplen); 3188 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
3151 3189
3152 /* do we need it? */ 3190 /* do we need it? */
3153 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch); 3191 ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
3154 mutex_lock(&mdsc->mutex); 3192 mutex_lock(&mdsc->mutex);
3155 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { 3193 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
3156 dout("handle_map epoch %u <= our %u\n", 3194 dout("handle_map epoch %u <= our %u\n",
@@ -3174,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3174 } else { 3212 } else {
3175 mdsc->mdsmap = newmap; /* first mds map */ 3213 mdsc->mdsmap = newmap; /* first mds map */
3176 } 3214 }
3177 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; 3215 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
3178 3216
3179 __wake_requests(mdsc, &mdsc->waiting_for_map); 3217 __wake_requests(mdsc, &mdsc->waiting_for_map);
3180 3218
@@ -3275,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con,
3275{ 3313{
3276 struct ceph_mds_session *s = con->private; 3314 struct ceph_mds_session *s = con->private;
3277 struct ceph_mds_client *mdsc = s->s_mdsc; 3315 struct ceph_mds_client *mdsc = s->s_mdsc;
3278 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3316 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3279 int ret = 0; 3317 int ret = 0;
3280 3318
3281 if (force_new && s->s_authorizer) { 3319 if (force_new && s->s_authorizer) {
@@ -3309,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
3309{ 3347{
3310 struct ceph_mds_session *s = con->private; 3348 struct ceph_mds_session *s = con->private;
3311 struct ceph_mds_client *mdsc = s->s_mdsc; 3349 struct ceph_mds_client *mdsc = s->s_mdsc;
3312 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3350 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3313 3351
3314 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); 3352 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3315} 3353}
@@ -3318,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
3318{ 3356{
3319 struct ceph_mds_session *s = con->private; 3357 struct ceph_mds_session *s = con->private;
3320 struct ceph_mds_client *mdsc = s->s_mdsc; 3358 struct ceph_mds_client *mdsc = s->s_mdsc;
3321 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3359 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3322 3360
3323 if (ac->ops->invalidate_authorizer) 3361 if (ac->ops->invalidate_authorizer)
3324 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); 3362 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3325 3363
3326 return ceph_monc_validate_auth(&mdsc->client->monc); 3364 return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
3327} 3365}
3328 3366
3329static const struct ceph_connection_operations mds_con_ops = { 3367static const struct ceph_connection_operations mds_con_ops = {
@@ -3336,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = {
3336 .peer_reset = peer_reset, 3374 .peer_reset = peer_reset,
3337}; 3375};
3338 3376
3339
3340
3341
3342/* eof */ 3377/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c98267ce6d2a..d66d63c72355 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
8#include <linux/rbtree.h> 8#include <linux/rbtree.h>
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10 10
11#include "types.h" 11#include <linux/ceph/types.h>
12#include "messenger.h" 12#include <linux/ceph/messenger.h>
13#include "mdsmap.h" 13#include <linux/ceph/mdsmap.h>
14 14
15/* 15/*
16 * Some lock dependencies: 16 * Some lock dependencies:
@@ -26,7 +26,7 @@
26 * 26 *
27 */ 27 */
28 28
29struct ceph_client; 29struct ceph_fs_client;
30struct ceph_cap; 30struct ceph_cap;
31 31
32/* 32/*
@@ -230,7 +230,7 @@ struct ceph_mds_request {
230 * mds client state 230 * mds client state
231 */ 231 */
232struct ceph_mds_client { 232struct ceph_mds_client {
233 struct ceph_client *client; 233 struct ceph_fs_client *fsc;
234 struct mutex mutex; /* all nested structures */ 234 struct mutex mutex; /* all nested structures */
235 235
236 struct ceph_mdsmap *mdsmap; 236 struct ceph_mdsmap *mdsmap;
@@ -289,11 +289,6 @@ struct ceph_mds_client {
289 int caps_avail_count; /* unused, unreserved */ 289 int caps_avail_count; /* unused, unreserved */
290 int caps_min_count; /* keep at least this many 290 int caps_min_count; /* keep at least this many
291 (unreserved) */ 291 (unreserved) */
292
293#ifdef CONFIG_DEBUG_FS
294 struct dentry *debugfs_file;
295#endif
296
297 spinlock_t dentry_lru_lock; 292 spinlock_t dentry_lru_lock;
298 struct list_head dentry_lru; 293 struct list_head dentry_lru;
299 int num_dentry; 294 int num_dentry;
@@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
316extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, 311extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
317 struct ceph_msg *msg, int mds); 312 struct ceph_msg *msg, int mds);
318 313
319extern int ceph_mdsc_init(struct ceph_mds_client *mdsc, 314extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
320 struct ceph_client *client);
321extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); 315extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
322extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc); 316extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
323 317
324extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); 318extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
325 319
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 040be6d1150b..73b7d44e8a35 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/bug.h> 3#include <linux/bug.h>
4#include <linux/err.h> 4#include <linux/err.h>
@@ -6,9 +6,9 @@
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9#include "mdsmap.h" 9#include <linux/ceph/mdsmap.h>
10#include "messenger.h" 10#include <linux/ceph/messenger.h>
11#include "decode.h" 11#include <linux/ceph/decode.h>
12 12
13#include "super.h" 13#include "super.h"
14 14
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
117 } 117 }
118 118
119 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", 119 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
120 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr), 120 i+1, n, global_id, mds, inc,
121 ceph_pr_addr(&addr.in_addr),
121 ceph_mds_state_name(state)); 122 ceph_mds_state_name(state));
122 if (mds >= 0 && mds < m->m_max_mds && state > 0) { 123 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
123 m->m_info[mds].global_id = global_id; 124 m->m_info[mds].global_id = global_id;
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
deleted file mode 100644
index 4c5cb0880bba..000000000000
--- a/fs/ceph/mdsmap.h
+++ /dev/null
@@ -1,62 +0,0 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 bool laggy;
17 u32 *export_targets;
18};
19
20struct ceph_mdsmap {
21 u32 m_epoch, m_client_epoch, m_last_failure;
22 u32 m_root;
23 u32 m_session_timeout; /* seconds */
24 u32 m_session_autoclose; /* seconds */
25 u64 m_max_file_size;
26 u32 m_max_mds; /* size of m_addr, m_state arrays */
27 struct ceph_mds_info *m_info;
28
29 /* which object pools file data can be stored in */
30 int m_num_data_pg_pools;
31 u32 *m_data_pg_pools;
32 u32 m_cas_pg_pool;
33};
34
35static inline struct ceph_entity_addr *
36ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
37{
38 if (w >= m->m_max_mds)
39 return NULL;
40 return &m->m_info[w].addr;
41}
42
43static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
44{
45 BUG_ON(w < 0);
46 if (w >= m->m_max_mds)
47 return CEPH_MDS_STATE_DNE;
48 return m->m_info[w].state;
49}
50
51static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
52{
53 if (w >= 0 && w < m->m_max_mds)
54 return m->m_info[w].laggy;
55 return false;
56}
57
58extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
59extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
60extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
61
62#endif
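The header is deleted here because, as the mdsmap.c hunk above shows with its new <linux/ceph/mdsmap.h> include, it has moved out of fs/ceph into the shared include tree. Its accessors are deliberately bounds-checked inlines, so a caller can probe an arbitrary rank without range checks of its own; a hypothetical use:

	/* illustrative caller; the accessors do the range checking */
	int state = ceph_mdsmap_get_state(m, rank);	/* CEPH_MDS_STATE_DNE if out of range */

	if (state > 0 && !ceph_mdsmap_is_laggy(m, rank))
		addr = ceph_mdsmap_get_addr(m, rank);	/* NULL if rank >= m_max_mds */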
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
deleted file mode 100644
index 2502d76fcec1..000000000000
--- a/fs/ceph/messenger.c
+++ /dev/null
@@ -1,2277 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
27
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
37
38static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con);
41
42/*
43 * nicely render a sockaddr as a string.
44 */
45#define MAX_ADDR_STR 20
46#define MAX_ADDR_STR_LEN 60
47static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
48static DEFINE_SPINLOCK(addr_str_lock);
49static int last_addr_str;
50
51const char *pr_addr(const struct sockaddr_storage *ss)
52{
53 int i;
54 char *s;
55 struct sockaddr_in *in4 = (void *)ss;
56 struct sockaddr_in6 *in6 = (void *)ss;
57
58 spin_lock(&addr_str_lock);
59 i = last_addr_str++;
60 if (last_addr_str == MAX_ADDR_STR)
61 last_addr_str = 0;
62 spin_unlock(&addr_str_lock);
63 s = addr_str[i];
64
65 switch (ss->ss_family) {
66 case AF_INET:
67 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
68 (unsigned int)ntohs(in4->sin_port));
69 break;
70
71 case AF_INET6:
72 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
73 (unsigned int)ntohs(in6->sin6_port));
74 break;
75
76 default:
77 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
78 }
79
80 return s;
81}
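pr_addr() hands out one of MAX_ADDR_STR (20) static buffers, chosen round-robin under a spinlock, so up to 20 formatted addresses stay valid at once; that is what lets a single printk render two peers (as the "wrong peer" message later in this file does) without allocating. A stripped, runnable user-space version of the same trick (buffer counts are arbitrary and the locking is elided):

	#include <stdio.h>

	#define NBUF	4
	#define BUFLEN	32

	static char bufs[NBUF][BUFLEN];
	static int next_buf;

	/* round-robin static buffers: several results coexist */
	static const char *fmt_num(int n)
	{
		char *s = bufs[next_buf++ % NBUF];

		snprintf(s, BUFLEN, "<%d>", n);
		return s;
	}

	int main(void)
	{
		/* both pointers are distinct and valid in one call */
		printf("%s vs %s\n", fmt_num(1), fmt_num(2));
		return 0;
	}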
82
83static void encode_my_addr(struct ceph_messenger *msgr)
84{
85 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
86 ceph_encode_addr(&msgr->my_enc_addr);
87}
88
89/*
90 * work queue for all reading and writing to/from the socket.
91 */
92struct workqueue_struct *ceph_msgr_wq;
93
94int __init ceph_msgr_init(void)
95{
96 ceph_msgr_wq = create_workqueue("ceph-msgr");
97 if (IS_ERR(ceph_msgr_wq)) {
98 int ret = PTR_ERR(ceph_msgr_wq);
99 pr_err("msgr_init failed to create workqueue: %d\n", ret);
100 ceph_msgr_wq = NULL;
101 return ret;
102 }
103 return 0;
104}
105
106void ceph_msgr_exit(void)
107{
108 destroy_workqueue(ceph_msgr_wq);
109}
110
111void ceph_msgr_flush(void)
112{
113 flush_workqueue(ceph_msgr_wq);
114}
115
116
117/*
118 * socket callback functions
119 */
120
121/* data available on socket, or listen socket received a connect */
122static void ceph_data_ready(struct sock *sk, int count_unused)
123{
124 struct ceph_connection *con =
125 (struct ceph_connection *)sk->sk_user_data;
126 if (sk->sk_state != TCP_CLOSE_WAIT) {
127 dout("ceph_data_ready on %p state = %lu, queueing work\n",
128 con, con->state);
129 queue_con(con);
130 }
131}
132
133/* socket has buffer space for writing */
134static void ceph_write_space(struct sock *sk)
135{
136 struct ceph_connection *con =
137 (struct ceph_connection *)sk->sk_user_data;
138
139 /* only queue to workqueue if there is data we want to write. */
140 if (test_bit(WRITE_PENDING, &con->state)) {
141 dout("ceph_write_space %p queueing write work\n", con);
142 queue_con(con);
143 } else {
144 dout("ceph_write_space %p nothing to write\n", con);
145 }
146
147 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
148 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
149}
150
151/* socket's state has changed */
152static void ceph_state_change(struct sock *sk)
153{
154 struct ceph_connection *con =
155 (struct ceph_connection *)sk->sk_user_data;
156
157 dout("ceph_state_change %p state = %lu sk_state = %u\n",
158 con, con->state, sk->sk_state);
159
160 if (test_bit(CLOSED, &con->state))
161 return;
162
163 switch (sk->sk_state) {
164 case TCP_CLOSE:
165 dout("ceph_state_change TCP_CLOSE\n");
166 case TCP_CLOSE_WAIT:
167 dout("ceph_state_change TCP_CLOSE_WAIT\n");
168 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
169 if (test_bit(CONNECTING, &con->state))
170 con->error_msg = "connection failed";
171 else
172 con->error_msg = "socket closed";
173 queue_con(con);
174 }
175 break;
176 case TCP_ESTABLISHED:
177 dout("ceph_state_change TCP_ESTABLISHED\n");
178 queue_con(con);
179 break;
180 }
181}
182
183/*
184 * set up socket callbacks
185 */
186static void set_sock_callbacks(struct socket *sock,
187 struct ceph_connection *con)
188{
189 struct sock *sk = sock->sk;
190 sk->sk_user_data = (void *)con;
191 sk->sk_data_ready = ceph_data_ready;
192 sk->sk_write_space = ceph_write_space;
193 sk->sk_state_change = ceph_state_change;
194}
195
196
197/*
198 * socket helpers
199 */
200
201/*
202 * initiate connection to a remote socket.
203 */
204static struct socket *ceph_tcp_connect(struct ceph_connection *con)
205{
206 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
207 struct socket *sock;
208 int ret;
209
210 BUG_ON(con->sock);
211 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
212 IPPROTO_TCP, &sock);
213 if (ret)
214 return ERR_PTR(ret);
215 con->sock = sock;
216 sock->sk->sk_allocation = GFP_NOFS;
217
218#ifdef CONFIG_LOCKDEP
219 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
220#endif
221
222 set_sock_callbacks(sock, con);
223
224 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
225
226 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
227 O_NONBLOCK);
228 if (ret == -EINPROGRESS) {
229 dout("connect %s EINPROGRESS sk_state = %u\n",
230 pr_addr(&con->peer_addr.in_addr),
231 sock->sk->sk_state);
232 ret = 0;
233 }
234 if (ret < 0) {
235 pr_err("connect %s error %d\n",
236 pr_addr(&con->peer_addr.in_addr), ret);
237 sock_release(sock);
238 con->sock = NULL;
239 con->error_msg = "connect error";
240 }
241
242 if (ret < 0)
243 return ERR_PTR(ret);
244 return sock;
245}
246
247static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
248{
249 struct kvec iov = {buf, len};
250 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
251
252 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
253}
254
255/*
256 * write something. @more is true if caller will be sending more data
257 * shortly.
258 */
259static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
260 size_t kvlen, size_t len, int more)
261{
262 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
263
264 if (more)
265 msg.msg_flags |= MSG_MORE;
266 else
267 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
268
269 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
270}
271
272
273/*
274 * Shutdown/close the socket for the given connection.
275 */
276static int con_close_socket(struct ceph_connection *con)
277{
278 int rc;
279
280 dout("con_close_socket on %p sock %p\n", con, con->sock);
281 if (!con->sock)
282 return 0;
283 set_bit(SOCK_CLOSED, &con->state);
284 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
285 sock_release(con->sock);
286 con->sock = NULL;
287 clear_bit(SOCK_CLOSED, &con->state);
288 return rc;
289}
290
291/*
292 * Reset a connection. Discard all incoming and outgoing messages
293 * and clear *_seq state.
294 */
295static void ceph_msg_remove(struct ceph_msg *msg)
296{
297 list_del_init(&msg->list_head);
298 ceph_msg_put(msg);
299}
300static void ceph_msg_remove_list(struct list_head *head)
301{
302 while (!list_empty(head)) {
303 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
304 list_head);
305 ceph_msg_remove(msg);
306 }
307}
308
309static void reset_connection(struct ceph_connection *con)
310{
311 /* reset connection, out_queue, msg_ and connect_seq */
312 /* discard existing out_queue and msg_seq */
313 ceph_msg_remove_list(&con->out_queue);
314 ceph_msg_remove_list(&con->out_sent);
315
316 if (con->in_msg) {
317 ceph_msg_put(con->in_msg);
318 con->in_msg = NULL;
319 }
320
321 con->connect_seq = 0;
322 con->out_seq = 0;
323 if (con->out_msg) {
324 ceph_msg_put(con->out_msg);
325 con->out_msg = NULL;
326 }
327 con->out_keepalive_pending = false;
328 con->in_seq = 0;
329 con->in_seq_acked = 0;
330}
331
332/*
333 * mark a peer down. drop any open connections.
334 */
335void ceph_con_close(struct ceph_connection *con)
336{
337 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
338 set_bit(CLOSED, &con->state); /* in case there's queued work */
339 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
340 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
341 clear_bit(KEEPALIVE_PENDING, &con->state);
342 clear_bit(WRITE_PENDING, &con->state);
343 mutex_lock(&con->mutex);
344 reset_connection(con);
345 con->peer_global_seq = 0;
346 cancel_delayed_work(&con->work);
347 mutex_unlock(&con->mutex);
348 queue_con(con);
349}
350
351/*
352 * Reopen a closed connection, with a new peer address.
353 */
354void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
355{
356 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
357 set_bit(OPENING, &con->state);
358 clear_bit(CLOSED, &con->state);
359 memcpy(&con->peer_addr, addr, sizeof(*addr));
360 con->delay = 0; /* reset backoff memory */
361 queue_con(con);
362}
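Taken together, ceph_con_close() and ceph_con_open() form the reconnect primitive the rest of the messenger builds on: close marks the connection CLOSED and discards queued state, open clears CLOSED, installs the (possibly new) peer address and resets the backoff, and both queue work so con_work() acts on the transition. A hedged sketch of a caller moving a session to a relocated peer (where new_addr comes from, e.g. a new map epoch, is hypothetical):

	ceph_con_close(&s->s_con);		/* drop old socket + state */
	ceph_con_open(&s->s_con, &new_addr);	/* reconnect, backoff reset */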
363
364/*
365 * return true if this connection ever successfully opened
366 */
367bool ceph_con_opened(struct ceph_connection *con)
368{
369 return con->connect_seq > 0;
370}
371
372/*
373 * generic get/put
374 */
375struct ceph_connection *ceph_con_get(struct ceph_connection *con)
376{
377 dout("con_get %p nref = %d -> %d\n", con,
378 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
379 if (atomic_inc_not_zero(&con->nref))
380 return con;
381 return NULL;
382}
383
384void ceph_con_put(struct ceph_connection *con)
385{
386 dout("con_put %p nref = %d -> %d\n", con,
387 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
388 BUG_ON(atomic_read(&con->nref) == 0);
389 if (atomic_dec_and_test(&con->nref)) {
390 BUG_ON(con->sock);
391 kfree(con);
392 }
393}
394
395/*
396 * initialize a new connection.
397 */
398void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
399{
400 dout("con_init %p\n", con);
401 memset(con, 0, sizeof(*con));
402 atomic_set(&con->nref, 1);
403 con->msgr = msgr;
404 mutex_init(&con->mutex);
405 INIT_LIST_HEAD(&con->out_queue);
406 INIT_LIST_HEAD(&con->out_sent);
407 INIT_DELAYED_WORK(&con->work, con_work);
408}
409
410
411/*
412 * We maintain a global counter to order connection attempts. Get
413 * a unique seq greater than @gt.
414 */
415static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
416{
417 u32 ret;
418
419 spin_lock(&msgr->global_seq_lock);
420 if (msgr->global_seq < gt)
421 msgr->global_seq = gt;
422 ret = ++msgr->global_seq;
423 spin_unlock(&msgr->global_seq_lock);
424 return ret;
425}
426
427
428/*
429 * Prepare footer for currently outgoing message, and finish things
430 * off. Assumes out_kvec* are already valid; we just add on to the end.
431 */
432static void prepare_write_message_footer(struct ceph_connection *con, int v)
433{
434 struct ceph_msg *m = con->out_msg;
435
436 dout("prepare_write_message_footer %p\n", con);
437 con->out_kvec_is_msg = true;
438 con->out_kvec[v].iov_base = &m->footer;
439 con->out_kvec[v].iov_len = sizeof(m->footer);
440 con->out_kvec_bytes += sizeof(m->footer);
441 con->out_kvec_left++;
442 con->out_more = m->more_to_follow;
443 con->out_msg_done = true;
444}
445
446/*
447 * Prepare headers for the next outgoing message.
448 */
449static void prepare_write_message(struct ceph_connection *con)
450{
451 struct ceph_msg *m;
452 int v = 0;
453
454 con->out_kvec_bytes = 0;
455 con->out_kvec_is_msg = true;
456 con->out_msg_done = false;
457
458 /* Sneak an ack in there first? If we can get it into the same
459 * TCP packet, that's a good thing. */
460 if (con->in_seq > con->in_seq_acked) {
461 con->in_seq_acked = con->in_seq;
462 con->out_kvec[v].iov_base = &tag_ack;
463 con->out_kvec[v++].iov_len = 1;
464 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
465 con->out_kvec[v].iov_base = &con->out_temp_ack;
466 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
467 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
468 }
469
470 m = list_first_entry(&con->out_queue,
471 struct ceph_msg, list_head);
472 con->out_msg = m;
473 if (test_bit(LOSSYTX, &con->state)) {
474 list_del_init(&m->list_head);
475 } else {
476 /* put message on sent list */
477 ceph_msg_get(m);
478 list_move_tail(&m->list_head, &con->out_sent);
479 }
480
481 /*
482 * only assign outgoing seq # if we haven't sent this message
483 * yet. if it is requeued, resend with its original seq.
484 */
485 if (m->needs_out_seq) {
486 m->hdr.seq = cpu_to_le64(++con->out_seq);
487 m->needs_out_seq = false;
488 }
489
490 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
491 m, con->out_seq, le16_to_cpu(m->hdr.type),
492 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
493 le32_to_cpu(m->hdr.data_len),
494 m->nr_pages);
495 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
496
497 /* tag + hdr + front + middle */
498 con->out_kvec[v].iov_base = &tag_msg;
499 con->out_kvec[v++].iov_len = 1;
500 con->out_kvec[v].iov_base = &m->hdr;
501 con->out_kvec[v++].iov_len = sizeof(m->hdr);
502 con->out_kvec[v++] = m->front;
503 if (m->middle)
504 con->out_kvec[v++] = m->middle->vec;
505 con->out_kvec_left = v;
506 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
507 (m->middle ? m->middle->vec.iov_len : 0);
508 con->out_kvec_cur = con->out_kvec;
509
510 /* fill in crc (except data pages), footer */
511 con->out_msg->hdr.crc =
512 cpu_to_le32(crc32c(0, (void *)&m->hdr,
513 sizeof(m->hdr) - sizeof(m->hdr.crc)));
514 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
515 con->out_msg->footer.front_crc =
516 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
517 if (m->middle)
518 con->out_msg->footer.middle_crc =
519 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
520 m->middle->vec.iov_len));
521 else
522 con->out_msg->footer.middle_crc = 0;
523 con->out_msg->footer.data_crc = 0;
524 dout("prepare_write_message front_crc %u data_crc %u\n",
525 le32_to_cpu(con->out_msg->footer.front_crc),
526 le32_to_cpu(con->out_msg->footer.middle_crc));
527
528 /* is there a data payload? */
529 if (le32_to_cpu(m->hdr.data_len) > 0) {
530 /* initialize page iterator */
531 con->out_msg_pos.page = 0;
532 con->out_msg_pos.page_pos =
533 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
534 con->out_msg_pos.data_pos = 0;
535 con->out_msg_pos.did_page_crc = 0;
536 con->out_more = 1; /* data + footer will follow */
537 } else {
538 /* no, queue up footer too and be done */
539 prepare_write_message_footer(con, v);
540 }
541
542 set_bit(WRITE_PENDING, &con->state);
543}
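prepare_write_message() is where the wire shape of an outgoing frame gets fixed; everything except the data pages and the footer is CRC'd up front. Schematically (a reader's summary of the code above, not a normative spec):

	/*
	 * [TAG_ACK + ack seq]		optional, piggybacked if in_seq advanced
	 * [TAG_MSG][ceph_msg_header]	hdr.crc covers the header minus itself
	 * [front][middle]		front_crc/middle_crc stored in the footer
	 * [data pages ...]		data_crc computed page by page during send
	 * [ceph_msg_footer]
	 */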
544
545/*
546 * Prepare an ack.
547 */
548static void prepare_write_ack(struct ceph_connection *con)
549{
550 dout("prepare_write_ack %p %llu -> %llu\n", con,
551 con->in_seq_acked, con->in_seq);
552 con->in_seq_acked = con->in_seq;
553
554 con->out_kvec[0].iov_base = &tag_ack;
555 con->out_kvec[0].iov_len = 1;
556 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
557 con->out_kvec[1].iov_base = &con->out_temp_ack;
558 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
559 con->out_kvec_left = 2;
560 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
561 con->out_kvec_cur = con->out_kvec;
562 con->out_more = 1; /* more will follow.. eventually.. */
563 set_bit(WRITE_PENDING, &con->state);
564}
565
566/*
567 * Prepare to write keepalive byte.
568 */
569static void prepare_write_keepalive(struct ceph_connection *con)
570{
571 dout("prepare_write_keepalive %p\n", con);
572 con->out_kvec[0].iov_base = &tag_keepalive;
573 con->out_kvec[0].iov_len = 1;
574 con->out_kvec_left = 1;
575 con->out_kvec_bytes = 1;
576 con->out_kvec_cur = con->out_kvec;
577 set_bit(WRITE_PENDING, &con->state);
578}
579
580/*
581 * Connection negotiation.
582 */
583
584static void prepare_connect_authorizer(struct ceph_connection *con)
585{
586 void *auth_buf;
587 int auth_len = 0;
588 int auth_protocol = 0;
589
590 mutex_unlock(&con->mutex);
591 if (con->ops->get_authorizer)
592 con->ops->get_authorizer(con, &auth_buf, &auth_len,
593 &auth_protocol, &con->auth_reply_buf,
594 &con->auth_reply_buf_len,
595 con->auth_retry);
596 mutex_lock(&con->mutex);
597
598 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
599 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
600
601 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
602 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
603 con->out_kvec_left++;
604 con->out_kvec_bytes += auth_len;
605}
606
607/*
608 * We connected to a peer and are saying hello.
609 */
610static void prepare_write_banner(struct ceph_messenger *msgr,
611 struct ceph_connection *con)
612{
613 int len = strlen(CEPH_BANNER);
614
615 con->out_kvec[0].iov_base = CEPH_BANNER;
616 con->out_kvec[0].iov_len = len;
617 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
618 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
619 con->out_kvec_left = 2;
620 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
621 con->out_kvec_cur = con->out_kvec;
622 con->out_more = 0;
623 set_bit(WRITE_PENDING, &con->state);
624}
625
626static void prepare_write_connect(struct ceph_messenger *msgr,
627 struct ceph_connection *con,
628 int after_banner)
629{
630 unsigned global_seq = get_global_seq(con->msgr, 0);
631 int proto;
632
633 switch (con->peer_name.type) {
634 case CEPH_ENTITY_TYPE_MON:
635 proto = CEPH_MONC_PROTOCOL;
636 break;
637 case CEPH_ENTITY_TYPE_OSD:
638 proto = CEPH_OSDC_PROTOCOL;
639 break;
640 case CEPH_ENTITY_TYPE_MDS:
641 proto = CEPH_MDSC_PROTOCOL;
642 break;
643 default:
644 BUG();
645 }
646
647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
648 con->connect_seq, global_seq, proto);
649
650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
653 con->out_connect.global_seq = cpu_to_le32(global_seq);
654 con->out_connect.protocol_version = cpu_to_le32(proto);
655 con->out_connect.flags = 0;
656
657 if (!after_banner) {
658 con->out_kvec_left = 0;
659 con->out_kvec_bytes = 0;
660 }
661 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
662 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
663 con->out_kvec_left++;
664 con->out_kvec_bytes += sizeof(con->out_connect);
665 con->out_kvec_cur = con->out_kvec;
666 con->out_more = 0;
667 set_bit(WRITE_PENDING, &con->state);
668
669 prepare_connect_authorizer(con);
670}
671
672
673/*
674 * write as much of pending kvecs to the socket as we can.
675 * 1 -> done
676 * 0 -> socket full, but more to do
677 * <0 -> error
678 */
679static int write_partial_kvec(struct ceph_connection *con)
680{
681 int ret;
682
683 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
684 while (con->out_kvec_bytes > 0) {
685 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
686 con->out_kvec_left, con->out_kvec_bytes,
687 con->out_more);
688 if (ret <= 0)
689 goto out;
690 con->out_kvec_bytes -= ret;
691 if (con->out_kvec_bytes == 0)
692 break; /* done */
693 while (ret > 0) {
694 if (ret >= con->out_kvec_cur->iov_len) {
695 ret -= con->out_kvec_cur->iov_len;
696 con->out_kvec_cur++;
697 con->out_kvec_left--;
698 } else {
699 con->out_kvec_cur->iov_len -= ret;
700 con->out_kvec_cur->iov_base += ret;
701 ret = 0;
702 break;
703 }
704 }
705 }
706 con->out_kvec_left = 0;
707 con->out_kvec_is_msg = false;
708 ret = 1;
709out:
710 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
711 con->out_kvec_bytes, con->out_kvec_left, ret);
712 return ret; /* done! */
713}
714
715/*
716 * Write as much message data payload as we can. If we finish, queue
717 * up the footer.
718 * 1 -> done, footer is now queued in out_kvec[].
719 * 0 -> socket full, but more to do
720 * <0 -> error
721 */
722static int write_partial_msg_pages(struct ceph_connection *con)
723{
724 struct ceph_msg *msg = con->out_msg;
725 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
726 size_t len;
727 int crc = con->msgr->nocrc;
728 int ret;
729
730 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
731 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
732 con->out_msg_pos.page_pos);
733
734 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
735 struct page *page = NULL;
736 void *kaddr = NULL;
737
738 /*
739 * if we are calculating the data crc (the default), we need
740 * to map the page. if our pages[] has been revoked, use the
741 * zero page.
742 */
743 if (msg->pages) {
744 page = msg->pages[con->out_msg_pos.page];
745 if (crc)
746 kaddr = kmap(page);
747 } else if (msg->pagelist) {
748 page = list_first_entry(&msg->pagelist->head,
749 struct page, lru);
750 if (crc)
751 kaddr = kmap(page);
752 } else {
753 page = con->msgr->zero_page;
754 if (crc)
755 kaddr = page_address(con->msgr->zero_page);
756 }
757 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
758 (int)(data_len - con->out_msg_pos.data_pos));
759 if (crc && !con->out_msg_pos.did_page_crc) {
760 void *base = kaddr + con->out_msg_pos.page_pos;
761 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
762
763 BUG_ON(kaddr == NULL);
764 con->out_msg->footer.data_crc =
765 cpu_to_le32(crc32c(tmpcrc, base, len));
766 con->out_msg_pos.did_page_crc = 1;
767 }
768
769 ret = kernel_sendpage(con->sock, page,
770 con->out_msg_pos.page_pos, len,
771 MSG_DONTWAIT | MSG_NOSIGNAL |
772 MSG_MORE);
773
774 if (crc && (msg->pages || msg->pagelist))
775 kunmap(page);
776
777 if (ret <= 0)
778 goto out;
779
780 con->out_msg_pos.data_pos += ret;
781 con->out_msg_pos.page_pos += ret;
782 if (ret == len) {
783 con->out_msg_pos.page_pos = 0;
784 con->out_msg_pos.page++;
785 con->out_msg_pos.did_page_crc = 0;
786 if (msg->pagelist)
787 list_move_tail(&page->lru,
788 &msg->pagelist->head);
789 }
790 }
791
792 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
793
794 /* prepare and queue up footer, too */
795 if (!crc)
796 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
797 con->out_kvec_bytes = 0;
798 con->out_kvec_left = 0;
799 con->out_kvec_cur = con->out_kvec;
800 prepare_write_message_footer(con, 0);
801 ret = 1;
802out:
803 return ret;
804}
805
806/*
807 * write some zeros
808 */
809static int write_partial_skip(struct ceph_connection *con)
810{
811 int ret;
812
813 while (con->out_skip > 0) {
814 struct kvec iov = {
815 .iov_base = page_address(con->msgr->zero_page),
816 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
817 };
818
819 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
820 if (ret <= 0)
821 goto out;
822 con->out_skip -= ret;
823 }
824 ret = 1;
825out:
826 return ret;
827}
828
829/*
830 * Prepare to read connection handshake, or an ack.
831 */
832static void prepare_read_banner(struct ceph_connection *con)
833{
834 dout("prepare_read_banner %p\n", con);
835 con->in_base_pos = 0;
836}
837
838static void prepare_read_connect(struct ceph_connection *con)
839{
840 dout("prepare_read_connect %p\n", con);
841 con->in_base_pos = 0;
842}
843
844static void prepare_read_ack(struct ceph_connection *con)
845{
846 dout("prepare_read_ack %p\n", con);
847 con->in_base_pos = 0;
848}
849
850static void prepare_read_tag(struct ceph_connection *con)
851{
852 dout("prepare_read_tag %p\n", con);
853 con->in_base_pos = 0;
854 con->in_tag = CEPH_MSGR_TAG_READY;
855}
856
857/*
858 * Prepare to read a message.
859 */
860static int prepare_read_message(struct ceph_connection *con)
861{
862 dout("prepare_read_message %p\n", con);
863 BUG_ON(con->in_msg != NULL);
864 con->in_base_pos = 0;
865 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
866 return 0;
867}
868
869
870static int read_partial(struct ceph_connection *con,
871 int *to, int size, void *object)
872{
873 *to += size;
874 while (con->in_base_pos < *to) {
875 int left = *to - con->in_base_pos;
876 int have = size - left;
877 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
878 if (ret <= 0)
879 return ret;
880 con->in_base_pos += ret;
881 }
882 return 1;
883}
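read_partial() is the messenger's resumable-read primitive: *to accumulates a running target offset while con->in_base_pos records progress toward it, so a short read simply returns and the next call resumes where it left off, with the have/left arithmetic locating the incomplete object. A runnable user-space sketch of the same shape, using a fake socket that only delivers 3 bytes per call:

	#include <stdio.h>
	#include <string.h>

	static const char wire[] = "HELLOWORLD";
	static int wire_pos;

	/* fake recv: at most 3 bytes per call, i.e. short reads */
	static int fake_recv(void *buf, int len)
	{
		int n = len < 3 ? len : 3;

		if (wire_pos + n > (int)sizeof(wire) - 1)
			n = (int)sizeof(wire) - 1 - wire_pos;
		memcpy(buf, wire + wire_pos, n);
		wire_pos += n;
		return n;
	}

	/* same shape as read_partial() above */
	static int read_partial(int *base_pos, int *to, int size, void *object)
	{
		*to += size;
		while (*base_pos < *to) {
			int left = *to - *base_pos;
			int have = size - left;
			int ret = fake_recv((char *)object + have, left);

			if (ret <= 0)
				return ret;
			*base_pos += ret;
		}
		return 1;
	}

	int main(void)
	{
		char a[6] = "", b[6] = "";
		int base_pos = 0, to = 0;

		read_partial(&base_pos, &to, 5, a);	/* "HELLO" */
		read_partial(&base_pos, &to, 5, b);	/* "WORLD" */
		printf("%s %s\n", a, b);
		return 0;
	}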
884
885
886/*
887 * Read all or part of the connect-side handshake on a new connection
888 */
889static int read_partial_banner(struct ceph_connection *con)
890{
891 int ret, to = 0;
892
893 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
894
895 /* peer's banner */
896 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
897 if (ret <= 0)
898 goto out;
899 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
900 &con->actual_peer_addr);
901 if (ret <= 0)
902 goto out;
903 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
904 &con->peer_addr_for_me);
905 if (ret <= 0)
906 goto out;
907out:
908 return ret;
909}
910
911static int read_partial_connect(struct ceph_connection *con)
912{
913 int ret, to = 0;
914
915 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
916
917 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
918 if (ret <= 0)
919 goto out;
920 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
921 con->auth_reply_buf);
922 if (ret <= 0)
923 goto out;
924
925 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
926 con, (int)con->in_reply.tag,
927 le32_to_cpu(con->in_reply.connect_seq),
928 le32_to_cpu(con->in_reply.global_seq));
929out:
930 return ret;
931
932}
933
934/*
935 * Verify the hello banner looks okay.
936 */
937static int verify_hello(struct ceph_connection *con)
938{
939 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
940 pr_err("connect to %s got bad banner\n",
941 pr_addr(&con->peer_addr.in_addr));
942 con->error_msg = "protocol error, bad banner";
943 return -1;
944 }
945 return 0;
946}
947
948static bool addr_is_blank(struct sockaddr_storage *ss)
949{
950 switch (ss->ss_family) {
951 case AF_INET:
952 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
953 case AF_INET6:
954 return
955 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
956 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
957 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
958 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
959 }
960 return false;
961}
962
963static int addr_port(struct sockaddr_storage *ss)
964{
965 switch (ss->ss_family) {
966 case AF_INET:
967 return ntohs(((struct sockaddr_in *)ss)->sin_port);
968 case AF_INET6:
969 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
970 }
971 return 0;
972}
973
974static void addr_set_port(struct sockaddr_storage *ss, int p)
975{
976 switch (ss->ss_family) {
977 case AF_INET:
978 ((struct sockaddr_in *)ss)->sin_port = htons(p);
979 case AF_INET6:
980 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
981 }
982}
983
984/*
985 * Parse an ip[:port] list into an addr array. Use the default
986 * monitor port if a port isn't specified.
987 */
988int ceph_parse_ips(const char *c, const char *end,
989 struct ceph_entity_addr *addr,
990 int max_count, int *count)
991{
992 int i;
993 const char *p = c;
994
995 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
996 for (i = 0; i < max_count; i++) {
997 const char *ipend;
998 struct sockaddr_storage *ss = &addr[i].in_addr;
999 struct sockaddr_in *in4 = (void *)ss;
1000 struct sockaddr_in6 *in6 = (void *)ss;
1001 int port;
1002 char delim = ',';
1003
1004 if (*p == '[') {
1005 delim = ']';
1006 p++;
1007 }
1008
1009 memset(ss, 0, sizeof(*ss));
1010 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1011 delim, &ipend))
1012 ss->ss_family = AF_INET;
1013 else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1014 delim, &ipend))
1015 ss->ss_family = AF_INET6;
1016 else
1017 goto bad;
1018 p = ipend;
1019
1020 if (delim == ']') {
1021 if (*p != ']') {
1022 dout("missing matching ']'\n");
1023 goto bad;
1024 }
1025 p++;
1026 }
1027
1028 /* port? */
1029 if (p < end && *p == ':') {
1030 port = 0;
1031 p++;
1032 while (p < end && *p >= '0' && *p <= '9') {
1033 port = (port * 10) + (*p - '0');
1034 p++;
1035 }
1036 if (port > 65535 || port == 0)
1037 goto bad;
1038 } else {
1039 port = CEPH_MON_PORT;
1040 }
1041
1042 addr_set_port(ss, port);
1043
1044 dout("parse_ips got %s\n", pr_addr(ss));
1045
1046 if (p == end)
1047 break;
1048 if (*p != ',')
1049 goto bad;
1050 p++;
1051 }
1052
1053 if (p != end)
1054 goto bad;
1055
1056 if (count)
1057 *count = i + 1;
1058 return 0;
1059
1060bad:
1061 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1062 return -EINVAL;
1063}
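The loop accepts a comma-separated list in which each element is an IPv4 address or a bracketed IPv6 address, each with an optional :port that defaults to CEPH_MON_PORT. Some inputs it accepts (values illustrative):

	/* examples accepted by ceph_parse_ips() */
	const char *mon_lists[] = {
		"192.168.0.1",			/* port defaults to CEPH_MON_PORT */
		"192.168.0.1:6789,192.168.0.2:6789",
		"[fe80::1]:6789,192.168.0.1",	/* brackets needed for IPv6 + port */
	};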
1064
1065static int process_banner(struct ceph_connection *con)
1066{
1067 dout("process_banner on %p\n", con);
1068
1069 if (verify_hello(con) < 0)
1070 return -1;
1071
1072 ceph_decode_addr(&con->actual_peer_addr);
1073 ceph_decode_addr(&con->peer_addr_for_me);
1074
1075 /*
1076 * Make sure the other end is who we wanted. note that the other
1077 * end may not yet know their ip address, so if it's 0.0.0.0, give
1078 * them the benefit of the doubt.
1079 */
1080 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1081 sizeof(con->peer_addr)) != 0 &&
1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1084 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1085 pr_addr(&con->peer_addr.in_addr),
1086 (int)le32_to_cpu(con->peer_addr.nonce),
1087 pr_addr(&con->actual_peer_addr.in_addr),
1088 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1089 con->error_msg = "wrong peer at address";
1090 return -1;
1091 }
1092
1093 /*
1094 * did we learn our address?
1095 */
1096 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1097 int port = addr_port(&con->msgr->inst.addr.in_addr);
1098
1099 memcpy(&con->msgr->inst.addr.in_addr,
1100 &con->peer_addr_for_me.in_addr,
1101 sizeof(con->peer_addr_for_me.in_addr));
1102 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1103 encode_my_addr(con->msgr);
1104 dout("process_banner learned my addr is %s\n",
1105 pr_addr(&con->msgr->inst.addr.in_addr));
1106 }
1107
1108 set_bit(NEGOTIATING, &con->state);
1109 prepare_read_connect(con);
1110 return 0;
1111}
1112
1113static void fail_protocol(struct ceph_connection *con)
1114{
1115 reset_connection(con);
1116 set_bit(CLOSED, &con->state); /* in case there's queued work */
1117
1118 mutex_unlock(&con->mutex);
1119 if (con->ops->bad_proto)
1120 con->ops->bad_proto(con);
1121 mutex_lock(&con->mutex);
1122}
1123
1124static int process_connect(struct ceph_connection *con)
1125{
1126 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1127 u64 req_feat = CEPH_FEATURE_REQUIRED;
1128 u64 server_feat = le64_to_cpu(con->in_reply.features);
1129
1130 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1131
1132 switch (con->in_reply.tag) {
1133 case CEPH_MSGR_TAG_FEATURES:
1134 pr_err("%s%lld %s feature set mismatch,"
1135 " my %llx < server's %llx, missing %llx\n",
1136 ENTITY_NAME(con->peer_name),
1137 pr_addr(&con->peer_addr.in_addr),
1138 sup_feat, server_feat, server_feat & ~sup_feat);
1139 con->error_msg = "missing required protocol features";
1140 fail_protocol(con);
1141 return -1;
1142
1143 case CEPH_MSGR_TAG_BADPROTOVER:
1144 pr_err("%s%lld %s protocol version mismatch,"
1145 " my %d != server's %d\n",
1146 ENTITY_NAME(con->peer_name),
1147 pr_addr(&con->peer_addr.in_addr),
1148 le32_to_cpu(con->out_connect.protocol_version),
1149 le32_to_cpu(con->in_reply.protocol_version));
1150 con->error_msg = "protocol version mismatch";
1151 fail_protocol(con);
1152 return -1;
1153
1154 case CEPH_MSGR_TAG_BADAUTHORIZER:
1155 con->auth_retry++;
1156 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1157 con->auth_retry);
1158 if (con->auth_retry == 2) {
1159 con->error_msg = "connect authorization failure";
1160 reset_connection(con);
1161 set_bit(CLOSED, &con->state);
1162 return -1;
1163 }
1164 con->auth_retry = 1;
1165 prepare_write_connect(con->msgr, con, 0);
1166 prepare_read_connect(con);
1167 break;
1168
1169 case CEPH_MSGR_TAG_RESETSESSION:
1170 /*
1171 * If we connected with a large connect_seq but the peer
1172 * has no record of a session with us (no connection, or
1173 * connect_seq == 0), they will send RESETSESSION to indicate
1174 * that they must have reset their session, and may have
1175 * dropped messages.
1176 */
1177 dout("process_connect got RESET peer seq %u\n",
1178 le32_to_cpu(con->in_connect.connect_seq));
1179 pr_err("%s%lld %s connection reset\n",
1180 ENTITY_NAME(con->peer_name),
1181 pr_addr(&con->peer_addr.in_addr));
1182 reset_connection(con);
1183 prepare_write_connect(con->msgr, con, 0);
1184 prepare_read_connect(con);
1185
1186 /* Tell ceph about it. */
1187 mutex_unlock(&con->mutex);
1188 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1189 if (con->ops->peer_reset)
1190 con->ops->peer_reset(con);
1191 mutex_lock(&con->mutex);
1192 break;
1193
1194 case CEPH_MSGR_TAG_RETRY_SESSION:
1195 /*
1196 * If we sent a smaller connect_seq than the peer has, try
1197 * again with a larger value.
1198 */
1199 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1200 le32_to_cpu(con->out_connect.connect_seq),
1201 le32_to_cpu(con->in_connect.connect_seq));
1202 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1203 prepare_write_connect(con->msgr, con, 0);
1204 prepare_read_connect(con);
1205 break;
1206
1207 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1208 /*
1209 * If we sent a smaller global_seq than the peer has, try
1210 * again with a larger value.
1211 */
1212 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1213 con->peer_global_seq,
1214 le32_to_cpu(con->in_connect.global_seq));
1215 get_global_seq(con->msgr,
1216 le32_to_cpu(con->in_connect.global_seq));
1217 prepare_write_connect(con->msgr, con, 0);
1218 prepare_read_connect(con);
1219 break;
1220
1221 case CEPH_MSGR_TAG_READY:
1222 if (req_feat & ~server_feat) {
1223 pr_err("%s%lld %s protocol feature mismatch,"
1224 " my required %llx > server's %llx, need %llx\n",
1225 ENTITY_NAME(con->peer_name),
1226 pr_addr(&con->peer_addr.in_addr),
1227 req_feat, server_feat, req_feat & ~server_feat);
1228 con->error_msg = "missing required protocol features";
1229 fail_protocol(con);
1230 return -1;
1231 }
1232 clear_bit(CONNECTING, &con->state);
1233 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1234 con->connect_seq++;
1235 con->peer_features = server_feat;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq),
1239 con->connect_seq);
1240 WARN_ON(con->connect_seq !=
1241 le32_to_cpu(con->in_reply.connect_seq));
1242
1243 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1244 set_bit(LOSSYTX, &con->state);
1245
1246 prepare_read_tag(con);
1247 break;
1248
1249 case CEPH_MSGR_TAG_WAIT:
1250 /*
1251 * If there is a connection race (we are opening
1252 * connections to each other), one of us may just have
1253 * to WAIT. This shouldn't happen if we are the
1254 * client.
1255 */
1256 pr_err("process_connect peer connecting WAIT\n");
1257
1258 default:
1259 pr_err("connect protocol error, will retry\n");
1260 con->error_msg = "protocol error, garbage tag during connect";
1261 return -1;
1262 }
1263 return 0;
1264}
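Compressed, the tag handling above amounts to the following (a reader's summary, not new behaviour):

	/*
	 * FEATURES / BADPROTOVER	-> fail_protocol(), give up
	 * BADAUTHORIZER		-> one retry with a fresh authorizer
	 * RESETSESSION			-> reset_connection(), notify peer_reset
	 * RETRY_SESSION / _GLOBAL	-> bump connect_seq / global_seq, resend
	 * READY			-> clear CONNECTING, connection is up
	 * WAIT / unknown		-> treated as an error; caller retries
	 */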
1265
1266
1267/*
1268 * read (part of) an ack
1269 */
1270static int read_partial_ack(struct ceph_connection *con)
1271{
1272 int to = 0;
1273
1274 return read_partial(con, &to, sizeof(con->in_temp_ack),
1275 &con->in_temp_ack);
1276}
1277
1278
1279/*
1280 * We can finally discard anything that's been acked.
1281 */
1282static void process_ack(struct ceph_connection *con)
1283{
1284 struct ceph_msg *m;
1285 u64 ack = le64_to_cpu(con->in_temp_ack);
1286 u64 seq;
1287
1288 while (!list_empty(&con->out_sent)) {
1289 m = list_first_entry(&con->out_sent, struct ceph_msg,
1290 list_head);
1291 seq = le64_to_cpu(m->hdr.seq);
1292 if (seq > ack)
1293 break;
1294 dout("got ack for seq %llu type %d at %p\n", seq,
1295 le16_to_cpu(m->hdr.type), m);
1296 ceph_msg_remove(m);
1297 }
1298 prepare_read_tag(con);
1299}
1300
1301
1302
1303
1304static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section,
1306 unsigned int sec_len, u32 *crc)
1307{
1308 int left;
1309 int ret;
1310
1311 BUG_ON(!section);
1312
1313 while (section->iov_len < sec_len) {
1314 BUG_ON(section->iov_base == NULL);
1315 left = sec_len - section->iov_len;
1316 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1317 section->iov_len, left);
1318 if (ret <= 0)
1319 return ret;
1320 section->iov_len += ret;
1321 if (section->iov_len == sec_len)
1322 *crc = crc32c(0, section->iov_base,
1323 section->iov_len);
1324 }
1325
1326 return 1;
1327}
1328
1329static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1330 struct ceph_msg_header *hdr,
1331 int *skip);
1332/*
1333 * read (part of) a message.
1334 */
1335static int read_partial_message(struct ceph_connection *con)
1336{
1337 struct ceph_msg *m = con->in_msg;
1338 void *p;
1339 int ret;
1340 int to, left;
1341 unsigned front_len, middle_len, data_len, data_off;
1342 int datacrc = con->msgr->nocrc;
1343 int skip;
1344 u64 seq;
1345
1346 dout("read_partial_message con %p msg %p\n", con, m);
1347
1348 /* header */
1349 while (con->in_base_pos < sizeof(con->in_hdr)) {
1350 left = sizeof(con->in_hdr) - con->in_base_pos;
1351 ret = ceph_tcp_recvmsg(con->sock,
1352 (char *)&con->in_hdr + con->in_base_pos,
1353 left);
1354 if (ret <= 0)
1355 return ret;
1356 con->in_base_pos += ret;
1357 if (con->in_base_pos == sizeof(con->in_hdr)) {
1358 u32 crc = crc32c(0, (void *)&con->in_hdr,
1359 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1360 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1361 pr_err("read_partial_message bad hdr "
1362 " crc %u != expected %u\n",
1363 crc, con->in_hdr.crc);
1364 return -EBADMSG;
1365 }
1366 }
1367 }
1368 front_len = le32_to_cpu(con->in_hdr.front_len);
1369 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1370 return -EIO;
1371 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1372 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1373 return -EIO;
1374 data_len = le32_to_cpu(con->in_hdr.data_len);
1375 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1376 return -EIO;
1377 data_off = le16_to_cpu(con->in_hdr.data_off);
1378
1379 /* verify seq# */
1380 seq = le64_to_cpu(con->in_hdr.seq);
1381 if ((s64)seq - (s64)con->in_seq < 1) {
1382 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1383 ENTITY_NAME(con->peer_name),
1384 pr_addr(&con->peer_addr.in_addr),
1385 seq, con->in_seq + 1);
1386 con->in_base_pos = -front_len - middle_len - data_len -
1387 sizeof(m->footer);
1388 con->in_tag = CEPH_MSGR_TAG_READY;
1389 con->in_seq++;
1390 return 0;
1391 } else if ((s64)seq - (s64)con->in_seq > 1) {
1392 pr_err("read_partial_message bad seq %lld expected %lld\n",
1393 seq, con->in_seq + 1);
1394 con->error_msg = "bad message sequence # for incoming message";
1395 return -EBADMSG;
1396 }
1397
1398 /* allocate message? */
1399 if (!con->in_msg) {
1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1401 con->in_hdr.front_len, con->in_hdr.data_len);
1402 skip = 0;
1403 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1404 if (skip) {
1405 /* skip this message */
1406 dout("alloc_msg said skip message\n");
1407 BUG_ON(con->in_msg);
1408 con->in_base_pos = -front_len - middle_len - data_len -
1409 sizeof(m->footer);
1410 con->in_tag = CEPH_MSGR_TAG_READY;
1411 con->in_seq++;
1412 return 0;
1413 }
1414 if (!con->in_msg) {
1415 con->error_msg =
1416 "error allocating memory for incoming message";
1417 return -ENOMEM;
1418 }
1419 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */
1421 if (m->middle)
1422 m->middle->vec.iov_len = 0;
1423
1424 con->in_msg_pos.page = 0;
1425 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1426 con->in_msg_pos.data_pos = 0;
1427 }
1428
1429 /* front */
1430 ret = read_partial_message_section(con, &m->front, front_len,
1431 &con->in_front_crc);
1432 if (ret <= 0)
1433 return ret;
1434
1435 /* middle */
1436 if (m->middle) {
1437 ret = read_partial_message_section(con, &m->middle->vec,
1438 middle_len,
1439 &con->in_middle_crc);
1440 if (ret <= 0)
1441 return ret;
1442 }
1443
1444 /* (page) data */
1445 while (con->in_msg_pos.data_pos < data_len) {
1446 left = min((int)(data_len - con->in_msg_pos.data_pos),
1447 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1448 BUG_ON(m->pages == NULL);
1449 p = kmap(m->pages[con->in_msg_pos.page]);
1450 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1451 left);
1452 if (ret > 0 && datacrc)
1453 con->in_data_crc =
1454 crc32c(con->in_data_crc,
1455 p + con->in_msg_pos.page_pos, ret);
1456 kunmap(m->pages[con->in_msg_pos.page]);
1457 if (ret <= 0)
1458 return ret;
1459 con->in_msg_pos.data_pos += ret;
1460 con->in_msg_pos.page_pos += ret;
1461 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1462 con->in_msg_pos.page_pos = 0;
1463 con->in_msg_pos.page++;
1464 }
1465 }
1466
1467 /* footer */
1468 to = sizeof(m->hdr) + sizeof(m->footer);
1469 while (con->in_base_pos < to) {
1470 left = to - con->in_base_pos;
1471 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1472 (con->in_base_pos - sizeof(m->hdr)),
1473 left);
1474 if (ret <= 0)
1475 return ret;
1476 con->in_base_pos += ret;
1477 }
1478 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1479 m, front_len, m->footer.front_crc, middle_len,
1480 m->footer.middle_crc, data_len, m->footer.data_crc);
1481
1482 /* crc ok? */
1483 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1484 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1485 m, con->in_front_crc, le32_to_cpu(m->footer.front_crc));
1486 return -EBADMSG;
1487 }
1488 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1489 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1490 m, con->in_middle_crc, le32_to_cpu(m->footer.middle_crc));
1491 return -EBADMSG;
1492 }
1493 if (datacrc &&
1494 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1495 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1496 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1497 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1498 return -EBADMSG;
1499 }
1500
1501 return 1; /* done! */
1502}
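
Note how the header protects itself: hdr.crc is the trailing field of struct ceph_msg_header, so both peers run crc32c over everything before it. A sender-side sketch of the same calculation (illustrative only; the write path elsewhere in this file fills the field for real):

	/* sketch: stamp hdr.crc exactly the way the reader verifies it */
	static void sign_header_sketch(struct ceph_msg_header *hdr)
	{
		u32 crc = crc32c(0, (void *)hdr,
				 sizeof(*hdr) - sizeof(hdr->crc));

		hdr->crc = cpu_to_le32(crc);
	}
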
1503
1504/*
1505 * Process message. This happens in the worker thread. The callback should
1506 * be careful not to do anything that waits on other incoming messages or it
1507 * may deadlock.
1508 */
1509static void process_message(struct ceph_connection *con)
1510{
1511 struct ceph_msg *msg;
1512
1513 msg = con->in_msg;
1514 con->in_msg = NULL;
1515
1516 /* if first message, set peer_name */
1517 if (con->peer_name.type == 0)
1518 con->peer_name = msg->hdr.src;
1519
1520 con->in_seq++;
1521 mutex_unlock(&con->mutex);
1522
1523 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1524 msg, le64_to_cpu(msg->hdr.seq),
1525 ENTITY_NAME(msg->hdr.src),
1526 le16_to_cpu(msg->hdr.type),
1527 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1528 le32_to_cpu(msg->hdr.front_len),
1529 le32_to_cpu(msg->hdr.data_len),
1530 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1531 con->ops->dispatch(con, msg);
1532
1533 mutex_lock(&con->mutex);
1534 prepare_read_tag(con);
1535}
1536
1537
1538/*
1539 * Write something to the socket. Called in a worker thread when the
1540 * socket appears to be writeable and we have something ready to send.
1541 */
1542static int try_write(struct ceph_connection *con)
1543{
1544 struct ceph_messenger *msgr = con->msgr;
1545 int ret = 1;
1546
1547 dout("try_write start %p state %lu nref %d\n", con, con->state,
1548 atomic_read(&con->nref));
1549
1550more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552
1553 /* open the socket first? */
1554 if (con->sock == NULL) {
1555 /*
1556 * if we were STANDBY and are reconnecting _this_
1557 * connection, bump connect_seq now. Always bump
1558 * global_seq.
1559 */
1560 if (test_and_clear_bit(STANDBY, &con->state))
1561 con->connect_seq++;
1562
1563 prepare_write_banner(msgr, con);
1564 prepare_write_connect(msgr, con, 1);
1565 prepare_read_banner(con);
1566 set_bit(CONNECTING, &con->state);
1567 clear_bit(NEGOTIATING, &con->state);
1568
1569 BUG_ON(con->in_msg);
1570 con->in_tag = CEPH_MSGR_TAG_READY;
1571 dout("try_write initiating connect on %p new state %lu\n",
1572 con, con->state);
1573 con->sock = ceph_tcp_connect(con);
1574 if (IS_ERR(con->sock)) {
1575 con->sock = NULL;
1576 con->error_msg = "connect error";
1577 ret = -1;
1578 goto out;
1579 }
1580 }
1581
1582more_kvec:
1583 /* kvec data queued? */
1584 if (con->out_skip) {
1585 ret = write_partial_skip(con);
1586 if (ret < 0) {
1587 dout("try_write write_partial_skip err %d\n", ret);
1588 goto done;
1589 }
1590 if (ret == 0)
1591 goto done;
1592 }
1593 if (con->out_kvec_left) {
1594 ret = write_partial_kvec(con);
1595 if (ret <= 0)
1596 goto done;
1597 }
1598
1599 /* msg pages? */
1600 if (con->out_msg) {
1601 if (con->out_msg_done) {
1602 ceph_msg_put(con->out_msg);
1603 con->out_msg = NULL; /* we're done with this one */
1604 goto do_next;
1605 }
1606
1607 ret = write_partial_msg_pages(con);
1608 if (ret == 1)
1609 goto more_kvec; /* we need to send the footer, too! */
1610 if (ret == 0)
1611 goto done;
1612 if (ret < 0) {
1613 dout("try_write write_partial_msg_pages err %d\n",
1614 ret);
1615 goto done;
1616 }
1617 }
1618
1619do_next:
1620 if (!test_bit(CONNECTING, &con->state)) {
1621 /* is anything else pending? */
1622 if (!list_empty(&con->out_queue)) {
1623 prepare_write_message(con);
1624 goto more;
1625 }
1626 if (con->in_seq > con->in_seq_acked) {
1627 prepare_write_ack(con);
1628 goto more;
1629 }
1630 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1631 prepare_write_keepalive(con);
1632 goto more;
1633 }
1634 }
1635
1636 /* Nothing to do! */
1637 clear_bit(WRITE_PENDING, &con->state);
1638 dout("try_write nothing else to write.\n");
1639done:
1640 ret = 0;
1641out:
1642 dout("try_write done on %p\n", con);
1643 return ret;
1644}
1645
1646
1647
1648/*
1649 * Read what we can from the socket.
1650 */
1651static int try_read(struct ceph_connection *con)
1652{
1653 int ret = -1;
1654
1655 if (!con->sock)
1656 return 0;
1657
1658 if (test_bit(STANDBY, &con->state))
1659 return 0;
1660
1661 dout("try_read start on %p\n", con);
1662
1663more:
1664 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1665 con->in_base_pos);
1666 if (test_bit(CONNECTING, &con->state)) {
1667 if (!test_bit(NEGOTIATING, &con->state)) {
1668 dout("try_read connecting\n");
1669 ret = read_partial_banner(con);
1670 if (ret <= 0)
1671 goto done;
1672 if (process_banner(con) < 0) {
1673 ret = -1;
1674 goto out;
1675 }
1676 }
1677 ret = read_partial_connect(con);
1678 if (ret <= 0)
1679 goto done;
1680 if (process_connect(con) < 0) {
1681 ret = -1;
1682 goto out;
1683 }
1684 goto more;
1685 }
1686
1687 if (con->in_base_pos < 0) {
1688 /*
1689 * skipping + discarding content.
1690 *
1691 * FIXME: there must be a better way to do this!
1692 */
1693 static char buf[1024];
1694 int skip = min((int)sizeof(buf), -con->in_base_pos);
1695 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1696 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1697 if (ret <= 0)
1698 goto done;
1699 con->in_base_pos += ret;
1700 if (con->in_base_pos)
1701 goto more;
1702 }
1703 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1704 /*
1705 * what's next?
1706 */
1707 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1708 if (ret <= 0)
1709 goto done;
1710 dout("try_read got tag %d\n", (int)con->in_tag);
1711 switch (con->in_tag) {
1712 case CEPH_MSGR_TAG_MSG:
1713 prepare_read_message(con);
1714 break;
1715 case CEPH_MSGR_TAG_ACK:
1716 prepare_read_ack(con);
1717 break;
1718 case CEPH_MSGR_TAG_CLOSE:
1719 set_bit(CLOSED, &con->state); /* fixme */
1720 goto done;
1721 default:
1722 goto bad_tag;
1723 }
1724 }
1725 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1726 ret = read_partial_message(con);
1727 if (ret <= 0) {
1728 switch (ret) {
1729 case -EBADMSG:
1730 con->error_msg = "bad crc";
1731 ret = -EIO;
1732 goto out;
1733 case -EIO:
1734 con->error_msg = "io error";
1735 goto out;
1736 default:
1737 goto done;
1738 }
1739 }
1740 if (con->in_tag == CEPH_MSGR_TAG_READY)
1741 goto more;
1742 process_message(con);
1743 goto more;
1744 }
1745 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1746 ret = read_partial_ack(con);
1747 if (ret <= 0)
1748 goto done;
1749 process_ack(con);
1750 goto more;
1751 }
1752
1753done:
1754 ret = 0;
1755out:
1756 dout("try_read done on %p\n", con);
1757 return ret;
1758
1759bad_tag:
1760 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1761 con->error_msg = "protocol error, garbage tag";
1762 ret = -1;
1763 goto out;
1764}
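
Between messages, the stream carries a single tag byte announcing what comes next; the switch in try_read() above is the entire framing layer. A one-screen summary as code (invented helper, for illustration only):

	/* sketch: what each inter-message tag byte announces */
	static const char *tag_payload_sketch(char tag)
	{
		switch (tag) {
		case CEPH_MSGR_TAG_MSG:
			return "header + front/middle/data + footer";
		case CEPH_MSGR_TAG_ACK:
			return "__le64 seq of last message received";
		case CEPH_MSGR_TAG_CLOSE:
			return "no payload; peer closed the session";
		default:
			return "protocol error, connection will fault";
		}
	}
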
1765
1766
1767/*
1768 * Atomically queue work on a connection. Bump @con reference to
1769 * avoid races with connection teardown.
1770 *
1771 * There is some trickery going on with QUEUED and BUSY because we
1772 * only want a _single_ thread operating on each connection at any
1773 * point in time, but we want to use all available CPUs.
1774 *
1775 * The worker thread only proceeds if it can atomically set BUSY. It
1776 * clears QUEUED and does it's thing. When it thinks it's done, it
1777 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
1778 * (tries again to set BUSY).
1779 *
1780 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1781 * try to queue work. If that fails (the work is already queued, or
1782 * BUSY is set and a worker is running), we give up but leave QUEUED
1783 * set so that the worker thread will loop again if necessary.
1784 */
1785static void queue_con(struct ceph_connection *con)
1786{
1787 if (test_bit(DEAD, &con->state)) {
1788 dout("queue_con %p ignoring: DEAD\n",
1789 con);
1790 return;
1791 }
1792
1793 if (!con->ops->get(con)) {
1794 dout("queue_con %p ref count 0\n", con);
1795 return;
1796 }
1797
1798 set_bit(QUEUED, &con->state);
1799 if (test_bit(BUSY, &con->state)) {
1800 dout("queue_con %p - already BUSY\n", con);
1801 con->ops->put(con);
1802 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1803 dout("queue_con %p - already queued\n", con);
1804 con->ops->put(con);
1805 } else {
1806 dout("queue_con %p\n", con);
1807 }
1808}
1809
1810/*
1811 * Do some work on a connection. Drop a connection ref when we're done.
1812 */
1813static void con_work(struct work_struct *work)
1814{
1815 struct ceph_connection *con = container_of(work, struct ceph_connection,
1816 work.work);
1817 int backoff = 0;
1818
1819more:
1820 if (test_and_set_bit(BUSY, &con->state) != 0) {
1821 dout("con_work %p BUSY already set\n", con);
1822 goto out;
1823 }
1824 dout("con_work %p start, clearing QUEUED\n", con);
1825 clear_bit(QUEUED, &con->state);
1826
1827 mutex_lock(&con->mutex);
1828
1829 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1830 dout("con_work CLOSED\n");
1831 con_close_socket(con);
1832 goto done;
1833 }
1834 if (test_and_clear_bit(OPENING, &con->state)) {
1835 /* reopen w/ new peer */
1836 dout("con_work OPENING\n");
1837 con_close_socket(con);
1838 }
1839
1840 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1841 try_read(con) < 0 ||
1842 try_write(con) < 0) {
1843 mutex_unlock(&con->mutex);
1844 backoff = 1;
1845 ceph_fault(con); /* error/fault path */
1846 goto done_unlocked;
1847 }
1848
1849done:
1850 mutex_unlock(&con->mutex);
1851
1852done_unlocked:
1853 clear_bit(BUSY, &con->state);
1854 dout("con->state=%lu\n", con->state);
1855 if (test_bit(QUEUED, &con->state)) {
1856 if (!backoff || test_bit(OPENING, &con->state)) {
1857 dout("con_work %p QUEUED reset, looping\n", con);
1858 goto more;
1859 }
1860 dout("con_work %p QUEUED reset, but just faulted\n", con);
1861 clear_bit(QUEUED, &con->state);
1862 }
1863 dout("con_work %p done\n", con);
1864
1865out:
1866 con->ops->put(con);
1867}
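
The QUEUED/BUSY pairing described before queue_con() is a small lock-free idiom: exactly one worker runs per connection, and a queue request that races with a running worker is never lost, because the worker rechecks QUEUED after clearing BUSY. The idiom in isolation (invented name, same atomic bitops):

	/* sketch: single worker per connection, no lost wakeups */
	static void single_worker_sketch(unsigned long *state)
	{
	again:
		if (test_and_set_bit(BUSY, state))
			return;               /* another worker is running */
		clear_bit(QUEUED, state);
		/* ... do the actual read/write work here ... */
		clear_bit(BUSY, state);
		if (test_bit(QUEUED, state))
			goto again;           /* raced with a new request */
	}
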
1868
1869
1870/*
1871 * Generic error/fault handler. A retry mechanism is used with
1872 * exponential backoff.
1873 */
1874static void ceph_fault(struct ceph_connection *con)
1875{
1876 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1877 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1878 dout("fault %p state %lu to peer %s\n",
1879 con, con->state, pr_addr(&con->peer_addr.in_addr));
1880
1881 if (test_bit(LOSSYTX, &con->state)) {
1882 dout("fault on LOSSYTX channel\n");
1883 goto out;
1884 }
1885
1886 mutex_lock(&con->mutex);
1887 if (test_bit(CLOSED, &con->state))
1888 goto out_unlock;
1889
1890 con_close_socket(con);
1891
1892 if (con->in_msg) {
1893 ceph_msg_put(con->in_msg);
1894 con->in_msg = NULL;
1895 }
1896
1897 /* Requeue anything that hasn't been acked */
1898 list_splice_init(&con->out_sent, &con->out_queue);
1899
1900 /* If there are no messages in the queue, place the connection
1901 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1902 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1903 dout("fault setting STANDBY\n");
1904 set_bit(STANDBY, &con->state);
1905 } else {
1906 /* retry after a delay. */
1907 if (con->delay == 0)
1908 con->delay = BASE_DELAY_INTERVAL;
1909 else if (con->delay < MAX_DELAY_INTERVAL)
1910 con->delay *= 2;
1911 dout("fault queueing %p delay %lu\n", con, con->delay);
1912 con->ops->get(con);
1913 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1914 round_jiffies_relative(con->delay)) == 0)
1915 con->ops->put(con);
1916 }
1917
1918out_unlock:
1919 mutex_unlock(&con->mutex);
1920out:
1921 /*
1922 * in case we faulted due to authentication, invalidate our
1923 * current tickets so that we can get new ones.
1924 */
1925 if (con->auth_retry && con->ops->invalidate_authorizer) {
1926 dout("calling invalidate_authorizer()\n");
1927 con->ops->invalidate_authorizer(con);
1928 }
1929
1930 if (con->ops->fault)
1931 con->ops->fault(con);
1932}
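
The retry delay doubles on each successive fault, from BASE_DELAY_INTERVAL (HZ/2, half a second) toward the MAX_DELAY_INTERVAL cap of five minutes, and is reset when the connection recovers. The progression, factored out for clarity (sketch only):

	/* sketch: the exponential backoff schedule used above */
	static unsigned long next_fault_delay_sketch(unsigned long delay)
	{
		if (delay == 0)
			return BASE_DELAY_INTERVAL;    /* HZ/2 */
		if (delay < MAX_DELAY_INTERVAL)        /* 5*60*HZ cap */
			return delay * 2;
		return delay;
	}
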
1933
1934
1935
1936/*
1937 * create a new messenger instance
1938 */
1939struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1940{
1941 struct ceph_messenger *msgr;
1942
1943 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1944 if (msgr == NULL)
1945 return ERR_PTR(-ENOMEM);
1946
1947 spin_lock_init(&msgr->global_seq_lock);
1948
1949 /* the zero page is needed if a request is "canceled" while the message
1950 * is being written over the socket */
1951 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1952 if (!msgr->zero_page) {
1953 kfree(msgr);
1954 return ERR_PTR(-ENOMEM);
1955 }
1956 kmap(msgr->zero_page);
1957
1958 if (myaddr)
1959 msgr->inst.addr = *myaddr;
1960
1961 /* select a random nonce */
1962 msgr->inst.addr.type = 0;
1963 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1964 encode_my_addr(msgr);
1965
1966 dout("messenger_create %p\n", msgr);
1967 return msgr;
1968}
1969
1970void ceph_messenger_destroy(struct ceph_messenger *msgr)
1971{
1972 dout("destroy %p\n", msgr);
1973 kunmap(msgr->zero_page);
1974 __free_page(msgr->zero_page);
1975 kfree(msgr);
1976 dout("destroyed messenger %p\n", msgr);
1977}
1978
1979/*
1980 * Queue up an outgoing message on the given connection.
1981 */
1982void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1983{
1984 if (test_bit(CLOSED, &con->state)) {
1985 dout("con_send %p closed, dropping %p\n", con, msg);
1986 ceph_msg_put(msg);
1987 return;
1988 }
1989
1990 /* set src+dst */
1991 msg->hdr.src = con->msgr->inst.name;
1992
1993 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1994
1995 msg->needs_out_seq = true;
1996
1997 /* queue */
1998 mutex_lock(&con->mutex);
1999 BUG_ON(!list_empty(&msg->list_head));
2000 list_add_tail(&msg->list_head, &con->out_queue);
2001 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
2002 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
2003 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
2004 le32_to_cpu(msg->hdr.front_len),
2005 le32_to_cpu(msg->hdr.middle_len),
2006 le32_to_cpu(msg->hdr.data_len));
2007 mutex_unlock(&con->mutex);
2008
2009 /* if there wasn't anything waiting to send before, queue
2010 * new work */
2011 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2012 queue_con(con);
2013}
2014
2015/*
2016 * Revoke a message that was previously queued for send
2017 */
2018void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2019{
2020 mutex_lock(&con->mutex);
2021 if (!list_empty(&msg->list_head)) {
2022 dout("con_revoke %p msg %p - was on queue\n", con, msg);
2023 list_del_init(&msg->list_head);
2024 ceph_msg_put(msg);
2025 msg->hdr.seq = 0;
2026 }
2027 if (con->out_msg == msg) {
2028 dout("con_revoke %p msg %p - was sending\n", con, msg);
2029 con->out_msg = NULL;
2030 if (con->out_kvec_is_msg) {
2031 con->out_skip = con->out_kvec_bytes;
2032 con->out_kvec_is_msg = false;
2033 }
2034 ceph_msg_put(msg);
2035 msg->hdr.seq = 0;
2036 }
2037 mutex_unlock(&con->mutex);
2038}
2039
2040/*
2041 * Revoke a message that we may be reading data into
2042 */
2043void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2044{
2045 mutex_lock(&con->mutex);
2046 if (con->in_msg && con->in_msg == msg) {
2047 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2048 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2049 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2050
2051 /* skip rest of message */
2052 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2053 con->in_base_pos = con->in_base_pos -
2054 sizeof(struct ceph_msg_header) -
2055 front_len -
2056 middle_len -
2057 data_len -
2058 sizeof(struct ceph_msg_footer);
2059 ceph_msg_put(con->in_msg);
2060 con->in_msg = NULL;
2061 con->in_tag = CEPH_MSGR_TAG_READY;
2062 con->in_seq++;
2063 } else {
2064 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2065 con, con->in_msg, msg);
2066 }
2067 mutex_unlock(&con->mutex);
2068}
2069
2070/*
2071 * Queue a keepalive byte to ensure the tcp connection is alive.
2072 */
2073void ceph_con_keepalive(struct ceph_connection *con)
2074{
2075 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2076 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2077 queue_con(con);
2078}
2079
2080
2081/*
2082 * construct a new message with given type, size
2083 * the new msg has a ref count of 1.
2084 */
2085struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2086{
2087 struct ceph_msg *m;
2088
2089 m = kmalloc(sizeof(*m), flags);
2090 if (m == NULL)
2091 goto out;
2092 kref_init(&m->kref);
2093 INIT_LIST_HEAD(&m->list_head);
2094
2095 m->hdr.tid = 0;
2096 m->hdr.type = cpu_to_le16(type);
2097 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2098 m->hdr.version = 0;
2099 m->hdr.front_len = cpu_to_le32(front_len);
2100 m->hdr.middle_len = 0;
2101 m->hdr.data_len = 0;
2102 m->hdr.data_off = 0;
2103 m->hdr.reserved = 0;
2104 m->footer.front_crc = 0;
2105 m->footer.middle_crc = 0;
2106 m->footer.data_crc = 0;
2107 m->footer.flags = 0;
2108 m->front_max = front_len;
2109 m->front_is_vmalloc = false;
2110 m->more_to_follow = false;
2111 m->pool = NULL;
2112
2113 /* front */
2114 if (front_len) {
2115 if (front_len > PAGE_CACHE_SIZE) {
2116 m->front.iov_base = __vmalloc(front_len, flags,
2117 PAGE_KERNEL);
2118 m->front_is_vmalloc = true;
2119 } else {
2120 m->front.iov_base = kmalloc(front_len, flags);
2121 }
2122 if (m->front.iov_base == NULL) {
2123 pr_err("msg_new can't allocate %d bytes\n",
2124 front_len);
2125 goto out2;
2126 }
2127 } else {
2128 m->front.iov_base = NULL;
2129 }
2130 m->front.iov_len = front_len;
2131
2132 /* middle */
2133 m->middle = NULL;
2134
2135 /* data */
2136 m->nr_pages = 0;
2137 m->pages = NULL;
2138 m->pagelist = NULL;
2139
2140 dout("ceph_msg_new %p front %d\n", m, front_len);
2141 return m;
2142
2143out2:
2144 ceph_msg_put(m);
2145out:
2146 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2147 return NULL;
2148}
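
A typical caller, sketched with hypothetical values: size the message for its front section, fill the front in, then hand your reference to ceph_con_send(), which owns it from that point on (callers that need to keep the message, like mon_client below, take an extra ref with ceph_msg_get() first):

	/* sketch: allocate a message, fill the front, queue it for send */
	static int send_one_sketch(struct ceph_connection *con)
	{
		struct ceph_msg *msg;

		msg = ceph_msg_new(CEPH_MSG_STATFS,
				   sizeof(struct ceph_mon_statfs), GFP_NOFS);
		if (!msg)
			return -ENOMEM;
		/* ... fill msg->front.iov_base (iov_len is already set) ... */
		ceph_con_send(con, msg);   /* the send path owns this ref */
		return 0;
	}
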
2149
2150/*
2151 * Allocate "middle" portion of a message, if it is needed and wasn't
2152 * allocated by alloc_msg. This allows us to read a small fixed-size
2153 * per-type header in the front and then gracefully fail (i.e.,
2154 * propagate the error to the caller based on info in the front) when
2155 * the middle is too large.
2156 */
2157static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2158{
2159 int type = le16_to_cpu(msg->hdr.type);
2160 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2161
2162 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2163 ceph_msg_type_name(type), middle_len);
2164 BUG_ON(!middle_len);
2165 BUG_ON(msg->middle);
2166
2167 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2168 if (!msg->middle)
2169 return -ENOMEM;
2170 return 0;
2171}
2172
2173/*
2174 * Generic message allocator, for incoming messages.
2175 */
2176static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2177 struct ceph_msg_header *hdr,
2178 int *skip)
2179{
2180 int type = le16_to_cpu(hdr->type);
2181 int front_len = le32_to_cpu(hdr->front_len);
2182 int middle_len = le32_to_cpu(hdr->middle_len);
2183 struct ceph_msg *msg = NULL;
2184 int ret;
2185
2186 if (con->ops->alloc_msg) {
2187 mutex_unlock(&con->mutex);
2188 msg = con->ops->alloc_msg(con, hdr, skip);
2189 mutex_lock(&con->mutex);
2190 if (!msg || *skip)
2191 return NULL;
2192 }
2193 if (!msg) {
2194 *skip = 0;
2195 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2196 if (!msg) {
2197 pr_err("unable to allocate msg type %d len %d\n",
2198 type, front_len);
2199 return NULL;
2200 }
2201 }
2202 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2203
2204 if (middle_len && !msg->middle) {
2205 ret = ceph_alloc_middle(con, msg);
2206 if (ret < 0) {
2207 ceph_msg_put(msg);
2208 return NULL;
2209 }
2210 }
2211
2212 return msg;
2213}
2214
2215
2216/*
2217 * Free a generically kmalloc'd message.
2218 */
2219void ceph_msg_kfree(struct ceph_msg *m)
2220{
2221 dout("msg_kfree %p\n", m);
2222 if (m->front_is_vmalloc)
2223 vfree(m->front.iov_base);
2224 else
2225 kfree(m->front.iov_base);
2226 kfree(m);
2227}
2228
2229/*
2230 * Drop a msg ref. Destroy as needed.
2231 */
2232void ceph_msg_last_put(struct kref *kref)
2233{
2234 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2235
2236 dout("ceph_msg_put last one on %p\n", m);
2237 WARN_ON(!list_empty(&m->list_head));
2238
2239 /* drop middle, data, if any */
2240 if (m->middle) {
2241 ceph_buffer_put(m->middle);
2242 m->middle = NULL;
2243 }
2244 m->nr_pages = 0;
2245 m->pages = NULL;
2246
2247 if (m->pagelist) {
2248 ceph_pagelist_release(m->pagelist);
2249 kfree(m->pagelist);
2250 m->pagelist = NULL;
2251 }
2252
2253 if (m->pool)
2254 ceph_msgpool_put(m->pool, m);
2255 else
2256 ceph_msg_kfree(m);
2257}
2258
2259void ceph_msg_dump(struct ceph_msg *msg)
2260{
2261 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2262 msg->front_max, msg->nr_pages);
2263 print_hex_dump(KERN_DEBUG, "header: ",
2264 DUMP_PREFIX_OFFSET, 16, 1,
2265 &msg->hdr, sizeof(msg->hdr), true);
2266 print_hex_dump(KERN_DEBUG, " front: ",
2267 DUMP_PREFIX_OFFSET, 16, 1,
2268 msg->front.iov_base, msg->front.iov_len, true);
2269 if (msg->middle)
2270 print_hex_dump(KERN_DEBUG, "middle: ",
2271 DUMP_PREFIX_OFFSET, 16, 1,
2272 msg->middle->vec.iov_base,
2273 msg->middle->vec.iov_len, true);
2274 print_hex_dump(KERN_DEBUG, "footer: ",
2275 DUMP_PREFIX_OFFSET, 16, 1,
2276 &msg->footer, sizeof(msg->footer), true);
2277}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
deleted file mode 100644
index 76fbc957bc13..000000000000
--- a/fs/ceph/messenger.h
+++ /dev/null
@@ -1,253 +0,0 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52/* use format string %s%lld */
53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
54
55struct ceph_messenger {
56 struct ceph_entity_inst inst; /* my name+address */
57 struct ceph_entity_addr my_enc_addr;
58 struct page *zero_page; /* used in certain error cases */
59
60 bool nocrc;
61
62 /*
63 * the global_seq counts connections we (attempt to) initiate
64 * in order to disambiguate certain connect race conditions.
65 */
66 u32 global_seq;
67 spinlock_t global_seq_lock;
68};
69
70/*
71 * a single message. it contains a header (src, dest, message type, etc.),
72 * footer (crc values, mainly), a "front" message body, and possibly a
73 * data payload (stored in some number of pages).
74 */
75struct ceph_msg {
76 struct ceph_msg_header hdr; /* header */
77 struct ceph_msg_footer footer; /* footer */
78 struct kvec front; /* unaligned blobs of message */
79 struct ceph_buffer *middle;
80 struct page **pages; /* data payload. NOT OWNER. */
81 unsigned nr_pages; /* size of page array */
82 struct ceph_pagelist *pagelist; /* instead of pages */
83 struct list_head list_head;
84 struct kref kref;
85 bool front_is_vmalloc;
86 bool more_to_follow;
87 bool needs_out_seq;
88 int front_max;
89
90 struct ceph_msgpool *pool;
91};
92
93struct ceph_msg_pos {
94 int page, page_pos; /* which page; offset in page */
95 int data_pos; /* offset in data payload */
96 int did_page_crc; /* true if we've calculated crc for current page */
97};
98
99/* ceph connection fault delay defaults, for exponential backoff */
100#define BASE_DELAY_INTERVAL (HZ/2)
101#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
102
103/*
104 * ceph_connection state bit flags
105 *
106 * QUEUED and BUSY are used together to ensure that only a single
107 * thread is currently opening, reading or writing data to the socket.
108 */
109#define LOSSYTX 0 /* we can close channel or drop messages on errors */
110#define CONNECTING 1
111#define NEGOTIATING 2
112#define KEEPALIVE_PENDING 3
113#define WRITE_PENDING 4 /* we have data ready to send */
114#define QUEUED 5 /* there is work queued on this connection */
115#define BUSY 6 /* work is being done */
116#define STANDBY 8 /* no outgoing messages, socket closed. we keep
117 * the ceph_connection around to maintain shared
118 * state with the peer. */
119#define CLOSED 10 /* we've closed the connection */
120#define SOCK_CLOSED 11 /* socket state changed to closed */
121#define OPENING 13 /* open connection w/ (possibly new) peer */
122#define DEAD 14 /* dead, about to kfree */
123
124/*
125 * A single connection with another host.
126 *
127 * We maintain a queue of outgoing messages, and some session state to
128 * ensure that we can preserve the lossless, ordered delivery of
129 * messages in the case of a TCP disconnect.
130 */
131struct ceph_connection {
132 void *private;
133 atomic_t nref;
134
135 const struct ceph_connection_operations *ops;
136
137 struct ceph_messenger *msgr;
138 struct socket *sock;
139 unsigned long state; /* connection state (see flags above) */
140 const char *error_msg; /* error message, if any */
141
142 struct ceph_entity_addr peer_addr; /* peer address */
143 struct ceph_entity_name peer_name; /* peer name */
144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection (client side) */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 bool out_keepalive_pending;
161
162 u64 in_seq, in_seq_acked; /* last message received, acked */
163
164 /* connection negotiation temps */
165 char in_banner[CEPH_BANNER_MAX_LEN];
166 union {
167 struct { /* outgoing connection */
168 struct ceph_msg_connect out_connect;
169 struct ceph_msg_connect_reply in_reply;
170 };
171 struct { /* incoming */
172 struct ceph_msg_connect in_connect;
173 struct ceph_msg_connect_reply out_reply;
174 };
175 };
176 struct ceph_entity_addr actual_peer_addr;
177
178 /* message out temps */
179 struct ceph_msg *out_msg; /* sending message (== tail of
180 out_sent) */
181 bool out_msg_done;
182 struct ceph_msg_pos out_msg_pos;
183
184 struct kvec out_kvec[8], /* sending header/footer data */
185 *out_kvec_cur;
186 int out_kvec_left; /* kvec's left in out_kvec */
187 int out_skip; /* skip this many bytes */
188 int out_kvec_bytes; /* total bytes left */
189 bool out_kvec_is_msg; /* kvec refers to out_msg */
190 int out_more; /* there is more data after the kvecs */
191 __le64 out_temp_ack; /* for writing an ack */
192
193 /* message in temps */
194 struct ceph_msg_header in_hdr;
195 struct ceph_msg *in_msg;
196 struct ceph_msg_pos in_msg_pos;
197 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
198
199 char in_tag; /* protocol control byte */
200 int in_base_pos; /* bytes read */
201 __le64 in_temp_ack; /* for reading an ack */
202
203 struct delayed_work work; /* send|recv work */
204 unsigned long delay; /* current delay interval */
205};
206
207
208extern const char *pr_addr(const struct sockaddr_storage *ss);
209extern int ceph_parse_ips(const char *c, const char *end,
210 struct ceph_entity_addr *addr,
211 int max_count, int *count);
212
213
214extern int ceph_msgr_init(void);
215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
227extern void ceph_con_close(struct ceph_connection *con);
228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke_message(struct ceph_connection *con,
231 struct ceph_msg *msg);
232extern void ceph_con_keepalive(struct ceph_connection *con);
233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
234extern void ceph_con_put(struct ceph_connection *con);
235
236extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
237extern void ceph_msg_kfree(struct ceph_msg *m);
238
239
240static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
241{
242 kref_get(&msg->kref);
243 return msg;
244}
245extern void ceph_msg_last_put(struct kref *kref);
246static inline void ceph_msg_put(struct ceph_msg *msg)
247{
248 kref_put(&msg->kref, ceph_msg_last_put);
249}
250
251extern void ceph_msg_dump(struct ceph_msg *msg);
252
253#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
deleted file mode 100644
index b2a5a3e4a671..000000000000
--- a/fs/ceph/mon_client.c
+++ /dev/null
@@ -1,1018 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
18 * A small cluster of Ceph "monitors" is responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
31static const struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
60
61 if (num_mon >= CEPH_MAX_MON)
62 goto bad;
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
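
The decode above is easier to follow with the wire layout in view; all integers are little-endian and every read is bounds-checked against end. The layout, reconstructed from the decoder for illustration:

	/*
	 * monmap blob, as read by ceph_monmap_decode():
	 *
	 *   __le32 len;          bytes of payload that follow
	 *   __le16 version;
	 *   ceph_fsid fsid;
	 *   __le32 epoch;
	 *   __le32 num_mon;      must be < CEPH_MAX_MON
	 *   ceph_entity_inst mon_inst[num_mon];
	 */
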
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
108 ceph_msg_get(monc->m_auth); /* keep our ref */
109 ceph_con_send(monc->con, monc->m_auth);
110}
111
112/*
113 * Close monitor session, if any.
114 */
115static void __close_session(struct ceph_mon_client *monc)
116{
117 if (monc->con) {
118 dout("__close_session closing mon%d\n", monc->cur_mon);
119 ceph_con_revoke(monc->con, monc->m_auth);
120 ceph_con_close(monc->con);
121 monc->cur_mon = -1;
122 monc->pending_auth = 0;
123 ceph_auth_reset(monc->auth);
124 }
125}
126
127/*
128 * Open a session with a (new) monitor.
129 */
130static int __open_session(struct ceph_mon_client *monc)
131{
132 u8 r;
133 int ret;
134
135 if (monc->cur_mon < 0) {
136 get_random_bytes(&r, 1);
137 monc->cur_mon = r % monc->monmap->num_mon;
138 dout("open_session num=%d r=%d -> mon%d\n",
139 monc->monmap->num_mon, r, monc->cur_mon);
140 monc->sub_sent = 0;
141 monc->sub_renew_after = jiffies; /* i.e., expired */
142 monc->want_next_osdmap = !!monc->want_next_osdmap;
143
144 dout("open_session mon%d opening\n", monc->cur_mon);
145 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
146 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
147 ceph_con_open(monc->con,
148 &monc->monmap->mon_inst[monc->cur_mon].addr);
149
150 /* initiate authentication handshake */
151 ret = ceph_auth_build_hello(monc->auth,
152 monc->m_auth->front.iov_base,
153 monc->m_auth->front_max);
154 __send_prepared_auth_request(monc, ret);
155 } else {
156 dout("open_session mon%d already open\n", monc->cur_mon);
157 }
158 return 0;
159}
160
161static bool __sub_expired(struct ceph_mon_client *monc)
162{
163 return time_after_eq(jiffies, monc->sub_renew_after);
164}
165
166/*
167 * Reschedule delayed work timer.
168 */
169static void __schedule_delayed(struct ceph_mon_client *monc)
170{
171 unsigned delay;
172
173 if (monc->cur_mon < 0 || __sub_expired(monc))
174 delay = 10 * HZ;
175 else
176 delay = 20 * HZ;
177 dout("__schedule_delayed after %u\n", delay);
178 schedule_delayed_work(&monc->delayed_work, delay);
179}
180
181/*
182 * Send subscribe request for mdsmap and/or osdmap.
183 */
184static void __send_subscribe(struct ceph_mon_client *monc)
185{
186 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
187 (unsigned)monc->sub_sent, __sub_expired(monc),
188 monc->want_next_osdmap);
189 if ((__sub_expired(monc) && !monc->sub_sent) ||
190 monc->want_next_osdmap == 1) {
191 struct ceph_msg *msg = monc->m_subscribe;
192 struct ceph_mon_subscribe_item *i;
193 void *p, *end;
194
195 p = msg->front.iov_base;
196 end = p + msg->front_max;
197
198 dout("__send_subscribe to 'mdsmap' %u+\n",
199 (unsigned)monc->have_mdsmap);
200 if (monc->want_next_osdmap) {
201 dout("__send_subscribe to 'osdmap' %u\n",
202 (unsigned)monc->have_osdmap);
203 ceph_encode_32(&p, 3);
204 ceph_encode_string(&p, end, "osdmap", 6);
205 i = p;
206 i->have = cpu_to_le64(monc->have_osdmap);
207 i->onetime = 1;
208 p += sizeof(*i);
209 monc->want_next_osdmap = 2; /* requested */
210 } else {
211 ceph_encode_32(&p, 2);
212 }
213 ceph_encode_string(&p, end, "mdsmap", 6);
214 i = p;
215 i->have = cpu_to_le64(monc->have_mdsmap);
216 i->onetime = 0;
217 p += sizeof(*i);
218 ceph_encode_string(&p, end, "monmap", 6);
219 i = p;
220 i->have = 0;
221 i->onetime = 0;
222 p += sizeof(*i);
223
224 msg->front.iov_len = p - msg->front.iov_base;
225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
228
229 monc->sub_sent = jiffies | 1; /* never 0 */
230 }
231}
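
The subscribe request built above is a count followed by (name, item) pairs; a "onetime" subscription (the osdmap one) is dropped by the monitor after a single update, which is why want_next_osdmap cycles through its 0/1/2 states. The payload, reconstructed from the encoder (illustrative):

	/*
	 * subscribe payload, as written by __send_subscribe():
	 *
	 *   __le32 num_entries;            2, or 3 when an osdmap is wanted
	 *   repeated num_entries times:
	 *     __le32 len; char name[len];  "osdmap" / "mdsmap" / "monmap"
	 *     ceph_mon_subscribe_item:
	 *       __le64 have;               map version we already have
	 *       __u8   onetime;            1 = send a single update only
	 */
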
232
233static void handle_subscribe_ack(struct ceph_mon_client *monc,
234 struct ceph_msg *msg)
235{
236 unsigned seconds;
237 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
238
239 if (msg->front.iov_len < sizeof(*h))
240 goto bad;
241 seconds = le32_to_cpu(h->duration);
242
243 mutex_lock(&monc->mutex);
244 if (monc->hunting) {
245 pr_info("mon%d %s session established\n",
246 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
247 monc->hunting = false;
248 }
249 dout("handle_subscribe_ack after %d seconds\n", seconds);
250 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
251 monc->sub_sent = 0;
252 mutex_unlock(&monc->mutex);
253 return;
254bad:
255 pr_err("got corrupt subscribe-ack msg\n");
256 ceph_msg_dump(msg);
257}
258
259/*
260 * Keep track of which maps we have
261 */
262int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
263{
264 mutex_lock(&monc->mutex);
265 monc->have_mdsmap = got;
266 mutex_unlock(&monc->mutex);
267 return 0;
268}
269
270int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
271{
272 mutex_lock(&monc->mutex);
273 monc->have_osdmap = got;
274 monc->want_next_osdmap = 0;
275 mutex_unlock(&monc->mutex);
276 return 0;
277}
278
279/*
280 * Register interest in the next osdmap
281 */
282void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
283{
284 dout("request_next_osdmap have %u\n", monc->have_osdmap);
285 mutex_lock(&monc->mutex);
286 if (!monc->want_next_osdmap)
287 monc->want_next_osdmap = 1;
288 if (monc->want_next_osdmap < 2)
289 __send_subscribe(monc);
290 mutex_unlock(&monc->mutex);
291}
292
293/*
294 * Open a session with a monitor, creating the connection first if needed.
295 */
296int ceph_monc_open_session(struct ceph_mon_client *monc)
297{
298 if (!monc->con) {
299 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
300 if (!monc->con)
301 return -ENOMEM;
302 ceph_con_init(monc->client->msgr, monc->con);
303 monc->con->private = monc;
304 monc->con->ops = &mon_con_ops;
305 }
306
307 mutex_lock(&monc->mutex);
308 __open_session(monc);
309 __schedule_delayed(monc);
310 mutex_unlock(&monc->mutex);
311 return 0;
312}
313
314/*
315 * The monitor responds with a mount ack to indicate mount success. The
316 * included client ticket allows the client to talk to MDSs and OSDs.
317 */
318static void ceph_monc_handle_map(struct ceph_mon_client *monc,
319 struct ceph_msg *msg)
320{
321 struct ceph_client *client = monc->client;
322 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
323 void *p, *end;
324
325 mutex_lock(&monc->mutex);
326
327 dout("handle_monmap\n");
328 p = msg->front.iov_base;
329 end = p + msg->front.iov_len;
330
331 monmap = ceph_monmap_decode(p, end);
332 if (IS_ERR(monmap)) {
333 pr_err("problem decoding monmap, %d\n",
334 (int)PTR_ERR(monmap));
335 goto out;
336 }
337
338 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
339 kfree(monmap);
340 goto out;
341 }
342
343 client->monc.monmap = monmap;
344 kfree(old);
345
346out:
347 mutex_unlock(&monc->mutex);
348 wake_up_all(&client->auth_wq);
349}
350
351/*
352 * generic requests (e.g., statfs, poolop)
353 */
354static struct ceph_mon_generic_request *__lookup_generic_req(
355 struct ceph_mon_client *monc, u64 tid)
356{
357 struct ceph_mon_generic_request *req;
358 struct rb_node *n = monc->generic_request_tree.rb_node;
359
360 while (n) {
361 req = rb_entry(n, struct ceph_mon_generic_request, node);
362 if (tid < req->tid)
363 n = n->rb_left;
364 else if (tid > req->tid)
365 n = n->rb_right;
366 else
367 return req;
368 }
369 return NULL;
370}
371
372static void __insert_generic_request(struct ceph_mon_client *monc,
373 struct ceph_mon_generic_request *new)
374{
375 struct rb_node **p = &monc->generic_request_tree.rb_node;
376 struct rb_node *parent = NULL;
377 struct ceph_mon_generic_request *req = NULL;
378
379 while (*p) {
380 parent = *p;
381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
382 if (new->tid < req->tid)
383 p = &(*p)->rb_left;
384 else if (new->tid > req->tid)
385 p = &(*p)->rb_right;
386 else
387 BUG();
388 }
389
390 rb_link_node(&new->node, parent, p);
391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403
404 kfree(req);
405}
406
407static void put_generic_request(struct ceph_mon_generic_request *req)
408{
409 kref_put(&req->kref, release_generic_request);
410}
411
412static void get_generic_request(struct ceph_mon_generic_request *req)
413{
414 kref_get(&req->kref);
415}
416
417static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
418 struct ceph_msg_header *hdr,
419 int *skip)
420{
421 struct ceph_mon_client *monc = con->private;
422 struct ceph_mon_generic_request *req;
423 u64 tid = le64_to_cpu(hdr->tid);
424 struct ceph_msg *m;
425
426 mutex_lock(&monc->mutex);
427 req = __lookup_generic_req(monc, tid);
428 if (!req) {
429 dout("get_generic_reply %lld dne\n", tid);
430 *skip = 1;
431 m = NULL;
432 } else {
433 dout("get_generic_reply %lld got %p\n", tid, req->reply);
434 m = ceph_msg_get(req->reply);
435 /*
436 * we don't need to track the connection reading into
437 * this reply because we only have one open connection
438 * at a time, ever.
439 */
440 }
441 mutex_unlock(&monc->mutex);
442 return m;
443}
444
445static int do_generic_request(struct ceph_mon_client *monc,
446 struct ceph_mon_generic_request *req)
447{
448 int err;
449
450 /* register request */
451 mutex_lock(&monc->mutex);
452 req->tid = ++monc->last_tid;
453 req->request->hdr.tid = cpu_to_le64(req->tid);
454 __insert_generic_request(monc, req);
455 monc->num_generic_requests++;
456 ceph_con_send(monc->con, ceph_msg_get(req->request));
457 mutex_unlock(&monc->mutex);
458
459 err = wait_for_completion_interruptible(&req->completion);
460
461 mutex_lock(&monc->mutex);
462 rb_erase(&req->node, &monc->generic_request_tree);
463 monc->num_generic_requests--;
464 mutex_unlock(&monc->mutex);
465
466 if (!err)
467 err = req->result;
468 return err;
469}
470
471/*
472 * statfs
473 */
474static void handle_statfs_reply(struct ceph_mon_client *monc,
475 struct ceph_msg *msg)
476{
477 struct ceph_mon_generic_request *req;
478 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
479 u64 tid = le64_to_cpu(msg->hdr.tid);
480
481 if (msg->front.iov_len != sizeof(*reply))
482 goto bad;
483 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
484
485 mutex_lock(&monc->mutex);
486 req = __lookup_generic_req(monc, tid);
487 if (req) {
488 *(struct ceph_statfs *)req->buf = reply->st;
489 req->result = 0;
490 get_generic_request(req);
491 }
492 mutex_unlock(&monc->mutex);
493 if (req) {
494 complete_all(&req->completion);
495 put_generic_request(req);
496 }
497 return;
498
499bad:
500 pr_err("corrupt generic reply, tid %llu\n", tid);
501 ceph_msg_dump(msg);
502}
503
504/*
505 * Do a synchronous statfs().
506 */
507int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
508{
509 struct ceph_mon_generic_request *req;
510 struct ceph_mon_statfs *h;
511 int err;
512
513 req = kzalloc(sizeof(*req), GFP_NOFS);
514 if (!req)
515 return -ENOMEM;
516
517 kref_init(&req->kref);
518 req->buf = buf;
519 req->buf_len = sizeof(*buf);
520 init_completion(&req->completion);
521
522 err = -ENOMEM;
523 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
524 if (!req->request)
525 goto out;
526 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
527 if (!req->reply)
528 goto out;
529
530 /* fill out request */
531 h = req->request->front.iov_base;
532 h->monhdr.have_version = 0;
533 h->monhdr.session_mon = cpu_to_le16(-1);
534 h->monhdr.session_mon_tid = 0;
535 h->fsid = monc->monmap->fsid;
536
537 err = do_generic_request(monc, req);
538
539out:
540 kref_put(&req->kref, release_generic_request);
541 return err;
542}
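
From a caller's perspective the generic-request machinery is fully synchronous; the name below is invented for illustration:

	/* sketch: a caller blocking on a statfs round trip */
	static int statfs_caller_sketch(struct ceph_mon_client *monc)
	{
		struct ceph_statfs st;
		int err;

		err = ceph_monc_do_statfs(monc, &st);
		if (err < 0)
			return err;    /* interrupted, or request failed */
		/* st now holds the cluster-wide usage counters */
		return 0;
	}
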
543
544/*
545 * pool ops
546 */
547static int get_poolop_reply_buf(const char *src, size_t src_len,
548 char *dst, size_t dst_len)
549{
550 u32 buf_len;
551
552 if (src_len != sizeof(u32) + dst_len)
553 return -EINVAL;
554
555 buf_len = le32_to_cpu(*(u32 *)src);
556 if (buf_len != dst_len)
557 return -EINVAL;
558
559 memcpy(dst, src + sizeof(u32), dst_len);
560 return 0;
561}
562
563static void handle_poolop_reply(struct ceph_mon_client *monc,
564 struct ceph_msg *msg)
565{
566 struct ceph_mon_generic_request *req;
567 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
568 u64 tid = le64_to_cpu(msg->hdr.tid);
569
570 if (msg->front.iov_len < sizeof(*reply))
571 goto bad;
572 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
573
574 mutex_lock(&monc->mutex);
575 req = __lookup_generic_req(monc, tid);
576 if (req) {
577 if (req->buf_len &&
578 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
579 msg->front.iov_len - sizeof(*reply),
580 req->buf, req->buf_len) < 0) {
581 mutex_unlock(&monc->mutex);
582 goto bad;
583 }
584 req->result = le32_to_cpu(reply->reply_code);
585 get_generic_request(req);
586 }
587 mutex_unlock(&monc->mutex);
588 if (req) {
589 complete(&req->completion);
590 put_generic_request(req);
591 }
592 return;
593
594bad:
595 pr_err("corrupt generic reply, tid %llu\n", tid);
596 ceph_msg_dump(msg);
597}
598
599/*
600 * Do a synchronous pool op.
601 */
602int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
603 u32 pool, u64 snapid,
604 char *buf, int len)
605{
606 struct ceph_mon_generic_request *req;
607 struct ceph_mon_poolop *h;
608 int err;
609
610 req = kzalloc(sizeof(*req), GFP_NOFS);
611 if (!req)
612 return -ENOMEM;
613
614 kref_init(&req->kref);
615 req->buf = buf;
616 req->buf_len = len;
617 init_completion(&req->completion);
618
619 err = -ENOMEM;
620 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
621 if (!req->request)
622 goto out;
623 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
624 if (!req->reply)
625 goto out;
626
627 /* fill out request */
628 req->request->hdr.version = cpu_to_le16(2);
629 h = req->request->front.iov_base;
630 h->monhdr.have_version = 0;
631 h->monhdr.session_mon = cpu_to_le16(-1);
632 h->monhdr.session_mon_tid = 0;
633 h->fsid = monc->monmap->fsid;
634 h->pool = cpu_to_le32(pool);
635 h->op = cpu_to_le32(op);
636 h->auid = 0;
637 h->snapid = cpu_to_le64(snapid);
638 h->name_len = 0;
639
640 err = do_generic_request(monc, req);
641
642out:
643 kref_put(&req->kref, release_generic_request);
644 return err;
645}
646
647int ceph_monc_create_snapid(struct ceph_mon_client *monc,
648 u32 pool, u64 *snapid)
649{
650 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
651 pool, 0, (char *)snapid, sizeof(*snapid));
652
653}
654
655int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
656 u32 pool, u64 snapid)
657{
658 return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
659 pool, snapid, NULL, 0);
660
661}
662
663/*
664 * Resend pending generic requests.
665 */
666static void __resend_generic_request(struct ceph_mon_client *monc)
667{
668 struct ceph_mon_generic_request *req;
669 struct rb_node *p;
670
671 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
672 req = rb_entry(p, struct ceph_mon_generic_request, node);
673 ceph_con_revoke(monc->con, req->request);
674 ceph_con_send(monc->con, ceph_msg_get(req->request));
675 }
676}
677
678/*
679 * Delayed work. If we haven't mounted yet, retry. Otherwise,
680 * renew/retry subscription as needed (in case it is timing out, or we
681 * got an ENOMEM). And keep the monitor connection alive.
682 */
683static void delayed_work(struct work_struct *work)
684{
685 struct ceph_mon_client *monc =
686 container_of(work, struct ceph_mon_client, delayed_work.work);
687
688 dout("monc delayed_work\n");
689 mutex_lock(&monc->mutex);
690 if (monc->hunting) {
691 __close_session(monc);
692 __open_session(monc); /* continue hunting */
693 } else {
694 ceph_con_keepalive(monc->con);
695
696 __validate_auth(monc);
697
698 if (monc->auth->ops->is_authenticated(monc->auth))
699 __send_subscribe(monc);
700 }
701 __schedule_delayed(monc);
702 mutex_unlock(&monc->mutex);
703}
704
705/*
706 * On startup, we build a temporary monmap populated with the IPs
707 * provided by mount(2).
708 */
709static int build_initial_monmap(struct ceph_mon_client *monc)
710{
711 struct ceph_mount_args *args = monc->client->mount_args;
712 struct ceph_entity_addr *mon_addr = args->mon_addr;
713 int num_mon = args->num_mon;
714 int i;
715
716 /* build initial monmap */
717 monc->monmap = kzalloc(sizeof(*monc->monmap) +
718 num_mon*sizeof(monc->monmap->mon_inst[0]),
719 GFP_KERNEL);
720 if (!monc->monmap)
721 return -ENOMEM;
722 for (i = 0; i < num_mon; i++) {
723 monc->monmap->mon_inst[i].addr = mon_addr[i];
724 monc->monmap->mon_inst[i].addr.nonce = 0;
725 monc->monmap->mon_inst[i].name.type =
726 CEPH_ENTITY_TYPE_MON;
727 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
728 }
729 monc->monmap->num_mon = num_mon;
730 monc->have_fsid = false;
731
732 /* release addr memory */
733 kfree(args->mon_addr);
734 args->mon_addr = NULL;
735 args->num_mon = 0;
736 return 0;
737}
738
739int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
740{
741 int err = 0;
742
743 dout("init\n");
744 memset(monc, 0, sizeof(*monc));
745 monc->client = cl;
746 monc->monmap = NULL;
747 mutex_init(&monc->mutex);
748
749 err = build_initial_monmap(monc);
750 if (err)
751 goto out;
752
753 monc->con = NULL;
754
755 /* authentication */
756 monc->auth = ceph_auth_init(cl->mount_args->name,
757 cl->mount_args->secret);
758 if (IS_ERR(monc->auth))
759 return PTR_ERR(monc->auth);
760 monc->auth->want_keys =
761 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
762 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
763
764 /* msgs */
765 err = -ENOMEM;
766 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
767 sizeof(struct ceph_mon_subscribe_ack),
768 GFP_NOFS);
769 if (!monc->m_subscribe_ack)
770 goto out_monmap;
771
772 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
773 if (!monc->m_subscribe)
774 goto out_subscribe_ack;
775
776 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
777 if (!monc->m_auth_reply)
778 goto out_subscribe;
779
780 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
781 monc->pending_auth = 0;
782 if (!monc->m_auth)
783 goto out_auth_reply;
784
785 monc->cur_mon = -1;
786 monc->hunting = true;
787 monc->sub_renew_after = jiffies;
788 monc->sub_sent = 0;
789
790 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
791 monc->generic_request_tree = RB_ROOT;
792 monc->num_generic_requests = 0;
793 monc->last_tid = 0;
794
795 monc->have_mdsmap = 0;
796 monc->have_osdmap = 0;
797 monc->want_next_osdmap = 1;
798 return 0;
799
800out_auth_reply:
801 ceph_msg_put(monc->m_auth_reply);
802out_subscribe:
803 ceph_msg_put(monc->m_subscribe);
804out_subscribe_ack:
805 ceph_msg_put(monc->m_subscribe_ack);
806out_monmap:
807 kfree(monc->monmap);
808out:
809 return err;
810}
811
812void ceph_monc_stop(struct ceph_mon_client *monc)
813{
814 dout("stop\n");
815 cancel_delayed_work_sync(&monc->delayed_work);
816
817 mutex_lock(&monc->mutex);
818 __close_session(monc);
819 if (monc->con) {
820 monc->con->private = NULL;
821 monc->con->ops->put(monc->con);
822 monc->con = NULL;
823 }
824 mutex_unlock(&monc->mutex);
825
826 ceph_auth_destroy(monc->auth);
827
828 ceph_msg_put(monc->m_auth);
829 ceph_msg_put(monc->m_auth_reply);
830 ceph_msg_put(monc->m_subscribe);
831 ceph_msg_put(monc->m_subscribe_ack);
832
833 kfree(monc->monmap);
834}
835
836static void handle_auth_reply(struct ceph_mon_client *monc,
837 struct ceph_msg *msg)
838{
839 int ret;
840 int was_auth = 0;
841
842 mutex_lock(&monc->mutex);
843 if (monc->auth->ops)
844 was_auth = monc->auth->ops->is_authenticated(monc->auth);
845 monc->pending_auth = 0;
846 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
847 msg->front.iov_len,
848 monc->m_auth->front.iov_base,
849 monc->m_auth->front_max);
850 if (ret < 0) {
851 monc->client->auth_err = ret;
852 wake_up_all(&monc->client->auth_wq);
853 } else if (ret > 0) {
854 __send_prepared_auth_request(monc, ret);
855 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
856 dout("authenticated, starting session\n");
857
858 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
859 monc->client->msgr->inst.name.num =
860 cpu_to_le64(monc->auth->global_id);
861
862 __send_subscribe(monc);
863 __resend_generic_request(monc);
864 }
865 mutex_unlock(&monc->mutex);
866}
867
868static int __validate_auth(struct ceph_mon_client *monc)
869{
870 int ret;
871
872 if (monc->pending_auth)
873 return 0;
874
875 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
876 monc->m_auth->front_max);
877 if (ret <= 0)
878 return ret; /* either an error, or no need to authenticate */
879 __send_prepared_auth_request(monc, ret);
880 return 0;
881}
882
883int ceph_monc_validate_auth(struct ceph_mon_client *monc)
884{
885 int ret;
886
887 mutex_lock(&monc->mutex);
888 ret = __validate_auth(monc);
889 mutex_unlock(&monc->mutex);
890 return ret;
891}
892
893/*
894 * handle incoming message
895 */
896static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
897{
898 struct ceph_mon_client *monc = con->private;
899 int type = le16_to_cpu(msg->hdr.type);
900
901 if (!monc)
902 return;
903
904 switch (type) {
905 case CEPH_MSG_AUTH_REPLY:
906 handle_auth_reply(monc, msg);
907 break;
908
909 case CEPH_MSG_MON_SUBSCRIBE_ACK:
910 handle_subscribe_ack(monc, msg);
911 break;
912
913 case CEPH_MSG_STATFS_REPLY:
914 handle_statfs_reply(monc, msg);
915 break;
916
917 case CEPH_MSG_POOLOP_REPLY:
918 handle_poolop_reply(monc, msg);
919 break;
920
921 case CEPH_MSG_MON_MAP:
922 ceph_monc_handle_map(monc, msg);
923 break;
924
925 case CEPH_MSG_MDS_MAP:
926 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
927 break;
928
929 case CEPH_MSG_OSD_MAP:
930 ceph_osdc_handle_map(&monc->client->osdc, msg);
931 break;
932
933 default:
934 pr_err("received unknown message type %d %s\n", type,
935 ceph_msg_type_name(type));
936 }
937 ceph_msg_put(msg);
938}
939
940/*
941 * Allocate memory for incoming message
942 */
943static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
944 struct ceph_msg_header *hdr,
945 int *skip)
946{
947 struct ceph_mon_client *monc = con->private;
948 int type = le16_to_cpu(hdr->type);
949 int front_len = le32_to_cpu(hdr->front_len);
950 struct ceph_msg *m = NULL;
951
952 *skip = 0;
953
954 switch (type) {
955 case CEPH_MSG_MON_SUBSCRIBE_ACK:
956 m = ceph_msg_get(monc->m_subscribe_ack);
957 break;
958 case CEPH_MSG_POOLOP_REPLY:
959 case CEPH_MSG_STATFS_REPLY:
960 return get_generic_reply(con, hdr, skip);
961 case CEPH_MSG_AUTH_REPLY:
962 m = ceph_msg_get(monc->m_auth_reply);
963 break;
964 case CEPH_MSG_MON_MAP:
965 case CEPH_MSG_MDS_MAP:
966 case CEPH_MSG_OSD_MAP:
967 m = ceph_msg_new(type, front_len, GFP_NOFS);
968 break;
969 }
970
971 if (!m) {
972 pr_info("alloc_msg unknown type %d\n", type);
973 *skip = 1;
974 }
975 return m;
976}
977
978/*
979 * If the monitor connection resets, pick a new monitor and resubmit
980 * any pending requests.
981 */
982static void mon_fault(struct ceph_connection *con)
983{
984 struct ceph_mon_client *monc = con->private;
985
986 if (!monc)
987 return;
988
989 dout("mon_fault\n");
990 mutex_lock(&monc->mutex);
991 if (!con->private)
992 goto out;
993
994 if (monc->con && !monc->hunting)
995 pr_info("mon%d %s session lost, "
996 "hunting for new mon\n", monc->cur_mon,
997 pr_addr(&monc->con->peer_addr.in_addr));
998
999 __close_session(monc);
1000 if (!monc->hunting) {
1001 /* start hunting */
1002 monc->hunting = true;
1003 __open_session(monc);
1004 } else {
1005 /* already hunting, let's wait a bit */
1006 __schedule_delayed(monc);
1007 }
1008out:
1009 mutex_unlock(&monc->mutex);
1010}
1011
1012static const struct ceph_connection_operations mon_con_ops = {
1013 .get = ceph_con_get,
1014 .put = ceph_con_put,
1015 .dispatch = dispatch,
1016 .fault = mon_fault,
1017 .alloc_msg = mon_alloc_msg,
1018};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
deleted file mode 100644
index 8e396f2c0963..000000000000
--- a/fs/ceph/mon_client.h
+++ /dev/null
@@ -1,121 +0,0 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/rbtree.h>
7
8#include "messenger.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_generic_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * ceph_mon_generic_request is used for the statfs and poolop requests,
44 * which are handled a bit differently because we need to get data back
45 * to the caller.
46 */
47struct ceph_mon_generic_request {
48 struct kref kref;
49 u64 tid;
50 struct rb_node node;
51 int result;
52 void *buf;
53 int buf_len;
54 struct completion completion;
55 struct ceph_msg *request; /* original request */
56 struct ceph_msg *reply; /* and reply */
57};
58
59struct ceph_mon_client {
60 struct ceph_client *client;
61 struct ceph_monmap *monmap;
62
63 struct mutex mutex;
64 struct delayed_work delayed_work;
65
66 struct ceph_auth_client *auth;
67 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
68 int pending_auth;
69
70 bool hunting;
71 int cur_mon; /* last monitor I contacted */
72 unsigned long sub_sent, sub_renew_after;
73 struct ceph_connection *con;
74 bool have_fsid;
75
76 /* pending generic requests */
77 struct rb_root generic_request_tree;
78 int num_generic_requests;
79 u64 last_tid;
80
81 /* mds/osd map */
82 int want_next_osdmap; /* 1 = want, 2 = want+asked */
83 u32 have_osdmap, have_mdsmap;
84
85#ifdef CONFIG_DEBUG_FS
86 struct dentry *debugfs_file;
87#endif
88};
89
90extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
91extern int ceph_monmap_contains(struct ceph_monmap *m,
92 struct ceph_entity_addr *addr);
93
94extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
95extern void ceph_monc_stop(struct ceph_mon_client *monc);
96
97/*
98 * The model here is to indicate that we need a new map of at least
99 * epoch @want, and also call in when we receive a map. We will
100 * periodically rerequest the map from the monitor cluster until we
101 * get what we want.
102 */
103extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
104extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
105
106extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
107
108extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
109 struct ceph_statfs *buf);
110
111extern int ceph_monc_open_session(struct ceph_mon_client *monc);
112
113extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
114
115extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
116 u32 pool, u64 *snapid);
117
118extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
119 u32 pool, u64 snapid);
120
121#endif
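The map-epoch model described in the comment above pairs a "want" with a "got": ceph_monc_request_next_osdmap() marks the next map as wanted, and ceph_monc_got_osdmap() reports the epoch actually received so the periodic rerequest can stop. A hedged sketch of a map-handling caller, mirroring what ceph_osdc_handle_map() does further below with its decoded map (the function name here is illustrative):

	static void on_new_osdmap(struct ceph_mon_client *monc, u32 epoch)
	{
		/* record what we now have; lets the rerequest loop stop */
		ceph_monc_got_osdmap(monc, epoch);
		/* if we expect further changes, ask to stay subscribed */
		ceph_monc_request_next_osdmap(monc);
	}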
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
deleted file mode 100644
index dd65a6438131..000000000000
--- a/fs/ceph/msgpool.c
+++ /dev/null
@@ -1,64 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11{
12 struct ceph_msgpool *pool = arg;
13 void *p;
14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
20
21static void free_fn(void *element, void *arg)
22{
23 ceph_msg_put(element);
24}
25
26int ceph_msgpool_init(struct ceph_msgpool *pool,
27 int front_len, int size, bool blocking, const char *name)
28{
29 pool->front_len = front_len;
30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
31 if (!pool->pool)
32 return -ENOMEM;
33 pool->name = name;
34 return 0;
35}
36
37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
38{
39 mempool_destroy(pool->pool);
40}
41
42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
44{
45 if (front_len > pool->front_len) {
46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
47 pool->name, front_len, pool->front_len);
48 WARN_ON(1);
49
50 /* try to alloc a fresh message */
51 return ceph_msg_new(0, front_len, GFP_NOFS);
52 }
53
54 return mempool_alloc(pool->pool, GFP_NOFS);
55}
56
57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
58{
59 /* reset msg front_len; user may have changed it */
60 msg->front.iov_len = pool->front_len;
61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
62
63 kref_init(&msg->kref); /* retake single ref */
64}
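The four functions above form an init/get/put/destroy lifecycle around a mempool_t. A hedged sketch of the intended pairing (the function is illustrative; note the name string is stored by reference and must outlive the pool):

	static int msgpool_roundtrip(void)
	{
		struct ceph_msgpool pool;
		struct ceph_msg *msg;
		int err;

		/* preallocate 8 messages with 512-byte fronts */
		err = ceph_msgpool_init(&pool, 512, 8, true, "example");
		if (err < 0)
			return err;

		msg = ceph_msgpool_get(&pool, 512); /* blocks rather than failing */
		/* ... fill in msg->front and send it ... */
		ceph_msgpool_put(&pool, msg);	/* reset front_len, retake ref */

		ceph_msgpool_destroy(&pool);
		return 0;
	}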
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
deleted file mode 100644
index a362605f9368..000000000000
--- a/fs/ceph/msgpool.h
+++ /dev/null
@@ -1,25 +0,0 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include <linux/mempool.h>
5#include "messenger.h"
6
7/*
8 * we use memory pools for preallocating messages we may receive, to
9 * avoid unexpected OOM conditions.
10 */
11struct ceph_msgpool {
12 const char *name;
13 mempool_t *pool;
14 int front_len; /* preallocated payload size */
15};
16
17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
18 int front_len, int size, bool blocking,
19 const char *name);
20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
22 int front_len);
23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
24
25#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
deleted file mode 100644
index 680d3d648cac..000000000000
--- a/fs/ceph/msgr.h
+++ /dev/null
@@ -1,175 +0,0 @@
1#ifndef CEPH_MSGR_H
2#define CEPH_MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * tcp connection banner. include a protocol version, and adjust
21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
30 * Comparator returns a negative, zero, or positive value.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
38
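The signed subtraction above stays correct across a 32-bit wrap as long as the two sequence numbers are within 2^31 of each other: the unsigned difference is reinterpreted as signed, so a freshly wrapped number still compares as newer. For example:

	/* 2 is "after" 0xfffffffe despite being numerically smaller */
	BUG_ON(ceph_seq_cmp(2, 0xfffffffeU) <= 0);	/* 2 - (-2) = 4 */
	BUG_ON(ceph_seq_cmp(0xfffffffeU, 2) >= 0);	/* -2 - 2 = -4 */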
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_AUTH 0x20
54
55#define CEPH_ENTITY_TYPE_ANY 0xFF
56
57extern const char *ceph_entity_type_name(int type);
58
59/*
60 * entity_addr -- network address
61 */
62struct ceph_entity_addr {
63 __le32 type;
64 __le32 nonce; /* unique id for process (e.g. pid) */
65 struct sockaddr_storage in_addr;
66} __attribute__ ((packed));
67
68struct ceph_entity_inst {
69 struct ceph_entity_name name;
70 struct ceph_entity_addr addr;
71} __attribute__ ((packed));
72
73
74/* used by message exchange protocol */
75#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
76#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
77#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
78 incoming connection */
79#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
80 with higher cseq */
81#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
82 with higher gseq */
83#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
84#define CEPH_MSGR_TAG_MSG 7 /* message */
85#define CEPH_MSGR_TAG_ACK 8 /* message ack */
86#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
87#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
88#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
89#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
90
91
92/*
93 * connection negotiation
94 */
95struct ceph_msg_connect {
96 __le64 features; /* supported feature bits */
97 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
98 __le32 global_seq; /* count connections initiated by this host */
99 __le32 connect_seq; /* count connections initiated in this session */
100 __le32 protocol_version;
101 __le32 authorizer_protocol;
102 __le32 authorizer_len;
103 __u8 flags; /* CEPH_MSG_CONNECT_* */
104} __attribute__ ((packed));
105
106struct ceph_msg_connect_reply {
107 __u8 tag;
108 __le64 features; /* feature bits for this session */
109 __le32 global_seq;
110 __le32 connect_seq;
111 __le32 protocol_version;
112 __le32 authorizer_len;
113 __u8 flags;
114} __attribute__ ((packed));
115
116#define CEPH_MSG_CONNECT_LOSSY 1 /* messages I send may be safely dropped */
117
118
119/*
120 * message header
121 */
122struct ceph_msg_header_old {
123 __le64 seq; /* message seq# for this session */
124 __le64 tid; /* transaction id */
125 __le16 type; /* message type */
126 __le16 priority; /* priority. higher value == higher priority */
127 __le16 version; /* version of message encoding */
128
129 __le32 front_len; /* bytes in main payload */
130 __le32 middle_len;/* bytes in middle payload */
131 __le32 data_len; /* bytes of data payload */
132 __le16 data_off; /* sender: include full offset;
133 receiver: mask against ~PAGE_MASK */
134
135 struct ceph_entity_inst src, orig_src;
136 __le32 reserved;
137 __le32 crc; /* header crc32c */
138} __attribute__ ((packed));
139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
158#define CEPH_MSG_PRIO_LOW 64
159#define CEPH_MSG_PRIO_DEFAULT 127
160#define CEPH_MSG_PRIO_HIGH 196
161#define CEPH_MSG_PRIO_HIGHEST 255
162
163/*
164 * follows data payload
165 */
166struct ceph_msg_footer {
167 __le32 front_crc, middle_crc, data_crc;
168 __u8 flags;
169} __attribute__ ((packed));
170
171#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
172#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
173
174
175#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
deleted file mode 100644
index dfced1dacbcd..000000000000
--- a/fs/ceph/osd_client.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
80
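The sprintf() above derives every object name from the hex inode number and the hex block number, the latter zero-padded to eight digits. For example, block 5 of inode 0x1234:

	sprintf(req->r_oid, "%llx.%08llx", 0x1234ULL, 5ULL);
	/* req->r_oid is now "1234.00000005" */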
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return NULL;
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (!msg) {
169 ceph_osdc_put_request(req);
170 return NULL;
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (!msg) {
183 ceph_osdc_put_request(req);
184 return NULL;
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
369 kfree(osd);
370 }
371}
372
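get_osd() above takes its reference with atomic_inc_not_zero() so that a racing lookup can never resurrect an osd whose last reference has already been dropped: the increment succeeds only while the count is still positive. The same idiom in miniature, with illustrative names:

	static struct obj *obj_get(struct obj *o)
	{
		/* fails once the refcount has already hit zero */
		if (!atomic_inc_not_zero(&o->refcount))
			return NULL;	/* too late: being torn down */
		return o;
	}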
373/*
374 * remove an osd from our map
375 */
376static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
377{
378 dout("__remove_osd %p\n", osd);
379 BUG_ON(!list_empty(&osd->o_requests));
380 rb_erase(&osd->o_node, &osdc->osds);
381 list_del_init(&osd->o_osd_lru);
382 ceph_con_close(&osd->o_con);
383 put_osd(osd);
384}
385
386static void __move_osd_to_lru(struct ceph_osd_client *osdc,
387 struct ceph_osd *osd)
388{
389 dout("__move_osd_to_lru %p\n", osd);
390 BUG_ON(!list_empty(&osd->o_osd_lru));
391 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
392 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
393}
394
395static void __remove_osd_from_lru(struct ceph_osd *osd)
396{
397 dout("__remove_osd_from_lru %p\n", osd);
398 if (!list_empty(&osd->o_osd_lru))
399 list_del_init(&osd->o_osd_lru);
400}
401
402static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
403{
404 struct ceph_osd *osd, *nosd;
405
406 dout("__remove_old_osds %p\n", osdc);
407 mutex_lock(&osdc->request_mutex);
408 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
409 if (!remove_all && time_before(jiffies, osd->lru_ttl))
410 break;
411 __remove_osd(osdc, osd);
412 }
413 mutex_unlock(&osdc->request_mutex);
414}
415
416/*
417 * reset osd connect
418 */
419static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
420{
421 struct ceph_osd_request *req;
422 int ret = 0;
423
424 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
425 if (list_empty(&osd->o_requests)) {
426 __remove_osd(osdc, osd);
427 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
428 &osd->o_con.peer_addr,
429 sizeof(osd->o_con.peer_addr)) == 0 &&
430 !ceph_con_opened(&osd->o_con)) {
431 dout(" osd addr hasn't changed and connection never opened,"
432 " letting msgr retry");
433 /* touch each r_stamp for handle_timeout()'s benefit */
434 list_for_each_entry(req, &osd->o_requests, r_osd_item)
435 req->r_stamp = jiffies;
436 ret = -EAGAIN;
437 } else {
438 ceph_con_close(&osd->o_con);
439 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
440 osd->o_incarnation++;
441 }
442 return ret;
443}
444
445static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
446{
447 struct rb_node **p = &osdc->osds.rb_node;
448 struct rb_node *parent = NULL;
449 struct ceph_osd *osd = NULL;
450
451 while (*p) {
452 parent = *p;
453 osd = rb_entry(parent, struct ceph_osd, o_node);
454 if (new->o_osd < osd->o_osd)
455 p = &(*p)->rb_left;
456 else if (new->o_osd > osd->o_osd)
457 p = &(*p)->rb_right;
458 else
459 BUG();
460 }
461
462 rb_link_node(&new->o_node, parent, p);
463 rb_insert_color(&new->o_node, &osdc->osds);
464}
465
466static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
467{
468 struct ceph_osd *osd;
469 struct rb_node *n = osdc->osds.rb_node;
470
471 while (n) {
472 osd = rb_entry(n, struct ceph_osd, o_node);
473 if (o < osd->o_osd)
474 n = n->rb_left;
475 else if (o > osd->o_osd)
476 n = n->rb_right;
477 else
478 return osd;
479 }
480 return NULL;
481}
482
483static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
484{
485 schedule_delayed_work(&osdc->timeout_work,
486 osdc->client->mount_args->osd_keepalive_timeout * HZ);
487}
488
489static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
490{
491 cancel_delayed_work(&osdc->timeout_work);
492}
493
494/*
495 * Register request, assign tid. If this is the first request, set up
496 * the timeout event.
497 */
498static void register_request(struct ceph_osd_client *osdc,
499 struct ceph_osd_request *req)
500{
501 mutex_lock(&osdc->request_mutex);
502 req->r_tid = ++osdc->last_tid;
503 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
504 INIT_LIST_HEAD(&req->r_req_lru_item);
505
506 dout("register_request %p tid %lld\n", req, req->r_tid);
507 __insert_request(osdc, req);
508 ceph_osdc_get_request(req);
509 osdc->num_requests++;
510
511 if (osdc->num_requests == 1) {
512 dout(" first request, scheduling timeout\n");
513 __schedule_osd_timeout(osdc);
514 }
515 mutex_unlock(&osdc->request_mutex);
516}
517
518/*
519 * called under osdc->request_mutex
520 */
521static void __unregister_request(struct ceph_osd_client *osdc,
522 struct ceph_osd_request *req)
523{
524 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
525 rb_erase(&req->r_node, &osdc->requests);
526 osdc->num_requests--;
527
528 if (req->r_osd) {
529 /* make sure the original request isn't in flight. */
530 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
531
532 list_del_init(&req->r_osd_item);
533 if (list_empty(&req->r_osd->o_requests))
534 __move_osd_to_lru(osdc, req->r_osd);
535 req->r_osd = NULL;
536 }
537
538 ceph_osdc_put_request(req);
539
540 list_del_init(&req->r_req_lru_item);
541 if (osdc->num_requests == 0) {
542 dout(" no requests, canceling timeout\n");
543 __cancel_osd_timeout(osdc);
544 }
545}
546
547/*
548 * Cancel a previously queued request message
549 */
550static void __cancel_request(struct ceph_osd_request *req)
551{
552 if (req->r_sent) {
553 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
554 req->r_sent = 0;
555 }
556 list_del_init(&req->r_req_lru_item);
557}
558
559/*
560 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
561 * (as needed), and set the request r_osd appropriately. If there is
562 * no up osd, set r_osd to NULL.
563 *
564 * Return 0 if unchanged, 1 if changed, or negative on error.
565 *
566 * Caller should hold map_sem for read and request_mutex.
567 */
568static int __map_osds(struct ceph_osd_client *osdc,
569 struct ceph_osd_request *req)
570{
571 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
572 struct ceph_pg pgid;
573 int acting[CEPH_PG_MAX_SIZE];
574 int o = -1, num = 0;
575 int err;
576
577 dout("map_osds %p tid %lld\n", req, req->r_tid);
578 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
579 &req->r_file_layout, osdc->osdmap);
580 if (err)
581 return err;
582 pgid = reqhead->layout.ol_pgid;
583 req->r_pgid = pgid;
584
585 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
586 if (err > 0) {
587 o = acting[0];
588 num = err;
589 }
590
591 if ((req->r_osd && req->r_osd->o_osd == o &&
592 req->r_sent >= req->r_osd->o_incarnation &&
593 req->r_num_pg_osds == num &&
594 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
595 (req->r_osd == NULL && o == -1))
596 return 0; /* no change */
597
598 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
599 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
600 req->r_osd ? req->r_osd->o_osd : -1);
601
602 /* record full pg acting set */
603 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
604 req->r_num_pg_osds = num;
605
606 if (req->r_osd) {
607 __cancel_request(req);
608 list_del_init(&req->r_osd_item);
609 req->r_osd = NULL;
610 }
611
612 req->r_osd = __lookup_osd(osdc, o);
613 if (!req->r_osd && o >= 0) {
614 err = -ENOMEM;
615 req->r_osd = create_osd(osdc);
616 if (!req->r_osd)
617 goto out;
618
619 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
620 req->r_osd->o_osd = o;
621 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
622 __insert_osd(osdc, req->r_osd);
623
624 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
625 }
626
627 if (req->r_osd) {
628 __remove_osd_from_lru(req->r_osd);
629 list_add(&req->r_osd_item, &req->r_osd->o_requests);
630 }
631 err = 1; /* osd or pg changed */
632
633out:
634 return err;
635}
636
637/*
638 * caller should hold map_sem (for read) and request_mutex
639 */
640static int __send_request(struct ceph_osd_client *osdc,
641 struct ceph_osd_request *req)
642{
643 struct ceph_osd_request_head *reqhead;
644 int err;
645
646 err = __map_osds(osdc, req);
647 if (err < 0)
648 return err;
649 if (req->r_osd == NULL) {
650 dout("send_request %p no up osds in pg\n", req);
651 ceph_monc_request_next_osdmap(&osdc->client->monc);
652 return 0;
653 }
654
655 dout("send_request %p tid %llu to osd%d flags %d\n",
656 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
657
658 reqhead = req->r_request->front.iov_base;
659 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
660 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
661 reqhead->reassert_version = req->r_reassert_version;
662
663 req->r_stamp = jiffies;
664 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
665
666 ceph_msg_get(req->r_request); /* send consumes a ref */
667 ceph_con_send(&req->r_osd->o_con, req->r_request);
668 req->r_sent = req->r_osd->o_incarnation;
669 return 0;
670}
671
672/*
673 * Timeout callback, called every N seconds when 1 or more osd
674 * requests have been active for more than N seconds. When this
675 * happens, we ping all OSDs whose requests have timed out to
676 * ensure any communications channel reset is detected. Reset the
677 * request timeouts another N seconds in the future as we go.
678 * Reschedule the timeout event another N seconds in the future (unless
679 * there are no open requests).
680 */
681static void handle_timeout(struct work_struct *work)
682{
683 struct ceph_osd_client *osdc =
684 container_of(work, struct ceph_osd_client, timeout_work.work);
685 struct ceph_osd_request *req, *last_req = NULL;
686 struct ceph_osd *osd;
687 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
688 unsigned long keepalive =
689 osdc->client->mount_args->osd_keepalive_timeout * HZ;
690 unsigned long last_stamp = 0;
691 struct rb_node *p;
692 struct list_head slow_osds;
693
694 dout("timeout\n");
695 down_read(&osdc->map_sem);
696
697 ceph_monc_request_next_osdmap(&osdc->client->monc);
698
699 mutex_lock(&osdc->request_mutex);
700 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
701 req = rb_entry(p, struct ceph_osd_request, r_node);
702
703 if (req->r_resend) {
704 int err;
705
706 dout("osdc resending prev failed %lld\n", req->r_tid);
707 err = __send_request(osdc, req);
708 if (err)
709 dout("osdc failed again on %lld\n", req->r_tid);
710 else
711 req->r_resend = false;
712 continue;
713 }
714 }
715
716 /*
717 * reset osds that appear to be _really_ unresponsive. this
718 * is a failsafe measure; we really shouldn't be getting to
719 * this point if the system is working properly. the monitors
720 * should mark the osd as failed and we should find out about
721 * it from an updated osd map.
722 */
723 while (timeout && !list_empty(&osdc->req_lru)) {
724 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
725 r_req_lru_item);
726
727 if (time_before(jiffies, req->r_stamp + timeout))
728 break;
729
730 BUG_ON(req == last_req && req->r_stamp == last_stamp);
731 last_req = req;
732 last_stamp = req->r_stamp;
733
734 osd = req->r_osd;
735 BUG_ON(!osd);
736 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
737 req->r_tid, osd->o_osd);
738 __kick_requests(osdc, osd);
739 }
740
741 /*
742 * ping osds that are a bit slow. this ensures that if there
743 * is a break in the TCP connection we will notice, and reopen
744 * a connection with that osd (from the fault callback).
745 */
746 INIT_LIST_HEAD(&slow_osds);
747 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
748 if (time_before(jiffies, req->r_stamp + keepalive))
749 break;
750
751 osd = req->r_osd;
752 BUG_ON(!osd);
753 dout(" tid %llu is slow, will send keepalive on osd%d\n",
754 req->r_tid, osd->o_osd);
755 list_move_tail(&osd->o_keepalive_item, &slow_osds);
756 }
757 while (!list_empty(&slow_osds)) {
758 osd = list_entry(slow_osds.next, struct ceph_osd,
759 o_keepalive_item);
760 list_del_init(&osd->o_keepalive_item);
761 ceph_con_keepalive(&osd->o_con);
762 }
763
764 __schedule_osd_timeout(osdc);
765 mutex_unlock(&osdc->request_mutex);
766
767 up_read(&osdc->map_sem);
768}
769
770static void handle_osds_timeout(struct work_struct *work)
771{
772 struct ceph_osd_client *osdc =
773 container_of(work, struct ceph_osd_client,
774 osds_timeout_work.work);
775 unsigned long delay =
776 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
777
778 dout("osds timeout\n");
779 down_read(&osdc->map_sem);
780 remove_old_osds(osdc, 0);
781 up_read(&osdc->map_sem);
782
783 schedule_delayed_work(&osdc->osds_timeout_work,
784 round_jiffies_relative(delay));
785}
786
787/*
788 * handle osd op reply. either call the callback if it is specified,
789 * or do the completion to wake up the waiting thread.
790 */
791static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
792 struct ceph_connection *con)
793{
794 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
795 struct ceph_osd_request *req;
796 u64 tid;
797 int numops, object_len, flags;
798 s32 result;
799
800 tid = le64_to_cpu(msg->hdr.tid);
801 if (msg->front.iov_len < sizeof(*rhead))
802 goto bad;
803 numops = le32_to_cpu(rhead->num_ops);
804 object_len = le32_to_cpu(rhead->object_len);
805 result = le32_to_cpu(rhead->result);
806 if (msg->front.iov_len != sizeof(*rhead) + object_len +
807 numops * sizeof(struct ceph_osd_op))
808 goto bad;
809 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
810
811 /* lookup */
812 mutex_lock(&osdc->request_mutex);
813 req = __lookup_request(osdc, tid);
814 if (req == NULL) {
815 dout("handle_reply tid %llu dne\n", tid);
816 mutex_unlock(&osdc->request_mutex);
817 return;
818 }
819 ceph_osdc_get_request(req);
820 flags = le32_to_cpu(rhead->flags);
821
822 /*
823 * if this connection filled our message, drop our reference now, to
824 * avoid a (safe but slower) revoke later.
825 */
826 if (req->r_con_filling_msg == con && req->r_reply == msg) {
827 dout(" dropping con_filling_msg ref %p\n", con);
828 req->r_con_filling_msg = NULL;
829 ceph_con_put(con);
830 }
831
832 if (!req->r_got_reply) {
833 unsigned bytes;
834
835 req->r_result = le32_to_cpu(rhead->result);
836 bytes = le32_to_cpu(msg->hdr.data_len);
837 dout("handle_reply result %d bytes %d\n", req->r_result,
838 bytes);
839 if (req->r_result == 0)
840 req->r_result = bytes;
841
842 /* in case this is a write and we need to replay, */
843 req->r_reassert_version = rhead->reassert_version;
844
845 req->r_got_reply = 1;
846 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
847 dout("handle_reply tid %llu dup ack\n", tid);
848 mutex_unlock(&osdc->request_mutex);
849 goto done;
850 }
851
852 dout("handle_reply tid %llu flags %d\n", tid, flags);
853
854 /* either this is a read, or we got the safe response */
855 if (result < 0 ||
856 (flags & CEPH_OSD_FLAG_ONDISK) ||
857 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
858 __unregister_request(osdc, req);
859
860 mutex_unlock(&osdc->request_mutex);
861
862 if (req->r_callback)
863 req->r_callback(req, msg);
864 else
865 complete_all(&req->r_completion);
866
867 if (flags & CEPH_OSD_FLAG_ONDISK) {
868 if (req->r_safe_callback)
869 req->r_safe_callback(req, msg);
870 complete_all(&req->r_safe_completion); /* fsync waiter */
871 }
872
873done:
874 ceph_osdc_put_request(req);
875 return;
876
877bad:
878 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
879 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
880 (int)sizeof(*rhead));
881 ceph_msg_dump(msg);
882}
883
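handle_reply() above finishes a request in one of two ways: through the asynchronous r_callback if the submitter installed one, otherwise by complete_all() waking a synchronous waiter. A hedged sketch of the asynchronous side (the handler signature follows the r_callback invocation above; the handler itself is illustrative):

	static void my_reply_cb(struct ceph_osd_request *req,
				struct ceph_msg *msg)
	{
		/* runs from the messenger dispatch context */
		dout("tid %llu done, result %d\n", req->r_tid, req->r_result);
	}

	/* installed by the submitter before ceph_osdc_start_request() */
	req->r_callback = my_reply_cb;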
884
885static int __kick_requests(struct ceph_osd_client *osdc,
886 struct ceph_osd *kickosd)
887{
888 struct ceph_osd_request *req;
889 struct rb_node *p, *n;
890 int needmap = 0;
891 int err;
892
893 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
894 if (kickosd) {
895 err = __reset_osd(osdc, kickosd);
896 if (err == -EAGAIN)
897 return 1;
898 } else {
899 for (p = rb_first(&osdc->osds); p; p = n) {
900 struct ceph_osd *osd =
901 rb_entry(p, struct ceph_osd, o_node);
902
903 n = rb_next(p);
904 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
905 memcmp(&osd->o_con.peer_addr,
906 ceph_osd_addr(osdc->osdmap,
907 osd->o_osd),
908 sizeof(struct ceph_entity_addr)) != 0)
909 __reset_osd(osdc, osd);
910 }
911 }
912
913 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
914 req = rb_entry(p, struct ceph_osd_request, r_node);
915
916 if (req->r_resend) {
917 dout(" r_resend set on tid %llu\n", req->r_tid);
918 __cancel_request(req);
919 goto kick;
920 }
921 if (req->r_osd && kickosd == req->r_osd) {
922 __cancel_request(req);
923 goto kick;
924 }
925
926 err = __map_osds(osdc, req);
927 if (err == 0)
928 continue; /* no change */
929 if (err < 0) {
930 /*
931 * FIXME: really, we should set the request
932 * error and fail if this isn't a 'nofail'
933 * request, but that's a fair bit more
934 * complicated to do. So retry!
935 */
936 dout(" setting r_resend on %llu\n", req->r_tid);
937 req->r_resend = true;
938 continue;
939 }
940 if (req->r_osd == NULL) {
941 dout("tid %llu maps to no valid osd\n", req->r_tid);
942 needmap++; /* request a newer map */
943 continue;
944 }
945
946kick:
947 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
948 req->r_osd ? req->r_osd->o_osd : -1);
949 req->r_flags |= CEPH_OSD_FLAG_RETRY;
950 err = __send_request(osdc, req);
951 if (err) {
952 dout(" setting r_resend on %llu\n", req->r_tid);
953 req->r_resend = true;
954 }
955 }
956
957 return needmap;
958}
959
960/*
961 * Resubmit osd requests whose osd or osd address has changed. Request
962 * a new osd map if osds are down, or we are otherwise unable to determine
963 * how to direct a request.
964 *
965 * Close connections to down osds.
966 *
967 * If @kickosd is specified, resubmit requests for that specific osd.
968 *
969 * Caller should hold map_sem for read and request_mutex.
970 */
971static void kick_requests(struct ceph_osd_client *osdc,
972 struct ceph_osd *kickosd)
973{
974 int needmap;
975
976 mutex_lock(&osdc->request_mutex);
977 needmap = __kick_requests(osdc, kickosd);
978 mutex_unlock(&osdc->request_mutex);
979
980 if (needmap) {
981 dout("%d requests for down osds, need new map\n", needmap);
982 ceph_monc_request_next_osdmap(&osdc->client->monc);
983 }
984}
985
986/*
987 * Process updated osd map.
988 *
989 * The message contains any number of incremental and full maps, normally
990 * indicating some sort of topology change in the cluster. Kick requests
991 * off to different OSDs as needed.
992 */
993void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
994{
995 void *p, *end, *next;
996 u32 nr_maps, maplen;
997 u32 epoch;
998 struct ceph_osdmap *newmap = NULL, *oldmap;
999 int err;
1000 struct ceph_fsid fsid;
1001
1002 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
1003 p = msg->front.iov_base;
1004 end = p + msg->front.iov_len;
1005
1006 /* verify fsid */
1007 ceph_decode_need(&p, end, sizeof(fsid), bad);
1008 ceph_decode_copy(&p, &fsid, sizeof(fsid));
1009 if (ceph_check_fsid(osdc->client, &fsid) < 0)
1010 return;
1011
1012 down_write(&osdc->map_sem);
1013
1014 /* incremental maps */
1015 ceph_decode_32_safe(&p, end, nr_maps, bad);
1016 dout(" %d inc maps\n", nr_maps);
1017 while (nr_maps > 0) {
1018 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1019 epoch = ceph_decode_32(&p);
1020 maplen = ceph_decode_32(&p);
1021 ceph_decode_need(&p, end, maplen, bad);
1022 next = p + maplen;
1023 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1024 dout("applying incremental map %u len %d\n",
1025 epoch, maplen);
1026 newmap = osdmap_apply_incremental(&p, next,
1027 osdc->osdmap,
1028 osdc->client->msgr);
1029 if (IS_ERR(newmap)) {
1030 err = PTR_ERR(newmap);
1031 goto bad;
1032 }
1033 BUG_ON(!newmap);
1034 if (newmap != osdc->osdmap) {
1035 ceph_osdmap_destroy(osdc->osdmap);
1036 osdc->osdmap = newmap;
1037 }
1038 } else {
1039 dout("ignoring incremental map %u len %d\n",
1040 epoch, maplen);
1041 }
1042 p = next;
1043 nr_maps--;
1044 }
1045 if (newmap)
1046 goto done;
1047
1048 /* full maps */
1049 ceph_decode_32_safe(&p, end, nr_maps, bad);
1050 dout(" %d full maps\n", nr_maps);
1051 while (nr_maps) {
1052 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1053 epoch = ceph_decode_32(&p);
1054 maplen = ceph_decode_32(&p);
1055 ceph_decode_need(&p, end, maplen, bad);
1056 if (nr_maps > 1) {
1057 dout("skipping non-latest full map %u len %d\n",
1058 epoch, maplen);
1059 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1060 dout("skipping full map %u len %d, "
1061 "older than our %u\n", epoch, maplen,
1062 osdc->osdmap->epoch);
1063 } else {
1064 dout("taking full map %u len %d\n", epoch, maplen);
1065 newmap = osdmap_decode(&p, p+maplen);
1066 if (IS_ERR(newmap)) {
1067 err = PTR_ERR(newmap);
1068 goto bad;
1069 }
1070 BUG_ON(!newmap);
1071 oldmap = osdc->osdmap;
1072 osdc->osdmap = newmap;
1073 if (oldmap)
1074 ceph_osdmap_destroy(oldmap);
1075 }
1076 p += maplen;
1077 nr_maps--;
1078 }
1079
1080done:
1081 downgrade_write(&osdc->map_sem);
1082 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1083 if (newmap)
1084 kick_requests(osdc, NULL);
1085 up_read(&osdc->map_sem);
1086 wake_up_all(&osdc->client->auth_wq);
1087 return;
1088
1089bad:
1090 pr_err("osdc handle_map corrupt msg\n");
1091 ceph_msg_dump(msg);
1092 up_write(&osdc->map_sem);
1093 return;
1094}
1095
1096/*
1097 * Register request, send initial attempt.
1098 */
1099int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1100 struct ceph_osd_request *req,
1101 bool nofail)
1102{
1103 int rc = 0;
1104
1105 req->r_request->pages = req->r_pages;
1106 req->r_request->nr_pages = req->r_num_pages;
1107
1108 register_request(osdc, req);
1109
1110 down_read(&osdc->map_sem);
1111 mutex_lock(&osdc->request_mutex);
1112 /*
1113 * a racing kick_requests() may have sent the message for us
1114 * while we dropped request_mutex above, so only send now if
1115 * the request hasn't been touched yet.
1116 */
1117 if (req->r_sent == 0) {
1118 rc = __send_request(osdc, req);
1119 if (rc) {
1120 if (nofail) {
1121 dout("osdc_start_request failed send, "
1122 " marking %lld\n", req->r_tid);
1123 req->r_resend = true;
1124 rc = 0;
1125 } else {
1126 __unregister_request(osdc, req);
1127 }
1128 }
1129 }
1130 mutex_unlock(&osdc->request_mutex);
1131 up_read(&osdc->map_sem);
1132 return rc;
1133}
1134
1135/*
1136 * wait for a request to complete
1137 */
1138int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1139 struct ceph_osd_request *req)
1140{
1141 int rc;
1142
1143 rc = wait_for_completion_interruptible(&req->r_completion);
1144 if (rc < 0) {
1145 mutex_lock(&osdc->request_mutex);
1146 __cancel_request(req);
1147 __unregister_request(osdc, req);
1148 mutex_unlock(&osdc->request_mutex);
1149 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1150 return rc;
1151 }
1152
1153 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1154 return req->r_result;
1155}
1156
1157/*
1158 * sync - wait for all in-flight requests to flush. avoid starvation.
1159 */
1160void ceph_osdc_sync(struct ceph_osd_client *osdc)
1161{
1162 struct ceph_osd_request *req;
1163 u64 last_tid, next_tid = 0;
1164
1165 mutex_lock(&osdc->request_mutex);
1166 last_tid = osdc->last_tid;
1167 while (1) {
1168 req = __lookup_request_ge(osdc, next_tid);
1169 if (!req)
1170 break;
1171 if (req->r_tid > last_tid)
1172 break;
1173
1174 next_tid = req->r_tid + 1;
1175 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1176 continue;
1177
1178 ceph_osdc_get_request(req);
1179 mutex_unlock(&osdc->request_mutex);
1180 dout("sync waiting on tid %llu (last is %llu)\n",
1181 req->r_tid, last_tid);
1182 wait_for_completion(&req->r_safe_completion);
1183 mutex_lock(&osdc->request_mutex);
1184 ceph_osdc_put_request(req);
1185 }
1186 mutex_unlock(&osdc->request_mutex);
1187 dout("sync done (thru tid %llu)\n", last_tid);
1188}
1189
1190/*
1191 * init, shutdown
1192 */
1193int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1194{
1195 int err;
1196
1197 dout("init\n");
1198 osdc->client = client;
1199 osdc->osdmap = NULL;
1200 init_rwsem(&osdc->map_sem);
1201 init_completion(&osdc->map_waiters);
1202 osdc->last_requested_map = 0;
1203 mutex_init(&osdc->request_mutex);
1204 osdc->last_tid = 0;
1205 osdc->osds = RB_ROOT;
1206 INIT_LIST_HEAD(&osdc->osd_lru);
1207 osdc->requests = RB_ROOT;
1208 INIT_LIST_HEAD(&osdc->req_lru);
1209 osdc->num_requests = 0;
1210 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1211 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1212
1213 schedule_delayed_work(&osdc->osds_timeout_work,
1214 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1215
1216 err = -ENOMEM;
1217 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1218 sizeof(struct ceph_osd_request));
1219 if (!osdc->req_mempool)
1220 goto out;
1221
1222 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1223 "osd_op");
1224 if (err < 0)
1225 goto out_mempool;
1226 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1227 OSD_OPREPLY_FRONT_LEN, 10, true,
1228 "osd_op_reply");
1229 if (err < 0)
1230 goto out_msgpool;
1231 return 0;
1232
1233out_msgpool:
1234 ceph_msgpool_destroy(&osdc->msgpool_op);
1235out_mempool:
1236 mempool_destroy(osdc->req_mempool);
1237out:
1238 return err;
1239}
1240
1241void ceph_osdc_stop(struct ceph_osd_client *osdc)
1242{
1243 cancel_delayed_work_sync(&osdc->timeout_work);
1244 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1245 if (osdc->osdmap) {
1246 ceph_osdmap_destroy(osdc->osdmap);
1247 osdc->osdmap = NULL;
1248 }
1249 remove_old_osds(osdc, 1);
1250 mempool_destroy(osdc->req_mempool);
1251 ceph_msgpool_destroy(&osdc->msgpool_op);
1252 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1253}
1254
1255/*
1256 * Read some contiguous pages. If we cross a stripe boundary, shorten
1257 * *plen. Return number of bytes read, or error.
1258 */
1259int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1260 struct ceph_vino vino, struct ceph_file_layout *layout,
1261 u64 off, u64 *plen,
1262 u32 truncate_seq, u64 truncate_size,
1263 struct page **pages, int num_pages)
1264{
1265 struct ceph_osd_request *req;
1266 int rc = 0;
1267
1268 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1269 vino.snap, off, *plen);
1270 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1271 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1272 NULL, 0, truncate_seq, truncate_size, NULL,
1273 false, 1);
1274 if (!req)
1275 return -ENOMEM;
1276
1277 /* it may be a short read due to an object boundary */
1278 req->r_pages = pages;
1279
1280 dout("readpages final extent is %llu~%llu (%d pages)\n",
1281 off, *plen, req->r_num_pages);
1282
1283 rc = ceph_osdc_start_request(osdc, req, false);
1284 if (!rc)
1285 rc = ceph_osdc_wait_request(osdc, req);
1286
1287 ceph_osdc_put_request(req);
1288 dout("readpages result %d\n", rc);
1289 return rc;
1290}
1291
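A hedged sketch of driving the synchronous read helper above; vino, layout, off, the truncate pair, and the page vector are assumed to be prepared by the caller, as the fs/ceph read paths do:

	u64 len = (u64)num_pages << PAGE_SHIFT;
	int got;

	/* len may be shortened in place at an object boundary */
	got = ceph_osdc_readpages(osdc, vino, layout, off, &len,
				  truncate_seq, truncate_size,
				  pages, num_pages);
	if (got < 0)
		return got;	/* error */
	/* got = bytes read; may be less than originally asked for */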
1292/*
1293 * do a synchronous write on N pages
1294 */
1295int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1296 struct ceph_file_layout *layout,
1297 struct ceph_snap_context *snapc,
1298 u64 off, u64 len,
1299 u32 truncate_seq, u64 truncate_size,
1300 struct timespec *mtime,
1301 struct page **pages, int num_pages,
1302 int flags, int do_sync, bool nofail)
1303{
1304 struct ceph_osd_request *req;
1305 int rc = 0;
1306
1307 BUG_ON(vino.snap != CEPH_NOSNAP);
1308 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1309 CEPH_OSD_OP_WRITE,
1310 flags | CEPH_OSD_FLAG_ONDISK |
1311 CEPH_OSD_FLAG_WRITE,
1312 snapc, do_sync,
1313 truncate_seq, truncate_size, mtime,
1314 nofail, 1);
1315 if (!req)
1316 return -ENOMEM;
1317
1318 /* it may be a short write due to an object boundary */
1319 req->r_pages = pages;
1320 dout("writepages %llu~%llu (%d pages)\n", off, len,
1321 req->r_num_pages);
1322
1323 rc = ceph_osdc_start_request(osdc, req, nofail);
1324 if (!rc)
1325 rc = ceph_osdc_wait_request(osdc, req);
1326
1327 ceph_osdc_put_request(req);
1328 if (rc == 0)
1329 rc = len;
1330 dout("writepages result %d\n", rc);
1331 return rc;
1332}
1333
1334/*
1335 * handle incoming message
1336 */
1337static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1338{
1339 struct ceph_osd *osd = con->private;
1340 struct ceph_osd_client *osdc;
1341 int type = le16_to_cpu(msg->hdr.type);
1342
1343 if (!osd)
1344 goto out;
1345 osdc = osd->o_osdc;
1346
1347 switch (type) {
1348 case CEPH_MSG_OSD_MAP:
1349 ceph_osdc_handle_map(osdc, msg);
1350 break;
1351 case CEPH_MSG_OSD_OPREPLY:
1352 handle_reply(osdc, msg, con);
1353 break;
1354
1355 default:
1356 pr_err("received unknown message type %d %s\n", type,
1357 ceph_msg_type_name(type));
1358 }
1359out:
1360 ceph_msg_put(msg);
1361}
1362
1363/*
1364 * lookup and return message for incoming reply. set up reply message
1365 * pages.
1366 */
1367static struct ceph_msg *get_reply(struct ceph_connection *con,
1368 struct ceph_msg_header *hdr,
1369 int *skip)
1370{
1371 struct ceph_osd *osd = con->private;
1372 struct ceph_osd_client *osdc = osd->o_osdc;
1373 struct ceph_msg *m;
1374 struct ceph_osd_request *req;
1375 int front = le32_to_cpu(hdr->front_len);
1376 int data_len = le32_to_cpu(hdr->data_len);
1377 u64 tid;
1378
1379 tid = le64_to_cpu(hdr->tid);
1380 mutex_lock(&osdc->request_mutex);
1381 req = __lookup_request(osdc, tid);
1382 if (!req) {
1383 *skip = 1;
1384 m = NULL;
1385 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1386 osd->o_osd);
1387 goto out;
1388 }
1389
1390 if (req->r_con_filling_msg) {
1391 dout("get_reply revoking msg %p from old con %p\n",
1392 req->r_reply, req->r_con_filling_msg);
1393 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1394 ceph_con_put(req->r_con_filling_msg);
1395 req->r_con_filling_msg = NULL;
1396 }
1397
1398 if (front > req->r_reply->front.iov_len) {
1399 pr_warning("get_reply front %d > preallocated %d\n",
1400 front, (int)req->r_reply->front.iov_len);
1401 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1402 if (!m)
1403 goto out;
1404 ceph_msg_put(req->r_reply);
1405 req->r_reply = m;
1406 }
1407 m = ceph_msg_get(req->r_reply);
1408
1409 if (data_len > 0) {
1410 unsigned data_off = le16_to_cpu(hdr->data_off);
1411 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1412
1413 if (unlikely(req->r_num_pages < want)) {
1414 pr_warning("tid %lld reply %d > expected %d pages\n",
1415 tid, want, m->nr_pages);
1416 *skip = 1;
1417 ceph_msg_put(m);
1418 m = NULL;
1419 goto out;
1420 }
1421 m->pages = req->r_pages;
1422 m->nr_pages = req->r_num_pages;
1423 }
1424 *skip = 0;
1425 req->r_con_filling_msg = ceph_con_get(con);
1426 dout("get_reply tid %lld %p\n", tid, m);
1427
1428out:
1429 mutex_unlock(&osdc->request_mutex);
1430 return m;
1431
1432}
1433
1434static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1435 struct ceph_msg_header *hdr,
1436 int *skip)
1437{
1438 struct ceph_osd *osd = con->private;
1439 int type = le16_to_cpu(hdr->type);
1440 int front = le32_to_cpu(hdr->front_len);
1441
1442 switch (type) {
1443 case CEPH_MSG_OSD_MAP:
1444 return ceph_msg_new(type, front, GFP_NOFS);
1445 case CEPH_MSG_OSD_OPREPLY:
1446 return get_reply(con, hdr, skip);
1447 default:
1448 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1449 osd->o_osd);
1450 *skip = 1;
1451 return NULL;
1452 }
1453}
1454
1455/*
1456 * Wrappers to refcount containing ceph_osd struct
1457 */
1458static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1459{
1460 struct ceph_osd *osd = con->private;
1461 if (get_osd(osd))
1462 return con;
1463 return NULL;
1464}
1465
1466static void put_osd_con(struct ceph_connection *con)
1467{
1468 struct ceph_osd *osd = con->private;
1469 put_osd(osd);
1470}
1471
1472/*
1473 * authentication
1474 */
1475static int get_authorizer(struct ceph_connection *con,
1476 void **buf, int *len, int *proto,
1477 void **reply_buf, int *reply_len, int force_new)
1478{
1479 struct ceph_osd *o = con->private;
1480 struct ceph_osd_client *osdc = o->o_osdc;
1481 struct ceph_auth_client *ac = osdc->client->monc.auth;
1482 int ret = 0;
1483
1484 if (force_new && o->o_authorizer) {
1485 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1486 o->o_authorizer = NULL;
1487 }
1488 if (o->o_authorizer == NULL) {
1489 ret = ac->ops->create_authorizer(
1490 ac, CEPH_ENTITY_TYPE_OSD,
1491 &o->o_authorizer,
1492 &o->o_authorizer_buf,
1493 &o->o_authorizer_buf_len,
1494 &o->o_authorizer_reply_buf,
1495 &o->o_authorizer_reply_buf_len);
1496 if (ret)
1497 return ret;
1498 }
1499
1500 *proto = ac->protocol;
1501 *buf = o->o_authorizer_buf;
1502 *len = o->o_authorizer_buf_len;
1503 *reply_buf = o->o_authorizer_reply_buf;
1504 *reply_len = o->o_authorizer_reply_buf_len;
1505 return 0;
1506}
1507
1508
1509static int verify_authorizer_reply(struct ceph_connection *con, int len)
1510{
1511 struct ceph_osd *o = con->private;
1512 struct ceph_osd_client *osdc = o->o_osdc;
1513 struct ceph_auth_client *ac = osdc->client->monc.auth;
1514
1515 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1516}
1517
1518static int invalidate_authorizer(struct ceph_connection *con)
1519{
1520 struct ceph_osd *o = con->private;
1521 struct ceph_osd_client *osdc = o->o_osdc;
1522 struct ceph_auth_client *ac = osdc->client->monc.auth;
1523
1524 if (ac->ops->invalidate_authorizer)
1525 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1526
1527 return ceph_monc_validate_auth(&osdc->client->monc);
1528}
1529
1530static const struct ceph_connection_operations osd_con_ops = {
1531 .get = get_osd_con,
1532 .put = put_osd_con,
1533 .dispatch = dispatch,
1534 .get_authorizer = get_authorizer,
1535 .verify_authorizer_reply = verify_authorizer_reply,
1536 .invalidate_authorizer = invalidate_authorizer,
1537 .alloc_msg = alloc_msg,
1538 .fault = osd_reset,
1539};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
deleted file mode 100644
index ce776989ef6a..000000000000
--- a/fs/ceph/osd_client.h
+++ /dev/null
@@ -1,167 +0,0 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51 int r_pg_osds[CEPH_PG_MAX_SIZE];
52 int r_num_pg_osds;
53
54 struct ceph_connection *r_con_filling_msg;
55
56 struct ceph_msg *r_request, *r_reply;
57 int r_result;
58 int r_flags; /* any additional flags for the osd */
59 u32 r_sent; /* >0 if r_request is sending/sent */
60 int r_got_reply;
61
62 struct ceph_osd_client *r_osdc;
63 struct kref r_kref;
64 bool r_mempool;
65 struct completion r_completion, r_safe_completion;
66 ceph_osdc_callback_t r_callback, r_safe_callback;
67 struct ceph_eversion r_reassert_version;
68 struct list_head r_unsafe_item;
69
70 struct inode *r_inode; /* for use by callbacks */
71
72 char r_oid[40]; /* object name */
73 int r_oid_len;
74 unsigned long r_stamp; /* send OR check time */
75 bool r_resend; /* msg send failed, needs retry */
76
77 struct ceph_file_layout r_file_layout;
78 struct ceph_snap_context *r_snapc; /* snap context for writes */
79 unsigned r_num_pages; /* size of page array (follows) */
80 struct page **r_pages; /* pages for data payload */
81 int r_pages_from_pool;
82 int r_own_pages; /* if true, i own page list */
83};
84
85struct ceph_osd_client {
86 struct ceph_client *client;
87
88 struct ceph_osdmap *osdmap; /* current map */
89 struct rw_semaphore map_sem;
90 struct completion map_waiters;
91 u64 last_requested_map;
92
93 struct mutex request_mutex;
94 struct rb_root osds; /* osds */
95 struct list_head osd_lru; /* idle osds */
96 u64 timeout_tid; /* tid of timeout triggering rq */
97 u64 last_tid; /* tid of last request */
98 struct rb_root requests; /* pending requests */
99 struct list_head req_lru; /* pending requests lru */
100 int num_requests;
101 struct delayed_work timeout_work;
102 struct delayed_work osds_timeout_work;
103#ifdef CONFIG_DEBUG_FS
104 struct dentry *debugfs_file;
105#endif
106
107 mempool_t *req_mempool;
108
109 struct ceph_msgpool msgpool_op;
110 struct ceph_msgpool msgpool_op_reply;
111};
112
113extern int ceph_osdc_init(struct ceph_osd_client *osdc,
114 struct ceph_client *client);
115extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
116
117extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
118 struct ceph_msg *msg);
119extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
120 struct ceph_msg *msg);
121
122extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 offset, u64 *len, int op, int flags,
126 struct ceph_snap_context *snapc,
127 int do_sync, u32 truncate_seq,
128 u64 truncate_size,
129 struct timespec *mtime,
130 bool use_mempool, int num_reply);
131
132static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
133{
134 kref_get(&req->r_kref);
135}
136extern void ceph_osdc_release_request(struct kref *kref);
137static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
138{
139 kref_put(&req->r_kref, ceph_osdc_release_request);
140}
141
142extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
143 struct ceph_osd_request *req,
144 bool nofail);
145extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
146 struct ceph_osd_request *req);
147extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
148
149extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
150 struct ceph_vino vino,
151 struct ceph_file_layout *layout,
152 u64 off, u64 *plen,
153 u32 truncate_seq, u64 truncate_size,
154 struct page **pages, int nr_pages);
155
156extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
157 struct ceph_vino vino,
158 struct ceph_file_layout *layout,
159 struct ceph_snap_context *sc,
160 u64 off, u64 len,
161 u32 truncate_seq, u64 truncate_size,
162 struct timespec *mtime,
163 struct page **pages, int nr_pages,
164 int flags, int do_sync, bool nofail);
165
166#endif
167
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
deleted file mode 100644
index e31f118f1392..000000000000
--- a/fs/ceph/osdmap.c
+++ /dev/null
@@ -1,1110 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
26 if (state & CEPH_OSD_UP) {
27 size_t l = strlen(str);
28 snprintf(str + l, len - l, "%sup", flag ? ", " : "");
29 flag = 1;
30 }
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
37
38/* maps */
39
40static int calc_bits_of(unsigned t)
41{
42 int b = 0;
43 while (t) {
44 t = t >> 1;
45 b++;
46 }
47 return b;
48}
49
50/*
51 * the foo_mask is the smallest value 2^n-1 that is >= foo.
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
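A quick userspace rendering of the mask rule above (standalone and illustrative, not part of this file): for pg_num = 12, calc_bits_of(11) is 4, so the mask comes out to (1 << 4) - 1 = 15, the smallest 2^n - 1 covering bins 0..11.

#include <stdio.h>

static int calc_bits_of(unsigned t)
{
	int b = 0;

	while (t) {
		t >>= 1;
		b++;
	}
	return b;
}

int main(void)
{
	unsigned pg_num = 12;
	unsigned mask = (1 << calc_bits_of(pg_num - 1)) - 1;

	printf("pg_num=%u -> mask=%u\n", pg_num, mask);	/* prints 15 */
	return 0;
}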
63
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
316
317/*
318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
319 * to a set of osds)
320 */
321static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
322{
323 u64 a = *(u64 *)&l;
324 u64 b = *(u64 *)&r;
325
326 if (a < b)
327 return -1;
328 if (a > b)
329 return 1;
330 return 0;
331}
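The cast above treats the whole placement group id as a single u64; that is only valid because struct ceph_pg (see rados.h later in this diff) packs into exactly 8 bytes. A standalone userspace check with a mirror of that layout (the mirror struct itself is illustrative):

#include <stdint.h>
#include <stdio.h>

struct pg_mirror {
	uint16_t preferred;	/* preferred primary osd */
	uint16_t ps;		/* placement seed */
	uint32_t pool;		/* object pool */
} __attribute__ ((packed));

int main(void)
{
	/* 2 + 2 + 4 bytes, no padding, so it fits one u64 */
	printf("sizeof = %zu\n", sizeof(struct pg_mirror));	/* 8 */
	return 0;
}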
332
333static int __insert_pg_mapping(struct ceph_pg_mapping *new,
334 struct rb_root *root)
335{
336 struct rb_node **p = &root->rb_node;
337 struct rb_node *parent = NULL;
338 struct ceph_pg_mapping *pg = NULL;
339 int c;
340
341 while (*p) {
342 parent = *p;
343 pg = rb_entry(parent, struct ceph_pg_mapping, node);
344 c = pgid_cmp(new->pgid, pg->pgid);
345 if (c < 0)
346 p = &(*p)->rb_left;
347 else if (c > 0)
348 p = &(*p)->rb_right;
349 else
350 return -EEXIST;
351 }
352
353 rb_link_node(&new->node, parent, p);
354 rb_insert_color(&new->node, root);
355 return 0;
356}
357
358static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
359 struct ceph_pg pgid)
360{
361 struct rb_node *n = root->rb_node;
362 struct ceph_pg_mapping *pg;
363 int c;
364
365 while (n) {
366 pg = rb_entry(n, struct ceph_pg_mapping, node);
367 c = pgid_cmp(pgid, pg->pgid);
368 if (c < 0)
369 n = n->rb_left;
370 else if (c > 0)
371 n = n->rb_right;
372 else
373 return pg;
374 }
375 return NULL;
376}
377
378/*
379 * rbtree of pg pool info
380 */
381static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
382{
383 struct rb_node **p = &root->rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_pg_pool_info *pi = NULL;
386
387 while (*p) {
388 parent = *p;
389 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
390 if (new->id < pi->id)
391 p = &(*p)->rb_left;
392 else if (new->id > pi->id)
393 p = &(*p)->rb_right;
394 else
395 return -EEXIST;
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, root);
400 return 0;
401}
402
403static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
404{
405 struct ceph_pg_pool_info *pi;
406 struct rb_node *n = root->rb_node;
407
408 while (n) {
409 pi = rb_entry(n, struct ceph_pg_pool_info, node);
410 if (id < pi->id)
411 n = n->rb_left;
412 else if (id > pi->id)
413 n = n->rb_right;
414 else
415 return pi;
416 }
417 return NULL;
418}
419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
427static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
428{
429 unsigned n, m;
430
431 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
432 calc_pg_masks(pi);
433
434 /* num_snaps * snap_info_t */
435 n = le32_to_cpu(pi->v.num_snaps);
436 while (n--) {
437 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
438 sizeof(struct ceph_timespec), bad);
439 *p += sizeof(u64) + /* key */
440 1 + sizeof(u64) + /* u8, snapid */
441 sizeof(struct ceph_timespec);
442 m = ceph_decode_32(p); /* snap name */
443 *p += m;
444 }
445
446 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
447 return 0;
448
449bad:
450 return -EINVAL;
451}
452
453static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
454{
455 struct ceph_pg_pool_info *pi;
456 u32 num, len, pool;
457
458 ceph_decode_32_safe(p, end, num, bad);
459 dout(" %d pool names\n", num);
460 while (num--) {
461 ceph_decode_32_safe(p, end, pool, bad);
462 ceph_decode_32_safe(p, end, len, bad);
463 dout(" pool %d len %d\n", pool, len);
464 pi = __lookup_pg_pool(&map->pg_pools, pool);
465 if (pi) {
466 kfree(pi->name);
467 pi->name = kmalloc(len + 1, GFP_NOFS);
468 if (pi->name) {
469 memcpy(pi->name, *p, len);
470 pi->name[len] = '\0';
471 dout(" name is %s\n", pi->name);
472 }
473 }
474 *p += len;
475 }
476 return 0;
477
478bad:
479 return -EINVAL;
480}
481
482/*
483 * osd map
484 */
485void ceph_osdmap_destroy(struct ceph_osdmap *map)
486{
487 dout("osdmap_destroy %p\n", map);
488 if (map->crush)
489 crush_destroy(map->crush);
490 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
491 struct ceph_pg_mapping *pg =
492 rb_entry(rb_first(&map->pg_temp),
493 struct ceph_pg_mapping, node);
494 rb_erase(&pg->node, &map->pg_temp);
495 kfree(pg);
496 }
497 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
498 struct ceph_pg_pool_info *pi =
499 rb_entry(rb_first(&map->pg_pools),
500 struct ceph_pg_pool_info, node);
501 __remove_pg_pool(&map->pg_pools, pi);
502 }
503 kfree(map->osd_state);
504 kfree(map->osd_weight);
505 kfree(map->osd_addr);
506 kfree(map);
507}
508
509/*
510 * adjust max osd value. reallocate arrays.
511 */
512static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
513{
514 u8 *state;
515 struct ceph_entity_addr *addr;
516 u32 *weight;
517
518 state = kcalloc(max, sizeof(*state), GFP_NOFS);
519 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
520 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
521 if (state == NULL || addr == NULL || weight == NULL) {
522 kfree(state);
523 kfree(addr);
524 kfree(weight);
525 return -ENOMEM;
526 }
527
528 /* copy old? */
529 if (map->osd_state) {
530 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
531 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
532 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
533 kfree(map->osd_state);
534 kfree(map->osd_addr);
535 kfree(map->osd_weight);
536 }
537
538 map->osd_state = state;
539 map->osd_weight = weight;
540 map->osd_addr = addr;
541 map->max_osd = max;
542 return 0;
543}
544
545/*
546 * decode a full map.
547 */
548struct ceph_osdmap *osdmap_decode(void **p, void *end)
549{
550 struct ceph_osdmap *map;
551 u16 version;
552 u32 len, max, i;
553 u8 ev;
554 int err = -EINVAL;
555 void *start = *p;
556 struct ceph_pg_pool_info *pi;
557
558 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
559
560 map = kzalloc(sizeof(*map), GFP_NOFS);
561 if (map == NULL)
562 return ERR_PTR(-ENOMEM);
563 map->pg_temp = RB_ROOT;
564
565 ceph_decode_16_safe(p, end, version, bad);
566 if (version > CEPH_OSDMAP_VERSION) {
567 pr_warning("got unknown v %d > %d of osdmap\n", version,
568 CEPH_OSDMAP_VERSION);
569 goto bad;
570 }
571
572 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
573 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
574 map->epoch = ceph_decode_32(p);
575 ceph_decode_copy(p, &map->created, sizeof(map->created));
576 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
577
578 ceph_decode_32_safe(p, end, max, bad);
579 while (max--) {
580 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
581 pi = kzalloc(sizeof(*pi), GFP_NOFS);
582 if (!pi)
583 goto bad;
584 pi->id = ceph_decode_32(p);
585 ev = ceph_decode_8(p); /* encoding version */
586 if (ev > CEPH_PG_POOL_VERSION) {
587 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
588 ev, CEPH_PG_POOL_VERSION);
589 kfree(pi);
590 goto bad;
591 }
592 err = __decode_pool(p, end, pi);
593 if (err < 0)
594 goto bad;
595 __insert_pg_pool(&map->pg_pools, pi);
596 }
597
598 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
599 goto bad;
600
601 ceph_decode_32_safe(p, end, map->pool_max, bad);
602
603 ceph_decode_32_safe(p, end, map->flags, bad);
604
605 max = ceph_decode_32(p);
606
607 /* (re)alloc osd arrays */
608 err = osdmap_set_max_osd(map, max);
609 if (err < 0)
610 goto bad;
611 dout("osdmap_decode max_osd = %d\n", map->max_osd);
612
613 /* osds */
614 err = -EINVAL;
615 ceph_decode_need(p, end, 3*sizeof(u32) +
616 map->max_osd*(1 + sizeof(*map->osd_weight) +
617 sizeof(*map->osd_addr)), bad);
618 *p += 4; /* skip length field (should match max) */
619 ceph_decode_copy(p, map->osd_state, map->max_osd);
620
621 *p += 4; /* skip length field (should match max) */
622 for (i = 0; i < map->max_osd; i++)
623 map->osd_weight[i] = ceph_decode_32(p);
624
625 *p += 4; /* skip length field (should match max) */
626 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
627 for (i = 0; i < map->max_osd; i++)
628 ceph_decode_addr(&map->osd_addr[i]);
629
630 /* pg_temp */
631 ceph_decode_32_safe(p, end, len, bad);
632 for (i = 0; i < len; i++) {
633 int n, j;
634 struct ceph_pg pgid;
635 struct ceph_pg_mapping *pg;
636
637 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
638 ceph_decode_copy(p, &pgid, sizeof(pgid));
639 n = ceph_decode_32(p);
640 ceph_decode_need(p, end, n * sizeof(u32), bad);
641 err = -ENOMEM;
642 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
643 if (!pg)
644 goto bad;
645 pg->pgid = pgid;
646 pg->len = n;
647 for (j = 0; j < n; j++)
648 pg->osds[j] = ceph_decode_32(p);
649
650 err = __insert_pg_mapping(pg, &map->pg_temp);
651 if (err)
652 goto bad;
653 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, n);
654 }
655
656 /* crush */
657 ceph_decode_32_safe(p, end, len, bad);
658 dout("osdmap_decode crush len %d from off 0x%x\n", len,
659 (int)(*p - start));
660 ceph_decode_need(p, end, len, bad);
661 map->crush = crush_decode(*p, end);
662 *p += len;
663 if (IS_ERR(map->crush)) {
664 err = PTR_ERR(map->crush);
665 map->crush = NULL;
666 goto bad;
667 }
668
669 /* ignore the rest of the map */
670 *p = end;
671
672 dout("osdmap_decode done %p %p\n", *p, end);
673 return map;
674
675bad:
676 dout("osdmap_decode fail\n");
677 ceph_osdmap_destroy(map);
678 return ERR_PTR(err);
679}
680
681/*
682 * decode and apply an incremental map update.
683 */
684struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
685 struct ceph_osdmap *map,
686 struct ceph_messenger *msgr)
687{
688 struct crush_map *newcrush = NULL;
689 struct ceph_fsid fsid;
690 u32 epoch = 0;
691 struct ceph_timespec modified;
692 u32 len, pool;
693 __s32 new_pool_max, new_flags, max;
694 void *start = *p;
695 int err = -EINVAL;
696 u16 version;
697 struct rb_node *rbp;
698
699 ceph_decode_16_safe(p, end, version, bad);
700 if (version > CEPH_OSDMAP_INC_VERSION) {
701 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
702 CEPH_OSDMAP_INC_VERSION);
703 goto bad;
704 }
705
706 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
707 bad);
708 ceph_decode_copy(p, &fsid, sizeof(fsid));
709 epoch = ceph_decode_32(p);
710 BUG_ON(epoch != map->epoch+1);
711 ceph_decode_copy(p, &modified, sizeof(modified));
712 new_pool_max = ceph_decode_32(p);
713 new_flags = ceph_decode_32(p);
714
715 /* full map? */
716 ceph_decode_32_safe(p, end, len, bad);
717 if (len > 0) {
718 dout("apply_incremental full map len %d, %p to %p\n",
719 len, *p, end);
720 return osdmap_decode(p, min(*p+len, end));
721 }
722
723 /* new crush? */
724 ceph_decode_32_safe(p, end, len, bad);
725 if (len > 0) {
726 dout("apply_incremental new crush map len %d, %p to %p\n",
727 len, *p, end);
728 newcrush = crush_decode(*p, min(*p+len, end));
729 if (IS_ERR(newcrush))
730 return ERR_CAST(newcrush);
731 *p += len;
732 }
733
734 /* new flags? */
735 if (new_flags >= 0)
736 map->flags = new_flags;
737 if (new_pool_max >= 0)
738 map->pool_max = new_pool_max;
739
740 ceph_decode_need(p, end, 5*sizeof(u32), bad);
741
742 /* new max? */
743 max = ceph_decode_32(p);
744 if (max >= 0) {
745 err = osdmap_set_max_osd(map, max);
746 if (err < 0)
747 goto bad;
748 }
749
750 map->epoch++;
751 map->modified = modified;
752 if (newcrush) {
753 if (map->crush)
754 crush_destroy(map->crush);
755 map->crush = newcrush;
756 newcrush = NULL;
757 }
758
759 /* new_pool */
760 ceph_decode_32_safe(p, end, len, bad);
761 while (len--) {
762 __u8 ev;
763 struct ceph_pg_pool_info *pi;
764
765 ceph_decode_32_safe(p, end, pool, bad);
766 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
767 ev = ceph_decode_8(p); /* encoding version */
768 if (ev > CEPH_PG_POOL_VERSION) {
769 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
770 ev, CEPH_PG_POOL_VERSION);
771 goto bad;
772 }
773 pi = __lookup_pg_pool(&map->pg_pools, pool);
774 if (!pi) {
775 pi = kzalloc(sizeof(*pi), GFP_NOFS);
776 if (!pi) {
777 err = -ENOMEM;
778 goto bad;
779 }
780 pi->id = pool;
781 __insert_pg_pool(&map->pg_pools, pi);
782 }
783 err = __decode_pool(p, end, pi);
784 if (err < 0)
785 goto bad;
786 }
787 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
788 goto bad;
789
790 /* old_pool */
791 ceph_decode_32_safe(p, end, len, bad);
792 while (len--) {
793 struct ceph_pg_pool_info *pi;
794
795 ceph_decode_32_safe(p, end, pool, bad);
796 pi = __lookup_pg_pool(&map->pg_pools, pool);
797 if (pi)
798 __remove_pg_pool(&map->pg_pools, pi);
799 }
800
801 /* new_up */
802 err = -EINVAL;
803 ceph_decode_32_safe(p, end, len, bad);
804 while (len--) {
805 u32 osd;
806 struct ceph_entity_addr addr;
807 ceph_decode_32_safe(p, end, osd, bad);
808 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
809 ceph_decode_addr(&addr);
810 pr_info("osd%d up\n", osd);
811 BUG_ON(osd >= map->max_osd);
812 map->osd_state[osd] |= CEPH_OSD_UP;
813 map->osd_addr[osd] = addr;
814 }
815
816 /* new_down */
817 ceph_decode_32_safe(p, end, len, bad);
818 while (len--) {
819 u32 osd;
820 ceph_decode_32_safe(p, end, osd, bad);
821 (*p)++; /* clean flag */
822 pr_info("osd%d down\n", osd);
823 if (osd < map->max_osd)
824 map->osd_state[osd] &= ~CEPH_OSD_UP;
825 }
826
827 /* new_weight */
828 ceph_decode_32_safe(p, end, len, bad);
829 while (len--) {
830 u32 osd, off;
831 ceph_decode_need(p, end, sizeof(u32)*2, bad);
832 osd = ceph_decode_32(p);
833 off = ceph_decode_32(p);
834 pr_info("osd%d weight 0x%x %s\n", osd, off,
835 off == CEPH_OSD_IN ? "(in)" :
836 (off == CEPH_OSD_OUT ? "(out)" : ""));
837 if (osd < map->max_osd)
838 map->osd_weight[osd] = off;
839 }
840
841 /* new_pg_temp */
842 rbp = rb_first(&map->pg_temp);
843 ceph_decode_32_safe(p, end, len, bad);
844 while (len--) {
845 struct ceph_pg_mapping *pg;
846 int j;
847 struct ceph_pg pgid;
848 u32 pglen;
849 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
850 ceph_decode_copy(p, &pgid, sizeof(pgid));
851 pglen = ceph_decode_32(p);
852
853 /* remove any? */
854 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
855 node)->pgid, pgid) <= 0) {
856 struct ceph_pg_mapping *cur =
857 rb_entry(rbp, struct ceph_pg_mapping, node);
858
859 rbp = rb_next(rbp);
860 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
861 rb_erase(&cur->node, &map->pg_temp);
862 kfree(cur);
863 }
864
865 if (pglen) {
866 /* insert */
867 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
868 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
869 if (!pg) {
870 err = -ENOMEM;
871 goto bad;
872 }
873 pg->pgid = pgid;
874 pg->len = pglen;
875 for (j = 0; j < pglen; j++)
876 pg->osds[j] = ceph_decode_32(p);
877 err = __insert_pg_mapping(pg, &map->pg_temp);
878 if (err) {
879 kfree(pg);
880 goto bad;
881 }
882 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
883 pglen);
884 }
885 }
886 while (rbp) {
887 struct ceph_pg_mapping *cur =
888 rb_entry(rbp, struct ceph_pg_mapping, node);
889
890 rbp = rb_next(rbp);
891 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
892 rb_erase(&cur->node, &map->pg_temp);
893 kfree(cur);
894 }
895
896 /* ignore the rest */
897 *p = end;
898 return map;
899
900bad:
901 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
902 epoch, (int)(*p - start), *p, start, end);
903 print_hex_dump(KERN_DEBUG, "osdmap: ",
904 DUMP_PREFIX_OFFSET, 16, 1,
905 start, end - start, true);
906 if (newcrush)
907 crush_destroy(newcrush);
908 return ERR_PTR(err);
909}
910
911
912
913
914/*
915 * calculate file layout from given offset, length.
916 * fill in correct object number, logical length, and object extent
917 * offset, length.
918 *
919 * for now, we write only a single su, until we can
920 * pass a stride back to the caller.
921 */
922void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
923 u64 off, u64 *plen,
924 u64 *ono,
925 u64 *oxoff, u64 *oxlen)
926{
927 u32 osize = le32_to_cpu(layout->fl_object_size);
928 u32 su = le32_to_cpu(layout->fl_stripe_unit);
929 u32 sc = le32_to_cpu(layout->fl_stripe_count);
930 u32 bl, stripeno, stripepos, objsetno;
931 u32 su_per_object;
932 u64 t, su_offset;
933
934 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
935 osize, su);
936 su_per_object = osize / su;
937 dout("osize %u / su %u = su_per_object %u\n", osize, su,
938 su_per_object);
939
940 BUG_ON((su & ~PAGE_MASK) != 0);
941 /* bl = *off / su; */
942 t = off;
943 do_div(t, su);
944 bl = t;
945 dout("off %llu / su %u = bl %u\n", off, su, bl);
946
947 stripeno = bl / sc;
948 stripepos = bl % sc;
949 objsetno = stripeno / su_per_object;
950
951 *ono = objsetno * sc + stripepos;
952 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
953
954 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
955 t = off;
956 su_offset = do_div(t, su);
957 *oxoff = su_offset + (stripeno % su_per_object) * su;
958
959 /*
960 * Calculate the length of the extent being written to the selected
961 * object. This is the minimum of the full length requested (plen) or
962 * the remainder of the current stripe being written to.
963 */
964 *oxlen = min_t(u64, *plen, su - su_offset);
965 *plen = *oxlen;
966
967 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
968}
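A worked userspace example of the arithmetic above (all values illustrative): with stripe_unit = 1 MB, stripe_count = 3, and object_size = 4 MB, su_per_object is 4, and a write at file offset 9 MB lands in object 0 at offset 3 MB within it.

#include <stdio.h>

int main(void)
{
	unsigned long long off = 9ULL << 20;		/* 9 MB file offset */
	unsigned long long su = 1ULL << 20;		/* 1 MB stripe unit */
	unsigned sc = 3;				/* stripe count */
	unsigned su_per_object = (4ULL << 20) / su;	/* 4 MB objects -> 4 */
	unsigned bl = off / su;				/* block 9 */
	unsigned stripeno = bl / sc;			/* stripe 3 */
	unsigned stripepos = bl % sc;			/* position 0 */
	unsigned objsetno = stripeno / su_per_object;	/* object set 0 */
	unsigned long long ono = (unsigned long long)objsetno * sc + stripepos;
	unsigned long long oxoff = off % su + (stripeno % su_per_object) * su;

	printf("ono=%llu oxoff=%llu\n", ono, oxoff);	/* ono=0 oxoff=3145728 */
	return 0;
}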
969
970/*
971 * calculate an object layout (i.e. pgid) from an oid,
972 * file_layout, and osdmap
973 */
974int ceph_calc_object_layout(struct ceph_object_layout *ol,
975 const char *oid,
976 struct ceph_file_layout *fl,
977 struct ceph_osdmap *osdmap)
978{
979 unsigned num, num_mask;
980 struct ceph_pg pgid;
981 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
982 int poolid = le32_to_cpu(fl->fl_pg_pool);
983 struct ceph_pg_pool_info *pool;
984 unsigned ps;
985
986 BUG_ON(!osdmap);
987
988 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
989 if (!pool)
990 return -EIO;
991 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
992 if (preferred >= 0) {
993 ps += preferred;
994 num = le32_to_cpu(pool->v.lpg_num);
995 num_mask = pool->lpg_num_mask;
996 } else {
997 num = le32_to_cpu(pool->v.pg_num);
998 num_mask = pool->pg_num_mask;
999 }
1000
1001 pgid.ps = cpu_to_le16(ps);
1002 pgid.preferred = cpu_to_le16(preferred);
1003 pgid.pool = fl->fl_pg_pool;
1004 if (preferred >= 0)
1005 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
1006 (int)preferred);
1007 else
1008 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
1009
1010 ol->ol_pgid = pgid;
1011 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
1012 return 0;
1013}
1014
1015/*
1016 * Calculate raw osd vector for the given pgid. Return pointer to osd
1017 * array, or NULL on failure.
1018 */
1019static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1020 int *osds, int *num)
1021{
1022 struct ceph_pg_mapping *pg;
1023 struct ceph_pg_pool_info *pool;
1024 int ruleno;
1025 unsigned poolid, ps, pps;
1026 int preferred;
1027
1028 /* pg_temp? */
1029 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1030 if (pg) {
1031 *num = pg->len;
1032 return pg->osds;
1033 }
1034
1035 /* crush */
1036 poolid = le32_to_cpu(pgid.pool);
1037 ps = le16_to_cpu(pgid.ps);
1038 preferred = (s16)le16_to_cpu(pgid.preferred);
1039
1040 /* don't forcefeed bad device ids to crush */
1041 if (preferred >= osdmap->max_osd ||
1042 preferred >= osdmap->crush->max_devices)
1043 preferred = -1;
1044
1045 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1046 if (!pool)
1047 return NULL;
1048 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1049 pool->v.type, pool->v.size);
1050 if (ruleno < 0) {
1051 pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
1052 poolid, pool->v.crush_ruleset, pool->v.type,
1053 pool->v.size);
1054 return NULL;
1055 }
1056
1057 if (preferred >= 0)
1058 pps = ceph_stable_mod(ps,
1059 le32_to_cpu(pool->v.lpgp_num),
1060 pool->lpgp_num_mask);
1061 else
1062 pps = ceph_stable_mod(ps,
1063 le32_to_cpu(pool->v.pgp_num),
1064 pool->pgp_num_mask);
1065 pps += poolid;
1066 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1067 min_t(int, pool->v.size, *num),
1068 preferred, osdmap->osd_weight);
1069 return osds;
1070}
1071
1072/*
1073 * Return acting set for given pgid.
1074 */
1075int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1076 int *acting)
1077{
1078 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1079 int i, o, num = CEPH_PG_MAX_SIZE;
1080
1081 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1082 if (!osds)
1083 return -1;
1084
1085 /* primary is first up osd */
1086 o = 0;
1087 for (i = 0; i < num; i++)
1088 if (ceph_osd_is_up(osdmap, osds[i]))
1089 acting[o++] = osds[i];
1090 return o;
1091}
1092
1093/*
1094 * Return primary osd for given pgid, or -1 if none.
1095 */
1096int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1097{
1098 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1099 int i, num = CEPH_PG_MAX_SIZE;
1100
1101 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1102 if (!osds)
1103 return -1;
1104
1105 /* primary is first up osd */
1106 for (i = 0; i < num; i++)
1107 if (ceph_osd_is_up(osdmap, osds[i]))
1108 return osds[i];
1109 return -1;
1110}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
deleted file mode 100644
index 970b547e510d..000000000000
--- a/fs/ceph/osdmap.h
+++ /dev/null
@@ -1,128 +0,0 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
27};
28
29struct ceph_pg_mapping {
30 struct rb_node node;
31 struct ceph_pg pgid;
32 int len;
33 int osds[];
34};
35
36struct ceph_osdmap {
37 struct ceph_fsid fsid;
38 u32 epoch;
39 u32 mkfs_epoch;
40 struct ceph_timespec created, modified;
41
42 u32 flags; /* CEPH_OSDMAP_* */
43
44 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
45 u8 *osd_state; /* CEPH_OSD_* */
46 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
47 struct ceph_entity_addr *osd_addr;
48
49 struct rb_root pg_temp;
50 struct rb_root pg_pools;
51 u32 pool_max;
52
53 /* the CRUSH map specifies the mapping of placement groups to
54 * the list of osds that store+replicate them. */
55 struct crush_map *crush;
56};
57
58/*
59 * file layout helpers
60 */
61#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
62#define ceph_file_layout_stripe_count(l) \
63 ((__s32)le32_to_cpu((l).fl_stripe_count))
64#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
65#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
66#define ceph_file_layout_object_su(l) \
67 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
68#define ceph_file_layout_pg_preferred(l) \
69 ((__s32)le32_to_cpu((l).fl_pg_preferred))
70#define ceph_file_layout_pg_pool(l) \
71 ((__s32)le32_to_cpu((l).fl_pg_pool))
72
73static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
74{
75 return le32_to_cpu(l->fl_stripe_unit) *
76 le32_to_cpu(l->fl_stripe_count);
77}
78
79/* "period" == bytes before i start on a new set of objects */
80static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
81{
82 return le32_to_cpu(l->fl_object_size) *
83 le32_to_cpu(l->fl_stripe_count);
84}
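With the same illustrative numbers (1 MB stripe unit, 3 stripes, 4 MB objects), the two helpers above give a 3 MB stripe width and a 12 MB period; a trivial standalone check:

#include <stdio.h>

int main(void)
{
	unsigned su = 1u << 20, sc = 3, osize = 4u << 20;

	printf("stripe width = %u\n", su * sc);		/* 3145728 */
	printf("period       = %u\n", osize * sc);	/* 12582912 */
	return 0;
}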
85
86
87static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
88{
89 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
90}
91
92static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
93{
94 return map && (map->flags & flag);
95}
96
97extern char *ceph_osdmap_state_str(char *str, int len, int state);
98
99static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
100 int osd)
101{
102 if (osd >= map->max_osd)
103 return NULL;
104 return &map->osd_addr[osd];
105}
106
107extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
108extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
109 struct ceph_osdmap *map,
110 struct ceph_messenger *msgr);
111extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
112
113/* calculate mapping of a file extent to an object */
114extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
115 u64 off, u64 *plen,
116 u64 *bno, u64 *oxoff, u64 *oxlen);
117
118/* calculate mapping of object to a placement group */
119extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
120 const char *oid,
121 struct ceph_file_layout *fl,
122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
124 int *acting);
125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
126 struct ceph_pg pgid);
127
128#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
deleted file mode 100644
index b6859f47d364..000000000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,55 +0,0 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8int ceph_pagelist_release(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail);
12 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page,
14 lru);
15 list_del(&page->lru);
16 __free_page(page);
17 }
18 return 0;
19}
20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{
23 struct page *page = __page_cache_alloc(GFP_NOFS);
24 if (!page)
25 return -ENOMEM;
26 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail);
30 pl->mapped_tail = kmap(page);
31 return 0;
32}
33
34int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
35{
36 while (pl->room < len) {
37 size_t bit = pl->room;
38 int ret;
39
40 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
41 buf, bit);
42 pl->length += bit;
43 pl->room -= bit;
44 buf += bit;
45 len -= bit;
46 ret = ceph_pagelist_addpage(pl);
47 if (ret)
48 return ret;
49 }
50
51 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
52 pl->length += len;
53 pl->room -= len;
54 return 0;
55}
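A minimal in-kernel usage sketch of this API (the wrapper and payload are hypothetical; it relies only on the declarations in pagelist.h, which follows):

static int example_fill_pagelist(void)
{
	struct ceph_pagelist pl;
	char name[] = "hello";
	int err;

	ceph_pagelist_init(&pl);
	err = ceph_pagelist_encode_string(&pl, name, sizeof(name) - 1);
	if (!err)
		err = ceph_pagelist_encode_32(&pl, 42);
	/* on success the pagelist would be handed off as message data */
	ceph_pagelist_release(&pl);
	return err;
}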
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
deleted file mode 100644
index e8a4187e1087..000000000000
--- a/fs/ceph/pagelist.h
+++ /dev/null
@@ -1,54 +0,0 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
deleted file mode 100644
index 6d5247f2e81b..000000000000
--- a/fs/ceph/rados.h
+++ /dev/null
@@ -1,405 +0,0 @@
1#ifndef CEPH_RADOS_H
2#define CEPH_RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
18
19/*
20 * fs id
21 */
22struct ceph_fsid {
23 unsigned char fsid[16];
24};
25
26static inline int ceph_fsid_compare(const struct ceph_fsid *a,
27 const struct ceph_fsid *b)
28{
29 return memcmp(a, b, sizeof(*a));
30}
31
32/*
33 * ino, object, etc.
34 */
35typedef __le64 ceph_snapid_t;
36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
39
40struct ceph_timespec {
41 __le32 tv_sec;
42 __le32 tv_nsec;
43} __attribute__ ((packed));
44
45
46/*
47 * object layout - how objects are mapped into PGs
48 */
49#define CEPH_OBJECT_LAYOUT_HASH 1
50#define CEPH_OBJECT_LAYOUT_LINEAR 2
51#define CEPH_OBJECT_LAYOUT_HASHINO 3
52
53/*
54 * pg layout -- how PGs are mapped onto (sets of) OSDs
55 */
56#define CEPH_PG_LAYOUT_CRUSH 0
57#define CEPH_PG_LAYOUT_HASH 1
58#define CEPH_PG_LAYOUT_LINEAR 2
59#define CEPH_PG_LAYOUT_HYBRID 3
60
61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
62
63/*
64 * placement group.
65 * we encode this into one __le64.
66 */
67struct ceph_pg {
68 __le16 preferred; /* preferred primary osd */
69 __le16 ps; /* placement seed */
70 __le32 pool; /* object pool */
71} __attribute__ ((packed));
72
73/*
74 * pg_pool is a set of pgs storing a pool of objects
75 *
76 * pg_num -- base number of pseudorandomly placed pgs
77 *
78 * pgp_num -- effective number when calculating pg placement. this
79 * is used for pg_num increases. new pgs result in data being "split"
80 * into new pgs. for this to proceed smoothly, new pgs are initially
81 * colocated with their parents; that is, pgp_num doesn't increase
82 * until the new pgs have successfully split. only _then_ are the new
83 * pgs placed independently.
84 *
85 * lpg_num -- localized pg count (per device). replicas are randomly
86 * selected.
87 *
88 * lpgp_num -- as above.
89 */
90#define CEPH_PG_TYPE_REP 1
91#define CEPH_PG_TYPE_RAID4 2
92#define CEPH_PG_POOL_VERSION 2
93struct ceph_pg_pool {
94 __u8 type; /* CEPH_PG_TYPE_* */
95 __u8 size; /* number of osds in each pg */
96 __u8 crush_ruleset; /* crush placement rule */
97 __u8 object_hash; /* hash mapping object name to ps */
98 __le32 pg_num, pgp_num; /* number of pg's */
99 __le32 lpg_num, lpgp_num; /* number of localized pg's */
100 __le32 last_change; /* most recent epoch changed */
101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed));
107
108/*
109 * stable_mod func is used to control number of placement groups.
110 * similar to straight-up modulo, but produces a stable mapping as b
111 * increases over time. b is the number of bins, and bmask is the
112 * containing power of 2 minus 1.
113 *
114 * b <= bmask and bmask=(2**n)-1
115 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
116 */
117static inline int ceph_stable_mod(int x, int b, int bmask)
118{
119 if ((x & bmask) < b)
120 return x & bmask;
121 else
122 return x & (bmask >> 1);
123}
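A standalone illustration (not part of this header) of how stable_mod keeps most bins in place while b grows: with b = 12 and bmask = 15, inputs whose low bits fall below 12 map straight through, and only the rest fold back into the lower half.

#include <stdio.h>

static int ceph_stable_mod(int x, int b, int bmask)
{
	if ((x & bmask) < b)
		return x & bmask;
	else
		return x & (bmask >> 1);
}

int main(void)
{
	int x;

	/* e.g. x=13: 13 & 15 = 13 is not < 12, so it folds to 13 & 7 = 5 */
	for (x = 0; x < 16; x++)
		printf("x=%2d -> bin %d\n", x, ceph_stable_mod(x, 12, 15));
	return 0;
}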
124
125/*
126 * object layout - how a given object should be stored.
127 */
128struct ceph_object_layout {
129 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
130 __le32 ol_stripe_unit; /* for per-object parity, if any */
131} __attribute__ ((packed));
132
133/*
134 * compound epoch+version, used by storage layer to serialize mutations
135 */
136struct ceph_eversion {
137 __le32 epoch;
138 __le64 version;
139} __attribute__ ((packed));
140
141/*
142 * osd map bits
143 */
144
145/* status bits */
146#define CEPH_OSD_EXISTS 1
147#define CEPH_OSD_UP 2
148
149/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
150#define CEPH_OSD_IN 0x10000
151#define CEPH_OSD_OUT 0
152
153
154/*
155 * osd map flag bits
156 */
157#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
158#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
159#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
160#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
161#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
162
163/*
164 * osd ops
165 */
166#define CEPH_OSD_OP_MODE 0xf000
167#define CEPH_OSD_OP_MODE_RD 0x1000
168#define CEPH_OSD_OP_MODE_WR 0x2000
169#define CEPH_OSD_OP_MODE_RMW 0x3000
170#define CEPH_OSD_OP_MODE_SUB 0x4000
171
172#define CEPH_OSD_OP_TYPE 0x0f00
173#define CEPH_OSD_OP_TYPE_LOCK 0x0100
174#define CEPH_OSD_OP_TYPE_DATA 0x0200
175#define CEPH_OSD_OP_TYPE_ATTR 0x0300
176#define CEPH_OSD_OP_TYPE_EXEC 0x0400
177#define CEPH_OSD_OP_TYPE_PG 0x0500
178
179enum {
180 /** data **/
181 /* read */
182 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
183 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
184
185 /* fancy read */
186 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
187
188 /* write */
189 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
190 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
191 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
192 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
193 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
194
195 /* fancy write */
196 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
197 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
198 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
199 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
200
201 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
202 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
204
205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206 CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
207
208 /** attrs **/
209 /* read */
210 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
211 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
213
214 /* write */
215 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
216 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
217 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
218 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
219
220 /** subop **/
221 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
222 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
223 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
224 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
225 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
226
227 /** lock **/
228 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
229 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
230 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
231 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
232 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
233 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
234
235 /** exec **/
236 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
237
238 /** pg **/
239 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
240};
241
242static inline int ceph_osd_op_type_lock(int op)
243{
244 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
245}
246static inline int ceph_osd_op_type_data(int op)
247{
248 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
249}
250static inline int ceph_osd_op_type_attr(int op)
251{
252 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
253}
254static inline int ceph_osd_op_type_exec(int op)
255{
256 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
257}
258static inline int ceph_osd_op_type_pg(int op)
259{
260 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
261}
262
263static inline int ceph_osd_op_mode_subop(int op)
264{
265 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
266}
267static inline int ceph_osd_op_mode_read(int op)
268{
269 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
270}
271static inline int ceph_osd_op_mode_modify(int op)
272{
273 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
274}
275
276/*
277 * note that the following tmap stuff is also defined in ceph's librados.h;
278 * any modification here needs to be mirrored there
279 */
280#define CEPH_OSD_TMAP_HDR 'h'
281#define CEPH_OSD_TMAP_SET 's'
282#define CEPH_OSD_TMAP_RM 'r'
283
284extern const char *ceph_osd_op_name(int op);
285
286
287/*
288 * osd op flags
289 *
290 * An op may be READ, WRITE, or READ|WRITE.
291 */
292enum {
293 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
294 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
295 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
296 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
297 CEPH_OSD_FLAG_READ = 16, /* op may read */
298 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
299 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
300 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
301 CEPH_OSD_FLAG_BALANCE_READS = 256,
302 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
303 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
304 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
305 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
306};
307
308enum {
309 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
310};
311
312#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc */
313#define EBLACKLISTED ESHUTDOWN /* blacklisted */
314
315/* xattr comparison */
316enum {
317 CEPH_OSD_CMPXATTR_OP_NOP = 0,
318 CEPH_OSD_CMPXATTR_OP_EQ = 1,
319 CEPH_OSD_CMPXATTR_OP_NE = 2,
320 CEPH_OSD_CMPXATTR_OP_GT = 3,
321 CEPH_OSD_CMPXATTR_OP_GTE = 4,
322 CEPH_OSD_CMPXATTR_OP_LT = 5,
323 CEPH_OSD_CMPXATTR_OP_LTE = 6
324};
325
326enum {
327 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
328 CEPH_OSD_CMPXATTR_MODE_U64 = 2
329};
330
331/*
332 * an individual object operation. each may be accompanied by some data
333 * payload
334 */
335struct ceph_osd_op {
336 __le16 op; /* CEPH_OSD_OP_* */
337 __le32 flags; /* CEPH_OSD_FLAG_* */
338 union {
339 struct {
340 __le64 offset, length;
341 __le64 truncate_size;
342 __le32 truncate_seq;
343 } __attribute__ ((packed)) extent;
344 struct {
345 __le32 name_len;
346 __le32 value_len;
347 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
348 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
349 } __attribute__ ((packed)) xattr;
350 struct {
351 __u8 class_len;
352 __u8 method_len;
353 __u8 argc;
354 __le32 indata_len;
355 } __attribute__ ((packed)) cls;
356 struct {
357 __le64 cookie, count;
358 } __attribute__ ((packed)) pgls;
359 struct {
360 __le64 snapid;
361 } __attribute__ ((packed)) snap;
362 };
363 __le32 payload_len;
364} __attribute__ ((packed));

/*
 * osd request message header.  each request may include multiple
 * ceph_osd_op object operations.
 */
struct ceph_osd_request_head {
        __le32 client_inc;                     /* client incarnation */
        struct ceph_object_layout layout;      /* pgid */
        __le32 osdmap_epoch;                   /* client's osdmap epoch */

        __le32 flags;

        struct ceph_timespec mtime;            /* for mutations only */
        struct ceph_eversion reassert_version; /* if we are replaying op */

        __le32 object_len;                     /* length of object name */

        __le64 snapid;                         /* snapid to read */
        __le64 snap_seq;                       /* writer's snap context */
        __le32 num_snaps;

        __le16 num_ops;
        struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
} __attribute__ ((packed));
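Per the trailing-layout comment above, the encoded request size can be sketched as follows; num_ops, object_len, ticket_len and num_snaps are stand-ins for the values carried in the head, and the ordering of the tail sections is only what the comment states:

        u32 msg_len = sizeof(struct ceph_osd_request_head) +
                      num_ops * sizeof(struct ceph_osd_op) +  /* ops[] */
                      object_len +                            /* object name */
                      ticket_len +                            /* auth ticket */
                      num_snaps * sizeof(__le64);             /* snap ids */
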

struct ceph_osd_reply_head {
        __le32 client_inc;                     /* client incarnation */
        __le32 flags;
        struct ceph_object_layout layout;
        __le32 osdmap_epoch;
        struct ceph_eversion reassert_version; /* for replaying uncommitted */

        __le32 result;                         /* result code */

        __le32 object_len;                     /* length of object name */
        __le32 num_ops;
        struct ceph_osd_op ops[0];             /* ops[], object */
} __attribute__ ((packed));


#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 4868b9dcac5a..39c243acd062 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/sort.h> 3#include <linux/sort.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "decode.h" 7#include "mds_client.h"
8
9#include <linux/ceph/decode.h>
8 10
9/* 11/*
10 * Snapshots in ceph are driven in large part by cooperation from the 12 * Snapshots in ceph are driven in large part by cooperation from the
@@ -119,6 +121,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
119 INIT_LIST_HEAD(&realm->children); 121 INIT_LIST_HEAD(&realm->children);
120 INIT_LIST_HEAD(&realm->child_item); 122 INIT_LIST_HEAD(&realm->child_item);
121 INIT_LIST_HEAD(&realm->empty_item); 123 INIT_LIST_HEAD(&realm->empty_item);
124 INIT_LIST_HEAD(&realm->dirty_item);
122 INIT_LIST_HEAD(&realm->inodes_with_caps); 125 INIT_LIST_HEAD(&realm->inodes_with_caps);
123 spin_lock_init(&realm->inodes_with_caps_lock); 126 spin_lock_init(&realm->inodes_with_caps_lock);
124 __insert_snap_realm(&mdsc->snap_realms, realm); 127 __insert_snap_realm(&mdsc->snap_realms, realm);
@@ -467,7 +470,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
467 INIT_LIST_HEAD(&capsnap->ci_item); 470 INIT_LIST_HEAD(&capsnap->ci_item);
468 INIT_LIST_HEAD(&capsnap->flushing_item); 471 INIT_LIST_HEAD(&capsnap->flushing_item);
469 472
470 capsnap->follows = snapc->seq - 1; 473 capsnap->follows = snapc->seq;
471 capsnap->issued = __ceph_caps_issued(ci, NULL); 474 capsnap->issued = __ceph_caps_issued(ci, NULL);
472 capsnap->dirty = dirty; 475 capsnap->dirty = dirty;
473 476
@@ -525,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
525 struct ceph_cap_snap *capsnap) 528 struct ceph_cap_snap *capsnap)
526{ 529{
527 struct inode *inode = &ci->vfs_inode; 530 struct inode *inode = &ci->vfs_inode;
528 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 531 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
529 532
530 BUG_ON(capsnap->writing); 533 BUG_ON(capsnap->writing);
531 capsnap->size = inode->i_size; 534 capsnap->size = inode->i_size;
@@ -604,6 +607,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
604 struct ceph_snap_realm *realm; 607 struct ceph_snap_realm *realm;
605 int invalidate = 0; 608 int invalidate = 0;
606 int err = -ENOMEM; 609 int err = -ENOMEM;
610 LIST_HEAD(dirty_realms);
607 611
608 dout("update_snap_trace deletion=%d\n", deletion); 612 dout("update_snap_trace deletion=%d\n", deletion);
609more: 613more:
@@ -626,24 +630,6 @@ more:
626 } 630 }
627 } 631 }
628 632
629 if (le64_to_cpu(ri->seq) > realm->seq) {
630 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
631 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
632 /*
633 * if the realm seq has changed, queue a cap_snap for every
634 * inode with open caps. we do this _before_ we update
635 * the realm info so that we prepare for writeback under the
636 * _previous_ snap context.
637 *
638 * ...unless it's a snap deletion!
639 */
640 if (!deletion)
641 queue_realm_cap_snaps(realm);
642 } else {
643 dout("update_snap_trace %llx %p seq %lld unchanged\n",
644 realm->ino, realm, realm->seq);
645 }
646
647 /* ensure the parent is correct */ 633 /* ensure the parent is correct */
648 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); 634 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
649 if (err < 0) 635 if (err < 0)
@@ -651,6 +637,8 @@ more:
651 invalidate += err; 637 invalidate += err;
652 638
653 if (le64_to_cpu(ri->seq) > realm->seq) { 639 if (le64_to_cpu(ri->seq) > realm->seq) {
640 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
641 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
654 /* update realm parameters, snap lists */ 642 /* update realm parameters, snap lists */
655 realm->seq = le64_to_cpu(ri->seq); 643 realm->seq = le64_to_cpu(ri->seq);
656 realm->created = le64_to_cpu(ri->created); 644 realm->created = le64_to_cpu(ri->created);
@@ -668,9 +656,17 @@ more:
668 if (err < 0) 656 if (err < 0)
669 goto fail; 657 goto fail;
670 658
659 /* queue realm for cap_snap creation */
660 list_add(&realm->dirty_item, &dirty_realms);
661
671 invalidate = 1; 662 invalidate = 1;
672 } else if (!realm->cached_context) { 663 } else if (!realm->cached_context) {
664 dout("update_snap_trace %llx %p seq %lld new\n",
665 realm->ino, realm, realm->seq);
673 invalidate = 1; 666 invalidate = 1;
667 } else {
668 dout("update_snap_trace %llx %p seq %lld unchanged\n",
669 realm->ino, realm, realm->seq);
674 } 670 }
675 671
676 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, 672 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
@@ -683,6 +679,14 @@ more:
683 if (invalidate) 679 if (invalidate)
684 rebuild_snap_realms(realm); 680 rebuild_snap_realms(realm);
685 681
682 /*
683 * queue cap snaps _after_ we've built the new snap contexts,
684 * so that i_head_snapc can be set appropriately.
685 */
686 list_for_each_entry(realm, &dirty_realms, dirty_item) {
687 queue_realm_cap_snaps(realm);
688 }
689
686 __cleanup_empty_realms(mdsc); 690 __cleanup_empty_realms(mdsc);
687 return 0; 691 return 0;
688 692
@@ -715,7 +719,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
715 igrab(inode); 719 igrab(inode);
716 spin_unlock(&mdsc->snap_flush_lock); 720 spin_unlock(&mdsc->snap_flush_lock);
717 spin_lock(&inode->i_lock); 721 spin_lock(&inode->i_lock);
718 __ceph_flush_snaps(ci, &session); 722 __ceph_flush_snaps(ci, &session, 0);
719 spin_unlock(&inode->i_lock); 723 spin_unlock(&inode->i_lock);
720 iput(inode); 724 iput(inode);
721 spin_lock(&mdsc->snap_flush_lock); 725 spin_lock(&mdsc->snap_flush_lock);
@@ -745,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
745 struct ceph_mds_session *session, 749 struct ceph_mds_session *session,
746 struct ceph_msg *msg) 750 struct ceph_msg *msg)
747{ 751{
748 struct super_block *sb = mdsc->client->sb; 752 struct super_block *sb = mdsc->fsc->sb;
749 int mds = session->s_mds; 753 int mds = session->s_mds;
750 u64 split; 754 u64 split;
751 int op; 755 int op;
@@ -816,6 +820,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
816 }; 820 };
817 struct inode *inode = ceph_find_inode(sb, vino); 821 struct inode *inode = ceph_find_inode(sb, vino);
818 struct ceph_inode_info *ci; 822 struct ceph_inode_info *ci;
823 struct ceph_snap_realm *oldrealm;
819 824
820 if (!inode) 825 if (!inode)
821 continue; 826 continue;
@@ -841,18 +846,19 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
841 dout(" will move %p to split realm %llx %p\n", 846 dout(" will move %p to split realm %llx %p\n",
842 inode, realm->ino, realm); 847 inode, realm->ino, realm);
843 /* 848 /*
844 * Remove the inode from the realm's inode 849 * Move the inode to the new realm
845 * list, but don't add it to the new realm
846 * yet. We don't want the cap_snap to be
847 * queued (again) by ceph_update_snap_trace()
848 * below. Queue it _now_, under the old context.
849 */ 850 */
850 spin_lock(&realm->inodes_with_caps_lock); 851 spin_lock(&realm->inodes_with_caps_lock);
851 list_del_init(&ci->i_snap_realm_item); 852 list_del_init(&ci->i_snap_realm_item);
853 list_add(&ci->i_snap_realm_item,
854 &realm->inodes_with_caps);
855 oldrealm = ci->i_snap_realm;
856 ci->i_snap_realm = realm;
852 spin_unlock(&realm->inodes_with_caps_lock); 857 spin_unlock(&realm->inodes_with_caps_lock);
853 spin_unlock(&inode->i_lock); 858 spin_unlock(&inode->i_lock);
854 859
855 ceph_queue_cap_snap(ci); 860 ceph_get_snap_realm(mdsc, realm);
861 ceph_put_snap_realm(mdsc, oldrealm);
856 862
857 iput(inode); 863 iput(inode);
858 continue; 864 continue;
@@ -880,43 +886,9 @@ skip_inode:
880 ceph_update_snap_trace(mdsc, p, e, 886 ceph_update_snap_trace(mdsc, p, e,
881 op == CEPH_SNAP_OP_DESTROY); 887 op == CEPH_SNAP_OP_DESTROY);
882 888
883 if (op == CEPH_SNAP_OP_SPLIT) { 889 if (op == CEPH_SNAP_OP_SPLIT)
884 /*
885 * ok, _now_ add the inodes into the new realm.
886 */
887 for (i = 0; i < num_split_inos; i++) {
888 struct ceph_vino vino = {
889 .ino = le64_to_cpu(split_inos[i]),
890 .snap = CEPH_NOSNAP,
891 };
892 struct inode *inode = ceph_find_inode(sb, vino);
893 struct ceph_inode_info *ci;
894
895 if (!inode)
896 continue;
897 ci = ceph_inode(inode);
898 spin_lock(&inode->i_lock);
899 if (list_empty(&ci->i_snap_realm_item)) {
900 struct ceph_snap_realm *oldrealm =
901 ci->i_snap_realm;
902
903 dout(" moving %p to split realm %llx %p\n",
904 inode, realm->ino, realm);
905 spin_lock(&realm->inodes_with_caps_lock);
906 list_add(&ci->i_snap_realm_item,
907 &realm->inodes_with_caps);
908 ci->i_snap_realm = realm;
909 spin_unlock(&realm->inodes_with_caps_lock);
910 ceph_get_snap_realm(mdsc, realm);
911 ceph_put_snap_realm(mdsc, oldrealm);
912 }
913 spin_unlock(&inode->i_lock);
914 iput(inode);
915 }
916
917 /* we took a reference when we created the realm, above */ 890 /* we took a reference when we created the realm, above */
918 ceph_put_snap_realm(mdsc, realm); 891 ceph_put_snap_realm(mdsc, realm);
919 }
920 892
921 __cleanup_empty_realms(mdsc); 893 __cleanup_empty_realms(mdsc);
922 894
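Condensing the hunks above, the reworked ceph_update_snap_trace() becomes a two-phase walk; identifiers below are the patch's own, with the per-realm decoding elided:

        LIST_HEAD(dirty_realms);

        /* phase 1: decode the trace; for each realm whose seq advanced,
         * update its state and collect it instead of queueing right away */
        list_add(&realm->dirty_item, &dirty_realms);
        /* ... */
        rebuild_snap_realms(realm);            /* build the new snap contexts */

        /* phase 2: queue cap snaps only now, so i_head_snapc is taken from
         * the freshly built context rather than the stale one */
        list_for_each_entry(realm, &dirty_realms, dirty_item)
                queue_realm_cap_snaps(realm);
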
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c
index c6179d3a26a2..cd5097d7c804 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/strings.c
@@ -1,71 +1,9 @@
1/* 1/*
2 * Ceph string constants 2 * Ceph fs string constants
3 */ 3 */
4#include "types.h" 4#include <linux/module.h>
5#include <linux/ceph/types.h>
5 6
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_AUTH: return "auth";
14 default: return "unknown";
15 }
16}
17
18const char *ceph_osd_op_name(int op)
19{
20 switch (op) {
21 case CEPH_OSD_OP_READ: return "read";
22 case CEPH_OSD_OP_STAT: return "stat";
23
24 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
25
26 case CEPH_OSD_OP_WRITE: return "write";
27 case CEPH_OSD_OP_DELETE: return "delete";
28 case CEPH_OSD_OP_TRUNCATE: return "truncate";
29 case CEPH_OSD_OP_ZERO: return "zero";
30 case CEPH_OSD_OP_WRITEFULL: return "writefull";
31 case CEPH_OSD_OP_ROLLBACK: return "rollback";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
49
50 case CEPH_OSD_OP_PULL: return "pull";
51 case CEPH_OSD_OP_PUSH: return "push";
52 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
53 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
54 case CEPH_OSD_OP_SCRUB: return "scrub";
55
56 case CEPH_OSD_OP_WRLOCK: return "wrlock";
57 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
58 case CEPH_OSD_OP_RDLOCK: return "rdlock";
59 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
60 case CEPH_OSD_OP_UPLOCK: return "uplock";
61 case CEPH_OSD_OP_DNLOCK: return "dnlock";
62
63 case CEPH_OSD_OP_CALL: return "call";
64
65 case CEPH_OSD_OP_PGLS: return "pgls";
66 }
67 return "???";
68}
69 7
70const char *ceph_mds_state_name(int s) 8const char *ceph_mds_state_name(int s)
71{ 9{
@@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o)
177 } 115 }
178 return "???"; 116 return "???";
179} 117}
180
181const char *ceph_pool_op_name(int op)
182{
183 switch (op) {
184 case POOL_OP_CREATE: return "create";
185 case POOL_OP_DELETE: return "delete";
186 case POOL_OP_AUID_CHANGE: return "auid change";
187 case POOL_OP_CREATE_SNAP: return "create snap";
188 case POOL_OP_DELETE_SNAP: return "delete snap";
189 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
190 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
191 }
192 return "???";
193}
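ceph_entity_type_name(), ceph_osd_op_name() and ceph_pool_op_name() are dropped from this file because they are not fs-specific; presumably they move to the shared libceph code that the new <linux/ceph/*.h> includes point at. Callers are unchanged, e.g. a debug line such as:

        dout("handling %s op\n", ceph_osd_op_name(le16_to_cpu(op->op)));
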
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9922628532b2..d6e0e0421891 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,5 +1,5 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/backing-dev.h> 4#include <linux/backing-dev.h>
5#include <linux/ctype.h> 5#include <linux/ctype.h>
@@ -15,10 +15,13 @@
15#include <linux/statfs.h> 15#include <linux/statfs.h>
16#include <linux/string.h> 16#include <linux/string.h>
17 17
18#include "decode.h"
19#include "super.h" 18#include "super.h"
20#include "mon_client.h" 19#include "mds_client.h"
21#include "auth.h" 20
21#include <linux/ceph/decode.h>
22#include <linux/ceph/mon_client.h>
23#include <linux/ceph/auth.h>
24#include <linux/ceph/debugfs.h>
22 25
23/* 26/*
24 * Ceph superblock operations 27 * Ceph superblock operations
@@ -26,36 +29,22 @@
26 * Handle the basics of mounting, unmounting. 29 * Handle the basics of mounting, unmounting.
27 */ 30 */
28 31
29
30/*
31 * find filename portion of a path (/foo/bar/baz -> baz)
32 */
33const char *ceph_file_part(const char *s, int len)
34{
35 const char *e = s + len;
36
37 while (e != s && *(e-1) != '/')
38 e--;
39 return e;
40}
41
42
43/* 32/*
44 * super ops 33 * super ops
45 */ 34 */
46static void ceph_put_super(struct super_block *s) 35static void ceph_put_super(struct super_block *s)
47{ 36{
48 struct ceph_client *client = ceph_sb_to_client(s); 37 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
49 38
50 dout("put_super\n"); 39 dout("put_super\n");
51 ceph_mdsc_close_sessions(&client->mdsc); 40 ceph_mdsc_close_sessions(fsc->mdsc);
52 41
53 /* 42 /*
54 * ensure we release the bdi before put_anon_super releases 43 * ensure we release the bdi before put_anon_super releases
55 * the device name. 44 * the device name.
56 */ 45 */
57 if (s->s_bdi == &client->backing_dev_info) { 46 if (s->s_bdi == &fsc->backing_dev_info) {
58 bdi_unregister(&client->backing_dev_info); 47 bdi_unregister(&fsc->backing_dev_info);
59 s->s_bdi = NULL; 48 s->s_bdi = NULL;
60 } 49 }
61 50
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
64 53
65static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) 54static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
66{ 55{
67 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode); 56 struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
68 struct ceph_monmap *monmap = client->monc.monmap; 57 struct ceph_monmap *monmap = fsc->client->monc.monmap;
69 struct ceph_statfs st; 58 struct ceph_statfs st;
70 u64 fsid; 59 u64 fsid;
71 int err; 60 int err;
72 61
73 dout("statfs\n"); 62 dout("statfs\n");
74 err = ceph_monc_do_statfs(&client->monc, &st); 63 err = ceph_monc_do_statfs(&fsc->client->monc, &st);
75 if (err < 0) 64 if (err < 0)
76 return err; 65 return err;
77 66
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
104 93
105static int ceph_sync_fs(struct super_block *sb, int wait) 94static int ceph_sync_fs(struct super_block *sb, int wait)
106{ 95{
107 struct ceph_client *client = ceph_sb_to_client(sb); 96 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
108 97
109 if (!wait) { 98 if (!wait) {
110 dout("sync_fs (non-blocking)\n"); 99 dout("sync_fs (non-blocking)\n");
111 ceph_flush_dirty_caps(&client->mdsc); 100 ceph_flush_dirty_caps(fsc->mdsc);
112 dout("sync_fs (non-blocking) done\n"); 101 dout("sync_fs (non-blocking) done\n");
113 return 0; 102 return 0;
114 } 103 }
115 104
116 dout("sync_fs (blocking)\n"); 105 dout("sync_fs (blocking)\n");
117 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); 106 ceph_osdc_sync(&fsc->client->osdc);
118 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); 107 ceph_mdsc_sync(fsc->mdsc);
119 dout("sync_fs (blocking) done\n"); 108 dout("sync_fs (blocking) done\n");
120 return 0; 109 return 0;
121} 110}
122 111
123static int default_congestion_kb(void)
124{
125 int congestion_kb;
126
127 /*
128 * Copied from NFS
129 *
130 * congestion size, scale with available memory.
131 *
132 * 64MB: 8192k
133 * 128MB: 11585k
134 * 256MB: 16384k
135 * 512MB: 23170k
136 * 1GB: 32768k
137 * 2GB: 46340k
138 * 4GB: 65536k
139 * 8GB: 92681k
140 * 16GB: 131072k
141 *
142 * This allows larger machines to have larger/more transfers.
143 * Limit the default to 256M
144 */
145 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
146 if (congestion_kb > 256*1024)
147 congestion_kb = 256*1024;
148
149 return congestion_kb;
150}
151
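As a check on the scaling table above: with 4 KiB pages, 1 GiB of RAM means totalram_pages = 262144, int_sqrt(262144) = 512, and (16 * 512) << (PAGE_SHIFT - 10) = 8192 << 2 = 32768 KiB, exactly the table's 1GB row; the 256 MB cap only engages above roughly 64 GiB of RAM.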
152/**
153 * ceph_show_options - Show mount options in /proc/mounts
154 * @m: seq_file to write to
155 * @mnt: mount descriptor
156 */
157static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
158{
159 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
160 struct ceph_mount_args *args = client->mount_args;
161
162 if (args->flags & CEPH_OPT_FSID)
163 seq_printf(m, ",fsid=%pU", &args->fsid);
164 if (args->flags & CEPH_OPT_NOSHARE)
165 seq_puts(m, ",noshare");
166 if (args->flags & CEPH_OPT_DIRSTAT)
167 seq_puts(m, ",dirstat");
168 if ((args->flags & CEPH_OPT_RBYTES) == 0)
169 seq_puts(m, ",norbytes");
170 if (args->flags & CEPH_OPT_NOCRC)
171 seq_puts(m, ",nocrc");
172 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
173 seq_puts(m, ",noasyncreaddir");
174
175 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
176 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
177 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
178 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
179 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
180 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
181 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
182 seq_printf(m, ",osdkeepalivetimeout=%d",
183 args->osd_keepalive_timeout);
184 if (args->wsize)
185 seq_printf(m, ",wsize=%d", args->wsize);
186 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
187 seq_printf(m, ",rsize=%d", args->rsize);
188 if (args->congestion_kb != default_congestion_kb())
189 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
190 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
191 seq_printf(m, ",caps_wanted_delay_min=%d",
192 args->caps_wanted_delay_min);
193 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
194 seq_printf(m, ",caps_wanted_delay_max=%d",
195 args->caps_wanted_delay_max);
196 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
197 seq_printf(m, ",cap_release_safety=%d",
198 args->cap_release_safety);
199 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
200 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
201 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
202 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
203 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
204 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
205 if (args->name)
206 seq_printf(m, ",name=%s", args->name);
207 if (args->secret)
208 seq_puts(m, ",secret=<hidden>");
209 return 0;
210}
211
212/*
213 * caches
214 */
215struct kmem_cache *ceph_inode_cachep;
216struct kmem_cache *ceph_cap_cachep;
217struct kmem_cache *ceph_dentry_cachep;
218struct kmem_cache *ceph_file_cachep;
219
220static void ceph_inode_init_once(void *foo)
221{
222 struct ceph_inode_info *ci = foo;
223 inode_init_once(&ci->vfs_inode);
224}
225
226static int __init init_caches(void)
227{
228 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
229 sizeof(struct ceph_inode_info),
230 __alignof__(struct ceph_inode_info),
231 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
232 ceph_inode_init_once);
233 if (ceph_inode_cachep == NULL)
234 return -ENOMEM;
235
236 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
237 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
238 if (ceph_cap_cachep == NULL)
239 goto bad_cap;
240
241 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
242 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
243 if (ceph_dentry_cachep == NULL)
244 goto bad_dentry;
245
246 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
247 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
248 if (ceph_file_cachep == NULL)
249 goto bad_file;
250
251 return 0;
252
253bad_file:
254 kmem_cache_destroy(ceph_dentry_cachep);
255bad_dentry:
256 kmem_cache_destroy(ceph_cap_cachep);
257bad_cap:
258 kmem_cache_destroy(ceph_inode_cachep);
259 return -ENOMEM;
260}
261
262static void destroy_caches(void)
263{
264 kmem_cache_destroy(ceph_inode_cachep);
265 kmem_cache_destroy(ceph_cap_cachep);
266 kmem_cache_destroy(ceph_dentry_cachep);
267 kmem_cache_destroy(ceph_file_cachep);
268}
269
270
271/*
272 * ceph_umount_begin - initiate forced umount. Tear down down the
273 * mount, skipping steps that may hang while waiting for server(s).
274 */
275static void ceph_umount_begin(struct super_block *sb)
276{
277 struct ceph_client *client = ceph_sb_to_client(sb);
278
279 dout("ceph_umount_begin - starting forced umount\n");
280 if (!client)
281 return;
282 client->mount_state = CEPH_MOUNT_SHUTDOWN;
283 return;
284}
285
286static const struct super_operations ceph_super_ops = {
287 .alloc_inode = ceph_alloc_inode,
288 .destroy_inode = ceph_destroy_inode,
289 .write_inode = ceph_write_inode,
290 .sync_fs = ceph_sync_fs,
291 .put_super = ceph_put_super,
292 .show_options = ceph_show_options,
293 .statfs = ceph_statfs,
294 .umount_begin = ceph_umount_begin,
295};
296
297
298const char *ceph_msg_type_name(int type)
299{
300 switch (type) {
301 case CEPH_MSG_SHUTDOWN: return "shutdown";
302 case CEPH_MSG_PING: return "ping";
303 case CEPH_MSG_AUTH: return "auth";
304 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
305 case CEPH_MSG_MON_MAP: return "mon_map";
306 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
307 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
308 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
309 case CEPH_MSG_STATFS: return "statfs";
310 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
311 case CEPH_MSG_MDS_MAP: return "mds_map";
312 case CEPH_MSG_CLIENT_SESSION: return "client_session";
313 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
314 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
315 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
316 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
317 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
318 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
319 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
320 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
321 case CEPH_MSG_OSD_MAP: return "osd_map";
322 case CEPH_MSG_OSD_OP: return "osd_op";
323 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
324 default: return "unknown";
325 }
326}
327
328
329/* 112/*
330 * mount options 113 * mount options
331 */ 114 */
332enum { 115enum {
333 Opt_wsize, 116 Opt_wsize,
334 Opt_rsize, 117 Opt_rsize,
335 Opt_osdtimeout,
336 Opt_osdkeepalivetimeout,
337 Opt_mount_timeout,
338 Opt_osd_idle_ttl,
339 Opt_caps_wanted_delay_min, 118 Opt_caps_wanted_delay_min,
340 Opt_caps_wanted_delay_max, 119 Opt_caps_wanted_delay_max,
341 Opt_cap_release_safety, 120 Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
344 Opt_congestion_kb, 123 Opt_congestion_kb,
345 Opt_last_int, 124 Opt_last_int,
346 /* int args above */ 125 /* int args above */
347 Opt_fsid,
348 Opt_snapdirname, 126 Opt_snapdirname,
349 Opt_name,
350 Opt_secret,
351 Opt_last_string, 127 Opt_last_string,
352 /* string args above */ 128 /* string args above */
353 Opt_ip,
354 Opt_noshare,
355 Opt_dirstat, 129 Opt_dirstat,
356 Opt_nodirstat, 130 Opt_nodirstat,
357 Opt_rbytes, 131 Opt_rbytes,
358 Opt_norbytes, 132 Opt_norbytes,
359 Opt_nocrc,
360 Opt_noasyncreaddir, 133 Opt_noasyncreaddir,
361}; 134};
362 135
363static match_table_t arg_tokens = { 136static match_table_t fsopt_tokens = {
364 {Opt_wsize, "wsize=%d"}, 137 {Opt_wsize, "wsize=%d"},
365 {Opt_rsize, "rsize=%d"}, 138 {Opt_rsize, "rsize=%d"},
366 {Opt_osdtimeout, "osdtimeout=%d"},
367 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
368 {Opt_mount_timeout, "mount_timeout=%d"},
369 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
370 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 139 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
371 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 140 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
372 {Opt_cap_release_safety, "cap_release_safety=%d"}, 141 {Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,459 @@ static match_table_t arg_tokens = {
374 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, 143 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
375 {Opt_congestion_kb, "write_congestion_kb=%d"}, 144 {Opt_congestion_kb, "write_congestion_kb=%d"},
376 /* int args above */ 145 /* int args above */
377 {Opt_fsid, "fsid=%s"},
378 {Opt_snapdirname, "snapdirname=%s"}, 146 {Opt_snapdirname, "snapdirname=%s"},
379 {Opt_name, "name=%s"},
380 {Opt_secret, "secret=%s"},
381 /* string args above */ 147 /* string args above */
382 {Opt_ip, "ip=%s"},
383 {Opt_noshare, "noshare"},
384 {Opt_dirstat, "dirstat"}, 148 {Opt_dirstat, "dirstat"},
385 {Opt_nodirstat, "nodirstat"}, 149 {Opt_nodirstat, "nodirstat"},
386 {Opt_rbytes, "rbytes"}, 150 {Opt_rbytes, "rbytes"},
387 {Opt_norbytes, "norbytes"}, 151 {Opt_norbytes, "norbytes"},
388 {Opt_nocrc, "nocrc"},
389 {Opt_noasyncreaddir, "noasyncreaddir"}, 152 {Opt_noasyncreaddir, "noasyncreaddir"},
390 {-1, NULL} 153 {-1, NULL}
391}; 154};
392 155
393static int parse_fsid(const char *str, struct ceph_fsid *fsid) 156static int parse_fsopt_token(char *c, void *private)
394{ 157{
395 int i = 0; 158 struct ceph_mount_options *fsopt = private;
396 char tmp[3]; 159 substring_t argstr[MAX_OPT_ARGS];
397 int err = -EINVAL; 160 int token, intval, ret;
398 int d; 161
399 162 token = match_token((char *)c, fsopt_tokens, argstr);
400 dout("parse_fsid '%s'\n", str); 163 if (token < 0)
401 tmp[2] = 0; 164 return -EINVAL;
402 while (*str && i < 16) { 165
403 if (ispunct(*str)) { 166 if (token < Opt_last_int) {
404 str++; 167 ret = match_int(&argstr[0], &intval);
405 continue; 168 if (ret < 0) {
169 pr_err("bad mount option arg (not int) "
170 "at '%s'\n", c);
171 return ret;
406 } 172 }
407 if (!isxdigit(str[0]) || !isxdigit(str[1])) 173 dout("got int token %d val %d\n", token, intval);
408 break; 174 } else if (token > Opt_last_int && token < Opt_last_string) {
409 tmp[0] = str[0]; 175 dout("got string token %d val %s\n", token,
410 tmp[1] = str[1]; 176 argstr[0].from);
411 if (sscanf(tmp, "%x", &d) < 1) 177 } else {
412 break; 178 dout("got token %d\n", token);
413 fsid->fsid[i] = d & 0xff;
414 i++;
415 str += 2;
416 } 179 }
417 180
418 if (i == 16) 181 switch (token) {
419 err = 0; 182 case Opt_snapdirname:
420 dout("parse_fsid ret %d got fsid %pU", err, fsid); 183 kfree(fsopt->snapdir_name);
421 return err; 184 fsopt->snapdir_name = kstrndup(argstr[0].from,
185 argstr[0].to-argstr[0].from,
186 GFP_KERNEL);
187 if (!fsopt->snapdir_name)
188 return -ENOMEM;
189 break;
190
191 /* misc */
192 case Opt_wsize:
193 fsopt->wsize = intval;
194 break;
195 case Opt_rsize:
196 fsopt->rsize = intval;
197 break;
198 case Opt_caps_wanted_delay_min:
199 fsopt->caps_wanted_delay_min = intval;
200 break;
201 case Opt_caps_wanted_delay_max:
202 fsopt->caps_wanted_delay_max = intval;
203 break;
204 case Opt_readdir_max_entries:
205 fsopt->max_readdir = intval;
206 break;
207 case Opt_readdir_max_bytes:
208 fsopt->max_readdir_bytes = intval;
209 break;
210 case Opt_congestion_kb:
211 fsopt->congestion_kb = intval;
212 break;
213 case Opt_dirstat:
214 fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
215 break;
216 case Opt_nodirstat:
217 fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
218 break;
219 case Opt_rbytes:
220 fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
221 break;
222 case Opt_norbytes:
223 fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
224 break;
225 case Opt_noasyncreaddir:
226 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
227 break;
228 default:
229 BUG_ON(token);
230 }
231 return 0;
422} 232}
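For example, given mount data such as "rsize=131072,norbytes", the libceph parser invoked below (ceph_parse_options()) consumes the generic tokens it recognizes and hands each leftover token to this callback, so "rsize=131072" lands in fsopt->rsize and "norbytes" clears CEPH_MOUNT_OPT_RBYTES; both option names come from fsopt_tokens above.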
423 233
424static struct ceph_mount_args *parse_mount_args(int flags, char *options, 234static void destroy_mount_options(struct ceph_mount_options *args)
425 const char *dev_name,
426 const char **path)
427{ 235{
428 struct ceph_mount_args *args; 236 dout("destroy_mount_options %p\n", args);
429 const char *c; 237 kfree(args->snapdir_name);
430 int err = -ENOMEM; 238 kfree(args);
431 substring_t argstr[MAX_OPT_ARGS]; 239}
432 240
433 args = kzalloc(sizeof(*args), GFP_KERNEL); 241static int strcmp_null(const char *s1, const char *s2)
434 if (!args) 242{
435 return ERR_PTR(-ENOMEM); 243 if (!s1 && !s2)
436 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), 244 return 0;
437 GFP_KERNEL); 245 if (s1 && !s2)
438 if (!args->mon_addr) 246 return -1;
439 goto out; 247 if (!s1 && s2)
248 return 1;
249 return strcmp(s1, s2);
250}
440 251
441 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); 252static int compare_mount_options(struct ceph_mount_options *new_fsopt,
442 253 struct ceph_options *new_opt,
443 /* start with defaults */ 254 struct ceph_fs_client *fsc)
444 args->sb_flags = flags; 255{
445 args->flags = CEPH_OPT_DEFAULT; 256 struct ceph_mount_options *fsopt1 = new_fsopt;
446 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; 257 struct ceph_mount_options *fsopt2 = fsc->mount_options;
447 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 258 int ofs = offsetof(struct ceph_mount_options, snapdir_name);
448 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ 259 int ret;
449 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
450 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
451 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
452 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
453 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
454 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
455 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
456 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
457 args->congestion_kb = default_congestion_kb();
458
459 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
460 err = -EINVAL;
461 if (!dev_name)
462 goto out;
463 *path = strstr(dev_name, ":/");
464 if (*path == NULL) {
465 pr_err("device name is missing path (no :/ in %s)\n",
466 dev_name);
467 goto out;
468 }
469 260
470 /* get mon ip(s) */ 261 ret = memcmp(fsopt1, fsopt2, ofs);
471 err = ceph_parse_ips(dev_name, *path, args->mon_addr, 262 if (ret)
472 CEPH_MAX_MON, &args->num_mon); 263 return ret;
473 if (err < 0) 264
474 goto out; 265 ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
266 if (ret)
267 return ret;
268
269 return ceph_compare_options(new_opt, fsc->client);
270}
271
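The memcmp-then-strcmp pattern in compare_mount_options() relies on struct layout: every flat, directly comparable field must sit before snapdir_name in struct ceph_mount_options, and any padding bytes must be zeroed (the patch allocates the struct with kzalloc). A userspace-flavored sketch of the idiom with a hypothetical struct:

        #include <stddef.h>   /* offsetof */
        #include <string.h>   /* memcmp, strcmp */

        struct opts {
                int a, b;     /* flat fields: safe to memcmp */
                char *name;   /* string fields: compared separately */
        };

        static int opts_equal(const struct opts *x, const struct opts *y)
        {
                size_t flat = offsetof(struct opts, name);
                return memcmp(x, y, flat) == 0 &&
                       strcmp(x->name, y->name) == 0;
        }
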
272static int parse_mount_options(struct ceph_mount_options **pfsopt,
273 struct ceph_options **popt,
274 int flags, char *options,
275 const char *dev_name,
276 const char **path)
277{
278 struct ceph_mount_options *fsopt;
279 const char *dev_name_end;
280 int err = -ENOMEM;
281
282 fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
283 if (!fsopt)
284 return -ENOMEM;
285
286 dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
287
288 fsopt->sb_flags = flags;
289 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
290
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
293 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
294 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
295 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
296 fsopt->congestion_kb = default_congestion_kb();
297
298 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
299 err = -EINVAL;
300 if (!dev_name)
301 goto out;
302 *path = strstr(dev_name, ":/");
303 if (*path == NULL) {
304 pr_err("device name is missing path (no :/ in %s)\n",
305 dev_name);
306 goto out;
307 }
308 dev_name_end = *path;
309 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
475 310
476 /* path on server */ 311 /* path on server */
477 *path += 2; 312 *path += 2;
478 dout("server path '%s'\n", *path); 313 dout("server path '%s'\n", *path);
479 314
480 /* parse mount options */ 315 err = ceph_parse_options(popt, options, dev_name, dev_name_end,
481 while ((c = strsep(&options, ",")) != NULL) { 316 parse_fsopt_token, (void *)fsopt);
482 int token, intval, ret; 317 if (err)
483 if (!*c) 318 goto out;
484 continue; 319
485 err = -EINVAL; 320 /* success */
486 token = match_token((char *)c, arg_tokens, argstr); 321 *pfsopt = fsopt;
487 if (token < 0) { 322 return 0;
488 pr_err("bad mount option at '%s'\n", c);
489 goto out;
490 }
491 if (token < Opt_last_int) {
492 ret = match_int(&argstr[0], &intval);
493 if (ret < 0) {
494 pr_err("bad mount option arg (not int) "
495 "at '%s'\n", c);
496 continue;
497 }
498 dout("got int token %d val %d\n", token, intval);
499 } else if (token > Opt_last_int && token < Opt_last_string) {
500 dout("got string token %d val %s\n", token,
501 argstr[0].from);
502 } else {
503 dout("got token %d\n", token);
504 }
505 switch (token) {
506 case Opt_ip:
507 err = ceph_parse_ips(argstr[0].from,
508 argstr[0].to,
509 &args->my_addr,
510 1, NULL);
511 if (err < 0)
512 goto out;
513 args->flags |= CEPH_OPT_MYIP;
514 break;
515
516 case Opt_fsid:
517 err = parse_fsid(argstr[0].from, &args->fsid);
518 if (err == 0)
519 args->flags |= CEPH_OPT_FSID;
520 break;
521 case Opt_snapdirname:
522 kfree(args->snapdir_name);
523 args->snapdir_name = kstrndup(argstr[0].from,
524 argstr[0].to-argstr[0].from,
525 GFP_KERNEL);
526 break;
527 case Opt_name:
528 args->name = kstrndup(argstr[0].from,
529 argstr[0].to-argstr[0].from,
530 GFP_KERNEL);
531 break;
532 case Opt_secret:
533 args->secret = kstrndup(argstr[0].from,
534 argstr[0].to-argstr[0].from,
535 GFP_KERNEL);
536 break;
537
538 /* misc */
539 case Opt_wsize:
540 args->wsize = intval;
541 break;
542 case Opt_rsize:
543 args->rsize = intval;
544 break;
545 case Opt_osdtimeout:
546 args->osd_timeout = intval;
547 break;
548 case Opt_osdkeepalivetimeout:
549 args->osd_keepalive_timeout = intval;
550 break;
551 case Opt_osd_idle_ttl:
552 args->osd_idle_ttl = intval;
553 break;
554 case Opt_mount_timeout:
555 args->mount_timeout = intval;
556 break;
557 case Opt_caps_wanted_delay_min:
558 args->caps_wanted_delay_min = intval;
559 break;
560 case Opt_caps_wanted_delay_max:
561 args->caps_wanted_delay_max = intval;
562 break;
563 case Opt_readdir_max_entries:
564 args->max_readdir = intval;
565 break;
566 case Opt_readdir_max_bytes:
567 args->max_readdir_bytes = intval;
568 break;
569 case Opt_congestion_kb:
570 args->congestion_kb = intval;
571 break;
572
573 case Opt_noshare:
574 args->flags |= CEPH_OPT_NOSHARE;
575 break;
576
577 case Opt_dirstat:
578 args->flags |= CEPH_OPT_DIRSTAT;
579 break;
580 case Opt_nodirstat:
581 args->flags &= ~CEPH_OPT_DIRSTAT;
582 break;
583 case Opt_rbytes:
584 args->flags |= CEPH_OPT_RBYTES;
585 break;
586 case Opt_norbytes:
587 args->flags &= ~CEPH_OPT_RBYTES;
588 break;
589 case Opt_nocrc:
590 args->flags |= CEPH_OPT_NOCRC;
591 break;
592 case Opt_noasyncreaddir:
593 args->flags |= CEPH_OPT_NOASYNCREADDIR;
594 break;
595
596 default:
597 BUG_ON(token);
598 }
599 }
600 return args;
601 323
602out: 324out:
603 kfree(args->mon_addr); 325 destroy_mount_options(fsopt);
604 kfree(args); 326 return err;
605 return ERR_PTR(err);
606} 327}
607 328
608static void destroy_mount_args(struct ceph_mount_args *args) 329/**
330 * ceph_show_options - Show mount options in /proc/mounts
331 * @m: seq_file to write to
332 * @mnt: mount descriptor
333 */
334static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
609{ 335{
610 dout("destroy_mount_args %p\n", args); 336 struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
611 kfree(args->snapdir_name); 337 struct ceph_mount_options *fsopt = fsc->mount_options;
612 args->snapdir_name = NULL; 338 struct ceph_options *opt = fsc->client->options;
613 kfree(args->name); 339
614 args->name = NULL; 340 if (opt->flags & CEPH_OPT_FSID)
615 kfree(args->secret); 341 seq_printf(m, ",fsid=%pU", &opt->fsid);
616 args->secret = NULL; 342 if (opt->flags & CEPH_OPT_NOSHARE)
617 kfree(args); 343 seq_puts(m, ",noshare");
344 if (opt->flags & CEPH_OPT_NOCRC)
345 seq_puts(m, ",nocrc");
346
347 if (opt->name)
348 seq_printf(m, ",name=%s", opt->name);
349 if (opt->secret)
350 seq_puts(m, ",secret=<hidden>");
351
352 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
353 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
354 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
355 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
356 if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
357 seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
358 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
359 seq_printf(m, ",osdkeepalivetimeout=%d",
360 opt->osd_keepalive_timeout);
361
362 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
363 seq_puts(m, ",dirstat");
364 if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
365 seq_puts(m, ",norbytes");
366 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
367 seq_puts(m, ",noasyncreaddir");
368
369 if (fsopt->wsize)
370 seq_printf(m, ",wsize=%d", fsopt->wsize);
371 if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
372 seq_printf(m, ",rsize=%d", fsopt->rsize);
373 if (fsopt->congestion_kb != default_congestion_kb())
374 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
375 if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
376 seq_printf(m, ",caps_wanted_delay_min=%d",
377 fsopt->caps_wanted_delay_min);
378 if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
379 seq_printf(m, ",caps_wanted_delay_max=%d",
380 fsopt->caps_wanted_delay_max);
381 if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
382 seq_printf(m, ",cap_release_safety=%d",
383 fsopt->cap_release_safety);
384 if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
385 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
386 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
387 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
388 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
389 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
390 return 0;
618} 391}
619 392
620/* 393/*
621 * create a fresh client instance 394 * handle any mon messages the standard library doesn't understand.
395 * return an error if we don't understand it either.
622 */ 396 */
623static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) 397static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
624{ 398{
625 struct ceph_client *client; 399 struct ceph_fs_client *fsc = client->private;
400 int type = le16_to_cpu(msg->hdr.type);
401
402 switch (type) {
403 case CEPH_MSG_MDS_MAP:
404 ceph_mdsc_handle_map(fsc->mdsc, msg);
405 return 0;
406
407 default:
408 return -1;
409 }
410}
411
412/*
413 * create a new fs client
414 */
415struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
416 struct ceph_options *opt)
417{
418 struct ceph_fs_client *fsc;
626 int err = -ENOMEM; 419 int err = -ENOMEM;
627 420
628 client = kzalloc(sizeof(*client), GFP_KERNEL); 421 fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
629 if (client == NULL) 422 if (!fsc)
630 return ERR_PTR(-ENOMEM); 423 return ERR_PTR(-ENOMEM);
631 424
632 mutex_init(&client->mount_mutex); 425 fsc->client = ceph_create_client(opt, fsc);
633 426 if (IS_ERR(fsc->client)) {
634 init_waitqueue_head(&client->auth_wq); 427 err = PTR_ERR(fsc->client);
428 goto fail;
429 }
430 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
431 fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
432 fsc->client->monc.want_mdsmap = 1;
635 433
636 client->sb = NULL; 434 fsc->mount_options = fsopt;
637 client->mount_state = CEPH_MOUNT_MOUNTING;
638 client->mount_args = args;
639 435
640 client->msgr = NULL; 436 fsc->sb = NULL;
437 fsc->mount_state = CEPH_MOUNT_MOUNTING;
641 438
642 client->auth_err = 0; 439 atomic_long_set(&fsc->writeback_count, 0);
643 atomic_long_set(&client->writeback_count, 0);
644 440
645 err = bdi_init(&client->backing_dev_info); 441 err = bdi_init(&fsc->backing_dev_info);
646 if (err < 0) 442 if (err < 0)
647 goto fail; 443 goto fail_client;
648 444
649 err = -ENOMEM; 445 err = -ENOMEM;
650 client->wb_wq = create_workqueue("ceph-writeback"); 446 fsc->wb_wq = create_workqueue("ceph-writeback");
651 if (client->wb_wq == NULL) 447 if (fsc->wb_wq == NULL)
652 goto fail_bdi; 448 goto fail_bdi;
653 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); 449 fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
654 if (client->pg_inv_wq == NULL) 450 if (fsc->pg_inv_wq == NULL)
655 goto fail_wb_wq; 451 goto fail_wb_wq;
656 client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); 452 fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
657 if (client->trunc_wq == NULL) 453 if (fsc->trunc_wq == NULL)
658 goto fail_pg_inv_wq; 454 goto fail_pg_inv_wq;
659 455
660 /* set up mempools */ 456 /* set up mempools */
661 err = -ENOMEM; 457 err = -ENOMEM;
662 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, 458 fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
663 client->mount_args->wsize >> PAGE_CACHE_SHIFT); 459 fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
664 if (!client->wb_pagevec_pool) 460 if (!fsc->wb_pagevec_pool)
665 goto fail_trunc_wq; 461 goto fail_trunc_wq;
666 462
667 /* caps */ 463 /* caps */
668 client->min_caps = args->max_readdir; 464 fsc->min_caps = fsopt->max_readdir;
465
466 return fsc;
669 467
670 /* subsystems */
671 err = ceph_monc_init(&client->monc, client);
672 if (err < 0)
673 goto fail_mempool;
674 err = ceph_osdc_init(&client->osdc, client);
675 if (err < 0)
676 goto fail_monc;
677 err = ceph_mdsc_init(&client->mdsc, client);
678 if (err < 0)
679 goto fail_osdc;
680 return client;
681
682fail_osdc:
683 ceph_osdc_stop(&client->osdc);
684fail_monc:
685 ceph_monc_stop(&client->monc);
686fail_mempool:
687 mempool_destroy(client->wb_pagevec_pool);
688fail_trunc_wq: 468fail_trunc_wq:
689 destroy_workqueue(client->trunc_wq); 469 destroy_workqueue(fsc->trunc_wq);
690fail_pg_inv_wq: 470fail_pg_inv_wq:
691 destroy_workqueue(client->pg_inv_wq); 471 destroy_workqueue(fsc->pg_inv_wq);
692fail_wb_wq: 472fail_wb_wq:
693 destroy_workqueue(client->wb_wq); 473 destroy_workqueue(fsc->wb_wq);
694fail_bdi: 474fail_bdi:
695 bdi_destroy(&client->backing_dev_info); 475 bdi_destroy(&fsc->backing_dev_info);
476fail_client:
477 ceph_destroy_client(fsc->client);
696fail: 478fail:
697 kfree(client); 479 kfree(fsc);
698 return ERR_PTR(err); 480 return ERR_PTR(err);
699} 481}
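The fail_* labels above follow the usual reverse-order unwind idiom; a stripped-down sketch with hypothetical helpers:

        extern int acquire_a(void), acquire_b(void);   /* hypothetical */
        extern void release_a(void);

        int setup(void)
        {
                int err;

                err = acquire_a();
                if (err)
                        goto fail;
                err = acquire_b();
                if (err)
                        goto fail_a;
                return 0;

        fail_a:
                release_a();   /* undo in reverse acquisition order */
        fail:
                return err;
        }
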
700 482
701static void ceph_destroy_client(struct ceph_client *client) 483void destroy_fs_client(struct ceph_fs_client *fsc)
702{ 484{
703 dout("destroy_client %p\n", client); 485 dout("destroy_fs_client %p\n", fsc);
704 486
705 /* unmount */ 487 destroy_workqueue(fsc->wb_wq);
706 ceph_mdsc_stop(&client->mdsc); 488 destroy_workqueue(fsc->pg_inv_wq);
707 ceph_osdc_stop(&client->osdc); 489 destroy_workqueue(fsc->trunc_wq);
708 490
709 /* 491 bdi_destroy(&fsc->backing_dev_info);
710 * make sure mds and osd connections close out before destroying
711 * the auth module, which is needed to free those connections'
712 * ceph_authorizers.
713 */
714 ceph_msgr_flush();
715
716 ceph_monc_stop(&client->monc);
717 492
718 ceph_debugfs_client_cleanup(client); 493 mempool_destroy(fsc->wb_pagevec_pool);
719 destroy_workqueue(client->wb_wq);
720 destroy_workqueue(client->pg_inv_wq);
721 destroy_workqueue(client->trunc_wq);
722 494
723 bdi_destroy(&client->backing_dev_info); 495 destroy_mount_options(fsc->mount_options);
724 496
725 if (client->msgr) 497 ceph_fs_debugfs_cleanup(fsc);
726 ceph_messenger_destroy(client->msgr);
727 mempool_destroy(client->wb_pagevec_pool);
728 498
729 destroy_mount_args(client->mount_args); 499 ceph_destroy_client(fsc->client);
730 500
731 kfree(client); 501 kfree(fsc);
732 dout("destroy_client %p done\n", client); 502 dout("destroy_fs_client %p done\n", fsc);
733} 503}
734 504
735/* 505/*
736 * Initially learn our fsid, or verify an fsid matches. 506 * caches
737 */ 507 */
738int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) 508struct kmem_cache *ceph_inode_cachep;
509struct kmem_cache *ceph_cap_cachep;
510struct kmem_cache *ceph_dentry_cachep;
511struct kmem_cache *ceph_file_cachep;
512
513static void ceph_inode_init_once(void *foo)
739{ 514{
740 if (client->have_fsid) { 515 struct ceph_inode_info *ci = foo;
741 if (ceph_fsid_compare(&client->fsid, fsid)) { 516 inode_init_once(&ci->vfs_inode);
742 pr_err("bad fsid, had %pU got %pU", 517}
743 &client->fsid, fsid); 518
744 return -1; 519static int __init init_caches(void)
745 } 520{
746 } else { 521 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
747 pr_info("client%lld fsid %pU\n", client->monc.auth->global_id, 522 sizeof(struct ceph_inode_info),
748 fsid); 523 __alignof__(struct ceph_inode_info),
749 memcpy(&client->fsid, fsid, sizeof(*fsid)); 524 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
750 ceph_debugfs_client_init(client); 525 ceph_inode_init_once);
751 client->have_fsid = true; 526 if (ceph_inode_cachep == NULL)
752 } 527 return -ENOMEM;
528
529 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
530 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
531 if (ceph_cap_cachep == NULL)
532 goto bad_cap;
533
534 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
535 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
536 if (ceph_dentry_cachep == NULL)
537 goto bad_dentry;
538
539 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
540 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
541 if (ceph_file_cachep == NULL)
542 goto bad_file;
543
753 return 0; 544 return 0;
545
546bad_file:
547 kmem_cache_destroy(ceph_dentry_cachep);
548bad_dentry:
549 kmem_cache_destroy(ceph_cap_cachep);
550bad_cap:
551 kmem_cache_destroy(ceph_inode_cachep);
552 return -ENOMEM;
754} 553}
755 554
555static void destroy_caches(void)
556{
557 kmem_cache_destroy(ceph_inode_cachep);
558 kmem_cache_destroy(ceph_cap_cachep);
559 kmem_cache_destroy(ceph_dentry_cachep);
560 kmem_cache_destroy(ceph_file_cachep);
561}
562
563
756/* 564/*
757 * true if we have the mon map (and have thus joined the cluster) 565 * ceph_umount_begin - initiate forced umount. Tear down the
566 * mount, skipping steps that may hang while waiting for server(s).
758 */ 567 */
759static int have_mon_and_osd_map(struct ceph_client *client) 568static void ceph_umount_begin(struct super_block *sb)
760{ 569{
761 return client->monc.monmap && client->monc.monmap->epoch && 570 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
762 client->osdc.osdmap && client->osdc.osdmap->epoch; 571
572 dout("ceph_umount_begin - starting forced umount\n");
573 if (!fsc)
574 return;
575 fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
576 return;
763} 577}
764 578
579static const struct super_operations ceph_super_ops = {
580 .alloc_inode = ceph_alloc_inode,
581 .destroy_inode = ceph_destroy_inode,
582 .write_inode = ceph_write_inode,
583 .sync_fs = ceph_sync_fs,
584 .put_super = ceph_put_super,
585 .show_options = ceph_show_options,
586 .statfs = ceph_statfs,
587 .umount_begin = ceph_umount_begin,
588};
589
765/* 590/*
766 * Bootstrap mount by opening the root directory. Note the mount 591 * Bootstrap mount by opening the root directory. Note the mount
767 * @started time from caller, and time out if this takes too long. 592 * @started time from caller, and time out if this takes too long.
768 */ 593 */
769static struct dentry *open_root_dentry(struct ceph_client *client, 594static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
770 const char *path, 595 const char *path,
771 unsigned long started) 596 unsigned long started)
772{ 597{
773 struct ceph_mds_client *mdsc = &client->mdsc; 598 struct ceph_mds_client *mdsc = fsc->mdsc;
774 struct ceph_mds_request *req = NULL; 599 struct ceph_mds_request *req = NULL;
775 int err; 600 int err;
776 struct dentry *root; 601 struct dentry *root;
@@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
784 req->r_ino1.ino = CEPH_INO_ROOT; 609 req->r_ino1.ino = CEPH_INO_ROOT;
785 req->r_ino1.snap = CEPH_NOSNAP; 610 req->r_ino1.snap = CEPH_NOSNAP;
786 req->r_started = started; 611 req->r_started = started;
787 req->r_timeout = client->mount_args->mount_timeout * HZ; 612 req->r_timeout = fsc->client->options->mount_timeout * HZ;
788 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 613 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
789 req->r_num_caps = 2; 614 req->r_num_caps = 2;
790 err = ceph_mdsc_do_request(mdsc, NULL, req); 615 err = ceph_mdsc_do_request(mdsc, NULL, req);
791 if (err == 0) { 616 if (err == 0) {
792 dout("open_root_inode success\n"); 617 dout("open_root_inode success\n");
793 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && 618 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
794 client->sb->s_root == NULL) 619 fsc->sb->s_root == NULL)
795 root = d_alloc_root(req->r_target_inode); 620 root = d_alloc_root(req->r_target_inode);
796 else 621 else
797 root = d_obtain_alias(req->r_target_inode); 622 root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
804 return root; 629 return root;
805} 630}
806 631
632
633
634
807/* 635/*
808 * mount: join the ceph cluster, and open root directory. 636 * mount: join the ceph cluster, and open root directory.
809 */ 637 */
810static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, 638static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
811 const char *path) 639 const char *path)
812{ 640{
813 struct ceph_entity_addr *myaddr = NULL;
814 int err; 641 int err;
815 unsigned long timeout = client->mount_args->mount_timeout * HZ;
816 unsigned long started = jiffies; /* note the start time */ 642 unsigned long started = jiffies; /* note the start time */
817 struct dentry *root; 643 struct dentry *root;
644 int first = 0; /* first vfsmount for this super_block */
818 645
819 dout("mount start\n"); 646 dout("mount start\n");
820 mutex_lock(&client->mount_mutex); 647 mutex_lock(&fsc->client->mount_mutex);
821
822 /* initialize the messenger */
823 if (client->msgr == NULL) {
824 if (ceph_test_opt(client, MYIP))
825 myaddr = &client->mount_args->my_addr;
826 client->msgr = ceph_messenger_create(myaddr);
827 if (IS_ERR(client->msgr)) {
828 err = PTR_ERR(client->msgr);
829 client->msgr = NULL;
830 goto out;
831 }
832 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
833 }
834 648
835 /* open session, and wait for mon, mds, and osd maps */ 649 err = __ceph_open_session(fsc->client, started);
836 err = ceph_monc_open_session(&client->monc);
837 if (err < 0) 650 if (err < 0)
838 goto out; 651 goto out;
839 652
840 while (!have_mon_and_osd_map(client)) {
841 err = -EIO;
842 if (timeout && time_after_eq(jiffies, started + timeout))
843 goto out;
844
845 /* wait */
846 dout("mount waiting for mon_map\n");
847 err = wait_event_interruptible_timeout(client->auth_wq,
848 have_mon_and_osd_map(client) || (client->auth_err < 0),
849 timeout);
850 if (err == -EINTR || err == -ERESTARTSYS)
851 goto out;
852 if (client->auth_err < 0) {
853 err = client->auth_err;
854 goto out;
855 }
856 }
857
858 dout("mount opening root\n"); 653 dout("mount opening root\n");
859 root = open_root_dentry(client, "", started); 654 root = open_root_dentry(fsc, "", started);
860 if (IS_ERR(root)) { 655 if (IS_ERR(root)) {
861 err = PTR_ERR(root); 656 err = PTR_ERR(root);
862 goto out; 657 goto out;
863 } 658 }
864 if (client->sb->s_root) 659 if (fsc->sb->s_root) {
865 dput(root); 660 dput(root);
866 else 661 } else {
867 client->sb->s_root = root; 662 fsc->sb->s_root = root;
663 first = 1;
664
665 err = ceph_fs_debugfs_init(fsc);
666 if (err < 0)
667 goto fail;
668 }
868 669
869 if (path[0] == 0) { 670 if (path[0] == 0) {
870 dget(root); 671 dget(root);
871 } else { 672 } else {
872 dout("mount opening base mountpoint\n"); 673 dout("mount opening base mountpoint\n");
873 root = open_root_dentry(client, path, started); 674 root = open_root_dentry(fsc, path, started);
874 if (IS_ERR(root)) { 675 if (IS_ERR(root)) {
875 err = PTR_ERR(root); 676 err = PTR_ERR(root);
876 dput(client->sb->s_root); 677 goto fail;
877 client->sb->s_root = NULL;
878 goto out;
879 } 678 }
880 } 679 }
881 680
882 mnt->mnt_root = root; 681 mnt->mnt_root = root;
883 mnt->mnt_sb = client->sb; 682 mnt->mnt_sb = fsc->sb;
884 683
885 client->mount_state = CEPH_MOUNT_MOUNTED; 684 fsc->mount_state = CEPH_MOUNT_MOUNTED;
886 dout("mount success\n"); 685 dout("mount success\n");
887 err = 0; 686 err = 0;
888 687
889out: 688out:
890 mutex_unlock(&client->mount_mutex); 689 mutex_unlock(&fsc->client->mount_mutex);
891 return err; 690 return err;
691
692fail:
693 if (first) {
694 dput(fsc->sb->s_root);
695 fsc->sb->s_root = NULL;
696 }
697 goto out;
892} 698}
893 699
894static int ceph_set_super(struct super_block *s, void *data) 700static int ceph_set_super(struct super_block *s, void *data)
895{ 701{
896 struct ceph_client *client = data; 702 struct ceph_fs_client *fsc = data;
897 int ret; 703 int ret;
898 704
899 dout("set_super %p data %p\n", s, data); 705 dout("set_super %p data %p\n", s, data);
900 706
901 s->s_flags = client->mount_args->sb_flags; 707 s->s_flags = fsc->mount_options->sb_flags;
902 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ 708 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
903 709
904 s->s_fs_info = client; 710 s->s_fs_info = fsc;
905 client->sb = s; 711 fsc->sb = s;
906 712
907 s->s_op = &ceph_super_ops; 713 s->s_op = &ceph_super_ops;
908 s->s_export_op = &ceph_export_ops; 714 s->s_export_op = &ceph_export_ops;
@@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data)
917 723
918fail: 724fail:
919 s->s_fs_info = NULL; 725 s->s_fs_info = NULL;
920 client->sb = NULL; 726 fsc->sb = NULL;
921 return ret; 727 return ret;
922} 728}
923 729
@@ -926,30 +732,23 @@ fail:
926 */ 732 */
927static int ceph_compare_super(struct super_block *sb, void *data) 733static int ceph_compare_super(struct super_block *sb, void *data)
928{ 734{
929 struct ceph_client *new = data; 735 struct ceph_fs_client *new = data;
930 struct ceph_mount_args *args = new->mount_args; 736 struct ceph_mount_options *fsopt = new->mount_options;
931 struct ceph_client *other = ceph_sb_to_client(sb); 737 struct ceph_options *opt = new->client->options;
932 int i; 738 struct ceph_fs_client *other = ceph_sb_to_client(sb);
933 739
934 dout("ceph_compare_super %p\n", sb); 740 dout("ceph_compare_super %p\n", sb);
935 if (args->flags & CEPH_OPT_FSID) { 741
936 if (ceph_fsid_compare(&args->fsid, &other->fsid)) { 742 if (compare_mount_options(fsopt, opt, other)) {
937 dout("fsid doesn't match\n"); 743 dout("monitor(s)/mount options don't match\n");
938 return 0; 744 return 0;
939 }
940 } else {
941 /* do we share (a) monitor? */
942 for (i = 0; i < new->monc.monmap->num_mon; i++)
943 if (ceph_monmap_contains(other->monc.monmap,
944 &new->monc.monmap->mon_inst[i].addr))
945 break;
946 if (i == new->monc.monmap->num_mon) {
947 dout("mon ip not part of monmap\n");
948 return 0;
949 }
950 dout("mon ip matches existing sb %p\n", sb);
951 } 745 }
952 if (args->sb_flags != other->mount_args->sb_flags) { 746 if ((opt->flags & CEPH_OPT_FSID) &&
747 ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
748 dout("fsid doesn't match\n");
749 return 0;
750 }
751 if (fsopt->sb_flags != other->mount_options->sb_flags) {
953 dout("flags differ\n"); 752 dout("flags differ\n");
954 return 0; 753 return 0;
955 } 754 }
@@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data)
961 */ 760 */
962static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 761static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
963 762
964static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 763static int ceph_register_bdi(struct super_block *sb,
764 struct ceph_fs_client *fsc)
965{ 765{
966 int err; 766 int err;
967 767
968 /* set ra_pages based on rsize mount option? */ 768 /* set ra_pages based on rsize mount option? */
969 if (client->mount_args->rsize >= PAGE_CACHE_SIZE) 769 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
970 client->backing_dev_info.ra_pages = 770 fsc->backing_dev_info.ra_pages =
971 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 771 (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
972 >> PAGE_SHIFT; 772 >> PAGE_SHIFT;
973 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", 773 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
974 atomic_long_inc_return(&bdi_seq)); 774 atomic_long_inc_return(&bdi_seq));
975 if (!err) 775 if (!err)
976 sb->s_bdi = &client->backing_dev_info; 776 sb->s_bdi = &fsc->backing_dev_info;
977 return err; 777 return err;
978} 778}
979 779
@@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type,
982 struct vfsmount *mnt) 782 struct vfsmount *mnt)
983{ 783{
984 struct super_block *sb; 784 struct super_block *sb;
985 struct ceph_client *client; 785 struct ceph_fs_client *fsc;
986 int err; 786 int err;
987 int (*compare_super)(struct super_block *, void *) = ceph_compare_super; 787 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
988 const char *path = NULL; 788 const char *path = NULL;
989 struct ceph_mount_args *args; 789 struct ceph_mount_options *fsopt = NULL;
790 struct ceph_options *opt = NULL;
990 791
991 dout("ceph_get_sb\n"); 792 dout("ceph_get_sb\n");
992 args = parse_mount_args(flags, data, dev_name, &path); 793 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
993 if (IS_ERR(args)) { 794 if (err < 0)
994 err = PTR_ERR(args);
995 goto out_final; 795 goto out_final;
996 }
997 796
998 /* create client (which we may/may not use) */ 797 /* create client (which we may/may not use) */
999 client = ceph_create_client(args); 798 fsc = create_fs_client(fsopt, opt);
1000 if (IS_ERR(client)) { 799 if (IS_ERR(fsc)) {
1001 err = PTR_ERR(client); 800 err = PTR_ERR(fsc);
801 kfree(fsopt);
802 kfree(opt);
1002 goto out_final; 803 goto out_final;
1003 } 804 }
1004 805
1005 if (client->mount_args->flags & CEPH_OPT_NOSHARE) 806 err = ceph_mdsc_init(fsc);
807 if (err < 0)
808 goto out;
809
810 if (ceph_test_opt(fsc->client, NOSHARE))
1006 compare_super = NULL; 811 compare_super = NULL;
1007 sb = sget(fs_type, compare_super, ceph_set_super, client); 812 sb = sget(fs_type, compare_super, ceph_set_super, fsc);
1008 if (IS_ERR(sb)) { 813 if (IS_ERR(sb)) {
1009 err = PTR_ERR(sb); 814 err = PTR_ERR(sb);
1010 goto out; 815 goto out;
1011 } 816 }
1012 817
1013 if (ceph_sb_to_client(sb) != client) { 818 if (ceph_sb_to_client(sb) != fsc) {
1014 ceph_destroy_client(client); 819 ceph_mdsc_destroy(fsc);
1015 client = ceph_sb_to_client(sb); 820 destroy_fs_client(fsc);
1016 dout("get_sb got existing client %p\n", client); 821 fsc = ceph_sb_to_client(sb);
822 dout("get_sb got existing client %p\n", fsc);
1017 } else { 823 } else {
1018 dout("get_sb using new client %p\n", client); 824 dout("get_sb using new client %p\n", fsc);
1019 err = ceph_register_bdi(sb, client); 825 err = ceph_register_bdi(sb, fsc);
1020 if (err < 0) 826 if (err < 0)
1021 goto out_splat; 827 goto out_splat;
1022 } 828 }
1023 829
1024 err = ceph_mount(client, mnt, path); 830 err = ceph_mount(fsc, mnt, path);
1025 if (err < 0) 831 if (err < 0)
1026 goto out_splat; 832 goto out_splat;
1027 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, 833 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
@@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type,
1029 return 0; 835 return 0;
1030 836
1031out_splat: 837out_splat:
1032 ceph_mdsc_close_sessions(&client->mdsc); 838 ceph_mdsc_close_sessions(fsc->mdsc);
1033 deactivate_locked_super(sb); 839 deactivate_locked_super(sb);
1034 goto out_final; 840 goto out_final;
1035 841
1036out: 842out:
1037 ceph_destroy_client(client); 843 ceph_mdsc_destroy(fsc);
844 destroy_fs_client(fsc);
1038out_final: 845out_final:
1039 dout("ceph_get_sb fail %d\n", err); 846 dout("ceph_get_sb fail %d\n", err);
1040 return err; 847 return err;
@@ -1042,11 +849,12 @@ out_final:
1042 849
1043static void ceph_kill_sb(struct super_block *s) 850static void ceph_kill_sb(struct super_block *s)
1044{ 851{
1045 struct ceph_client *client = ceph_sb_to_client(s); 852 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
1046 dout("kill_sb %p\n", s); 853 dout("kill_sb %p\n", s);
1047 ceph_mdsc_pre_umount(&client->mdsc); 854 ceph_mdsc_pre_umount(fsc->mdsc);
1048 kill_anon_super(s); /* will call put_super after sb is r/o */ 855 kill_anon_super(s); /* will call put_super after sb is r/o */
1049 ceph_destroy_client(client); 856 ceph_mdsc_destroy(fsc);
857 destroy_fs_client(fsc);
1050} 858}
1051 859
1052static struct file_system_type ceph_fs_type = { 860static struct file_system_type ceph_fs_type = {
@@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = {
1062 870
1063static int __init init_ceph(void) 871static int __init init_ceph(void)
1064{ 872{
1065 int ret = 0; 873 int ret = init_caches();
1066
1067 ret = ceph_debugfs_init();
1068 if (ret < 0)
1069 goto out;
1070
1071 ret = ceph_msgr_init();
1072 if (ret < 0)
1073 goto out_debugfs;
1074
1075 ret = init_caches();
1076 if (ret) 874 if (ret)
1077 goto out_msgr; 875 goto out;
1078 876
1079 ret = register_filesystem(&ceph_fs_type); 877 ret = register_filesystem(&ceph_fs_type);
1080 if (ret) 878 if (ret)
1081 goto out_icache; 879 goto out_icache;
1082 880
1083 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", 881 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1084 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, 882
1085 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1086 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1087 return 0; 883 return 0;
1088 884
1089out_icache: 885out_icache:
1090 destroy_caches(); 886 destroy_caches();
1091out_msgr:
1092 ceph_msgr_exit();
1093out_debugfs:
1094 ceph_debugfs_cleanup();
1095out: 887out:
1096 return ret; 888 return ret;
1097} 889}
@@ -1101,8 +893,6 @@ static void __exit exit_ceph(void)
1101 dout("exit_ceph\n"); 893 dout("exit_ceph\n");
1102 unregister_filesystem(&ceph_fs_type); 894 unregister_filesystem(&ceph_fs_type);
1103 destroy_caches(); 895 destroy_caches();
1104 ceph_msgr_exit();
1105 ceph_debugfs_cleanup();
1106} 896}
1107 897
1108module_init(init_ceph); 898module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c33897ae5725..1886294e12f7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
1#ifndef _FS_CEPH_SUPER_H 1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H 2#define _FS_CEPH_SUPER_H
3 3
4#include "ceph_debug.h" 4#include <linux/ceph/ceph_debug.h>
5 5
6#include <asm/unaligned.h> 6#include <asm/unaligned.h>
7#include <linux/backing-dev.h> 7#include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include "types.h" 17#include <linux/ceph/libceph.h>
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24 18
25/* f_type in struct statfs */ 19/* f_type in struct statfs */
26#define CEPH_SUPER_MAGIC 0x00c36400 20#define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
30#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ 24#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 25#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32 26
33/* 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
34 * Supported features 28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
35 */ 29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
36#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
37#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR
38 30
39/* 31#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
40 * mount options
41 */
42#define CEPH_OPT_FSID (1<<0)
43#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
44#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
45#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
46#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
47#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
48#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
49 32
50#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) 33#define ceph_set_mount_opt(fsc, opt) \
34 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
35#define ceph_test_mount_opt(fsc, opt) \
36 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
51 37
52#define ceph_set_opt(client, opt) \ 38#define CEPH_MAX_READDIR_DEFAULT 1024
53 (client)->mount_args->flags |= CEPH_OPT_##opt; 39#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
54#define ceph_test_opt(client, opt) \ 40#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
55 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
56 41
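The renamed macros reflect the new split: generic options now hang off the libceph ceph_client (see the ceph_test_opt(fsc->client, NOSHARE) call in super.c above), while fs-only flags stay in ceph_mount_options. A minimal usage sketch follows; apply_dirstat_option() is hypothetical, and DIRSTAT is one of the CEPH_MOUNT_OPT_* flags defined above:

	/* Hypothetical helper: the macros paste CEPH_MOUNT_OPT_##opt,
	 * so callers name the flag without its prefix. */
	static void apply_dirstat_option(struct ceph_fs_client *fsc, bool on)
	{
		if (on)
			ceph_set_mount_opt(fsc, DIRSTAT);
		if (ceph_test_mount_opt(fsc, DIRSTAT))
			pr_info("ceph: dirstat enabled\n");
	}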
57 42struct ceph_mount_options {
58struct ceph_mount_args {
59 int sb_flags;
60 int flags; 43 int flags;
61 struct ceph_fsid fsid; 44 int sb_flags;
62 struct ceph_entity_addr my_addr; 45
63 int num_mon;
64 struct ceph_entity_addr *mon_addr;
65 int mount_timeout;
66 int osd_idle_ttl;
67 int osd_timeout;
68 int osd_keepalive_timeout;
69 int wsize; 46 int wsize;
70 int rsize; /* max readahead */ 47 int rsize; /* max readahead */
71 int congestion_kb; /* max writeback in flight */ 48 int congestion_kb; /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
73 int cap_release_safety; 50 int cap_release_safety;
74 int max_readdir; /* max readdir result (entries) */ 51 int max_readdir; /* max readdir result (entries) */
75 int max_readdir_bytes; /* max readdir result (bytes) */ 52 int max_readdir_bytes; /* max readdir result (bytes) */
76 char *snapdir_name; /* default ".snap" */
77 char *name;
78 char *secret;
79};
80 53
81/* 54 /*
82 * defaults 55 * everything above this point can be memcmp'd; everything below
83 */ 56 * is handled in compare_mount_options()
84#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 57 */
85#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
86#define CEPH_OSD_KEEPALIVE_DEFAULT 5
87#define CEPH_OSD_IDLE_TTL_DEFAULT 60
88#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
89#define CEPH_MAX_READDIR_DEFAULT 1024
90#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
91
92#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
93#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
94
95#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
96#define CEPH_AUTH_NAME_DEFAULT "guest"
97/*
98 * Delay telling the MDS we no longer want caps, in case we reopen
99 * the file. Delay a minimum amount of time, even if we send a cap
100 * message for some other reason. Otherwise, take the opportunity to
101 * update the mds to avoid sending another message later.
102 */
103#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
104#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
105
106#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
107
108/* mount state */
109enum {
110 CEPH_MOUNT_MOUNTING,
111 CEPH_MOUNT_MOUNTED,
112 CEPH_MOUNT_UNMOUNTING,
113 CEPH_MOUNT_UNMOUNTED,
114 CEPH_MOUNT_SHUTDOWN,
115};
116
117/*
118 * subtract jiffies
119 */
120static inline unsigned long time_sub(unsigned long a, unsigned long b)
121{
122 BUG_ON(time_after(b, a));
123 return (long)a - (long)b;
124}
125
126/*
127 * per-filesystem client state
128 *
129 * possibly shared by multiple mount points, if they are
130 * mounting the same ceph filesystem/cluster.
131 */
132struct ceph_client {
133 struct ceph_fsid fsid;
134 bool have_fsid;
135 58
136 struct mutex mount_mutex; /* serialize mount attempts */ 59 char *snapdir_name; /* default ".snap" */
137 struct ceph_mount_args *mount_args; 60};
138 61
62struct ceph_fs_client {
139 struct super_block *sb; 63 struct super_block *sb;
140 64
141 unsigned long mount_state; 65 struct ceph_mount_options *mount_options;
142 wait_queue_head_t auth_wq; 66 struct ceph_client *client;
143
144 int auth_err;
145 67
68 unsigned long mount_state;
146 int min_caps; /* min caps I added */ 69 int min_caps; /* min caps I added */
147 70
148 struct ceph_messenger *msgr; /* messenger instance */ 71 struct ceph_mds_client *mdsc;
149 struct ceph_mon_client monc;
150 struct ceph_mds_client mdsc;
151 struct ceph_osd_client osdc;
152 72
153 /* writeback */ 73 /* writeback */
154 mempool_t *wb_pagevec_pool; 74 mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
160 struct backing_dev_info backing_dev_info; 80 struct backing_dev_info backing_dev_info;
161 81
162#ifdef CONFIG_DEBUG_FS 82#ifdef CONFIG_DEBUG_FS
163 struct dentry *debugfs_monmap; 83 struct dentry *debugfs_dentry_lru, *debugfs_caps;
164 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
165 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
166 struct dentry *debugfs_congestion_kb; 84 struct dentry *debugfs_congestion_kb;
167 struct dentry *debugfs_bdi; 85 struct dentry *debugfs_bdi;
86 struct dentry *debugfs_mdsc, *debugfs_mdsmap;
168#endif 87#endif
169}; 88};
170 89
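compare_mount_options() itself is not in this hunk; per the call in ceph_compare_super() it takes the new fsopt, the libceph options, and the existing fs client. Going only by the layout comment in struct ceph_mount_options, the flat-field part of it plausibly reduces to a sketch like this (hypothetical; the real body lives elsewhere in super.c):

	/* Sketch under stated assumptions: memcmp the flat prefix of the
	 * options struct, then compare the string members individually. */
	static int compare_fsopt_sketch(struct ceph_mount_options *a,
					struct ceph_mount_options *b)
	{
		int ret = memcmp(a, b, offsetof(struct ceph_mount_options,
						snapdir_name));
		if (ret)
			return ret;
		return strcmp(a->snapdir_name, b->snapdir_name);
	}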
90
171/* 91/*
172 * File i/o capability. This tracks shared state with the metadata 92 * File i/o capability. This tracks shared state with the metadata
173 * server that allows us to cache or writeback attributes or to read 93 * server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
275 int should_free_val; 195 int should_free_val;
276}; 196};
277 197
198/*
199 * Ceph dentry state
200 */
201struct ceph_dentry_info {
202 struct ceph_mds_session *lease_session;
203 u32 lease_gen, lease_shared_gen;
204 u32 lease_seq;
205 unsigned long lease_renew_after, lease_renew_from;
206 struct list_head lru;
207 struct dentry *dentry;
208 u64 time;
209 u64 offset;
210};
211
278struct ceph_inode_xattrs_info { 212struct ceph_inode_xattrs_info {
279 /* 213 /*
280 * (still encoded) xattr blob. we avoid the overhead of parsing 214 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
296/* 230/*
297 * Ceph inode. 231 * Ceph inode.
298 */ 232 */
299#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
300#define CEPH_I_NODELAY 4 /* do not delay cap release */
301#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
302#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
303
304struct ceph_inode_info { 233struct ceph_inode_info {
305 struct ceph_vino i_vino; /* ceph ino + snap */ 234 struct ceph_vino i_vino; /* ceph ino + snap */
306 235
@@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
391 return container_of(inode, struct ceph_inode_info, vfs_inode); 320 return container_of(inode, struct ceph_inode_info, vfs_inode);
392} 321}
393 322
323static inline struct ceph_vino ceph_vino(struct inode *inode)
324{
325 return ceph_inode(inode)->i_vino;
326}
327
328/*
329 * ino_t is <64 bits on many architectures, blech.
330 *
331 * don't include snap in ino hash, at least for now.
332 */
333static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
334{
335 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
336#if BITS_PER_LONG == 32
337 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
338 if (!ino)
339 ino = 1;
340#endif
341 return ino;
342}
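On 32-bit builds, ceph_vino_to_ino() folds the high half of the 64-bit ino into the low half, so distinct inos still tend to differ after truncation, and it maps the result away from 0. The same fold as a standalone userspace illustration (value chosen arbitrarily):

	#include <stdint.h>
	#include <stdio.h>

	/* Illustration only: mirrors the BITS_PER_LONG == 32 branch. */
	static uint32_t fold_ino(uint64_t ino64)
	{
		uint32_t ino = (uint32_t)ino64;	/* truncate to 32 bits */
		ino ^= ino64 >> 32;		/* mix in the high half */
		return ino ? ino : 1;		/* never hand back 0 */
	}

	int main(void)
	{
		/* 0x1234567800000005 -> 0x00000005 ^ 0x12345678 = 0x1234567d */
		printf("%#x\n", fold_ino(0x1234567800000005ULL));
		return 0;
	}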
343
344/* for printf-style formatting */
345#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
346
347static inline u64 ceph_ino(struct inode *inode)
348{
349 return ceph_inode(inode)->i_vino.ino;
350}
351static inline u64 ceph_snap(struct inode *inode)
352{
353 return ceph_inode(inode)->i_vino.snap;
354}
355
356static inline int ceph_ino_compare(struct inode *inode, void *data)
357{
358 struct ceph_vino *pvino = (struct ceph_vino *)data;
359 struct ceph_inode_info *ci = ceph_inode(inode);
360 return ci->i_vino.ino == pvino->ino &&
361 ci->i_vino.snap == pvino->snap;
362}
363
364static inline struct inode *ceph_find_inode(struct super_block *sb,
365 struct ceph_vino vino)
366{
367 ino_t t = ceph_vino_to_ino(vino);
368 return ilookup5(sb, t, ceph_ino_compare, &vino);
369}
370
371
372/*
373 * Ceph inode.
374 */
375#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
376#define CEPH_I_NODELAY 4 /* do not delay cap release */
377#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
378#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
379
394static inline void ceph_i_clear(struct inode *inode, unsigned mask) 380static inline void ceph_i_clear(struct inode *inode, unsigned mask)
395{ 381{
396 struct ceph_inode_info *ci = ceph_inode(inode); 382 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -414,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
414 struct ceph_inode_info *ci = ceph_inode(inode); 400 struct ceph_inode_info *ci = ceph_inode(inode);
415 bool r; 401 bool r;
416 402
417 smp_mb(); 403 spin_lock(&inode->i_lock);
418 r = (ci->i_ceph_flags & mask) == mask; 404 r = (ci->i_ceph_flags & mask) == mask;
405 spin_unlock(&inode->i_lock);
419 return r; 406 return r;
420} 407}
421 408
@@ -432,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
432 struct ceph_inode_frag *pfrag, 419 struct ceph_inode_frag *pfrag,
433 int *found); 420 int *found);
434 421
435/*
436 * Ceph dentry state
437 */
438struct ceph_dentry_info {
439 struct ceph_mds_session *lease_session;
440 u32 lease_gen, lease_shared_gen;
441 u32 lease_seq;
442 unsigned long lease_renew_after, lease_renew_from;
443 struct list_head lru;
444 struct dentry *dentry;
445 u64 time;
446 u64 offset;
447};
448
449static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) 422static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
450{ 423{
451 return (struct ceph_dentry_info *)dentry->d_fsdata; 424 return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
456 return ((loff_t)frag << 32) | (loff_t)off; 429 return ((loff_t)frag << 32) | (loff_t)off;
457} 430}
458 431
459/*
460 * ino_t is <64 bits on many architectures, blech.
461 *
462 * don't include snap in ino hash, at least for now.
463 */
464static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
465{
466 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
467#if BITS_PER_LONG == 32
468 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
469 if (!ino)
470 ino = 1;
471#endif
472 return ino;
473}
474
475static inline int ceph_set_ino_cb(struct inode *inode, void *data) 432static inline int ceph_set_ino_cb(struct inode *inode, void *data)
476{ 433{
477 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; 434 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
479 return 0; 436 return 0;
480} 437}
481 438
482static inline struct ceph_vino ceph_vino(struct inode *inode)
483{
484 return ceph_inode(inode)->i_vino;
485}
486
487/* for printf-style formatting */
488#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
489
490static inline u64 ceph_ino(struct inode *inode)
491{
492 return ceph_inode(inode)->i_vino.ino;
493}
494static inline u64 ceph_snap(struct inode *inode)
495{
496 return ceph_inode(inode)->i_vino.snap;
497}
498
499static inline int ceph_ino_compare(struct inode *inode, void *data)
500{
501 struct ceph_vino *pvino = (struct ceph_vino *)data;
502 struct ceph_inode_info *ci = ceph_inode(inode);
503 return ci->i_vino.ino == pvino->ino &&
504 ci->i_vino.snap == pvino->snap;
505}
506
507static inline struct inode *ceph_find_inode(struct super_block *sb,
508 struct ceph_vino vino)
509{
510 ino_t t = ceph_vino_to_ino(vino);
511 return ilookup5(sb, t, ceph_ino_compare, &vino);
512}
513
514
515/* 439/*
516 * caps helpers 440 * caps helpers
517 */ 441 */
@@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
576 struct ceph_cap_reservation *ctx, int need); 500 struct ceph_cap_reservation *ctx, int need);
577extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 501extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
578 struct ceph_cap_reservation *ctx); 502 struct ceph_cap_reservation *ctx);
579extern void ceph_reservation_status(struct ceph_client *client, 503extern void ceph_reservation_status(struct ceph_fs_client *client,
580 int *total, int *avail, int *used, 504 int *total, int *avail, int *used,
581 int *reserved, int *min); 505 int *reserved, int *min);
582 506
583static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) 507static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
584{ 508{
585 return (struct ceph_client *)inode->i_sb->s_fs_info; 509 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
586} 510}
587 511
588static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) 512static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
589{ 513{
590 return (struct ceph_client *)sb->s_fs_info; 514 return (struct ceph_fs_client *)sb->s_fs_info;
591} 515}
592 516
593 517
@@ -617,51 +541,6 @@ struct ceph_file_info {
617 541
618 542
619/* 543/*
620 * snapshots
621 */
622
623/*
624 * A "snap context" is the set of existing snapshots when we
625 * write data. It is used by the OSD to guide its COW behavior.
626 *
627 * The ceph_snap_context is refcounted, and attached to each dirty
628 * page, indicating which context the dirty data belonged when it was
629 * dirtied.
630 */
631struct ceph_snap_context {
632 atomic_t nref;
633 u64 seq;
634 int num_snaps;
635 u64 snaps[];
636};
637
638static inline struct ceph_snap_context *
639ceph_get_snap_context(struct ceph_snap_context *sc)
640{
641 /*
642 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
643 atomic_read(&sc->nref)+1);
644 */
645 if (sc)
646 atomic_inc(&sc->nref);
647 return sc;
648}
649
650static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
651{
652 if (!sc)
653 return;
654 /*
655 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
656 atomic_read(&sc->nref)-1);
657 */
658 if (atomic_dec_and_test(&sc->nref)) {
659 /*printk(" deleting snap_context %p\n", sc);*/
660 kfree(sc);
661 }
662}
663
664/*
665 * A "snap realm" describes a subset of the file hierarchy sharing 544 * A "snap realm" describes a subset of the file hierarchy sharing
666 * the same set of snapshots that apply to it. The realms themselves 545 * the same set of snapshots that apply to it. The realms themselves
667 * are organized into a hierarchy, such that children inherit (some of) 546 * are organized into a hierarchy, such that children inherit (some of)
@@ -690,6 +569,8 @@ struct ceph_snap_realm {
690 569
691 struct list_head empty_item; /* if i have ref==0 */ 570 struct list_head empty_item; /* if i have ref==0 */
692 571
572 struct list_head dirty_item; /* if realm needs new context */
573
693 /* the current set of snaps for this realm */ 574 /* the current set of snaps for this realm */
694 struct ceph_snap_context *cached_context; 575 struct ceph_snap_context *cached_context;
695 576
@@ -697,16 +578,33 @@ struct ceph_snap_realm {
697 spinlock_t inodes_with_caps_lock; 578 spinlock_t inodes_with_caps_lock;
698}; 579};
699 580
700 581static inline int default_congestion_kb(void)
701
702/*
703 * calculate the number of pages a given length and offset map onto,
704 * if we align the data.
705 */
706static inline int calc_pages_for(u64 off, u64 len)
707{ 582{
708 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - 583 int congestion_kb;
709 (off >> PAGE_CACHE_SHIFT); 584
585 /*
586 * Copied from NFS
587 *
588 * congestion size, scale with available memory.
589 *
590 * 64MB: 8192k
591 * 128MB: 11585k
592 * 256MB: 16384k
593 * 512MB: 23170k
594 * 1GB: 32768k
595 * 2GB: 46340k
596 * 4GB: 65536k
597 * 8GB: 92681k
598 * 16GB: 131072k
599 *
600 * This allows larger machines to have larger/more transfers.
601 * Limit the default to 256M
602 */
603 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
604 if (congestion_kb > 256*1024)
605 congestion_kb = 256*1024;
606
607 return congestion_kb;
710} 608}
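The table in the comment falls straight out of the formula; below is a standalone illustration assuming 4 KiB pages (PAGE_SHIFT == 12, so the shift is 2; build with -lm). The kernel's int_sqrt() truncates, so a couple of rows land ~1k under the table, which was written with rounded square roots:

	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned long mb;

		for (mb = 64; mb <= 16384; mb *= 2) {
			unsigned long pages = mb * 1024 / 4; /* 4 KiB pages */
			unsigned long kb =
				(16 * (unsigned long)sqrt(pages)) << 2;

			if (kb > 256 * 1024)	/* cap the default at 256M */
				kb = 256 * 1024;
			printf("%6luMB: %luk\n", mb, kb);
		}
		return 0;
	}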
711 609
712 610
@@ -739,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
739 ci_item)->writing; 637 ci_item)->writing;
740} 638}
741 639
742
743/* super.c */
744extern struct kmem_cache *ceph_inode_cachep;
745extern struct kmem_cache *ceph_cap_cachep;
746extern struct kmem_cache *ceph_dentry_cachep;
747extern struct kmem_cache *ceph_file_cachep;
748
749extern const char *ceph_msg_type_name(int type);
750extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
751
752/* inode.c */ 640/* inode.c */
753extern const struct inode_operations ceph_file_iops; 641extern const struct inode_operations ceph_file_iops;
754 642
@@ -826,7 +714,8 @@ extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
826extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, 714extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
827 struct ceph_snap_context *snapc); 715 struct ceph_snap_context *snapc);
828extern void __ceph_flush_snaps(struct ceph_inode_info *ci, 716extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
829 struct ceph_mds_session **psession); 717 struct ceph_mds_session **psession,
718 int again);
830extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, 719extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
831 struct ceph_mds_session *session); 720 struct ceph_mds_session *session);
832extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); 721extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
@@ -854,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
854/* file.c */ 743/* file.c */
855extern const struct file_operations ceph_file_fops; 744extern const struct file_operations ceph_file_fops;
856extern const struct address_space_operations ceph_aops; 745extern const struct address_space_operations ceph_aops;
746extern int ceph_copy_to_page_vector(struct page **pages,
747 const char *data,
748 loff_t off, size_t len);
749extern int ceph_copy_from_page_vector(struct page **pages,
750 char *data,
751 loff_t off, size_t len);
752extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
857extern int ceph_open(struct inode *inode, struct file *file); 753extern int ceph_open(struct inode *inode, struct file *file);
858extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, 754extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
859 struct nameidata *nd, int mode, 755 struct nameidata *nd, int mode,
860 int locked_dir); 756 int locked_dir);
861extern int ceph_release(struct inode *inode, struct file *filp); 757extern int ceph_release(struct inode *inode, struct file *filp);
862extern void ceph_release_page_vector(struct page **pages, int num_pages);
863 758
864/* dir.c */ 759/* dir.c */
865extern const struct file_operations ceph_dir_fops; 760extern const struct file_operations ceph_dir_fops;
@@ -889,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
889/* export.c */ 784/* export.c */
890extern const struct export_operations ceph_export_ops; 785extern const struct export_operations ceph_export_ops;
891 786
892/* debugfs.c */
893extern int ceph_debugfs_init(void);
894extern void ceph_debugfs_cleanup(void);
895extern int ceph_debugfs_client_init(struct ceph_client *client);
896extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
897
898/* locks.c */ 787/* locks.c */
899extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); 788extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
900extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); 789extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -911,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
911 return NULL; 800 return NULL;
912} 801}
913 802
803/* debugfs.c */
804extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
805extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
806
914#endif /* _FS_CEPH_SUPER_H */ 807#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
deleted file mode 100644
index 28b35a005ec2..000000000000
--- a/fs/ceph/types.h
+++ /dev/null
@@ -1,29 +0,0 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9578af610b73..6e12a6ba5f79 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2
2#include "super.h" 3#include "super.h"
3#include "decode.h" 4#include "mds_client.h"
5
6#include <linux/ceph/decode.h>
4 7
5#include <linux/xattr.h> 8#include <linux/xattr.h>
6#include <linux/slab.h> 9#include <linux/slab.h>
@@ -620,12 +623,12 @@ out:
620static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 623static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
621 const char *value, size_t size, int flags) 624 const char *value, size_t size, int flags)
622{ 625{
623 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 626 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
624 struct inode *inode = dentry->d_inode; 627 struct inode *inode = dentry->d_inode;
625 struct ceph_inode_info *ci = ceph_inode(inode); 628 struct ceph_inode_info *ci = ceph_inode(inode);
626 struct inode *parent_inode = dentry->d_parent->d_inode; 629 struct inode *parent_inode = dentry->d_parent->d_inode;
627 struct ceph_mds_request *req; 630 struct ceph_mds_request *req;
628 struct ceph_mds_client *mdsc = &client->mdsc; 631 struct ceph_mds_client *mdsc = fsc->mdsc;
629 int err; 632 int err;
630 int i, nr_pages; 633 int i, nr_pages;
631 struct page **pages = NULL; 634 struct page **pages = NULL;
@@ -713,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
713 716
714 /* preallocate memory for xattr name, value, index node */ 717 /* preallocate memory for xattr name, value, index node */
715 err = -ENOMEM; 718 err = -ENOMEM;
716 newname = kmalloc(name_len + 1, GFP_NOFS); 719 newname = kmemdup(name, name_len + 1, GFP_NOFS);
717 if (!newname) 720 if (!newname)
718 goto out; 721 goto out;
719 memcpy(newname, name, name_len + 1);
720 722
721 if (val_len) { 723 if (val_len) {
722 newval = kmalloc(val_len + 1, GFP_NOFS); 724 newval = kmalloc(val_len + 1, GFP_NOFS);
@@ -777,8 +779,8 @@ out:
777 779
778static int ceph_send_removexattr(struct dentry *dentry, const char *name) 780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
779{ 781{
780 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 782 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
781 struct ceph_mds_client *mdsc = &client->mdsc; 783 struct ceph_mds_client *mdsc = fsc->mdsc;
782 struct inode *inode = dentry->d_inode; 784 struct inode *inode = dentry->d_inode;
783 struct inode *parent_inode = dentry->d_parent->d_inode; 785 struct inode *parent_inode = dentry->d_parent->d_inode;
784 struct ceph_mds_request *req; 786 struct ceph_mds_request *req;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index f80a4f25123c..e5b9df993b93 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -40,7 +40,9 @@ struct backing_dev_info directly_mappable_cdev_bdi = {
40#endif 40#endif
41 /* permit direct mmap, for read, write or exec */ 41 /* permit direct mmap, for read, write or exec */
42 BDI_CAP_MAP_DIRECT | 42 BDI_CAP_MAP_DIRECT |
43 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP), 43 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
44 /* no writeback happens */
45 BDI_CAP_NO_ACCT_AND_WRITEBACK),
44}; 46};
45 47
46static struct kobj_map *cdev_map; 48static struct kobj_map *cdev_map;
@@ -454,6 +456,7 @@ static void cdev_purge(struct cdev *cdev)
454 */ 456 */
455const struct file_operations def_chr_fops = { 457const struct file_operations def_chr_fops = {
456 .open = chrdev_open, 458 .open = chrdev_open,
459 .llseek = noop_llseek,
457}; 460};
458 461
459static struct kobject *exact_match(dev_t dev, int *part, void *data) 462static struct kobject *exact_match(dev_t dev, int *part, void *data)
diff --git a/fs/cifs/README b/fs/cifs/README
index 7099a526f775..ee68d1036544 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -527,6 +527,11 @@ A partial list of the supported mount options follows:
527 SFU does). In the future the bottom 9 bits of the 527 SFU does). In the future the bottom 9 bits of the
528 mode also will be emulated using queries of the security 528 mode also will be emulated using queries of the security
529 descriptor (ACL). 529 descriptor (ACL).
530 mfsymlinks Enable support for Minshall+French symlinks
531 (see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
532 This option is ignored when specified together with the
533 'sfu' option. Minshall+French symlinks are used even if
534 the server supports the CIFS Unix Extensions.
530 sign Must use packet signing (helps avoid unwanted data modification 535 sign Must use packet signing (helps avoid unwanted data modification
531 by intermediate systems in the route). Note that signing 536 by intermediate systems in the route). Note that signing
532 does not work with lanman or plaintext authentication. 537 does not work with lanman or plaintext authentication.
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index eb1ba493489f..103ab8b605b0 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -148,7 +148,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
148 seq_printf(m, "Servers:"); 148 seq_printf(m, "Servers:");
149 149
150 i = 0; 150 i = 0;
151 read_lock(&cifs_tcp_ses_lock); 151 spin_lock(&cifs_tcp_ses_lock);
152 list_for_each(tmp1, &cifs_tcp_ses_list) { 152 list_for_each(tmp1, &cifs_tcp_ses_list) {
153 server = list_entry(tmp1, struct TCP_Server_Info, 153 server = list_entry(tmp1, struct TCP_Server_Info,
154 tcp_ses_list); 154 tcp_ses_list);
@@ -230,7 +230,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
230 spin_unlock(&GlobalMid_Lock); 230 spin_unlock(&GlobalMid_Lock);
231 } 231 }
232 } 232 }
233 read_unlock(&cifs_tcp_ses_lock); 233 spin_unlock(&cifs_tcp_ses_lock);
234 seq_putc(m, '\n'); 234 seq_putc(m, '\n');
235 235
236 /* BB add code to dump additional info such as TCP session info now */ 236 /* BB add code to dump additional info such as TCP session info now */
@@ -270,7 +270,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
270 atomic_set(&totBufAllocCount, 0); 270 atomic_set(&totBufAllocCount, 0);
271 atomic_set(&totSmBufAllocCount, 0); 271 atomic_set(&totSmBufAllocCount, 0);
272#endif /* CONFIG_CIFS_STATS2 */ 272#endif /* CONFIG_CIFS_STATS2 */
273 read_lock(&cifs_tcp_ses_lock); 273 spin_lock(&cifs_tcp_ses_lock);
274 list_for_each(tmp1, &cifs_tcp_ses_list) { 274 list_for_each(tmp1, &cifs_tcp_ses_list) {
275 server = list_entry(tmp1, struct TCP_Server_Info, 275 server = list_entry(tmp1, struct TCP_Server_Info,
276 tcp_ses_list); 276 tcp_ses_list);
@@ -303,7 +303,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
303 } 303 }
304 } 304 }
305 } 305 }
306 read_unlock(&cifs_tcp_ses_lock); 306 spin_unlock(&cifs_tcp_ses_lock);
307 } 307 }
308 308
309 return count; 309 return count;
@@ -343,7 +343,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
343 GlobalCurrentXid, GlobalMaxActiveXid); 343 GlobalCurrentXid, GlobalMaxActiveXid);
344 344
345 i = 0; 345 i = 0;
346 read_lock(&cifs_tcp_ses_lock); 346 spin_lock(&cifs_tcp_ses_lock);
347 list_for_each(tmp1, &cifs_tcp_ses_list) { 347 list_for_each(tmp1, &cifs_tcp_ses_list) {
348 server = list_entry(tmp1, struct TCP_Server_Info, 348 server = list_entry(tmp1, struct TCP_Server_Info,
349 tcp_ses_list); 349 tcp_ses_list);
@@ -397,7 +397,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
397 } 397 }
398 } 398 }
399 } 399 }
400 read_unlock(&cifs_tcp_ses_lock); 400 spin_unlock(&cifs_tcp_ses_lock);
401 401
402 seq_putc(m, '\n'); 402 seq_putc(m, '\n');
403 return 0; 403 return 0;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index aa316891ac0c..8942b28cf807 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -34,7 +34,7 @@ void cifs_dump_mids(struct TCP_Server_Info *);
34extern int traceSMB; /* flag which enables the function below */ 34extern int traceSMB; /* flag which enables the function below */
35void dump_smb(struct smb_hdr *, int); 35void dump_smb(struct smb_hdr *, int);
36#define CIFS_INFO 0x01 36#define CIFS_INFO 0x01
37#define CIFS_RC 0x02 37#define CIFS_RC 0x02
38#define CIFS_TIMER 0x04 38#define CIFS_TIMER 0x04
39 39
40/* 40/*
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d6ced7aa23cf..c68a056f27fd 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -44,8 +44,7 @@ static void cifs_dfs_expire_automounts(struct work_struct *work)
44void cifs_dfs_release_automount_timer(void) 44void cifs_dfs_release_automount_timer(void)
45{ 45{
46 BUG_ON(!list_empty(&cifs_dfs_automount_list)); 46 BUG_ON(!list_empty(&cifs_dfs_automount_list));
47 cancel_delayed_work(&cifs_dfs_automount_task); 47 cancel_delayed_work_sync(&cifs_dfs_automount_task);
48 flush_scheduled_work();
49} 48}
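The switch matters because cancel_delayed_work_sync() both cancels a pending timer and waits out an instance that is already running, which is what makes the follow-up flush_scheduled_work() redundant. The general pattern, with hypothetical demo_* names:

	/* Illustration only (demo_* names are hypothetical). */
	static void demo_fn(struct work_struct *work)
	{
		/* periodic expiry work would run here */
	}
	static DECLARE_DELAYED_WORK(demo_task, demo_fn);

	static void demo_stop(void)
	{
		cancel_delayed_work_sync(&demo_task);	/* cancel + wait */
	}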
50 49
51/** 50/**
@@ -306,6 +305,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
306 int xid, i; 305 int xid, i;
307 int rc = 0; 306 int rc = 0;
308 struct vfsmount *mnt = ERR_PTR(-ENOENT); 307 struct vfsmount *mnt = ERR_PTR(-ENOENT);
308 struct tcon_link *tlink;
309 309
310 cFYI(1, "in %s", __func__); 310 cFYI(1, "in %s", __func__);
311 BUG_ON(IS_ROOT(dentry)); 311 BUG_ON(IS_ROOT(dentry));
@@ -315,14 +315,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
315 dput(nd->path.dentry); 315 dput(nd->path.dentry);
316 nd->path.dentry = dget(dentry); 316 nd->path.dentry = dget(dentry);
317 317
318 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
319 ses = cifs_sb->tcon->ses;
320
321 if (!ses) {
322 rc = -EINVAL;
323 goto out_err;
324 }
325
326 /* 318 /*
327 * The MSDFS spec states that paths in DFS referral requests and 319 * The MSDFS spec states that paths in DFS referral requests and
328 * responses must be prefixed by a single '\' character instead of 320 * responses must be prefixed by a single '\' character instead of
@@ -335,10 +327,20 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
335 goto out_err; 327 goto out_err;
336 } 328 }
337 329
338 rc = get_dfs_path(xid, ses , full_path + 1, cifs_sb->local_nls, 330 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
331 tlink = cifs_sb_tlink(cifs_sb);
332 if (IS_ERR(tlink)) {
333 rc = PTR_ERR(tlink);
334 goto out_err;
335 }
336 ses = tlink_tcon(tlink)->ses;
337
338 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
339 &num_referrals, &referrals, 339 &num_referrals, &referrals,
340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
341 341
342 cifs_put_tlink(tlink);
343
342 for (i = 0; i < num_referrals; i++) { 344 for (i = 0; i < num_referrals; i++) {
343 int len; 345 int len;
344 dump_referral(referrals+i); 346 dump_referral(referrals+i);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 9e771450c3b8..525ba59a4105 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,6 +15,8 @@
15 * the GNU Lesser General Public License for more details. 15 * the GNU Lesser General Public License for more details.
16 * 16 *
17 */ 17 */
18#include <linux/radix-tree.h>
19
18#ifndef _CIFS_FS_SB_H 20#ifndef _CIFS_FS_SB_H
19#define _CIFS_FS_SB_H 21#define _CIFS_FS_SB_H
20 22
@@ -36,23 +38,28 @@
36#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ 38#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */
37#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/ 39#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/
38#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ 40#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */
41#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */
42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
39 43
40struct cifs_sb_info { 44struct cifs_sb_info {
41 struct cifsTconInfo *tcon; /* primary mount */ 45 struct radix_tree_root tlink_tree;
42 struct list_head nested_tcon_q; 46#define CIFS_TLINK_MASTER_TAG 0 /* is "master" (mount) tcon */
47 spinlock_t tlink_tree_lock;
43 struct nls_table *local_nls; 48 struct nls_table *local_nls;
44 unsigned int rsize; 49 unsigned int rsize;
45 unsigned int wsize; 50 unsigned int wsize;
51 atomic_t active;
46 uid_t mnt_uid; 52 uid_t mnt_uid;
47 gid_t mnt_gid; 53 gid_t mnt_gid;
48 mode_t mnt_file_mode; 54 mode_t mnt_file_mode;
49 mode_t mnt_dir_mode; 55 mode_t mnt_dir_mode;
50 int mnt_cifs_flags; 56 unsigned int mnt_cifs_flags;
51 int prepathlen; 57 int prepathlen;
52 char *prepath; /* relative path under the share to mount to */ 58 char *prepath; /* relative path under the share to mount to */
53#ifdef CONFIG_CIFS_DFS_UPCALL 59#ifdef CONFIG_CIFS_DFS_UPCALL
54 char *mountdata; /* mount options received at mount time */ 60 char *mountdata; /* mount options received at mount time */
55#endif 61#endif
56 struct backing_dev_info bdi; 62 struct backing_dev_info bdi;
63 struct delayed_work prune_tlinks;
57}; 64};
58#endif /* _CIFS_FS_SB_H */ 65#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 85d7cf7ff2c8..c9b4792ae825 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -557,11 +557,16 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
557{ 557{
558 struct cifs_ntsd *pntsd = NULL; 558 struct cifs_ntsd *pntsd = NULL;
559 int xid, rc; 559 int xid, rc;
560 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
561
562 if (IS_ERR(tlink))
563 return NULL;
560 564
561 xid = GetXid(); 565 xid = GetXid();
562 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 566 rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
563 FreeXid(xid); 567 FreeXid(xid);
564 568
569 cifs_put_tlink(tlink);
565 570
566 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); 571 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
567 return pntsd; 572 return pntsd;
@@ -574,10 +579,16 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
574 int oplock = 0; 579 int oplock = 0;
575 int xid, rc; 580 int xid, rc;
576 __u16 fid; 581 __u16 fid;
582 struct cifsTconInfo *tcon;
583 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
584
585 if (IS_ERR(tlink))
586 return NULL;
577 587
588 tcon = tlink_tcon(tlink);
578 xid = GetXid(); 589 xid = GetXid();
579 590
580 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0, 591 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
581 &fid, &oplock, NULL, cifs_sb->local_nls, 592 &fid, &oplock, NULL, cifs_sb->local_nls,
582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 593 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
583 if (rc) { 594 if (rc) {
@@ -585,11 +596,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
585 goto out; 596 goto out;
586 } 597 }
587 598
588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 599 rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
589 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); 600 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
590 601
591 CIFSSMBClose(xid, cifs_sb->tcon, fid); 602 CIFSSMBClose(xid, tcon, fid);
592 out: 603 out:
604 cifs_put_tlink(tlink);
593 FreeXid(xid); 605 FreeXid(xid);
594 return pntsd; 606 return pntsd;
595} 607}
@@ -603,7 +615,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
603 struct cifsFileInfo *open_file = NULL; 615 struct cifsFileInfo *open_file = NULL;
604 616
605 if (inode) 617 if (inode)
606 open_file = find_readable_file(CIFS_I(inode)); 618 open_file = find_readable_file(CIFS_I(inode), true);
607 if (!open_file) 619 if (!open_file)
608 return get_cifs_acl_by_path(cifs_sb, path, pacllen); 620 return get_cifs_acl_by_path(cifs_sb, path, pacllen);
609 621
@@ -616,10 +628,15 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
616 struct cifs_ntsd *pnntsd, u32 acllen) 628 struct cifs_ntsd *pnntsd, u32 acllen)
617{ 629{
618 int xid, rc; 630 int xid, rc;
631 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
632
633 if (IS_ERR(tlink))
634 return PTR_ERR(tlink);
619 635
620 xid = GetXid(); 636 xid = GetXid();
621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 637 rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
622 FreeXid(xid); 638 FreeXid(xid);
639 cifs_put_tlink(tlink);
623 640
624 cFYI(DBG2, "SetCIFSACL rc = %d", rc); 641 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
625 return rc; 642 return rc;
@@ -631,10 +648,16 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
631 int oplock = 0; 648 int oplock = 0;
632 int xid, rc; 649 int xid, rc;
633 __u16 fid; 650 __u16 fid;
651 struct cifsTconInfo *tcon;
652 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
634 653
654 if (IS_ERR(tlink))
655 return PTR_ERR(tlink);
656
657 tcon = tlink_tcon(tlink);
635 xid = GetXid(); 658 xid = GetXid();
636 659
637 rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0, 660 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0,
638 &fid, &oplock, NULL, cifs_sb->local_nls, 661 &fid, &oplock, NULL, cifs_sb->local_nls,
639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 662 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
640 if (rc) { 663 if (rc) {
@@ -642,12 +665,13 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
642 goto out; 665 goto out;
643 } 666 }
644 667
645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 668 rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen);
646 cFYI(DBG2, "SetCIFSACL rc = %d", rc); 669 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
647 670
648 CIFSSMBClose(xid, cifs_sb->tcon, fid); 671 CIFSSMBClose(xid, tcon, fid);
649 out: 672out:
650 FreeXid(xid); 673 FreeXid(xid);
674 cifs_put_tlink(tlink);
651 return rc; 675 return rc;
652} 676}
653 677
@@ -661,7 +685,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
661 685
662 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); 686 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
663 687
664 open_file = find_readable_file(CIFS_I(inode)); 688 open_file = find_readable_file(CIFS_I(inode), true);
665 if (!open_file) 689 if (!open_file)
666 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); 690 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
667 691
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 35042d8f7338..7ac0056294cf 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -27,6 +27,7 @@
27#include "md5.h" 27#include "md5.h"
28#include "cifs_unicode.h" 28#include "cifs_unicode.h"
29#include "cifsproto.h" 29#include "cifsproto.h"
30#include "ntlmssp.h"
30#include <linux/ctype.h> 31#include <linux/ctype.h>
31#include <linux/random.h> 32#include <linux/random.h>
32 33
@@ -42,7 +43,7 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
42 unsigned char *p24); 43 unsigned char *p24);
43 44
44static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 45static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
45 const struct mac_key *key, char *signature) 46 const struct session_key *key, char *signature)
46{ 47{
47 struct MD5Context context; 48 struct MD5Context context;
48 49
@@ -78,7 +79,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
78 server->sequence_number++; 79 server->sequence_number++;
79 spin_unlock(&GlobalMid_Lock); 80 spin_unlock(&GlobalMid_Lock);
80 81
81 rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key, 82 rc = cifs_calculate_signature(cifs_pdu, &server->session_key,
82 smb_signature); 83 smb_signature);
83 if (rc) 84 if (rc)
84 memset(cifs_pdu->Signature.SecuritySignature, 0, 8); 85 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
@@ -89,7 +90,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
89} 90}
90 91
91static int cifs_calc_signature2(const struct kvec *iov, int n_vec, 92static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
92 const struct mac_key *key, char *signature) 93 const struct session_key *key, char *signature)
93{ 94{
94 struct MD5Context context; 95 struct MD5Context context;
95 int i; 96 int i;
@@ -145,7 +146,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
145 server->sequence_number++; 146 server->sequence_number++;
146 spin_unlock(&GlobalMid_Lock); 147 spin_unlock(&GlobalMid_Lock);
147 148
148 rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key, 149 rc = cifs_calc_signature2(iov, n_vec, &server->session_key,
149 smb_signature); 150 smb_signature);
150 if (rc) 151 if (rc)
151 memset(cifs_pdu->Signature.SecuritySignature, 0, 8); 152 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
@@ -156,14 +157,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
156} 157}
157 158
158int cifs_verify_signature(struct smb_hdr *cifs_pdu, 159int cifs_verify_signature(struct smb_hdr *cifs_pdu,
159 const struct mac_key *mac_key, 160 const struct session_key *session_key,
160 __u32 expected_sequence_number) 161 __u32 expected_sequence_number)
161{ 162{
162 unsigned int rc; 163 unsigned int rc;
163 char server_response_sig[8]; 164 char server_response_sig[8];
164 char what_we_think_sig_should_be[20]; 165 char what_we_think_sig_should_be[20];
165 166
166 if ((cifs_pdu == NULL) || (mac_key == NULL)) 167 if (cifs_pdu == NULL || session_key == NULL)
167 return -EINVAL; 168 return -EINVAL;
168 169
169 if (cifs_pdu->Command == SMB_COM_NEGOTIATE) 170 if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -192,7 +193,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
192 cpu_to_le32(expected_sequence_number); 193 cpu_to_le32(expected_sequence_number);
193 cifs_pdu->Signature.Sequence.Reserved = 0; 194 cifs_pdu->Signature.Sequence.Reserved = 0;
194 195
195 rc = cifs_calculate_signature(cifs_pdu, mac_key, 196 rc = cifs_calculate_signature(cifs_pdu, session_key,
196 what_we_think_sig_should_be); 197 what_we_think_sig_should_be);
197 198
198 if (rc) 199 if (rc)
@@ -209,7 +210,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
209} 210}
210 211
211/* We fill in key by putting in 40 byte array which was allocated by caller */ 212/* We fill in key by putting in 40 byte array which was allocated by caller */
212int cifs_calculate_mac_key(struct mac_key *key, const char *rn, 213int cifs_calculate_session_key(struct session_key *key, const char *rn,
213 const char *password) 214 const char *password)
214{ 215{
215 char temp_key[16]; 216 char temp_key[16];
@@ -262,6 +263,148 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
262} 263}
263#endif /* CIFS_WEAK_PW_HASH */ 264#endif /* CIFS_WEAK_PW_HASH */
264 265
266/* Build a proper attribute value/target info pairs blob.
267 * Fill in the netbios and dns domain names, the workstation name, and
268 * the client time (five av pairs total, plus one end-of-fields indicator).
269 * Allocate the domain name; it is freed when the session struct is deallocated.
270 */
271static int
272build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
273{
274 unsigned int dlen;
275 unsigned int wlen;
276 unsigned int size = 6 * sizeof(struct ntlmssp2_name);
277 __le64 curtime;
278 char *defdmname = "WORKGROUP";
279 unsigned char *blobptr;
280 struct ntlmssp2_name *attrptr;
281
282 if (!ses->domainName) {
283 ses->domainName = kstrdup(defdmname, GFP_KERNEL);
284 if (!ses->domainName)
285 return -ENOMEM;
286 }
287
288 dlen = strlen(ses->domainName);
289 wlen = strlen(ses->server->hostname);
290
291	/* The length of this blob works out to:
292	 * six times the size of the name/size header structure, plus
293	 * two UCS-2 copies of the domain name (2 bytes per character),
294	 * two UCS-2 copies of the server name, and
295	 * an 8-byte timestamp.
296	 */
297 ses->tilen = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
298 ses->tiblob = kzalloc(ses->tilen, GFP_KERNEL);
299 if (!ses->tiblob) {
300 ses->tilen = 0;
301 cERROR(1, "Challenge target info allocation failure");
302 return -ENOMEM;
303 }
304
305 blobptr = ses->tiblob;
306 attrptr = (struct ntlmssp2_name *) blobptr;
307
308 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
309 attrptr->length = cpu_to_le16(2 * dlen);
310 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
311 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
312
313 blobptr += 2 * dlen;
314 attrptr = (struct ntlmssp2_name *) blobptr;
315
316 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME);
317 attrptr->length = cpu_to_le16(2 * wlen);
318 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
319 cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
320
321 blobptr += 2 * wlen;
322 attrptr = (struct ntlmssp2_name *) blobptr;
323
324 attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME);
325 attrptr->length = cpu_to_le16(2 * dlen);
326 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
327 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
328
329 blobptr += 2 * dlen;
330 attrptr = (struct ntlmssp2_name *) blobptr;
331
332 attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME);
333 attrptr->length = cpu_to_le16(2 * wlen);
334 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
335 cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
336
337 blobptr += 2 * wlen;
338 attrptr = (struct ntlmssp2_name *) blobptr;
339
340 attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
341 attrptr->length = cpu_to_le16(sizeof(__le64));
342 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
343 curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
344 memcpy(blobptr, &curtime, sizeof(__le64));
345
346 return 0;
347}
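The tilen arithmetic is easier to check with concrete numbers. Assuming sizeof(struct ntlmssp2_name) == 4 (two __le16 fields), the default domain name "WORKGROUP" (dlen = 9) and a hypothetical hostname "srv1" (wlen = 4):

	6 * 4        =  24   av-pair headers (5 pairs + EOL terminator)
	2 * (2 * 9)  =  36   two UCS-2 copies of the domain name
	2 * (2 * 4)  =  16   two UCS-2 copies of the hostname
	8            =   8   timestamp payload
	               ----
	tilen        =  84 bytes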
348
349/* The server has provided av pairs/target info in the type 2 challenge
350 * packet, and we have plucked it out and stored it within the smb session.
351 * We parse that blob here to find the netbios domain name to be used
352 * as part of ntlmv2 authentication (in the Target String), if it was not
353 * already specified on the command line.
354 * If this function returns without error but without fetching the
355 * domain name, authentication may fail against some servers but not
356 * against others (those that are not very particular about the target
357 * string; for some, just the user name might suffice).
358 */
359static int
360find_domain_name(struct cifsSesInfo *ses)
361{
362 unsigned int attrsize;
363 unsigned int type;
364 unsigned int onesize = sizeof(struct ntlmssp2_name);
365 unsigned char *blobptr;
366 unsigned char *blobend;
367 struct ntlmssp2_name *attrptr;
368
369 if (!ses->tilen || !ses->tiblob)
370 return 0;
371
372 blobptr = ses->tiblob;
373 blobend = ses->tiblob + ses->tilen;
374
375 while (blobptr + onesize < blobend) {
376 attrptr = (struct ntlmssp2_name *) blobptr;
377 type = le16_to_cpu(attrptr->type);
378 if (type == NTLMSSP_AV_EOL)
379 break;
380 blobptr += 2; /* advance attr type */
381 attrsize = le16_to_cpu(attrptr->length);
382 blobptr += 2; /* advance attr size */
383 if (blobptr + attrsize > blobend)
384 break;
385 if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
386 if (!attrsize)
387 break;
388 if (!ses->domainName) {
389 struct nls_table *default_nls;
390 ses->domainName =
391 kmalloc(attrsize + 1, GFP_KERNEL);
392 if (!ses->domainName)
393 return -ENOMEM;
394 default_nls = load_nls_default();
395 cifs_from_ucs2(ses->domainName,
396 (__le16 *)blobptr, attrsize, attrsize,
397 default_nls, false);
398 unload_nls(default_nls);
399 break;
400 }
401 }
402 blobptr += attrsize; /* advance attr value */
403 }
404
405 return 0;
406}
407
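The loop above is a bounds-checked walk over the same type/length/value records. A compact stand-alone sketch of that walk (get_le16() and find_av_pair() are hypothetical helpers, not driver functions):

#include <endian.h>
#include <stdint.h>
#include <string.h>

static uint16_t get_le16(const unsigned char *p)
{
	uint16_t v;

	memcpy(&v, p, sizeof(v));
	return le16toh(v);
}

/* Return a pointer to the value of the first record of type 'wanted',
 * or NULL; never read a header or value beyond 'end'. */
static const unsigned char *find_av_pair(const unsigned char *p,
					 const unsigned char *end,
					 uint16_t wanted, uint16_t *lenp)
{
	while (p + 4 <= end) {		/* room for type + length */
		uint16_t type = get_le16(p);
		uint16_t len = get_le16(p + 2);

		if (type == 0)		/* NTLMSSP_AV_EOL terminator */
			return NULL;
		p += 4;
		if (p + len > end)	/* truncated value */
			return NULL;
		if (type == wanted) {
			*lenp = len;
			return p;
		}
		p += len;
	}
	return NULL;
}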
265static int calc_ntlmv2_hash(struct cifsSesInfo *ses, 408static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
266 const struct nls_table *nls_cp) 409 const struct nls_table *nls_cp)
267{ 410{
@@ -315,13 +458,14 @@ calc_exit_1:
315calc_exit_2: 458calc_exit_2:
316 /* BB FIXME what about bytes 24 through 40 of the signing key? 459 /* BB FIXME what about bytes 24 through 40 of the signing key?
317 compare with the NTLM example */ 460 compare with the NTLM example */
318 hmac_md5_final(ses->server->ntlmv2_hash, pctxt); 461 hmac_md5_final(ses->ntlmv2_hash, pctxt);
319 462
320 kfree(pctxt); 463 kfree(pctxt);
321 return rc; 464 return rc;
322} 465}
323 466
324void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, 467int
468setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
325 const struct nls_table *nls_cp) 469 const struct nls_table *nls_cp)
326{ 470{
327 int rc; 471 int rc;
@@ -333,25 +477,48 @@ void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
333 buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 477 buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
334 get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); 478 get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
335 buf->reserved2 = 0; 479 buf->reserved2 = 0;
336 buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE); 480
337 buf->names[0].length = 0; 481 if (ses->server->secType == RawNTLMSSP) {
338 buf->names[1].type = 0; 482 if (!ses->domainName) {
339 buf->names[1].length = 0; 483 rc = find_domain_name(ses);
484 if (rc) {
485 cERROR(1, "error %d finding domain name", rc);
486 goto setup_ntlmv2_rsp_ret;
487 }
488 }
489 } else {
490 rc = build_avpair_blob(ses, nls_cp);
491 if (rc) {
492 cERROR(1, "error %d building av pair blob", rc);
493 return rc;
494 }
495 }
340 496
341 /* calculate buf->ntlmv2_hash */ 497 /* calculate buf->ntlmv2_hash */
342 rc = calc_ntlmv2_hash(ses, nls_cp); 498 rc = calc_ntlmv2_hash(ses, nls_cp);
343 if (rc) 499 if (rc) {
344 cERROR(1, "could not get v2 hash rc %d", rc); 500 cERROR(1, "could not get v2 hash rc %d", rc);
501 goto setup_ntlmv2_rsp_ret;
502 }
345 CalcNTLMv2_response(ses, resp_buf); 503 CalcNTLMv2_response(ses, resp_buf);
346 504
347 /* now calculate the MAC key for NTLMv2 */ 505 /* now calculate the session key for NTLMv2 */
348 hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context); 506 hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context);
349 hmac_md5_update(resp_buf, 16, &context); 507 hmac_md5_update(resp_buf, 16, &context);
350 hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context); 508 hmac_md5_final(ses->auth_key.data.ntlmv2.key, &context);
351 509
352 memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf, 510 memcpy(&ses->auth_key.data.ntlmv2.resp, resp_buf,
353 sizeof(struct ntlmv2_resp)); 511 sizeof(struct ntlmv2_resp));
354 ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp); 512 ses->auth_key.len = 16 + sizeof(struct ntlmv2_resp);
513
514 return 0;
515
516setup_ntlmv2_rsp_ret:
517 kfree(ses->tiblob);
518 ses->tiblob = NULL;
519 ses->tilen = 0;
520
521 return rc;
355} 522}
356 523
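Taken together with calc_ntlmv2_hash() above, this is the whole NTLMv2 key schedule: the first 16 bytes of the response (the NT proof) are HMAC-MD5'd once more under the same hash to yield the session key. A sketch of that final step (hmac_md5() here is a hypothetical one-shot helper; the kernel code uses the init/update/final trio shown above):

#include <stddef.h>

/* Hypothetical one-shot HMAC-MD5: out = HMAC-MD5(key, msg). */
extern void hmac_md5(const unsigned char *key, size_t klen,
		     const unsigned char *msg, size_t mlen,
		     unsigned char out[16]);

static void ntlmv2_session_key(const unsigned char ntlmv2_hash[16],
			       const unsigned char *resp /* ntlmv2 blob */,
			       unsigned char key[16])
{
	/* key = HMAC-MD5(ntlmv2_hash, first 16 bytes of the response) */
	hmac_md5(ntlmv2_hash, 16, resp, 16, key);
}

This is also why auth_key.len ends up as 16 + sizeof(struct ntlmv2_resp): a 16-byte key followed by the copied response.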
357void CalcNTLMv2_response(const struct cifsSesInfo *ses, 524void CalcNTLMv2_response(const struct cifsSesInfo *ses,
@@ -359,12 +526,15 @@ void CalcNTLMv2_response(const struct cifsSesInfo *ses,
359{ 526{
360 struct HMACMD5Context context; 527 struct HMACMD5Context context;
361 /* rest of v2 struct already generated */ 528 /* rest of v2 struct already generated */
362 memcpy(v2_session_response + 8, ses->server->cryptKey, 8); 529 memcpy(v2_session_response + 8, ses->cryptKey, 8);
363 hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context); 530 hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context);
364 531
365 hmac_md5_update(v2_session_response+8, 532 hmac_md5_update(v2_session_response+8,
366 sizeof(struct ntlmv2_resp) - 8, &context); 533 sizeof(struct ntlmv2_resp) - 8, &context);
367 534
535 if (ses->tilen)
536 hmac_md5_update(ses->tiblob, ses->tilen, &context);
537
368 hmac_md5_final(v2_session_response, &context); 538 hmac_md5_final(v2_session_response, &context);
369/* cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */ 539/* cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */
370} 540}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b7431afdd76d..34371637f210 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -35,7 +35,7 @@
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/kthread.h> 36#include <linux/kthread.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/smp_lock.h> 38#include <net/ipv6.h>
39#include "cifsfs.h" 39#include "cifsfs.h"
40#include "cifspdu.h" 40#include "cifspdu.h"
41#define DECLARE_GLOBALS_HERE 41#define DECLARE_GLOBALS_HERE
@@ -82,6 +82,24 @@ extern mempool_t *cifs_sm_req_poolp;
82extern mempool_t *cifs_req_poolp; 82extern mempool_t *cifs_req_poolp;
83extern mempool_t *cifs_mid_poolp; 83extern mempool_t *cifs_mid_poolp;
84 84
85void
86cifs_sb_active(struct super_block *sb)
87{
88 struct cifs_sb_info *server = CIFS_SB(sb);
89
90 if (atomic_inc_return(&server->active) == 1)
91 atomic_inc(&sb->s_active);
92}
93
94void
95cifs_sb_deactive(struct super_block *sb)
96{
97 struct cifs_sb_info *server = CIFS_SB(sb);
98
99 if (atomic_dec_and_test(&server->active))
100 deactivate_super(sb);
101}
102
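These helpers pin the superblock while asynchronous work is outstanding. A sketch of the intended pairing (queue_async_op/async_op_done are hypothetical callers):

static void queue_async_op(struct super_block *sb)
{
	cifs_sb_active(sb);	/* bumps sb->s_active on the 0 -> 1 edge */
	/* ... hand the work off to another context ... */
}

static void async_op_done(struct super_block *sb)
{
	/* ... complete the work ... */
	cifs_sb_deactive(sb);	/* last ref calls deactivate_super() */
}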
85static int 103static int
86cifs_read_super(struct super_block *sb, void *data, 104cifs_read_super(struct super_block *sb, void *data,
87 const char *devname, int silent) 105 const char *devname, int silent)
@@ -97,6 +115,9 @@ cifs_read_super(struct super_block *sb, void *data,
97 if (cifs_sb == NULL) 115 if (cifs_sb == NULL)
98 return -ENOMEM; 116 return -ENOMEM;
99 117
118 spin_lock_init(&cifs_sb->tlink_tree_lock);
119 INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL);
120
100 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); 121 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
101 if (rc) { 122 if (rc) {
102 kfree(cifs_sb); 123 kfree(cifs_sb);
@@ -136,9 +157,6 @@ cifs_read_super(struct super_block *sb, void *data,
136 sb->s_magic = CIFS_MAGIC_NUMBER; 157 sb->s_magic = CIFS_MAGIC_NUMBER;
137 sb->s_op = &cifs_super_ops; 158 sb->s_op = &cifs_super_ops;
138 sb->s_bdi = &cifs_sb->bdi; 159 sb->s_bdi = &cifs_sb->bdi;
139/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
140 sb->s_blocksize =
141 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
142 sb->s_blocksize = CIFS_MAX_MSGSIZE; 160 sb->s_blocksize = CIFS_MAX_MSGSIZE;
143 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 161 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
144 inode = cifs_root_iget(sb, ROOT_I); 162 inode = cifs_root_iget(sb, ROOT_I);
@@ -200,8 +218,6 @@ cifs_put_super(struct super_block *sb)
200 return; 218 return;
201 } 219 }
202 220
203 lock_kernel();
204
205 rc = cifs_umount(sb, cifs_sb); 221 rc = cifs_umount(sb, cifs_sb);
206 if (rc) 222 if (rc)
207 cERROR(1, "cifs_umount failed with return code %d", rc); 223 cERROR(1, "cifs_umount failed with return code %d", rc);
@@ -215,8 +231,6 @@ cifs_put_super(struct super_block *sb)
215 unload_nls(cifs_sb->local_nls); 231 unload_nls(cifs_sb->local_nls);
216 bdi_destroy(&cifs_sb->bdi); 232 bdi_destroy(&cifs_sb->bdi);
217 kfree(cifs_sb); 233 kfree(cifs_sb);
218
219 unlock_kernel();
220} 234}
221 235
222static int 236static int
@@ -224,7 +238,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
224{ 238{
225 struct super_block *sb = dentry->d_sb; 239 struct super_block *sb = dentry->d_sb;
226 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 240 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
227 struct cifsTconInfo *tcon = cifs_sb->tcon; 241 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
228 int rc = -EOPNOTSUPP; 242 int rc = -EOPNOTSUPP;
229 int xid; 243 int xid;
230 244
@@ -366,14 +380,36 @@ static int
366cifs_show_options(struct seq_file *s, struct vfsmount *m) 380cifs_show_options(struct seq_file *s, struct vfsmount *m)
367{ 381{
368 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); 382 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
369 struct cifsTconInfo *tcon = cifs_sb->tcon; 383 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
384 struct sockaddr *srcaddr;
385 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
370 386
371 seq_printf(s, ",unc=%s", tcon->treeName); 387 seq_printf(s, ",unc=%s", tcon->treeName);
372 if (tcon->ses->userName) 388
389 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
390 seq_printf(s, ",multiuser");
391 else if (tcon->ses->userName)
373 seq_printf(s, ",username=%s", tcon->ses->userName); 392 seq_printf(s, ",username=%s", tcon->ses->userName);
393
374 if (tcon->ses->domainName) 394 if (tcon->ses->domainName)
375 seq_printf(s, ",domain=%s", tcon->ses->domainName); 395 seq_printf(s, ",domain=%s", tcon->ses->domainName);
376 396
397 if (srcaddr->sa_family != AF_UNSPEC) {
398 struct sockaddr_in *saddr4;
399 struct sockaddr_in6 *saddr6;
400 saddr4 = (struct sockaddr_in *)srcaddr;
401 saddr6 = (struct sockaddr_in6 *)srcaddr;
402 if (srcaddr->sa_family == AF_INET6)
403 seq_printf(s, ",srcaddr=%pI6c",
404 &saddr6->sin6_addr);
405 else if (srcaddr->sa_family == AF_INET)
406 seq_printf(s, ",srcaddr=%pI4",
407 &saddr4->sin_addr.s_addr);
408 else
409 seq_printf(s, ",srcaddr=BAD-AF:%i",
410 (int)(srcaddr->sa_family));
411 }
412
377 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); 413 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
378 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) 414 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
379 seq_printf(s, ",forceuid"); 415 seq_printf(s, ",forceuid");
@@ -422,6 +458,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
422 seq_printf(s, ",dynperm"); 458 seq_printf(s, ",dynperm");
423 if (m->mnt_sb->s_flags & MS_POSIXACL) 459 if (m->mnt_sb->s_flags & MS_POSIXACL)
424 seq_printf(s, ",acl"); 460 seq_printf(s, ",acl");
461 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
462 seq_printf(s, ",mfsymlinks");
425 463
426 seq_printf(s, ",rsize=%d", cifs_sb->rsize); 464 seq_printf(s, ",rsize=%d", cifs_sb->rsize);
427 seq_printf(s, ",wsize=%d", cifs_sb->wsize); 465 seq_printf(s, ",wsize=%d", cifs_sb->wsize);
@@ -437,20 +475,18 @@ static void cifs_umount_begin(struct super_block *sb)
437 if (cifs_sb == NULL) 475 if (cifs_sb == NULL)
438 return; 476 return;
439 477
440 tcon = cifs_sb->tcon; 478 tcon = cifs_sb_master_tcon(cifs_sb);
441 if (tcon == NULL)
442 return;
443 479
444 read_lock(&cifs_tcp_ses_lock); 480 spin_lock(&cifs_tcp_ses_lock);
445 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) { 481 if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
446 /* we have other mounts to same share or we have 482 /* we have other mounts to same share or we have
447 already tried to force umount this and woken up 483 already tried to force umount this and woken up
448 all waiting network requests, nothing to do */ 484 all waiting network requests, nothing to do */
449 read_unlock(&cifs_tcp_ses_lock); 485 spin_unlock(&cifs_tcp_ses_lock);
450 return; 486 return;
451 } else if (tcon->tc_count == 1) 487 } else if (tcon->tc_count == 1)
452 tcon->tidStatus = CifsExiting; 488 tcon->tidStatus = CifsExiting;
453 read_unlock(&cifs_tcp_ses_lock); 489 spin_unlock(&cifs_tcp_ses_lock);
454 490
455 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 491 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
456 /* cancel_notify_requests(tcon); */ 492 /* cancel_notify_requests(tcon); */
@@ -514,7 +550,9 @@ cifs_get_sb(struct file_system_type *fs_type,
514 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 550 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
515{ 551{
516 int rc; 552 int rc;
517 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL); 553 struct super_block *sb;
554
555 sb = sget(fs_type, NULL, set_anon_super, NULL);
518 556
519 cFYI(1, "Devname: %s flags: %d ", dev_name, flags); 557 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
520 558
@@ -565,9 +603,10 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
565 603
566static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 604static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
567{ 605{
568 /* note that this is called by vfs setlease with the BKL held 606 /* note that this is called by vfs setlease with lock_flocks held
569 although I doubt that BKL is needed here in cifs */ 607 to protect *lease from going away */
570 struct inode *inode = file->f_path.dentry->d_inode; 608 struct inode *inode = file->f_path.dentry->d_inode;
609 struct cifsFileInfo *cfile = file->private_data;
571 610
572 if (!(S_ISREG(inode->i_mode))) 611 if (!(S_ISREG(inode->i_mode)))
573 return -EINVAL; 612 return -EINVAL;
@@ -578,8 +617,8 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
578 ((arg == F_WRLCK) && 617 ((arg == F_WRLCK) &&
579 (CIFS_I(inode)->clientCanCacheAll))) 618 (CIFS_I(inode)->clientCanCacheAll)))
580 return generic_setlease(file, arg, lease); 619 return generic_setlease(file, arg, lease);
581 else if (CIFS_SB(inode->i_sb)->tcon->local_lease && 620 else if (tlink_tcon(cfile->tlink)->local_lease &&
582 !CIFS_I(inode)->clientCanCacheRead) 621 !CIFS_I(inode)->clientCanCacheRead)
583 /* If the server claims to support oplock on this 622 /* If the server claims to support oplock on this
584 file, then we still need to check oplock even 623 file, then we still need to check oplock even
585 if the local_lease mount option is set, but there 624 if the local_lease mount option is set, but there
@@ -898,8 +937,8 @@ init_cifs(void)
898 GlobalTotalActiveXid = 0; 937 GlobalTotalActiveXid = 0;
899 GlobalMaxActiveXid = 0; 938 GlobalMaxActiveXid = 0;
900 memset(Local_System_Name, 0, 15); 939 memset(Local_System_Name, 0, 15);
901 rwlock_init(&GlobalSMBSeslock); 940 spin_lock_init(&cifs_tcp_ses_lock);
902 rwlock_init(&cifs_tcp_ses_lock); 941 spin_lock_init(&cifs_file_list_lock);
903 spin_lock_init(&GlobalMid_Lock); 942 spin_lock_init(&GlobalMid_Lock);
904 943
905 if (cifs_max_pending < 2) { 944 if (cifs_max_pending < 2) {
@@ -912,11 +951,11 @@ init_cifs(void)
912 951
913 rc = cifs_fscache_register(); 952 rc = cifs_fscache_register();
914 if (rc) 953 if (rc)
915 goto out; 954 goto out_clean_proc;
916 955
917 rc = cifs_init_inodecache(); 956 rc = cifs_init_inodecache();
918 if (rc) 957 if (rc)
919 goto out_clean_proc; 958 goto out_unreg_fscache;
920 959
921 rc = cifs_init_mids(); 960 rc = cifs_init_mids();
922 if (rc) 961 if (rc)
@@ -938,19 +977,19 @@ init_cifs(void)
938 return 0; 977 return 0;
939 978
940#ifdef CONFIG_CIFS_UPCALL 979#ifdef CONFIG_CIFS_UPCALL
941 out_unregister_filesystem: 980out_unregister_filesystem:
942 unregister_filesystem(&cifs_fs_type); 981 unregister_filesystem(&cifs_fs_type);
943#endif 982#endif
944 out_destroy_request_bufs: 983out_destroy_request_bufs:
945 cifs_destroy_request_bufs(); 984 cifs_destroy_request_bufs();
946 out_destroy_mids: 985out_destroy_mids:
947 cifs_destroy_mids(); 986 cifs_destroy_mids();
948 out_destroy_inodecache: 987out_destroy_inodecache:
949 cifs_destroy_inodecache(); 988 cifs_destroy_inodecache();
950 out_clean_proc: 989out_unreg_fscache:
951 cifs_proc_clean();
952 cifs_fscache_unregister(); 990 cifs_fscache_unregister();
953 out: 991out_clean_proc:
992 cifs_proc_clean();
954 return rc; 993 return rc;
955} 994}
956 995
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index d82f5fb4761e..f35795a16b42 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -42,10 +42,8 @@ extern const struct address_space_operations cifs_addr_ops;
42extern const struct address_space_operations cifs_addr_ops_smallbuf; 42extern const struct address_space_operations cifs_addr_ops_smallbuf;
43 43
44/* Functions related to super block operations */ 44/* Functions related to super block operations */
45/* extern const struct super_operations cifs_super_ops;*/ 45extern void cifs_sb_active(struct super_block *sb);
46extern void cifs_read_inode(struct inode *); 46extern void cifs_sb_deactive(struct super_block *sb);
47/*extern void cifs_delete_inode(struct inode *);*/ /* BB not needed yet */
48/* extern void cifs_write_inode(struct inode *); */ /* BB not needed yet */
49 47
50/* Functions related to inodes */ 48/* Functions related to inodes */
51extern const struct inode_operations cifs_dir_inode_ops; 49extern const struct inode_operations cifs_dir_inode_ops;
@@ -104,7 +102,7 @@ extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
104extern int cifs_symlink(struct inode *inode, struct dentry *direntry, 102extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
105 const char *symname); 103 const char *symname);
106extern int cifs_removexattr(struct dentry *, const char *); 104extern int cifs_removexattr(struct dentry *, const char *);
107extern int cifs_setxattr(struct dentry *, const char *, const void *, 105extern int cifs_setxattr(struct dentry *, const char *, const void *,
108 size_t, int); 106 size_t, int);
109extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); 107extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
110extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); 108extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
@@ -114,5 +112,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
114extern const struct export_operations cifs_export_ops; 112extern const struct export_operations cifs_export_ops;
115#endif /* EXPERIMENTAL */ 113#endif /* EXPERIMENTAL */
116 114
117#define CIFS_VERSION "1.65" 115#define CIFS_VERSION "1.67"
118#endif /* _CIFSFS_H */ 116#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0cdfb8c32ac6..3365e77f6f24 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -97,7 +97,7 @@ enum protocolEnum {
97 /* Netbios frames protocol not supported at this time */ 97 /* Netbios frames protocol not supported at this time */
98}; 98};
99 99
100struct mac_key { 100struct session_key {
101 unsigned int len; 101 unsigned int len;
102 union { 102 union {
103 char ntlm[CIFS_SESS_KEY_SIZE + 16]; 103 char ntlm[CIFS_SESS_KEY_SIZE + 16];
@@ -139,6 +139,7 @@ struct TCP_Server_Info {
139 struct sockaddr_in sockAddr; 139 struct sockaddr_in sockAddr;
140 struct sockaddr_in6 sockAddr6; 140 struct sockaddr_in6 sockAddr6;
141 } addr; 141 } addr;
142 struct sockaddr_storage srcaddr; /* locally bind to this IP */
142 wait_queue_head_t response_q; 143 wait_queue_head_t response_q;
143 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ 144 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
144 struct list_head pending_mid_q; 145 struct list_head pending_mid_q;
@@ -178,12 +179,10 @@ struct TCP_Server_Info {
178 int capabilities; /* allow selective disabling of caps by smb sess */ 179 int capabilities; /* allow selective disabling of caps by smb sess */
179 int timeAdj; /* Adjust for difference in server time zone in sec */ 180 int timeAdj; /* Adjust for difference in server time zone in sec */
180 __u16 CurrentMid; /* multiplex id - rotating counter */ 181 __u16 CurrentMid; /* multiplex id - rotating counter */
181 char cryptKey[CIFS_CRYPTO_KEY_SIZE];
182 /* 16th byte of RFC1001 workstation name is always null */ 182 /* 16th byte of RFC1001 workstation name is always null */
183 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 183 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
184 __u32 sequence_number; /* needed for CIFS PDU signature */ 184 __u32 sequence_number; /* needed for CIFS PDU signature */
185 struct mac_key mac_signing_key; 185 struct session_key session_key;
186 char ntlmv2_hash[16];
187 unsigned long lstrp; /* when we got last response from this server */ 186 unsigned long lstrp; /* when we got last response from this server */
188 u16 dialect; /* dialect index that server chose */ 187 u16 dialect; /* dialect index that server chose */
189 /* extended security flavors that server supports */ 188 /* extended security flavors that server supports */
@@ -191,6 +190,7 @@ struct TCP_Server_Info {
191 bool sec_mskerberos; /* supports legacy MS Kerberos */ 190 bool sec_mskerberos; /* supports legacy MS Kerberos */
192 bool sec_kerberosu2u; /* supports U2U Kerberos */ 191 bool sec_kerberosu2u; /* supports U2U Kerberos */
193 bool sec_ntlmssp; /* supports NTLMSSP */ 192 bool sec_ntlmssp; /* supports NTLMSSP */
193 bool session_estab; /* mark when very first sess is established */
194#ifdef CONFIG_CIFS_FSCACHE 194#ifdef CONFIG_CIFS_FSCACHE
195 struct fscache_cookie *fscache; /* client index cache cookie */ 195 struct fscache_cookie *fscache; /* client index cache cookie */
196#endif 196#endif
@@ -222,6 +222,11 @@ struct cifsSesInfo {
222 char userName[MAX_USERNAME_SIZE + 1]; 222 char userName[MAX_USERNAME_SIZE + 1];
223 char *domainName; 223 char *domainName;
224 char *password; 224 char *password;
225 char cryptKey[CIFS_CRYPTO_KEY_SIZE];
226 struct session_key auth_key;
227 char ntlmv2_hash[16];
228 unsigned int tilen; /* length of the target info blob */
229 unsigned char *tiblob; /* target info blob in challenge response */
225 bool need_reconnect:1; /* connection reset, uid now invalid */ 230 bool need_reconnect:1; /* connection reset, uid now invalid */
226}; 231};
227/* no more than one of the following three session flags may be set */ 232/* no more than one of the following three session flags may be set */
@@ -308,6 +313,44 @@ struct cifsTconInfo {
308}; 313};
309 314
310/* 315/*
316 * This is a refcounted and timestamped container for a tcon pointer. The
317 * container holds a tcon reference. It is considered safe to free one of
318 * these when the tl_count goes to 0. The tl_time is the time of the last
319 * "get" on the container.
320 */
321struct tcon_link {
322 unsigned long tl_index;
323 unsigned long tl_flags;
324#define TCON_LINK_MASTER 0
325#define TCON_LINK_PENDING 1
326#define TCON_LINK_IN_TREE 2
327 unsigned long tl_time;
328 atomic_t tl_count;
329 struct cifsTconInfo *tl_tcon;
330};
331
332extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb);
333
334static inline struct cifsTconInfo *
335tlink_tcon(struct tcon_link *tlink)
336{
337 return tlink->tl_tcon;
338}
339
340extern void cifs_put_tlink(struct tcon_link *tlink);
341
342static inline struct tcon_link *
343cifs_get_tlink(struct tcon_link *tlink)
344{
345 if (tlink && !IS_ERR(tlink))
346 atomic_inc(&tlink->tl_count);
347 return tlink;
348}
349
350/* This function is always expected to succeed */
351extern struct cifsTconInfo *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
352
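A sketch of how this API is meant to compose (do_op_via_tlink is a hypothetical caller; the ERR_PTR convention follows from the IS_ERR() check in cifs_get_tlink()):

static int do_op_via_tlink(struct cifs_sb_info *cifs_sb)
{
	struct tcon_link *tlink;
	struct cifsTconInfo *tcon;

	tlink = cifs_sb_tlink(cifs_sb);	/* referenced and timestamped */
	if (IS_ERR(tlink))
		return PTR_ERR(tlink);

	tcon = tlink_tcon(tlink);
	/* ... issue SMBs against tcon ... */

	cifs_put_tlink(tlink);		/* drop tl_count */
	return 0;
}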
353/*
311 * This info hangs off the cifsFileInfo structure, pointed to by llist. 354 * This info hangs off the cifsFileInfo structure, pointed to by llist.
312 * This is used to track byte stream locks on the file 355 * This is used to track byte stream locks on the file
313 */ 356 */
@@ -345,12 +388,11 @@ struct cifsFileInfo {
345 __u16 netfid; /* file id from remote */ 388 __u16 netfid; /* file id from remote */
346 /* BB add lock scope info here if needed */ ; 389 /* BB add lock scope info here if needed */ ;
347 /* lock scope id (0 if none) */ 390 /* lock scope id (0 if none) */
348 struct file *pfile; /* needed for writepage */ 391 struct dentry *dentry;
349 struct inode *pInode; /* needed for oplock break */ 392 unsigned int f_flags;
350 struct vfsmount *mnt; 393 struct tcon_link *tlink;
351 struct mutex lock_mutex; 394 struct mutex lock_mutex;
352 struct list_head llist; /* list of byte range locks we have. */ 395 struct list_head llist; /* list of byte range locks we have. */
353 bool closePend:1; /* file is marked to close */
354 bool invalidHandle:1; /* file closed via session abend */ 396 bool invalidHandle:1; /* file closed via session abend */
355 bool oplock_break_cancelled:1; 397 bool oplock_break_cancelled:1;
356 atomic_t count; /* reference count */ 398 atomic_t count; /* reference count */
@@ -365,14 +407,7 @@ static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
365 atomic_inc(&cifs_file->count); 407 atomic_inc(&cifs_file->count);
366} 408}
367 409
368/* Release a reference on the file private data */ 410void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
369static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
370{
371 if (atomic_dec_and_test(&cifs_file->count)) {
372 iput(cifs_file->pInode);
373 kfree(cifs_file);
374 }
375}
376 411
377/* 412/*
378 * One of these for each file inode 413 * One of these for each file inode
@@ -474,16 +509,16 @@ struct oplock_q_entry {
474 509
475/* for pending dnotify requests */ 510/* for pending dnotify requests */
476struct dir_notify_req { 511struct dir_notify_req {
477 struct list_head lhead; 512 struct list_head lhead;
478 __le16 Pid; 513 __le16 Pid;
479 __le16 PidHigh; 514 __le16 PidHigh;
480 __u16 Mid; 515 __u16 Mid;
481 __u16 Tid; 516 __u16 Tid;
482 __u16 Uid; 517 __u16 Uid;
483 __u16 netfid; 518 __u16 netfid;
484 __u32 filter; /* CompletionFilter (for multishot) */ 519 __u32 filter; /* CompletionFilter (for multishot) */
485 int multishot; 520 int multishot;
486 struct file *pfile; 521 struct file *pfile;
487}; 522};
488 523
489struct dfs_info3_param { 524struct dfs_info3_param {
@@ -667,7 +702,7 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list;
667 * the reference counters for the server, smb session, and tcon. Finally, 702 * the reference counters for the server, smb session, and tcon. Finally,
668 * changes to the tcon->tidStatus should be done while holding this lock. 703 * changes to the tcon->tidStatus should be done while holding this lock.
669 */ 704 */
670GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock; 705GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock;
671 706
672/* 707/*
673 * This lock protects the cifs_file->llist and cifs_file->flist 708 * This lock protects the cifs_file->llist and cifs_file->flist
@@ -676,7 +711,7 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
676 * If cifs_tcp_ses_lock and the lock below are both needed to be held, then 711 * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
677 * the cifs_tcp_ses_lock must be grabbed first and released last. 712 * the cifs_tcp_ses_lock must be grabbed first and released last.
678 */ 713 */
679GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; 714GLOBAL_EXTERN spinlock_t cifs_file_list_lock;
680 715
681/* Outstanding dir notify requests */ 716/* Outstanding dir notify requests */
682GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; 717GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 14d036d8db11..b0f4b5656d4c 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -663,7 +663,6 @@ struct ntlmv2_resp {
663 __le64 time; 663 __le64 time;
664 __u64 client_chal; /* random */ 664 __u64 client_chal; /* random */
665 __u32 reserved2; 665 __u32 reserved2;
666 struct ntlmssp2_name names[2];
667 /* array of name entries could follow ending in minimum 4 byte struct */ 666 /* array of name entries could follow ending in minimum 4 byte struct */
668} __attribute__((packed)); 667} __attribute__((packed));
669 668
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1d60c655e3e0..e593c40ba7ba 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -78,9 +78,9 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
78extern bool is_valid_oplock_break(struct smb_hdr *smb, 78extern bool is_valid_oplock_break(struct smb_hdr *smb,
79 struct TCP_Server_Info *); 79 struct TCP_Server_Info *);
80extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); 80extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
81extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *); 81extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
82#ifdef CONFIG_CIFS_EXPERIMENTAL 82#ifdef CONFIG_CIFS_EXPERIMENTAL
83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *); 83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
84#endif 84#endif
85extern unsigned int smbCalcSize(struct smb_hdr *ptr); 85extern unsigned int smbCalcSize(struct smb_hdr *ptr);
86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
@@ -105,12 +105,12 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
105extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, 105extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
106 int offset); 106 int offset);
107 107
108extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode, 108extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
109 __u16 fileHandle, struct file *file, 109 struct file *file, struct tcon_link *tlink,
110 struct vfsmount *mnt, unsigned int oflags); 110 __u32 oplock);
111extern int cifs_posix_open(char *full_path, struct inode **pinode, 111extern int cifs_posix_open(char *full_path, struct inode **pinode,
112 struct super_block *sb, 112 struct super_block *sb,
113 int mode, int oflags, 113 int mode, unsigned int f_flags,
114 __u32 *poplock, __u16 *pnetfid, int xid); 114 __u32 *poplock, __u16 *pnetfid, int xid);
115void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr); 115void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
116extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, 116extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
@@ -362,12 +362,12 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
362extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, 362extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
363 __u32 *); 363 __u32 *);
364extern int cifs_verify_signature(struct smb_hdr *, 364extern int cifs_verify_signature(struct smb_hdr *,
365 const struct mac_key *mac_key, 365 const struct session_key *session_key,
366 __u32 expected_sequence_number); 366 __u32 expected_sequence_number);
367extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn, 367extern int cifs_calculate_session_key(struct session_key *key, const char *rn,
368 const char *pass); 368 const char *pass);
369extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *); 369extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
370extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, 370extern int setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
371 const struct nls_table *); 371 const struct nls_table *);
372#ifdef CONFIG_CIFS_WEAK_PW_HASH 372#ifdef CONFIG_CIFS_WEAK_PW_HASH
373extern void calc_lanman_hash(const char *password, const char *cryptkey, 373extern void calc_lanman_hash(const char *password, const char *cryptkey,
@@ -408,4 +408,8 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
408extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon, 408extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
409 const int netfid, __u64 *pExtAttrBits, __u64 *pMask); 409 const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
410extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); 410extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
411extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
412extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
413 const unsigned char *path,
414 struct cifs_sb_info *cifs_sb, int xid);
411#endif /* _CIFSPROTO_H */ 415#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd37..e98f1f317b15 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -91,13 +91,13 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
91 struct list_head *tmp1; 91 struct list_head *tmp1;
92 92
93/* list all files open on tree connection and mark them invalid */ 93/* list all files open on tree connection and mark them invalid */
94 write_lock(&GlobalSMBSeslock); 94 spin_lock(&cifs_file_list_lock);
95 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { 95 list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
96 open_file = list_entry(tmp, struct cifsFileInfo, tlist); 96 open_file = list_entry(tmp, struct cifsFileInfo, tlist);
97 open_file->invalidHandle = true; 97 open_file->invalidHandle = true;
98 open_file->oplock_break_cancelled = true; 98 open_file->oplock_break_cancelled = true;
99 } 99 }
100 write_unlock(&GlobalSMBSeslock); 100 spin_unlock(&cifs_file_list_lock);
101 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted 101 /* BB Add call to invalidate_inodes(sb) for all superblocks mounted
102 to this tcon */ 102 to this tcon */
103} 103}
@@ -232,7 +232,7 @@ static int
232small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 232small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
233 void **request_buf) 233 void **request_buf)
234{ 234{
235 int rc = 0; 235 int rc;
236 236
237 rc = cifs_reconnect_tcon(tcon, smb_command); 237 rc = cifs_reconnect_tcon(tcon, smb_command);
238 if (rc) 238 if (rc)
@@ -250,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
250 if (tcon != NULL) 250 if (tcon != NULL)
251 cifs_stats_inc(&tcon->num_smbs_sent); 251 cifs_stats_inc(&tcon->num_smbs_sent);
252 252
253 return rc; 253 return 0;
254} 254}
255 255
256int 256int
@@ -281,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct,
281 281
282/* If the return code is zero, this function must fill in request_buf pointer */ 282/* If the return code is zero, this function must fill in request_buf pointer */
283static int 283static int
284smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 284__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
285 void **request_buf /* returned */ , 285 void **request_buf, void **response_buf)
286 void **response_buf /* returned */ )
287{ 286{
288 int rc = 0;
289
290 rc = cifs_reconnect_tcon(tcon, smb_command);
291 if (rc)
292 return rc;
293
294 *request_buf = cifs_buf_get(); 287 *request_buf = cifs_buf_get();
295 if (*request_buf == NULL) { 288 if (*request_buf == NULL) {
296 /* BB should we add a retry in here if not a writepage? */ 289 /* BB should we add a retry in here if not a writepage? */
@@ -309,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
309 if (tcon != NULL) 302 if (tcon != NULL)
310 cifs_stats_inc(&tcon->num_smbs_sent); 303 cifs_stats_inc(&tcon->num_smbs_sent);
311 304
312 return rc; 305 return 0;
306}
307
308/* If the return code is zero, this function must fill in request_buf pointer */
309static int
310smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
311 void **request_buf, void **response_buf)
312{
313 int rc;
314
315 rc = cifs_reconnect_tcon(tcon, smb_command);
316 if (rc)
317 return rc;
318
319 return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
320}
321
322static int
323smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
324 void **request_buf, void **response_buf)
325{
326 if (tcon->ses->need_reconnect || tcon->need_reconnect)
327 return -EHOSTDOWN;
328
329 return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
313} 330}
314 331
315static int validate_t2(struct smb_t2_rsp *pSMB) 332static int validate_t2(struct smb_t2_rsp *pSMB)
@@ -486,7 +503,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
486 503
487 if (rsp->EncryptionKeyLength == 504 if (rsp->EncryptionKeyLength ==
488 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { 505 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
489 memcpy(server->cryptKey, rsp->EncryptionKey, 506 memcpy(ses->cryptKey, rsp->EncryptionKey,
490 CIFS_CRYPTO_KEY_SIZE); 507 CIFS_CRYPTO_KEY_SIZE);
491 } else if (server->secMode & SECMODE_PW_ENCRYPT) { 508 } else if (server->secMode & SECMODE_PW_ENCRYPT) {
492 rc = -EIO; /* need cryptkey unless plain text */ 509 rc = -EIO; /* need cryptkey unless plain text */
@@ -557,7 +574,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
557 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 574 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
558 server->timeAdj *= 60; 575 server->timeAdj *= 60;
559 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { 576 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
560 memcpy(server->cryptKey, pSMBr->u.EncryptionKey, 577 memcpy(ses->cryptKey, pSMBr->u.EncryptionKey,
561 CIFS_CRYPTO_KEY_SIZE); 578 CIFS_CRYPTO_KEY_SIZE);
562 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) 579 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
563 && (pSMBr->EncryptionKeyLength == 0)) { 580 && (pSMBr->EncryptionKeyLength == 0)) {
@@ -576,9 +593,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
576 rc = -EIO; 593 rc = -EIO;
577 goto neg_err_exit; 594 goto neg_err_exit;
578 } 595 }
579 read_lock(&cifs_tcp_ses_lock); 596 spin_lock(&cifs_tcp_ses_lock);
580 if (server->srv_count > 1) { 597 if (server->srv_count > 1) {
581 read_unlock(&cifs_tcp_ses_lock); 598 spin_unlock(&cifs_tcp_ses_lock);
582 if (memcmp(server->server_GUID, 599 if (memcmp(server->server_GUID,
583 pSMBr->u.extended_response. 600 pSMBr->u.extended_response.
584 GUID, 16) != 0) { 601 GUID, 16) != 0) {
@@ -588,7 +605,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
588 16); 605 16);
589 } 606 }
590 } else { 607 } else {
591 read_unlock(&cifs_tcp_ses_lock); 608 spin_unlock(&cifs_tcp_ses_lock);
592 memcpy(server->server_GUID, 609 memcpy(server->server_GUID,
593 pSMBr->u.extended_response.GUID, 16); 610 pSMBr->u.extended_response.GUID, 16);
594 } 611 }
@@ -603,13 +620,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
603 rc = 0; 620 rc = 0;
604 else 621 else
605 rc = -EINVAL; 622 rc = -EINVAL;
606 623 if (server->secType == Kerberos) {
607 if (server->sec_kerberos || server->sec_mskerberos) 624 if (!server->sec_kerberos &&
608 server->secType = Kerberos; 625 !server->sec_mskerberos)
609 else if (server->sec_ntlmssp) 626 rc = -EOPNOTSUPP;
610 server->secType = RawNTLMSSP; 627 } else if (server->secType == RawNTLMSSP) {
611 else 628 if (!server->sec_ntlmssp)
612 rc = -EOPNOTSUPP; 629 rc = -EOPNOTSUPP;
630 } else
631 rc = -EOPNOTSUPP;
613 } 632 }
614 } else 633 } else
615 server->capabilities &= ~CAP_EXTENDED_SECURITY; 634 server->capabilities &= ~CAP_EXTENDED_SECURITY;
@@ -4534,8 +4553,8 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
4534 4553
4535 cFYI(1, "In QFSUnixInfo"); 4554 cFYI(1, "In QFSUnixInfo");
4536QFSUnixRetry: 4555QFSUnixRetry:
4537 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4556 rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
4538 (void **) &pSMBr); 4557 (void **) &pSMB, (void **) &pSMBr);
4539 if (rc) 4558 if (rc)
4540 return rc; 4559 return rc;
4541 4560
@@ -4604,8 +4623,8 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
4604 cFYI(1, "In SETFSUnixInfo"); 4623 cFYI(1, "In SETFSUnixInfo");
4605SETFSUnixRetry: 4624SETFSUnixRetry:
4606 /* BB switch to small buf init to save memory */ 4625 /* BB switch to small buf init to save memory */
4607 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4626 rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
4608 (void **) &pSMBr); 4627 (void **) &pSMB, (void **) &pSMBr);
4609 if (rc) 4628 if (rc)
4610 return rc; 4629 return rc;
4611 4630
diff --git a/fs/cifs/cn_cifs.h b/fs/cifs/cn_cifs.h
deleted file mode 100644
index ea59ccac2eb1..000000000000
--- a/fs/cifs/cn_cifs.h
+++ /dev/null
@@ -1,37 +0,0 @@
1/*
2 * fs/cifs/cn_cifs.h
3 *
4 * Copyright (c) International Business Machines Corp., 2002
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#ifndef _CN_CIFS_H
23#define _CN_CIFS_H
24#ifdef CONFIG_CIFS_UPCALL
25#include <linux/types.h>
26#include <linux/connector.h>
27
28struct cifs_upcall {
29 char signature[4]; /* CIFS */
30 enum command {
31 CIFS_GET_IP = 0x00000001, /* get ip address for hostname */
32 CIFS_GET_SECBLOB = 0x00000002, /* get SPNEGO wrapped blob */
33 } command;
34 /* union cifs upcall data follows */
35};
36#endif /* CIFS_UPCALL */
37#endif /* _CN_CIFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 88c84a38bccb..7e73176acb58 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -47,7 +47,6 @@
47#include "ntlmssp.h" 47#include "ntlmssp.h"
48#include "nterr.h" 48#include "nterr.h"
49#include "rfc1002pdu.h" 49#include "rfc1002pdu.h"
50#include "cn_cifs.h"
51#include "fscache.h" 50#include "fscache.h"
52 51
53#define CIFS_PORT 445 52#define CIFS_PORT 445
@@ -100,16 +99,24 @@ struct smb_vol {
100 bool noautotune:1; 99 bool noautotune:1;
101 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ 100 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
102 bool fsc:1; /* enable fscache */ 101 bool fsc:1; /* enable fscache */
102 bool mfsymlinks:1; /* use Minshall+French Symlinks */
103 bool multiuser:1;
103 unsigned int rsize; 104 unsigned int rsize;
104 unsigned int wsize; 105 unsigned int wsize;
105 bool sockopt_tcp_nodelay:1; 106 bool sockopt_tcp_nodelay:1;
106 unsigned short int port; 107 unsigned short int port;
107 char *prepath; 108 char *prepath;
109 struct sockaddr_storage srcaddr; /* allow binding to a local IP */
108 struct nls_table *local_nls; 110 struct nls_table *local_nls;
109}; 111};
110 112
113/* FIXME: should these be tunable? */
114#define TLINK_ERROR_EXPIRE (1 * HZ)
115#define TLINK_IDLE_EXPIRE (600 * HZ)
116
111static int ipv4_connect(struct TCP_Server_Info *server); 117static int ipv4_connect(struct TCP_Server_Info *server);
112static int ipv6_connect(struct TCP_Server_Info *server); 118static int ipv6_connect(struct TCP_Server_Info *server);
119static void cifs_prune_tlinks(struct work_struct *work);
113 120
114/* 121/*
115 * cifs tcp session reconnection 122 * cifs tcp session reconnection
@@ -143,7 +150,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
143 150
144 /* before reconnecting the tcp session, mark the smb session (uid) 151 /* before reconnecting the tcp session, mark the smb session (uid)
145 and the tid bad so they are not used until reconnected */ 152 and the tid bad so they are not used until reconnected */
146 read_lock(&cifs_tcp_ses_lock); 153 spin_lock(&cifs_tcp_ses_lock);
147 list_for_each(tmp, &server->smb_ses_list) { 154 list_for_each(tmp, &server->smb_ses_list) {
148 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 155 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
149 ses->need_reconnect = true; 156 ses->need_reconnect = true;
@@ -153,7 +160,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
153 tcon->need_reconnect = true; 160 tcon->need_reconnect = true;
154 } 161 }
155 } 162 }
156 read_unlock(&cifs_tcp_ses_lock); 163 spin_unlock(&cifs_tcp_ses_lock);
157 /* do not want to be sending data on a socket we are freeing */ 164 /* do not want to be sending data on a socket we are freeing */
158 mutex_lock(&server->srv_mutex); 165 mutex_lock(&server->srv_mutex);
159 if (server->ssocket) { 166 if (server->ssocket) {
@@ -166,6 +173,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
166 sock_release(server->ssocket); 173 sock_release(server->ssocket);
167 server->ssocket = NULL; 174 server->ssocket = NULL;
168 } 175 }
176 server->sequence_number = 0;
177 server->session_estab = false;
169 178
170 spin_lock(&GlobalMid_Lock); 179 spin_lock(&GlobalMid_Lock);
171 list_for_each(tmp, &server->pending_mid_q) { 180 list_for_each(tmp, &server->pending_mid_q) {
@@ -198,7 +207,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
198 spin_lock(&GlobalMid_Lock); 207 spin_lock(&GlobalMid_Lock);
199 if (server->tcpStatus != CifsExiting) 208 if (server->tcpStatus != CifsExiting)
200 server->tcpStatus = CifsGood; 209 server->tcpStatus = CifsGood;
201 server->sequence_number = 0;
202 spin_unlock(&GlobalMid_Lock); 210 spin_unlock(&GlobalMid_Lock);
203 /* atomic_set(&server->inFlight,0);*/ 211 /* atomic_set(&server->inFlight,0);*/
204 wake_up(&server->response_q); 212 wake_up(&server->response_q);
@@ -629,9 +637,9 @@ multi_t2_fnd:
629 } /* end while !EXITING */ 637 } /* end while !EXITING */
630 638
631 /* take it off the list, if it's not already */ 639 /* take it off the list, if it's not already */
632 write_lock(&cifs_tcp_ses_lock); 640 spin_lock(&cifs_tcp_ses_lock);
633 list_del_init(&server->tcp_ses_list); 641 list_del_init(&server->tcp_ses_list);
634 write_unlock(&cifs_tcp_ses_lock); 642 spin_unlock(&cifs_tcp_ses_lock);
635 643
636 spin_lock(&GlobalMid_Lock); 644 spin_lock(&GlobalMid_Lock);
637 server->tcpStatus = CifsExiting; 645 server->tcpStatus = CifsExiting;
@@ -669,7 +677,7 @@ multi_t2_fnd:
669 * BB: we shouldn't have to do any of this. It shouldn't be 677 * BB: we shouldn't have to do any of this. It shouldn't be
670 * possible to exit from the thread with active SMB sessions 678 * possible to exit from the thread with active SMB sessions
671 */ 679 */
672 read_lock(&cifs_tcp_ses_lock); 680 spin_lock(&cifs_tcp_ses_lock);
673 if (list_empty(&server->pending_mid_q)) { 681 if (list_empty(&server->pending_mid_q)) {
674 /* loop through server session structures attached to this and 682 /* loop through server session structures attached to this and
675 mark them dead */ 683 mark them dead */
@@ -679,7 +687,7 @@ multi_t2_fnd:
679 ses->status = CifsExiting; 687 ses->status = CifsExiting;
680 ses->server = NULL; 688 ses->server = NULL;
681 } 689 }
682 read_unlock(&cifs_tcp_ses_lock); 690 spin_unlock(&cifs_tcp_ses_lock);
683 } else { 691 } else {
684 /* although we can not zero the server struct pointer yet, 692 /* although we can not zero the server struct pointer yet,
685 since there are active requests which may depend on them, 693
@@ -702,7 +710,7 @@ multi_t2_fnd:
702 } 710 }
703 } 711 }
704 spin_unlock(&GlobalMid_Lock); 712 spin_unlock(&GlobalMid_Lock);
705 read_unlock(&cifs_tcp_ses_lock); 713 spin_unlock(&cifs_tcp_ses_lock);
706 /* 1/8th of sec is more than enough time for them to exit */ 714 /* 1/8th of sec is more than enough time for them to exit */
707 msleep(125); 715 msleep(125);
708 } 716 }
@@ -725,12 +733,12 @@ multi_t2_fnd:
725 if a crazy root user tried to kill cifsd 733 if a crazy root user tried to kill cifsd
726 kernel thread explicitly this might happen) */ 734 kernel thread explicitly this might happen) */
727 /* BB: This shouldn't be necessary, see above */ 735 /* BB: This shouldn't be necessary, see above */
728 read_lock(&cifs_tcp_ses_lock); 736 spin_lock(&cifs_tcp_ses_lock);
729 list_for_each(tmp, &server->smb_ses_list) { 737 list_for_each(tmp, &server->smb_ses_list) {
730 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 738 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
731 ses->server = NULL; 739 ses->server = NULL;
732 } 740 }
733 read_unlock(&cifs_tcp_ses_lock); 741 spin_unlock(&cifs_tcp_ses_lock);
734 742
735 kfree(server->hostname); 743 kfree(server->hostname);
736 task_to_wake = xchg(&server->tsk, NULL); 744 task_to_wake = xchg(&server->tsk, NULL);
@@ -1046,6 +1054,22 @@ cifs_parse_mount_options(char *options, const char *devname,
1046 "long\n"); 1054 "long\n");
1047 return 1; 1055 return 1;
1048 } 1056 }
1057 } else if (strnicmp(data, "srcaddr", 7) == 0) {
1058 vol->srcaddr.ss_family = AF_UNSPEC;
1059
1060 if (!value || !*value) {
1061 printk(KERN_WARNING "CIFS: srcaddr value"
1062 " not specified.\n");
1063 return 1; /* needs_arg; */
1064 }
1065 i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
1066 value, strlen(value));
1067 if (i < 0) {
1068 printk(KERN_WARNING "CIFS: Could not parse"
1069 " srcaddr: %s\n",
1070 value);
1071 return 1;
1072 }
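On the mount command line this surfaces as, e.g., -o srcaddr=192.168.1.50 or an IPv6 literal such as -o srcaddr=2001:db8::2 (addresses illustrative); cifs_convert_address() accepts either family.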
1049 } else if (strnicmp(data, "prefixpath", 10) == 0) { 1073 } else if (strnicmp(data, "prefixpath", 10) == 0) {
1050 if (!value || !*value) { 1074 if (!value || !*value) {
1051 printk(KERN_WARNING 1075 printk(KERN_WARNING
@@ -1325,6 +1349,10 @@ cifs_parse_mount_options(char *options, const char *devname,
1325 "/proc/fs/cifs/LookupCacheEnabled to 0\n"); 1349 "/proc/fs/cifs/LookupCacheEnabled to 0\n");
1326 } else if (strnicmp(data, "fsc", 3) == 0) { 1350 } else if (strnicmp(data, "fsc", 3) == 0) {
1327 vol->fsc = true; 1351 vol->fsc = true;
1352 } else if (strnicmp(data, "mfsymlinks", 10) == 0) {
1353 vol->mfsymlinks = true;
1354 } else if (strnicmp(data, "multiuser", 8) == 0) {
1355 vol->multiuser = true;
1328 } else 1356 } else
1329 printk(KERN_WARNING "CIFS: Unknown mount option %s\n", 1357 printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
1330 data); 1358 data);
@@ -1356,6 +1384,13 @@ cifs_parse_mount_options(char *options, const char *devname,
1356 return 1; 1384 return 1;
1357 } 1385 }
1358 } 1386 }
1387
1388 if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
1389 cERROR(1, "Multiuser mounts currently require krb5 "
1390 "authentication!");
1391 return 1;
1392 }
1393
1359 if (vol->UNCip == NULL) 1394 if (vol->UNCip == NULL)
1360 vol->UNCip = &vol->UNC[2]; 1395 vol->UNCip = &vol->UNC[2];
1361 1396
@@ -1374,8 +1409,36 @@ cifs_parse_mount_options(char *options, const char *devname,
1374 return 0; 1409 return 0;
1375} 1410}
1376 1411
1412/** Returns true if neither srcaddr nor rhs specifies an address,
1413 * or if srcaddr is specified and matches the IP address
1414 * of the rhs argument.
1415 */
1416static bool
1417srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
1418{
1419 switch (srcaddr->sa_family) {
1420 case AF_UNSPEC:
1421 return (rhs->sa_family == AF_UNSPEC);
1422 case AF_INET: {
1423 struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr;
1424 struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs;
1425 return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr);
1426 }
1427 case AF_INET6: {
1428 struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
1429 struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
1430 return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
1431 }
1432 default:
1433 WARN_ON(1);
1434 return false; /* don't expect to be here */
1435 }
1436}
1437
1438
1377static bool 1439static bool
1378match_address(struct TCP_Server_Info *server, struct sockaddr *addr) 1440match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
1441 struct sockaddr *srcaddr)
1379{ 1442{
1380 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; 1443 struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
1381 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; 1444 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
@@ -1402,6 +1465,9 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr)
1402 break; 1465 break;
1403 } 1466 }
1404 1467
1468 if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
1469 return false;
1470
1405 return true; 1471 return true;
1406} 1472}
1407 1473
@@ -1458,29 +1524,21 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1458{ 1524{
1459 struct TCP_Server_Info *server; 1525 struct TCP_Server_Info *server;
1460 1526
1461 write_lock(&cifs_tcp_ses_lock); 1527 spin_lock(&cifs_tcp_ses_lock);
1462 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { 1528 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
1463 /* 1529 if (!match_address(server, addr,
1464 * the demux thread can exit on its own while still in CifsNew 1530 (struct sockaddr *)&vol->srcaddr))
1465 * so don't accept any sockets in that state. Since the
1466 * tcpStatus never changes back to CifsNew it's safe to check
1467 * for this without a lock.
1468 */
1469 if (server->tcpStatus == CifsNew)
1470 continue;
1471
1472 if (!match_address(server, addr))
1473 continue; 1531 continue;
1474 1532
1475 if (!match_security(server, vol)) 1533 if (!match_security(server, vol))
1476 continue; 1534 continue;
1477 1535
1478 ++server->srv_count; 1536 ++server->srv_count;
1479 write_unlock(&cifs_tcp_ses_lock); 1537 spin_unlock(&cifs_tcp_ses_lock);
1480 cFYI(1, "Existing tcp session with server found"); 1538 cFYI(1, "Existing tcp session with server found");
1481 return server; 1539 return server;
1482 } 1540 }
1483 write_unlock(&cifs_tcp_ses_lock); 1541 spin_unlock(&cifs_tcp_ses_lock);
1484 return NULL; 1542 return NULL;
1485} 1543}
1486 1544
@@ -1489,14 +1547,14 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
1489{ 1547{
1490 struct task_struct *task; 1548 struct task_struct *task;
1491 1549
1492 write_lock(&cifs_tcp_ses_lock); 1550 spin_lock(&cifs_tcp_ses_lock);
1493 if (--server->srv_count > 0) { 1551 if (--server->srv_count > 0) {
1494 write_unlock(&cifs_tcp_ses_lock); 1552 spin_unlock(&cifs_tcp_ses_lock);
1495 return; 1553 return;
1496 } 1554 }
1497 1555
1498 list_del_init(&server->tcp_ses_list); 1556 list_del_init(&server->tcp_ses_list);
1499 write_unlock(&cifs_tcp_ses_lock); 1557 spin_unlock(&cifs_tcp_ses_lock);
1500 1558
1501 spin_lock(&GlobalMid_Lock); 1559 spin_lock(&GlobalMid_Lock);
1502 server->tcpStatus = CifsExiting; 1560 server->tcpStatus = CifsExiting;
@@ -1574,6 +1632,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1574 volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); 1632 volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1575 memcpy(tcp_ses->server_RFC1001_name, 1633 memcpy(tcp_ses->server_RFC1001_name,
1576 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); 1634 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1635 tcp_ses->session_estab = false;
1577 tcp_ses->sequence_number = 0; 1636 tcp_ses->sequence_number = 0;
1578 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); 1637 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
1579 INIT_LIST_HEAD(&tcp_ses->smb_ses_list); 1638 INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
@@ -1584,6 +1643,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1584 * no need to spinlock this init of tcpStatus or srv_count 1643 * no need to spinlock this init of tcpStatus or srv_count
1585 */ 1644 */
1586 tcp_ses->tcpStatus = CifsNew; 1645 tcp_ses->tcpStatus = CifsNew;
1646 memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
1647 sizeof(tcp_ses->srcaddr));
1587 ++tcp_ses->srv_count; 1648 ++tcp_ses->srv_count;
1588 1649
1589 if (addr.ss_family == AF_INET6) { 1650 if (addr.ss_family == AF_INET6) {
@@ -1618,9 +1679,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1618 } 1679 }
1619 1680
1620 /* thread spawned, put it on the list */ 1681 /* thread spawned, put it on the list */
1621 write_lock(&cifs_tcp_ses_lock); 1682 spin_lock(&cifs_tcp_ses_lock);
1622 list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); 1683 list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
1623 write_unlock(&cifs_tcp_ses_lock); 1684 spin_unlock(&cifs_tcp_ses_lock);
1624 1685
1625 cifs_fscache_get_client_cookie(tcp_ses); 1686 cifs_fscache_get_client_cookie(tcp_ses);
1626 1687
@@ -1642,7 +1703,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1642{ 1703{
1643 struct cifsSesInfo *ses; 1704 struct cifsSesInfo *ses;
1644 1705
1645 write_lock(&cifs_tcp_ses_lock); 1706 spin_lock(&cifs_tcp_ses_lock);
1646 list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { 1707 list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
1647 switch (server->secType) { 1708 switch (server->secType) {
1648 case Kerberos: 1709 case Kerberos:
@@ -1662,10 +1723,10 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1662 continue; 1723 continue;
1663 } 1724 }
1664 ++ses->ses_count; 1725 ++ses->ses_count;
1665 write_unlock(&cifs_tcp_ses_lock); 1726 spin_unlock(&cifs_tcp_ses_lock);
1666 return ses; 1727 return ses;
1667 } 1728 }
1668 write_unlock(&cifs_tcp_ses_lock); 1729 spin_unlock(&cifs_tcp_ses_lock);
1669 return NULL; 1730 return NULL;
1670} 1731}
1671 1732
@@ -1676,14 +1737,14 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1676 struct TCP_Server_Info *server = ses->server; 1737 struct TCP_Server_Info *server = ses->server;
1677 1738
1678 cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count); 1739 cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
1679 write_lock(&cifs_tcp_ses_lock); 1740 spin_lock(&cifs_tcp_ses_lock);
1680 if (--ses->ses_count > 0) { 1741 if (--ses->ses_count > 0) {
1681 write_unlock(&cifs_tcp_ses_lock); 1742 spin_unlock(&cifs_tcp_ses_lock);
1682 return; 1743 return;
1683 } 1744 }
1684 1745
1685 list_del_init(&ses->smb_ses_list); 1746 list_del_init(&ses->smb_ses_list);
1686 write_unlock(&cifs_tcp_ses_lock); 1747 spin_unlock(&cifs_tcp_ses_lock);
1687 1748
1688 if (ses->status == CifsGood) { 1749 if (ses->status == CifsGood) {
1689 xid = GetXid(); 1750 xid = GetXid();
@@ -1740,6 +1801,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1740 if (ses == NULL) 1801 if (ses == NULL)
1741 goto get_ses_fail; 1802 goto get_ses_fail;
1742 1803
1804 ses->tilen = 0;
1805 ses->tiblob = NULL;
1743 /* new SMB session uses our server ref */ 1806 /* new SMB session uses our server ref */
1744 ses->server = server; 1807 ses->server = server;
1745 if (server->addr.sockAddr6.sin6_family == AF_INET6) 1808 if (server->addr.sockAddr6.sin6_family == AF_INET6)
@@ -1778,9 +1841,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1778 goto get_ses_fail; 1841 goto get_ses_fail;
1779 1842
1780 /* success, put it on the list */ 1843 /* success, put it on the list */
1781 write_lock(&cifs_tcp_ses_lock); 1844 spin_lock(&cifs_tcp_ses_lock);
1782 list_add(&ses->smb_ses_list, &server->smb_ses_list); 1845 list_add(&ses->smb_ses_list, &server->smb_ses_list);
1783 write_unlock(&cifs_tcp_ses_lock); 1846 spin_unlock(&cifs_tcp_ses_lock);
1784 1847
1785 FreeXid(xid); 1848 FreeXid(xid);
1786 return ses; 1849 return ses;
@@ -1797,7 +1860,7 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1797 struct list_head *tmp; 1860 struct list_head *tmp;
1798 struct cifsTconInfo *tcon; 1861 struct cifsTconInfo *tcon;
1799 1862
1800 write_lock(&cifs_tcp_ses_lock); 1863 spin_lock(&cifs_tcp_ses_lock);
1801 list_for_each(tmp, &ses->tcon_list) { 1864 list_for_each(tmp, &ses->tcon_list) {
1802 tcon = list_entry(tmp, struct cifsTconInfo, tcon_list); 1865 tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
1803 if (tcon->tidStatus == CifsExiting) 1866 if (tcon->tidStatus == CifsExiting)
@@ -1806,10 +1869,10 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1806 continue; 1869 continue;
1807 1870
1808 ++tcon->tc_count; 1871 ++tcon->tc_count;
1809 write_unlock(&cifs_tcp_ses_lock); 1872 spin_unlock(&cifs_tcp_ses_lock);
1810 return tcon; 1873 return tcon;
1811 } 1874 }
1812 write_unlock(&cifs_tcp_ses_lock); 1875 spin_unlock(&cifs_tcp_ses_lock);
1813 return NULL; 1876 return NULL;
1814} 1877}
1815 1878
@@ -1820,14 +1883,14 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1820 struct cifsSesInfo *ses = tcon->ses; 1883 struct cifsSesInfo *ses = tcon->ses;
1821 1884
1822 cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count); 1885 cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
1823 write_lock(&cifs_tcp_ses_lock); 1886 spin_lock(&cifs_tcp_ses_lock);
1824 if (--tcon->tc_count > 0) { 1887 if (--tcon->tc_count > 0) {
1825 write_unlock(&cifs_tcp_ses_lock); 1888 spin_unlock(&cifs_tcp_ses_lock);
1826 return; 1889 return;
1827 } 1890 }
1828 1891
1829 list_del_init(&tcon->tcon_list); 1892 list_del_init(&tcon->tcon_list);
1830 write_unlock(&cifs_tcp_ses_lock); 1893 spin_unlock(&cifs_tcp_ses_lock);
1831 1894
1832 xid = GetXid(); 1895 xid = GetXid();
1833 CIFSSMBTDis(xid, tcon); 1896 CIFSSMBTDis(xid, tcon);
@@ -1900,9 +1963,9 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
1900 tcon->nocase = volume_info->nocase; 1963 tcon->nocase = volume_info->nocase;
1901 tcon->local_lease = volume_info->local_lease; 1964 tcon->local_lease = volume_info->local_lease;
1902 1965
1903 write_lock(&cifs_tcp_ses_lock); 1966 spin_lock(&cifs_tcp_ses_lock);
1904 list_add(&tcon->tcon_list, &ses->tcon_list); 1967 list_add(&tcon->tcon_list, &ses->tcon_list);
1905 write_unlock(&cifs_tcp_ses_lock); 1968 spin_unlock(&cifs_tcp_ses_lock);
1906 1969
1907 cifs_fscache_get_super_cookie(tcon); 1970 cifs_fscache_get_super_cookie(tcon);
1908 1971
@@ -1913,6 +1976,23 @@ out_fail:
1913 return ERR_PTR(rc); 1976 return ERR_PTR(rc);
1914} 1977}
1915 1978
1979void
1980cifs_put_tlink(struct tcon_link *tlink)
1981{
1982 if (!tlink || IS_ERR(tlink))
1983 return;
1984
1985 if (!atomic_dec_and_test(&tlink->tl_count) ||
1986 test_bit(TCON_LINK_IN_TREE, &tlink->tl_flags)) {
1987 tlink->tl_time = jiffies;
1988 return;
1989 }
1990
1991 if (!IS_ERR(tlink_tcon(tlink)))
1992 cifs_put_tcon(tlink_tcon(tlink));
1993 kfree(tlink);
1994 return;
1995}
1916 1996
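cifs_put_tlink() above encodes a two-condition release rule: a tcon_link is freed only when the last counted reference drops and the tree no longer holds it (TCON_LINK_IN_TREE clear); otherwise the put just stamps tl_time so the pruning job knows when the link went idle. The same shape in portable C11 atomics, as a sketch with hypothetical types (and without the real code's locking around the timestamp):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>
#include <time.h>

struct link {
	atomic_int count;
	atomic_bool in_tree;
	time_t last_used;
};

static void link_put(struct link *l)
{
	/* atomic_fetch_sub() returns the old value: != 1 means not last */
	if (atomic_fetch_sub(&l->count, 1) != 1 ||
	    atomic_load(&l->in_tree)) {
		l->last_used = time(NULL);	/* idle since now */
		return;
	}
	free(l);				/* last ref and unlinked */
}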
1917int 1997int
1918get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, 1998get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
@@ -1997,6 +2077,33 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
1997 2077
1998} 2078}
1999 2079
2080static int
2081bind_socket(struct TCP_Server_Info *server)
2082{
2083 int rc = 0;
2084 if (server->srcaddr.ss_family != AF_UNSPEC) {
2085 /* Bind to the specified local IP address */
2086 struct socket *socket = server->ssocket;
2087 rc = socket->ops->bind(socket,
2088 (struct sockaddr *) &server->srcaddr,
2089 sizeof(server->srcaddr));
2090 if (rc < 0) {
2091 struct sockaddr_in *saddr4;
2092 struct sockaddr_in6 *saddr6;
2093 saddr4 = (struct sockaddr_in *)&server->srcaddr;
2094 saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
2095 if (saddr6->sin6_family == AF_INET6)
2096 cERROR(1, "cifs: "
2097 "Failed to bind to: %pI6c, error: %d\n",
2098 &saddr6->sin6_addr, rc);
2099 else
2100 cERROR(1, "cifs: "
2101 "Failed to bind to: %pI4, error: %d\n",
2102 &saddr4->sin_addr.s_addr, rc);
2103 }
2104 }
2105 return rc;
2106}
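bind_socket() is new: when the user asked for a specific source address (carried in server->srcaddr from the srcaddr= option), the client socket is bound to it before the connect. The userspace analogue is the classic bind-before-connect idiom; a sketch with error handling trimmed and illustrative addresses:

#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

static int connect_from(const char *src, const char *dst, int port)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in saddr = { .sin_family = AF_INET };
	struct sockaddr_in daddr = { .sin_family = AF_INET,
				     .sin_port = htons(port) };

	inet_pton(AF_INET, src, &saddr.sin_addr);
	inet_pton(AF_INET, dst, &daddr.sin_addr);

	/* the bind step is the part bind_socket() adds */
	if (fd < 0 ||
	    bind(fd, (struct sockaddr *)&saddr, sizeof(saddr)) < 0 ||
	    connect(fd, (struct sockaddr *)&daddr, sizeof(daddr)) < 0) {
		if (fd >= 0)
			close(fd);
		return -1;
	}
	return fd;
}

Leaving sin_port zero in saddr lets the stack pick an ephemeral port, matching what the cifs code does with the caller-supplied source address.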
2000 2107
2001static int 2108static int
2002ipv4_connect(struct TCP_Server_Info *server) 2109ipv4_connect(struct TCP_Server_Info *server)
@@ -2022,6 +2129,10 @@ ipv4_connect(struct TCP_Server_Info *server)
2022 cifs_reclassify_socket4(socket); 2129 cifs_reclassify_socket4(socket);
2023 } 2130 }
2024 2131
2132 rc = bind_socket(server);
2133 if (rc < 0)
2134 return rc;
2135
2025 /* user overrode default port */ 2136 /* user overrode default port */
2026 if (server->addr.sockAddr.sin_port) { 2137 if (server->addr.sockAddr.sin_port) {
2027 rc = socket->ops->connect(socket, (struct sockaddr *) 2138 rc = socket->ops->connect(socket, (struct sockaddr *)
@@ -2184,6 +2295,10 @@ ipv6_connect(struct TCP_Server_Info *server)
2184 cifs_reclassify_socket6(socket); 2295 cifs_reclassify_socket6(socket);
2185 } 2296 }
2186 2297
2298 rc = bind_socket(server);
2299 if (rc < 0)
2300 return rc;
2301
2187 /* user overrode default port */ 2302 /* user overrode default port */
2188 if (server->addr.sockAddr6.sin6_port) { 2303 if (server->addr.sockAddr6.sin6_port) {
2189 rc = socket->ops->connect(socket, 2304 rc = socket->ops->connect(socket,
@@ -2383,6 +2498,8 @@ convert_delimiter(char *path, char delim)
2383static void setup_cifs_sb(struct smb_vol *pvolume_info, 2498static void setup_cifs_sb(struct smb_vol *pvolume_info,
2384 struct cifs_sb_info *cifs_sb) 2499 struct cifs_sb_info *cifs_sb)
2385{ 2500{
2501 INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
2502
2386 if (pvolume_info->rsize > CIFSMaxBufSize) { 2503 if (pvolume_info->rsize > CIFSMaxBufSize) {
2387 cERROR(1, "rsize %d too large, using MaxBufSize", 2504 cERROR(1, "rsize %d too large, using MaxBufSize",
2388 pvolume_info->rsize); 2505 pvolume_info->rsize);
@@ -2462,10 +2579,21 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2462 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; 2579 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
2463 if (pvolume_info->fsc) 2580 if (pvolume_info->fsc)
2464 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; 2581 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
2582 if (pvolume_info->multiuser)
2583 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
2584 CIFS_MOUNT_NO_PERM);
2465 if (pvolume_info->direct_io) { 2585 if (pvolume_info->direct_io) {
2466 cFYI(1, "mounting share using direct i/o"); 2586 cFYI(1, "mounting share using direct i/o");
2467 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2587 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
2468 } 2588 }
2589 if (pvolume_info->mfsymlinks) {
2590 if (pvolume_info->sfu_emul) {
2591 cERROR(1, "mount option mfsymlinks ignored if sfu "
2592 "mount option is used");
2593 } else {
2594 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
2595 }
2596 }
2469 2597
2470 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) 2598 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
2471 cERROR(1, "mount option dynperm ignored if cifsacl " 2599 cERROR(1, "mount option dynperm ignored if cifsacl "
@@ -2552,6 +2680,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2552 struct TCP_Server_Info *srvTcp; 2680 struct TCP_Server_Info *srvTcp;
2553 char *full_path; 2681 char *full_path;
2554 char *mount_data = mount_data_global; 2682 char *mount_data = mount_data_global;
2683 struct tcon_link *tlink;
2555#ifdef CONFIG_CIFS_DFS_UPCALL 2684#ifdef CONFIG_CIFS_DFS_UPCALL
2556 struct dfs_info3_param *referrals = NULL; 2685 struct dfs_info3_param *referrals = NULL;
2557 unsigned int num_referrals = 0; 2686 unsigned int num_referrals = 0;
@@ -2563,6 +2692,7 @@ try_mount_again:
2563 pSesInfo = NULL; 2692 pSesInfo = NULL;
2564 srvTcp = NULL; 2693 srvTcp = NULL;
2565 full_path = NULL; 2694 full_path = NULL;
2695 tlink = NULL;
2566 2696
2567 xid = GetXid(); 2697 xid = GetXid();
2568 2698
@@ -2638,8 +2768,6 @@ try_mount_again:
2638 goto remote_path_check; 2768 goto remote_path_check;
2639 } 2769 }
2640 2770
2641 cifs_sb->tcon = tcon;
2642
2643 /* do not care if following two calls succeed - informational */ 2771 /* do not care if following two calls succeed - informational */
2644 if (!tcon->ipc) { 2772 if (!tcon->ipc) {
2645 CIFSSMBQFSDeviceInfo(xid, tcon); 2773 CIFSSMBQFSDeviceInfo(xid, tcon);
@@ -2748,6 +2876,38 @@ remote_path_check:
2748#endif 2876#endif
2749 } 2877 }
2750 2878
2879 if (rc)
2880 goto mount_fail_check;
2881
2882 /* now, hang the tcon off of the superblock */
2883 tlink = kzalloc(sizeof *tlink, GFP_KERNEL);
2884 if (tlink == NULL) {
2885 rc = -ENOMEM;
2886 goto mount_fail_check;
2887 }
2888
2889 tlink->tl_index = pSesInfo->linux_uid;
2890 tlink->tl_tcon = tcon;
2891 tlink->tl_time = jiffies;
2892 set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
2893 set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
2894
2895 rc = radix_tree_preload(GFP_KERNEL);
2896 if (rc == -ENOMEM) {
2897 kfree(tlink);
2898 goto mount_fail_check;
2899 }
2900
2901 spin_lock(&cifs_sb->tlink_tree_lock);
2902 radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink);
2903 radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid,
2904 CIFS_TLINK_MASTER_TAG);
2905 spin_unlock(&cifs_sb->tlink_tree_lock);
2906 radix_tree_preload_end();
2907
2908 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
2909 TLINK_IDLE_EXPIRE);
2910
2751mount_fail_check: 2911mount_fail_check:
2752 /* on error free sesinfo and tcon struct if needed */ 2912 /* on error free sesinfo and tcon struct if needed */
2753 if (rc) { 2913 if (rc) {
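The block just added hangs the master tcon off the superblock as the first tcon_link in cifs_sb->tlink_tree. Note the ordering: radix_tree_preload() runs before the spinlock is taken, so the insertion never hits the allocator while the lock is held (it draws on the preloaded pool). That discipline, allocate first and lock only for the pointer surgery, in a freestanding sketch (toy linked list instead of a radix tree):

#include <pthread.h>
#include <stdlib.h>

struct node { unsigned long key; void *val; struct node *next; };

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *tree_head;

static int insert(unsigned long key, void *val)
{
	/* allocate before locking, as radix_tree_preload() arranges */
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return -1;
	n->key = key;
	n->val = val;

	pthread_mutex_lock(&tree_lock);
	n->next = tree_head;		/* lock covers only pointer updates */
	tree_head = n;
	pthread_mutex_unlock(&tree_lock);
	return 0;
}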
@@ -2825,14 +2985,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2825#ifdef CONFIG_CIFS_WEAK_PW_HASH 2985#ifdef CONFIG_CIFS_WEAK_PW_HASH
2826 if ((global_secflags & CIFSSEC_MAY_LANMAN) && 2986 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
2827 (ses->server->secType == LANMAN)) 2987 (ses->server->secType == LANMAN))
2828 calc_lanman_hash(tcon->password, ses->server->cryptKey, 2988 calc_lanman_hash(tcon->password, ses->cryptKey,
2829 ses->server->secMode & 2989 ses->server->secMode &
2830 SECMODE_PW_ENCRYPT ? true : false, 2990 SECMODE_PW_ENCRYPT ? true : false,
2831 bcc_ptr); 2991 bcc_ptr);
2832 else 2992 else
2833#endif /* CIFS_WEAK_PW_HASH */ 2993#endif /* CIFS_WEAK_PW_HASH */
2834 SMBNTencrypt(tcon->password, ses->server->cryptKey, 2994 SMBNTencrypt(tcon->password, ses->cryptKey, bcc_ptr);
2835 bcc_ptr);
2836 2995
2837 bcc_ptr += CIFS_SESS_KEY_SIZE; 2996 bcc_ptr += CIFS_SESS_KEY_SIZE;
2838 if (ses->capabilities & CAP_UNICODE) { 2997 if (ses->capabilities & CAP_UNICODE) {
@@ -2934,19 +3093,39 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2934int 3093int
2935cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) 3094cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
2936{ 3095{
2937 int rc = 0; 3096 int i, ret;
2938 char *tmp; 3097 char *tmp;
3098 struct tcon_link *tlink[8];
3099 unsigned long index = 0;
3100
3101 cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
3102
3103 do {
3104 spin_lock(&cifs_sb->tlink_tree_lock);
3105 ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
3106 (void **)tlink, index,
3107 ARRAY_SIZE(tlink));
3108 /* increment index for next pass */
3109 if (ret > 0)
3110 index = tlink[ret - 1]->tl_index + 1;
3111 for (i = 0; i < ret; i++) {
3112 cifs_get_tlink(tlink[i]);
3113 clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
3114 radix_tree_delete(&cifs_sb->tlink_tree,
3115 tlink[i]->tl_index);
3116 }
3117 spin_unlock(&cifs_sb->tlink_tree_lock);
2939 3118
2940 if (cifs_sb->tcon) 3119 for (i = 0; i < ret; i++)
2941 cifs_put_tcon(cifs_sb->tcon); 3120 cifs_put_tlink(tlink[i]);
3121 } while (ret != 0);
2942 3122
2943 cifs_sb->tcon = NULL;
2944 tmp = cifs_sb->prepath; 3123 tmp = cifs_sb->prepath;
2945 cifs_sb->prepathlen = 0; 3124 cifs_sb->prepathlen = 0;
2946 cifs_sb->prepath = NULL; 3125 cifs_sb->prepath = NULL;
2947 kfree(tmp); 3126 kfree(tmp);
2948 3127
2949 return rc; 3128 return 0;
2950} 3129}
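cifs_umount() now drains the tlink tree instead of putting a single tcon. The loop shape matters: radix_tree_gang_lookup() returns at most ARRAY_SIZE(tlink) entries per pass, the index advances past the last key seen, entries are unlinked under tlink_tree_lock, and the actual puts happen only after the lock is dropped (cifs_put_tlink can issue a tree disconnect on the wire). A compilable toy version of the batch-drain pattern over a flat table:

#include <pthread.h>
#include <stdlib.h>

#define BATCH 8
#define NKEYS 64

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static void *slots[NKEYS];		/* toy stand-in for the radix tree */

static int gang_lookup(unsigned long *out, unsigned long start, int max)
{
	int n = 0;

	for (unsigned long k = start; k < NKEYS && n < max; k++)
		if (slots[k])
			out[n++] = k;
	return n;
}

static void drain_tree(void)
{
	unsigned long keys[BATCH], index = 0;
	void *victims[BATCH];
	int i, ret;

	do {
		pthread_mutex_lock(&tree_lock);
		ret = gang_lookup(keys, index, BATCH);
		if (ret > 0)			/* resume after the last hit */
			index = keys[ret - 1] + 1;
		for (i = 0; i < ret; i++) {	/* unlink under the lock */
			victims[i] = slots[keys[i]];
			slots[keys[i]] = NULL;
		}
		pthread_mutex_unlock(&tree_lock);

		for (i = 0; i < ret; i++)	/* release with no locks held */
			free(victims[i]);
	} while (ret != 0);
}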
2951 3130
2952int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses) 3131int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
@@ -2997,6 +3176,15 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
2997 if (rc) { 3176 if (rc) {
2998 cERROR(1, "Send error in SessSetup = %d", rc); 3177 cERROR(1, "Send error in SessSetup = %d", rc);
2999 } else { 3178 } else {
3179 mutex_lock(&ses->server->srv_mutex);
3180 if (!server->session_estab) {
3181 memcpy(&server->session_key.data,
3182 &ses->auth_key.data, ses->auth_key.len);
3183 server->session_key.len = ses->auth_key.len;
3184 ses->server->session_estab = true;
3185 }
3186 mutex_unlock(&server->srv_mutex);
3187
3000 cFYI(1, "CIFS Session Established successfully"); 3188 cFYI(1, "CIFS Session Established successfully");
3001 spin_lock(&GlobalMid_Lock); 3189 spin_lock(&GlobalMid_Lock);
3002 ses->status = CifsGood; 3190 ses->status = CifsGood;
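The new block in cifs_setup_session() makes the first session that completes authentication donate its auth key as the per-socket signing key (session_key); later sessions multiplexed onto the same TCP connection leave it untouched, which is what SMB signing expects. Stripped to its shape, it is a set-once-under-a-mutex update; a sketch with illustrative names, assuming len never exceeds the key buffer:

#include <pthread.h>
#include <string.h>

struct server {
	pthread_mutex_t mtx;
	unsigned char key[16];		/* assumed large enough for len */
	size_t key_len;
	int established;
};

static void adopt_session_key(struct server *srv,
			      const unsigned char *key, size_t len)
{
	pthread_mutex_lock(&srv->mtx);
	if (!srv->established) {	/* only the first session wins */
		memcpy(srv->key, key, len);
		srv->key_len = len;
		srv->established = 1;
	}
	pthread_mutex_unlock(&srv->mtx);
}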
@@ -3007,3 +3195,237 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
3007 return rc; 3195 return rc;
3008} 3196}
3009 3197
3198static struct cifsTconInfo *
3199cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
3200{
3201 struct cifsTconInfo *master_tcon = cifs_sb_master_tcon(cifs_sb);
3202 struct cifsSesInfo *ses;
3203 struct cifsTconInfo *tcon = NULL;
3204 struct smb_vol *vol_info;
3205 char username[MAX_USERNAME_SIZE + 1];
3206
3207 vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
3208 if (vol_info == NULL) {
3209 tcon = ERR_PTR(-ENOMEM);
3210 goto out;
3211 }
3212
3213 snprintf(username, MAX_USERNAME_SIZE, "krb50x%x", fsuid);
3214 vol_info->username = username;
3215 vol_info->local_nls = cifs_sb->local_nls;
3216 vol_info->linux_uid = fsuid;
3217 vol_info->cred_uid = fsuid;
3218 vol_info->UNC = master_tcon->treeName;
3219 vol_info->retry = master_tcon->retry;
3220 vol_info->nocase = master_tcon->nocase;
3221 vol_info->local_lease = master_tcon->local_lease;
3222 vol_info->no_linux_ext = !master_tcon->unix_ext;
3223
3224 /* FIXME: allow for other secFlg settings */
3225 vol_info->secFlg = CIFSSEC_MUST_KRB5;
3226
3227 /* get a reference for the same TCP session */
3228 spin_lock(&cifs_tcp_ses_lock);
3229 ++master_tcon->ses->server->srv_count;
3230 spin_unlock(&cifs_tcp_ses_lock);
3231
3232 ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
3233 if (IS_ERR(ses)) {
3234 tcon = (struct cifsTconInfo *)ses;
3235 cifs_put_tcp_session(master_tcon->ses->server);
3236 goto out;
3237 }
3238
3239 tcon = cifs_get_tcon(ses, vol_info);
3240 if (IS_ERR(tcon)) {
3241 cifs_put_smb_ses(ses);
3242 goto out;
3243 }
3244
3245 if (ses->capabilities & CAP_UNIX)
3246 reset_cifs_unix_caps(0, tcon, NULL, vol_info);
3247out:
3248 kfree(vol_info);
3249
3250 return tcon;
3251}
3252
3253static struct tcon_link *
3254cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
3255{
3256 struct tcon_link *tlink;
3257 unsigned int ret;
3258
3259 spin_lock(&cifs_sb->tlink_tree_lock);
3260 ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink,
3261 0, 1, CIFS_TLINK_MASTER_TAG);
3262 spin_unlock(&cifs_sb->tlink_tree_lock);
3263
3264 /* the master tcon should always be present */
3265 if (ret == 0)
3266 BUG();
3267
3268 return tlink;
3269}
3270
3271struct cifsTconInfo *
3272cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
3273{
3274 return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
3275}
3276
3277static int
3278cifs_sb_tcon_pending_wait(void *unused)
3279{
3280 schedule();
3281 return signal_pending(current) ? -ERESTARTSYS : 0;
3282}
3283
3284/*
3285 * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
3286 * current task.
3287 *
3288 * If the superblock doesn't refer to a multiuser mount, then just return
3289 * the master tcon for the mount.
3290 *
3291 * First, search the radix tree for an existing tcon for this fsuid. If one
3292 * exists, then check to see if it's pending construction. If it is then wait
3293 * for construction to complete. Once it's no longer pending, check to see if
3294 * it failed and either return an error or retry construction, depending on
3295 * the timeout.
3296 *
3297 * If one doesn't exist then insert a new tcon_link struct into the tree and
3298 * try to construct a new one.
3299 */
3300struct tcon_link *
3301cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
3302{
3303 int ret;
3304 unsigned long fsuid = (unsigned long) current_fsuid();
3305 struct tcon_link *tlink, *newtlink;
3306
3307 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
3308 return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
3309
3310 spin_lock(&cifs_sb->tlink_tree_lock);
3311 tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
3312 if (tlink)
3313 cifs_get_tlink(tlink);
3314 spin_unlock(&cifs_sb->tlink_tree_lock);
3315
3316 if (tlink == NULL) {
3317 newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
3318 if (newtlink == NULL)
3319 return ERR_PTR(-ENOMEM);
3320 newtlink->tl_index = fsuid;
3321 newtlink->tl_tcon = ERR_PTR(-EACCES);
3322 set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
3323 set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
3324 cifs_get_tlink(newtlink);
3325
3326 ret = radix_tree_preload(GFP_KERNEL);
3327 if (ret != 0) {
3328 kfree(newtlink);
3329 return ERR_PTR(ret);
3330 }
3331
3332 spin_lock(&cifs_sb->tlink_tree_lock);
3333 /* was one inserted after previous search? */
3334 tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid);
3335 if (tlink) {
3336 cifs_get_tlink(tlink);
3337 spin_unlock(&cifs_sb->tlink_tree_lock);
3338 radix_tree_preload_end();
3339 kfree(newtlink);
3340 goto wait_for_construction;
3341 }
3342 ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink);
3343 spin_unlock(&cifs_sb->tlink_tree_lock);
3344 radix_tree_preload_end();
3345 if (ret) {
3346 kfree(newtlink);
3347 return ERR_PTR(ret);
3348 }
3349 tlink = newtlink;
3350 } else {
3351wait_for_construction:
3352 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
3353 cifs_sb_tcon_pending_wait,
3354 TASK_INTERRUPTIBLE);
3355 if (ret) {
3356 cifs_put_tlink(tlink);
3357 return ERR_PTR(ret);
3358 }
3359
3360 /* if it's good, return it */
3361 if (!IS_ERR(tlink->tl_tcon))
3362 return tlink;
3363
3364 /* return error if we tried this already recently */
3365 if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) {
3366 cifs_put_tlink(tlink);
3367 return ERR_PTR(-EACCES);
3368 }
3369
3370 if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags))
3371 goto wait_for_construction;
3372 }
3373
3374 tlink->tl_tcon = cifs_construct_tcon(cifs_sb, fsuid);
3375 clear_bit(TCON_LINK_PENDING, &tlink->tl_flags);
3376 wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING);
3377
3378 if (IS_ERR(tlink->tl_tcon)) {
3379 cifs_put_tlink(tlink);
3380 return ERR_PTR(-EACCES);
3381 }
3382
3383 return tlink;
3384}
3385
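cifs_sb_tlink() is a lookup-or-become-the-builder pattern: whoever inserts the placeholder first sets TCON_LINK_PENDING, drops the lock, performs the slow construction, then clears the bit and wakes waiters; everyone else finds the placeholder and sleeps in wait_on_bit(). A condensed pthread rendering of the same idea, using a single-slot cache and a dummy constructor in place of cifs_construct_tcon (error handling trimmed):

#include <pthread.h>
#include <stdlib.h>

struct slot {
	void *obj;			/* NULL while under construction */
	int pending;
};

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static struct slot *cache;		/* one entry, for brevity */

static void *slow_build(void)		/* stands in for cifs_construct_tcon */
{
	return malloc(1);
}

static void *get_or_build(void)
{
	struct slot *s;

	pthread_mutex_lock(&lk);
	if (!cache) {
		s = calloc(1, sizeof(*s));
		s->pending = 1;		/* we are the builder */
		cache = s;
		pthread_mutex_unlock(&lk);

		void *obj = slow_build();	/* no lock held here */

		pthread_mutex_lock(&lk);
		s->obj = obj;
		s->pending = 0;
		pthread_cond_broadcast(&cv);	/* wake_up_bit() analogue */
	} else {
		s = cache;
		while (s->pending)		/* wait_on_bit() analogue */
			pthread_cond_wait(&cv, &lk);
	}
	pthread_mutex_unlock(&lk);
	return s->obj;
}

The real function adds two wrinkles the sketch omits: a failed construction is cached as an ERR_PTR and only retried after TLINK_ERROR_EXPIRE, and the retry path re-claims the PENDING bit with test_and_set_bit().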
3386/*
3387 * periodic workqueue job that scans tcon_tree for a superblock and closes
3388 * out tcons.
3389 */
3390static void
3391cifs_prune_tlinks(struct work_struct *work)
3392{
3393 struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
3394 prune_tlinks.work);
3395 struct tcon_link *tlink[8];
3396 unsigned long now = jiffies;
3397 unsigned long index = 0;
3398 int i, ret;
3399
3400 do {
3401 spin_lock(&cifs_sb->tlink_tree_lock);
3402 ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree,
3403 (void **)tlink, index,
3404 ARRAY_SIZE(tlink));
3405 /* increment index for next pass */
3406 if (ret > 0)
3407 index = tlink[ret - 1]->tl_index + 1;
3408 for (i = 0; i < ret; i++) {
3409 if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) ||
3410 atomic_read(&tlink[i]->tl_count) != 0 ||
3411 time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE,
3412 now)) {
3413 tlink[i] = NULL;
3414 continue;
3415 }
3416 cifs_get_tlink(tlink[i]);
3417 clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
3418 radix_tree_delete(&cifs_sb->tlink_tree,
3419 tlink[i]->tl_index);
3420 }
3421 spin_unlock(&cifs_sb->tlink_tree_lock);
3422
3423 for (i = 0; i < ret; i++) {
3424 if (tlink[i] != NULL)
3425 cifs_put_tlink(tlink[i]);
3426 }
3427 } while (ret != 0);
3428
3429 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
3430 TLINK_IDLE_EXPIRE);
3431}
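cifs_prune_tlinks() reuses the batch walk from cifs_umount() but adds the reap predicate: skip the master, skip anything still referenced, and skip anything used within TLINK_IDLE_EXPIRE. The freshness check rides on time_after(), which stays correct across jiffies wraparound because it compares via signed subtraction; a freestanding equivalent:

#include <stdbool.h>

static bool tick_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;	/* true iff a is later than b */
}

/* reapable: idle for longer than `expire` ticks */
static bool reapable(unsigned long now, unsigned long last_used,
		     unsigned long expire)
{
	return tick_after(now, last_used + expire);
}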
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index f9ed0751cc12..3840eddbfb7a 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -54,18 +54,18 @@ build_path_from_dentry(struct dentry *direntry)
54 int dfsplen; 54 int dfsplen;
55 char *full_path; 55 char *full_path;
56 char dirsep; 56 char dirsep;
57 struct cifs_sb_info *cifs_sb; 57 struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
58 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
58 59
59 if (direntry == NULL) 60 if (direntry == NULL)
60 return NULL; /* not much we can do if dentry is freed and 61 return NULL; /* not much we can do if dentry is freed and
61 we need to reopen the file after it was closed implicitly 62 we need to reopen the file after it was closed implicitly
62 when the server crashed */ 63 when the server crashed */
63 64
64 cifs_sb = CIFS_SB(direntry->d_sb);
65 dirsep = CIFS_DIR_SEP(cifs_sb); 65 dirsep = CIFS_DIR_SEP(cifs_sb);
66 pplen = cifs_sb->prepathlen; 66 pplen = cifs_sb->prepathlen;
67 if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS)) 67 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
68 dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1); 68 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
69 else 69 else
70 dfsplen = 0; 70 dfsplen = 0;
71cifs_bp_rename_retry: 71cifs_bp_rename_retry:
@@ -117,7 +117,7 @@ cifs_bp_rename_retry:
117 /* BB test paths to Windows with '/' in the midst of prepath */ 117 /* BB test paths to Windows with '/' in the midst of prepath */
118 118
119 if (dfsplen) { 119 if (dfsplen) {
120 strncpy(full_path, cifs_sb->tcon->treeName, dfsplen); 120 strncpy(full_path, tcon->treeName, dfsplen);
121 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { 121 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
122 int i; 122 int i;
123 for (i = 0; i < dfsplen; i++) { 123 for (i = 0; i < dfsplen; i++) {
@@ -130,135 +130,6 @@ cifs_bp_rename_retry:
130 return full_path; 130 return full_path;
131} 131}
132 132
133struct cifsFileInfo *
134cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
135 struct file *file, struct vfsmount *mnt, unsigned int oflags)
136{
137 int oplock = 0;
138 struct cifsFileInfo *pCifsFile;
139 struct cifsInodeInfo *pCifsInode;
140 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
141
142 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
143 if (pCifsFile == NULL)
144 return pCifsFile;
145
146 if (oplockEnabled)
147 oplock = REQ_OPLOCK;
148
149 pCifsFile->netfid = fileHandle;
150 pCifsFile->pid = current->tgid;
151 pCifsFile->pInode = igrab(newinode);
152 pCifsFile->mnt = mnt;
153 pCifsFile->pfile = file;
154 pCifsFile->invalidHandle = false;
155 pCifsFile->closePend = false;
156 mutex_init(&pCifsFile->fh_mutex);
157 mutex_init(&pCifsFile->lock_mutex);
158 INIT_LIST_HEAD(&pCifsFile->llist);
159 atomic_set(&pCifsFile->count, 1);
160 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
161
162 write_lock(&GlobalSMBSeslock);
163 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
164 pCifsInode = CIFS_I(newinode);
165 if (pCifsInode) {
166 /* if readable file instance put first in list*/
167 if (oflags & FMODE_READ)
168 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
169 else
170 list_add_tail(&pCifsFile->flist,
171 &pCifsInode->openFileList);
172
173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
174 pCifsInode->clientCanCacheAll = true;
175 pCifsInode->clientCanCacheRead = true;
176 cFYI(1, "Exclusive Oplock inode %p", newinode);
177 } else if ((oplock & 0xF) == OPLOCK_READ)
178 pCifsInode->clientCanCacheRead = true;
179 }
180 write_unlock(&GlobalSMBSeslock);
181
182 file->private_data = pCifsFile;
183
184 return pCifsFile;
185}
186
187int cifs_posix_open(char *full_path, struct inode **pinode,
188 struct super_block *sb, int mode, int oflags,
189 __u32 *poplock, __u16 *pnetfid, int xid)
190{
191 int rc;
192 FILE_UNIX_BASIC_INFO *presp_data;
193 __u32 posix_flags = 0;
194 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
195 struct cifs_fattr fattr;
196
197 cFYI(1, "posix open %s", full_path);
198
199 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
200 if (presp_data == NULL)
201 return -ENOMEM;
202
203/* So far cifs posix extensions can only map the following flags.
204 There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
205 so far we do not seem to need them, and we can treat them as local only */
206 if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
207 (FMODE_READ | FMODE_WRITE))
208 posix_flags = SMB_O_RDWR;
209 else if (oflags & FMODE_READ)
210 posix_flags = SMB_O_RDONLY;
211 else if (oflags & FMODE_WRITE)
212 posix_flags = SMB_O_WRONLY;
213 if (oflags & O_CREAT)
214 posix_flags |= SMB_O_CREAT;
215 if (oflags & O_EXCL)
216 posix_flags |= SMB_O_EXCL;
217 if (oflags & O_TRUNC)
218 posix_flags |= SMB_O_TRUNC;
219 /* be safe and imply O_SYNC for O_DSYNC */
220 if (oflags & O_DSYNC)
221 posix_flags |= SMB_O_SYNC;
222 if (oflags & O_DIRECTORY)
223 posix_flags |= SMB_O_DIRECTORY;
224 if (oflags & O_NOFOLLOW)
225 posix_flags |= SMB_O_NOFOLLOW;
226 if (oflags & O_DIRECT)
227 posix_flags |= SMB_O_DIRECT;
228
229 mode &= ~current_umask();
230 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
231 pnetfid, presp_data, poplock, full_path,
232 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
233 CIFS_MOUNT_MAP_SPECIAL_CHR);
234 if (rc)
235 goto posix_open_ret;
236
237 if (presp_data->Type == cpu_to_le32(-1))
238 goto posix_open_ret; /* open ok, caller does qpathinfo */
239
240 if (!pinode)
241 goto posix_open_ret; /* caller does not need info */
242
243 cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
244
245 /* get new inode and set it up */
246 if (*pinode == NULL) {
247 cifs_fill_uniqueid(sb, &fattr);
248 *pinode = cifs_iget(sb, &fattr);
249 if (!*pinode) {
250 rc = -ENOMEM;
251 goto posix_open_ret;
252 }
253 } else {
254 cifs_fattr_to_inode(*pinode, &fattr);
255 }
256
257posix_open_ret:
258 kfree(presp_data);
259 return rc;
260}
261
262static void setup_cifs_dentry(struct cifsTconInfo *tcon, 133static void setup_cifs_dentry(struct cifsTconInfo *tcon,
263 struct dentry *direntry, 134 struct dentry *direntry,
264 struct inode *newinode) 135 struct inode *newinode)
@@ -291,6 +162,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
291 int desiredAccess = GENERIC_READ | GENERIC_WRITE; 162 int desiredAccess = GENERIC_READ | GENERIC_WRITE;
292 __u16 fileHandle; 163 __u16 fileHandle;
293 struct cifs_sb_info *cifs_sb; 164 struct cifs_sb_info *cifs_sb;
165 struct tcon_link *tlink;
294 struct cifsTconInfo *tcon; 166 struct cifsTconInfo *tcon;
295 char *full_path = NULL; 167 char *full_path = NULL;
296 FILE_ALL_INFO *buf = NULL; 168 FILE_ALL_INFO *buf = NULL;
@@ -300,21 +172,26 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
300 xid = GetXid(); 172 xid = GetXid();
301 173
302 cifs_sb = CIFS_SB(inode->i_sb); 174 cifs_sb = CIFS_SB(inode->i_sb);
303 tcon = cifs_sb->tcon; 175 tlink = cifs_sb_tlink(cifs_sb);
304 176 if (IS_ERR(tlink)) {
305 full_path = build_path_from_dentry(direntry); 177 FreeXid(xid);
306 if (full_path == NULL) { 178 return PTR_ERR(tlink);
307 rc = -ENOMEM;
308 goto cifs_create_out;
309 } 179 }
180 tcon = tlink_tcon(tlink);
310 181
311 if (oplockEnabled) 182 if (oplockEnabled)
312 oplock = REQ_OPLOCK; 183 oplock = REQ_OPLOCK;
313 184
314 if (nd && (nd->flags & LOOKUP_OPEN)) 185 if (nd && (nd->flags & LOOKUP_OPEN))
315 oflags = nd->intent.open.flags; 186 oflags = nd->intent.open.file->f_flags;
316 else 187 else
317 oflags = FMODE_READ | SMB_O_CREAT; 188 oflags = O_RDONLY | O_CREAT;
189
190 full_path = build_path_from_dentry(direntry);
191 if (full_path == NULL) {
192 rc = -ENOMEM;
193 goto cifs_create_out;
194 }
318 195
319 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 196 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
320 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 197 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
@@ -344,9 +221,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
344 /* if the file is going to stay open, then we 221 /* if the file is going to stay open, then we
345 need to set the desired access properly */ 222 need to set the desired access properly */
346 desiredAccess = 0; 223 desiredAccess = 0;
347 if (oflags & FMODE_READ) 224 if (OPEN_FMODE(oflags) & FMODE_READ)
348 desiredAccess |= GENERIC_READ; /* is this too little? */ 225 desiredAccess |= GENERIC_READ; /* is this too little? */
349 if (oflags & FMODE_WRITE) 226 if (OPEN_FMODE(oflags) & FMODE_WRITE)
350 desiredAccess |= GENERIC_WRITE; 227 desiredAccess |= GENERIC_WRITE;
351 228
352 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) 229 if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
@@ -375,7 +252,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
375 if (!tcon->unix_ext && (mode & S_IWUGO) == 0) 252 if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
376 create_options |= CREATE_OPTION_READONLY; 253 create_options |= CREATE_OPTION_READONLY;
377 254
378 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 255 if (tcon->ses->capabilities & CAP_NT_SMBS)
379 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 256 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
380 desiredAccess, create_options, 257 desiredAccess, create_options,
381 &fileHandle, &oplock, buf, cifs_sb->local_nls, 258 &fileHandle, &oplock, buf, cifs_sb->local_nls,
@@ -467,8 +344,7 @@ cifs_create_set_dentry:
467 goto cifs_create_out; 344 goto cifs_create_out;
468 } 345 }
469 346
470 pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp, 347 pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock);
471 nd->path.mnt, oflags);
472 if (pfile_info == NULL) { 348 if (pfile_info == NULL) {
473 fput(filp); 349 fput(filp);
474 CIFSSMBClose(xid, tcon, fileHandle); 350 CIFSSMBClose(xid, tcon, fileHandle);
@@ -481,6 +357,7 @@ cifs_create_set_dentry:
481cifs_create_out: 357cifs_create_out:
482 kfree(buf); 358 kfree(buf);
483 kfree(full_path); 359 kfree(full_path);
360 cifs_put_tlink(tlink);
484 FreeXid(xid); 361 FreeXid(xid);
485 return rc; 362 return rc;
486} 363}
@@ -491,6 +368,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
491 int rc = -EPERM; 368 int rc = -EPERM;
492 int xid; 369 int xid;
493 struct cifs_sb_info *cifs_sb; 370 struct cifs_sb_info *cifs_sb;
371 struct tcon_link *tlink;
494 struct cifsTconInfo *pTcon; 372 struct cifsTconInfo *pTcon;
495 char *full_path = NULL; 373 char *full_path = NULL;
496 struct inode *newinode = NULL; 374 struct inode *newinode = NULL;
@@ -503,10 +381,14 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
503 if (!old_valid_dev(device_number)) 381 if (!old_valid_dev(device_number))
504 return -EINVAL; 382 return -EINVAL;
505 383
506 xid = GetXid();
507
508 cifs_sb = CIFS_SB(inode->i_sb); 384 cifs_sb = CIFS_SB(inode->i_sb);
509 pTcon = cifs_sb->tcon; 385 tlink = cifs_sb_tlink(cifs_sb);
386 if (IS_ERR(tlink))
387 return PTR_ERR(tlink);
388
389 pTcon = tlink_tcon(tlink);
390
391 xid = GetXid();
510 392
511 full_path = build_path_from_dentry(direntry); 393 full_path = build_path_from_dentry(direntry);
512 if (full_path == NULL) { 394 if (full_path == NULL) {
@@ -606,6 +488,7 @@ mknod_out:
606 kfree(full_path); 488 kfree(full_path);
607 kfree(buf); 489 kfree(buf);
608 FreeXid(xid); 490 FreeXid(xid);
491 cifs_put_tlink(tlink);
609 return rc; 492 return rc;
610} 493}
611 494
@@ -619,6 +502,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
619 __u16 fileHandle = 0; 502 __u16 fileHandle = 0;
620 bool posix_open = false; 503 bool posix_open = false;
621 struct cifs_sb_info *cifs_sb; 504 struct cifs_sb_info *cifs_sb;
505 struct tcon_link *tlink;
622 struct cifsTconInfo *pTcon; 506 struct cifsTconInfo *pTcon;
623 struct cifsFileInfo *cfile; 507 struct cifsFileInfo *cfile;
624 struct inode *newInode = NULL; 508 struct inode *newInode = NULL;
@@ -633,7 +517,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
633 /* check whether path exists */ 517 /* check whether path exists */
634 518
635 cifs_sb = CIFS_SB(parent_dir_inode->i_sb); 519 cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
636 pTcon = cifs_sb->tcon; 520 tlink = cifs_sb_tlink(cifs_sb);
521 if (IS_ERR(tlink)) {
522 FreeXid(xid);
523 return (struct dentry *)tlink;
524 }
525 pTcon = tlink_tcon(tlink);
637 526
638 /* 527 /*
639 * Don't allow the separator character in a path component. 528 * Don't allow the separator character in a path component.
@@ -644,8 +533,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
644 for (i = 0; i < direntry->d_name.len; i++) 533 for (i = 0; i < direntry->d_name.len; i++)
645 if (direntry->d_name.name[i] == '\\') { 534 if (direntry->d_name.name[i] == '\\') {
646 cFYI(1, "Invalid file name"); 535 cFYI(1, "Invalid file name");
647 FreeXid(xid); 536 rc = -EINVAL;
648 return ERR_PTR(-EINVAL); 537 goto lookup_out;
649 } 538 }
650 } 539 }
651 540
@@ -655,7 +544,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
655 */ 544 */
656 if (nd && (nd->flags & LOOKUP_EXCL)) { 545 if (nd && (nd->flags & LOOKUP_EXCL)) {
657 d_instantiate(direntry, NULL); 546 d_instantiate(direntry, NULL);
658 return NULL; 547 rc = 0;
548 goto lookup_out;
659 } 549 }
660 550
661 /* can not grab the rename sem here since it would 551 /* can not grab the rename sem here since it would
@@ -663,8 +553,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
663 in which we already have the sb rename sem */ 553 in which we already have the sb rename sem */
664 full_path = build_path_from_dentry(direntry); 554 full_path = build_path_from_dentry(direntry);
665 if (full_path == NULL) { 555 if (full_path == NULL) {
666 FreeXid(xid); 556 rc = -ENOMEM;
667 return ERR_PTR(-ENOMEM); 557 goto lookup_out;
668 } 558 }
669 559
670 if (direntry->d_inode != NULL) { 560 if (direntry->d_inode != NULL) {
@@ -687,11 +577,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
687 if (pTcon->unix_ext) { 577 if (pTcon->unix_ext) {
688 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && 578 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
689 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 579 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
690 (nd->intent.open.flags & O_CREAT)) { 580 (nd->intent.open.file->f_flags & O_CREAT)) {
691 rc = cifs_posix_open(full_path, &newInode, 581 rc = cifs_posix_open(full_path, &newInode,
692 parent_dir_inode->i_sb, 582 parent_dir_inode->i_sb,
693 nd->intent.open.create_mode, 583 nd->intent.open.create_mode,
694 nd->intent.open.flags, &oplock, 584 nd->intent.open.file->f_flags, &oplock,
695 &fileHandle, xid); 585 &fileHandle, xid);
696 /* 586 /*
697 * The check below works around a bug in POSIX 587 * The check below works around a bug in POSIX
@@ -727,9 +617,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
727 goto lookup_out; 617 goto lookup_out;
728 } 618 }
729 619
730 cfile = cifs_new_fileinfo(newInode, fileHandle, filp, 620 cfile = cifs_new_fileinfo(fileHandle, filp, tlink,
731 nd->path.mnt, 621 oplock);
732 nd->intent.open.flags);
733 if (cfile == NULL) { 622 if (cfile == NULL) {
734 fput(filp); 623 fput(filp);
735 CIFSSMBClose(xid, pTcon, fileHandle); 624 CIFSSMBClose(xid, pTcon, fileHandle);
@@ -759,6 +648,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
759 648
760lookup_out: 649lookup_out:
761 kfree(full_path); 650 kfree(full_path);
651 cifs_put_tlink(tlink);
762 FreeXid(xid); 652 FreeXid(xid);
763 return ERR_PTR(rc); 653 return ERR_PTR(rc);
764} 654}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index de748c652d11..45af003865d2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -60,34 +60,32 @@ static inline int cifs_convert_flags(unsigned int flags)
60 FILE_READ_DATA); 60 FILE_READ_DATA);
61} 61}
62 62
63static inline fmode_t cifs_posix_convert_flags(unsigned int flags) 63static u32 cifs_posix_convert_flags(unsigned int flags)
64{ 64{
65 fmode_t posix_flags = 0; 65 u32 posix_flags = 0;
66 66
67 if ((flags & O_ACCMODE) == O_RDONLY) 67 if ((flags & O_ACCMODE) == O_RDONLY)
68 posix_flags = FMODE_READ; 68 posix_flags = SMB_O_RDONLY;
69 else if ((flags & O_ACCMODE) == O_WRONLY) 69 else if ((flags & O_ACCMODE) == O_WRONLY)
70 posix_flags = FMODE_WRITE; 70 posix_flags = SMB_O_WRONLY;
71 else if ((flags & O_ACCMODE) == O_RDWR) { 71 else if ((flags & O_ACCMODE) == O_RDWR)
72 /* GENERIC_ALL is too much permission to request 72 posix_flags = SMB_O_RDWR;
73 can cause unnecessary access denied on create */ 73
74 /* return GENERIC_ALL; */ 74 if (flags & O_CREAT)
75 posix_flags = FMODE_READ | FMODE_WRITE; 75 posix_flags |= SMB_O_CREAT;
76 } 76 if (flags & O_EXCL)
77 /* can not map O_CREAT or O_EXCL or O_TRUNC flags when 77 posix_flags |= SMB_O_EXCL;
78 reopening a file. They had their effect on the original open */ 78 if (flags & O_TRUNC)
79 if (flags & O_APPEND) 79 posix_flags |= SMB_O_TRUNC;
80 posix_flags |= (fmode_t)O_APPEND; 80 /* be safe and imply O_SYNC for O_DSYNC */
81 if (flags & O_DSYNC) 81 if (flags & O_DSYNC)
82 posix_flags |= (fmode_t)O_DSYNC; 82 posix_flags |= SMB_O_SYNC;
83 if (flags & __O_SYNC)
84 posix_flags |= (fmode_t)__O_SYNC;
85 if (flags & O_DIRECTORY) 83 if (flags & O_DIRECTORY)
86 posix_flags |= (fmode_t)O_DIRECTORY; 84 posix_flags |= SMB_O_DIRECTORY;
87 if (flags & O_NOFOLLOW) 85 if (flags & O_NOFOLLOW)
88 posix_flags |= (fmode_t)O_NOFOLLOW; 86 posix_flags |= SMB_O_NOFOLLOW;
89 if (flags & O_DIRECT) 87 if (flags & O_DIRECT)
90 posix_flags |= (fmode_t)O_DIRECT; 88 posix_flags |= SMB_O_DIRECT;
91 89
92 return posix_flags; 90 return posix_flags;
93} 91}
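The helper now returns wire-format SMB_O_* bits instead of a local fmode_t, which is what lets both cifs_posix_open() callers and the reopen path hand f_flags straight through. A tiny self-check of the access-mode branch; the SMB_O_* values are inlined here for a standalone build and should be read as illustrative (the real ones live in cifspdu.h):

#include <assert.h>
#include <fcntl.h>

#define SMB_O_RDONLY	0x1
#define SMB_O_WRONLY	0x2
#define SMB_O_RDWR	0x4

static unsigned int accmode_to_smb(unsigned int flags)
{
	switch (flags & O_ACCMODE) {
	case O_RDONLY:	return SMB_O_RDONLY;
	case O_WRONLY:	return SMB_O_WRONLY;
	default:	return SMB_O_RDWR;
	}
}

int main(void)
{
	assert(accmode_to_smb(O_RDONLY | O_CREAT) == SMB_O_RDONLY);
	assert(accmode_to_smb(O_RDWR) == SMB_O_RDWR);
	return 0;
}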
@@ -106,66 +104,8 @@ static inline int cifs_get_disposition(unsigned int flags)
106 return FILE_OPEN; 104 return FILE_OPEN;
107} 105}
108 106
109/* all arguments to this function must be checked for validity in caller */
110static inline int
111cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
112 struct cifsInodeInfo *pCifsInode, __u32 oplock,
113 u16 netfid)
114{
115
116 write_lock(&GlobalSMBSeslock);
117
118 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
119 if (pCifsInode == NULL) {
120 write_unlock(&GlobalSMBSeslock);
121 return -EINVAL;
122 }
123
124 if (pCifsInode->clientCanCacheRead) {
125 /* we have the inode open somewhere else
126 no need to discard cache data */
127 goto psx_client_can_cache;
128 }
129
130 /* BB FIXME need to fix this check to move it earlier into posix_open
131 BB fIX following section BB FIXME */
132
133 /* if not oplocked, invalidate inode pages if mtime or file
134 size changed */
135/* temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
136 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
137 (file->f_path.dentry->d_inode->i_size ==
138 (loff_t)le64_to_cpu(buf->EndOfFile))) {
139 cFYI(1, "inode unchanged on server");
140 } else {
141 if (file->f_path.dentry->d_inode->i_mapping) {
142 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
143 if (rc != 0)
144 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
145 }
146 cFYI(1, "invalidating remote inode since open detected it "
147 "changed");
148 invalidate_remote_inode(file->f_path.dentry->d_inode);
149 } */
150
151psx_client_can_cache:
152 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
153 pCifsInode->clientCanCacheAll = true;
154 pCifsInode->clientCanCacheRead = true;
155 cFYI(1, "Exclusive Oplock granted on inode %p",
156 file->f_path.dentry->d_inode);
157 } else if ((oplock & 0xF) == OPLOCK_READ)
158 pCifsInode->clientCanCacheRead = true;
159
160 /* will have to change the unlock if we reenable the
161 filemap_fdatawrite (which does not seem necessary */
162 write_unlock(&GlobalSMBSeslock);
163 return 0;
164}
165
166/* all arguments to this function must be checked for validity in caller */
167static inline int cifs_open_inode_helper(struct inode *inode, 107static inline int cifs_open_inode_helper(struct inode *inode,
168 struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf, 108 struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
169 char *full_path, int xid) 109 char *full_path, int xid)
170{ 110{
171 struct cifsInodeInfo *pCifsInode = CIFS_I(inode); 111 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
@@ -207,16 +147,175 @@ client_can_cache:
207 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, 147 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
208 xid, NULL); 148 xid, NULL);
209 149
210 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) { 150 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
211 pCifsInode->clientCanCacheAll = true; 151 pCifsInode->clientCanCacheAll = true;
212 pCifsInode->clientCanCacheRead = true; 152 pCifsInode->clientCanCacheRead = true;
213 cFYI(1, "Exclusive Oplock granted on inode %p", inode); 153 cFYI(1, "Exclusive Oplock granted on inode %p", inode);
214 } else if ((*oplock & 0xF) == OPLOCK_READ) 154 } else if ((oplock & 0xF) == OPLOCK_READ)
215 pCifsInode->clientCanCacheRead = true; 155 pCifsInode->clientCanCacheRead = true;
216 156
217 return rc; 157 return rc;
218} 158}
219 159
160int cifs_posix_open(char *full_path, struct inode **pinode,
161 struct super_block *sb, int mode, unsigned int f_flags,
162 __u32 *poplock, __u16 *pnetfid, int xid)
163{
164 int rc;
165 FILE_UNIX_BASIC_INFO *presp_data;
166 __u32 posix_flags = 0;
167 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
168 struct cifs_fattr fattr;
169 struct tcon_link *tlink;
170 struct cifsTconInfo *tcon;
171
172 cFYI(1, "posix open %s", full_path);
173
174 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
175 if (presp_data == NULL)
176 return -ENOMEM;
177
178 tlink = cifs_sb_tlink(cifs_sb);
179 if (IS_ERR(tlink)) {
180 rc = PTR_ERR(tlink);
181 goto posix_open_ret;
182 }
183
184 tcon = tlink_tcon(tlink);
185 mode &= ~current_umask();
186
187 posix_flags = cifs_posix_convert_flags(f_flags);
188 rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
189 poplock, full_path, cifs_sb->local_nls,
190 cifs_sb->mnt_cifs_flags &
191 CIFS_MOUNT_MAP_SPECIAL_CHR);
192 cifs_put_tlink(tlink);
193
194 if (rc)
195 goto posix_open_ret;
196
197 if (presp_data->Type == cpu_to_le32(-1))
198 goto posix_open_ret; /* open ok, caller does qpathinfo */
199
200 if (!pinode)
201 goto posix_open_ret; /* caller does not need info */
202
203 cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
204
205 /* get new inode and set it up */
206 if (*pinode == NULL) {
207 cifs_fill_uniqueid(sb, &fattr);
208 *pinode = cifs_iget(sb, &fattr);
209 if (!*pinode) {
210 rc = -ENOMEM;
211 goto posix_open_ret;
212 }
213 } else {
214 cifs_fattr_to_inode(*pinode, &fattr);
215 }
216
217posix_open_ret:
218 kfree(presp_data);
219 return rc;
220}
221
222struct cifsFileInfo *
223cifs_new_fileinfo(__u16 fileHandle, struct file *file,
224 struct tcon_link *tlink, __u32 oplock)
225{
226 struct dentry *dentry = file->f_path.dentry;
227 struct inode *inode = dentry->d_inode;
228 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
229 struct cifsFileInfo *pCifsFile;
230
231 pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
232 if (pCifsFile == NULL)
233 return pCifsFile;
234
235 pCifsFile->netfid = fileHandle;
236 pCifsFile->pid = current->tgid;
237 pCifsFile->uid = current_fsuid();
238 pCifsFile->dentry = dget(dentry);
239 pCifsFile->f_flags = file->f_flags;
240 pCifsFile->invalidHandle = false;
241 pCifsFile->tlink = cifs_get_tlink(tlink);
242 mutex_init(&pCifsFile->fh_mutex);
243 mutex_init(&pCifsFile->lock_mutex);
244 INIT_LIST_HEAD(&pCifsFile->llist);
245 atomic_set(&pCifsFile->count, 1);
246 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
247
248 spin_lock(&cifs_file_list_lock);
249 list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
250 /* if readable file instance put first in list*/
251 if (file->f_mode & FMODE_READ)
252 list_add(&pCifsFile->flist, &pCifsInode->openFileList);
253 else
254 list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
255 spin_unlock(&cifs_file_list_lock);
256
257 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
258 pCifsInode->clientCanCacheAll = true;
259 pCifsInode->clientCanCacheRead = true;
260 cFYI(1, "Exclusive Oplock inode %p", inode);
261 } else if ((oplock & 0xF) == OPLOCK_READ)
262 pCifsInode->clientCanCacheRead = true;
263
264 file->private_data = pCifsFile;
265 return pCifsFile;
266}
267
268/*
269 * Release a reference on the file private data. This may involve closing
270 * the filehandle out on the server.
271 */
272void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
273{
274 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
275 struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode);
276 struct cifsLockInfo *li, *tmp;
277
278 spin_lock(&cifs_file_list_lock);
279 if (!atomic_dec_and_test(&cifs_file->count)) {
280 spin_unlock(&cifs_file_list_lock);
281 return;
282 }
283
284 /* remove it from the lists */
285 list_del(&cifs_file->flist);
286 list_del(&cifs_file->tlist);
287
288 if (list_empty(&cifsi->openFileList)) {
289 cFYI(1, "closing last open instance for inode %p",
290 cifs_file->dentry->d_inode);
291 cifsi->clientCanCacheRead = false;
292 cifsi->clientCanCacheAll = false;
293 }
294 spin_unlock(&cifs_file_list_lock);
295
296 if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
297 int xid, rc;
298
299 xid = GetXid();
300 rc = CIFSSMBClose(xid, tcon, cifs_file->netfid);
301 FreeXid(xid);
302 }
303
304 /* Delete any outstanding lock records. We'll lose them when the file
305 * is closed anyway.
306 */
307 mutex_lock(&cifs_file->lock_mutex);
308 list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
309 list_del(&li->llist);
310 kfree(li);
311 }
312 mutex_unlock(&cifs_file->lock_mutex);
313
314 cifs_put_tlink(cifs_file->tlink);
315 dput(cifs_file->dentry);
316 kfree(cifs_file);
317}
318
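cifsFileInfo_put() above pairs with the relocated cifs_new_fileinfo(): the open file now pins a dentry and a tlink instead of a vfsmount, and the final put unhooks the file from both lists under cifs_file_list_lock before doing the slow work (server-side close, lock record teardown) unlocked. One detail worth isolating is how both functions fold the server's oplock grant into the two inode cache booleans; as a sketch, with the OPLOCK_* level values assumed rather than quoted from cifspdu.h:

#include <stdbool.h>

enum {				/* SMB oplock levels; values assumed */
	OPLOCK_NONE = 0,
	OPLOCK_EXCLUSIVE = 1,
	OPLOCK_BATCH = 2,
	OPLOCK_READ = 3,	/* a.k.a. level II */
};

struct cache_state { bool can_cache_all, can_cache_read; };

static struct cache_state oplock_to_cache(unsigned int oplock)
{
	struct cache_state cs = { false, false };

	switch (oplock & 0xF) {
	case OPLOCK_EXCLUSIVE:		/* may cache reads and writes */
		cs.can_cache_all = true;
		/* fall through */
	case OPLOCK_READ:		/* may cache reads only */
		cs.can_cache_read = true;
	}
	return cs;
}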
220int cifs_open(struct inode *inode, struct file *file) 319int cifs_open(struct inode *inode, struct file *file)
221{ 320{
222 int rc = -EACCES; 321 int rc = -EACCES;
@@ -224,6 +323,7 @@ int cifs_open(struct inode *inode, struct file *file)
224 __u32 oplock; 323 __u32 oplock;
225 struct cifs_sb_info *cifs_sb; 324 struct cifs_sb_info *cifs_sb;
226 struct cifsTconInfo *tcon; 325 struct cifsTconInfo *tcon;
326 struct tcon_link *tlink;
227 struct cifsFileInfo *pCifsFile = NULL; 327 struct cifsFileInfo *pCifsFile = NULL;
228 struct cifsInodeInfo *pCifsInode; 328 struct cifsInodeInfo *pCifsInode;
229 char *full_path = NULL; 329 char *full_path = NULL;
@@ -235,7 +335,12 @@ int cifs_open(struct inode *inode, struct file *file)
235 xid = GetXid(); 335 xid = GetXid();
236 336
237 cifs_sb = CIFS_SB(inode->i_sb); 337 cifs_sb = CIFS_SB(inode->i_sb);
238 tcon = cifs_sb->tcon; 338 tlink = cifs_sb_tlink(cifs_sb);
339 if (IS_ERR(tlink)) {
340 FreeXid(xid);
341 return PTR_ERR(tlink);
342 }
343 tcon = tlink_tcon(tlink);
239 344
240 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 345 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
241 346
@@ -257,27 +362,15 @@ int cifs_open(struct inode *inode, struct file *file)
257 (tcon->ses->capabilities & CAP_UNIX) && 362 (tcon->ses->capabilities & CAP_UNIX) &&
258 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 363 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
259 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 364 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
260 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
261 oflags |= SMB_O_CREAT;
262 /* can not refresh inode info since size could be stale */ 365 /* can not refresh inode info since size could be stale */
263 rc = cifs_posix_open(full_path, &inode, inode->i_sb, 366 rc = cifs_posix_open(full_path, &inode, inode->i_sb,
264 cifs_sb->mnt_file_mode /* ignored */, 367 cifs_sb->mnt_file_mode /* ignored */,
265 oflags, &oplock, &netfid, xid); 368 file->f_flags, &oplock, &netfid, xid);
266 if (rc == 0) { 369 if (rc == 0) {
267 cFYI(1, "posix open succeeded"); 370 cFYI(1, "posix open succeeded");
268 /* no need for special case handling of setting mode
269 on read only files needed here */
270 371
271 rc = cifs_posix_open_inode_helper(inode, file, 372 pCifsFile = cifs_new_fileinfo(netfid, file, tlink,
272 pCifsInode, oplock, netfid); 373 oplock);
273 if (rc != 0) {
274 CIFSSMBClose(xid, tcon, netfid);
275 goto out;
276 }
277
278 pCifsFile = cifs_new_fileinfo(inode, netfid, file,
279 file->f_path.mnt,
280 oflags);
281 if (pCifsFile == NULL) { 374 if (pCifsFile == NULL) {
282 CIFSSMBClose(xid, tcon, netfid); 375 CIFSSMBClose(xid, tcon, netfid);
283 rc = -ENOMEM; 376 rc = -ENOMEM;
@@ -345,7 +438,7 @@ int cifs_open(struct inode *inode, struct file *file)
345 goto out; 438 goto out;
346 } 439 }
347 440
348 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 441 if (tcon->ses->capabilities & CAP_NT_SMBS)
349 rc = CIFSSMBOpen(xid, tcon, full_path, disposition, 442 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
350 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf, 443 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
351 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags 444 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
@@ -365,12 +458,11 @@ int cifs_open(struct inode *inode, struct file *file)
365 goto out; 458 goto out;
366 } 459 }
367 460
368 rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid); 461 rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid);
369 if (rc != 0) 462 if (rc != 0)
370 goto out; 463 goto out;
371 464
372 pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt, 465 pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
373 file->f_flags);
374 if (pCifsFile == NULL) { 466 if (pCifsFile == NULL) {
375 rc = -ENOMEM; 467 rc = -ENOMEM;
376 goto out; 468 goto out;
@@ -402,6 +494,7 @@ out:
402 kfree(buf); 494 kfree(buf);
403 kfree(full_path); 495 kfree(full_path);
404 FreeXid(xid); 496 FreeXid(xid);
497 cifs_put_tlink(tlink);
405 return rc; 498 return rc;
406} 499}
407 500
@@ -416,14 +509,13 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
416 return rc; 509 return rc;
417} 510}
418 511
419static int cifs_reopen_file(struct file *file, bool can_flush) 512static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
420{ 513{
421 int rc = -EACCES; 514 int rc = -EACCES;
422 int xid; 515 int xid;
423 __u32 oplock; 516 __u32 oplock;
424 struct cifs_sb_info *cifs_sb; 517 struct cifs_sb_info *cifs_sb;
425 struct cifsTconInfo *tcon; 518 struct cifsTconInfo *tcon;
426 struct cifsFileInfo *pCifsFile;
427 struct cifsInodeInfo *pCifsInode; 519 struct cifsInodeInfo *pCifsInode;
428 struct inode *inode; 520 struct inode *inode;
429 char *full_path = NULL; 521 char *full_path = NULL;
@@ -431,11 +523,6 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
431 int disposition = FILE_OPEN; 523 int disposition = FILE_OPEN;
432 __u16 netfid; 524 __u16 netfid;
433 525
434 if (file->private_data)
435 pCifsFile = file->private_data;
436 else
437 return -EBADF;
438
439 xid = GetXid(); 526 xid = GetXid();
440 mutex_lock(&pCifsFile->fh_mutex); 527 mutex_lock(&pCifsFile->fh_mutex);
441 if (!pCifsFile->invalidHandle) { 528 if (!pCifsFile->invalidHandle) {
@@ -445,39 +532,24 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
445 return rc; 532 return rc;
446 } 533 }
447 534
448 if (file->f_path.dentry == NULL) { 535 inode = pCifsFile->dentry->d_inode;
449 cERROR(1, "no valid name if dentry freed");
450 dump_stack();
451 rc = -EBADF;
452 goto reopen_error_exit;
453 }
454
455 inode = file->f_path.dentry->d_inode;
456 if (inode == NULL) {
457 cERROR(1, "inode not valid");
458 dump_stack();
459 rc = -EBADF;
460 goto reopen_error_exit;
461 }
462
463 cifs_sb = CIFS_SB(inode->i_sb); 536 cifs_sb = CIFS_SB(inode->i_sb);
464 tcon = cifs_sb->tcon; 537 tcon = tlink_tcon(pCifsFile->tlink);
465 538
466/* can not grab rename sem here because various ops, including 539/* can not grab rename sem here because various ops, including
467 those that already have the rename sem can end up causing writepage 540 those that already have the rename sem can end up causing writepage
468 to get called and if the server was down that means we end up here, 541 to get called and if the server was down that means we end up here,
469 and we can never tell if the caller already has the rename_sem */ 542 and we can never tell if the caller already has the rename_sem */
470 full_path = build_path_from_dentry(file->f_path.dentry); 543 full_path = build_path_from_dentry(pCifsFile->dentry);
471 if (full_path == NULL) { 544 if (full_path == NULL) {
472 rc = -ENOMEM; 545 rc = -ENOMEM;
473reopen_error_exit:
474 mutex_unlock(&pCifsFile->fh_mutex); 546 mutex_unlock(&pCifsFile->fh_mutex);
475 FreeXid(xid); 547 FreeXid(xid);
476 return rc; 548 return rc;
477 } 549 }
478 550
479 cFYI(1, "inode = 0x%p file flags 0x%x for %s", 551 cFYI(1, "inode = 0x%p file flags 0x%x for %s",
480 inode, file->f_flags, full_path); 552 inode, pCifsFile->f_flags, full_path);
481 553
482 if (oplockEnabled) 554 if (oplockEnabled)
483 oplock = REQ_OPLOCK; 555 oplock = REQ_OPLOCK;
@@ -487,8 +559,14 @@ reopen_error_exit:
487 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 559 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
488 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 560 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
489 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 561 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
490 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 562
491 /* can not refresh inode info since size could be stale */ 563 /*
564 * O_CREAT, O_EXCL and O_TRUNC already had their effect on the
565 * original open. Must mask them off for a reopen.
566 */
567 unsigned int oflags = pCifsFile->f_flags &
568 ~(O_CREAT | O_EXCL | O_TRUNC);
569
492 rc = cifs_posix_open(full_path, NULL, inode->i_sb, 570 rc = cifs_posix_open(full_path, NULL, inode->i_sb,
493 cifs_sb->mnt_file_mode /* ignored */, 571 cifs_sb->mnt_file_mode /* ignored */,
494 oflags, &oplock, &netfid, xid); 572 oflags, &oplock, &netfid, xid);
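The comment and mask just added are the heart of this hunk: a reopen after reconnect must not replay creation-time semantics, so O_CREAT, O_EXCL and O_TRUNC are stripped from the stored f_flags before the POSIX open is reissued. As a one-line helper (hypothetical name):

#include <fcntl.h>

static inline unsigned int reopen_flags(unsigned int f_flags)
{
	return f_flags & ~(O_CREAT | O_EXCL | O_TRUNC);
}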
@@ -500,7 +578,7 @@ reopen_error_exit:
500 in the reconnect path it is important to retry hard */ 578 in the reconnect path it is important to retry hard */
501 } 579 }
502 580
503 desiredAccess = cifs_convert_flags(file->f_flags); 581 desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
504 582
505 /* Can not refresh inode by passing in file_info buf to be returned 583 /* Can not refresh inode by passing in file_info buf to be returned
506 by SMBOpen and then calling get_inode_info with returned buf 584 by SMBOpen and then calling get_inode_info with returned buf
@@ -516,49 +594,50 @@ reopen_error_exit:
516 mutex_unlock(&pCifsFile->fh_mutex); 594 mutex_unlock(&pCifsFile->fh_mutex);
517 cFYI(1, "cifs_open returned 0x%x", rc); 595 cFYI(1, "cifs_open returned 0x%x", rc);
518 cFYI(1, "oplock: %d", oplock); 596 cFYI(1, "oplock: %d", oplock);
519 } else { 597 goto reopen_error_exit;
598 }
599
520reopen_success: 600reopen_success:
521 pCifsFile->netfid = netfid; 601 pCifsFile->netfid = netfid;
522 pCifsFile->invalidHandle = false; 602 pCifsFile->invalidHandle = false;
523 mutex_unlock(&pCifsFile->fh_mutex); 603 mutex_unlock(&pCifsFile->fh_mutex);
524 pCifsInode = CIFS_I(inode); 604 pCifsInode = CIFS_I(inode);
525 if (pCifsInode) { 605
526 if (can_flush) { 606 if (can_flush) {
527 rc = filemap_write_and_wait(inode->i_mapping); 607 rc = filemap_write_and_wait(inode->i_mapping);
528 if (rc != 0) 608 if (rc != 0)
529 CIFS_I(inode)->write_behind_rc = rc; 609 CIFS_I(inode)->write_behind_rc = rc;
530 /* temporarily disable caching while we 610
531 go to server to get inode info */ 611 pCifsInode->clientCanCacheAll = false;
532 pCifsInode->clientCanCacheAll = false; 612 pCifsInode->clientCanCacheRead = false;
533 pCifsInode->clientCanCacheRead = false; 613 if (tcon->unix_ext)
534 if (tcon->unix_ext) 614 rc = cifs_get_inode_info_unix(&inode,
535 rc = cifs_get_inode_info_unix(&inode, 615 full_path, inode->i_sb, xid);
536 full_path, inode->i_sb, xid); 616 else
537 else 617 rc = cifs_get_inode_info(&inode,
538 rc = cifs_get_inode_info(&inode, 618 full_path, NULL, inode->i_sb,
539 full_path, NULL, inode->i_sb, 619 xid, NULL);
540 xid, NULL); 620 } /* else we are writing out data to server already
541 } /* else we are writing out data to server already 621 and could deadlock if we tried to flush data, and
542 and could deadlock if we tried to flush data, and 622 since we do not know if we have data that would
543 since we do not know if we have data that would 623 invalidate the current end of file on the server
544 invalidate the current end of file on the server 624 we can not go to the server to get the new inode
545 we can not go to the server to get the new inode 625 info */
546 info */ 626 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
547 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 627 pCifsInode->clientCanCacheAll = true;
548 pCifsInode->clientCanCacheAll = true; 628 pCifsInode->clientCanCacheRead = true;
549 pCifsInode->clientCanCacheRead = true; 629 cFYI(1, "Exclusive Oplock granted on inode %p",
550 cFYI(1, "Exclusive Oplock granted on inode %p", 630 pCifsFile->dentry->d_inode);
551 file->f_path.dentry->d_inode); 631 } else if ((oplock & 0xF) == OPLOCK_READ) {
552 } else if ((oplock & 0xF) == OPLOCK_READ) { 632 pCifsInode->clientCanCacheRead = true;
553 pCifsInode->clientCanCacheRead = true; 633 pCifsInode->clientCanCacheAll = false;
554 pCifsInode->clientCanCacheAll = false; 634 } else {
555 } else { 635 pCifsInode->clientCanCacheRead = false;
556 pCifsInode->clientCanCacheRead = false; 636 pCifsInode->clientCanCacheAll = false;
557 pCifsInode->clientCanCacheAll = false;
558 }
559 cifs_relock_file(pCifsFile);
560 }
561 } 637 }
638 cifs_relock_file(pCifsFile);
639
640reopen_error_exit:
562 kfree(full_path); 641 kfree(full_path);
563 FreeXid(xid); 642 FreeXid(xid);
564 return rc; 643 return rc;
@@ -566,79 +645,11 @@ reopen_success:
566 645
567int cifs_close(struct inode *inode, struct file *file) 646int cifs_close(struct inode *inode, struct file *file)
568{ 647{
569 int rc = 0; 648 cifsFileInfo_put(file->private_data);
570 int xid, timeout; 649 file->private_data = NULL;
571 struct cifs_sb_info *cifs_sb;
572 struct cifsTconInfo *pTcon;
573 struct cifsFileInfo *pSMBFile = file->private_data;
574
575 xid = GetXid();
576
577 cifs_sb = CIFS_SB(inode->i_sb);
578 pTcon = cifs_sb->tcon;
579 if (pSMBFile) {
580 struct cifsLockInfo *li, *tmp;
581 write_lock(&GlobalSMBSeslock);
582 pSMBFile->closePend = true;
583 if (pTcon) {
584 /* no sense reconnecting to close a file that is
585 already closed */
586 if (!pTcon->need_reconnect) {
587 write_unlock(&GlobalSMBSeslock);
588 timeout = 2;
589 while ((atomic_read(&pSMBFile->count) != 1)
590 && (timeout <= 2048)) {
591 /* Give write a better chance to get to
592 server ahead of the close. We do not
593 want to add a wait_q here as it would
594 increase the memory utilization as
595 the struct would be in each open file,
596 but this should give enough time to
597 clear the socket */
598 cFYI(DBG2, "close delay, write pending");
599 msleep(timeout);
600 timeout *= 4;
601 }
602 if (!pTcon->need_reconnect &&
603 !pSMBFile->invalidHandle)
604 rc = CIFSSMBClose(xid, pTcon,
605 pSMBFile->netfid);
606 } else
607 write_unlock(&GlobalSMBSeslock);
608 } else
609 write_unlock(&GlobalSMBSeslock);
610
611 /* Delete any outstanding lock records.
612 We'll lose them when the file is closed anyway. */
613 mutex_lock(&pSMBFile->lock_mutex);
614 list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
615 list_del(&li->llist);
616 kfree(li);
617 }
618 mutex_unlock(&pSMBFile->lock_mutex);
619 650
620 write_lock(&GlobalSMBSeslock); 651 /* return code from the ->release op is always ignored */
621 list_del(&pSMBFile->flist); 652 return 0;
622 list_del(&pSMBFile->tlist);
623 write_unlock(&GlobalSMBSeslock);
624 cifsFileInfo_put(file->private_data);
625 file->private_data = NULL;
626 } else
627 rc = -EBADF;
628
629 read_lock(&GlobalSMBSeslock);
630 if (list_empty(&(CIFS_I(inode)->openFileList))) {
631 cFYI(1, "closing last open instance for inode %p", inode);
632 /* if the file is not open we do not know if we can cache info
633 on this inode, much less write behind and read ahead */
634 CIFS_I(inode)->clientCanCacheRead = false;
635 CIFS_I(inode)->clientCanCacheAll = false;
636 }
637 read_unlock(&GlobalSMBSeslock);
638 if ((rc == 0) && CIFS_I(inode)->write_behind_rc)
639 rc = CIFS_I(inode)->write_behind_rc;
640 FreeXid(xid);
641 return rc;
642} 653}
643 654
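
cifs_close() shrinks to a single cifsFileInfo_put() because teardown is now driven entirely by the handle's reference count: whichever user drops the last reference does the on-the-wire close and list removal. A single-threaded toy model of that shape, assuming handle_destroy() stands in for the real CIFSSMBClose()-plus-cleanup (the kernel of course uses atomic counts under cifs_file_list_lock):

#include <stdio.h>
#include <stdlib.h>

struct file_handle {
	int refcount;
	int netfid;
};

static void handle_destroy(struct file_handle *fh)
{
	/* final put: close on the server, unlink from lists, free */
	printf("closing netfid %d on the server\n", fh->netfid);
	free(fh);
}

static void handle_put(struct file_handle *fh)
{
	if (--fh->refcount == 0)
		handle_destroy(fh);
}

int main(void)
{
	struct file_handle *fh = malloc(sizeof(*fh));

	if (fh == NULL)
		return 1;
	fh->refcount = 2;	/* ->private_data plus, say, a writer */
	fh->netfid = 42;

	handle_put(fh);		/* cifs_close(): just drop a reference */
	handle_put(fh);		/* last user triggers the real close */
	return 0;
}
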
644int cifs_closedir(struct inode *inode, struct file *file) 655int cifs_closedir(struct inode *inode, struct file *file)
@@ -653,25 +664,21 @@ int cifs_closedir(struct inode *inode, struct file *file)
653 xid = GetXid(); 664 xid = GetXid();
654 665
655 if (pCFileStruct) { 666 if (pCFileStruct) {
656 struct cifsTconInfo *pTcon; 667 struct cifsTconInfo *pTcon = tlink_tcon(pCFileStruct->tlink);
657 struct cifs_sb_info *cifs_sb =
658 CIFS_SB(file->f_path.dentry->d_sb);
659
660 pTcon = cifs_sb->tcon;
661 668
662 cFYI(1, "Freeing private data in close dir"); 669 cFYI(1, "Freeing private data in close dir");
663 write_lock(&GlobalSMBSeslock); 670 spin_lock(&cifs_file_list_lock);
664 if (!pCFileStruct->srch_inf.endOfSearch && 671 if (!pCFileStruct->srch_inf.endOfSearch &&
665 !pCFileStruct->invalidHandle) { 672 !pCFileStruct->invalidHandle) {
666 pCFileStruct->invalidHandle = true; 673 pCFileStruct->invalidHandle = true;
667 write_unlock(&GlobalSMBSeslock); 674 spin_unlock(&cifs_file_list_lock);
668 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); 675 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
669 cFYI(1, "Closing uncompleted readdir with rc %d", 676 cFYI(1, "Closing uncompleted readdir with rc %d",
670 rc); 677 rc);
671 /* not much we can do if it fails anyway, ignore rc */ 678 /* not much we can do if it fails anyway, ignore rc */
672 rc = 0; 679 rc = 0;
673 } else 680 } else
674 write_unlock(&GlobalSMBSeslock); 681 spin_unlock(&cifs_file_list_lock);
675 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; 682 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
676 if (ptmp) { 683 if (ptmp) {
677 cFYI(1, "closedir free smb buf in srch struct"); 684 cFYI(1, "closedir free smb buf in srch struct");
@@ -681,6 +688,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
681 else 688 else
682 cifs_buf_release(ptmp); 689 cifs_buf_release(ptmp);
683 } 690 }
691 cifs_put_tlink(pCFileStruct->tlink);
684 kfree(file->private_data); 692 kfree(file->private_data);
685 file->private_data = NULL; 693 file->private_data = NULL;
686 } 694 }
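
The closedir hunk shows the locking discipline behind the GlobalSMBSeslock-to-cifs_file_list_lock conversion: mark the handle invalid while the spinlock is held, then drop the lock before CIFSFindClose() goes on the wire, because blocking while holding a spinlock is not allowed. A user-space sketch of that shape with a POSIX spinlock (all names are stand-ins):

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t file_list_lock;
static int invalid_handle;

static void slow_network_close(void)
{
	/* stands in for the blocking FindClose round trip */
}

static void close_search_handle(void)
{
	pthread_spin_lock(&file_list_lock);
	if (!invalid_handle) {
		invalid_handle = 1;
		pthread_spin_unlock(&file_list_lock);
		slow_network_close();	/* safe: no spinlock held */
	} else {
		pthread_spin_unlock(&file_list_lock);
	}
}

int main(void)
{
	pthread_spin_init(&file_list_lock, PTHREAD_PROCESS_PRIVATE);
	close_search_handle();
	puts(invalid_handle ? "handle invalidated" : "already invalid");
	return 0;
}
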
@@ -767,7 +775,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
767 cFYI(1, "Unknown type of lock"); 775 cFYI(1, "Unknown type of lock");
768 776
769 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 777 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
770 tcon = cifs_sb->tcon; 778 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
771 779
772 if (file->private_data == NULL) { 780 if (file->private_data == NULL) {
773 rc = -EBADF; 781 rc = -EBADF;
@@ -960,14 +968,14 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
960 968
961 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 969 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
962 970
963 pTcon = cifs_sb->tcon;
964
965 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size, 971 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
966 *poffset, file->f_path.dentry->d_name.name); */ 972 *poffset, file->f_path.dentry->d_name.name); */
967 973
968 if (file->private_data == NULL) 974 if (file->private_data == NULL)
969 return -EBADF; 975 return -EBADF;
976
970 open_file = file->private_data; 977 open_file = file->private_data;
978 pTcon = tlink_tcon(open_file->tlink);
971 979
972 rc = generic_write_checks(file, poffset, &write_size, 0); 980 rc = generic_write_checks(file, poffset, &write_size, 0);
973 if (rc) 981 if (rc)
@@ -988,19 +996,12 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
988 we blocked so return what we managed to write */ 996 we blocked so return what we managed to write */
989 return total_written; 997 return total_written;
990 } 998 }
991 if (open_file->closePend) {
992 FreeXid(xid);
993 if (total_written)
994 return total_written;
995 else
996 return -EBADF;
997 }
998 if (open_file->invalidHandle) { 999 if (open_file->invalidHandle) {
999 /* we could deadlock if we called 1000 /* we could deadlock if we called
1000 filemap_fdatawait from here so tell 1001 filemap_fdatawait from here so tell
1001 reopen_file not to flush data to server 1002 reopen_file not to flush data to server
1002 now */ 1003 now */
1003 rc = cifs_reopen_file(file, false); 1004 rc = cifs_reopen_file(open_file, false);
1004 if (rc != 0) 1005 if (rc != 0)
1005 break; 1006 break;
1006 } 1007 }
@@ -1048,8 +1049,9 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1048 return total_written; 1049 return total_written;
1049} 1050}
1050 1051
1051static ssize_t cifs_write(struct file *file, const char *write_data, 1052static ssize_t cifs_write(struct cifsFileInfo *open_file,
1052 size_t write_size, loff_t *poffset) 1053 const char *write_data, size_t write_size,
1054 loff_t *poffset)
1053{ 1055{
1054 int rc = 0; 1056 int rc = 0;
1055 unsigned int bytes_written = 0; 1057 unsigned int bytes_written = 0;
@@ -1057,19 +1059,15 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1057 struct cifs_sb_info *cifs_sb; 1059 struct cifs_sb_info *cifs_sb;
1058 struct cifsTconInfo *pTcon; 1060 struct cifsTconInfo *pTcon;
1059 int xid, long_op; 1061 int xid, long_op;
1060 struct cifsFileInfo *open_file; 1062 struct dentry *dentry = open_file->dentry;
1061 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); 1063 struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
1062 1064
1063 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1065 cifs_sb = CIFS_SB(dentry->d_sb);
1064
1065 pTcon = cifs_sb->tcon;
1066 1066
1067 cFYI(1, "write %zd bytes to offset %lld of %s", write_size, 1067 cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
1068 *poffset, file->f_path.dentry->d_name.name); 1068 *poffset, dentry->d_name.name);
1069 1069
1070 if (file->private_data == NULL) 1070 pTcon = tlink_tcon(open_file->tlink);
1071 return -EBADF;
1072 open_file = file->private_data;
1073 1071
1074 xid = GetXid(); 1072 xid = GetXid();
1075 1073
@@ -1078,28 +1076,12 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1078 total_written += bytes_written) { 1076 total_written += bytes_written) {
1079 rc = -EAGAIN; 1077 rc = -EAGAIN;
1080 while (rc == -EAGAIN) { 1078 while (rc == -EAGAIN) {
1081 if (file->private_data == NULL) {
1082 /* file has been closed on us */
1083 FreeXid(xid);
1084 /* if we have gotten here we have written some data
1085 and blocked, and the file has been freed on us
1086 while we blocked so return what we managed to
1087 write */
1088 return total_written;
1089 }
1090 if (open_file->closePend) {
1091 FreeXid(xid);
1092 if (total_written)
1093 return total_written;
1094 else
1095 return -EBADF;
1096 }
1097 if (open_file->invalidHandle) { 1079 if (open_file->invalidHandle) {
1098 /* we could deadlock if we called 1080 /* we could deadlock if we called
1099 filemap_fdatawait from here so tell 1081 filemap_fdatawait from here so tell
1100 reopen_file not to flush data to 1082 reopen_file not to flush data to
1101 server now */ 1083 server now */
1102 rc = cifs_reopen_file(file, false); 1084 rc = cifs_reopen_file(open_file, false);
1103 if (rc != 0) 1085 if (rc != 0)
1104 break; 1086 break;
1105 } 1087 }
@@ -1146,43 +1128,41 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1146 1128
1147 cifs_stats_bytes_written(pTcon, total_written); 1129 cifs_stats_bytes_written(pTcon, total_written);
1148 1130
1149 /* since the write may have blocked check these pointers again */ 1131 if (total_written > 0) {
1150 if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) { 1132 spin_lock(&dentry->d_inode->i_lock);
1151/*BB We could make this contingent on superblock ATIME flag too */ 1133 if (*poffset > dentry->d_inode->i_size)
1152/* file->f_path.dentry->d_inode->i_ctime = 1134 i_size_write(dentry->d_inode, *poffset);
1153 file->f_path.dentry->d_inode->i_mtime = CURRENT_TIME;*/ 1135 spin_unlock(&dentry->d_inode->i_lock);
1154 if (total_written > 0) {
1155 spin_lock(&file->f_path.dentry->d_inode->i_lock);
1156 if (*poffset > file->f_path.dentry->d_inode->i_size)
1157 i_size_write(file->f_path.dentry->d_inode,
1158 *poffset);
1159 spin_unlock(&file->f_path.dentry->d_inode->i_lock);
1160 }
1161 mark_inode_dirty_sync(file->f_path.dentry->d_inode);
1162 } 1136 }
1137 mark_inode_dirty_sync(dentry->d_inode);
1163 FreeXid(xid); 1138 FreeXid(xid);
1164 return total_written; 1139 return total_written;
1165} 1140}
1166 1141
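
With the dentry reachable straight from the cifsFileInfo, the end-of-write bookkeeping reduces to one invariant: if anything was written and the final offset passed the cached size, grow i_size under the inode's lock so racing writers can never move it backwards. A compact sketch, with a pthread mutex standing in for i_lock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_lock = PTHREAD_MUTEX_INITIALIZER;
static long long i_size;

static void update_eof(long long end_offset, long long written)
{
	if (written <= 0)
		return;
	pthread_mutex_lock(&i_lock);
	if (end_offset > i_size)
		i_size = end_offset;	/* only ever grows here */
	pthread_mutex_unlock(&i_lock);
}

int main(void)
{
	update_eof(4096, 4096);	/* extending write */
	update_eof(1024, 512);	/* interior write: size untouched */
	printf("i_size=%lld\n", i_size);
	return 0;
}
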
1167#ifdef CONFIG_CIFS_EXPERIMENTAL 1142#ifdef CONFIG_CIFS_EXPERIMENTAL
1168struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode) 1143struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
1144 bool fsuid_only)
1169{ 1145{
1170 struct cifsFileInfo *open_file = NULL; 1146 struct cifsFileInfo *open_file = NULL;
1147 struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1148
1149 /* only filter by fsuid on multiuser mounts */
1150 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1151 fsuid_only = false;
1171 1152
1172 read_lock(&GlobalSMBSeslock); 1153 spin_lock(&cifs_file_list_lock);
1173 /* we could simply get the first_list_entry since write-only entries 1154 /* we could simply get the first_list_entry since write-only entries
1174 are always at the end of the list but since the first entry might 1155 are always at the end of the list but since the first entry might
1175 have a close pending, we go through the whole list */ 1156 have a close pending, we go through the whole list */
1176 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1157 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1177 if (open_file->closePend) 1158 if (fsuid_only && open_file->uid != current_fsuid())
1178 continue; 1159 continue;
1179 if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) || 1160 if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
1180 (open_file->pfile->f_flags & O_RDONLY))) {
1181 if (!open_file->invalidHandle) { 1161 if (!open_file->invalidHandle) {
1182 /* found a good file */ 1162 /* found a good file */
1183 /* lock it so it will not be closed on us */ 1163 /* lock it so it will not be closed on us */
1184 cifsFileInfo_get(open_file); 1164 cifsFileInfo_get(open_file);
1185 read_unlock(&GlobalSMBSeslock); 1165 spin_unlock(&cifs_file_list_lock);
1186 return open_file; 1166 return open_file;
1187 } /* else might as well continue, and look for 1167 } /* else might as well continue, and look for
1188 another, or simply have the caller reopen it 1168 another, or simply have the caller reopen it
@@ -1190,14 +1170,16 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
1190 } else /* write only file */ 1170 } else /* write only file */
1191 break; /* write only files are last so must be done */ 1171 break; /* write only files are last so must be done */
1192 } 1172 }
1193 read_unlock(&GlobalSMBSeslock); 1173 spin_unlock(&cifs_file_list_lock);
1194 return NULL; 1174 return NULL;
1195} 1175}
1196#endif 1176#endif
1197 1177
1198struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) 1178struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
1179 bool fsuid_only)
1199{ 1180{
1200 struct cifsFileInfo *open_file; 1181 struct cifsFileInfo *open_file;
1182 struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1201 bool any_available = false; 1183 bool any_available = false;
1202 int rc; 1184 int rc;
1203 1185
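
Both handle-search helpers swap the raw O_RDWR/O_RDONLY/O_WRONLY flag tests for OPEN_FMODE(f_flags), visible in the FMODE_READ and FMODE_WRITE checks in these hunks. The kernel macro leans on O_RDONLY, O_WRONLY and O_RDWR being 0, 1 and 2, so adding one yields a read/write bit pair directly; a user-space re-creation of the mapping (FMODE values defined locally for the demo):

#include <fcntl.h>
#include <stdio.h>

#define FMODE_READ	0x1
#define FMODE_WRITE	0x2
/* same trick as the kernel macro: flag + 1 turns the 0/1/2
   access mode into a read/write bit pair */
#define OPEN_FMODE(flag) (((flag) + 1) & O_ACCMODE)

int main(void)
{
	int flags[3] = { O_RDONLY, O_WRONLY, O_RDWR };
	int i;

	for (i = 0; i < 3; i++)
		printf("f_flags %d -> fmode %s%s\n", flags[i],
		       (OPEN_FMODE(flags[i]) & FMODE_READ) ? "R" : "",
		       (OPEN_FMODE(flags[i]) & FMODE_WRITE) ? "W" : "");
	return 0;
}

Running it prints R, W and RW for the three access modes.
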
@@ -1211,53 +1193,39 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1211 return NULL; 1193 return NULL;
1212 } 1194 }
1213 1195
1214 read_lock(&GlobalSMBSeslock); 1196 /* only filter by fsuid on multiuser mounts */
1197 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1198 fsuid_only = false;
1199
1200 spin_lock(&cifs_file_list_lock);
1215refind_writable: 1201refind_writable:
1216 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1202 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1217 if (open_file->closePend || 1203 if (!any_available && open_file->pid != current->tgid)
1218 (!any_available && open_file->pid != current->tgid))
1219 continue; 1204 continue;
1220 1205 if (fsuid_only && open_file->uid != current_fsuid())
1221 if (open_file->pfile && 1206 continue;
1222 ((open_file->pfile->f_flags & O_RDWR) || 1207 if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
1223 (open_file->pfile->f_flags & O_WRONLY))) {
1224 cifsFileInfo_get(open_file); 1208 cifsFileInfo_get(open_file);
1225 1209
1226 if (!open_file->invalidHandle) { 1210 if (!open_file->invalidHandle) {
1227 /* found a good writable file */ 1211 /* found a good writable file */
1228 read_unlock(&GlobalSMBSeslock); 1212 spin_unlock(&cifs_file_list_lock);
1229 return open_file; 1213 return open_file;
1230 } 1214 }
1231 1215
1232 read_unlock(&GlobalSMBSeslock); 1216 spin_unlock(&cifs_file_list_lock);
1217
1233 /* Had to unlock since following call can block */ 1218 /* Had to unlock since following call can block */
1234 rc = cifs_reopen_file(open_file->pfile, false); 1219 rc = cifs_reopen_file(open_file, false);
1235 if (!rc) { 1220 if (!rc)
1236 if (!open_file->closePend) 1221 return open_file;
1237 return open_file;
1238 else { /* start over in case this was deleted */
1239 /* since the list could be modified */
1240 read_lock(&GlobalSMBSeslock);
1241 cifsFileInfo_put(open_file);
1242 goto refind_writable;
1243 }
1244 }
1245 1222
1246 /* if it fails, try another handle if possible - 1223 /* if it fails, try another handle if possible */
1247 (we can not do this if closePending since
1248 loop could be modified - in which case we
1249 have to start at the beginning of the list
1250 again. Note that it would be bad
1251 to hold up writepages here (rather than
1252 in caller) with continuous retries */
1253 cFYI(1, "wp failed on reopen file"); 1224 cFYI(1, "wp failed on reopen file");
1254 read_lock(&GlobalSMBSeslock);
1255 /* can not use this handle, no write
1256 pending on this one after all */
1257 cifsFileInfo_put(open_file); 1225 cifsFileInfo_put(open_file);
1258 1226
1259 if (open_file->closePend) /* list could have changed */ 1227 spin_lock(&cifs_file_list_lock);
1260 goto refind_writable; 1228
1261 /* else we simply continue to the next entry. Thus 1229 /* else we simply continue to the next entry. Thus
1262 we do not loop on reopen errors. If we 1230 we do not loop on reopen errors. If we
1263 can not reopen the file, for example if we 1231 can not reopen the file, for example if we
@@ -1272,7 +1240,7 @@ refind_writable:
1272 any_available = true; 1240 any_available = true;
1273 goto refind_writable; 1241 goto refind_writable;
1274 } 1242 }
1275 read_unlock(&GlobalSMBSeslock); 1243 spin_unlock(&cifs_file_list_lock);
1276 return NULL; 1244 return NULL;
1277} 1245}
1278 1246
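
The new fsuid_only argument only matters on multiuser mounts, where cached handles can carry another user's credentials; there the search must match the caller's fsuid before reusing a handle. A sketch of the filter, with getuid() standing in for current_fsuid() and an illustrative flag value:

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

#define CIFS_MOUNT_MULTIUSER	0x1	/* illustrative value */

static int handle_usable(unsigned int mount_flags, uid_t handle_uid,
			 int fsuid_only)
{
	/* only filter by fsuid on multiuser mounts */
	if (!(mount_flags & CIFS_MOUNT_MULTIUSER))
		fsuid_only = 0;
	return !fsuid_only || handle_uid == getuid();
}

int main(void)
{
	printf("%d\n", handle_usable(0, 12345, 1));		/* 1 */
	printf("%d\n", handle_usable(CIFS_MOUNT_MULTIUSER,
				     getuid(), 1));		/* 1 */
	printf("%d\n", handle_usable(CIFS_MOUNT_MULTIUSER,
				     getuid() + 1, 1));		/* 0 */
	return 0;
}
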
@@ -1284,7 +1252,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1284 int rc = -EFAULT; 1252 int rc = -EFAULT;
1285 int bytes_written = 0; 1253 int bytes_written = 0;
1286 struct cifs_sb_info *cifs_sb; 1254 struct cifs_sb_info *cifs_sb;
1287 struct cifsTconInfo *pTcon;
1288 struct inode *inode; 1255 struct inode *inode;
1289 struct cifsFileInfo *open_file; 1256 struct cifsFileInfo *open_file;
1290 1257
@@ -1293,7 +1260,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1293 1260
1294 inode = page->mapping->host; 1261 inode = page->mapping->host;
1295 cifs_sb = CIFS_SB(inode->i_sb); 1262 cifs_sb = CIFS_SB(inode->i_sb);
1296 pTcon = cifs_sb->tcon;
1297 1263
1298 offset += (loff_t)from; 1264 offset += (loff_t)from;
1299 write_data = kmap(page); 1265 write_data = kmap(page);
@@ -1314,10 +1280,10 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1314 if (mapping->host->i_size - offset < (loff_t)to) 1280 if (mapping->host->i_size - offset < (loff_t)to)
1315 to = (unsigned)(mapping->host->i_size - offset); 1281 to = (unsigned)(mapping->host->i_size - offset);
1316 1282
1317 open_file = find_writable_file(CIFS_I(mapping->host)); 1283 open_file = find_writable_file(CIFS_I(mapping->host), false);
1318 if (open_file) { 1284 if (open_file) {
1319 bytes_written = cifs_write(open_file->pfile, write_data, 1285 bytes_written = cifs_write(open_file, write_data,
1320 to-from, &offset); 1286 to - from, &offset);
1321 cifsFileInfo_put(open_file); 1287 cifsFileInfo_put(open_file);
1322 /* Does mm or vfs already set times? */ 1288 /* Does mm or vfs already set times? */
1323 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); 1289 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
@@ -1337,7 +1303,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1337static int cifs_writepages(struct address_space *mapping, 1303static int cifs_writepages(struct address_space *mapping,
1338 struct writeback_control *wbc) 1304 struct writeback_control *wbc)
1339{ 1305{
1340 struct backing_dev_info *bdi = mapping->backing_dev_info;
1341 unsigned int bytes_to_write; 1306 unsigned int bytes_to_write;
1342 unsigned int bytes_written; 1307 unsigned int bytes_written;
1343 struct cifs_sb_info *cifs_sb; 1308 struct cifs_sb_info *cifs_sb;
@@ -1352,6 +1317,7 @@ static int cifs_writepages(struct address_space *mapping,
1352 int nr_pages; 1317 int nr_pages;
1353 __u64 offset = 0; 1318 __u64 offset = 0;
1354 struct cifsFileInfo *open_file; 1319 struct cifsFileInfo *open_file;
1320 struct cifsTconInfo *tcon;
1355 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host); 1321 struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
1356 struct page *page; 1322 struct page *page;
1357 struct pagevec pvec; 1323 struct pagevec pvec;
@@ -1368,27 +1334,29 @@ static int cifs_writepages(struct address_space *mapping,
1368 if (cifs_sb->wsize < PAGE_CACHE_SIZE) 1334 if (cifs_sb->wsize < PAGE_CACHE_SIZE)
1369 return generic_writepages(mapping, wbc); 1335 return generic_writepages(mapping, wbc);
1370 1336
1371 if ((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server))
1372 if (cifs_sb->tcon->ses->server->secMode &
1373 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
1374 if (!experimEnabled)
1375 return generic_writepages(mapping, wbc);
1376
1377 iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL); 1337 iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
1378 if (iov == NULL) 1338 if (iov == NULL)
1379 return generic_writepages(mapping, wbc); 1339 return generic_writepages(mapping, wbc);
1380 1340
1381
1382 /* 1341 /*
1383 * BB: Is this meaningful for a non-block-device file system? 1342 * if there's no open file, then this is likely to fail too,
1384 * If it is, we should test it again after we do I/O 1343 * but it'll at least handle the return. Maybe it should be
1344 * a BUG() instead?
1385 */ 1345 */
1386 if (wbc->nonblocking && bdi_write_congested(bdi)) { 1346 open_file = find_writable_file(CIFS_I(mapping->host), false);
1387 wbc->encountered_congestion = 1; 1347 if (!open_file) {
1388 kfree(iov); 1348 kfree(iov);
1389 return 0; 1349 return generic_writepages(mapping, wbc);
1390 } 1350 }
1391 1351
1352 tcon = tlink_tcon(open_file->tlink);
1353 if (!experimEnabled && tcon->ses->server->secMode &
1354 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1355 cifsFileInfo_put(open_file);
1356 return generic_writepages(mapping, wbc);
1357 }
1358 cifsFileInfo_put(open_file);
1359
1392 xid = GetXid(); 1360 xid = GetXid();
1393 1361
1394 pagevec_init(&pvec, 0); 1362 pagevec_init(&pvec, 0);
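
Condensed, the reworked entry logic of cifs_writepages() is a short decision ladder: no writable handle at all, or signing in effect without the experimental code enabled, means punting to generic_writepages(); otherwise the wsize-batched path proceeds. A stub sketch of that ladder (names and flag values are illustrative stand-ins, not the kernel's definitions):

#include <stdio.h>

#define SIGN_REQUIRED	0x1	/* illustrative */
#define SIGN_ENABLED	0x2	/* illustrative */

static int experim_enabled;

/* returns 1 for the batched-write path, 0 for generic_writepages() */
static int use_batched_path(int have_writable_handle, int sec_mode)
{
	if (!have_writable_handle)
		return 0;
	if (!experim_enabled &&
	    (sec_mode & (SIGN_REQUIRED | SIGN_ENABLED)))
		return 0;
	return 1;
}

int main(void)
{
	printf("%d %d %d\n",
	       use_batched_path(0, 0),			/* 0 */
	       use_batched_path(1, SIGN_REQUIRED),	/* 0 */
	       use_batched_path(1, 0));			/* 1 */
	return 0;
}
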
@@ -1492,38 +1460,34 @@ retry:
1492 break; 1460 break;
1493 } 1461 }
1494 if (n_iov) { 1462 if (n_iov) {
1495 /* Search for a writable handle every time we call 1463 open_file = find_writable_file(CIFS_I(mapping->host),
1496 * CIFSSMBWrite2. We can't rely on the last handle 1464 false);
1497 * we used to still be valid
1498 */
1499 open_file = find_writable_file(CIFS_I(mapping->host));
1500 if (!open_file) { 1465 if (!open_file) {
1501 cERROR(1, "No writable handles for inode"); 1466 cERROR(1, "No writable handles for inode");
1502 rc = -EBADF; 1467 rc = -EBADF;
1503 } else { 1468 } else {
1504 long_op = cifs_write_timeout(cifsi, offset); 1469 long_op = cifs_write_timeout(cifsi, offset);
1505 rc = CIFSSMBWrite2(xid, cifs_sb->tcon, 1470 rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
1506 open_file->netfid,
1507 bytes_to_write, offset, 1471 bytes_to_write, offset,
1508 &bytes_written, iov, n_iov, 1472 &bytes_written, iov, n_iov,
1509 long_op); 1473 long_op);
1510 cifsFileInfo_put(open_file); 1474 cifsFileInfo_put(open_file);
1511 cifs_update_eof(cifsi, offset, bytes_written); 1475 cifs_update_eof(cifsi, offset, bytes_written);
1476 }
1512 1477
1513 if (rc || bytes_written < bytes_to_write) { 1478 if (rc || bytes_written < bytes_to_write) {
1514 cERROR(1, "Write2 ret %d, wrote %d", 1479 cERROR(1, "Write2 ret %d, wrote %d",
1515 rc, bytes_written); 1480 rc, bytes_written);
1516 /* BB what if continued retry is 1481 /* BB what if continued retry is
1517 requested via mount flags? */ 1482 requested via mount flags? */
1518 if (rc == -ENOSPC) 1483 if (rc == -ENOSPC)
1519 set_bit(AS_ENOSPC, &mapping->flags); 1484 set_bit(AS_ENOSPC, &mapping->flags);
1520 else 1485 else
1521 set_bit(AS_EIO, &mapping->flags); 1486 set_bit(AS_EIO, &mapping->flags);
1522 } else { 1487 } else {
1523 cifs_stats_bytes_written(cifs_sb->tcon, 1488 cifs_stats_bytes_written(tcon, bytes_written);
1524 bytes_written);
1525 }
1526 } 1489 }
1490
1527 for (i = 0; i < n_iov; i++) { 1491 for (i = 0; i < n_iov; i++) {
1528 page = pvec.pages[first + i]; 1492 page = pvec.pages[first + i];
1529 /* Should we also set page error on 1493 /* Should we also set page error on
@@ -1624,7 +1588,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1624 /* BB check if anything else missing out of ppw 1588 /* BB check if anything else missing out of ppw
1625 such as updating last write time */ 1589 such as updating last write time */
1626 page_data = kmap(page); 1590 page_data = kmap(page);
1627 rc = cifs_write(file, page_data + offset, copied, &pos); 1591 rc = cifs_write(file->private_data, page_data + offset,
1592 copied, &pos);
1628 /* if (rc < 0) should we set writebehind rc? */ 1593 /* if (rc < 0) should we set writebehind rc? */
1629 kunmap(page); 1594 kunmap(page);
1630 1595
@@ -1665,7 +1630,7 @@ int cifs_fsync(struct file *file, int datasync)
1665 if (rc == 0) { 1630 if (rc == 0) {
1666 rc = CIFS_I(inode)->write_behind_rc; 1631 rc = CIFS_I(inode)->write_behind_rc;
1667 CIFS_I(inode)->write_behind_rc = 0; 1632 CIFS_I(inode)->write_behind_rc = 0;
1668 tcon = CIFS_SB(inode->i_sb)->tcon; 1633 tcon = tlink_tcon(smbfile->tlink);
1669 if (!rc && tcon && smbfile && 1634 if (!rc && tcon && smbfile &&
1670 !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) 1635 !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
1671 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); 1636 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
@@ -1750,7 +1715,6 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1750 1715
1751 xid = GetXid(); 1716 xid = GetXid();
1752 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1717 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1753 pTcon = cifs_sb->tcon;
1754 1718
1755 if (file->private_data == NULL) { 1719 if (file->private_data == NULL) {
1756 rc = -EBADF; 1720 rc = -EBADF;
@@ -1758,6 +1722,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1758 return rc; 1722 return rc;
1759 } 1723 }
1760 open_file = file->private_data; 1724 open_file = file->private_data;
1725 pTcon = tlink_tcon(open_file->tlink);
1761 1726
1762 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1727 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1763 cFYI(1, "attempting read on write only file instance"); 1728 cFYI(1, "attempting read on write only file instance");
@@ -1771,9 +1736,8 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1771 smb_read_data = NULL; 1736 smb_read_data = NULL;
1772 while (rc == -EAGAIN) { 1737 while (rc == -EAGAIN) {
1773 int buf_type = CIFS_NO_BUFFER; 1738 int buf_type = CIFS_NO_BUFFER;
1774 if ((open_file->invalidHandle) && 1739 if (open_file->invalidHandle) {
1775 (!open_file->closePend)) { 1740 rc = cifs_reopen_file(open_file, true);
1776 rc = cifs_reopen_file(file, true);
1777 if (rc != 0) 1741 if (rc != 0)
1778 break; 1742 break;
1779 } 1743 }
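
With closePend gone, every read path shares the same slimmed retry loop: a stale handle means reopen and retry on -EAGAIN, and a failed reopen breaks out with the error. A toy model of the loop, with smb_read()/reopen() as stand-ins for the real calls:

#include <errno.h>
#include <stdio.h>

static int handle_valid;

static int smb_read(void)
{
	return handle_valid ? 42 : -EAGAIN;
}

static int reopen(void)
{
	handle_valid = 1;
	return 0;
}

int main(void)
{
	int rc = -EAGAIN;

	while (rc == -EAGAIN) {
		if (!handle_valid && reopen() != 0)
			break;	/* reopen failed; give up */
		rc = smb_read();
	}
	printf("read rc=%d\n", rc);
	return 0;
}
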
@@ -1831,7 +1795,6 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1831 1795
1832 xid = GetXid(); 1796 xid = GetXid();
1833 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1797 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1834 pTcon = cifs_sb->tcon;
1835 1798
1836 if (file->private_data == NULL) { 1799 if (file->private_data == NULL) {
1837 rc = -EBADF; 1800 rc = -EBADF;
@@ -1839,6 +1802,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1839 return rc; 1802 return rc;
1840 } 1803 }
1841 open_file = file->private_data; 1804 open_file = file->private_data;
1805 pTcon = tlink_tcon(open_file->tlink);
1842 1806
1843 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1807 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1844 cFYI(1, "attempting read on write only file instance"); 1808 cFYI(1, "attempting read on write only file instance");
@@ -1857,9 +1821,8 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1857 } 1821 }
1858 rc = -EAGAIN; 1822 rc = -EAGAIN;
1859 while (rc == -EAGAIN) { 1823 while (rc == -EAGAIN) {
1860 if ((open_file->invalidHandle) && 1824 if (open_file->invalidHandle) {
1861 (!open_file->closePend)) { 1825 rc = cifs_reopen_file(open_file, true);
1862 rc = cifs_reopen_file(file, true);
1863 if (rc != 0) 1826 if (rc != 0)
1864 break; 1827 break;
1865 } 1828 }
@@ -1974,7 +1937,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1974 } 1937 }
1975 open_file = file->private_data; 1938 open_file = file->private_data;
1976 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1939 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1977 pTcon = cifs_sb->tcon; 1940 pTcon = tlink_tcon(open_file->tlink);
1978 1941
1979 /* 1942 /*
1980 * Reads as many pages as possible from fscache. Returns -ENOBUFS 1943 * Reads as many pages as possible from fscache. Returns -ENOBUFS
@@ -2022,9 +1985,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2022 read_size, contig_pages); 1985 read_size, contig_pages);
2023 rc = -EAGAIN; 1986 rc = -EAGAIN;
2024 while (rc == -EAGAIN) { 1987 while (rc == -EAGAIN) {
2025 if ((open_file->invalidHandle) && 1988 if (open_file->invalidHandle) {
2026 (!open_file->closePend)) { 1989 rc = cifs_reopen_file(open_file, true);
2027 rc = cifs_reopen_file(file, true);
2028 if (rc != 0) 1990 if (rc != 0)
2029 break; 1991 break;
2030 } 1992 }
@@ -2173,18 +2135,14 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
2173{ 2135{
2174 struct cifsFileInfo *open_file; 2136 struct cifsFileInfo *open_file;
2175 2137
2176 read_lock(&GlobalSMBSeslock); 2138 spin_lock(&cifs_file_list_lock);
2177 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 2139 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
2178 if (open_file->closePend) 2140 if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
2179 continue; 2141 spin_unlock(&cifs_file_list_lock);
2180 if (open_file->pfile &&
2181 ((open_file->pfile->f_flags & O_RDWR) ||
2182 (open_file->pfile->f_flags & O_WRONLY))) {
2183 read_unlock(&GlobalSMBSeslock);
2184 return 1; 2142 return 1;
2185 } 2143 }
2186 } 2144 }
2187 read_unlock(&GlobalSMBSeslock); 2145 spin_unlock(&cifs_file_list_lock);
2188 return 0; 2146 return 0;
2189} 2147}
2190 2148
@@ -2310,9 +2268,8 @@ void cifs_oplock_break(struct work_struct *work)
2310{ 2268{
2311 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, 2269 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2312 oplock_break); 2270 oplock_break);
2313 struct inode *inode = cfile->pInode; 2271 struct inode *inode = cfile->dentry->d_inode;
2314 struct cifsInodeInfo *cinode = CIFS_I(inode); 2272 struct cifsInodeInfo *cinode = CIFS_I(inode);
2315 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb);
2316 int rc, waitrc = 0; 2273 int rc, waitrc = 0;
2317 2274
2318 if (inode && S_ISREG(inode->i_mode)) { 2275 if (inode && S_ISREG(inode->i_mode)) {
@@ -2338,9 +2295,9 @@ void cifs_oplock_break(struct work_struct *work)
2338 * not bother sending an oplock release if session to server still is 2295 * not bother sending an oplock release if session to server still is
2339 * disconnected since oplock already released by the server 2296 * disconnected since oplock already released by the server
2340 */ 2297 */
2341 if (!cfile->closePend && !cfile->oplock_break_cancelled) { 2298 if (!cfile->oplock_break_cancelled) {
2342 rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0, 2299 rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
2343 LOCKING_ANDX_OPLOCK_RELEASE, false); 2300 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
2344 cFYI(1, "Oplock release rc = %d", rc); 2301 cFYI(1, "Oplock release rc = %d", rc);
2345 } 2302 }
2346 2303
@@ -2349,22 +2306,22 @@ void cifs_oplock_break(struct work_struct *work)
2349 * finished grabbing reference for us. Make sure it's done by 2306 * finished grabbing reference for us. Make sure it's done by
2350 * waiting for GlobalSMBSeslock. 2307 * waiting for GlobalSMBSeslock.
2351 */ 2308 */
2352 write_lock(&GlobalSMBSeslock); 2309 spin_lock(&cifs_file_list_lock);
2353 write_unlock(&GlobalSMBSeslock); 2310 spin_unlock(&cifs_file_list_lock);
2354 2311
2355 cifs_oplock_break_put(cfile); 2312 cifs_oplock_break_put(cfile);
2356} 2313}
2357 2314
2358void cifs_oplock_break_get(struct cifsFileInfo *cfile) 2315void cifs_oplock_break_get(struct cifsFileInfo *cfile)
2359{ 2316{
2360 mntget(cfile->mnt); 2317 cifs_sb_active(cfile->dentry->d_sb);
2361 cifsFileInfo_get(cfile); 2318 cifsFileInfo_get(cfile);
2362} 2319}
2363 2320
2364void cifs_oplock_break_put(struct cifsFileInfo *cfile) 2321void cifs_oplock_break_put(struct cifsFileInfo *cfile)
2365{ 2322{
2366 mntput(cfile->mnt);
2367 cifsFileInfo_put(cfile); 2323 cifsFileInfo_put(cfile);
2324 cifs_sb_deactive(cfile->dentry->d_sb);
2368} 2325}
2369 2326
2370const struct address_space_operations cifs_addr_ops = { 2327const struct address_space_operations cifs_addr_ops = {
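
The get/put pair above trades mntget()/mntput() for superblock pinning: the queued oplock-break work can run after the last file reference is gone, so something longer-lived than the file, previously the vfsmount and now the superblock via cifs_sb_active()/cifs_sb_deactive(), must stay pinned until the work completes. A threaded sketch of pin-before-queue, unpin-in-worker (counters and names are stand-ins):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int sb_active_count = 1;		/* the mount itself */

static void sb_active(void)
{
	pthread_mutex_lock(&lock);
	sb_active_count++;
	pthread_mutex_unlock(&lock);
}

static void sb_deactive(void)
{
	pthread_mutex_lock(&lock);
	if (--sb_active_count == 0)
		puts("superblock torn down");
	pthread_mutex_unlock(&lock);
}

static void *oplock_break_work(void *arg)
{
	(void)arg;
	/* ... flush pages, send the oplock release ... */
	sb_deactive();		/* the "put" half */
	return NULL;
}

int main(void)
{
	pthread_t t;

	sb_active();		/* the "get" half, before queueing */
	pthread_create(&t, NULL, oplock_break_work, NULL);
	pthread_join(t, NULL);
	return 0;
}
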
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 9f3f5c4be161..a2ad94efcfe6 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -62,15 +62,15 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
62{ 62{
63 struct cifsInodeInfo *cifsi = CIFS_I(inode); 63 struct cifsInodeInfo *cifsi = CIFS_I(inode);
64 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 64 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
65 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
65 66
66 if (cifsi->fscache) 67 if (cifsi->fscache)
67 return; 68 return;
68 69
69 cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache, 70 cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
70 &cifs_fscache_inode_object_def, 71 &cifs_fscache_inode_object_def, cifsi);
71 cifsi); 72 cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
72 cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", 73 cifsi->fscache);
73 cifs_sb->tcon->fscache, cifsi->fscache);
74} 74}
75 75
76void cifs_fscache_release_inode_cookie(struct inode *inode) 76void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -117,7 +117,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
117 /* retire the current fscache cache and get a new one */ 117 /* retire the current fscache cache and get a new one */
118 fscache_relinquish_cookie(cifsi->fscache, 1); 118 fscache_relinquish_cookie(cifsi->fscache, 1);
119 119
120 cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache, 120 cifsi->fscache = fscache_acquire_cookie(
121 cifs_sb_master_tcon(cifs_sb)->fscache,
121 &cifs_fscache_inode_object_def, 122 &cifs_fscache_inode_object_def,
122 cifsi); 123 cifsi);
123 cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p", 124 cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 93f77d438d3c..94979309698a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -52,7 +52,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
52 52
53 53
54 /* check if server can support readpages */ 54 /* check if server can support readpages */
55 if (cifs_sb->tcon->ses->server->maxBuf < 55 if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
56 PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE) 56 PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
57 inode->i_data.a_ops = &cifs_addr_ops_smallbuf; 57 inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
58 else 58 else
@@ -288,8 +288,8 @@ int cifs_get_file_info_unix(struct file *filp)
288 struct cifs_fattr fattr; 288 struct cifs_fattr fattr;
289 struct inode *inode = filp->f_path.dentry->d_inode; 289 struct inode *inode = filp->f_path.dentry->d_inode;
290 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 290 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
291 struct cifsTconInfo *tcon = cifs_sb->tcon;
292 struct cifsFileInfo *cfile = filp->private_data; 291 struct cifsFileInfo *cfile = filp->private_data;
292 struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
293 293
294 xid = GetXid(); 294 xid = GetXid();
295 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data); 295 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -313,15 +313,21 @@ int cifs_get_inode_info_unix(struct inode **pinode,
313 FILE_UNIX_BASIC_INFO find_data; 313 FILE_UNIX_BASIC_INFO find_data;
314 struct cifs_fattr fattr; 314 struct cifs_fattr fattr;
315 struct cifsTconInfo *tcon; 315 struct cifsTconInfo *tcon;
316 struct tcon_link *tlink;
316 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 317 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
317 318
318 tcon = cifs_sb->tcon;
319 cFYI(1, "Getting info on %s", full_path); 319 cFYI(1, "Getting info on %s", full_path);
320 320
321 tlink = cifs_sb_tlink(cifs_sb);
322 if (IS_ERR(tlink))
323 return PTR_ERR(tlink);
324 tcon = tlink_tcon(tlink);
325
321 /* could have done a find first instead but this returns more info */ 326 /* could have done a find first instead but this returns more info */
322 rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, 327 rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
323 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 328 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
324 CIFS_MOUNT_MAP_SPECIAL_CHR); 329 CIFS_MOUNT_MAP_SPECIAL_CHR);
330 cifs_put_tlink(tlink);
325 331
326 if (!rc) { 332 if (!rc) {
327 cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); 333 cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
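
This hunk sets the pattern the rest of the series repeats: cifs_sb_tlink() returns either a counted tcon_link or an encoded error pointer, so each path-based operation brackets its server calls with acquire, IS_ERR() check, use, cifs_put_tlink(). A user-space re-creation of the idiom, with ERR_PTR()/IS_ERR() re-implemented locally and get_tlink()/put_tlink() as stand-ins:

#include <stdio.h>

#define MAX_ERRNO 4095
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* errors ride in the top 4095 values of the address space */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct tcon_link { int users; };

static struct tcon_link *get_tlink(int fail)
{
	static struct tcon_link tl;

	if (fail)
		return ERR_PTR(-5 /* stand-in errno */);
	tl.users++;
	return &tl;
}

static void put_tlink(struct tcon_link *tl)
{
	tl->users--;
}

int main(void)
{
	struct tcon_link *tlink = get_tlink(0);

	if (IS_ERR(tlink))
		return (int)-PTR_ERR(tlink);
	/* ... use the tcon behind tlink for the SMB calls ... */
	put_tlink(tlink);
	return 0;
}

The same bracketing shows up below in cifs_sfu_type(), cifs_sfu_mode(), cifs_get_inode_info(), cifs_unlink(), cifs_mkdir() and cifs_rmdir().
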
@@ -332,6 +338,13 @@ int cifs_get_inode_info_unix(struct inode **pinode,
332 return rc; 338 return rc;
333 } 339 }
334 340
341 /* check for Minshall+French symlinks */
342 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
343 int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
344 if (tmprc)
345 cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
346 }
347
335 if (*pinode == NULL) { 348 if (*pinode == NULL) {
336 /* get new inode */ 349 /* get new inode */
337 cifs_fill_uniqueid(sb, &fattr); 350 cifs_fill_uniqueid(sb, &fattr);
@@ -353,7 +366,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
353 int rc; 366 int rc;
354 int oplock = 0; 367 int oplock = 0;
355 __u16 netfid; 368 __u16 netfid;
356 struct cifsTconInfo *pTcon = cifs_sb->tcon; 369 struct tcon_link *tlink;
370 struct cifsTconInfo *tcon;
357 char buf[24]; 371 char buf[24];
358 unsigned int bytes_read; 372 unsigned int bytes_read;
359 char *pbuf; 373 char *pbuf;
@@ -372,7 +386,12 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
372 return -EINVAL; /* EOPNOTSUPP? */ 386 return -EINVAL; /* EOPNOTSUPP? */
373 } 387 }
374 388
375 rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ, 389 tlink = cifs_sb_tlink(cifs_sb);
390 if (IS_ERR(tlink))
391 return PTR_ERR(tlink);
392 tcon = tlink_tcon(tlink);
393
394 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ,
376 CREATE_NOT_DIR, &netfid, &oplock, NULL, 395 CREATE_NOT_DIR, &netfid, &oplock, NULL,
377 cifs_sb->local_nls, 396 cifs_sb->local_nls,
378 cifs_sb->mnt_cifs_flags & 397 cifs_sb->mnt_cifs_flags &
@@ -380,7 +399,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
380 if (rc == 0) { 399 if (rc == 0) {
381 int buf_type = CIFS_NO_BUFFER; 400 int buf_type = CIFS_NO_BUFFER;
382 /* Read header */ 401 /* Read header */
383 rc = CIFSSMBRead(xid, pTcon, netfid, 402 rc = CIFSSMBRead(xid, tcon, netfid,
384 24 /* length */, 0 /* offset */, 403 24 /* length */, 0 /* offset */,
385 &bytes_read, &pbuf, &buf_type); 404 &bytes_read, &pbuf, &buf_type);
386 if ((rc == 0) && (bytes_read >= 8)) { 405 if ((rc == 0) && (bytes_read >= 8)) {
@@ -422,8 +441,9 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
422 fattr->cf_dtype = DT_REG; 441 fattr->cf_dtype = DT_REG;
423 rc = -EOPNOTSUPP; /* or some unknown SFU type */ 442 rc = -EOPNOTSUPP; /* or some unknown SFU type */
424 } 443 }
425 CIFSSMBClose(xid, pTcon, netfid); 444 CIFSSMBClose(xid, tcon, netfid);
426 } 445 }
446 cifs_put_tlink(tlink);
427 return rc; 447 return rc;
428} 448}
429 449
@@ -441,11 +461,19 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
441 ssize_t rc; 461 ssize_t rc;
442 char ea_value[4]; 462 char ea_value[4];
443 __u32 mode; 463 __u32 mode;
464 struct tcon_link *tlink;
465 struct cifsTconInfo *tcon;
444 466
445 rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS", 467 tlink = cifs_sb_tlink(cifs_sb);
468 if (IS_ERR(tlink))
469 return PTR_ERR(tlink);
470 tcon = tlink_tcon(tlink);
471
472 rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS",
446 ea_value, 4 /* size of buf */, cifs_sb->local_nls, 473 ea_value, 4 /* size of buf */, cifs_sb->local_nls,
447 cifs_sb->mnt_cifs_flags & 474 cifs_sb->mnt_cifs_flags &
448 CIFS_MOUNT_MAP_SPECIAL_CHR); 475 CIFS_MOUNT_MAP_SPECIAL_CHR);
476 cifs_put_tlink(tlink);
449 if (rc < 0) 477 if (rc < 0)
450 return (int)rc; 478 return (int)rc;
451 else if (rc > 3) { 479 else if (rc > 3) {
@@ -468,6 +496,8 @@ static void
468cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, 496cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
469 struct cifs_sb_info *cifs_sb, bool adjust_tz) 497 struct cifs_sb_info *cifs_sb, bool adjust_tz)
470{ 498{
499 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
500
471 memset(fattr, 0, sizeof(*fattr)); 501 memset(fattr, 0, sizeof(*fattr));
472 fattr->cf_cifsattrs = le32_to_cpu(info->Attributes); 502 fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
473 if (info->DeletePending) 503 if (info->DeletePending)
@@ -482,8 +512,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
482 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); 512 fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
483 513
484 if (adjust_tz) { 514 if (adjust_tz) {
485 fattr->cf_ctime.tv_sec += cifs_sb->tcon->ses->server->timeAdj; 515 fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj;
486 fattr->cf_mtime.tv_sec += cifs_sb->tcon->ses->server->timeAdj; 516 fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
487 } 517 }
488 518
489 fattr->cf_eof = le64_to_cpu(info->EndOfFile); 519 fattr->cf_eof = le64_to_cpu(info->EndOfFile);
@@ -515,8 +545,8 @@ int cifs_get_file_info(struct file *filp)
515 struct cifs_fattr fattr; 545 struct cifs_fattr fattr;
516 struct inode *inode = filp->f_path.dentry->d_inode; 546 struct inode *inode = filp->f_path.dentry->d_inode;
517 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 547 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
518 struct cifsTconInfo *tcon = cifs_sb->tcon;
519 struct cifsFileInfo *cfile = filp->private_data; 548 struct cifsFileInfo *cfile = filp->private_data;
549 struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
520 550
521 xid = GetXid(); 551 xid = GetXid();
522 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); 552 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -554,26 +584,33 @@ int cifs_get_inode_info(struct inode **pinode,
554{ 584{
555 int rc = 0, tmprc; 585 int rc = 0, tmprc;
556 struct cifsTconInfo *pTcon; 586 struct cifsTconInfo *pTcon;
587 struct tcon_link *tlink;
557 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 588 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
558 char *buf = NULL; 589 char *buf = NULL;
559 bool adjustTZ = false; 590 bool adjustTZ = false;
560 struct cifs_fattr fattr; 591 struct cifs_fattr fattr;
561 592
562 pTcon = cifs_sb->tcon; 593 tlink = cifs_sb_tlink(cifs_sb);
594 if (IS_ERR(tlink))
595 return PTR_ERR(tlink);
596 pTcon = tlink_tcon(tlink);
597
563 cFYI(1, "Getting info on %s", full_path); 598 cFYI(1, "Getting info on %s", full_path);
564 599
565 if ((pfindData == NULL) && (*pinode != NULL)) { 600 if ((pfindData == NULL) && (*pinode != NULL)) {
566 if (CIFS_I(*pinode)->clientCanCacheRead) { 601 if (CIFS_I(*pinode)->clientCanCacheRead) {
567 cFYI(1, "No need to revalidate cached inode sizes"); 602 cFYI(1, "No need to revalidate cached inode sizes");
568 return rc; 603 goto cgii_exit;
569 } 604 }
570 } 605 }
571 606
572 /* if file info not passed in then get it from server */ 607 /* if file info not passed in then get it from server */
573 if (pfindData == NULL) { 608 if (pfindData == NULL) {
574 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 609 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
575 if (buf == NULL) 610 if (buf == NULL) {
576 return -ENOMEM; 611 rc = -ENOMEM;
612 goto cgii_exit;
613 }
577 pfindData = (FILE_ALL_INFO *)buf; 614 pfindData = (FILE_ALL_INFO *)buf;
578 615
579 /* could do find first instead but this returns more info */ 616 /* could do find first instead but this returns more info */
@@ -661,6 +698,13 @@ int cifs_get_inode_info(struct inode **pinode,
661 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) 698 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
662 cifs_sfu_mode(&fattr, full_path, cifs_sb, xid); 699 cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
663 700
701 /* check for Minshall+French symlinks */
702 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
703 tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
704 if (tmprc)
705 cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
706 }
707
664 if (!*pinode) { 708 if (!*pinode) {
665 *pinode = cifs_iget(sb, &fattr); 709 *pinode = cifs_iget(sb, &fattr);
666 if (!*pinode) 710 if (!*pinode)
@@ -671,6 +715,7 @@ int cifs_get_inode_info(struct inode **pinode,
671 715
672cgii_exit: 716cgii_exit:
673 kfree(buf); 717 kfree(buf);
718 cifs_put_tlink(tlink);
674 return rc; 719 return rc;
675} 720}
676 721
@@ -683,6 +728,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
683 int pplen = cifs_sb->prepathlen; 728 int pplen = cifs_sb->prepathlen;
684 int dfsplen; 729 int dfsplen;
685 char *full_path = NULL; 730 char *full_path = NULL;
731 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
686 732
687 /* if no prefix path, simply set path to the root of share to "" */ 733 /* if no prefix path, simply set path to the root of share to "" */
688 if (pplen == 0) { 734 if (pplen == 0) {
@@ -692,8 +738,8 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
692 return full_path; 738 return full_path;
693 } 739 }
694 740
695 if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS)) 741 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
696 dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1); 742 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
697 else 743 else
698 dfsplen = 0; 744 dfsplen = 0;
699 745
@@ -702,7 +748,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
702 return full_path; 748 return full_path;
703 749
704 if (dfsplen) { 750 if (dfsplen) {
705 strncpy(full_path, cifs_sb->tcon->treeName, dfsplen); 751 strncpy(full_path, tcon->treeName, dfsplen);
706 /* switch slash direction in prepath depending on whether 752 /* switch slash direction in prepath depending on whether
707 * windows or posix style path names 753 * windows or posix style path names
708 */ 754 */
@@ -801,6 +847,8 @@ retry_iget5_locked:
801 inode->i_flags |= S_NOATIME | S_NOCMTIME; 847 inode->i_flags |= S_NOATIME | S_NOCMTIME;
802 if (inode->i_state & I_NEW) { 848 if (inode->i_state & I_NEW) {
803 inode->i_ino = hash; 849 inode->i_ino = hash;
850 if (S_ISREG(inode->i_mode))
851 inode->i_data.backing_dev_info = sb->s_bdi;
804#ifdef CONFIG_CIFS_FSCACHE 852#ifdef CONFIG_CIFS_FSCACHE
805 /* initialize per-inode cache cookie pointer */ 853 /* initialize per-inode cache cookie pointer */
806 CIFS_I(inode)->fscache = NULL; 854 CIFS_I(inode)->fscache = NULL;
@@ -816,18 +864,18 @@ retry_iget5_locked:
816struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) 864struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
817{ 865{
818 int xid; 866 int xid;
819 struct cifs_sb_info *cifs_sb; 867 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
820 struct inode *inode = NULL; 868 struct inode *inode = NULL;
821 long rc; 869 long rc;
822 char *full_path; 870 char *full_path;
871 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
823 872
824 cifs_sb = CIFS_SB(sb);
825 full_path = cifs_build_path_to_root(cifs_sb); 873 full_path = cifs_build_path_to_root(cifs_sb);
826 if (full_path == NULL) 874 if (full_path == NULL)
827 return ERR_PTR(-ENOMEM); 875 return ERR_PTR(-ENOMEM);
828 876
829 xid = GetXid(); 877 xid = GetXid();
830 if (cifs_sb->tcon->unix_ext) 878 if (tcon->unix_ext)
831 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); 879 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
832 else 880 else
833 rc = cifs_get_inode_info(&inode, full_path, NULL, sb, 881 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -838,10 +886,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
838 886
839#ifdef CONFIG_CIFS_FSCACHE 887#ifdef CONFIG_CIFS_FSCACHE
840 /* populate tcon->resource_id */ 888 /* populate tcon->resource_id */
841 cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid; 889 tcon->resource_id = CIFS_I(inode)->uniqueid;
842#endif 890#endif
843 891
844 if (rc && cifs_sb->tcon->ipc) { 892 if (rc && tcon->ipc) {
845 cFYI(1, "ipc connection - fake read inode"); 893 cFYI(1, "ipc connection - fake read inode");
846 inode->i_mode |= S_IFDIR; 894 inode->i_mode |= S_IFDIR;
847 inode->i_nlink = 2; 895 inode->i_nlink = 2;
@@ -877,7 +925,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
877 struct cifsFileInfo *open_file; 925 struct cifsFileInfo *open_file;
878 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 926 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
879 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 927 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
880 struct cifsTconInfo *pTcon = cifs_sb->tcon; 928 struct tcon_link *tlink = NULL;
929 struct cifsTconInfo *pTcon;
881 FILE_BASIC_INFO info_buf; 930 FILE_BASIC_INFO info_buf;
882 931
883 if (attrs == NULL) 932 if (attrs == NULL)
@@ -916,13 +965,22 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
916 /* 965 /*
917 * If the file is already open for write, just use that fileid 966 * If the file is already open for write, just use that fileid
918 */ 967 */
919 open_file = find_writable_file(cifsInode); 968 open_file = find_writable_file(cifsInode, true);
920 if (open_file) { 969 if (open_file) {
921 netfid = open_file->netfid; 970 netfid = open_file->netfid;
922 netpid = open_file->pid; 971 netpid = open_file->pid;
972 pTcon = tlink_tcon(open_file->tlink);
923 goto set_via_filehandle; 973 goto set_via_filehandle;
924 } 974 }
925 975
976 tlink = cifs_sb_tlink(cifs_sb);
977 if (IS_ERR(tlink)) {
978 rc = PTR_ERR(tlink);
979 tlink = NULL;
980 goto out;
981 }
982 pTcon = tlink_tcon(tlink);
983
926 /* 984 /*
927 * NT4 apparently returns success on this call, but it doesn't 985 * NT4 apparently returns success on this call, but it doesn't
928 * really work. 986 * really work.
@@ -966,6 +1024,8 @@ set_via_filehandle:
966 else 1024 else
967 cifsFileInfo_put(open_file); 1025 cifsFileInfo_put(open_file);
968out: 1026out:
1027 if (tlink != NULL)
1028 cifs_put_tlink(tlink);
969 return rc; 1029 return rc;
970} 1030}
971 1031
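
The setattr path adds one wrinkle to that pattern: when an already-open writable handle supplies the tcon, no tlink is ever acquired, so the function keeps tlink NULL until it truly owns one and guards the put at the shared exit label. A control-flow sketch under that assumption, with malloc/free standing in for acquire/release:

#include <stdio.h>
#include <stdlib.h>

static int set_file_info_demo(int have_open_handle)
{
	int rc = 0;
	char *tlink = NULL;	/* stand-in for struct tcon_link * */

	if (have_open_handle)
		goto set_via_filehandle;	/* tlink stays NULL */

	tlink = malloc(16);
	if (tlink == NULL) {
		rc = -1;
		goto out;
	}

set_via_filehandle:
	/* ... issue the SetFileInfo call here ... */
out:
	if (tlink != NULL)
		free(tlink);	/* put only a link we actually own */
	return rc;
}

int main(void)
{
	printf("%d %d\n", set_file_info_demo(0), set_file_info_demo(1));
	return 0;
}
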
@@ -983,10 +1043,16 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
983 struct inode *inode = dentry->d_inode; 1043 struct inode *inode = dentry->d_inode;
984 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1044 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
985 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1045 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
986 struct cifsTconInfo *tcon = cifs_sb->tcon; 1046 struct tcon_link *tlink;
1047 struct cifsTconInfo *tcon;
987 __u32 dosattr, origattr; 1048 __u32 dosattr, origattr;
988 FILE_BASIC_INFO *info_buf = NULL; 1049 FILE_BASIC_INFO *info_buf = NULL;
989 1050
1051 tlink = cifs_sb_tlink(cifs_sb);
1052 if (IS_ERR(tlink))
1053 return PTR_ERR(tlink);
1054 tcon = tlink_tcon(tlink);
1055
990 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, 1056 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
991 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, 1057 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
992 &netfid, &oplock, NULL, cifs_sb->local_nls, 1058 &netfid, &oplock, NULL, cifs_sb->local_nls,
@@ -1055,6 +1121,7 @@ out_close:
1055 CIFSSMBClose(xid, tcon, netfid); 1121 CIFSSMBClose(xid, tcon, netfid);
1056out: 1122out:
1057 kfree(info_buf); 1123 kfree(info_buf);
1124 cifs_put_tlink(tlink);
1058 return rc; 1125 return rc;
1059 1126
1060 /* 1127 /*
@@ -1094,12 +1161,18 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1094 struct cifsInodeInfo *cifs_inode; 1161 struct cifsInodeInfo *cifs_inode;
1095 struct super_block *sb = dir->i_sb; 1162 struct super_block *sb = dir->i_sb;
1096 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 1163 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
1097 struct cifsTconInfo *tcon = cifs_sb->tcon; 1164 struct tcon_link *tlink;
1165 struct cifsTconInfo *tcon;
1098 struct iattr *attrs = NULL; 1166 struct iattr *attrs = NULL;
1099 __u32 dosattr = 0, origattr = 0; 1167 __u32 dosattr = 0, origattr = 0;
1100 1168
1101 cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry); 1169 cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
1102 1170
1171 tlink = cifs_sb_tlink(cifs_sb);
1172 if (IS_ERR(tlink))
1173 return PTR_ERR(tlink);
1174 tcon = tlink_tcon(tlink);
1175
1103 xid = GetXid(); 1176 xid = GetXid();
1104 1177
1105 /* Unlink can be called from rename so we can not take the 1178 /* Unlink can be called from rename so we can not take the
@@ -1107,8 +1180,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1107 full_path = build_path_from_dentry(dentry); 1180 full_path = build_path_from_dentry(dentry);
1108 if (full_path == NULL) { 1181 if (full_path == NULL) {
1109 rc = -ENOMEM; 1182 rc = -ENOMEM;
1110 FreeXid(xid); 1183 goto unlink_out;
1111 return rc;
1112 } 1184 }
1113 1185
1114 if ((tcon->ses->capabilities & CAP_UNIX) && 1186 if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -1174,10 +1246,11 @@ out_reval:
1174 dir->i_ctime = dir->i_mtime = current_fs_time(sb); 1246 dir->i_ctime = dir->i_mtime = current_fs_time(sb);
1175 cifs_inode = CIFS_I(dir); 1247 cifs_inode = CIFS_I(dir);
1176 CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ 1248 CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
1177 1249unlink_out:
1178 kfree(full_path); 1250 kfree(full_path);
1179 kfree(attrs); 1251 kfree(attrs);
1180 FreeXid(xid); 1252 FreeXid(xid);
1253 cifs_put_tlink(tlink);
1181 return rc; 1254 return rc;
1182} 1255}
1183 1256
@@ -1186,6 +1259,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1186 int rc = 0, tmprc; 1259 int rc = 0, tmprc;
1187 int xid; 1260 int xid;
1188 struct cifs_sb_info *cifs_sb; 1261 struct cifs_sb_info *cifs_sb;
1262 struct tcon_link *tlink;
1189 struct cifsTconInfo *pTcon; 1263 struct cifsTconInfo *pTcon;
1190 char *full_path = NULL; 1264 char *full_path = NULL;
1191 struct inode *newinode = NULL; 1265 struct inode *newinode = NULL;
@@ -1193,16 +1267,18 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1193 1267
1194 cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode); 1268 cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
1195 1269
1196 xid = GetXid();
1197
1198 cifs_sb = CIFS_SB(inode->i_sb); 1270 cifs_sb = CIFS_SB(inode->i_sb);
1199 pTcon = cifs_sb->tcon; 1271 tlink = cifs_sb_tlink(cifs_sb);
1272 if (IS_ERR(tlink))
1273 return PTR_ERR(tlink);
1274 pTcon = tlink_tcon(tlink);
1275
1276 xid = GetXid();
1200 1277
1201 full_path = build_path_from_dentry(direntry); 1278 full_path = build_path_from_dentry(direntry);
1202 if (full_path == NULL) { 1279 if (full_path == NULL) {
1203 rc = -ENOMEM; 1280 rc = -ENOMEM;
1204 FreeXid(xid); 1281 goto mkdir_out;
1205 return rc;
1206 } 1282 }
1207 1283
1208 if ((pTcon->ses->capabilities & CAP_UNIX) && 1284 if ((pTcon->ses->capabilities & CAP_UNIX) &&
@@ -1360,6 +1436,7 @@ mkdir_get_info:
1360mkdir_out: 1436mkdir_out:
1361 kfree(full_path); 1437 kfree(full_path);
1362 FreeXid(xid); 1438 FreeXid(xid);
1439 cifs_put_tlink(tlink);
1363 return rc; 1440 return rc;
1364} 1441}
1365 1442
@@ -1368,6 +1445,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1368 int rc = 0; 1445 int rc = 0;
1369 int xid; 1446 int xid;
1370 struct cifs_sb_info *cifs_sb; 1447 struct cifs_sb_info *cifs_sb;
1448 struct tcon_link *tlink;
1371 struct cifsTconInfo *pTcon; 1449 struct cifsTconInfo *pTcon;
1372 char *full_path = NULL; 1450 char *full_path = NULL;
1373 struct cifsInodeInfo *cifsInode; 1451 struct cifsInodeInfo *cifsInode;
@@ -1376,18 +1454,23 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1376 1454
1377 xid = GetXid(); 1455 xid = GetXid();
1378 1456
1379 cifs_sb = CIFS_SB(inode->i_sb);
1380 pTcon = cifs_sb->tcon;
1381
1382 full_path = build_path_from_dentry(direntry); 1457 full_path = build_path_from_dentry(direntry);
1383 if (full_path == NULL) { 1458 if (full_path == NULL) {
1384 rc = -ENOMEM; 1459 rc = -ENOMEM;
1385 FreeXid(xid); 1460 goto rmdir_exit;
1386 return rc;
1387 } 1461 }
1388 1462
1463 cifs_sb = CIFS_SB(inode->i_sb);
1464 tlink = cifs_sb_tlink(cifs_sb);
1465 if (IS_ERR(tlink)) {
1466 rc = PTR_ERR(tlink);
1467 goto rmdir_exit;
1468 }
1469 pTcon = tlink_tcon(tlink);
1470
1389 rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls, 1471 rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
1390 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1472 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1473 cifs_put_tlink(tlink);
1391 1474
1392 if (!rc) { 1475 if (!rc) {
1393 drop_nlink(inode); 1476 drop_nlink(inode);
@@ -1408,6 +1491,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1408 direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = 1491 direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
1409 current_fs_time(inode->i_sb); 1492 current_fs_time(inode->i_sb);
1410 1493
1494rmdir_exit:
1411 kfree(full_path); 1495 kfree(full_path);
1412 FreeXid(xid); 1496 FreeXid(xid);
1413 return rc; 1497 return rc;
@@ -1418,10 +1502,16 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1418 struct dentry *to_dentry, const char *toPath) 1502 struct dentry *to_dentry, const char *toPath)
1419{ 1503{
1420 struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb); 1504 struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
1421 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1505 struct tcon_link *tlink;
1506 struct cifsTconInfo *pTcon;
1422 __u16 srcfid; 1507 __u16 srcfid;
1423 int oplock, rc; 1508 int oplock, rc;
1424 1509
1510 tlink = cifs_sb_tlink(cifs_sb);
1511 if (IS_ERR(tlink))
1512 return PTR_ERR(tlink);
1513 pTcon = tlink_tcon(tlink);
1514
1425 /* try path-based rename first */ 1515 /* try path-based rename first */
1426 rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls, 1516 rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
1427 cifs_sb->mnt_cifs_flags & 1517 cifs_sb->mnt_cifs_flags &
@@ -1433,11 +1523,11 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1433 * rename by filehandle to various Windows servers. 1523 * rename by filehandle to various Windows servers.
1434 */ 1524 */
1435 if (rc == 0 || rc != -ETXTBSY) 1525 if (rc == 0 || rc != -ETXTBSY)
1436 return rc; 1526 goto do_rename_exit;
1437 1527
1438 /* open-file renames don't work across directories */ 1528 /* open-file renames don't work across directories */
1439 if (to_dentry->d_parent != from_dentry->d_parent) 1529 if (to_dentry->d_parent != from_dentry->d_parent)
1440 return rc; 1530 goto do_rename_exit;
1441 1531
1442 /* open the file to be renamed -- we need DELETE perms */ 1532 /* open the file to be renamed -- we need DELETE perms */
1443 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, 1533 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
@@ -1453,7 +1543,8 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1453 1543
1454 CIFSSMBClose(xid, pTcon, srcfid); 1544 CIFSSMBClose(xid, pTcon, srcfid);
1455 } 1545 }
1456 1546do_rename_exit:
1547 cifs_put_tlink(tlink);
1457 return rc; 1548 return rc;
1458} 1549}
1459 1550
@@ -1463,13 +1554,17 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1463 char *fromName = NULL; 1554 char *fromName = NULL;
1464 char *toName = NULL; 1555 char *toName = NULL;
1465 struct cifs_sb_info *cifs_sb; 1556 struct cifs_sb_info *cifs_sb;
1557 struct tcon_link *tlink;
1466 struct cifsTconInfo *tcon; 1558 struct cifsTconInfo *tcon;
1467 FILE_UNIX_BASIC_INFO *info_buf_source = NULL; 1559 FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
1468 FILE_UNIX_BASIC_INFO *info_buf_target; 1560 FILE_UNIX_BASIC_INFO *info_buf_target;
1469 int xid, rc, tmprc; 1561 int xid, rc, tmprc;
1470 1562
1471 cifs_sb = CIFS_SB(source_dir->i_sb); 1563 cifs_sb = CIFS_SB(source_dir->i_sb);
1472 tcon = cifs_sb->tcon; 1564 tlink = cifs_sb_tlink(cifs_sb);
1565 if (IS_ERR(tlink))
1566 return PTR_ERR(tlink);
1567 tcon = tlink_tcon(tlink);
1473 1568
1474 xid = GetXid(); 1569 xid = GetXid();
1475 1570
@@ -1545,6 +1640,7 @@ cifs_rename_exit:
1545 kfree(fromName); 1640 kfree(fromName);
1546 kfree(toName); 1641 kfree(toName);
1547 FreeXid(xid); 1642 FreeXid(xid);
1643 cifs_put_tlink(tlink);
1548 return rc; 1644 return rc;
1549} 1645}
1550 1646
@@ -1597,11 +1693,12 @@ int cifs_revalidate_file(struct file *filp)
1597{ 1693{
1598 int rc = 0; 1694 int rc = 0;
1599 struct inode *inode = filp->f_path.dentry->d_inode; 1695 struct inode *inode = filp->f_path.dentry->d_inode;
1696 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
1600 1697
1601 if (!cifs_inode_needs_reval(inode)) 1698 if (!cifs_inode_needs_reval(inode))
1602 goto check_inval; 1699 goto check_inval;
1603 1700
1604 if (CIFS_SB(inode->i_sb)->tcon->unix_ext) 1701 if (tlink_tcon(cfile->tlink)->unix_ext)
1605 rc = cifs_get_file_info_unix(filp); 1702 rc = cifs_get_file_info_unix(filp);
1606 else 1703 else
1607 rc = cifs_get_file_info(filp); 1704 rc = cifs_get_file_info(filp);
@@ -1642,7 +1739,7 @@ int cifs_revalidate_dentry(struct dentry *dentry)
1642 "jiffies %ld", full_path, inode, inode->i_count.counter, 1739 "jiffies %ld", full_path, inode, inode->i_count.counter,
1643 dentry, dentry->d_time, jiffies); 1740 dentry, dentry->d_time, jiffies);
1644 1741
1645 if (CIFS_SB(sb)->tcon->unix_ext) 1742 if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
1646 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); 1743 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
1647 else 1744 else
1648 rc = cifs_get_inode_info(&inode, full_path, NULL, sb, 1745 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -1658,13 +1755,29 @@ check_inval:
1658} 1755}
1659 1756
1660int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1757int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1661 struct kstat *stat) 1758 struct kstat *stat)
1662{ 1759{
1760 struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
1761 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
1663 int err = cifs_revalidate_dentry(dentry); 1762 int err = cifs_revalidate_dentry(dentry);
1763
1664 if (!err) { 1764 if (!err) {
1665 generic_fillattr(dentry->d_inode, stat); 1765 generic_fillattr(dentry->d_inode, stat);
1666 stat->blksize = CIFS_MAX_MSGSIZE; 1766 stat->blksize = CIFS_MAX_MSGSIZE;
1667 stat->ino = CIFS_I(dentry->d_inode)->uniqueid; 1767 stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
1768
1769 /*
1770 * If on a multiuser mount without unix extensions, and the
1771 * admin hasn't overridden them, set the ownership to the
1772 * fsuid/fsgid of the current process.
1773 */
1774 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
1775 !tcon->unix_ext) {
1776 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
1777 stat->uid = current_fsuid();
1778 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
1779 stat->gid = current_fsgid();
1780 }
1668 } 1781 }
1669 return err; 1782 return err;
1670} 1783}
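On a multiuser mount without UNIX extensions the server cannot report per-user ownership, so the getattr hunk above substitutes the caller's own fsuid/fsgid unless forceuid/forcegid pinned them at mount time. The decision in isolation, as a sketch (the helper name is made up; struct kstat and current_fsuid() come from linux/stat.h and linux/cred.h):

static void fill_multiuser_ids(struct kstat *stat,
			       struct cifs_sb_info *cifs_sb,
			       struct cifsTconInfo *tcon)
{
	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) ||
	    tcon->unix_ext)
		return;		/* server-supplied ownership stands */

	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
		stat->uid = current_fsuid();	/* no forceuid override */
	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
		stat->gid = current_fsgid();	/* no forcegid override */
}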
@@ -1706,7 +1819,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1706 struct cifsFileInfo *open_file; 1819 struct cifsFileInfo *open_file;
1707 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1820 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1708 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1821 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1709 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1822 struct tcon_link *tlink = NULL;
1823 struct cifsTconInfo *pTcon = NULL;
1710 1824
1711 /* 1825 /*
1712 * To avoid spurious oplock breaks from server, in the case of 1826 * To avoid spurious oplock breaks from server, in the case of
@@ -1717,10 +1831,11 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1717 * writebehind data than the SMB timeout for the SetPathInfo 1831 * writebehind data than the SMB timeout for the SetPathInfo
1718 * request would allow 1832 * request would allow
1719 */ 1833 */
1720 open_file = find_writable_file(cifsInode); 1834 open_file = find_writable_file(cifsInode, true);
1721 if (open_file) { 1835 if (open_file) {
1722 __u16 nfid = open_file->netfid; 1836 __u16 nfid = open_file->netfid;
1723 __u32 npid = open_file->pid; 1837 __u32 npid = open_file->pid;
1838 pTcon = tlink_tcon(open_file->tlink);
1724 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1839 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1725 npid, false); 1840 npid, false);
1726 cifsFileInfo_put(open_file); 1841 cifsFileInfo_put(open_file);
@@ -1735,6 +1850,13 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1735 rc = -EINVAL; 1850 rc = -EINVAL;
1736 1851
1737 if (rc != 0) { 1852 if (rc != 0) {
1853 if (pTcon == NULL) {
1854 tlink = cifs_sb_tlink(cifs_sb);
1855 if (IS_ERR(tlink))
1856 return PTR_ERR(tlink);
1857 pTcon = tlink_tcon(tlink);
1858 }
1859
1738 /* Set file size by pathname rather than by handle 1860 /* Set file size by pathname rather than by handle
1739 either because no valid, writeable file handle for 1861 either because no valid, writeable file handle for
1740 it was found or because there was an error setting 1862 it was found or because there was an error setting
@@ -1764,6 +1886,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1764 CIFSSMBClose(xid, pTcon, netfid); 1886 CIFSSMBClose(xid, pTcon, netfid);
1765 } 1887 }
1766 } 1888 }
1889 if (tlink)
1890 cifs_put_tlink(tlink);
1767 } 1891 }
1768 1892
1769 if (rc == 0) { 1893 if (rc == 0) {
@@ -1784,7 +1908,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1784 struct inode *inode = direntry->d_inode; 1908 struct inode *inode = direntry->d_inode;
1785 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1909 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1786 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1910 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1787 struct cifsTconInfo *pTcon = cifs_sb->tcon; 1911 struct tcon_link *tlink;
1912 struct cifsTconInfo *pTcon;
1788 struct cifs_unix_set_info_args *args = NULL; 1913 struct cifs_unix_set_info_args *args = NULL;
1789 struct cifsFileInfo *open_file; 1914 struct cifsFileInfo *open_file;
1790 1915
@@ -1871,17 +1996,25 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1871 args->ctime = NO_CHANGE_64; 1996 args->ctime = NO_CHANGE_64;
1872 1997
1873 args->device = 0; 1998 args->device = 0;
1874 open_file = find_writable_file(cifsInode); 1999 open_file = find_writable_file(cifsInode, true);
1875 if (open_file) { 2000 if (open_file) {
1876 u16 nfid = open_file->netfid; 2001 u16 nfid = open_file->netfid;
1877 u32 npid = open_file->pid; 2002 u32 npid = open_file->pid;
2003 pTcon = tlink_tcon(open_file->tlink);
1878 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); 2004 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
1879 cifsFileInfo_put(open_file); 2005 cifsFileInfo_put(open_file);
1880 } else { 2006 } else {
2007 tlink = cifs_sb_tlink(cifs_sb);
2008 if (IS_ERR(tlink)) {
2009 rc = PTR_ERR(tlink);
2010 goto out;
2011 }
2012 pTcon = tlink_tcon(tlink);
1881 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, 2013 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
1882 cifs_sb->local_nls, 2014 cifs_sb->local_nls,
1883 cifs_sb->mnt_cifs_flags & 2015 cifs_sb->mnt_cifs_flags &
1884 CIFS_MOUNT_MAP_SPECIAL_CHR); 2016 CIFS_MOUNT_MAP_SPECIAL_CHR);
2017 cifs_put_tlink(tlink);
1885 } 2018 }
1886 2019
1887 if (rc) 2020 if (rc)
@@ -2062,7 +2195,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
2062{ 2195{
2063 struct inode *inode = direntry->d_inode; 2196 struct inode *inode = direntry->d_inode;
2064 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 2197 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
2065 struct cifsTconInfo *pTcon = cifs_sb->tcon; 2198 struct cifsTconInfo *pTcon = cifs_sb_master_tcon(cifs_sb);
2066 2199
2067 if (pTcon->unix_ext) 2200 if (pTcon->unix_ext)
2068 return cifs_setattr_unix(direntry, attrs); 2201 return cifs_setattr_unix(direntry, attrs);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 9d38a71c8e14..077bf756f342 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -37,11 +37,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
37 int xid; 37 int xid;
38 struct cifs_sb_info *cifs_sb; 38 struct cifs_sb_info *cifs_sb;
39#ifdef CONFIG_CIFS_POSIX 39#ifdef CONFIG_CIFS_POSIX
40 struct cifsFileInfo *pSMBFile = filep->private_data;
41 struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink);
40 __u64 ExtAttrBits = 0; 42 __u64 ExtAttrBits = 0;
41 __u64 ExtAttrMask = 0; 43 __u64 ExtAttrMask = 0;
42 __u64 caps; 44 __u64 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
43 struct cifsTconInfo *tcon;
44 struct cifsFileInfo *pSMBFile = filep->private_data;
45#endif /* CONFIG_CIFS_POSIX */ 45#endif /* CONFIG_CIFS_POSIX */
46 46
47 xid = GetXid(); 47 xid = GetXid();
@@ -50,17 +50,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
50 50
51 cifs_sb = CIFS_SB(inode->i_sb); 51 cifs_sb = CIFS_SB(inode->i_sb);
52 52
53#ifdef CONFIG_CIFS_POSIX
54 tcon = cifs_sb->tcon;
55 if (tcon)
56 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
57 else {
58 rc = -EIO;
59 FreeXid(xid);
60 return -EIO;
61 }
62#endif /* CONFIG_CIFS_POSIX */
63
64 switch (command) { 53 switch (command) {
65 case CIFS_IOC_CHECKUMOUNT: 54 case CIFS_IOC_CHECKUMOUNT:
66 cFYI(1, "User unmount attempted"); 55 cFYI(1, "User unmount attempted");
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 473ca8033656..85cdbf831e7b 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,6 +28,296 @@
28#include "cifsproto.h" 28#include "cifsproto.h"
29#include "cifs_debug.h" 29#include "cifs_debug.h"
30#include "cifs_fs_sb.h" 30#include "cifs_fs_sb.h"
31#include "md5.h"
32
33#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
34#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
35#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
36#define CIFS_MF_SYMLINK_LINK_MAXLEN (1024)
37#define CIFS_MF_SYMLINK_FILE_SIZE \
38 (CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN)
39
40#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n"
41#define CIFS_MF_SYMLINK_MD5_FORMAT \
42 "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n"
43#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \
44 md5_hash[0], md5_hash[1], md5_hash[2], md5_hash[3], \
45 md5_hash[4], md5_hash[5], md5_hash[6], md5_hash[7], \
46 md5_hash[8], md5_hash[9], md5_hash[10], md5_hash[11],\
47 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
48
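Read together, the offset macros above imply this on-disk layout for a Minshall+French symlink file (a reconstruction from the constants, not a quote from any spec):

/*
 * bytes 0..4     "XSym\n"                      magic
 * bytes 5..9     "%04u\n"                      decimal target length
 * bytes 10..41   32 hex digits of MD5(target)
 * byte  42       '\n'
 * bytes 43..     target path, then '\n', then space padding
 * total          1067 bytes = CIFS_MF_SYMLINK_FILE_SIZE
 */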
49static int
50CIFSParseMFSymlink(const u8 *buf,
51 unsigned int buf_len,
52 unsigned int *_link_len,
53 char **_link_str)
54{
55 int rc;
56 unsigned int link_len;
57 const char *md5_str1;
58 const char *link_str;
59 struct MD5Context md5_ctx;
60 u8 md5_hash[16];
61 char md5_str2[34];
62
63 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
64 return -EINVAL;
65
66 md5_str1 = (const char *)&buf[CIFS_MF_SYMLINK_MD5_OFFSET];
67 link_str = (const char *)&buf[CIFS_MF_SYMLINK_LINK_OFFSET];
68
69 rc = sscanf(buf, CIFS_MF_SYMLINK_LEN_FORMAT, &link_len);
70 if (rc != 1)
71 return -EINVAL;
72
73 cifs_MD5_init(&md5_ctx);
74 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
75 cifs_MD5_final(md5_hash, &md5_ctx);
76
77 snprintf(md5_str2, sizeof(md5_str2),
78 CIFS_MF_SYMLINK_MD5_FORMAT,
79 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
80
81 if (strncmp(md5_str1, md5_str2, 17) != 0)
82 return -EINVAL;
83
84 if (_link_str) {
85 *_link_str = kstrndup(link_str, link_len, GFP_KERNEL);
86 if (!*_link_str)
87 return -ENOMEM;
88 }
89
90 *_link_len = link_len;
91 return 0;
92}
93
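As a stand-alone illustration of the first step above, a hypothetical user-space helper that checks only the header (not part of the patch):

#include <stdio.h>

/* Parse the "XSym\n%04u\n" prefix; returns the declared target
 * length, or -1 when the magic line does not match. */
static int mf_symlink_target_len(const char *buf)
{
	unsigned int len;

	if (sscanf(buf, "XSym\n%4u\n", &len) != 1)
		return -1;
	return (int)len;
}

Fed the 1067-byte buffer that CIFSFormatMFSymlink() below writes for a 4-character target, this returns 4.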
94static int
95CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
96{
97 unsigned int link_len;
98 unsigned int ofs;
99 struct MD5Context md5_ctx;
100 u8 md5_hash[16];
101
102 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
103 return -EINVAL;
104
105 link_len = strlen(link_str);
106
107 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
108 return -ENAMETOOLONG;
109
110 cifs_MD5_init(&md5_ctx);
111 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
112 cifs_MD5_final(md5_hash, &md5_ctx);
113
114 snprintf(buf, buf_len,
115 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
116 link_len,
117 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
118
119 ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
120 memcpy(buf + ofs, link_str, link_len);
121
122 ofs += link_len;
123 if (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
124 buf[ofs] = '\n';
125 ofs++;
126 }
127
128 while (ofs < CIFS_MF_SYMLINK_FILE_SIZE) {
129 buf[ofs] = ' ';
130 ofs++;
131 }
132
133 return 0;
134}
135
136static int
137CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
138 const char *fromName, const char *toName,
139 const struct nls_table *nls_codepage, int remap)
140{
141 int rc;
142 int oplock = 0;
143 __u16 netfid = 0;
144 u8 *buf;
145 unsigned int bytes_written = 0;
146
147 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
148 if (!buf)
149 return -ENOMEM;
150
151 rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
152 if (rc != 0) {
153 kfree(buf);
154 return rc;
155 }
156
157 rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
158 CREATE_NOT_DIR, &netfid, &oplock, NULL,
159 nls_codepage, remap);
160 if (rc != 0) {
161 kfree(buf);
162 return rc;
163 }
164
165 rc = CIFSSMBWrite(xid, tcon, netfid,
166 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
167 0 /* offset */,
168 &bytes_written, buf, NULL, 0);
169 CIFSSMBClose(xid, tcon, netfid);
170 kfree(buf);
171 if (rc != 0)
172 return rc;
173
174 if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
175 return -EIO;
176
177 return 0;
178}
179
180static int
181CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
182 const unsigned char *searchName, char **symlinkinfo,
183 const struct nls_table *nls_codepage, int remap)
184{
185 int rc;
186 int oplock = 0;
187 __u16 netfid = 0;
188 u8 *buf;
189 char *pbuf;
190 unsigned int bytes_read = 0;
191 int buf_type = CIFS_NO_BUFFER;
192 unsigned int link_len = 0;
193 FILE_ALL_INFO file_info;
194
195 rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
196 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
197 nls_codepage, remap);
198 if (rc != 0)
199 return rc;
200
201 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
202 CIFSSMBClose(xid, tcon, netfid);
203 /* it's not a symlink */
204 return -EINVAL;
205 }
206
207 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
208 if (!buf)
209 return -ENOMEM;
210 pbuf = buf;
211
212 rc = CIFSSMBRead(xid, tcon, netfid,
213 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
214 0 /* offset */,
215 &bytes_read, &pbuf, &buf_type);
216 CIFSSMBClose(xid, tcon, netfid);
217 if (rc != 0) {
218 kfree(buf);
219 return rc;
220 }
221
222 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo);
223 kfree(buf);
224 if (rc != 0)
225 return rc;
226
227 return 0;
228}
229
230bool
231CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
232{
233 if (!(fattr->cf_mode & S_IFREG))
234 /* it's not a symlink */
235 return false;
236
237 if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
238 /* it's not a symlink */
239 return false;
240
241 return true;
242}
243
244int
245CIFSCheckMFSymlink(struct cifs_fattr *fattr,
246 const unsigned char *path,
247 struct cifs_sb_info *cifs_sb, int xid)
248{
249 int rc;
250 int oplock = 0;
251 __u16 netfid = 0;
252 struct tcon_link *tlink;
253 struct cifsTconInfo *pTcon;
254 u8 *buf;
255 char *pbuf;
256 unsigned int bytes_read = 0;
257 int buf_type = CIFS_NO_BUFFER;
258 unsigned int link_len = 0;
259 FILE_ALL_INFO file_info;
260
261 if (!CIFSCouldBeMFSymlink(fattr))
262 /* it's not a symlink */
263 return 0;
264
265 tlink = cifs_sb_tlink(cifs_sb);
266 if (IS_ERR(tlink))
267 return PTR_ERR(tlink);
268 pTcon = tlink_tcon(tlink);
269
270 rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
271 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
272 cifs_sb->local_nls,
273 cifs_sb->mnt_cifs_flags &
274 CIFS_MOUNT_MAP_SPECIAL_CHR);
275 if (rc != 0)
276 goto out;
277
278 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) {
279 CIFSSMBClose(xid, pTcon, netfid);
280 /* it's not a symlink */
281 goto out;
282 }
283
284 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
285 if (!buf) {
286 rc = -ENOMEM;
287 goto out;
288 }
289 pbuf = buf;
290
291 rc = CIFSSMBRead(xid, pTcon, netfid,
292 CIFS_MF_SYMLINK_FILE_SIZE /* length */,
293 0 /* offset */,
294 &bytes_read, &pbuf, &buf_type);
295 CIFSSMBClose(xid, pTcon, netfid);
296 if (rc != 0) {
297 kfree(buf);
298 goto out;
299 }
300
301 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
302 kfree(buf);
303 if (rc == -EINVAL) {
304 /* it's not a symlink */
305 rc = 0;
306 goto out;
307 }
308
309 if (rc != 0)
310 goto out;
311
312 /* it is a symlink */
313 fattr->cf_eof = link_len;
314 fattr->cf_mode &= ~S_IFMT;
315 fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
316 fattr->cf_dtype = DT_LNK;
317out:
318 cifs_put_tlink(tlink);
319 return rc;
320}
31 321
32int 322int
33cifs_hardlink(struct dentry *old_file, struct inode *inode, 323cifs_hardlink(struct dentry *old_file, struct inode *inode,
@@ -37,17 +327,17 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
37 int xid; 327 int xid;
38 char *fromName = NULL; 328 char *fromName = NULL;
39 char *toName = NULL; 329 char *toName = NULL;
40 struct cifs_sb_info *cifs_sb_target; 330 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
331 struct tcon_link *tlink;
41 struct cifsTconInfo *pTcon; 332 struct cifsTconInfo *pTcon;
42 struct cifsInodeInfo *cifsInode; 333 struct cifsInodeInfo *cifsInode;
43 334
44 xid = GetXid(); 335 tlink = cifs_sb_tlink(cifs_sb);
45 336 if (IS_ERR(tlink))
46 cifs_sb_target = CIFS_SB(inode->i_sb); 337 return PTR_ERR(tlink);
47 pTcon = cifs_sb_target->tcon; 338 pTcon = tlink_tcon(tlink);
48 339
49/* No need to check for cross device links since server will do that 340 xid = GetXid();
50 BB note DFS case in future though (when we may have to check) */
51 341
52 fromName = build_path_from_dentry(old_file); 342 fromName = build_path_from_dentry(old_file);
53 toName = build_path_from_dentry(direntry); 343 toName = build_path_from_dentry(direntry);
@@ -56,16 +346,15 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
56 goto cifs_hl_exit; 346 goto cifs_hl_exit;
57 } 347 }
58 348
59/* if (cifs_sb_target->tcon->ses->capabilities & CAP_UNIX)*/
60 if (pTcon->unix_ext) 349 if (pTcon->unix_ext)
61 rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName, 350 rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName,
62 cifs_sb_target->local_nls, 351 cifs_sb->local_nls,
63 cifs_sb_target->mnt_cifs_flags & 352 cifs_sb->mnt_cifs_flags &
64 CIFS_MOUNT_MAP_SPECIAL_CHR); 353 CIFS_MOUNT_MAP_SPECIAL_CHR);
65 else { 354 else {
66 rc = CIFSCreateHardLink(xid, pTcon, fromName, toName, 355 rc = CIFSCreateHardLink(xid, pTcon, fromName, toName,
67 cifs_sb_target->local_nls, 356 cifs_sb->local_nls,
68 cifs_sb_target->mnt_cifs_flags & 357 cifs_sb->mnt_cifs_flags &
69 CIFS_MOUNT_MAP_SPECIAL_CHR); 358 CIFS_MOUNT_MAP_SPECIAL_CHR);
70 if ((rc == -EIO) || (rc == -EINVAL)) 359 if ((rc == -EIO) || (rc == -EINVAL))
71 rc = -EOPNOTSUPP; 360 rc = -EOPNOTSUPP;
@@ -101,6 +390,7 @@ cifs_hl_exit:
101 kfree(fromName); 390 kfree(fromName);
102 kfree(toName); 391 kfree(toName);
103 FreeXid(xid); 392 FreeXid(xid);
393 cifs_put_tlink(tlink);
104 return rc; 394 return rc;
105} 395}
106 396
@@ -113,10 +403,19 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
113 char *full_path = NULL; 403 char *full_path = NULL;
114 char *target_path = NULL; 404 char *target_path = NULL;
115 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 405 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
116 struct cifsTconInfo *tcon = cifs_sb->tcon; 406 struct tcon_link *tlink = NULL;
407 struct cifsTconInfo *tcon;
117 408
118 xid = GetXid(); 409 xid = GetXid();
119 410
411 tlink = cifs_sb_tlink(cifs_sb);
412 if (IS_ERR(tlink)) {
413 rc = PTR_ERR(tlink);
414 tlink = NULL;
415 goto out;
416 }
417 tcon = tlink_tcon(tlink);
418
120 /* 419 /*
121 * For now, we just handle symlinks with unix extensions enabled. 420 * For now, we just handle symlinks with unix extensions enabled.
122 * Eventually we should handle NTFS reparse points, and MacOS 421 * Eventually we should handle NTFS reparse points, and MacOS
@@ -130,7 +429,8 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
130 * but there doesn't seem to be any harm in allowing the client to 429 * but there doesn't seem to be any harm in allowing the client to
131 * read them. 430 * read them.
132 */ 431 */
133 if (!(tcon->ses->capabilities & CAP_UNIX)) { 432 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
433 && !(tcon->ses->capabilities & CAP_UNIX)) {
134 rc = -EACCES; 434 rc = -EACCES;
135 goto out; 435 goto out;
136 } 436 }
@@ -141,8 +441,21 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
141 441
142 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode); 442 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
143 443
144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, 444 rc = -EACCES;
145 cifs_sb->local_nls); 445 /*
 446 * First try Minshall+French symlinks, if configured,
 447 * and fall back to UNIX Extensions symlinks.
448 */
449 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
450 rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path,
451 cifs_sb->local_nls,
452 cifs_sb->mnt_cifs_flags &
453 CIFS_MOUNT_MAP_SPECIAL_CHR);
454
455 if ((rc != 0) && (tcon->ses->capabilities & CAP_UNIX))
456 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
457 cifs_sb->local_nls);
458
146 kfree(full_path); 459 kfree(full_path);
147out: 460out:
148 if (rc != 0) { 461 if (rc != 0) {
@@ -151,6 +464,8 @@ out:
151 } 464 }
152 465
153 FreeXid(xid); 466 FreeXid(xid);
467 if (tlink)
468 cifs_put_tlink(tlink);
154 nd_set_link(nd, target_path); 469 nd_set_link(nd, target_path);
155 return NULL; 470 return NULL;
156} 471}
@@ -160,29 +475,37 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
160{ 475{
161 int rc = -EOPNOTSUPP; 476 int rc = -EOPNOTSUPP;
162 int xid; 477 int xid;
163 struct cifs_sb_info *cifs_sb; 478 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
479 struct tcon_link *tlink;
164 struct cifsTconInfo *pTcon; 480 struct cifsTconInfo *pTcon;
165 char *full_path = NULL; 481 char *full_path = NULL;
166 struct inode *newinode = NULL; 482 struct inode *newinode = NULL;
167 483
168 xid = GetXid(); 484 xid = GetXid();
169 485
170 cifs_sb = CIFS_SB(inode->i_sb); 486 tlink = cifs_sb_tlink(cifs_sb);
171 pTcon = cifs_sb->tcon; 487 if (IS_ERR(tlink)) {
488 rc = PTR_ERR(tlink);
489 goto symlink_exit;
490 }
491 pTcon = tlink_tcon(tlink);
172 492
173 full_path = build_path_from_dentry(direntry); 493 full_path = build_path_from_dentry(direntry);
174
175 if (full_path == NULL) { 494 if (full_path == NULL) {
176 rc = -ENOMEM; 495 rc = -ENOMEM;
177 FreeXid(xid); 496 goto symlink_exit;
178 return rc;
179 } 497 }
180 498
181 cFYI(1, "Full path: %s", full_path); 499 cFYI(1, "Full path: %s", full_path);
182 cFYI(1, "symname is %s", symname); 500 cFYI(1, "symname is %s", symname);
183 501
184 /* BB what if DFS and this volume is on different share? BB */ 502 /* BB what if DFS and this volume is on different share? BB */
185 if (pTcon->unix_ext) 503 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
504 rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
505 cifs_sb->local_nls,
506 cifs_sb->mnt_cifs_flags &
507 CIFS_MOUNT_MAP_SPECIAL_CHR);
508 else if (pTcon->unix_ext)
186 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, 509 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
187 cifs_sb->local_nls); 510 cifs_sb->local_nls);
188 /* else 511 /* else
@@ -208,8 +531,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
208 d_instantiate(direntry, newinode); 531 d_instantiate(direntry, newinode);
209 } 532 }
210 } 533 }
211 534symlink_exit:
212 kfree(full_path); 535 kfree(full_path);
536 cifs_put_tlink(tlink);
213 FreeXid(xid); 537 FreeXid(xid);
214 return rc; 538 return rc;
215} 539}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3ccadc1326d6..1c681f6a6803 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -347,7 +347,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
347 if (current_fsuid() != treeCon->ses->linux_uid) { 347 if (current_fsuid() != treeCon->ses->linux_uid) {
348 cFYI(1, "Multiuser mode and UID " 348 cFYI(1, "Multiuser mode and UID "
349 "did not match tcon uid"); 349 "did not match tcon uid");
350 read_lock(&cifs_tcp_ses_lock); 350 spin_lock(&cifs_tcp_ses_lock);
351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
353 if (ses->linux_uid == current_fsuid()) { 353 if (ses->linux_uid == current_fsuid()) {
@@ -361,7 +361,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
361 } 361 }
362 } 362 }
363 } 363 }
364 read_unlock(&cifs_tcp_ses_lock); 364 spin_unlock(&cifs_tcp_ses_lock);
365 } 365 }
366 } 366 }
367 } 367 }
@@ -551,7 +551,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
551 return false; 551 return false;
552 552
553 /* look up tcon based on tid & uid */ 553 /* look up tcon based on tid & uid */
554 read_lock(&cifs_tcp_ses_lock); 554 spin_lock(&cifs_tcp_ses_lock);
555 list_for_each(tmp, &srv->smb_ses_list) { 555 list_for_each(tmp, &srv->smb_ses_list) {
556 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 556 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
557 list_for_each(tmp1, &ses->tcon_list) { 557 list_for_each(tmp1, &ses->tcon_list) {
@@ -560,25 +560,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
560 continue; 560 continue;
561 561
562 cifs_stats_inc(&tcon->num_oplock_brks); 562 cifs_stats_inc(&tcon->num_oplock_brks);
563 read_lock(&GlobalSMBSeslock); 563 spin_lock(&cifs_file_list_lock);
564 list_for_each(tmp2, &tcon->openFileList) { 564 list_for_each(tmp2, &tcon->openFileList) {
565 netfile = list_entry(tmp2, struct cifsFileInfo, 565 netfile = list_entry(tmp2, struct cifsFileInfo,
566 tlist); 566 tlist);
567 if (pSMB->Fid != netfile->netfid) 567 if (pSMB->Fid != netfile->netfid)
568 continue; 568 continue;
569 569
570 /*
571 * don't do anything if file is about to be
572 * closed anyway.
573 */
574 if (netfile->closePend) {
575 read_unlock(&GlobalSMBSeslock);
576 read_unlock(&cifs_tcp_ses_lock);
577 return true;
578 }
579
580 cFYI(1, "file id match, oplock break"); 570 cFYI(1, "file id match, oplock break");
581 pCifsInode = CIFS_I(netfile->pInode); 571 pCifsInode = CIFS_I(netfile->dentry->d_inode);
582 pCifsInode->clientCanCacheAll = false; 572 pCifsInode->clientCanCacheAll = false;
583 if (pSMB->OplockLevel == 0) 573 if (pSMB->OplockLevel == 0)
584 pCifsInode->clientCanCacheRead = false; 574 pCifsInode->clientCanCacheRead = false;
@@ -594,17 +584,17 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
594 cifs_oplock_break_get(netfile); 584 cifs_oplock_break_get(netfile);
595 netfile->oplock_break_cancelled = false; 585 netfile->oplock_break_cancelled = false;
596 586
597 read_unlock(&GlobalSMBSeslock); 587 spin_unlock(&cifs_file_list_lock);
598 read_unlock(&cifs_tcp_ses_lock); 588 spin_unlock(&cifs_tcp_ses_lock);
599 return true; 589 return true;
600 } 590 }
601 read_unlock(&GlobalSMBSeslock); 591 spin_unlock(&cifs_file_list_lock);
602 read_unlock(&cifs_tcp_ses_lock); 592 spin_unlock(&cifs_tcp_ses_lock);
603 cFYI(1, "No matching file for oplock break"); 593 cFYI(1, "No matching file for oplock break");
604 return true; 594 return true;
605 } 595 }
606 } 596 }
607 read_unlock(&cifs_tcp_ses_lock); 597 spin_unlock(&cifs_tcp_ses_lock);
608 cFYI(1, "Can not process oplock break for non-existent connection"); 598 cFYI(1, "Can not process oplock break for non-existent connection");
609 return true; 599 return true;
610} 600}
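The hunk above swaps the old rwlocks (cifs_tcp_ses_lock was a rwlock_t; GlobalSMBSeslock guarded the open-file lists) for plain spinlocks while keeping the nesting. A compile-level sketch of the ordering as it reads in is_valid_oplock_break(); the lock names here are stand-ins, not the real globals:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_tcp_ses_lock);	/* stands in for cifs_tcp_ses_lock */
static DEFINE_SPINLOCK(example_file_list_lock);	/* stands in for cifs_file_list_lock */

static void walk_open_files(void)
{
	/* the session/tcon lock always nests outside the file-list lock */
	spin_lock(&example_tcp_ses_lock);
	spin_lock(&example_file_list_lock);
	/* ... match the oplock break Fid against tcon->openFileList ... */
	spin_unlock(&example_file_list_lock);
	spin_unlock(&example_tcp_ses_lock);
}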
@@ -729,6 +719,6 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
729 "properly. Hardlinks will not be recognized on this " 719 "properly. Hardlinks will not be recognized on this "
730 "mount. Consider mounting with the \"noserverino\" " 720 "mount. Consider mounting with the \"noserverino\" "
731 "option to silence this message.", 721 "option to silence this message.",
732 cifs_sb->tcon->treeName); 722 cifs_sb_master_tcon(cifs_sb)->treeName);
733 } 723 }
734} 724}
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 49c9a4e75319..5d52e4a3b1ed 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -61,6 +61,21 @@
61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000
62#define NTLMSSP_NEGOTIATE_56 0x80000000 62#define NTLMSSP_NEGOTIATE_56 0x80000000
63 63
64/* Define AV Pair Field IDs */
65enum av_field_type {
66 NTLMSSP_AV_EOL = 0,
67 NTLMSSP_AV_NB_COMPUTER_NAME,
68 NTLMSSP_AV_NB_DOMAIN_NAME,
69 NTLMSSP_AV_DNS_COMPUTER_NAME,
70 NTLMSSP_AV_DNS_DOMAIN_NAME,
71 NTLMSSP_AV_DNS_TREE_NAME,
72 NTLMSSP_AV_FLAGS,
73 NTLMSSP_AV_TIMESTAMP,
74 NTLMSSP_AV_RESTRICTION,
75 NTLMSSP_AV_TARGET_NAME,
76 NTLMSSP_AV_CHANNEL_BINDINGS
77};
78
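These IDs index the AV pairs carried in the NTLMSSP challenge's TargetInfo blob; per MS-NLMP each pair is a little-endian 16-bit type and 16-bit length followed by the value, and the list terminates at NTLMSSP_AV_EOL. A hedged sketch of walking such a blob, assuming the enum above is in scope (the struct and function are illustrative, not from this patch):

#include <linux/types.h>

struct av_pair_hdr {
	__le16 type;
	__le16 len;
} __packed;			/* value bytes follow immediately */

static const u8 *find_av_pair(const u8 *blob, unsigned int blob_len,
			      enum av_field_type want, u16 *out_len)
{
	unsigned int off = 0;

	while (off + sizeof(struct av_pair_hdr) <= blob_len) {
		const struct av_pair_hdr *av = (const void *)(blob + off);
		u16 type = le16_to_cpu(av->type);
		u16 len = le16_to_cpu(av->len);

		if (type == NTLMSSP_AV_EOL)
			break;
		off += sizeof(*av);
		if (off + len > blob_len)
			break;		/* malformed blob */
		if (type == want) {
			*out_len = len;
			return blob + off;
		}
		off += len;
	}
	return NULL;
}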
64/* Although typedefs are not commonly used for structure definitions */ 79/* Although typedefs are not commonly used for structure definitions */
65/* in the Linux kernel, in this particular case they are useful */ 80/* in the Linux kernel, in this particular case they are useful */
66/* to more closely match the standards document for NTLMSSP from */ 81/* to more closely match the standards document for NTLMSSP from */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index d5e591fab475..ef7bb7b50f58 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -102,7 +102,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
102 return NULL; 102 return NULL;
103 } 103 }
104 104
105 if (CIFS_SB(sb)->tcon->nocase) 105 if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
106 dentry->d_op = &cifs_ci_dentry_ops; 106 dentry->d_op = &cifs_ci_dentry_ops;
107 else 107 else
108 dentry->d_op = &cifs_dentry_ops; 108 dentry->d_op = &cifs_dentry_ops;
@@ -171,7 +171,7 @@ static void
171cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, 171cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
172 struct cifs_sb_info *cifs_sb) 172 struct cifs_sb_info *cifs_sb)
173{ 173{
174 int offset = cifs_sb->tcon->ses->server->timeAdj; 174 int offset = cifs_sb_master_tcon(cifs_sb)->ses->server->timeAdj;
175 175
176 memset(fattr, 0, sizeof(*fattr)); 176 memset(fattr, 0, sizeof(*fattr));
177 fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate, 177 fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
@@ -199,7 +199,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
199 int len; 199 int len;
200 int oplock = 0; 200 int oplock = 0;
201 int rc; 201 int rc;
202 struct cifsTconInfo *ptcon = cifs_sb->tcon; 202 struct cifsTconInfo *ptcon = cifs_sb_tcon(cifs_sb);
203 char *tmpbuffer; 203 char *tmpbuffer;
204 204
205 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, 205 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
@@ -223,34 +223,35 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
223static int initiate_cifs_search(const int xid, struct file *file) 223static int initiate_cifs_search(const int xid, struct file *file)
224{ 224{
225 int rc = 0; 225 int rc = 0;
226 char *full_path; 226 char *full_path = NULL;
227 struct cifsFileInfo *cifsFile; 227 struct cifsFileInfo *cifsFile;
228 struct cifs_sb_info *cifs_sb; 228 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
229 struct tcon_link *tlink;
229 struct cifsTconInfo *pTcon; 230 struct cifsTconInfo *pTcon;
230 231
231 if (file->private_data == NULL) { 232 tlink = cifs_sb_tlink(cifs_sb);
233 if (IS_ERR(tlink))
234 return PTR_ERR(tlink);
235 pTcon = tlink_tcon(tlink);
236
237 if (file->private_data == NULL)
232 file->private_data = 238 file->private_data =
233 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 239 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
240 if (file->private_data == NULL) {
241 rc = -ENOMEM;
242 goto error_exit;
234 } 243 }
235 244
236 if (file->private_data == NULL)
237 return -ENOMEM;
238 cifsFile = file->private_data; 245 cifsFile = file->private_data;
239 cifsFile->invalidHandle = true; 246 cifsFile->invalidHandle = true;
240 cifsFile->srch_inf.endOfSearch = false; 247 cifsFile->srch_inf.endOfSearch = false;
241 248 cifsFile->tlink = cifs_get_tlink(tlink);
242 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
243 if (cifs_sb == NULL)
244 return -EINVAL;
245
246 pTcon = cifs_sb->tcon;
247 if (pTcon == NULL)
248 return -EINVAL;
249 249
250 full_path = build_path_from_dentry(file->f_path.dentry); 250 full_path = build_path_from_dentry(file->f_path.dentry);
251 251 if (full_path == NULL) {
252 if (full_path == NULL) 252 rc = -ENOMEM;
253 return -ENOMEM; 253 goto error_exit;
254 }
254 255
255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos); 256 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
256 257
@@ -283,7 +284,9 @@ ffirst_retry:
283 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 284 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
284 goto ffirst_retry; 285 goto ffirst_retry;
285 } 286 }
287error_exit:
286 kfree(full_path); 288 kfree(full_path);
289 cifs_put_tlink(tlink);
287 return rc; 290 return rc;
288} 291}
289 292
@@ -525,14 +528,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
525 (index_to_find < first_entry_in_buffer)) { 528 (index_to_find < first_entry_in_buffer)) {
526 /* close and restart search */ 529 /* close and restart search */
527 cFYI(1, "search backing up - close and restart search"); 530 cFYI(1, "search backing up - close and restart search");
528 write_lock(&GlobalSMBSeslock); 531 spin_lock(&cifs_file_list_lock);
529 if (!cifsFile->srch_inf.endOfSearch && 532 if (!cifsFile->srch_inf.endOfSearch &&
530 !cifsFile->invalidHandle) { 533 !cifsFile->invalidHandle) {
531 cifsFile->invalidHandle = true; 534 cifsFile->invalidHandle = true;
532 write_unlock(&GlobalSMBSeslock); 535 spin_unlock(&cifs_file_list_lock);
533 CIFSFindClose(xid, pTcon, cifsFile->netfid); 536 CIFSFindClose(xid, pTcon, cifsFile->netfid);
534 } else 537 } else
535 write_unlock(&GlobalSMBSeslock); 538 spin_unlock(&cifs_file_list_lock);
536 if (cifsFile->srch_inf.ntwrk_buf_start) { 539 if (cifsFile->srch_inf.ntwrk_buf_start) {
537 cFYI(1, "freeing SMB ff cache buf on search rewind"); 540 cFYI(1, "freeing SMB ff cache buf on search rewind");
538 if (cifsFile->srch_inf.smallBuf) 541 if (cifsFile->srch_inf.smallBuf)
@@ -738,6 +741,15 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
738 cifs_autodisable_serverino(cifs_sb); 741 cifs_autodisable_serverino(cifs_sb);
739 } 742 }
740 743
744 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
745 CIFSCouldBeMFSymlink(&fattr))
746 /*
747 * trying to get the type and mode can be slow,
748 * so just call those regular files for now, and mark
749 * for reval
750 */
751 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
752
741 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); 753 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
742 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); 754 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
743 755
@@ -777,9 +789,17 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
777 xid = GetXid(); 789 xid = GetXid();
778 790
779 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 791 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
780 pTcon = cifs_sb->tcon; 792
781 if (pTcon == NULL) 793 /*
782 return -EINVAL; 794 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
795 * '..'. Otherwise we won't be able to notify VFS in case of failure.
796 */
797 if (file->private_data == NULL) {
798 rc = initiate_cifs_search(xid, file);
799 cFYI(1, "initiate cifs search rc %d", rc);
800 if (rc)
801 goto rddir2_exit;
802 }
783 803
784 switch ((int) file->f_pos) { 804 switch ((int) file->f_pos) {
785 case 0: 805 case 0:
@@ -805,14 +825,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
805 if after then keep searching till find it */ 825 if after then keep searching till find it */
806 826
807 if (file->private_data == NULL) { 827 if (file->private_data == NULL) {
808 rc = initiate_cifs_search(xid, file);
809 cFYI(1, "initiate cifs search rc %d", rc);
810 if (rc) {
811 FreeXid(xid);
812 return rc;
813 }
814 }
815 if (file->private_data == NULL) {
816 rc = -EINVAL; 828 rc = -EINVAL;
817 FreeXid(xid); 829 FreeXid(xid);
818 return rc; 830 return rc;
@@ -829,6 +841,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
829 CIFSFindClose(xid, pTcon, cifsFile->netfid); 841 CIFSFindClose(xid, pTcon, cifsFile->netfid);
830 } */ 842 } */
831 843
844 pTcon = tlink_tcon(cifsFile->tlink);
832 rc = find_cifs_entry(xid, pTcon, file, 845 rc = find_cifs_entry(xid, pTcon, file,
833 &current_entry, &num_to_fill); 846 &current_entry, &num_to_fill);
834 if (rc) { 847 if (rc) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 0a57cb7db5dd..2a11efd96592 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -80,7 +80,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
80 if (max_vcs < 2) 80 if (max_vcs < 2)
81 max_vcs = 0xFFFF; 81 max_vcs = 0xFFFF;
82 82
83 write_lock(&cifs_tcp_ses_lock); 83 spin_lock(&cifs_tcp_ses_lock);
84 if ((ses->need_reconnect) && is_first_ses_reconnect(ses)) 84 if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
85 goto get_vc_num_exit; /* vcnum will be zero */ 85 goto get_vc_num_exit; /* vcnum will be zero */
86 for (i = ses->server->srv_count - 1; i < max_vcs; i++) { 86 for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
@@ -112,7 +112,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
112 vcnum = i; 112 vcnum = i;
113 ses->vcnum = vcnum; 113 ses->vcnum = vcnum;
114get_vc_num_exit: 114get_vc_num_exit:
115 write_unlock(&cifs_tcp_ses_lock); 115 spin_unlock(&cifs_tcp_ses_lock);
116 116
117 return cpu_to_le16(vcnum); 117 return cpu_to_le16(vcnum);
118} 118}
@@ -383,6 +383,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
383static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, 383static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
384 struct cifsSesInfo *ses) 384 struct cifsSesInfo *ses)
385{ 385{
386 unsigned int tioffset; /* challenge message target info area */
387 unsigned int tilen; /* challenge message target info area length */
388
386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; 389 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
387 390
388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) { 391 if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
@@ -399,12 +402,25 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
399 return -EINVAL; 402 return -EINVAL;
400 } 403 }
401 404
402 memcpy(ses->server->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); 405 memcpy(ses->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
403 /* BB we could decode pblob->NegotiateFlags; some may be useful */ 406 /* BB we could decode pblob->NegotiateFlags; some may be useful */
404 /* In particular we can examine sign flags */ 407 /* In particular we can examine sign flags */
405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then 408 /* BB spec says that if AvId field of MsvAvTimestamp is populated then
406 we must set the MIC field of the AUTHENTICATE_MESSAGE */ 409 we must set the MIC field of the AUTHENTICATE_MESSAGE */
407 410
 411 tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
 412 tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
413 ses->tilen = tilen;
414 if (ses->tilen) {
415 ses->tiblob = kmalloc(tilen, GFP_KERNEL);
416 if (!ses->tiblob) {
417 cERROR(1, "Challenge target info allocation failure");
418 ses->tilen = 0;
419 return -ENOMEM;
420 }
421 memcpy(ses->tiblob, bcc_ptr + tioffset, ses->tilen);
422 }
423
408 return 0; 424 return 0;
409} 425}
410 426
@@ -425,7 +441,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
425 /* BB is NTLMV2 session security format easier to use here? */ 441 /* BB is NTLMV2 session security format easier to use here? */
426 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | 442 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
427 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 443 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
428 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; 444 NTLMSSP_NEGOTIATE_NTLM;
429 if (ses->server->secMode & 445 if (ses->server->secMode &
430 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 446 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
431 flags |= NTLMSSP_NEGOTIATE_SIGN; 447 flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -448,13 +464,16 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
448 maximum possible size is fixed and small, making this approach cleaner. 464 maximum possible size is fixed and small, making this approach cleaner.
449 This function returns the length of the data in the blob */ 465 This function returns the length of the data in the blob */
450static int build_ntlmssp_auth_blob(unsigned char *pbuffer, 466static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
467 u16 *buflen,
451 struct cifsSesInfo *ses, 468 struct cifsSesInfo *ses,
452 const struct nls_table *nls_cp, bool first) 469 const struct nls_table *nls_cp)
453{ 470{
471 int rc;
472 unsigned int size;
454 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; 473 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
455 __u32 flags; 474 __u32 flags;
456 unsigned char *tmp; 475 unsigned char *tmp;
457 char ntlm_session_key[CIFS_SESS_KEY_SIZE]; 476 struct ntlmv2_resp ntlmv2_response = {};
458 477
459 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); 478 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
460 sec_blob->MessageType = NtLmAuthenticate; 479 sec_blob->MessageType = NtLmAuthenticate;
@@ -462,7 +481,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
462 flags = NTLMSSP_NEGOTIATE_56 | 481 flags = NTLMSSP_NEGOTIATE_56 |
463 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | 482 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
464 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 483 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
465 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; 484 NTLMSSP_NEGOTIATE_NTLM;
466 if (ses->server->secMode & 485 if (ses->server->secMode &
467 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 486 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
468 flags |= NTLMSSP_NEGOTIATE_SIGN; 487 flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -477,19 +496,26 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
477 sec_blob->LmChallengeResponse.Length = 0; 496 sec_blob->LmChallengeResponse.Length = 0;
478 sec_blob->LmChallengeResponse.MaximumLength = 0; 497 sec_blob->LmChallengeResponse.MaximumLength = 0;
479 498
480 /* calculate session key, BB what about adding similar ntlmv2 path? */
481 SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
482 if (first)
483 cifs_calculate_mac_key(&ses->server->mac_signing_key,
484 ntlm_session_key, ses->password);
485
486 memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
487 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); 499 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
488 sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE); 500 rc = setup_ntlmv2_rsp(ses, (char *)&ntlmv2_response, nls_cp);
489 sec_blob->NtChallengeResponse.MaximumLength = 501 if (rc) {
490 cpu_to_le16(CIFS_SESS_KEY_SIZE); 502 cERROR(1, "Error %d during NTLMSSP authentication", rc);
503 goto setup_ntlmv2_ret;
504 }
505 size = sizeof(struct ntlmv2_resp);
506 memcpy(tmp, (char *)&ntlmv2_response, size);
507 tmp += size;
508 if (ses->tilen > 0) {
509 memcpy(tmp, ses->tiblob, ses->tilen);
510 tmp += ses->tilen;
511 }
491 512
492 tmp += CIFS_SESS_KEY_SIZE; 513 sec_blob->NtChallengeResponse.Length = cpu_to_le16(size + ses->tilen);
514 sec_blob->NtChallengeResponse.MaximumLength =
515 cpu_to_le16(size + ses->tilen);
516 kfree(ses->tiblob);
517 ses->tiblob = NULL;
518 ses->tilen = 0;
493 519
494 if (ses->domainName == NULL) { 520 if (ses->domainName == NULL) {
495 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 521 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -501,7 +527,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
501 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, 527 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
502 MAX_USERNAME_SIZE, nls_cp); 528 MAX_USERNAME_SIZE, nls_cp);
503 len *= 2; /* unicode is 2 bytes each */ 529 len *= 2; /* unicode is 2 bytes each */
504 len += 2; /* trailing null */
505 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 530 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
506 sec_blob->DomainName.Length = cpu_to_le16(len); 531 sec_blob->DomainName.Length = cpu_to_le16(len);
507 sec_blob->DomainName.MaximumLength = cpu_to_le16(len); 532 sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
@@ -518,7 +543,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
518 len = cifs_strtoUCS((__le16 *)tmp, ses->userName, 543 len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
519 MAX_USERNAME_SIZE, nls_cp); 544 MAX_USERNAME_SIZE, nls_cp);
520 len *= 2; /* unicode is 2 bytes each */ 545 len *= 2; /* unicode is 2 bytes each */
521 len += 2; /* trailing null */
522 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 546 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
523 sec_blob->UserName.Length = cpu_to_le16(len); 547 sec_blob->UserName.Length = cpu_to_le16(len);
524 sec_blob->UserName.MaximumLength = cpu_to_le16(len); 548 sec_blob->UserName.MaximumLength = cpu_to_le16(len);
@@ -533,7 +557,10 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
533 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); 557 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
534 sec_blob->SessionKey.Length = 0; 558 sec_blob->SessionKey.Length = 0;
535 sec_blob->SessionKey.MaximumLength = 0; 559 sec_blob->SessionKey.MaximumLength = 0;
536 return tmp - pbuffer; 560
561setup_ntlmv2_ret:
562 *buflen = tmp - pbuffer;
563 return rc;
537} 564}
538 565
539 566
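With the NTLMv2 response replacing the old NTLM session key, the AUTHENTICATE blob built above lines up roughly as follows (offsets reconstructed from the hunk; a reading, not a spec quote):

/*
 * AUTHENTICATE_MESSAGE fixed header
 * struct ntlmv2_resp              <- NtChallengeResponse buffer
 * ses->tiblob (ses->tilen bytes)     target info echoed to the server
 * domain name, UCS-2, no trailing NUL (hence the dropped "len += 2")
 * user name,   UCS-2, no trailing NUL
 * workstation name, then a zero-length SessionKey
 */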
@@ -545,19 +572,6 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
545 572
546 return; 573 return;
547} 574}
548
549static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
550 struct cifsSesInfo *ses,
551 const struct nls_table *nls, bool first_time)
552{
553 int bloblen;
554
555 bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
556 first_time);
557 pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
558
559 return bloblen;
560}
561#endif 575#endif
562 576
563int 577int
@@ -579,15 +593,12 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
579 int bytes_remaining; 593 int bytes_remaining;
580 struct key *spnego_key = NULL; 594 struct key *spnego_key = NULL;
581 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 595 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
582 bool first_time; 596 u16 blob_len;
597 char *ntlmsspblob = NULL;
583 598
584 if (ses == NULL) 599 if (ses == NULL)
585 return -EINVAL; 600 return -EINVAL;
586 601
587 read_lock(&cifs_tcp_ses_lock);
588 first_time = is_first_ses_reconnect(ses);
589 read_unlock(&cifs_tcp_ses_lock);
590
591 type = ses->server->secType; 602 type = ses->server->secType;
592 603
593 cFYI(1, "sess setup type %d", type); 604 cFYI(1, "sess setup type %d", type);
@@ -658,7 +669,7 @@ ssetup_ntlmssp_authenticate:
658 /* BB calculate hash with password */ 669 /* BB calculate hash with password */
659 /* and copy into bcc */ 670 /* and copy into bcc */
660 671
661 calc_lanman_hash(ses->password, ses->server->cryptKey, 672 calc_lanman_hash(ses->password, ses->cryptKey,
662 ses->server->secMode & SECMODE_PW_ENCRYPT ? 673 ses->server->secMode & SECMODE_PW_ENCRYPT ?
663 true : false, lnm_session_key); 674 true : false, lnm_session_key);
664 675
@@ -685,15 +696,11 @@ ssetup_ntlmssp_authenticate:
685 cpu_to_le16(CIFS_SESS_KEY_SIZE); 696 cpu_to_le16(CIFS_SESS_KEY_SIZE);
686 697
687 /* calculate session key */ 698 /* calculate session key */
688 SMBNTencrypt(ses->password, ses->server->cryptKey, 699 SMBNTencrypt(ses->password, ses->cryptKey, ntlm_session_key);
689 ntlm_session_key);
690 700
691 if (first_time) /* should this be moved into common code 701 cifs_calculate_session_key(&ses->auth_key,
692 with similar ntlmv2 path? */ 702 ntlm_session_key, ses->password);
693 cifs_calculate_mac_key(&ses->server->mac_signing_key,
694 ntlm_session_key, ses->password);
695 /* copy session key */ 703 /* copy session key */
696
697 memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE); 704 memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
698 bcc_ptr += CIFS_SESS_KEY_SIZE; 705 bcc_ptr += CIFS_SESS_KEY_SIZE;
699 memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE); 706 memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
@@ -725,16 +732,31 @@ ssetup_ntlmssp_authenticate:
725 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; 732 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
726 /* cpu_to_le16(LM2_SESS_KEY_SIZE); */ 733 /* cpu_to_le16(LM2_SESS_KEY_SIZE); */
727 734
728 pSMB->req_no_secext.CaseSensitivePasswordLength =
729 cpu_to_le16(sizeof(struct ntlmv2_resp));
730
731 /* calculate session key */ 735 /* calculate session key */
732 setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp); 736 rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
733 /* FIXME: calculate MAC key */ 737 if (rc) {
738 cERROR(1, "Error %d during NTLMv2 authentication", rc);
739 kfree(v2_sess_key);
740 goto ssetup_exit;
741 }
734 memcpy(bcc_ptr, (char *)v2_sess_key, 742 memcpy(bcc_ptr, (char *)v2_sess_key,
735 sizeof(struct ntlmv2_resp)); 743 sizeof(struct ntlmv2_resp));
736 bcc_ptr += sizeof(struct ntlmv2_resp); 744 bcc_ptr += sizeof(struct ntlmv2_resp);
737 kfree(v2_sess_key); 745 kfree(v2_sess_key);
746 /* set the case-sensitive password length only after tilen may
747 * have been assigned; tilen is 0 otherwise.
748 */
749 pSMB->req_no_secext.CaseSensitivePasswordLength =
750 cpu_to_le16(sizeof(struct ntlmv2_resp) + ses->tilen);
751 if (ses->tilen > 0) {
752 memcpy(bcc_ptr, ses->tiblob, ses->tilen);
753 bcc_ptr += ses->tilen;
754 /* ses->domainName was never allocated, so only tiblob needs freeing */
755 kfree(ses->tiblob);
756 ses->tiblob = NULL;
757 ses->tilen = 0;
758 }
759
738 if (ses->capabilities & CAP_UNICODE) { 760 if (ses->capabilities & CAP_UNICODE) {
739 if (iov[0].iov_len % 2) { 761 if (iov[0].iov_len % 2) {
740 *bcc_ptr = 0; 762 *bcc_ptr = 0;
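Ordering matters in the hunk above because setup_ntlmv2_rsp() may populate
ses->tilen, and the case-sensitive password length must cover both the
fixed-size response and the variable target-info blob. A sketch of the byte
layout this produces (illustrative, not authoritative):

	/*
	 *  bcc_ptr -> [ struct ntlmv2_resp ][ ses->tiblob (ses->tilen bytes) ]
	 *             \_________ CaseSensitivePasswordLength _______________/
	 *
	 *  i.e. cpu_to_le16(sizeof(struct ntlmv2_resp) + ses->tilen)
	 */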
@@ -765,17 +787,14 @@ ssetup_ntlmssp_authenticate:
765 } 787 }
766 /* bail out if key is too long */ 788 /* bail out if key is too long */
767 if (msg->sesskey_len > 789 if (msg->sesskey_len >
768 sizeof(ses->server->mac_signing_key.data.krb5)) { 790 sizeof(ses->auth_key.data.krb5)) {
769 cERROR(1, "Kerberos signing key too long (%u bytes)", 791 cERROR(1, "Kerberos signing key too long (%u bytes)",
770 msg->sesskey_len); 792 msg->sesskey_len);
771 rc = -EOVERFLOW; 793 rc = -EOVERFLOW;
772 goto ssetup_exit; 794 goto ssetup_exit;
773 } 795 }
774 if (first_time) { 796 ses->auth_key.len = msg->sesskey_len;
775 ses->server->mac_signing_key.len = msg->sesskey_len; 797 memcpy(ses->auth_key.data.krb5, msg->data, msg->sesskey_len);
776 memcpy(ses->server->mac_signing_key.data.krb5,
777 msg->data, msg->sesskey_len);
778 }
779 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 798 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
780 capabilities |= CAP_EXTENDED_SECURITY; 799 capabilities |= CAP_EXTENDED_SECURITY;
781 pSMB->req.Capabilities = cpu_to_le32(capabilities); 800 pSMB->req.Capabilities = cpu_to_le32(capabilities);
@@ -815,12 +834,30 @@ ssetup_ntlmssp_authenticate:
815 if (phase == NtLmNegotiate) { 834 if (phase == NtLmNegotiate) {
816 setup_ntlmssp_neg_req(pSMB, ses); 835 setup_ntlmssp_neg_req(pSMB, ses);
817 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); 836 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
837 iov[1].iov_base = &pSMB->req.SecurityBlob[0];
818 } else if (phase == NtLmAuthenticate) { 838 } else if (phase == NtLmAuthenticate) {
819 int blob_len; 839 /* 5 is an empirical value, large enough to
820 blob_len = setup_ntlmssp_auth_req(pSMB, ses, 840 * hold the authenticate message: at most 10
821 nls_cp, 841 * av pairs, domain, user, workstation names,
822 first_time); 842 * flags, etc.
843 */
844 ntlmsspblob = kmalloc(
845 5*sizeof(struct _AUTHENTICATE_MESSAGE),
846 GFP_KERNEL);
847 if (!ntlmsspblob) {
848 cERROR(1, "Can't allocate NTLMSSP blob");
849 rc = -ENOMEM;
850 goto ssetup_exit;
851 }
852
853 rc = build_ntlmssp_auth_blob(ntlmsspblob,
854 &blob_len, ses, nls_cp);
855 if (rc)
856 goto ssetup_exit;
823 iov[1].iov_len = blob_len; 857 iov[1].iov_len = blob_len;
858 iov[1].iov_base = ntlmsspblob;
859 pSMB->req.SecurityBlobLength =
860 cpu_to_le16(blob_len);
824 /* Make sure that we tell the server that we 861 /* Make sure that we tell the server that we
825 are using the uid that it just gave us back 862 are using the uid that it just gave us back
826 on the response (challenge) */ 863 on the response (challenge) */
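The 5 * sizeof(struct _AUTHENTICATE_MESSAGE) allocation is, as the comment
says, an empirical bound rather than a computed one. If a computed worst case
were ever wanted it could be written out explicitly; a sketch, where every
constant is an assumption for illustration only:

	/* Hypothetical explicit bound -- NOT what the patch does. */
	size_t bound = sizeof(struct _AUTHENTICATE_MESSAGE)
		       + 10 * 256		/* assumed max per av pair */
		       + 3 * 2 * 256;		/* domain/user/workstation, UCS-2 */
	ntlmsspblob = kmalloc(bound, GFP_KERNEL);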
@@ -830,7 +867,6 @@ ssetup_ntlmssp_authenticate:
830 rc = -ENOSYS; 867 rc = -ENOSYS;
831 goto ssetup_exit; 868 goto ssetup_exit;
832 } 869 }
833 iov[1].iov_base = &pSMB->req.SecurityBlob[0];
834 /* unicode strings must be word aligned */ 870 /* unicode strings must be word aligned */
835 if ((iov[0].iov_len + iov[1].iov_len) % 2) { 871 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
836 *bcc_ptr = 0; 872 *bcc_ptr = 0;
@@ -895,7 +931,6 @@ ssetup_ntlmssp_authenticate:
895 bcc_ptr = pByteArea(smb_buf); 931 bcc_ptr = pByteArea(smb_buf);
896 932
897 if (smb_buf->WordCount == 4) { 933 if (smb_buf->WordCount == 4) {
898 __u16 blob_len;
899 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); 934 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
900 if (blob_len > bytes_remaining) { 935 if (blob_len > bytes_remaining) {
901 cERROR(1, "bad security blob length %d", blob_len); 936 cERROR(1, "bad security blob length %d", blob_len);
@@ -931,6 +966,8 @@ ssetup_exit:
931 key_put(spnego_key); 966 key_put(spnego_key);
932 } 967 }
933 kfree(str_area); 968 kfree(str_area);
969 kfree(ntlmsspblob);
970 ntlmsspblob = NULL;
934 if (resp_buf_type == CIFS_SMALL_BUFFER) { 971 if (resp_buf_type == CIFS_SMALL_BUFFER) {
935 cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base); 972 cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
936 cifs_small_buf_release(iov[0].iov_base); 973 cifs_small_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 82f78c4d6978..a66c91eb6eb4 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
543 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 543 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
544 SECMODE_SIGN_ENABLED))) { 544 SECMODE_SIGN_ENABLED))) {
545 rc = cifs_verify_signature(midQ->resp_buf, 545 rc = cifs_verify_signature(midQ->resp_buf,
546 &ses->server->mac_signing_key, 546 &ses->server->session_key,
547 midQ->sequence_number+1); 547 midQ->sequence_number+1);
548 if (rc) { 548 if (rc) {
549 cERROR(1, "Unexpected SMB signature"); 549 cERROR(1, "Unexpected SMB signature");
@@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
731 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 731 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
732 SECMODE_SIGN_ENABLED))) { 732 SECMODE_SIGN_ENABLED))) {
733 rc = cifs_verify_signature(out_buf, 733 rc = cifs_verify_signature(out_buf,
734 &ses->server->mac_signing_key, 734 &ses->server->session_key,
735 midQ->sequence_number+1); 735 midQ->sequence_number+1);
736 if (rc) { 736 if (rc) {
737 cERROR(1, "Unexpected SMB signature"); 737 cERROR(1, "Unexpected SMB signature");
@@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
981 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 981 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
982 SECMODE_SIGN_ENABLED))) { 982 SECMODE_SIGN_ENABLED))) {
983 rc = cifs_verify_signature(out_buf, 983 rc = cifs_verify_signature(out_buf,
984 &ses->server->mac_signing_key, 984 &ses->server->session_key,
985 midQ->sequence_number+1); 985 midQ->sequence_number+1);
986 if (rc) { 986 if (rc) {
987 cERROR(1, "Unexpected SMB signature"); 987 cERROR(1, "Unexpected SMB signature");
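The same guard-plus-verify stanza now appears in SendReceive2(), SendReceive()
and SendReceiveBlockingLock(). One possible factoring, sketched with a
hypothetical helper name (the patch itself does not introduce it):

	static int verify_signed_response(struct cifsSesInfo *ses,
					  struct smb_hdr *buf, __u32 seq)
	{
		if (!(ses->server->secMode & (SECMODE_SIGN_REQUIRED |
					      SECMODE_SIGN_ENABLED)))
			return 0;
		return cifs_verify_signature(buf, &ses->server->session_key,
					     seq);
	}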
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a1509207bfa6..a264b744bb41 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -47,9 +47,10 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
47#ifdef CONFIG_CIFS_XATTR 47#ifdef CONFIG_CIFS_XATTR
48 int xid; 48 int xid;
49 struct cifs_sb_info *cifs_sb; 49 struct cifs_sb_info *cifs_sb;
50 struct tcon_link *tlink;
50 struct cifsTconInfo *pTcon; 51 struct cifsTconInfo *pTcon;
51 struct super_block *sb; 52 struct super_block *sb;
52 char *full_path; 53 char *full_path = NULL;
53 54
54 if (direntry == NULL) 55 if (direntry == NULL)
55 return -EIO; 56 return -EIO;
@@ -58,16 +59,19 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
58 sb = direntry->d_inode->i_sb; 59 sb = direntry->d_inode->i_sb;
59 if (sb == NULL) 60 if (sb == NULL)
60 return -EIO; 61 return -EIO;
61 xid = GetXid();
62 62
63 cifs_sb = CIFS_SB(sb); 63 cifs_sb = CIFS_SB(sb);
64 pTcon = cifs_sb->tcon; 64 tlink = cifs_sb_tlink(cifs_sb);
65 if (IS_ERR(tlink))
66 return PTR_ERR(tlink);
67 pTcon = tlink_tcon(tlink);
68
69 xid = GetXid();
65 70
66 full_path = build_path_from_dentry(direntry); 71 full_path = build_path_from_dentry(direntry);
67 if (full_path == NULL) { 72 if (full_path == NULL) {
68 rc = -ENOMEM; 73 rc = -ENOMEM;
69 FreeXid(xid); 74 goto remove_ea_exit;
70 return rc;
71 } 75 }
72 if (ea_name == NULL) { 76 if (ea_name == NULL) {
73 cFYI(1, "Null xattr names not supported"); 77 cFYI(1, "Null xattr names not supported");
@@ -91,6 +95,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
91remove_ea_exit: 95remove_ea_exit:
92 kfree(full_path); 96 kfree(full_path);
93 FreeXid(xid); 97 FreeXid(xid);
98 cifs_put_tlink(tlink);
94#endif 99#endif
95 return rc; 100 return rc;
96} 101}
@@ -102,6 +107,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
102#ifdef CONFIG_CIFS_XATTR 107#ifdef CONFIG_CIFS_XATTR
103 int xid; 108 int xid;
104 struct cifs_sb_info *cifs_sb; 109 struct cifs_sb_info *cifs_sb;
110 struct tcon_link *tlink;
105 struct cifsTconInfo *pTcon; 111 struct cifsTconInfo *pTcon;
106 struct super_block *sb; 112 struct super_block *sb;
107 char *full_path; 113 char *full_path;
@@ -113,16 +119,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
113 sb = direntry->d_inode->i_sb; 119 sb = direntry->d_inode->i_sb;
114 if (sb == NULL) 120 if (sb == NULL)
115 return -EIO; 121 return -EIO;
116 xid = GetXid();
117 122
118 cifs_sb = CIFS_SB(sb); 123 cifs_sb = CIFS_SB(sb);
119 pTcon = cifs_sb->tcon; 124 tlink = cifs_sb_tlink(cifs_sb);
125 if (IS_ERR(tlink))
126 return PTR_ERR(tlink);
127 pTcon = tlink_tcon(tlink);
128
129 xid = GetXid();
120 130
121 full_path = build_path_from_dentry(direntry); 131 full_path = build_path_from_dentry(direntry);
122 if (full_path == NULL) { 132 if (full_path == NULL) {
123 rc = -ENOMEM; 133 rc = -ENOMEM;
124 FreeXid(xid); 134 goto set_ea_exit;
125 return rc;
126 } 135 }
127 /* return dos attributes as pseudo xattr */ 136 /* return dos attributes as pseudo xattr */
128 /* return alt name if available as pseudo attr */ 137 /* return alt name if available as pseudo attr */
@@ -132,9 +141,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
132 returns as xattrs */ 141 returns as xattrs */
133 if (value_size > MAX_EA_VALUE_SIZE) { 142 if (value_size > MAX_EA_VALUE_SIZE) {
134 cFYI(1, "size of EA value too large"); 143 cFYI(1, "size of EA value too large");
135 kfree(full_path); 144 rc = -EOPNOTSUPP;
136 FreeXid(xid); 145 goto set_ea_exit;
137 return -EOPNOTSUPP;
138 } 146 }
139 147
140 if (ea_name == NULL) { 148 if (ea_name == NULL) {
@@ -198,6 +206,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
198set_ea_exit: 206set_ea_exit:
199 kfree(full_path); 207 kfree(full_path);
200 FreeXid(xid); 208 FreeXid(xid);
209 cifs_put_tlink(tlink);
201#endif 210#endif
202 return rc; 211 return rc;
203} 212}
@@ -209,6 +218,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
209#ifdef CONFIG_CIFS_XATTR 218#ifdef CONFIG_CIFS_XATTR
210 int xid; 219 int xid;
211 struct cifs_sb_info *cifs_sb; 220 struct cifs_sb_info *cifs_sb;
221 struct tcon_link *tlink;
212 struct cifsTconInfo *pTcon; 222 struct cifsTconInfo *pTcon;
213 struct super_block *sb; 223 struct super_block *sb;
214 char *full_path; 224 char *full_path;
@@ -221,16 +231,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
221 if (sb == NULL) 231 if (sb == NULL)
222 return -EIO; 232 return -EIO;
223 233
224 xid = GetXid();
225
226 cifs_sb = CIFS_SB(sb); 234 cifs_sb = CIFS_SB(sb);
227 pTcon = cifs_sb->tcon; 235 tlink = cifs_sb_tlink(cifs_sb);
236 if (IS_ERR(tlink))
237 return PTR_ERR(tlink);
238 pTcon = tlink_tcon(tlink);
239
240 xid = GetXid();
228 241
229 full_path = build_path_from_dentry(direntry); 242 full_path = build_path_from_dentry(direntry);
230 if (full_path == NULL) { 243 if (full_path == NULL) {
231 rc = -ENOMEM; 244 rc = -ENOMEM;
232 FreeXid(xid); 245 goto get_ea_exit;
233 return rc;
234 } 246 }
235 /* return dos attributes as pseudo xattr */ 247 /* return dos attributes as pseudo xattr */
236 /* return alt name if available as pseudo attr */ 248 /* return alt name if available as pseudo attr */
@@ -323,6 +335,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
323get_ea_exit: 335get_ea_exit:
324 kfree(full_path); 336 kfree(full_path);
325 FreeXid(xid); 337 FreeXid(xid);
338 cifs_put_tlink(tlink);
326#endif 339#endif
327 return rc; 340 return rc;
328} 341}
@@ -333,6 +346,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
333#ifdef CONFIG_CIFS_XATTR 346#ifdef CONFIG_CIFS_XATTR
334 int xid; 347 int xid;
335 struct cifs_sb_info *cifs_sb; 348 struct cifs_sb_info *cifs_sb;
349 struct tcon_link *tlink;
336 struct cifsTconInfo *pTcon; 350 struct cifsTconInfo *pTcon;
337 struct super_block *sb; 351 struct super_block *sb;
338 char *full_path; 352 char *full_path;
@@ -346,18 +360,20 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
346 return -EIO; 360 return -EIO;
347 361
348 cifs_sb = CIFS_SB(sb); 362 cifs_sb = CIFS_SB(sb);
349 pTcon = cifs_sb->tcon;
350
351 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 363 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
352 return -EOPNOTSUPP; 364 return -EOPNOTSUPP;
353 365
366 tlink = cifs_sb_tlink(cifs_sb);
367 if (IS_ERR(tlink))
368 return PTR_ERR(tlink);
369 pTcon = tlink_tcon(tlink);
370
354 xid = GetXid(); 371 xid = GetXid();
355 372
356 full_path = build_path_from_dentry(direntry); 373 full_path = build_path_from_dentry(direntry);
357 if (full_path == NULL) { 374 if (full_path == NULL) {
358 rc = -ENOMEM; 375 rc = -ENOMEM;
359 FreeXid(xid); 376 goto list_ea_exit;
360 return rc;
361 } 377 }
362 /* return dos attributes as pseudo xattr */ 378 /* return dos attributes as pseudo xattr */
363 /* return alt name if available as pseudo attr */ 379 /* return alt name if available as pseudo attr */
@@ -370,8 +386,10 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
370 cifs_sb->mnt_cifs_flags & 386 cifs_sb->mnt_cifs_flags &
371 CIFS_MOUNT_MAP_SPECIAL_CHR); 387 CIFS_MOUNT_MAP_SPECIAL_CHR);
372 388
389list_ea_exit:
373 kfree(full_path); 390 kfree(full_path);
374 FreeXid(xid); 391 FreeXid(xid);
392 cifs_put_tlink(tlink);
375#endif 393#endif
376 return rc; 394 return rc;
377} 395}
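All four xattr entry points now follow the same reference-counted tcon lookup;
condensed, the lifecycle is (helpers exactly as shown in the hunks above):

	tlink = cifs_sb_tlink(cifs_sb);	/* takes a tcon_link reference */
	if (IS_ERR(tlink))
		return PTR_ERR(tlink);
	pTcon = tlink_tcon(tlink);
	xid = GetXid();
	/* ... all failure paths now jump to the common exit label ... */
	kfree(full_path);
	FreeXid(xid);
	cifs_put_tlink(tlink);		/* drops the reference */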
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index a5bf5771a22a..9060f08e70cf 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -17,6 +17,7 @@
17#include <linux/string.h> 17#include <linux/string.h>
18#include <linux/list.h> 18#include <linux/list.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/spinlock.h>
20 21
21#include <linux/coda.h> 22#include <linux/coda.h>
22#include <linux/coda_linux.h> 23#include <linux/coda_linux.h>
@@ -31,19 +32,23 @@ void coda_cache_enter(struct inode *inode, int mask)
31{ 32{
32 struct coda_inode_info *cii = ITOC(inode); 33 struct coda_inode_info *cii = ITOC(inode);
33 34
35 spin_lock(&cii->c_lock);
34 cii->c_cached_epoch = atomic_read(&permission_epoch); 36 cii->c_cached_epoch = atomic_read(&permission_epoch);
35 if (cii->c_uid != current_fsuid()) { 37 if (cii->c_uid != current_fsuid()) {
36 cii->c_uid = current_fsuid(); 38 cii->c_uid = current_fsuid();
37 cii->c_cached_perm = mask; 39 cii->c_cached_perm = mask;
38 } else 40 } else
39 cii->c_cached_perm |= mask; 41 cii->c_cached_perm |= mask;
42 spin_unlock(&cii->c_lock);
40} 43}
41 44
42/* remove cached acl from an inode */ 45/* remove cached acl from an inode */
43void coda_cache_clear_inode(struct inode *inode) 46void coda_cache_clear_inode(struct inode *inode)
44{ 47{
45 struct coda_inode_info *cii = ITOC(inode); 48 struct coda_inode_info *cii = ITOC(inode);
49 spin_lock(&cii->c_lock);
46 cii->c_cached_epoch = atomic_read(&permission_epoch) - 1; 50 cii->c_cached_epoch = atomic_read(&permission_epoch) - 1;
51 spin_unlock(&cii->c_lock);
47} 52}
48 53
49/* remove all acl caches */ 54/* remove all acl caches */
@@ -57,13 +62,15 @@ void coda_cache_clear_all(struct super_block *sb)
57int coda_cache_check(struct inode *inode, int mask) 62int coda_cache_check(struct inode *inode, int mask)
58{ 63{
59 struct coda_inode_info *cii = ITOC(inode); 64 struct coda_inode_info *cii = ITOC(inode);
60 int hit; 65 int hit;
61 66
62 hit = (mask & cii->c_cached_perm) == mask && 67 spin_lock(&cii->c_lock);
63 cii->c_uid == current_fsuid() && 68 hit = (mask & cii->c_cached_perm) == mask &&
64 cii->c_cached_epoch == atomic_read(&permission_epoch); 69 cii->c_uid == current_fsuid() &&
70 cii->c_cached_epoch == atomic_read(&permission_epoch);
71 spin_unlock(&cii->c_lock);
65 72
66 return hit; 73 return hit;
67} 74}
68 75
69 76
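With the BKL gone, c_lock is the only serialization for the cached
(uid, perm, epoch) triple, and invalidation is simply an epoch mismatch. A
condensed sketch of the pairing, fields as in this hunk:

	/* invalidate: force the next check to miss */
	spin_lock(&cii->c_lock);
	cii->c_cached_epoch = atomic_read(&permission_epoch) - 1;
	spin_unlock(&cii->c_lock);

	/* check: hit only when uid and epoch both still match */
	spin_lock(&cii->c_lock);
	hit = (mask & cii->c_cached_perm) == mask &&
	      cii->c_uid == current_fsuid() &&
	      cii->c_cached_epoch == atomic_read(&permission_epoch);
	spin_unlock(&cii->c_lock);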
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index a7a780929eec..602240569c89 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -45,13 +45,15 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
45static int coda_test_inode(struct inode *inode, void *data) 45static int coda_test_inode(struct inode *inode, void *data)
46{ 46{
47 struct CodaFid *fid = (struct CodaFid *)data; 47 struct CodaFid *fid = (struct CodaFid *)data;
48 return coda_fideq(&(ITOC(inode)->c_fid), fid); 48 struct coda_inode_info *cii = ITOC(inode);
49 return coda_fideq(&cii->c_fid, fid);
49} 50}
50 51
51static int coda_set_inode(struct inode *inode, void *data) 52static int coda_set_inode(struct inode *inode, void *data)
52{ 53{
53 struct CodaFid *fid = (struct CodaFid *)data; 54 struct CodaFid *fid = (struct CodaFid *)data;
54 ITOC(inode)->c_fid = *fid; 55 struct coda_inode_info *cii = ITOC(inode);
56 cii->c_fid = *fid;
55 return 0; 57 return 0;
56} 58}
57 59
@@ -71,6 +73,7 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
71 cii = ITOC(inode); 73 cii = ITOC(inode);
72 /* we still need to set i_ino for things like stat(2) */ 74 /* we still need to set i_ino for things like stat(2) */
73 inode->i_ino = hash; 75 inode->i_ino = hash;
76 /* inode is locked and unique, no need to grab cii->c_lock */
74 cii->c_mapcount = 0; 77 cii->c_mapcount = 0;
75 unlock_new_inode(inode); 78 unlock_new_inode(inode);
76 } 79 }
@@ -107,14 +110,20 @@ int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_bloc
107} 110}
108 111
109 112
113/* Although we treat Coda file identifiers as immutable, there is one
114 * special case for files created during a disconnection where they may
115 * not be globally unique. When an identifier collision is detected we
116 * first try to flush the cached inode from the kernel and finally
117 * resort to renaming/rehashing in-place. Userspace remembers both old
118 * and new values of the identifier to handle any in-flight upcalls.
119 * The real solution is to use globally unique UUIDs as identifiers, but
120 * retrofitting the existing userspace code for this is non-trivial. */
110void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, 121void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid,
111 struct CodaFid *newfid) 122 struct CodaFid *newfid)
112{ 123{
113 struct coda_inode_info *cii; 124 struct coda_inode_info *cii = ITOC(inode);
114 unsigned long hash = coda_f2i(newfid); 125 unsigned long hash = coda_f2i(newfid);
115 126
116 cii = ITOC(inode);
117
118 BUG_ON(!coda_fideq(&cii->c_fid, oldfid)); 127 BUG_ON(!coda_fideq(&cii->c_fid, oldfid));
119 128
120 /* replace fid and rehash inode */ 129 /* replace fid and rehash inode */
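The rename/rehash fallback described in the comment amounts to swapping the
fid and moving the inode to its new hash bucket. A sketch of that step
(remove_inode_hash()/__insert_inode_hash() are standard VFS helpers, assumed
here rather than shown in the hunk):

	cii->c_fid = *newfid;
	remove_inode_hash(inode);
	inode->i_ino = hash;		/* hash = coda_f2i(newfid) */
	__insert_inode_hash(inode, hash);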
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ccd98b0f2b0b..5d8b35539601 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -17,7 +17,7 @@
17#include <linux/stat.h> 17#include <linux/stat.h>
18#include <linux/errno.h> 18#include <linux/errno.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/smp_lock.h> 20#include <linux/spinlock.h>
21 21
22#include <asm/uaccess.h> 22#include <asm/uaccess.h>
23 23
@@ -116,15 +116,11 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
116 goto exit; 116 goto exit;
117 } 117 }
118 118
119 lock_kernel();
120
121 error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length, 119 error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
122 &type, &resfid); 120 &type, &resfid);
123 if (!error) 121 if (!error)
124 error = coda_cnode_make(&inode, &resfid, dir->i_sb); 122 error = coda_cnode_make(&inode, &resfid, dir->i_sb);
125 123
126 unlock_kernel();
127
128 if (error && error != -ENOENT) 124 if (error && error != -ENOENT)
129 return ERR_PTR(error); 125 return ERR_PTR(error);
130 126
@@ -140,28 +136,24 @@ exit:
140 136
141int coda_permission(struct inode *inode, int mask) 137int coda_permission(struct inode *inode, int mask)
142{ 138{
143 int error = 0; 139 int error;
144 140
145 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 141 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
146 142
147 if (!mask) 143 if (!mask)
148 return 0; 144 return 0;
149 145
150 if ((mask & MAY_EXEC) && !execute_ok(inode)) 146 if ((mask & MAY_EXEC) && !execute_ok(inode))
151 return -EACCES; 147 return -EACCES;
152 148
153 lock_kernel();
154
155 if (coda_cache_check(inode, mask)) 149 if (coda_cache_check(inode, mask))
156 goto out; 150 return 0;
157 151
158 error = venus_access(inode->i_sb, coda_i2f(inode), mask); 152 error = venus_access(inode->i_sb, coda_i2f(inode), mask);
159 153
160 if (!error) 154 if (!error)
161 coda_cache_enter(inode, mask); 155 coda_cache_enter(inode, mask);
162 156
163 out:
164 unlock_kernel();
165 return error; 157 return error;
166} 158}
167 159
@@ -200,41 +192,34 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
200/* creation routines: create, mknod, mkdir, link, symlink */ 192/* creation routines: create, mknod, mkdir, link, symlink */
201static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd) 193static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd)
202{ 194{
203 int error=0; 195 int error;
204 const char *name=de->d_name.name; 196 const char *name=de->d_name.name;
205 int length=de->d_name.len; 197 int length=de->d_name.len;
206 struct inode *inode; 198 struct inode *inode;
207 struct CodaFid newfid; 199 struct CodaFid newfid;
208 struct coda_vattr attrs; 200 struct coda_vattr attrs;
209 201
210 lock_kernel(); 202 if (coda_isroot(dir) && coda_iscontrol(name, length))
211
212 if (coda_isroot(dir) && coda_iscontrol(name, length)) {
213 unlock_kernel();
214 return -EPERM; 203 return -EPERM;
215 }
216 204
217 error = venus_create(dir->i_sb, coda_i2f(dir), name, length, 205 error = venus_create(dir->i_sb, coda_i2f(dir), name, length,
218 0, mode, &newfid, &attrs); 206 0, mode, &newfid, &attrs);
219 207 if (error)
220 if ( error ) { 208 goto err_out;
221 unlock_kernel();
222 d_drop(de);
223 return error;
224 }
225 209
226 inode = coda_iget(dir->i_sb, &newfid, &attrs); 210 inode = coda_iget(dir->i_sb, &newfid, &attrs);
227 if ( IS_ERR(inode) ) { 211 if (IS_ERR(inode)) {
228 unlock_kernel(); 212 error = PTR_ERR(inode);
229 d_drop(de); 213 goto err_out;
230 return PTR_ERR(inode);
231 } 214 }
232 215
233 /* invalidate the directory cnode's attributes */ 216 /* invalidate the directory cnode's attributes */
234 coda_dir_update_mtime(dir); 217 coda_dir_update_mtime(dir);
235 unlock_kernel();
236 d_instantiate(de, inode); 218 d_instantiate(de, inode);
237 return 0; 219 return 0;
220err_out:
221 d_drop(de);
222 return error;
238} 223}
239 224
240static int coda_mkdir(struct inode *dir, struct dentry *de, int mode) 225static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
@@ -246,36 +231,29 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
246 int error; 231 int error;
247 struct CodaFid newfid; 232 struct CodaFid newfid;
248 233
249 lock_kernel(); 234 if (coda_isroot(dir) && coda_iscontrol(name, len))
250
251 if (coda_isroot(dir) && coda_iscontrol(name, len)) {
252 unlock_kernel();
253 return -EPERM; 235 return -EPERM;
254 }
255 236
256 attrs.va_mode = mode; 237 attrs.va_mode = mode;
257 error = venus_mkdir(dir->i_sb, coda_i2f(dir), 238 error = venus_mkdir(dir->i_sb, coda_i2f(dir),
258 name, len, &newfid, &attrs); 239 name, len, &newfid, &attrs);
259 240 if (error)
260 if ( error ) { 241 goto err_out;
261 unlock_kernel();
262 d_drop(de);
263 return error;
264 }
265 242
266 inode = coda_iget(dir->i_sb, &newfid, &attrs); 243 inode = coda_iget(dir->i_sb, &newfid, &attrs);
267 if ( IS_ERR(inode) ) { 244 if (IS_ERR(inode)) {
268 unlock_kernel(); 245 error = PTR_ERR(inode);
269 d_drop(de); 246 goto err_out;
270 return PTR_ERR(inode);
271 } 247 }
272 248
273 /* invalidate the directory cnode's attributes */ 249 /* invalidate the directory cnode's attributes */
274 coda_dir_inc_nlink(dir); 250 coda_dir_inc_nlink(dir);
275 coda_dir_update_mtime(dir); 251 coda_dir_update_mtime(dir);
276 unlock_kernel();
277 d_instantiate(de, inode); 252 d_instantiate(de, inode);
278 return 0; 253 return 0;
254err_out:
255 d_drop(de);
256 return error;
279} 257}
280 258
281/* try to make de an entry in dir_inode linked to source_de */ 259
@@ -287,52 +265,38 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
287 int len = de->d_name.len; 265 int len = de->d_name.len;
288 int error; 266 int error;
289 267
290 lock_kernel(); 268 if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
291
292 if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
293 unlock_kernel();
294 return -EPERM; 269 return -EPERM;
295 }
296 270
297 error = venus_link(dir_inode->i_sb, coda_i2f(inode), 271 error = venus_link(dir_inode->i_sb, coda_i2f(inode),
298 coda_i2f(dir_inode), (const char *)name, len); 272 coda_i2f(dir_inode), (const char *)name, len);
299
300 if (error) { 273 if (error) {
301 d_drop(de); 274 d_drop(de);
302 goto out; 275 return error;
303 } 276 }
304 277
305 coda_dir_update_mtime(dir_inode); 278 coda_dir_update_mtime(dir_inode);
306 atomic_inc(&inode->i_count); 279 ihold(inode);
307 d_instantiate(de, inode); 280 d_instantiate(de, inode);
308 inc_nlink(inode); 281 inc_nlink(inode);
309 282 return 0;
310out:
311 unlock_kernel();
312 return(error);
313} 283}
314 284
315 285
316static int coda_symlink(struct inode *dir_inode, struct dentry *de, 286static int coda_symlink(struct inode *dir_inode, struct dentry *de,
317 const char *symname) 287 const char *symname)
318{ 288{
319 const char *name = de->d_name.name; 289 const char *name = de->d_name.name;
320 int len = de->d_name.len; 290 int len = de->d_name.len;
321 int symlen; 291 int symlen;
322 int error = 0; 292 int error;
323
324 lock_kernel();
325 293
326 if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) { 294 if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
327 unlock_kernel();
328 return -EPERM; 295 return -EPERM;
329 }
330 296
331 symlen = strlen(symname); 297 symlen = strlen(symname);
332 if ( symlen > CODA_MAXPATHLEN ) { 298 if (symlen > CODA_MAXPATHLEN)
333 unlock_kernel();
334 return -ENAMETOOLONG; 299 return -ENAMETOOLONG;
335 }
336 300
337 /* 301 /*
338 * This entry is now negative. Since we do not create 302 * This entry is now negative. Since we do not create
@@ -343,10 +307,9 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de,
343 symname, symlen); 307 symname, symlen);
344 308
345 /* mtime is no good anymore */ 309 /* mtime is no good anymore */
346 if ( !error ) 310 if (!error)
347 coda_dir_update_mtime(dir_inode); 311 coda_dir_update_mtime(dir_inode);
348 312
349 unlock_kernel();
350 return error; 313 return error;
351} 314}
352 315
@@ -357,17 +320,12 @@ static int coda_unlink(struct inode *dir, struct dentry *de)
357 const char *name = de->d_name.name; 320 const char *name = de->d_name.name;
358 int len = de->d_name.len; 321 int len = de->d_name.len;
359 322
360 lock_kernel();
361
362 error = venus_remove(dir->i_sb, coda_i2f(dir), name, len); 323 error = venus_remove(dir->i_sb, coda_i2f(dir), name, len);
363 if ( error ) { 324 if (error)
364 unlock_kernel();
365 return error; 325 return error;
366 }
367 326
368 coda_dir_update_mtime(dir); 327 coda_dir_update_mtime(dir);
369 drop_nlink(de->d_inode); 328 drop_nlink(de->d_inode);
370 unlock_kernel();
371 return 0; 329 return 0;
372} 330}
373 331
@@ -377,8 +335,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
377 int len = de->d_name.len; 335 int len = de->d_name.len;
378 int error; 336 int error;
379 337
380 lock_kernel();
381
382 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); 338 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
383 if (!error) { 339 if (!error) {
384 /* VFS may delete the child */ 340 /* VFS may delete the child */
@@ -389,7 +345,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
389 coda_dir_drop_nlink(dir); 345 coda_dir_drop_nlink(dir);
390 coda_dir_update_mtime(dir); 346 coda_dir_update_mtime(dir);
391 } 347 }
392 unlock_kernel();
393 return error; 348 return error;
394} 349}
395 350
@@ -403,15 +358,12 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
403 int new_length = new_dentry->d_name.len; 358 int new_length = new_dentry->d_name.len;
404 int error; 359 int error;
405 360
406 lock_kernel();
407
408 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), 361 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
409 coda_i2f(new_dir), old_length, new_length, 362 coda_i2f(new_dir), old_length, new_length,
410 (const char *) old_name, (const char *)new_name); 363 (const char *) old_name, (const char *)new_name);
411 364 if (!error) {
412 if ( !error ) { 365 if (new_dentry->d_inode) {
413 if ( new_dentry->d_inode ) { 366 if (S_ISDIR(new_dentry->d_inode->i_mode)) {
414 if ( S_ISDIR(new_dentry->d_inode->i_mode) ) {
415 coda_dir_drop_nlink(old_dir); 367 coda_dir_drop_nlink(old_dir);
416 coda_dir_inc_nlink(new_dir); 368 coda_dir_inc_nlink(new_dir);
417 } 369 }
@@ -423,8 +375,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
423 coda_flag_inode(new_dir, C_VATTR); 375 coda_flag_inode(new_dir, C_VATTR);
424 } 376 }
425 } 377 }
426 unlock_kernel();
427
428 return error; 378 return error;
429} 379}
430 380
@@ -594,10 +544,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
594 struct inode *inode = de->d_inode; 544 struct inode *inode = de->d_inode;
595 struct coda_inode_info *cii; 545 struct coda_inode_info *cii;
596 546
597 if (!inode) 547 if (!inode || coda_isroot(inode))
598 return 1;
599 lock_kernel();
600 if (coda_isroot(inode))
601 goto out; 548 goto out;
602 if (is_bad_inode(inode)) 549 if (is_bad_inode(inode))
603 goto bad; 550 goto bad;
@@ -617,13 +564,12 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
617 goto out; 564 goto out;
618 565
619 /* clear the flags. */ 566 /* clear the flags. */
567 spin_lock(&cii->c_lock);
620 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); 568 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
621 569 spin_unlock(&cii->c_lock);
622bad: 570bad:
623 unlock_kernel();
624 return 0; 571 return 0;
625out: 572out:
626 unlock_kernel();
627 return 1; 573 return 1;
628} 574}
629 575
@@ -656,20 +602,19 @@ static int coda_dentry_delete(struct dentry * dentry)
656int coda_revalidate_inode(struct dentry *dentry) 602int coda_revalidate_inode(struct dentry *dentry)
657{ 603{
658 struct coda_vattr attr; 604 struct coda_vattr attr;
659 int error = 0; 605 int error;
660 int old_mode; 606 int old_mode;
661 ino_t old_ino; 607 ino_t old_ino;
662 struct inode *inode = dentry->d_inode; 608 struct inode *inode = dentry->d_inode;
663 struct coda_inode_info *cii = ITOC(inode); 609 struct coda_inode_info *cii = ITOC(inode);
664 610
665 lock_kernel(); 611 if (!cii->c_flags)
666 if ( !cii->c_flags ) 612 return 0;
667 goto ok;
668 613
669 if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) { 614 if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) {
670 error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr); 615 error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr);
671 if ( error ) 616 if (error)
672 goto return_bad; 617 return -EIO;
673 618
674 /* this inode may be lost if: 619 /* this inode may be lost if:
675 - its ino changed 620 - its ino changed
@@ -688,17 +633,13 @@ int coda_revalidate_inode(struct dentry *dentry)
688 /* the following can happen when a local fid is replaced 633 /* the following can happen when a local fid is replaced
689 with a global one, here we lose and declare the inode bad */ 634 with a global one, here we lose and declare the inode bad */
690 if (inode->i_ino != old_ino) 635 if (inode->i_ino != old_ino)
691 goto return_bad; 636 return -EIO;
692 637
693 coda_flag_inode_children(inode, C_FLUSH); 638 coda_flag_inode_children(inode, C_FLUSH);
639
640 spin_lock(&cii->c_lock);
694 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH); 641 cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
642 spin_unlock(&cii->c_lock);
695 } 643 }
696
697ok:
698 unlock_kernel();
699 return 0; 644 return 0;
700
701return_bad:
702 unlock_kernel();
703 return -EIO;
704} 645}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ad3cd2abeeb4..c8b50ba4366a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -15,7 +15,7 @@
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/cred.h> 16#include <linux/cred.h>
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/smp_lock.h> 18#include <linux/spinlock.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
@@ -109,19 +109,24 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
109 109
110 coda_inode = coda_file->f_path.dentry->d_inode; 110 coda_inode = coda_file->f_path.dentry->d_inode;
111 host_inode = host_file->f_path.dentry->d_inode; 111 host_inode = host_file->f_path.dentry->d_inode;
112
113 cii = ITOC(coda_inode);
114 spin_lock(&cii->c_lock);
112 coda_file->f_mapping = host_file->f_mapping; 115 coda_file->f_mapping = host_file->f_mapping;
113 if (coda_inode->i_mapping == &coda_inode->i_data) 116 if (coda_inode->i_mapping == &coda_inode->i_data)
114 coda_inode->i_mapping = host_inode->i_mapping; 117 coda_inode->i_mapping = host_inode->i_mapping;
115 118
116 /* only allow additional mmaps as long as userspace isn't changing 119 /* only allow additional mmaps as long as userspace isn't changing
117 * the container file on us! */ 120 * the container file on us! */
118 else if (coda_inode->i_mapping != host_inode->i_mapping) 121 else if (coda_inode->i_mapping != host_inode->i_mapping) {
122 spin_unlock(&cii->c_lock);
119 return -EBUSY; 123 return -EBUSY;
124 }
120 125
121 /* keep track of how often the coda_inode/host_file has been mmapped */ 126 /* keep track of how often the coda_inode/host_file has been mmapped */
122 cii = ITOC(coda_inode);
123 cii->c_mapcount++; 127 cii->c_mapcount++;
124 cfi->cfi_mapcount++; 128 cfi->cfi_mapcount++;
129 spin_unlock(&cii->c_lock);
125 130
126 return host_file->f_op->mmap(host_file, vma); 131 return host_file->f_op->mmap(host_file, vma);
127} 132}
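The mapcount bookkeeping added here pairs with the release path further down
in this file: while c_mapcount is non-zero the coda inode borrows the
container file's address space, and the last release restores i_data.
Condensed, with c_lock held on both sides:

	/* mmap side */
	cii->c_mapcount++;
	cfi->cfi_mapcount++;

	/* release side */
	cii->c_mapcount -= cfi->cfi_mapcount;
	if (!cii->c_mapcount)
		coda_inode->i_mapping = &coda_inode->i_data;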
@@ -138,8 +143,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
138 if (!cfi) 143 if (!cfi)
139 return -ENOMEM; 144 return -ENOMEM;
140 145
141 lock_kernel();
142
143 error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, 146 error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags,
144 &host_file); 147 &host_file);
145 if (!host_file) 148 if (!host_file)
@@ -147,7 +150,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
147 150
148 if (error) { 151 if (error) {
149 kfree(cfi); 152 kfree(cfi);
150 unlock_kernel();
151 return error; 153 return error;
152 } 154 }
153 155
@@ -159,8 +161,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
159 161
160 BUG_ON(coda_file->private_data != NULL); 162 BUG_ON(coda_file->private_data != NULL);
161 coda_file->private_data = cfi; 163 coda_file->private_data = cfi;
162
163 unlock_kernel();
164 return 0; 164 return 0;
165} 165}
166 166
@@ -171,9 +171,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
171 struct coda_file_info *cfi; 171 struct coda_file_info *cfi;
172 struct coda_inode_info *cii; 172 struct coda_inode_info *cii;
173 struct inode *host_inode; 173 struct inode *host_inode;
174 int err = 0; 174 int err;
175
176 lock_kernel();
177 175
178 cfi = CODA_FTOC(coda_file); 176 cfi = CODA_FTOC(coda_file);
179 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 177 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -185,18 +183,18 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
185 cii = ITOC(coda_inode); 183 cii = ITOC(coda_inode);
186 184
187 /* did we mmap this file? */ 185 /* did we mmap this file? */
186 spin_lock(&cii->c_lock);
188 if (coda_inode->i_mapping == &host_inode->i_data) { 187 if (coda_inode->i_mapping == &host_inode->i_data) {
189 cii->c_mapcount -= cfi->cfi_mapcount; 188 cii->c_mapcount -= cfi->cfi_mapcount;
190 if (!cii->c_mapcount) 189 if (!cii->c_mapcount)
191 coda_inode->i_mapping = &coda_inode->i_data; 190 coda_inode->i_mapping = &coda_inode->i_data;
192 } 191 }
192 spin_unlock(&cii->c_lock);
193 193
194 fput(cfi->cfi_container); 194 fput(cfi->cfi_container);
195 kfree(coda_file->private_data); 195 kfree(coda_file->private_data);
196 coda_file->private_data = NULL; 196 coda_file->private_data = NULL;
197 197
198 unlock_kernel();
199
200 /* VFS fput ignores the return value from file_operations->release, so 198 /* VFS fput ignores the return value from file_operations->release, so
201 * there is no use returning an error here */ 199 * there is no use returning an error here */
202 return 0; 200 return 0;
@@ -207,7 +205,7 @@ int coda_fsync(struct file *coda_file, int datasync)
207 struct file *host_file; 205 struct file *host_file;
208 struct inode *coda_inode = coda_file->f_path.dentry->d_inode; 206 struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
209 struct coda_file_info *cfi; 207 struct coda_file_info *cfi;
210 int err = 0; 208 int err;
211 209
212 if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) || 210 if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) ||
213 S_ISLNK(coda_inode->i_mode))) 211 S_ISLNK(coda_inode->i_mode)))
@@ -218,11 +216,8 @@ int coda_fsync(struct file *coda_file, int datasync)
218 host_file = cfi->cfi_container; 216 host_file = cfi->cfi_container;
219 217
220 err = vfs_fsync(host_file, datasync); 218 err = vfs_fsync(host_file, datasync);
221 if ( !err && !datasync ) { 219 if (!err && !datasync)
222 lock_kernel();
223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 220 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
224 unlock_kernel();
225 }
226 221
227 return err; 222 return err;
228} 223}
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6526e6f21ecf..7993b96ca348 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -15,7 +15,8 @@
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/unistd.h> 17#include <linux/unistd.h>
18#include <linux/smp_lock.h> 18#include <linux/mutex.h>
19#include <linux/spinlock.h>
19#include <linux/file.h> 20#include <linux/file.h>
20#include <linux/vfs.h> 21#include <linux/vfs.h>
21#include <linux/slab.h> 22#include <linux/slab.h>
@@ -51,6 +52,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
51 ei->c_flags = 0; 52 ei->c_flags = 0;
52 ei->c_uid = 0; 53 ei->c_uid = 0;
53 ei->c_cached_perm = 0; 54 ei->c_cached_perm = 0;
55 spin_lock_init(&ei->c_lock);
54 return &ei->vfs_inode; 56 return &ei->vfs_inode;
55} 57}
56 58
@@ -143,7 +145,7 @@ static int get_device_index(struct coda_mount_data *data)
143static int coda_fill_super(struct super_block *sb, void *data, int silent) 145static int coda_fill_super(struct super_block *sb, void *data, int silent)
144{ 146{
145 struct inode *root = NULL; 147 struct inode *root = NULL;
146 struct venus_comm *vc = NULL; 148 struct venus_comm *vc;
147 struct CodaFid fid; 149 struct CodaFid fid;
148 int error; 150 int error;
149 int idx; 151 int idx;
@@ -157,21 +159,26 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
157 printk(KERN_INFO "coda_read_super: device index: %i\n", idx); 159 printk(KERN_INFO "coda_read_super: device index: %i\n", idx);
158 160
159 vc = &coda_comms[idx]; 161 vc = &coda_comms[idx];
162 mutex_lock(&vc->vc_mutex);
163
160 if (!vc->vc_inuse) { 164 if (!vc->vc_inuse) {
161 printk("coda_read_super: No pseudo device\n"); 165 printk("coda_read_super: No pseudo device\n");
162 return -EINVAL; 166 error = -EINVAL;
167 goto unlock_out;
163 } 168 }
164 169
165 if ( vc->vc_sb ) { 170 if (vc->vc_sb) {
166 printk("coda_read_super: Device already mounted\n"); 171 printk("coda_read_super: Device already mounted\n");
167 return -EBUSY; 172 error = -EBUSY;
173 goto unlock_out;
168 } 174 }
169 175
170 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); 176 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
171 if (error) 177 if (error)
172 goto bdi_err; 178 goto unlock_out;
173 179
174 vc->vc_sb = sb; 180 vc->vc_sb = sb;
181 mutex_unlock(&vc->vc_mutex);
175 182
176 sb->s_fs_info = vc; 183 sb->s_fs_info = vc;
177 sb->s_flags |= MS_NOATIME; 184 sb->s_flags |= MS_NOATIME;
@@ -200,26 +207,33 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
200 printk("coda_read_super: rootinode is %ld dev %s\n", 207 printk("coda_read_super: rootinode is %ld dev %s\n",
201 root->i_ino, root->i_sb->s_id); 208 root->i_ino, root->i_sb->s_id);
202 sb->s_root = d_alloc_root(root); 209 sb->s_root = d_alloc_root(root);
203 if (!sb->s_root) 210 if (!sb->s_root) {
211 error = -EINVAL;
204 goto error; 212 goto error;
205 return 0; 213 }
214 return 0;
206 215
207 error: 216error:
208 bdi_destroy(&vc->bdi);
209 bdi_err:
210 if (root) 217 if (root)
211 iput(root); 218 iput(root);
212 if (vc)
213 vc->vc_sb = NULL;
214 219
215 return -EINVAL; 220 mutex_lock(&vc->vc_mutex);
221 bdi_destroy(&vc->bdi);
222 vc->vc_sb = NULL;
223 sb->s_fs_info = NULL;
224unlock_out:
225 mutex_unlock(&vc->vc_mutex);
226 return error;
216} 227}
217 228
218static void coda_put_super(struct super_block *sb) 229static void coda_put_super(struct super_block *sb)
219{ 230{
220 bdi_destroy(&coda_vcp(sb)->bdi); 231 struct venus_comm *vcp = coda_vcp(sb);
221 coda_vcp(sb)->vc_sb = NULL; 232 mutex_lock(&vcp->vc_mutex);
233 bdi_destroy(&vcp->bdi);
234 vcp->vc_sb = NULL;
222 sb->s_fs_info = NULL; 235 sb->s_fs_info = NULL;
236 mutex_unlock(&vcp->vc_mutex);
223 237
224 printk("Coda: Bye bye.\n"); 238 printk("Coda: Bye bye.\n");
225} 239}
@@ -245,8 +259,6 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
245 struct coda_vattr vattr; 259 struct coda_vattr vattr;
246 int error; 260 int error;
247 261
248 lock_kernel();
249
250 memset(&vattr, 0, sizeof(vattr)); 262 memset(&vattr, 0, sizeof(vattr));
251 263
252 inode->i_ctime = CURRENT_TIME_SEC; 264 inode->i_ctime = CURRENT_TIME_SEC;
@@ -256,13 +268,10 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
256 /* Venus is responsible for truncating the container-file!!! */ 268 /* Venus is responsible for truncating the container-file!!! */
257 error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr); 269 error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr);
258 270
259 if ( !error ) { 271 if (!error) {
260 coda_vattr_to_iattr(inode, &vattr); 272 coda_vattr_to_iattr(inode, &vattr);
261 coda_cache_clear_inode(inode); 273 coda_cache_clear_inode(inode);
262 } 274 }
263
264 unlock_kernel();
265
266 return error; 275 return error;
267} 276}
268 277
@@ -276,12 +285,8 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
276{ 285{
277 int error; 286 int error;
278 287
279 lock_kernel();
280
281 error = venus_statfs(dentry, buf); 288 error = venus_statfs(dentry, buf);
282 289
283 unlock_kernel();
284
285 if (error) { 290 if (error) {
286 /* fake something like AFS does */ 291 /* fake something like AFS does */
287 buf->f_blocks = 9000000; 292 buf->f_blocks = 9000000;
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index ca25d96d45c9..2fd89b5c5c7b 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -23,8 +23,6 @@
23#include <linux/coda_fs_i.h> 23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
25 25
26#include <linux/smp_lock.h>
27
28/* pioctl ops */ 26/* pioctl ops */
29static int coda_ioctl_permission(struct inode *inode, int mask); 27static int coda_ioctl_permission(struct inode *inode, int mask);
30static long coda_pioctl(struct file *filp, unsigned int cmd, 28static long coda_pioctl(struct file *filp, unsigned int cmd,
@@ -39,6 +37,7 @@ const struct inode_operations coda_ioctl_inode_operations = {
39const struct file_operations coda_ioctl_operations = { 37const struct file_operations coda_ioctl_operations = {
40 .owner = THIS_MODULE, 38 .owner = THIS_MODULE,
41 .unlocked_ioctl = coda_pioctl, 39 .unlocked_ioctl = coda_pioctl,
40 .llseek = noop_llseek,
42}; 41};
43 42
44/* the coda pioctl inode ops */ 43/* the coda pioctl inode ops */
@@ -57,13 +56,9 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
57 struct inode *target_inode = NULL; 56 struct inode *target_inode = NULL;
58 struct coda_inode_info *cnp; 57 struct coda_inode_info *cnp;
59 58
60 lock_kernel();
61
62 /* get the Pioctl data arguments from user space */ 59 /* get the Pioctl data arguments from user space */
63 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { 60 if (copy_from_user(&data, (void __user *)user_data, sizeof(data)))
64 error = -EINVAL; 61 return -EINVAL;
65 goto out;
66 }
67 62
68 /* 63 /*
69 * Look up the pathname. Note that the pathname is in 64 * Look up the pathname. Note that the pathname is in
@@ -75,13 +70,12 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
75 error = user_lpath(data.path, &path); 70 error = user_lpath(data.path, &path);
76 71
77 if (error) 72 if (error)
78 goto out; 73 return error;
79 else 74
80 target_inode = path.dentry->d_inode; 75 target_inode = path.dentry->d_inode;
81 76
82 /* return if it is not a Coda inode */ 77 /* return if it is not a Coda inode */
83 if (target_inode->i_sb != inode->i_sb) { 78 if (target_inode->i_sb != inode->i_sb) {
84 path_put(&path);
85 error = -EINVAL; 79 error = -EINVAL;
86 goto out; 80 goto out;
87 } 81 }
@@ -90,10 +84,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
90 cnp = ITOC(target_inode); 84 cnp = ITOC(target_inode);
91 85
92 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); 86 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
93
94 path_put(&path);
95
96out: 87out:
97 unlock_kernel(); 88 path_put(&path);
98 return error; 89 return error;
99} 90}
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 116af7546cf0..62647a8595e4 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -35,7 +35,7 @@
35#include <linux/poll.h> 35#include <linux/poll.h>
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/list.h> 37#include <linux/list.h>
38#include <linux/smp_lock.h> 38#include <linux/mutex.h>
39#include <linux/device.h> 39#include <linux/device.h>
40#include <asm/io.h> 40#include <asm/io.h>
41#include <asm/system.h> 41#include <asm/system.h>
@@ -67,8 +67,10 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
67 unsigned int mask = POLLOUT | POLLWRNORM; 67 unsigned int mask = POLLOUT | POLLWRNORM;
68 68
69 poll_wait(file, &vcp->vc_waitq, wait); 69 poll_wait(file, &vcp->vc_waitq, wait);
70 mutex_lock(&vcp->vc_mutex);
70 if (!list_empty(&vcp->vc_pending)) 71 if (!list_empty(&vcp->vc_pending))
71 mask |= POLLIN | POLLRDNORM; 72 mask |= POLLIN | POLLRDNORM;
73 mutex_unlock(&vcp->vc_mutex);
72 74
73 return mask; 75 return mask;
74} 76}
@@ -108,16 +110,9 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
108 return -EFAULT; 110 return -EFAULT;
109 111
110 if (DOWNCALL(hdr.opcode)) { 112 if (DOWNCALL(hdr.opcode)) {
111 struct super_block *sb = NULL; 113 union outputArgs *dcbuf;
112 union outputArgs *dcbuf;
113 int size = sizeof(*dcbuf); 114 int size = sizeof(*dcbuf);
114 115
115 sb = vcp->vc_sb;
116 if ( !sb ) {
117 count = nbytes;
118 goto out;
119 }
120
121 if ( nbytes < sizeof(struct coda_out_hdr) ) { 116 if ( nbytes < sizeof(struct coda_out_hdr) ) {
122 printk("coda_downcall opc %d uniq %d, not enough!\n", 117 printk("coda_downcall opc %d uniq %d, not enough!\n",
123 hdr.opcode, hdr.unique); 118 hdr.opcode, hdr.unique);
@@ -137,9 +132,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
137 } 132 }
138 133
139 /* what downcall errors does Venus handle ? */ 134 /* what downcall errors does Venus handle ? */
140 lock_kernel(); 135 error = coda_downcall(vcp, hdr.opcode, dcbuf);
141 error = coda_downcall(hdr.opcode, dcbuf, sb);
142 unlock_kernel();
143 136
144 CODA_FREE(dcbuf, nbytes); 137 CODA_FREE(dcbuf, nbytes);
145 if (error) { 138 if (error) {
@@ -152,7 +145,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
152 } 145 }
153 146
154 /* Look for the message on the processing queue. */ 147 /* Look for the message on the processing queue. */
155 lock_kernel(); 148 mutex_lock(&vcp->vc_mutex);
156 list_for_each(lh, &vcp->vc_processing) { 149 list_for_each(lh, &vcp->vc_processing) {
157 tmp = list_entry(lh, struct upc_req , uc_chain); 150 tmp = list_entry(lh, struct upc_req , uc_chain);
158 if (tmp->uc_unique == hdr.unique) { 151 if (tmp->uc_unique == hdr.unique) {
@@ -161,7 +154,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
161 break; 154 break;
162 } 155 }
163 } 156 }
164 unlock_kernel(); 157 mutex_unlock(&vcp->vc_mutex);
165 158
166 if (!req) { 159 if (!req) {
167 printk("psdev_write: msg (%d, %d) not found\n", 160 printk("psdev_write: msg (%d, %d) not found\n",
@@ -216,7 +209,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
216 if (nbytes == 0) 209 if (nbytes == 0)
217 return 0; 210 return 0;
218 211
219 lock_kernel(); 212 mutex_lock(&vcp->vc_mutex);
220 213
221 add_wait_queue(&vcp->vc_waitq, &wait); 214 add_wait_queue(&vcp->vc_waitq, &wait);
222 set_current_state(TASK_INTERRUPTIBLE); 215 set_current_state(TASK_INTERRUPTIBLE);
@@ -230,7 +223,9 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
230 retval = -ERESTARTSYS; 223 retval = -ERESTARTSYS;
231 break; 224 break;
232 } 225 }
226 mutex_unlock(&vcp->vc_mutex);
233 schedule(); 227 schedule();
228 mutex_lock(&vcp->vc_mutex);
234 } 229 }
235 230
236 set_current_state(TASK_RUNNING); 231 set_current_state(TASK_RUNNING);
@@ -263,7 +258,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
263 CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); 258 CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
264 kfree(req); 259 kfree(req);
265out: 260out:
266 unlock_kernel(); 261 mutex_unlock(&vcp->vc_mutex);
267 return (count ? count : retval); 262 return (count ? count : retval);
268} 263}
269 264
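Releasing vc_mutex around schedule() is what lets writers (and coda_upcall)
make progress while a reader sleeps; the loop reduces to the classic
wait-queue shape (a sketch; the O_NONBLOCK handling is elided):

	add_wait_queue(&vcp->vc_waitq, &wait);
	set_current_state(TASK_INTERRUPTIBLE);
	while (list_empty(&vcp->vc_pending)) {
		if (signal_pending(current)) {
			retval = -ERESTARTSYS;
			break;
		}
		mutex_unlock(&vcp->vc_mutex);
		schedule();
		mutex_lock(&vcp->vc_mutex);
		set_current_state(TASK_INTERRUPTIBLE);
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(&vcp->vc_waitq, &wait);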
@@ -276,10 +271,10 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
276 if (idx < 0 || idx >= MAX_CODADEVS) 271 if (idx < 0 || idx >= MAX_CODADEVS)
277 return -ENODEV; 272 return -ENODEV;
278 273
279 lock_kernel();
280
281 err = -EBUSY; 274 err = -EBUSY;
282 vcp = &coda_comms[idx]; 275 vcp = &coda_comms[idx];
276 mutex_lock(&vcp->vc_mutex);
277
283 if (!vcp->vc_inuse) { 278 if (!vcp->vc_inuse) {
284 vcp->vc_inuse++; 279 vcp->vc_inuse++;
285 280
@@ -293,7 +288,7 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
293 err = 0; 288 err = 0;
294 } 289 }
295 290
296 unlock_kernel(); 291 mutex_unlock(&vcp->vc_mutex);
297 return err; 292 return err;
298} 293}
299 294
@@ -308,7 +303,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
308 return -1; 303 return -1;
309 } 304 }
310 305
311 lock_kernel(); 306 mutex_lock(&vcp->vc_mutex);
312 307
313 /* Wakeup clients so they can return. */ 308 /* Wakeup clients so they can return. */
314 list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) { 309 list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) {
@@ -333,7 +328,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
333 328
334 file->private_data = NULL; 329 file->private_data = NULL;
335 vcp->vc_inuse--; 330 vcp->vc_inuse--;
336 unlock_kernel(); 331 mutex_unlock(&vcp->vc_mutex);
337 return 0; 332 return 0;
338} 333}
339 334
@@ -346,6 +341,7 @@ static const struct file_operations coda_psdev_fops = {
346 .unlocked_ioctl = coda_psdev_ioctl, 341 .unlocked_ioctl = coda_psdev_ioctl,
347 .open = coda_psdev_open, 342 .open = coda_psdev_open,
348 .release = coda_psdev_release, 343 .release = coda_psdev_release,
344 .llseek = noop_llseek,
349}; 345};
350 346
351static int init_coda_psdev(void) 347static int init_coda_psdev(void)
@@ -361,9 +357,11 @@ static int init_coda_psdev(void)
361 err = PTR_ERR(coda_psdev_class); 357 err = PTR_ERR(coda_psdev_class);
362 goto out_chrdev; 358 goto out_chrdev;
363 } 359 }
364 for (i = 0; i < MAX_CODADEVS; i++) 360 for (i = 0; i < MAX_CODADEVS; i++) {
361 mutex_init(&(&coda_comms[i])->vc_mutex);
365 device_create(coda_psdev_class, NULL, 362 device_create(coda_psdev_class, NULL,
366 MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i); 363 MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
364 }
367 coda_sysctl_init(); 365 coda_sysctl_init();
368 goto out; 366 goto out;
369 367
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 4513b7258458..af78f007a2b0 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -14,7 +14,6 @@
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18 17
19#include <linux/coda.h> 18#include <linux/coda.h>
20#include <linux/coda_linux.h> 19#include <linux/coda_linux.h>
@@ -29,11 +28,9 @@ static int coda_symlink_filler(struct file *file, struct page *page)
29 unsigned int len = PAGE_SIZE; 28 unsigned int len = PAGE_SIZE;
30 char *p = kmap(page); 29 char *p = kmap(page);
31 30
32 lock_kernel();
33 cii = ITOC(inode); 31 cii = ITOC(inode);
34 32
35 error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len); 33 error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
36 unlock_kernel();
37 if (error) 34 if (error)
38 goto fail; 35 goto fail;
39 SetPageUptodate(page); 36 SetPageUptodate(page);
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index b8893ab6f9e6..c3563cab9758 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,6 +27,7 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/mutex.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
32#include <linux/vfs.h> 33#include <linux/vfs.h>
@@ -606,7 +607,8 @@ static void coda_unblock_signals(sigset_t *old)
606 (r)->uc_opcode != CODA_RELEASE) || \ 607 (r)->uc_opcode != CODA_RELEASE) || \
607 (r)->uc_flags & CODA_REQ_READ)) 608 (r)->uc_flags & CODA_REQ_READ))
608 609
609static inline void coda_waitfor_upcall(struct upc_req *req) 610static inline void coda_waitfor_upcall(struct venus_comm *vcp,
611 struct upc_req *req)
610{ 612{
611 DECLARE_WAITQUEUE(wait, current); 613 DECLARE_WAITQUEUE(wait, current);
612 unsigned long timeout = jiffies + coda_timeout * HZ; 614 unsigned long timeout = jiffies + coda_timeout * HZ;
@@ -639,10 +641,12 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
639 break; 641 break;
640 } 642 }
641 643
644 mutex_unlock(&vcp->vc_mutex);
642 if (blocked) 645 if (blocked)
643 schedule_timeout(HZ); 646 schedule_timeout(HZ);
644 else 647 else
645 schedule(); 648 schedule();
649 mutex_lock(&vcp->vc_mutex);
646 } 650 }
647 if (blocked) 651 if (blocked)
648 coda_unblock_signals(&old); 652 coda_unblock_signals(&old);
@@ -667,18 +671,23 @@ static int coda_upcall(struct venus_comm *vcp,
667{ 671{
668 union outputArgs *out; 672 union outputArgs *out;
669 union inputArgs *sig_inputArgs; 673 union inputArgs *sig_inputArgs;
670 struct upc_req *req, *sig_req; 674 struct upc_req *req = NULL, *sig_req;
671 int error = 0; 675 int error;
676
677 mutex_lock(&vcp->vc_mutex);
672 678
673 if (!vcp->vc_inuse) { 679 if (!vcp->vc_inuse) {
674 printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n"); 680 printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
675 return -ENXIO; 681 error = -ENXIO;
682 goto exit;
676 } 683 }
677 684
678 /* Format the request message. */ 685 /* Format the request message. */
679 req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); 686 req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
680 if (!req) 687 if (!req) {
681 return -ENOMEM; 688 error = -ENOMEM;
689 goto exit;
690 }
682 691
683 req->uc_data = (void *)buffer; 692 req->uc_data = (void *)buffer;
684 req->uc_flags = 0; 693 req->uc_flags = 0;
@@ -705,7 +714,7 @@ static int coda_upcall(struct venus_comm *vcp,
705 * ENODEV. */ 714 * ENODEV. */
706 715
707 /* Go to sleep. Wake up on signals only after the timeout. */ 716 /* Go to sleep. Wake up on signals only after the timeout. */
708 coda_waitfor_upcall(req); 717 coda_waitfor_upcall(vcp, req);
709 718
710 /* Op went through, interrupt or not... */ 719 /* Op went through, interrupt or not... */
711 if (req->uc_flags & CODA_REQ_WRITE) { 720 if (req->uc_flags & CODA_REQ_WRITE) {
@@ -759,6 +768,7 @@ static int coda_upcall(struct venus_comm *vcp,
759 768
760exit: 769exit:
761 kfree(req); 770 kfree(req);
771 mutex_unlock(&vcp->vc_mutex);
762 return error; 772 return error;
763} 773}
764 774
@@ -796,21 +806,24 @@ exit:
796 * 806 *
797 * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */ 807 * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
798 808
799int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb) 809int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
800{ 810{
801 struct inode *inode = NULL; 811 struct inode *inode = NULL;
802 struct CodaFid *fid, *newfid; 812 struct CodaFid *fid = NULL, *newfid;
813 struct super_block *sb;
803 814
804 /* Handle invalidation requests. */ 815 /* Handle invalidation requests. */
805 if ( !sb || !sb->s_root) 816 mutex_lock(&vcp->vc_mutex);
806 return 0; 817 sb = vcp->vc_sb;
818 if (!sb || !sb->s_root)
819 goto unlock_out;
807 820
808 switch (opcode) { 821 switch (opcode) {
809 case CODA_FLUSH: 822 case CODA_FLUSH:
810 coda_cache_clear_all(sb); 823 coda_cache_clear_all(sb);
811 shrink_dcache_sb(sb); 824 shrink_dcache_sb(sb);
812 if (sb->s_root->d_inode) 825 if (sb->s_root->d_inode)
813 coda_flag_inode(sb->s_root->d_inode, C_FLUSH); 826 coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
814 break; 827 break;
815 828
816 case CODA_PURGEUSER: 829 case CODA_PURGEUSER:
@@ -819,45 +832,53 @@ int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
819 832
820 case CODA_ZAPDIR: 833 case CODA_ZAPDIR:
821 fid = &out->coda_zapdir.CodaFid; 834 fid = &out->coda_zapdir.CodaFid;
822 inode = coda_fid_to_inode(fid, sb);
823 if (inode) {
824 coda_flag_inode_children(inode, C_PURGE);
825 coda_flag_inode(inode, C_VATTR);
826 }
827 break; 835 break;
828 836
829 case CODA_ZAPFILE: 837 case CODA_ZAPFILE:
830 fid = &out->coda_zapfile.CodaFid; 838 fid = &out->coda_zapfile.CodaFid;
831 inode = coda_fid_to_inode(fid, sb);
832 if (inode)
833 coda_flag_inode(inode, C_VATTR);
834 break; 839 break;
835 840
836 case CODA_PURGEFID: 841 case CODA_PURGEFID:
837 fid = &out->coda_purgefid.CodaFid; 842 fid = &out->coda_purgefid.CodaFid;
843 break;
844
845 case CODA_REPLACE:
846 fid = &out->coda_replace.OldFid;
847 break;
848 }
849 if (fid)
838 inode = coda_fid_to_inode(fid, sb); 850 inode = coda_fid_to_inode(fid, sb);
839 if (inode) {
840 coda_flag_inode_children(inode, C_PURGE);
841 851
842 /* catch the dentries later if some are still busy */ 852unlock_out:
843 coda_flag_inode(inode, C_PURGE); 853 mutex_unlock(&vcp->vc_mutex);
844 d_prune_aliases(inode);
845 854
846 } 855 if (!inode)
856 return 0;
857
858 switch (opcode) {
859 case CODA_ZAPDIR:
860 coda_flag_inode_children(inode, C_PURGE);
861 coda_flag_inode(inode, C_VATTR);
862 break;
863
864 case CODA_ZAPFILE:
865 coda_flag_inode(inode, C_VATTR);
866 break;
867
868 case CODA_PURGEFID:
869 coda_flag_inode_children(inode, C_PURGE);
870
871 /* catch the dentries later if some are still busy */
872 coda_flag_inode(inode, C_PURGE);
873 d_prune_aliases(inode);
847 break; 874 break;
848 875
849 case CODA_REPLACE: 876 case CODA_REPLACE:
850 fid = &out->coda_replace.OldFid;
851 newfid = &out->coda_replace.NewFid; 877 newfid = &out->coda_replace.NewFid;
852 inode = coda_fid_to_inode(fid, sb); 878 coda_replace_fid(inode, fid, newfid);
853 if (inode)
854 coda_replace_fid(inode, fid, newfid);
855 break; 879 break;
856 } 880 }
857 881 iput(inode);
858 if (inode)
859 iput(inode);
860
861 return 0; 882 return 0;
862} 883}
863 884
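The rewritten coda_downcall() above is a two-phase design: phase one, under vc_mutex, only validates the superblock and resolves the CodaFid to an inode reference; phase two flags the inode and prunes dentries after the mutex is dropped, so dcache work never runs inside the communication lock. The general shape, with illustrative names:

	static int handle_downcall(struct comm *c, int opcode, union outputArgs *out)
	{
		struct inode *inode = NULL;

		mutex_lock(&c->lock);
		if (c->sb)				/* still mounted? */
			inode = opcode_to_inode(opcode, out, c->sb);
		mutex_unlock(&c->lock);

		if (!inode)
			return 0;

		apply_cache_flags(inode, opcode);	/* lock no longer held */
		iput(inode);				/* drop the lookup ref */
		return 0;
	}
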
diff --git a/fs/compat.c b/fs/compat.c
index 718c7062aec1..f03abdadc401 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1153,7 +1153,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1153{ 1153{
1154 compat_ssize_t tot_len; 1154 compat_ssize_t tot_len;
1155 struct iovec iovstack[UIO_FASTIOV]; 1155 struct iovec iovstack[UIO_FASTIOV];
1156 struct iovec *iov; 1156 struct iovec *iov = iovstack;
1157 ssize_t ret; 1157 ssize_t ret;
1158 io_fn_t fn; 1158 io_fn_t fn;
1159 iov_fn_t fnv; 1159 iov_fn_t fnv;
@@ -1963,7 +1963,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
1963} 1963}
1964#endif /* HAVE_SET_RESTORE_SIGMASK */ 1964#endif /* HAVE_SET_RESTORE_SIGMASK */
1965 1965
1966#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) 1966#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED)
1967/* Stuff for NFS server syscalls... */ 1967/* Stuff for NFS server syscalls... */
1968struct compat_nfsctl_svc { 1968struct compat_nfsctl_svc {
1969 u16 svc32_port; 1969 u16 svc32_port;
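The one-liner in compat_do_readv_writev() is a use-before-init fix: the function's error paths free iov only when it no longer points at the stack array, so leaving the pointer uninitialized meant an early failure could kfree() a garbage pointer. The idiom, with the helper names invented for illustration:

	static ssize_t do_rw(const struct compat_iovec __user *uvec, unsigned long n)
	{
		struct iovec iovstack[UIO_FASTIOV];
		struct iovec *iov = iovstack;	/* never leave this dangling */
		ssize_t ret;

		ret = validate_early(uvec, n);
		if (ret < 0)
			goto out;	/* may run before iov is reassigned */

		/* the copy helper swaps in a kmalloc'd array for large n */
		ret = copy_uvector(uvec, n, ARRAY_SIZE(iovstack), iovstack, &iov);
		if (ret < 0)
			goto out;

		ret = transfer(iov, n);
	out:
		if (iov != iovstack)	/* safe in both cases now */
			kfree(iov);
		return ret;
	}
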
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 03e59aa318eb..d0ad09d57789 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -599,69 +599,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
599#define HIDPGETCONNLIST _IOR('H', 210, int) 599#define HIDPGETCONNLIST _IOR('H', 210, int)
600#define HIDPGETCONNINFO _IOR('H', 211, int) 600#define HIDPGETCONNINFO _IOR('H', 211, int)
601 601
602#ifdef CONFIG_BLOCK
603struct raw32_config_request
604{
605 compat_int_t raw_minor;
606 __u64 block_major;
607 __u64 block_minor;
608} __attribute__((packed));
609
610static int get_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
611{
612 int ret;
613
614 if (!access_ok(VERIFY_READ, user_req, sizeof(struct raw32_config_request)))
615 return -EFAULT;
616
617 ret = __get_user(req->raw_minor, &user_req->raw_minor);
618 ret |= __get_user(req->block_major, &user_req->block_major);
619 ret |= __get_user(req->block_minor, &user_req->block_minor);
620
621 return ret ? -EFAULT : 0;
622}
623
624static int set_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
625{
626 int ret;
627
628 if (!access_ok(VERIFY_WRITE, user_req, sizeof(struct raw32_config_request)))
629 return -EFAULT;
630
631 ret = __put_user(req->raw_minor, &user_req->raw_minor);
632 ret |= __put_user(req->block_major, &user_req->block_major);
633 ret |= __put_user(req->block_minor, &user_req->block_minor);
634
635 return ret ? -EFAULT : 0;
636}
637
638static int raw_ioctl(unsigned fd, unsigned cmd,
639 struct raw32_config_request __user *user_req)
640{
641 int ret;
642
643 switch (cmd) {
644 case RAW_SETBIND:
645 default: { /* RAW_GETBIND */
646 struct raw_config_request req;
647 mm_segment_t oldfs = get_fs();
648
649 if ((ret = get_raw32_request(&req, user_req)))
650 return ret;
651
652 set_fs(KERNEL_DS);
653 ret = sys_ioctl(fd,cmd,(unsigned long)&req);
654 set_fs(oldfs);
655
656 if ((!ret) && (cmd == RAW_GETBIND)) {
657 ret = set_raw32_request(&req, user_req);
658 }
659 break;
660 }
661 }
662 return ret;
663}
664#endif /* CONFIG_BLOCK */
665 602
666struct serial_struct32 { 603struct serial_struct32 {
667 compat_int_t type; 604 compat_int_t type;
@@ -1262,9 +1199,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
1262COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) 1199COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
1263COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) 1200COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
1264COMPATIBLE_IOCTL(OSS_GETVERSION) 1201COMPATIBLE_IOCTL(OSS_GETVERSION)
1265/* Raw devices */
1266COMPATIBLE_IOCTL(RAW_SETBIND)
1267COMPATIBLE_IOCTL(RAW_GETBIND)
1268/* SMB ioctls which do not need any translations */ 1202/* SMB ioctls which do not need any translations */
1269COMPATIBLE_IOCTL(SMB_IOC_NEWCONN) 1203COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
1270/* Watchdog */ 1204/* Watchdog */
@@ -1523,10 +1457,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
1523 case MTIOCGET32: 1457 case MTIOCGET32:
1524 case MTIOCPOS32: 1458 case MTIOCPOS32:
1525 return mt_ioctl_trans(fd, cmd, argp); 1459 return mt_ioctl_trans(fd, cmd, argp);
1526 /* Raw devices */
1527 case RAW_SETBIND:
1528 case RAW_GETBIND:
1529 return raw_ioctl(fd, cmd, argp);
1530#endif 1460#endif
1531 /* One SMB ioctl needs translations. */ 1461 /* One SMB ioctl needs translations. */
1532#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) 1462#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
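All of the RAW_SETBIND/RAW_GETBIND plumbing above can go because the raw driver handles 32-bit callers itself; once a driver owns its compat path, the central set_fs(KERNEL_DS) trampoline and the COMPATIBLE_IOCTL table entries are dead weight. A driver-side sketch of the replacement, with illustrative handler names:

	#ifdef CONFIG_COMPAT
	/* The 32-bit entry point just normalizes the pointer argument and
	 * reuses the native handler; this works once the ioctl structure
	 * has the same layout for 32- and 64-bit userspace. */
	static long raw_compat_ioctl(struct file *file, unsigned int cmd,
				     unsigned long arg)
	{
		return raw_unlocked_ioctl(file, cmd,
					  (unsigned long)compat_ptr(arg));
	}
	#endif

	static const struct file_operations raw_fops = {
		.unlocked_ioctl	= raw_unlocked_ioctl,
	#ifdef CONFIG_COMPAT
		.compat_ioctl	= raw_compat_ioctl,
	#endif
	};
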
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cf78d44a8d6a..253476d78ed8 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -135,6 +135,7 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
135{ 135{
136 struct inode * inode = new_inode(configfs_sb); 136 struct inode * inode = new_inode(configfs_sb);
137 if (inode) { 137 if (inode) {
138 inode->i_ino = get_next_ino();
138 inode->i_mapping->a_ops = &configfs_aops; 139 inode->i_mapping->a_ops = &configfs_aops;
139 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; 140 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
140 inode->i_op = &configfs_inode_operations; 141 inode->i_op = &configfs_inode_operations;
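This configfs hunk (and the matching one in debugfs below) belongs to a series in which new_inode() stops assigning a default inode number, so every pseudo-filesystem that fabricates inodes must pick one itself; get_next_ino() hands out numbers from a per-cpu counter without taking a global lock. The pattern:

	static struct inode *pseudo_new_inode(struct super_block *sb, int mode)
	{
		struct inode *inode = new_inode(sb);

		if (inode) {
			/* new_inode() no longer picks a number for us */
			inode->i_ino = get_next_ino();
			inode->i_mode = mode;
			inode->i_atime = inode->i_mtime = inode->i_ctime =
				CURRENT_TIME;
		}
		return inode;
	}
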
diff --git a/fs/dcache.c b/fs/dcache.c
index 83293be48149..23702a9d4e6d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -67,33 +67,43 @@ struct dentry_stat_t dentry_stat = {
67 .age_limit = 45, 67 .age_limit = 45,
68}; 68};
69 69
70static void __d_free(struct dentry *dentry) 70static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
71static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
72
73#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
74int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
75 size_t *lenp, loff_t *ppos)
76{
77 dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
78 dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
79 return proc_dointvec(table, write, buffer, lenp, ppos);
80}
81#endif
82
83static void __d_free(struct rcu_head *head)
71{ 84{
85 struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
86
72 WARN_ON(!list_empty(&dentry->d_alias)); 87 WARN_ON(!list_empty(&dentry->d_alias));
73 if (dname_external(dentry)) 88 if (dname_external(dentry))
74 kfree(dentry->d_name.name); 89 kfree(dentry->d_name.name);
75 kmem_cache_free(dentry_cache, dentry); 90 kmem_cache_free(dentry_cache, dentry);
76} 91}
77 92
78static void d_callback(struct rcu_head *head)
79{
80 struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
81 __d_free(dentry);
82}
83
84/* 93/*
85 * no dcache_lock, please. The caller must decrement dentry_stat.nr_dentry 94 * no dcache_lock, please.
86 * inside dcache_lock.
87 */ 95 */
88static void d_free(struct dentry *dentry) 96static void d_free(struct dentry *dentry)
89{ 97{
98 percpu_counter_dec(&nr_dentry);
90 if (dentry->d_op && dentry->d_op->d_release) 99 if (dentry->d_op && dentry->d_op->d_release)
91 dentry->d_op->d_release(dentry); 100 dentry->d_op->d_release(dentry);
101
92 /* if dentry was never inserted into hash, immediate free is OK */ 102 /* if dentry was never inserted into hash, immediate free is OK */
93 if (hlist_unhashed(&dentry->d_hash)) 103 if (hlist_unhashed(&dentry->d_hash))
94 __d_free(dentry); 104 __d_free(&dentry->d_u.d_rcu);
95 else 105 else
96 call_rcu(&dentry->d_u.d_rcu, d_callback); 106 call_rcu(&dentry->d_u.d_rcu, __d_free);
97} 107}
98 108
99/* 109/*
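Here dentry_stat.nr_dentry, previously bumped under dcache_lock, becomes a percpu_counter: the inc/dec fast paths touch only a cpu-local slot, and only the rare /proc/sys/fs/dentry-state read pays the cost of summing all CPUs. A minimal sketch of the API as this file now uses it:

	#include <linux/percpu_counter.h>

	static struct percpu_counter nr_objs;

	static int __init counter_setup(void)
	{
		return percpu_counter_init(&nr_objs, 0);
	}

	static void obj_created(void)
	{
		percpu_counter_inc(&nr_objs);	/* no global lock taken */
	}

	static void obj_destroyed(void)
	{
		percpu_counter_dec(&nr_objs);
	}

	static long objs_in_use(void)
	{
		/* walks every CPU and clamps at zero; keep this rare */
		return percpu_counter_sum_positive(&nr_objs);
	}
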
@@ -123,37 +133,34 @@ static void dentry_iput(struct dentry * dentry)
123} 133}
124 134
125/* 135/*
126 * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held. 136 * dentry_lru_(add|del|move_tail) must be called with dcache_lock held.
127 */ 137 */
128static void dentry_lru_add(struct dentry *dentry) 138static void dentry_lru_add(struct dentry *dentry)
129{ 139{
130 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 140 if (list_empty(&dentry->d_lru)) {
131 dentry->d_sb->s_nr_dentry_unused++; 141 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
132 dentry_stat.nr_unused++; 142 dentry->d_sb->s_nr_dentry_unused++;
133} 143 percpu_counter_inc(&nr_dentry_unused);
134 144 }
135static void dentry_lru_add_tail(struct dentry *dentry)
136{
137 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
138 dentry->d_sb->s_nr_dentry_unused++;
139 dentry_stat.nr_unused++;
140} 145}
141 146
142static void dentry_lru_del(struct dentry *dentry) 147static void dentry_lru_del(struct dentry *dentry)
143{ 148{
144 if (!list_empty(&dentry->d_lru)) { 149 if (!list_empty(&dentry->d_lru)) {
145 list_del(&dentry->d_lru); 150 list_del_init(&dentry->d_lru);
146 dentry->d_sb->s_nr_dentry_unused--; 151 dentry->d_sb->s_nr_dentry_unused--;
147 dentry_stat.nr_unused--; 152 percpu_counter_dec(&nr_dentry_unused);
148 } 153 }
149} 154}
150 155
151static void dentry_lru_del_init(struct dentry *dentry) 156static void dentry_lru_move_tail(struct dentry *dentry)
152{ 157{
153 if (likely(!list_empty(&dentry->d_lru))) { 158 if (list_empty(&dentry->d_lru)) {
154 list_del_init(&dentry->d_lru); 159 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
155 dentry->d_sb->s_nr_dentry_unused--; 160 dentry->d_sb->s_nr_dentry_unused++;
156 dentry_stat.nr_unused--; 161 percpu_counter_inc(&nr_dentry_unused);
162 } else {
163 list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
157 } 164 }
158} 165}
159 166
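Folding the list_empty() tests into the helpers makes them idempotent: callers no longer track whether a dentry is already on the LRU, and dentry_lru_move_tail() covers both the first-queue and requeue cases. The guard works because list_del_init() leaves the entry self-linked, so a later list_empty() still reports the truth. A generic sketch:

	static void lru_add(struct object *obj, struct list_head *lru, long *nr)
	{
		if (list_empty(&obj->lru)) {	/* no-op if already queued */
			list_add(&obj->lru, lru);
			(*nr)++;
		}
	}

	static void lru_del(struct object *obj, long *nr)
	{
		if (!list_empty(&obj->lru)) {	/* no-op if never queued */
			list_del_init(&obj->lru); /* stays self-linked */
			(*nr)--;
		}
	}
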
@@ -172,7 +179,6 @@ static struct dentry *d_kill(struct dentry *dentry)
172 struct dentry *parent; 179 struct dentry *parent;
173 180
174 list_del(&dentry->d_u.d_child); 181 list_del(&dentry->d_u.d_child);
175 dentry_stat.nr_dentry--; /* For d_free, below */
176 /*drops the locks, at that point nobody can reach this dentry */ 182 /*drops the locks, at that point nobody can reach this dentry */
177 dentry_iput(dentry); 183 dentry_iput(dentry);
178 if (IS_ROOT(dentry)) 184 if (IS_ROOT(dentry))
@@ -237,13 +243,15 @@ repeat:
237 if (dentry->d_op->d_delete(dentry)) 243 if (dentry->d_op->d_delete(dentry))
238 goto unhash_it; 244 goto unhash_it;
239 } 245 }
246
240 /* Unreachable? Get rid of it */ 247 /* Unreachable? Get rid of it */
241 if (d_unhashed(dentry)) 248 if (d_unhashed(dentry))
242 goto kill_it; 249 goto kill_it;
243 if (list_empty(&dentry->d_lru)) { 250
244 dentry->d_flags |= DCACHE_REFERENCED; 251 /* Otherwise leave it cached and ensure it's on the LRU */
245 dentry_lru_add(dentry); 252 dentry->d_flags |= DCACHE_REFERENCED;
246 } 253 dentry_lru_add(dentry);
254
247 spin_unlock(&dentry->d_lock); 255 spin_unlock(&dentry->d_lock);
248 spin_unlock(&dcache_lock); 256 spin_unlock(&dcache_lock);
249 return; 257 return;
@@ -318,11 +326,10 @@ int d_invalidate(struct dentry * dentry)
318EXPORT_SYMBOL(d_invalidate); 326EXPORT_SYMBOL(d_invalidate);
319 327
320/* This should be called _only_ with dcache_lock held */ 328/* This should be called _only_ with dcache_lock held */
321
322static inline struct dentry * __dget_locked(struct dentry *dentry) 329static inline struct dentry * __dget_locked(struct dentry *dentry)
323{ 330{
324 atomic_inc(&dentry->d_count); 331 atomic_inc(&dentry->d_count);
325 dentry_lru_del_init(dentry); 332 dentry_lru_del(dentry);
326 return dentry; 333 return dentry;
327} 334}
328 335
@@ -441,73 +448,27 @@ static void prune_one_dentry(struct dentry * dentry)
441 448
442 if (dentry->d_op && dentry->d_op->d_delete) 449 if (dentry->d_op && dentry->d_op->d_delete)
443 dentry->d_op->d_delete(dentry); 450 dentry->d_op->d_delete(dentry);
444 dentry_lru_del_init(dentry); 451 dentry_lru_del(dentry);
445 __d_drop(dentry); 452 __d_drop(dentry);
446 dentry = d_kill(dentry); 453 dentry = d_kill(dentry);
447 spin_lock(&dcache_lock); 454 spin_lock(&dcache_lock);
448 } 455 }
449} 456}
450 457
451/* 458static void shrink_dentry_list(struct list_head *list)
452 * Shrink the dentry LRU on a given superblock.
453 * @sb : superblock to shrink dentry LRU.
454 * @count: If count is NULL, we prune all dentries on superblock.
455 * @flags: If flags is non-zero, we need to do special processing based on
456 * which flags are set. This means we don't need to maintain multiple
457 * similar copies of this loop.
458 */
459static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
460{ 459{
461 LIST_HEAD(referenced);
462 LIST_HEAD(tmp);
463 struct dentry *dentry; 460 struct dentry *dentry;
464 int cnt = 0;
465 461
466 BUG_ON(!sb); 462 while (!list_empty(list)) {
467 BUG_ON((flags & DCACHE_REFERENCED) && count == NULL); 463 dentry = list_entry(list->prev, struct dentry, d_lru);
468 spin_lock(&dcache_lock); 464 dentry_lru_del(dentry);
469 if (count != NULL)
470 /* called from prune_dcache() and shrink_dcache_parent() */
471 cnt = *count;
472restart:
473 if (count == NULL)
474 list_splice_init(&sb->s_dentry_lru, &tmp);
475 else {
476 while (!list_empty(&sb->s_dentry_lru)) {
477 dentry = list_entry(sb->s_dentry_lru.prev,
478 struct dentry, d_lru);
479 BUG_ON(dentry->d_sb != sb);
480 465
481 spin_lock(&dentry->d_lock);
482 /*
483 * If we are honouring the DCACHE_REFERENCED flag and
484 * the dentry has this flag set, don't free it. Clear
485 * the flag and put it back on the LRU.
486 */
487 if ((flags & DCACHE_REFERENCED)
488 && (dentry->d_flags & DCACHE_REFERENCED)) {
489 dentry->d_flags &= ~DCACHE_REFERENCED;
490 list_move(&dentry->d_lru, &referenced);
491 spin_unlock(&dentry->d_lock);
492 } else {
493 list_move_tail(&dentry->d_lru, &tmp);
494 spin_unlock(&dentry->d_lock);
495 cnt--;
496 if (!cnt)
497 break;
498 }
499 cond_resched_lock(&dcache_lock);
500 }
501 }
502 while (!list_empty(&tmp)) {
503 dentry = list_entry(tmp.prev, struct dentry, d_lru);
504 dentry_lru_del_init(dentry);
505 spin_lock(&dentry->d_lock);
506 /* 466 /*
507 * We found an inuse dentry which was not removed from 467 * We found an inuse dentry which was not removed from
508 * the LRU because of laziness during lookup. Do not free 468 * the LRU because of laziness during lookup. Do not free
509 * it - just keep it off the LRU list. 469 * it - just keep it off the LRU list.
510 */ 470 */
471 spin_lock(&dentry->d_lock);
511 if (atomic_read(&dentry->d_count)) { 472 if (atomic_read(&dentry->d_count)) {
512 spin_unlock(&dentry->d_lock); 473 spin_unlock(&dentry->d_lock);
513 continue; 474 continue;
@@ -516,13 +477,60 @@ restart:
516 /* dentry->d_lock was dropped in prune_one_dentry() */ 477 /* dentry->d_lock was dropped in prune_one_dentry() */
517 cond_resched_lock(&dcache_lock); 478 cond_resched_lock(&dcache_lock);
518 } 479 }
519 if (count == NULL && !list_empty(&sb->s_dentry_lru)) 480}
520 goto restart; 481
521 if (count != NULL) 482/**
522 *count = cnt; 483 * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
484 * @sb: superblock to shrink dentry LRU.
485 * @count: number of entries to prune
486 * @flags: flags to control the dentry processing
487 *
488 * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
489 */
490static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
491{
492 /* called from prune_dcache() and shrink_dcache_parent() */
493 struct dentry *dentry;
494 LIST_HEAD(referenced);
495 LIST_HEAD(tmp);
496 int cnt = *count;
497
498 spin_lock(&dcache_lock);
499 while (!list_empty(&sb->s_dentry_lru)) {
500 dentry = list_entry(sb->s_dentry_lru.prev,
501 struct dentry, d_lru);
502 BUG_ON(dentry->d_sb != sb);
503
504 /*
505 * If we are honouring the DCACHE_REFERENCED flag and the
506 * dentry has this flag set, don't free it. Clear the flag
507 * and put it back on the LRU.
508 */
509 if (flags & DCACHE_REFERENCED) {
510 spin_lock(&dentry->d_lock);
511 if (dentry->d_flags & DCACHE_REFERENCED) {
512 dentry->d_flags &= ~DCACHE_REFERENCED;
513 list_move(&dentry->d_lru, &referenced);
514 spin_unlock(&dentry->d_lock);
515 cond_resched_lock(&dcache_lock);
516 continue;
517 }
518 spin_unlock(&dentry->d_lock);
519 }
520
521 list_move_tail(&dentry->d_lru, &tmp);
522 if (!--cnt)
523 break;
524 cond_resched_lock(&dcache_lock);
525 }
526
527 *count = cnt;
528 shrink_dentry_list(&tmp);
529
523 if (!list_empty(&referenced)) 530 if (!list_empty(&referenced))
524 list_splice(&referenced, &sb->s_dentry_lru); 531 list_splice(&referenced, &sb->s_dentry_lru);
525 spin_unlock(&dcache_lock); 532 spin_unlock(&dcache_lock);
533
526} 534}
527 535
528/** 536/**
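The rework above splits the old monolithic loop into a scan phase and a dispose phase: the scan walks s_dentry_lru under dcache_lock and isolates victims onto a private list, and shrink_dentry_list() then disposes of that list identically for prune_dcache(), shrink_dcache_parent() and shrink_dcache_sb(). A generic sketch of the isolate-then-process shape (the dcache version keeps dcache_lock across the dispose phase; names here are illustrative):

	static DEFINE_SPINLOCK(global_lock);

	static void dispose_list(struct list_head *list)
	{
		while (!list_empty(list)) {
			struct object *obj =
				list_entry(list->prev, struct object, lru);

			list_del_init(&obj->lru);
			if (object_in_use(obj))	/* raced with a new user */
				continue;
			object_free(obj);
		}
	}

	static void shrink_some(struct list_head *global_lru, int nr)
	{
		LIST_HEAD(tmp);			/* private victim list */

		spin_lock(&global_lock);
		while (nr-- && !list_empty(global_lru))
			list_move_tail(global_lru->prev, &tmp);
		spin_unlock(&global_lock);

		dispose_list(&tmp);		/* common tail for all callers */
	}
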
@@ -538,7 +546,7 @@ static void prune_dcache(int count)
538{ 546{
539 struct super_block *sb, *p = NULL; 547 struct super_block *sb, *p = NULL;
540 int w_count; 548 int w_count;
541 int unused = dentry_stat.nr_unused; 549 int unused = percpu_counter_sum_positive(&nr_dentry_unused);
542 int prune_ratio; 550 int prune_ratio;
543 int pruned; 551 int pruned;
544 552
@@ -608,13 +616,19 @@ static void prune_dcache(int count)
608 * shrink_dcache_sb - shrink dcache for a superblock 616 * shrink_dcache_sb - shrink dcache for a superblock
609 * @sb: superblock 617 * @sb: superblock
610 * 618 *
611 * Shrink the dcache for the specified super block. This 619 * Shrink the dcache for the specified super block. This is used to free
612 * is used to free the dcache before unmounting a file 620 * the dcache before unmounting a file system.
613 * system
614 */ 621 */
615void shrink_dcache_sb(struct super_block * sb) 622void shrink_dcache_sb(struct super_block *sb)
616{ 623{
617 __shrink_dcache_sb(sb, NULL, 0); 624 LIST_HEAD(tmp);
625
626 spin_lock(&dcache_lock);
627 while (!list_empty(&sb->s_dentry_lru)) {
628 list_splice_init(&sb->s_dentry_lru, &tmp);
629 shrink_dentry_list(&tmp);
630 }
631 spin_unlock(&dcache_lock);
618} 632}
619EXPORT_SYMBOL(shrink_dcache_sb); 633EXPORT_SYMBOL(shrink_dcache_sb);
620 634
@@ -632,7 +646,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
632 646
633 /* detach this root from the system */ 647 /* detach this root from the system */
634 spin_lock(&dcache_lock); 648 spin_lock(&dcache_lock);
635 dentry_lru_del_init(dentry); 649 dentry_lru_del(dentry);
636 __d_drop(dentry); 650 __d_drop(dentry);
637 spin_unlock(&dcache_lock); 651 spin_unlock(&dcache_lock);
638 652
@@ -646,7 +660,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
646 spin_lock(&dcache_lock); 660 spin_lock(&dcache_lock);
647 list_for_each_entry(loop, &dentry->d_subdirs, 661 list_for_each_entry(loop, &dentry->d_subdirs,
648 d_u.d_child) { 662 d_u.d_child) {
649 dentry_lru_del_init(loop); 663 dentry_lru_del(loop);
650 __d_drop(loop); 664 __d_drop(loop);
651 cond_resched_lock(&dcache_lock); 665 cond_resched_lock(&dcache_lock);
652 } 666 }
@@ -703,20 +717,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
703 * otherwise we ascend to the parent and move to the 717 * otherwise we ascend to the parent and move to the
704 * next sibling if there is one */ 718 * next sibling if there is one */
705 if (!parent) 719 if (!parent)
706 goto out; 720 return;
707
708 dentry = parent; 721 dentry = parent;
709
710 } while (list_empty(&dentry->d_subdirs)); 722 } while (list_empty(&dentry->d_subdirs));
711 723
712 dentry = list_entry(dentry->d_subdirs.next, 724 dentry = list_entry(dentry->d_subdirs.next,
713 struct dentry, d_u.d_child); 725 struct dentry, d_u.d_child);
714 } 726 }
715out:
716 /* several dentries were freed, need to correct nr_dentry */
717 spin_lock(&dcache_lock);
718 dentry_stat.nr_dentry -= detached;
719 spin_unlock(&dcache_lock);
720} 727}
721 728
722/* 729/*
@@ -830,14 +837,15 @@ resume:
830 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 837 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
831 next = tmp->next; 838 next = tmp->next;
832 839
833 dentry_lru_del_init(dentry);
834 /* 840 /*
835 * move only zero ref count dentries to the end 841 * move only zero ref count dentries to the end
836 * of the unused list for prune_dcache 842 * of the unused list for prune_dcache
837 */ 843 */
838 if (!atomic_read(&dentry->d_count)) { 844 if (!atomic_read(&dentry->d_count)) {
839 dentry_lru_add_tail(dentry); 845 dentry_lru_move_tail(dentry);
840 found++; 846 found++;
847 } else {
848 dentry_lru_del(dentry);
841 } 849 }
842 850
843 /* 851 /*
@@ -900,12 +908,16 @@ EXPORT_SYMBOL(shrink_dcache_parent);
900 */ 908 */
901static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 909static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
902{ 910{
911 int nr_unused;
912
903 if (nr) { 913 if (nr) {
904 if (!(gfp_mask & __GFP_FS)) 914 if (!(gfp_mask & __GFP_FS))
905 return -1; 915 return -1;
906 prune_dcache(nr); 916 prune_dcache(nr);
907 } 917 }
908 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 918
919 nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
920 return (nr_unused / 100) * sysctl_vfs_cache_pressure;
909} 921}
910 922
911static struct shrinker dcache_shrinker = { 923static struct shrinker dcache_shrinker = {
@@ -972,9 +984,10 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
972 spin_lock(&dcache_lock); 984 spin_lock(&dcache_lock);
973 if (parent) 985 if (parent)
974 list_add(&dentry->d_u.d_child, &parent->d_subdirs); 986 list_add(&dentry->d_u.d_child, &parent->d_subdirs);
975 dentry_stat.nr_dentry++;
976 spin_unlock(&dcache_lock); 987 spin_unlock(&dcache_lock);
977 988
989 percpu_counter_inc(&nr_dentry);
990
978 return dentry; 991 return dentry;
979} 992}
980EXPORT_SYMBOL(d_alloc); 993EXPORT_SYMBOL(d_alloc);
@@ -1478,33 +1491,26 @@ out:
1478 * This is used by ncpfs in its readdir implementation. 1491 * This is used by ncpfs in its readdir implementation.
1479 * Zero is returned if the dentry is invalid. 1492 * Zero is returned if the dentry is invalid.
1480 */ 1493 */
1481 1494int d_validate(struct dentry *dentry, struct dentry *parent)
1482int d_validate(struct dentry *dentry, struct dentry *dparent)
1483{ 1495{
1484 struct hlist_head *base; 1496 struct hlist_head *head = d_hash(parent, dentry->d_name.hash);
1485 struct hlist_node *lhp; 1497 struct hlist_node *node;
1498 struct dentry *d;
1486 1499
1487 /* Check whether the ptr might be valid at all.. */ 1500 /* Check whether the ptr might be valid at all.. */
1488 if (!kmem_ptr_validate(dentry_cache, dentry)) 1501 if (!kmem_ptr_validate(dentry_cache, dentry))
1489 goto out; 1502 return 0;
1490 1503 if (dentry->d_parent != parent)
1491 if (dentry->d_parent != dparent) 1504 return 0;
1492 goto out;
1493 1505
1494 spin_lock(&dcache_lock); 1506 rcu_read_lock();
1495 base = d_hash(dparent, dentry->d_name.hash); 1507 hlist_for_each_entry_rcu(d, node, head, d_hash) {
1496 hlist_for_each(lhp,base) { 1508 if (d == dentry) {
1497 /* hlist_for_each_entry_rcu() not required for d_hash list 1509 dget(dentry);
1498 * as it is parsed under dcache_lock
1499 */
1500 if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
1501 __dget_locked(dentry);
1502 spin_unlock(&dcache_lock);
1503 return 1; 1510 return 1;
1504 } 1511 }
1505 } 1512 }
1506 spin_unlock(&dcache_lock); 1513 rcu_read_unlock();
1507out:
1508 return 0; 1514 return 0;
1509} 1515}
1510EXPORT_SYMBOL(d_validate); 1516EXPORT_SYMBOL(d_validate);
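d_validate() now verifies the pointer by walking the hash chain under rcu_read_lock() instead of dcache_lock: hlist_for_each_entry_rcu() is safe against concurrent chain updates, and the reference is only taken once the candidate is proven to be the same live dentry. The lookup idiom of this kernel generation (later kernels drop the separate node cursor from the iterator):

	static struct dentry *find_live(struct hlist_head *head,
					struct dentry *want)
	{
		struct hlist_node *node;
		struct dentry *d;

		rcu_read_lock();
		hlist_for_each_entry_rcu(d, node, head, d_hash) {
			if (d == want) {
				dget(d);	/* pin it while still valid */
				rcu_read_unlock();
				return d;
			}
		}
		rcu_read_unlock();
		return NULL;
	}
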
@@ -1994,7 +2000,7 @@ global_root:
1994 * Returns a pointer into the buffer or an error code if the 2000 * Returns a pointer into the buffer or an error code if the
1995 * path was too long. 2001 * path was too long.
1996 * 2002 *
1997 * "buflen" should be positive. Caller holds the dcache_lock. 2003 * "buflen" should be positive.
1998 * 2004 *
1999 * If path is not reachable from the supplied root, then the value of 2005 * If path is not reachable from the supplied root, then the value of
2000 * root is changed (without modifying refcounts). 2006 * root is changed (without modifying refcounts).
@@ -2006,10 +2012,12 @@ char *__d_path(const struct path *path, struct path *root,
2006 int error; 2012 int error;
2007 2013
2008 prepend(&res, &buflen, "\0", 1); 2014 prepend(&res, &buflen, "\0", 1);
2015 spin_lock(&dcache_lock);
2009 error = prepend_path(path, root, &res, &buflen); 2016 error = prepend_path(path, root, &res, &buflen);
2017 spin_unlock(&dcache_lock);
2018
2010 if (error) 2019 if (error)
2011 return ERR_PTR(error); 2020 return ERR_PTR(error);
2012
2013 return res; 2021 return res;
2014} 2022}
2015 2023
@@ -2419,6 +2427,9 @@ static void __init dcache_init(void)
2419{ 2427{
2420 int loop; 2428 int loop;
2421 2429
2430 percpu_counter_init(&nr_dentry, 0);
2431 percpu_counter_init(&nr_dentry_unused, 0);
2432
2422 /* 2433 /*
2423 * A constructor could be added for stable state like the lists, 2434 * A constructor could be added for stable state like the lists,
2424 * but it is probably not worth it because of the cache nature 2435 * but it is probably not worth it because of the cache nature
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 0210898458b2..89d394d8fe24 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -43,6 +43,7 @@ const struct file_operations debugfs_file_operations = {
43 .read = default_read_file, 43 .read = default_read_file,
44 .write = default_write_file, 44 .write = default_write_file,
45 .open = default_open, 45 .open = default_open,
46 .llseek = noop_llseek,
46}; 47};
47 48
48static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) 49static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
@@ -454,6 +455,7 @@ static const struct file_operations fops_bool = {
454 .read = read_file_bool, 455 .read = read_file_bool,
455 .write = write_file_bool, 456 .write = write_file_bool,
456 .open = default_open, 457 .open = default_open,
458 .llseek = default_llseek,
457}; 459};
458 460
459/** 461/**
@@ -498,6 +500,7 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
498static const struct file_operations fops_blob = { 500static const struct file_operations fops_blob = {
499 .read = read_file_blob, 501 .read = read_file_blob,
500 .open = default_open, 502 .open = default_open,
503 .llseek = default_llseek,
501}; 504};
502 505
503/** 506/**
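These .llseek assignments recur throughout this series: every file_operations is made to state its seek semantics explicitly so the old implicit default, which relied on the BKL, can be retired. noop_llseek accepts the seek and leaves f_pos untouched, which suits event- and device-style files; default_llseek keeps ordinary seekable behaviour; files that must reject seeking use no_llseek, which returns -ESPIPE. For example:

	static const struct file_operations evt_fops = {
		.owner	= THIS_MODULE,
		.read	= evt_read,
		.poll	= evt_poll,
		/* seeking is meaningless on an event queue, but should not
		 * error out either: succeed without moving f_pos */
		.llseek	= noop_llseek,
	};
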
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30a87b3dbcac..a4ed8380e98a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,6 +40,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
40 struct inode *inode = new_inode(sb); 40 struct inode *inode = new_inode(sb);
41 41
42 if (inode) { 42 if (inode) {
43 inode->i_ino = get_next_ino();
43 inode->i_mode = mode; 44 inode->i_mode = mode;
44 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
45 switch (mode & S_IFMT) { 46 switch (mode & S_IFMT) {
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 48d74c7391d1..85882f6ba5f7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
218 * filesystems can use it to hold additional state between get_block calls and 218 * filesystems can use it to hold additional state between get_block calls and
219 * dio_complete. 219 * dio_complete.
220 */ 220 */
221static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) 221static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
222{ 222{
223 ssize_t transferred = 0; 223 ssize_t transferred = 0;
224 224
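Widening dio_complete() from int to ssize_t matters because a single direct-I/O request on a 64-bit kernel can complete 2 GiB or more; funnelling the byte count through an int truncates it and can even turn a large success into a negative value that looks like an error. A userspace illustration of the truncation (assumes a 64-bit ssize_t):

	#include <stdio.h>
	#include <sys/types.h>

	int main(void)
	{
		ssize_t done = (ssize_t)3 << 30;  /* 3 GiB transferred */
		int narrowed = (int)done;         /* implementation-defined */

		/* on common ABIs: 3221225472 -> -1073741824 */
		printf("%zd -> %d\n", done, narrowed);
		return 0;
	}
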
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index c6cf25158746..6b42ba807dfd 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -643,7 +643,8 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
643static const struct file_operations waiters_fops = { 643static const struct file_operations waiters_fops = {
644 .owner = THIS_MODULE, 644 .owner = THIS_MODULE,
645 .open = waiters_open, 645 .open = waiters_open,
646 .read = waiters_read 646 .read = waiters_read,
647 .llseek = default_llseek,
647}; 648};
648 649
649void dlm_delete_debug_file(struct dlm_ls *ls) 650void dlm_delete_debug_file(struct dlm_ls *ls)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 031dbe3a15ca..64e5f3efdd81 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1846,6 +1846,9 @@ static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1846 struct dlm_lkb *gr; 1846 struct dlm_lkb *gr;
1847 1847
1848 list_for_each_entry(gr, head, lkb_statequeue) { 1848 list_for_each_entry(gr, head, lkb_statequeue) {
1849 /* skip self when sending basts to convertqueue */
1850 if (gr == lkb)
1851 continue;
1849 if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) { 1852 if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) {
1850 queue_bast(r, gr, lkb->lkb_rqmode); 1853 queue_bast(r, gr, lkb->lkb_rqmode);
1851 gr->lkb_highbast = lkb->lkb_rqmode; 1854 gr->lkb_highbast = lkb->lkb_rqmode;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d45c02db6943..30d8b85febbf 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -412,7 +412,8 @@ static const struct file_operations dev_fops = {
412 .read = dev_read, 412 .read = dev_read,
413 .write = dev_write, 413 .write = dev_write,
414 .poll = dev_poll, 414 .poll = dev_poll,
415 .owner = THIS_MODULE 415 .owner = THIS_MODULE,
416 .llseek = noop_llseek,
416}; 417};
417 418
418static struct miscdevice plock_dev_misc = { 419static struct miscdevice plock_dev_misc = {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b6272853130c..66d6c16bf440 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1009,6 +1009,7 @@ static const struct file_operations device_fops = {
1009 .write = device_write, 1009 .write = device_write,
1010 .poll = device_poll, 1010 .poll = device_poll,
1011 .owner = THIS_MODULE, 1011 .owner = THIS_MODULE,
1012 .llseek = noop_llseek,
1012}; 1013};
1013 1014
1014static const struct file_operations ctl_device_fops = { 1015static const struct file_operations ctl_device_fops = {
@@ -1017,6 +1018,7 @@ static const struct file_operations ctl_device_fops = {
1017 .read = device_read, 1018 .read = device_read,
1018 .write = device_write, 1019 .write = device_write,
1019 .owner = THIS_MODULE, 1020 .owner = THIS_MODULE,
1021 .llseek = noop_llseek,
1020}; 1022};
1021 1023
1022static struct miscdevice ctl_device = { 1024static struct miscdevice ctl_device = {
@@ -1029,6 +1031,7 @@ static const struct file_operations monitor_device_fops = {
1029 .open = monitor_device_open, 1031 .open = monitor_device_open,
1030 .release = monitor_device_close, 1032 .release = monitor_device_close,
1031 .owner = THIS_MODULE, 1033 .owner = THIS_MODULE,
1034 .llseek = noop_llseek,
1032}; 1035};
1033 1036
1034static struct miscdevice monitor_device = { 1037static struct miscdevice monitor_device = {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 0032a9f5a3a9..40186b959429 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -477,7 +477,7 @@ ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
477static inline struct ecryptfs_file_info * 477static inline struct ecryptfs_file_info *
478ecryptfs_file_to_private(struct file *file) 478ecryptfs_file_to_private(struct file *file)
479{ 479{
480 return (struct ecryptfs_file_info *)file->private_data; 480 return file->private_data;
481} 481}
482 482
483static inline void 483static inline void
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 622c95140802..91da02987bff 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,7 +31,6 @@
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/compat.h> 32#include <linux/compat.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/smp_lock.h>
35#include "ecryptfs_kernel.h" 34#include "ecryptfs_kernel.h"
36 35
37/** 36/**
@@ -284,11 +283,9 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
284 int rc = 0; 283 int rc = 0;
285 struct file *lower_file = NULL; 284 struct file *lower_file = NULL;
286 285
287 lock_kernel();
288 lower_file = ecryptfs_file_to_lower(file); 286 lower_file = ecryptfs_file_to_lower(file);
289 if (lower_file->f_op && lower_file->f_op->fasync) 287 if (lower_file->f_op && lower_file->f_op->fasync)
290 rc = lower_file->f_op->fasync(fd, lower_file, flag); 288 rc = lower_file->f_op->fasync(fd, lower_file, flag);
291 unlock_kernel();
292 return rc; 289 return rc;
293} 290}
294 291
@@ -332,6 +329,7 @@ const struct file_operations ecryptfs_dir_fops = {
332 .fsync = ecryptfs_fsync, 329 .fsync = ecryptfs_fsync,
333 .fasync = ecryptfs_fasync, 330 .fasync = ecryptfs_fasync,
334 .splice_read = generic_file_splice_read, 331 .splice_read = generic_file_splice_read,
332 .llseek = default_llseek,
335}; 333};
336 334
337const struct file_operations ecryptfs_main_fops = { 335const struct file_operations ecryptfs_main_fops = {
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 00208c3d7e92..940a82e63dc3 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -482,6 +482,7 @@ static const struct file_operations ecryptfs_miscdev_fops = {
482 .read = ecryptfs_miscdev_read, 482 .read = ecryptfs_miscdev_read,
483 .write = ecryptfs_miscdev_write, 483 .write = ecryptfs_miscdev_write,
484 .release = ecryptfs_miscdev_release, 484 .release = ecryptfs_miscdev_release,
485 .llseek = noop_llseek,
485}; 486};
486 487
487static struct miscdevice ecryptfs_miscdev = { 488static struct miscdevice ecryptfs_miscdev = {
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 6bd3f76fdf88..e0194b3e14d6 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -293,6 +293,7 @@ static const struct file_operations eventfd_fops = {
293 .poll = eventfd_poll, 293 .poll = eventfd_poll,
294 .read = eventfd_read, 294 .read = eventfd_read,
295 .write = eventfd_write, 295 .write = eventfd_write,
296 .llseek = noop_llseek,
296}; 297};
297 298
298/** 299/**
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3817149919cb..8cf07242067d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -77,9 +77,6 @@
77/* Maximum number of nesting allowed inside epoll sets */ 77/* Maximum number of nesting allowed inside epoll sets */
78#define EP_MAX_NESTS 4 78#define EP_MAX_NESTS 4
79 79
80/* Maximum msec timeout value storeable in a long int */
81#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
82
83#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 80#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
84 81
85#define EP_UNACTIVE_PTR ((void *) -1L) 82#define EP_UNACTIVE_PTR ((void *) -1L)
@@ -674,7 +671,8 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
674/* File callbacks that implement the eventpoll file behaviour */ 671/* File callbacks that implement the eventpoll file behaviour */
675static const struct file_operations eventpoll_fops = { 672static const struct file_operations eventpoll_fops = {
676 .release = ep_eventpoll_release, 673 .release = ep_eventpoll_release,
677 .poll = ep_eventpoll_poll 674 .poll = ep_eventpoll_poll,
675 .llseek = noop_llseek,
678}; 676};
679 677
680 /* Fast test to see if the file is an eventpoll file */ 678 /* Fast test to see if the file is an eventpoll file */
@@ -1116,18 +1114,22 @@ static int ep_send_events(struct eventpoll *ep,
1116static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1114static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1117 int maxevents, long timeout) 1115 int maxevents, long timeout)
1118{ 1116{
1119 int res, eavail; 1117 int res, eavail, timed_out = 0;
1120 unsigned long flags; 1118 unsigned long flags;
1121 long jtimeout; 1119 long slack;
1122 wait_queue_t wait; 1120 wait_queue_t wait;
1123 1121 struct timespec end_time;
1124 /* 1122 ktime_t expires, *to = NULL;
1125 * Calculate the timeout by checking for the "infinite" value (-1) 1123
1126 * and the overflow condition. The passed timeout is in milliseconds, 1124 if (timeout > 0) {
1127 * that why (t * HZ) / 1000. 1125 ktime_get_ts(&end_time);
1128 */ 1126 timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC);
1129 jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ? 1127 slack = select_estimate_accuracy(&end_time);
1130 MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000; 1128 to = &expires;
1129 *to = timespec_to_ktime(end_time);
1130 } else if (timeout == 0) {
1131 timed_out = 1;
1132 }
1131 1133
1132retry: 1134retry:
1133 spin_lock_irqsave(&ep->lock, flags); 1135 spin_lock_irqsave(&ep->lock, flags);
@@ -1149,7 +1151,7 @@ retry:
1149 * to TASK_INTERRUPTIBLE before doing the checks. 1151 * to TASK_INTERRUPTIBLE before doing the checks.
1150 */ 1152 */
1151 set_current_state(TASK_INTERRUPTIBLE); 1153 set_current_state(TASK_INTERRUPTIBLE);
1152 if (!list_empty(&ep->rdllist) || !jtimeout) 1154 if (!list_empty(&ep->rdllist) || timed_out)
1153 break; 1155 break;
1154 if (signal_pending(current)) { 1156 if (signal_pending(current)) {
1155 res = -EINTR; 1157 res = -EINTR;
@@ -1157,7 +1159,9 @@ retry:
1157 } 1159 }
1158 1160
1159 spin_unlock_irqrestore(&ep->lock, flags); 1161 spin_unlock_irqrestore(&ep->lock, flags);
1160 jtimeout = schedule_timeout(jtimeout); 1162 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
1163 timed_out = 1;
1164
1161 spin_lock_irqsave(&ep->lock, flags); 1165 spin_lock_irqsave(&ep->lock, flags);
1162 } 1166 }
1163 __remove_wait_queue(&ep->wq, &wait); 1167 __remove_wait_queue(&ep->wq, &wait);
@@ -1175,7 +1179,7 @@ retry:
1175 * more luck. 1179 * more luck.
1176 */ 1180 */
1177 if (!res && eavail && 1181 if (!res && eavail &&
1178 !(res = ep_send_events(ep, events, maxevents)) && jtimeout) 1182 !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1179 goto retry; 1183 goto retry;
1180 1184
1181 return res; 1185 return res;
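The epoll timeout now goes through hrtimers: the millisecond argument becomes an absolute ktime_t deadline plus an accuracy slack from select_estimate_accuracy(), which removes both the HZ-granularity rounding and the EP_MAX_MSTIMEO overflow clamp, and schedule_hrtimeout_range() returns 0 exactly when the deadline fired. A condensed sketch of the conversion, using the same era's timespec-based helpers:

	static int wait_with_ms_timeout(long timeout_ms)
	{
		struct timespec end_time;
		ktime_t expires, *to = NULL;
		long slack = 0;
		int timed_out = 0;

		if (timeout_ms > 0) {
			ktime_get_ts(&end_time);
			timespec_add_ns(&end_time,
					(u64)timeout_ms * NSEC_PER_MSEC);
			slack = select_estimate_accuracy(&end_time);
			expires = timespec_to_ktime(end_time);
			to = &expires;
		} else if (timeout_ms == 0) {
			timed_out = 1;	/* poll once, never sleep */
		}
		/* to == NULL (timeout_ms < 0) sleeps without bound */

		set_current_state(TASK_INTERRUPTIBLE);
		if (!timed_out &&
		    !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
			timed_out = 1;	/* deadline reached */
		__set_current_state(TASK_RUNNING);

		return timed_out;
	}
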
diff --git a/fs/exec.c b/fs/exec.c
index 828dd2461d6b..99d33a1371e9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -54,6 +54,7 @@
54#include <linux/fsnotify.h> 54#include <linux/fsnotify.h>
55#include <linux/fs_struct.h> 55#include <linux/fs_struct.h>
56#include <linux/pipe_fs_i.h> 56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h>
57 58
58#include <asm/uaccess.h> 59#include <asm/uaccess.h>
59#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
@@ -65,6 +66,12 @@ char core_pattern[CORENAME_MAX_SIZE] = "core";
65unsigned int core_pipe_limit; 66unsigned int core_pipe_limit;
66int suid_dumpable = 0; 67int suid_dumpable = 0;
67 68
69struct core_name {
70 char *corename;
71 int used, size;
72};
73static atomic_t call_count = ATOMIC_INIT(1);
74
68/* The maximal length of core_pattern is also specified in sysctl.c */ 75/* The maximal length of core_pattern is also specified in sysctl.c */
69 76
70static LIST_HEAD(formats); 77static LIST_HEAD(formats);
@@ -759,6 +766,10 @@ static int exec_mmap(struct mm_struct *mm)
759 tsk->mm = mm; 766 tsk->mm = mm;
760 tsk->active_mm = mm; 767 tsk->active_mm = mm;
761 activate_mm(active_mm, mm); 768 activate_mm(active_mm, mm);
769 if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
770 atomic_dec(&old_mm->oom_disable_count);
771 atomic_inc(&tsk->mm->oom_disable_count);
772 }
762 task_unlock(tsk); 773 task_unlock(tsk);
763 arch_pick_mmap_layout(mm); 774 arch_pick_mmap_layout(mm);
764 if (old_mm) { 775 if (old_mm) {
@@ -998,7 +1009,7 @@ int flush_old_exec(struct linux_binprm * bprm)
998 1009
999 bprm->mm = NULL; /* We're using it now */ 1010 bprm->mm = NULL; /* We're using it now */
1000 1011
1001 current->flags &= ~PF_RANDOMIZE; 1012 current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
1002 flush_thread(); 1013 flush_thread();
1003 current->personality &= ~bprm->per_clear; 1014 current->personality &= ~bprm->per_clear;
1004 1015
@@ -1078,14 +1089,14 @@ EXPORT_SYMBOL(setup_new_exec);
1078 */ 1089 */
1079int prepare_bprm_creds(struct linux_binprm *bprm) 1090int prepare_bprm_creds(struct linux_binprm *bprm)
1080{ 1091{
1081 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1092 if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1082 return -ERESTARTNOINTR; 1093 return -ERESTARTNOINTR;
1083 1094
1084 bprm->cred = prepare_exec_creds(); 1095 bprm->cred = prepare_exec_creds();
1085 if (likely(bprm->cred)) 1096 if (likely(bprm->cred))
1086 return 0; 1097 return 0;
1087 1098
1088 mutex_unlock(&current->cred_guard_mutex); 1099 mutex_unlock(&current->signal->cred_guard_mutex);
1089 return -ENOMEM; 1100 return -ENOMEM;
1090} 1101}
1091 1102
@@ -1093,7 +1104,7 @@ void free_bprm(struct linux_binprm *bprm)
1093{ 1104{
1094 free_arg_pages(bprm); 1105 free_arg_pages(bprm);
1095 if (bprm->cred) { 1106 if (bprm->cred) {
1096 mutex_unlock(&current->cred_guard_mutex); 1107 mutex_unlock(&current->signal->cred_guard_mutex);
1097 abort_creds(bprm->cred); 1108 abort_creds(bprm->cred);
1098 } 1109 }
1099 kfree(bprm); 1110 kfree(bprm);
@@ -1114,13 +1125,13 @@ void install_exec_creds(struct linux_binprm *bprm)
1114 * credentials; any time after this it may be unlocked. 1125 * credentials; any time after this it may be unlocked.
1115 */ 1126 */
1116 security_bprm_committed_creds(bprm); 1127 security_bprm_committed_creds(bprm);
1117 mutex_unlock(&current->cred_guard_mutex); 1128 mutex_unlock(&current->signal->cred_guard_mutex);
1118} 1129}
1119EXPORT_SYMBOL(install_exec_creds); 1130EXPORT_SYMBOL(install_exec_creds);
1120 1131
1121/* 1132/*
1122 * determine how safe it is to execute the proposed program 1133 * determine how safe it is to execute the proposed program
1123 * - the caller must hold current->cred_guard_mutex to protect against 1134 * - the caller must hold ->cred_guard_mutex to protect against
1124 * PTRACE_ATTACH 1135 * PTRACE_ATTACH
1125 */ 1136 */
1126int check_unsafe_exec(struct linux_binprm *bprm) 1137int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1401,7 +1412,6 @@ int do_execve(const char * filename,
1401 if (retval < 0) 1412 if (retval < 0)
1402 goto out; 1413 goto out;
1403 1414
1404 current->flags &= ~PF_KTHREAD;
1405 retval = search_binary_handler(bprm,regs); 1415 retval = search_binary_handler(bprm,regs);
1406 if (retval < 0) 1416 if (retval < 0)
1407 goto out; 1417 goto out;
@@ -1454,127 +1464,148 @@ void set_binfmt(struct linux_binfmt *new)
1454 1464
1455EXPORT_SYMBOL(set_binfmt); 1465EXPORT_SYMBOL(set_binfmt);
1456 1466
1467static int expand_corename(struct core_name *cn)
1468{
1469 char *old_corename = cn->corename;
1470
1471 cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
1472 cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
1473
1474 if (!cn->corename) {
1475 kfree(old_corename);
1476 return -ENOMEM;
1477 }
1478
1479 return 0;
1480}
1481
1482static int cn_printf(struct core_name *cn, const char *fmt, ...)
1483{
1484 char *cur;
1485 int need;
1486 int ret;
1487 va_list arg;
1488
1489 va_start(arg, fmt);
1490 need = vsnprintf(NULL, 0, fmt, arg);
1491 va_end(arg);
1492
1493 if (likely(need < cn->size - cn->used - 1))
1494 goto out_printf;
1495
1496 ret = expand_corename(cn);
1497 if (ret)
1498 goto expand_fail;
1499
1500out_printf:
1501 cur = cn->corename + cn->used;
1502 va_start(arg, fmt);
1503 vsnprintf(cur, need + 1, fmt, arg);
1504 va_end(arg);
1505 cn->used += need;
1506 return 0;
1507
1508expand_fail:
1509 return ret;
1510}
1511
1457/* format_corename will inspect the pattern parameter, and output a 1512/* format_corename will inspect the pattern parameter, and output a
1458 * name into corename, which must have space for at least 1513 * name into corename, which must have space for at least
1459 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. 1514 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
1460 */ 1515 */
1461static int format_corename(char *corename, long signr) 1516static int format_corename(struct core_name *cn, long signr)
1462{ 1517{
1463 const struct cred *cred = current_cred(); 1518 const struct cred *cred = current_cred();
1464 const char *pat_ptr = core_pattern; 1519 const char *pat_ptr = core_pattern;
1465 int ispipe = (*pat_ptr == '|'); 1520 int ispipe = (*pat_ptr == '|');
1466 char *out_ptr = corename;
1467 char *const out_end = corename + CORENAME_MAX_SIZE;
1468 int rc;
1469 int pid_in_pattern = 0; 1521 int pid_in_pattern = 0;
1522 int err = 0;
1523
1524 cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
1525 cn->corename = kmalloc(cn->size, GFP_KERNEL);
1526 cn->used = 0;
1527
1528 if (!cn->corename)
1529 return -ENOMEM;
1470 1530
1471 /* Repeat as long as we have more pattern to process and more output 1531 /* Repeat as long as we have more pattern to process and more output
1472 space */ 1532 space */
1473 while (*pat_ptr) { 1533 while (*pat_ptr) {
1474 if (*pat_ptr != '%') { 1534 if (*pat_ptr != '%') {
1475 if (out_ptr == out_end) 1535 if (*pat_ptr == 0)
1476 goto out; 1536 goto out;
1477 *out_ptr++ = *pat_ptr++; 1537 err = cn_printf(cn, "%c", *pat_ptr++);
1478 } else { 1538 } else {
1479 switch (*++pat_ptr) { 1539 switch (*++pat_ptr) {
1540 /* single % at the end, drop that */
1480 case 0: 1541 case 0:
1481 goto out; 1542 goto out;
1482 /* Double percent, output one percent */ 1543 /* Double percent, output one percent */
1483 case '%': 1544 case '%':
1484 if (out_ptr == out_end) 1545 err = cn_printf(cn, "%c", '%');
1485 goto out;
1486 *out_ptr++ = '%';
1487 break; 1546 break;
1488 /* pid */ 1547 /* pid */
1489 case 'p': 1548 case 'p':
1490 pid_in_pattern = 1; 1549 pid_in_pattern = 1;
1491 rc = snprintf(out_ptr, out_end - out_ptr, 1550 err = cn_printf(cn, "%d",
1492 "%d", task_tgid_vnr(current)); 1551 task_tgid_vnr(current));
1493 if (rc > out_end - out_ptr)
1494 goto out;
1495 out_ptr += rc;
1496 break; 1552 break;
1497 /* uid */ 1553 /* uid */
1498 case 'u': 1554 case 'u':
1499 rc = snprintf(out_ptr, out_end - out_ptr, 1555 err = cn_printf(cn, "%d", cred->uid);
1500 "%d", cred->uid);
1501 if (rc > out_end - out_ptr)
1502 goto out;
1503 out_ptr += rc;
1504 break; 1556 break;
1505 /* gid */ 1557 /* gid */
1506 case 'g': 1558 case 'g':
1507 rc = snprintf(out_ptr, out_end - out_ptr, 1559 err = cn_printf(cn, "%d", cred->gid);
1508 "%d", cred->gid);
1509 if (rc > out_end - out_ptr)
1510 goto out;
1511 out_ptr += rc;
1512 break; 1560 break;
1513 /* signal that caused the coredump */ 1561 /* signal that caused the coredump */
1514 case 's': 1562 case 's':
1515 rc = snprintf(out_ptr, out_end - out_ptr, 1563 err = cn_printf(cn, "%ld", signr);
1516 "%ld", signr);
1517 if (rc > out_end - out_ptr)
1518 goto out;
1519 out_ptr += rc;
1520 break; 1564 break;
1521 /* UNIX time of coredump */ 1565 /* UNIX time of coredump */
1522 case 't': { 1566 case 't': {
1523 struct timeval tv; 1567 struct timeval tv;
1524 do_gettimeofday(&tv); 1568 do_gettimeofday(&tv);
1525 rc = snprintf(out_ptr, out_end - out_ptr, 1569 err = cn_printf(cn, "%lu", tv.tv_sec);
1526 "%lu", tv.tv_sec);
1527 if (rc > out_end - out_ptr)
1528 goto out;
1529 out_ptr += rc;
1530 break; 1570 break;
1531 } 1571 }
1532 /* hostname */ 1572 /* hostname */
1533 case 'h': 1573 case 'h':
1534 down_read(&uts_sem); 1574 down_read(&uts_sem);
1535 rc = snprintf(out_ptr, out_end - out_ptr, 1575 err = cn_printf(cn, "%s",
1536 "%s", utsname()->nodename); 1576 utsname()->nodename);
1537 up_read(&uts_sem); 1577 up_read(&uts_sem);
1538 if (rc > out_end - out_ptr)
1539 goto out;
1540 out_ptr += rc;
1541 break; 1578 break;
1542 /* executable */ 1579 /* executable */
1543 case 'e': 1580 case 'e':
1544 rc = snprintf(out_ptr, out_end - out_ptr, 1581 err = cn_printf(cn, "%s", current->comm);
1545 "%s", current->comm);
1546 if (rc > out_end - out_ptr)
1547 goto out;
1548 out_ptr += rc;
1549 break; 1582 break;
1550 /* core limit size */ 1583 /* core limit size */
1551 case 'c': 1584 case 'c':
1552 rc = snprintf(out_ptr, out_end - out_ptr, 1585 err = cn_printf(cn, "%lu",
1553 "%lu", rlimit(RLIMIT_CORE)); 1586 rlimit(RLIMIT_CORE));
1554 if (rc > out_end - out_ptr)
1555 goto out;
1556 out_ptr += rc;
1557 break; 1587 break;
1558 default: 1588 default:
1559 break; 1589 break;
1560 } 1590 }
1561 ++pat_ptr; 1591 ++pat_ptr;
1562 } 1592 }
1593
1594 if (err)
1595 return err;
1563 } 1596 }
1597
1564 /* Backward compatibility with core_uses_pid: 1598 /* Backward compatibility with core_uses_pid:
1565 * 1599 *
1566 * If core_pattern does not include a %p (as is the default) 1600 * If core_pattern does not include a %p (as is the default)
1567 * and core_uses_pid is set, then .%pid will be appended to 1601 * and core_uses_pid is set, then .%pid will be appended to
1568 * the filename. Do not do this for piped commands. */ 1602 * the filename. Do not do this for piped commands. */
1569 if (!ispipe && !pid_in_pattern && core_uses_pid) { 1603 if (!ispipe && !pid_in_pattern && core_uses_pid) {
1570 rc = snprintf(out_ptr, out_end - out_ptr, 1604 err = cn_printf(cn, ".%d", task_tgid_vnr(current));
1571 ".%d", task_tgid_vnr(current)); 1605 if (err)
1572 if (rc > out_end - out_ptr) 1606 return err;
1573 goto out;
1574 out_ptr += rc;
1575 } 1607 }
1576out: 1608out:
1577 *out_ptr = 0;
1578 return ispipe; 1609 return ispipe;
1579} 1610}
1580 1611
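cn_printf() implements a measure-grow-print loop: vsnprintf(NULL, 0, ...) only reports how many bytes a format would need, expand_corename() krealloc()s the buffer to fit, and the second vsnprintf() writes for real, so core_pattern expansion no longer truncates silently at CORENAME_MAX_SIZE. A standalone sketch of the technique:

	struct grow_buf {
		char *buf;
		int used, size;
	};

	static int buf_printf(struct grow_buf *b, const char *fmt, ...)
	{
		va_list args;
		int need;

		va_start(args, fmt);
		need = vsnprintf(NULL, 0, fmt, args);	/* measure only */
		va_end(args);

		if (need >= b->size - b->used) {	/* room for NUL too? */
			char *bigger = krealloc(b->buf, b->used + need + 1,
						GFP_KERNEL);

			if (!bigger)
				return -ENOMEM;	/* old buffer still valid */
			b->buf = bigger;
			b->size = b->used + need + 1;
		}

		va_start(args, fmt);
		vsnprintf(b->buf + b->used, need + 1, fmt, args);
		va_end(args);
		b->used += need;
		return 0;
	}
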
@@ -1851,7 +1882,7 @@ static int umh_pipe_setup(struct subprocess_info *info)
1851void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1882void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1852{ 1883{
1853 struct core_state core_state; 1884 struct core_state core_state;
1854 char corename[CORENAME_MAX_SIZE + 1]; 1885 struct core_name cn;
1855 struct mm_struct *mm = current->mm; 1886 struct mm_struct *mm = current->mm;
1856 struct linux_binfmt * binfmt; 1887 struct linux_binfmt * binfmt;
1857 const struct cred *old_cred; 1888 const struct cred *old_cred;
@@ -1906,7 +1937,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1906 */ 1937 */
1907 clear_thread_flag(TIF_SIGPENDING); 1938 clear_thread_flag(TIF_SIGPENDING);
1908 1939
1909 ispipe = format_corename(corename, signr); 1940 ispipe = format_corename(&cn, signr);
1941
1942 if (ispipe == -ENOMEM) {
1943 printk(KERN_WARNING "format_corename failed\n");
1944 printk(KERN_WARNING "Aborting core\n");
1945 goto fail_corename;
1946 }
1910 1947
1911 if (ispipe) { 1948 if (ispipe) {
1912 int dump_count; 1949 int dump_count;
@@ -1943,7 +1980,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1943 goto fail_dropcount; 1980 goto fail_dropcount;
1944 } 1981 }
1945 1982
1946 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL); 1983 helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
1947 if (!helper_argv) { 1984 if (!helper_argv) {
1948 printk(KERN_WARNING "%s failed to allocate memory\n", 1985 printk(KERN_WARNING "%s failed to allocate memory\n",
1949 __func__); 1986 __func__);
@@ -1956,7 +1993,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1956 argv_free(helper_argv); 1993 argv_free(helper_argv);
1957 if (retval) { 1994 if (retval) {
1958 printk(KERN_INFO "Core dump to %s pipe failed\n", 1995 printk(KERN_INFO "Core dump to %s pipe failed\n",
1959 corename); 1996 cn.corename);
1960 goto close_fail; 1997 goto close_fail;
1961 } 1998 }
1962 } else { 1999 } else {
@@ -1965,7 +2002,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1965 if (cprm.limit < binfmt->min_coredump) 2002 if (cprm.limit < binfmt->min_coredump)
1966 goto fail_unlock; 2003 goto fail_unlock;
1967 2004
1968 cprm.file = filp_open(corename, 2005 cprm.file = filp_open(cn.corename,
1969 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 2006 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1970 0600); 2007 0600);
1971 if (IS_ERR(cprm.file)) 2008 if (IS_ERR(cprm.file))
@@ -2007,6 +2044,8 @@ fail_dropcount:
2007 if (ispipe) 2044 if (ispipe)
2008 atomic_dec(&core_dump_count); 2045 atomic_dec(&core_dump_count);
2009fail_unlock: 2046fail_unlock:
2047 kfree(cn.corename);
2048fail_corename:
2010 coredump_finish(mm); 2049 coredump_finish(mm);
2011 revert_creds(old_cred); 2050 revert_creds(old_cred);
2012fail_creds: 2051fail_creds:
@@ -2014,3 +2053,43 @@ fail_creds:
2014fail: 2053fail:
2015 return; 2054 return;
2016} 2055}
2056
2057/*
2058 * Core dumping helper functions. These are the only things you should
2059 * do on a core-file: use only these functions to write out all the
2060 * necessary info.
2061 */
2062int dump_write(struct file *file, const void *addr, int nr)
2063{
2064 return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
2065}
2066EXPORT_SYMBOL(dump_write);
2067
2068int dump_seek(struct file *file, loff_t off)
2069{
2070 int ret = 1;
2071
2072 if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
2073 if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
2074 return 0;
2075 } else {
2076 char *buf = (char *)get_zeroed_page(GFP_KERNEL);
2077
2078 if (!buf)
2079 return 0;
2080 while (off > 0) {
2081 unsigned long n = off;
2082
2083 if (n > PAGE_SIZE)
2084 n = PAGE_SIZE;
2085 if (!dump_write(file, buf, n)) {
2086 ret = 0;
2087 break;
2088 }
2089 off -= n;
2090 }
2091 free_page((unsigned long)buf);
2092 }
2093 return ret;
2094}
2095EXPORT_SYMBOL(dump_seek);
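
The do_coredump() hunks above switch from a fixed corename[] buffer to a heap-allocated struct core_name filled by cn_printf(), which can fail with -ENOMEM (hence the new fail_corename label and the kfree(cn.corename) in the unwind path). The helper itself is outside the shown hunks; a minimal sketch of how such a growable formatter can work — the expand_corename() name and the doubling policy are illustrative assumptions, not quoted from the patch:

#include <linux/kernel.h>       /* vsnprintf, va_list */
#include <linux/slab.h>         /* krealloc, kfree */

struct core_name {
        char *corename;
        int used, size;         /* size starts > 0, e.g. CORENAME_MAX_SIZE */
};

/* double the buffer; on failure free it and report -ENOMEM */
static int expand_corename(struct core_name *cn)
{
        char *old = cn->corename;

        cn->size *= 2;
        cn->corename = krealloc(old, cn->size, GFP_KERNEL);
        if (!cn->corename) {
                kfree(old);
                return -ENOMEM;
        }
        return 0;
}

/* append formatted output to cn->corename, growing it as needed */
static int cn_printf(struct core_name *cn, const char *fmt, ...)
{
        va_list arg;
        int need, err;

        va_start(arg, fmt);
        need = vsnprintf(NULL, 0, fmt, arg);    /* measure only */
        va_end(arg);

        while (need + 1 > cn->size - cn->used) {
                err = expand_corename(cn);
                if (err)
                        return err;
        }

        va_start(arg, fmt);
        vsnprintf(cn->corename + cn->used, need + 1, fmt, arg);
        va_end(arg);
        cn->used += need;
        return 0;
}

This matches the calling convention visible in format_corename(): 0 on success, a negative errno (surfacing as ispipe == -ENOMEM) when allocation fails.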
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index d91e9d829bc1..dcc941d82d67 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -420,7 +420,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
420 err = exofs_write_begin(NULL, page->mapping, pos, len, 420 err = exofs_write_begin(NULL, page->mapping, pos, len,
421 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 421 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
422 if (err) 422 if (err)
423 EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n", 423 EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
424 err); 424 err);
425 425
426 de->inode_no = cpu_to_le64(inode->i_ino); 426 de->inode_no = cpu_to_le64(inode->i_ino);
@@ -556,7 +556,7 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
556 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, 556 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
557 &page, NULL); 557 &page, NULL);
558 if (err) 558 if (err)
559 EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n", 559 EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
560 err); 560 err);
561 if (pde) 561 if (pde)
562 pde->rec_len = cpu_to_le16(to - from); 562 pde->rec_len = cpu_to_le16(to - from);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 68cb23e3bb98..b905c79b4f0a 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -46,10 +46,6 @@ static int exofs_file_fsync(struct file *filp, int datasync)
46{ 46{
47 int ret; 47 int ret;
48 struct inode *inode = filp->f_mapping->host; 48 struct inode *inode = filp->f_mapping->host;
49 struct writeback_control wbc = {
50 .sync_mode = WB_SYNC_ALL,
51 .nr_to_write = 0, /* metadata-only; caller takes care of data */
52 };
53 struct super_block *sb; 49 struct super_block *sb;
54 50
55 if (!(inode->i_state & I_DIRTY)) 51 if (!(inode->i_state & I_DIRTY))
@@ -57,7 +53,7 @@ static int exofs_file_fsync(struct file *filp, int datasync)
57 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 53 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
58 return 0; 54 return 0;
59 55
60 ret = sync_inode(inode, &wbc); 56 ret = sync_inode_metadata(inode, 1);
61 57
62 /* This is a good place to write the sb */ 58 /* This is a good place to write the sb */
 63 /* TODO: Schedule an sb-sync on create */ 59 /* TODO: Schedule an sb-sync on create */
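
Several conversions in this series (exofs here, ext2 below) replace an on-stack writeback_control with sync_inode_metadata(inode, 1). Presumably the new helper just packages the same metadata-only writeback in one place; a sketch under that assumption:

#include <linux/fs.h>
#include <linux/writeback.h>

/* write the inode's metadata, waiting for it if @wait is set */
int sync_inode_metadata(struct inode *inode, int wait)
{
        struct writeback_control wbc = {
                .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
                .nr_to_write = 0,       /* metadata only, no data pages */
        };

        return sync_inode(inode, &wbc);
}

The win is that filesystems stop open-coding the wbc — compare the deleted ext2_sync_inode() further down.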
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index eb7368ebd8cd..42685424817b 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -54,6 +54,9 @@ struct page_collect {
54 unsigned nr_pages; 54 unsigned nr_pages;
55 unsigned long length; 55 unsigned long length;
56 loff_t pg_first; /* keep 64bit also in 32-arches */ 56 loff_t pg_first; /* keep 64bit also in 32-arches */
57 bool read_4_write; /* This means two things: that the read is sync
58 * And the pages should not be unlocked.
59 */
57}; 60};
58 61
59static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 62static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -71,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
71 pcol->nr_pages = 0; 74 pcol->nr_pages = 0;
72 pcol->length = 0; 75 pcol->length = 0;
73 pcol->pg_first = -1; 76 pcol->pg_first = -1;
77 pcol->read_4_write = false;
74} 78}
75 79
76static void _pcol_reset(struct page_collect *pcol) 80static void _pcol_reset(struct page_collect *pcol)
@@ -181,7 +185,7 @@ static void update_write_page(struct page *page, int ret)
181/* Called at the end of reads, to optionally unlock pages and update their 185/* Called at the end of reads, to optionally unlock pages and update their
182 * status. 186 * status.
183 */ 187 */
184static int __readpages_done(struct page_collect *pcol, bool do_unlock) 188static int __readpages_done(struct page_collect *pcol)
185{ 189{
186 int i; 190 int i;
187 u64 resid; 191 u64 resid;
@@ -217,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
217 page_stat ? "bad_bytes" : "good_bytes"); 221 page_stat ? "bad_bytes" : "good_bytes");
218 222
219 ret = update_read_page(page, page_stat); 223 ret = update_read_page(page, page_stat);
220 if (do_unlock) 224 if (!pcol->read_4_write)
221 unlock_page(page); 225 unlock_page(page);
222 length += PAGE_SIZE; 226 length += PAGE_SIZE;
223 } 227 }
@@ -232,7 +236,7 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
232{ 236{
233 struct page_collect *pcol = p; 237 struct page_collect *pcol = p;
234 238
235 __readpages_done(pcol, true); 239 __readpages_done(pcol);
236 atomic_dec(&pcol->sbi->s_curr_pending); 240 atomic_dec(&pcol->sbi->s_curr_pending);
237 kfree(pcol); 241 kfree(pcol);
238} 242}
@@ -253,7 +257,7 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
253 } 257 }
254} 258}
255 259
256static int read_exec(struct page_collect *pcol, bool is_sync) 260static int read_exec(struct page_collect *pcol)
257{ 261{
258 struct exofs_i_info *oi = exofs_i(pcol->inode); 262 struct exofs_i_info *oi = exofs_i(pcol->inode);
259 struct exofs_io_state *ios = pcol->ios; 263 struct exofs_io_state *ios = pcol->ios;
@@ -263,17 +267,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
263 if (!pcol->pages) 267 if (!pcol->pages)
264 return 0; 268 return 0;
265 269
266 /* see comment in _readpage() about sync reads */
267 WARN_ON(is_sync && (pcol->nr_pages != 1));
268
269 ios->pages = pcol->pages; 270 ios->pages = pcol->pages;
270 ios->nr_pages = pcol->nr_pages; 271 ios->nr_pages = pcol->nr_pages;
271 ios->length = pcol->length; 272 ios->length = pcol->length;
272 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; 273 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
273 274
274 if (is_sync) { 275 if (pcol->read_4_write) {
275 exofs_oi_read(oi, pcol->ios); 276 exofs_oi_read(oi, pcol->ios);
276 return __readpages_done(pcol, false); 277 return __readpages_done(pcol);
277 } 278 }
278 279
279 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 280 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -299,7 +300,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
299 return 0; 300 return 0;
300 301
301err: 302err:
302 if (!is_sync) 303 if (!pcol->read_4_write)
303 _unlock_pcol_pages(pcol, ret, READ); 304 _unlock_pcol_pages(pcol, ret, READ);
304 305
305 pcol_free(pcol); 306 pcol_free(pcol);
@@ -347,11 +348,12 @@ static int readpage_strip(void *data, struct page *page)
347 if (PageError(page)) 348 if (PageError(page))
348 ClearPageError(page); 349 ClearPageError(page);
349 350
350 unlock_page(page); 351 if (!pcol->read_4_write)
352 unlock_page(page);
351 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 353 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
352 " splitting\n", inode->i_ino, page->index); 354 " splitting\n", inode->i_ino, page->index);
353 355
354 return read_exec(pcol, false); 356 return read_exec(pcol);
355 } 357 }
356 358
357try_again: 359try_again:
@@ -361,7 +363,7 @@ try_again:
361 } else if (unlikely((pcol->pg_first + pcol->nr_pages) != 363 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
362 page->index)) { 364 page->index)) {
363 /* Discontinuity detected, split the request */ 365 /* Discontinuity detected, split the request */
364 ret = read_exec(pcol, false); 366 ret = read_exec(pcol);
365 if (unlikely(ret)) 367 if (unlikely(ret))
366 goto fail; 368 goto fail;
367 goto try_again; 369 goto try_again;
@@ -386,7 +388,7 @@ try_again:
386 page, len, pcol->nr_pages, pcol->length); 388 page, len, pcol->nr_pages, pcol->length);
387 389
388 /* split the request, and start again with current page */ 390 /* split the request, and start again with current page */
389 ret = read_exec(pcol, false); 391 ret = read_exec(pcol);
390 if (unlikely(ret)) 392 if (unlikely(ret))
391 goto fail; 393 goto fail;
392 394
@@ -415,26 +417,24 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
415 return ret; 417 return ret;
416 } 418 }
417 419
418 return read_exec(&pcol, false); 420 return read_exec(&pcol);
419} 421}
420 422
421static int _readpage(struct page *page, bool is_sync) 423static int _readpage(struct page *page, bool read_4_write)
422{ 424{
423 struct page_collect pcol; 425 struct page_collect pcol;
424 int ret; 426 int ret;
425 427
426 _pcol_init(&pcol, 1, page->mapping->host); 428 _pcol_init(&pcol, 1, page->mapping->host);
427 429
428 /* readpage_strip might call read_exec(,is_sync==false) at several 430 pcol.read_4_write = read_4_write;
429 * places but not if we have a single page.
430 */
431 ret = readpage_strip(&pcol, page); 431 ret = readpage_strip(&pcol, page);
432 if (ret) { 432 if (ret) {
433 EXOFS_ERR("_readpage => %d\n", ret); 433 EXOFS_ERR("_readpage => %d\n", ret);
434 return ret; 434 return ret;
435 } 435 }
436 436
437 return read_exec(&pcol, is_sync); 437 return read_exec(&pcol);
438} 438}
439 439
440/* 440/*
@@ -505,7 +505,7 @@ static int write_exec(struct page_collect *pcol)
505 505
506 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 506 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
507 if (!pcol_copy) { 507 if (!pcol_copy) {
508 EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); 508 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
509 ret = -ENOMEM; 509 ret = -ENOMEM;
510 goto err; 510 goto err;
511 } 511 }
@@ -521,7 +521,7 @@ static int write_exec(struct page_collect *pcol)
521 521
522 ret = exofs_oi_write(oi, ios); 522 ret = exofs_oi_write(oi, ios);
523 if (unlikely(ret)) { 523 if (unlikely(ret)) {
524 EXOFS_ERR("write_exec: exofs_oi_write() Faild\n"); 524 EXOFS_ERR("write_exec: exofs_oi_write() Failed\n");
525 goto err; 525 goto err;
526 } 526 }
527 527
@@ -622,7 +622,7 @@ try_again:
622 /* split the request, next loop will start again */ 622 /* split the request, next loop will start again */
623 ret = write_exec(pcol); 623 ret = write_exec(pcol);
624 if (unlikely(ret)) { 624 if (unlikely(ret)) {
625 EXOFS_DBGMSG("write_exec faild => %d", ret); 625 EXOFS_DBGMSG("write_exec failed => %d", ret);
626 goto fail; 626 goto fail;
627 } 627 }
628 628
@@ -713,7 +713,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
713 ret = simple_write_begin(file, mapping, pos, len, flags, pagep, 713 ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
714 fsdata); 714 fsdata);
715 if (ret) { 715 if (ret) {
716 EXOFS_DBGMSG("simple_write_begin faild\n"); 716 EXOFS_DBGMSG("simple_write_begin failed\n");
717 goto out; 717 goto out;
718 } 718 }
719 719
@@ -726,7 +726,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
726 if (ret) { 726 if (ret) {
727 /*SetPageError was done by _readpage. Is it ok?*/ 727 /*SetPageError was done by _readpage. Is it ok?*/
728 unlock_page(page); 728 unlock_page(page);
729 EXOFS_DBGMSG("__readpage_filler faild\n"); 729 EXOFS_DBGMSG("__readpage_filler failed\n");
730 } 730 }
731 } 731 }
732out: 732out:
@@ -1030,6 +1030,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1030 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); 1030 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
1031 } 1031 }
1032 1032
1033 inode->i_mapping->backing_dev_info = sb->s_bdi;
1033 if (S_ISREG(inode->i_mode)) { 1034 if (S_ISREG(inode->i_mode)) {
1034 inode->i_op = &exofs_file_inode_operations; 1035 inode->i_op = &exofs_file_inode_operations;
1035 inode->i_fop = &exofs_file_operations; 1036 inode->i_fop = &exofs_file_operations;
@@ -1066,8 +1067,10 @@ bad_inode:
1066int __exofs_wait_obj_created(struct exofs_i_info *oi) 1067int __exofs_wait_obj_created(struct exofs_i_info *oi)
1067{ 1068{
1068 if (!obj_created(oi)) { 1069 if (!obj_created(oi)) {
1070 EXOFS_DBGMSG("!obj_created\n");
1069 BUG_ON(!obj_2bcreated(oi)); 1071 BUG_ON(!obj_2bcreated(oi));
1070 wait_event(oi->i_wq, obj_created(oi)); 1072 wait_event(oi->i_wq, obj_created(oi));
1073 EXOFS_DBGMSG("wait_event done\n");
1071 } 1074 }
1072 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; 1075 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1073} 1076}
@@ -1089,7 +1092,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
1089 atomic_dec(&sbi->s_curr_pending); 1092 atomic_dec(&sbi->s_curr_pending);
1090 1093
1091 if (unlikely(ret)) { 1094 if (unlikely(ret)) {
1092 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", 1095 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
1093 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); 1096 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
1094 /*TODO: When FS is corrupted creation can fail, object already 1097 /*TODO: When FS is corrupted creation can fail, object already
1095 * exist. Get rid of this asynchronous creation, if exist 1098 * exist. Get rid of this asynchronous creation, if exist
@@ -1101,7 +1104,6 @@ static void create_done(struct exofs_io_state *ios, void *p)
1101 1104
1102 set_obj_created(oi); 1105 set_obj_created(oi);
1103 1106
1104 atomic_dec(&inode->i_count);
1105 wake_up(&oi->i_wq); 1107 wake_up(&oi->i_wq);
1106} 1108}
1107 1109
@@ -1129,6 +1131,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1129 1131
1130 sbi = sb->s_fs_info; 1132 sbi = sb->s_fs_info;
1131 1133
1134 inode->i_mapping->backing_dev_info = sb->s_bdi;
1132 sb->s_dirt = 1; 1135 sb->s_dirt = 1;
1133 inode_init_owner(inode, dir, mode); 1136 inode_init_owner(inode, dir, mode);
1134 inode->i_ino = sbi->s_nextid++; 1137 inode->i_ino = sbi->s_nextid++;
@@ -1151,17 +1154,11 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1151 ios->obj.id = exofs_oi_objno(oi); 1154 ios->obj.id = exofs_oi_objno(oi);
1152 exofs_make_credential(oi->i_cred, &ios->obj); 1155 exofs_make_credential(oi->i_cred, &ios->obj);
1153 1156
1154 /* increment the refcount so that the inode will still be around when we
1155 * reach the callback
1156 */
1157 atomic_inc(&inode->i_count);
1158
1159 ios->done = create_done; 1157 ios->done = create_done;
1160 ios->private = inode; 1158 ios->private = inode;
1161 ios->cred = oi->i_cred; 1159 ios->cred = oi->i_cred;
1162 ret = exofs_sbi_create(ios); 1160 ret = exofs_sbi_create(ios);
1163 if (ret) { 1161 if (ret) {
1164 atomic_dec(&inode->i_count);
1165 exofs_put_io_state(ios); 1162 exofs_put_io_state(ios);
1166 return ERR_PTR(ret); 1163 return ERR_PTR(ret);
1167 } 1164 }
@@ -1209,7 +1206,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1209 1206
1210 args = kzalloc(sizeof(*args), GFP_KERNEL); 1207 args = kzalloc(sizeof(*args), GFP_KERNEL);
1211 if (!args) { 1208 if (!args) {
1212 EXOFS_DBGMSG("Faild kzalloc of args\n"); 1209 EXOFS_DBGMSG("Failed kzalloc of args\n");
1213 return -ENOMEM; 1210 return -ENOMEM;
1214 } 1211 }
1215 1212
@@ -1251,12 +1248,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1251 ios->out_attr_len = 1; 1248 ios->out_attr_len = 1;
1252 ios->out_attr = &attr; 1249 ios->out_attr = &attr;
1253 1250
1254 if (!obj_created(oi)) { 1251 wait_obj_created(oi);
1255 EXOFS_DBGMSG("!obj_created\n");
1256 BUG_ON(!obj_2bcreated(oi));
1257 wait_event(oi->i_wq, obj_created(oi));
1258 EXOFS_DBGMSG("wait_event done\n");
1259 }
1260 1252
1261 if (!do_sync) { 1253 if (!do_sync) {
1262 args->sbi = sbi; 1254 args->sbi = sbi;
@@ -1319,12 +1311,12 @@ void exofs_evict_inode(struct inode *inode)
1319 inode->i_size = 0; 1311 inode->i_size = 0;
1320 end_writeback(inode); 1312 end_writeback(inode);
1321 1313
1322 /* if we are deleting an obj that hasn't been created yet, wait */ 1314 /* if we are deleting an obj that hasn't been created yet, wait.
1323 if (!obj_created(oi)) { 1315 * This also makes sure that create_done cannot be called with an
1324 BUG_ON(!obj_2bcreated(oi)); 1316 * already evicted inode.
1325 wait_event(oi->i_wq, obj_created(oi)); 1317 */
1326 /* ignore the error attempt a remove anyway */ 1318 wait_obj_created(oi);
1327 } 1319 /* ignore the error, attempt a remove anyway */
1328 1320
1329 /* Now Remove the OSD objects */ 1321 /* Now Remove the OSD objects */
1330 ret = exofs_get_io_state(&sbi->layout, &ios); 1322 ret = exofs_get_io_state(&sbi->layout, &ios);
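
exofs_update_inode() and exofs_evict_inode() now share wait_obj_created(), which is not defined in the hunks shown. Judging from the open-coded waits it replaces, it is presumably a thin wrapper around the __exofs_wait_obj_created() helper above; a hedged sketch:

/* assumed wrapper; the real definition lives outside this diff */
static inline int wait_obj_created(struct exofs_i_info *oi)
{
        /* sleeps until the async OSD create completes; -EIO if the
         * inode went bad while we waited */
        return __exofs_wait_obj_created(oi);
}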
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 6550bf70e41d..f74a2ec027a6 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -55,7 +55,7 @@ int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
55 55
56 ret = osd_finalize_request(or, 0, cred, NULL); 56 ret = osd_finalize_request(or, 0, cred, NULL);
57 if (unlikely(ret)) { 57 if (unlikely(ret)) {
58 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); 58 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
59 goto out; 59 goto out;
60 } 60 }
61 61
@@ -79,7 +79,7 @@ int exofs_get_io_state(struct exofs_layout *layout,
79 */ 79 */
80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); 80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
81 if (unlikely(!ios)) { 81 if (unlikely(!ios)) {
82 EXOFS_DBGMSG("Faild kzalloc bytes=%d\n", 82 EXOFS_DBGMSG("Failed kzalloc bytes=%d\n",
83 exofs_io_state_size(layout->s_numdevs)); 83 exofs_io_state_size(layout->s_numdevs));
84 *pios = NULL; 84 *pios = NULL;
85 return -ENOMEM; 85 return -ENOMEM;
@@ -172,7 +172,7 @@ static int exofs_io_execute(struct exofs_io_state *ios)
172 172
173 ret = osd_finalize_request(or, 0, ios->cred, NULL); 173 ret = osd_finalize_request(or, 0, ios->cred, NULL);
174 if (unlikely(ret)) { 174 if (unlikely(ret)) {
175 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", 175 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n",
176 ret); 176 ret);
177 return ret; 177 return ret;
178 } 178 }
@@ -361,7 +361,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
361 361
362 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 362 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
363 if (unlikely(!per_dev->bio)) { 363 if (unlikely(!per_dev->bio)) {
364 EXOFS_DBGMSG("Faild to allocate BIO size=%u\n", 364 EXOFS_DBGMSG("Failed to allocate BIO size=%u\n",
365 bio_size); 365 bio_size);
366 return -ENOMEM; 366 return -ENOMEM;
367 } 367 }
@@ -564,7 +564,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
564 master_dev->bio->bi_max_vecs); 564 master_dev->bio->bi_max_vecs);
565 if (unlikely(!bio)) { 565 if (unlikely(!bio)) {
566 EXOFS_DBGMSG( 566 EXOFS_DBGMSG(
567 "Faild to allocate BIO size=%u\n", 567 "Failed to allocate BIO size=%u\n",
568 master_dev->bio->bi_max_vecs); 568 master_dev->bio->bi_max_vecs);
569 ret = -ENOMEM; 569 ret = -ENOMEM;
570 goto out; 570 goto out;
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b7dd0c236863..264e95d02830 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
153 153
154 inode->i_ctime = CURRENT_TIME; 154 inode->i_ctime = CURRENT_TIME;
155 inode_inc_link_count(inode); 155 inode_inc_link_count(inode);
156 atomic_inc(&inode->i_count); 156 ihold(inode);
157 157
158 return exofs_add_nondir(dentry, inode); 158 return exofs_add_nondir(dentry, inode);
159} 159}
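
This atomic_inc(&inode->i_count) -> ihold() conversion repeats below in ext2, ext3 and ext4. ihold() is the new sanctioned way to take an extra reference on an inode you already hold a reference to; a sketch of what it amounts to (the WARN_ON spells out the invariant and is our annotation, not a quote):

#include <linux/fs.h>

/* grab a further reference to an already-referenced inode */
void ihold(struct inode *inode)
{
        /* never the first reference: the count must go 1 -> 2 or higher */
        WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}

A named helper also gives the VFS a single place to change when i_count stops being a bare atomic_t.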
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e175949a63..51b304056f10 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -74,21 +74,20 @@ static struct dentry *
74find_disconnected_root(struct dentry *dentry) 74find_disconnected_root(struct dentry *dentry)
75{ 75{
76 dget(dentry); 76 dget(dentry);
77 spin_lock(&dentry->d_lock); 77 while (!IS_ROOT(dentry)) {
78 while (!IS_ROOT(dentry) && 78 struct dentry *parent = dget_parent(dentry);
79 (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) { 79
80 struct dentry *parent = dentry->d_parent; 80 if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
81 dget(parent); 81 dput(parent);
82 spin_unlock(&dentry->d_lock); 82 break;
83 }
84
83 dput(dentry); 85 dput(dentry);
84 dentry = parent; 86 dentry = parent;
85 spin_lock(&dentry->d_lock);
86 } 87 }
87 spin_unlock(&dentry->d_lock);
88 return dentry; 88 return dentry;
89} 89}
90 90
91
92/* 91/*
93 * Make sure target_dir is fully connected to the dentry tree. 92 * Make sure target_dir is fully connected to the dentry tree.
94 * 93 *
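
The rewritten loop drops the hand-rolled d_lock dance in favour of dget_parent(), which hands back a properly referenced parent even if a rename races with the walk. Conceptually (a sketch, not the exact VFS source):

#include <linux/dcache.h>

/* take a stable reference on the parent: the child's lock keeps
 * d_parent from changing while we pin it */
static struct dentry *dget_parent_sketch(struct dentry *dentry)
{
        struct dentry *parent;

        spin_lock(&dentry->d_lock);
        parent = dget(dentry->d_parent);
        spin_unlock(&dentry->d_lock);
        return parent;
}

Each iteration then owns a reference to exactly one dentry, so the single dput(dentry) per step keeps the counts balanced.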
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 764109886ec0..2709b34206ab 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -98,7 +98,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
98 if (IS_DIRSYNC(dir)) { 98 if (IS_DIRSYNC(dir)) {
99 err = write_one_page(page, 1); 99 err = write_one_page(page, 1);
100 if (!err) 100 if (!err)
101 err = ext2_sync_inode(dir); 101 err = sync_inode_metadata(dir, 1);
102 } else { 102 } else {
103 unlock_page(page); 103 unlock_page(page);
104 } 104 }
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 416daa62242c..6346a2acf326 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -120,7 +120,6 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
120extern struct inode *ext2_iget (struct super_block *, unsigned long); 120extern struct inode *ext2_iget (struct super_block *, unsigned long);
121extern int ext2_write_inode (struct inode *, struct writeback_control *); 121extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_evict_inode(struct inode *); 122extern void ext2_evict_inode(struct inode *);
123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 123extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
125extern int ext2_setattr (struct dentry *, struct iattr *); 124extern int ext2_setattr (struct dentry *, struct iattr *);
126extern void ext2_set_inode_flags(struct inode *inode); 125extern void ext2_set_inode_flags(struct inode *inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 940c96168868..40ad210a5049 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -458,7 +458,7 @@ failed_out:
 458 * the same format as ext2_get_branch() would do. We are calling it after 458 * the same format as ext2_get_branch() would do. We are calling it after
 459 * we had read the existing part of chain and partial points to the last 459 * we had read the existing part of chain and partial points to the last
 460 * triple of that (one with zero ->key). Upon the exit we have the same 460 * triple of that (one with zero ->key). Upon the exit we have the same
 461 * picture as after the successful ext2_get_block(), excpet that in one 461 * picture as after the successful ext2_get_block(), except that in one
 462 * place chain is disconnected - *branch->p is still zero (we did not 462 * place chain is disconnected - *branch->p is still zero (we did not
 463 * set the last link), but branch->key contains the number that should 463 * set the last link), but branch->key contains the number that should
 464 * be placed into *branch->p to fill that gap. 464 * be placed into *branch->p to fill that gap.
@@ -662,7 +662,7 @@ static int ext2_get_blocks(struct inode *inode,
 662 mutex_lock(&ei->truncate_mutex); 662 mutex_lock(&ei->truncate_mutex);
 663 /* 663 /*
 664 * If the indirect block is missing while we are reading 664 * If the indirect block is missing while we are reading
 665 * the chain(ext3_get_branch() returns -EAGAIN err), or 665 * the chain(ext2_get_branch() returns -EAGAIN err), or
 666 * if the chain has been changed after we grab the semaphore, 666 * if the chain has been changed after we grab the semaphore,
 667 * (either because another process truncated this branch, or 667 * (either because another process truncated this branch, or
 668 * another get_block allocated this branch) re-grab the chain to see if 668 * another get_block allocated this branch) re-grab the chain to see if
@@ -1203,7 +1203,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
1203 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1203 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1204 if (inode_needs_sync(inode)) { 1204 if (inode_needs_sync(inode)) {
1205 sync_mapping_buffers(inode->i_mapping); 1205 sync_mapping_buffers(inode->i_mapping);
1206 ext2_sync_inode (inode); 1206 sync_inode_metadata(inode, 1);
1207 } else { 1207 } else {
1208 mark_inode_dirty(inode); 1208 mark_inode_dirty(inode);
1209 } 1209 }
@@ -1523,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
1523 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1523 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1524} 1524}
1525 1525
1526int ext2_sync_inode(struct inode *inode)
1527{
1528 struct writeback_control wbc = {
1529 .sync_mode = WB_SYNC_ALL,
1530 .nr_to_write = 0, /* sys_fsync did this */
1531 };
1532 return sync_inode(inode, &wbc);
1533}
1534
1535int ext2_setattr(struct dentry *dentry, struct iattr *iattr) 1526int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1536{ 1527{
1537 struct inode *inode = dentry->d_inode; 1528 struct inode *inode = dentry->d_inode;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 71efb0e9a3f2..f8aecd2e3297 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -206,7 +206,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
206 206
207 inode->i_ctime = CURRENT_TIME_SEC; 207 inode->i_ctime = CURRENT_TIME_SEC;
208 inode_inc_link_count(inode); 208 inode_inc_link_count(inode);
209 atomic_inc(&inode->i_count); 209 ihold(inode);
210 210
211 err = ext2_add_link(dentry, inode); 211 err = ext2_add_link(dentry, inode);
212 if (!err) { 212 if (!err) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1ec602673ea8..0901320671da 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -747,15 +747,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
747 __le32 features; 747 __le32 features;
748 int err; 748 int err;
749 749
750 err = -ENOMEM;
750 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 751 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
751 if (!sbi) 752 if (!sbi)
752 return -ENOMEM; 753 goto failed_unlock;
753 754
754 sbi->s_blockgroup_lock = 755 sbi->s_blockgroup_lock =
755 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); 756 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
756 if (!sbi->s_blockgroup_lock) { 757 if (!sbi->s_blockgroup_lock) {
757 kfree(sbi); 758 kfree(sbi);
758 return -ENOMEM; 759 goto failed_unlock;
759 } 760 }
760 sb->s_fs_info = sbi; 761 sb->s_fs_info = sbi;
761 sbi->s_sb_block = sb_block; 762 sbi->s_sb_block = sb_block;
@@ -1107,6 +1108,7 @@ failed_sbi:
1107 sb->s_fs_info = NULL; 1108 sb->s_fs_info = NULL;
1108 kfree(sbi->s_blockgroup_lock); 1109 kfree(sbi->s_blockgroup_lock);
1109 kfree(sbi); 1110 kfree(sbi);
1111failed_unlock:
1110 return ret; 1112 return ret;
1111} 1113}
1112 1114
@@ -1219,9 +1221,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1219 } 1221 }
1220 1222
1221 es = sbi->s_es; 1223 es = sbi->s_es;
1222 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != 1224 if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
1223 (old_mount_opt & EXT2_MOUNT_XIP)) &&
1224 invalidate_inodes(sb)) {
1225 ext2_msg(sb, KERN_WARNING, "warning: refusing change of " 1225 ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
1226 "xip flag with busy inodes while remounting"); 1226 "xip flag with busy inodes while remounting");
1227 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; 1227 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
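
The remount check is reduced to an XOR: (new ^ old) has a bit set exactly where the two option words differ, so masking with EXT2_MOUNT_XIP catches the xip flag being turned either on or off. A standalone illustration (the flag value here is made up, not ext2's real one):

#include <stdio.h>

#define EXT2_MOUNT_XIP 0x010000UL       /* illustrative value only */

int main(void)
{
        unsigned long old_opt = 0x010004UL;     /* xip set before remount */
        unsigned long new_opt = 0x000004UL;     /* xip cleared on remount */

        /* XOR marks every differing bit; the mask isolates the one we care about */
        if ((new_opt ^ old_opt) & EXT2_MOUNT_XIP)
                printf("xip option toggled across remount\n");
        return 0;
}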
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 8c29ae15129e..f84700be3274 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -699,7 +699,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
699 EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; 699 EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
700 inode->i_ctime = CURRENT_TIME_SEC; 700 inode->i_ctime = CURRENT_TIME_SEC;
701 if (IS_SYNC(inode)) { 701 if (IS_SYNC(inode)) {
702 error = ext2_sync_inode (inode); 702 error = sync_inode_metadata(inode, 1);
703 /* In case sync failed due to ENOSPC the inode was actually 703 /* In case sync failed due to ENOSPC the inode was actually
704 * written (only some dirty data were not) so we just proceed 704 * written (only some dirty data were not) so we just proceed
705 * as if nothing happened and cleanup the unused block */ 705 * as if nothing happened and cleanup the unused block */
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d7e9f74dc3a6..09b13bb34c94 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,7 +90,6 @@ int ext3_sync_file(struct file *file, int datasync)
90 * storage 90 * storage
91 */ 91 */
92 if (needs_barrier) 92 if (needs_barrier)
93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
94 BLKDEV_IFL_WAIT);
95 return ret; 94 return ret;
96} 95}
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5e0faf4cda79..ad05353040a1 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1696,8 +1696,8 @@ static int ext3_journalled_writepage(struct page *page,
1696 * doesn't seem much point in redirtying the page here. 1696 * doesn't seem much point in redirtying the page here.
1697 */ 1697 */
1698 ClearPageChecked(page); 1698 ClearPageChecked(page);
1699 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 1699 ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
1700 ext3_get_block); 1700 ext3_get_block);
1701 if (ret != 0) { 1701 if (ret != 0) {
1702 ext3_journal_stop(handle); 1702 ext3_journal_stop(handle);
1703 goto out_unlock; 1703 goto out_unlock;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 2b35ddb70d65..bce9dce639b8 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2260,7 +2260,7 @@ retry:
2260 2260
2261 inode->i_ctime = CURRENT_TIME_SEC; 2261 inode->i_ctime = CURRENT_TIME_SEC;
2262 inc_nlink(inode); 2262 inc_nlink(inode);
2263 atomic_inc(&inode->i_count); 2263 ihold(inode);
2264 2264
2265 err = ext3_add_entry(handle, dentry, inode); 2265 err = ext3_add_entry(handle, dentry, inode);
2266 if (!err) { 2266 if (!err) {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5dbf4dba03c4..377768009106 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -411,9 +411,6 @@ static void ext3_put_super (struct super_block * sb)
411 int i, err; 411 int i, err;
412 412
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
415 lock_kernel();
416
417 ext3_xattr_put_super(sb); 414 ext3_xattr_put_super(sb);
418 err = journal_destroy(sbi->s_journal); 415 err = journal_destroy(sbi->s_journal);
419 sbi->s_journal = NULL; 416 sbi->s_journal = NULL;
@@ -462,8 +459,6 @@ static void ext3_put_super (struct super_block * sb)
462 sb->s_fs_info = NULL; 459 sb->s_fs_info = NULL;
463 kfree(sbi->s_blockgroup_lock); 460 kfree(sbi->s_blockgroup_lock);
464 kfree(sbi); 461 kfree(sbi);
465
466 unlock_kernel();
467} 462}
468 463
469static struct kmem_cache *ext3_inode_cachep; 464static struct kmem_cache *ext3_inode_cachep;
@@ -1627,8 +1622,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1627 sbi->s_resgid = EXT3_DEF_RESGID; 1622 sbi->s_resgid = EXT3_DEF_RESGID;
1628 sbi->s_sb_block = sb_block; 1623 sbi->s_sb_block = sb_block;
1629 1624
1630 unlock_kernel();
1631
1632 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); 1625 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
1633 if (!blocksize) { 1626 if (!blocksize) {
1634 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize"); 1627 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
@@ -1849,8 +1842,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1849 goto failed_mount; 1842 goto failed_mount;
1850 } 1843 }
1851 1844
1852 if (le32_to_cpu(es->s_blocks_count) > 1845 if (generic_check_addressable(sb->s_blocksize_bits,
1853 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1846 le32_to_cpu(es->s_blocks_count))) {
1854 ext3_msg(sb, KERN_ERR, 1847 ext3_msg(sb, KERN_ERR,
1855 "error: filesystem is too large to mount safely"); 1848 "error: filesystem is too large to mount safely");
1856 if (sizeof(sector_t) < 8) 1849 if (sizeof(sector_t) < 8)
@@ -2025,7 +2018,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2025 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": 2018 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
2026 "writeback"); 2019 "writeback");
2027 2020
2028 lock_kernel();
2029 return 0; 2021 return 0;
2030 2022
2031cantfind_ext3: 2023cantfind_ext3:
@@ -2055,7 +2047,6 @@ out_fail:
2055 sb->s_fs_info = NULL; 2047 sb->s_fs_info = NULL;
2056 kfree(sbi->s_blockgroup_lock); 2048 kfree(sbi->s_blockgroup_lock);
2057 kfree(sbi); 2049 kfree(sbi);
2058 lock_kernel();
2059 return ret; 2050 return ret;
2060} 2051}
2061 2052
@@ -2538,8 +2529,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2538 int i; 2529 int i;
2539#endif 2530#endif
2540 2531
2541 lock_kernel();
2542
2543 /* Store the original options */ 2532 /* Store the original options */
2544 lock_super(sb); 2533 lock_super(sb);
2545 old_sb_flags = sb->s_flags; 2534 old_sb_flags = sb->s_flags;
@@ -2648,7 +2637,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2648 kfree(old_opts.s_qf_names[i]); 2637 kfree(old_opts.s_qf_names[i]);
2649#endif 2638#endif
2650 unlock_super(sb); 2639 unlock_super(sb);
2651 unlock_kernel();
2652 2640
2653 if (enable_quota) 2641 if (enable_quota)
2654 dquot_resume(sb, -1); 2642 dquot_resume(sb, -1);
@@ -2669,7 +2657,6 @@ restore_opts:
2669 } 2657 }
2670#endif 2658#endif
2671 unlock_super(sb); 2659 unlock_super(sb);
2672 unlock_kernel();
2673 return err; 2660 return err;
2674} 2661}
2675 2662
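Both this ext3 hunk and the ext4 one further down replace an open-coded "filesystem too large" test with generic_check_addressable(). The helper presumably centralizes the two overflow checks the old code did by hand: that the last block fits in sector_t (512-byte units) and that the last page fits in pgoff_t. A sketch along those lines:

#include <linux/fs.h>
#include <linux/pagemap.h>

/* can num_blocks blocks of (1 << blocksize_bits) bytes be addressed? */
int generic_check_addressable(unsigned int blocksize_bits, u64 num_blocks)
{
        u64 last_fs_block = num_blocks - 1;
        u64 last_fs_page = last_fs_block >>
                                (PAGE_CACHE_SHIFT - blocksize_bits);

        if (unlikely(num_blocks == 0))
                return 0;

        if (blocksize_bits < 9 || blocksize_bits > PAGE_CACHE_SHIFT)
                return -EINVAL;

        /* -EFBIG mirrors the old ext3/ext4 failure mode */
        if (last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9) ||
            last_fs_page > (pgoff_t)(~0ULL))
                return -EFBIG;

        return 0;
}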
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546e..3f3ff5ee8f9d 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -128,10 +128,9 @@ int ext4_sync_file(struct file *file, int datasync)
128 (journal->j_fs_dev != journal->j_dev) && 128 (journal->j_fs_dev != journal->j_dev) &&
129 (journal->j_flags & JBD2_BARRIER)) 129 (journal->j_flags & JBD2_BARRIER))
130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
131 NULL, BLKDEV_IFL_WAIT); 131 NULL);
132 ret = jbd2_log_wait_commit(journal, commit_tid); 132 ret = jbd2_log_wait_commit(journal, commit_tid);
133 } else if (journal->j_flags & JBD2_BARRIER) 133 } else if (journal->j_flags & JBD2_BARRIER)
134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
135 BLKDEV_IFL_WAIT);
136 return ret; 135 return ret;
137} 136}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..49635ef236f8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1538,10 +1538,10 @@ static int do_journal_get_write_access(handle_t *handle,
1538 if (!buffer_mapped(bh) || buffer_freed(bh)) 1538 if (!buffer_mapped(bh) || buffer_freed(bh))
1539 return 0; 1539 return 0;
1540 /* 1540 /*
1541 * __block_prepare_write() could have dirtied some buffers. Clean 1541 * __block_write_begin() could have dirtied some buffers. Clean
1542 * the dirty bit as jbd2_journal_get_write_access() could complain 1542 * the dirty bit as jbd2_journal_get_write_access() could complain
1543 * otherwise about fs integrity issues. Setting of the dirty bit 1543 * otherwise about fs integrity issues. Setting of the dirty bit
1544 * by __block_prepare_write() isn't a real problem here as we clear 1544 * by __block_write_begin() isn't a real problem here as we clear
1545 * the bit before releasing a page lock and thus writeback cannot 1545 * the bit before releasing a page lock and thus writeback cannot
1546 * ever write the buffer. 1546 * ever write the buffer.
1547 */ 1547 */
@@ -2550,8 +2550,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2550 if (buffer_delay(bh)) 2550 if (buffer_delay(bh))
2551 return 0; /* Not sure this could or should happen */ 2551 return 0; /* Not sure this could or should happen */
2552 /* 2552 /*
2553 * XXX: __block_prepare_write() unmaps passed block, 2553 * XXX: __block_write_begin() unmaps passed block, is it OK?
2554 * is it OK?
2555 */ 2554 */
2556 ret = ext4_da_reserve_space(inode, iblock); 2555 ret = ext4_da_reserve_space(inode, iblock);
2557 if (ret) 2556 if (ret)
@@ -2583,7 +2582,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2583/* 2582/*
 2584 * This function is used as a standard get_block_t callback function 2583 * This function is used as a standard get_block_t callback function
2585 * when there is no desire to allocate any blocks. It is used as a 2584 * when there is no desire to allocate any blocks. It is used as a
2586 * callback function for block_prepare_write() and block_write_full_page(). 2585 * callback function for block_write_begin() and block_write_full_page().
2587 * These functions should only try to map a single block at a time. 2586 * These functions should only try to map a single block at a time.
2588 * 2587 *
2589 * Since this function doesn't do block allocations even if the caller 2588 * Since this function doesn't do block allocations even if the caller
@@ -2743,7 +2742,7 @@ static int ext4_writepage(struct page *page,
2743 * all are mapped and non delay. We don't want to 2742 * all are mapped and non delay. We don't want to
2744 * do block allocation here. 2743 * do block allocation here.
2745 */ 2744 */
2746 ret = block_prepare_write(page, 0, len, 2745 ret = __block_write_begin(page, 0, len,
2747 noalloc_get_block_write); 2746 noalloc_get_block_write);
2748 if (!ret) { 2747 if (!ret) {
2749 page_bufs = page_buffers(page); 2748 page_bufs = page_buffers(page);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce57..42f77b1dc72d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2373,6 +2373,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2373 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2373 printk(KERN_ERR "EXT4-fs: can't get new inode\n");
2374 goto err_freesgi; 2374 goto err_freesgi;
2375 } 2375 }
2376 sbi->s_buddy_cache->i_ino = get_next_ino();
2376 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2377 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2377 for (i = 0; i < ngroups; i++) { 2378 for (i = 0; i < ngroups; i++) {
2378 desc = ext4_get_group_desc(sb, i, NULL); 2379 desc = ext4_get_group_desc(sb, i, NULL);
@@ -2566,7 +2567,7 @@ static inline void ext4_issue_discard(struct super_block *sb,
2566 discard_block = block + ext4_group_first_block_no(sb, block_group); 2567 discard_block = block + ext4_group_first_block_no(sb, block_group);
2567 trace_ext4_discard_blocks(sb, 2568 trace_ext4_discard_blocks(sb,
2568 (unsigned long long) discard_block, count); 2569 (unsigned long long) discard_block, count);
2569 ret = sb_issue_discard(sb, discard_block, count); 2570 ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2570 if (ret == EOPNOTSUPP) { 2571 if (ret == EOPNOTSUPP) {
2571 ext4_warning(sb, "discard not supported, disabling"); 2572 ext4_warning(sb, "discard not supported, disabling");
2572 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); 2573 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
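
sbi->s_buddy_cache is an internal, never-hashed inode, so it now takes its inode number from get_next_ino() instead of relying on new_inode() to assign one; freevxfs's fake inodes below get the same treatment as part of making i_ino assignment explicit. A sketch of the allocator, assuming per-CPU batches so that most allocations avoid a shared atomic (the batch size is a guess):

#include <linux/percpu.h>
#include <linux/atomic.h>

#define LAST_INO_BATCH 1024     /* assumed batch size */

static DEFINE_PER_CPU(unsigned int, last_ino);

unsigned int get_next_ino(void)
{
        unsigned int *p = &get_cpu_var(last_ino);
        unsigned int res = *p;

        if ((res & (LAST_INO_BATCH - 1)) == 0) {
                /* this CPU's batch ran out: reserve another range */
                static atomic_t shared_last_ino;
                int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);

                res = next - LAST_INO_BATCH;
        }

        *p = ++res;
        put_cpu_var(last_ino);
        return res;
}

Numbers handed out this way are unique enough for in-kernel bookkeeping but not persistent, which is fine for objects that never touch disk.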
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 314c0d3b3fa9..bd39885b5998 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2312,7 +2312,7 @@ retry:
2312 2312
2313 inode->i_ctime = ext4_current_time(inode); 2313 inode->i_ctime = ext4_current_time(inode);
2314 ext4_inc_count(handle, inode); 2314 ext4_inc_count(handle, inode);
2315 atomic_inc(&inode->i_count); 2315 ihold(inode);
2316 2316
2317 err = ext4_add_entry(handle, dentry, inode); 2317 err = ext4_add_entry(handle, dentry, inode);
2318 if (!err) { 2318 if (!err) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 26147746c272..8ecc1e590303 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/blkdev.h> 27#include <linux/blkdev.h>
28#include <linux/parser.h> 28#include <linux/parser.h>
29#include <linux/smp_lock.h>
30#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 30#include <linux/exportfs.h>
32#include <linux/vfs.h> 31#include <linux/vfs.h>
@@ -708,7 +707,6 @@ static void ext4_put_super(struct super_block *sb)
708 destroy_workqueue(sbi->dio_unwritten_wq); 707 destroy_workqueue(sbi->dio_unwritten_wq);
709 708
710 lock_super(sb); 709 lock_super(sb);
711 lock_kernel();
712 if (sb->s_dirt) 710 if (sb->s_dirt)
713 ext4_commit_super(sb, 1); 711 ext4_commit_super(sb, 1);
714 712
@@ -775,7 +773,6 @@ static void ext4_put_super(struct super_block *sb)
775 * Now that we are completely done shutting down the 773 * Now that we are completely done shutting down the
776 * superblock, we need to actually destroy the kobject. 774 * superblock, we need to actually destroy the kobject.
777 */ 775 */
778 unlock_kernel();
779 unlock_super(sb); 776 unlock_super(sb);
780 kobject_put(&sbi->s_kobj); 777 kobject_put(&sbi->s_kobj);
781 wait_for_completion(&sbi->s_kobj_unregister); 778 wait_for_completion(&sbi->s_kobj_unregister);
@@ -2588,8 +2585,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2588 sbi->s_sectors_written_start = 2585 sbi->s_sectors_written_start =
2589 part_stat_read(sb->s_bdev->bd_part, sectors[1]); 2586 part_stat_read(sb->s_bdev->bd_part, sectors[1]);
2590 2587
2591 unlock_kernel();
2592
2593 /* Cleanup superblock name */ 2588 /* Cleanup superblock name */
2594 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 2589 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
2595 *cp = '!'; 2590 *cp = '!';
@@ -2831,15 +2826,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2831 * Test whether we have more sectors than will fit in sector_t, 2826 * Test whether we have more sectors than will fit in sector_t,
2832 * and whether the max offset is addressable by the page cache. 2827 * and whether the max offset is addressable by the page cache.
2833 */ 2828 */
2834 if ((ext4_blocks_count(es) > 2829 ret = generic_check_addressable(sb->s_blocksize_bits,
2835 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || 2830 ext4_blocks_count(es));
2836 (ext4_blocks_count(es) > 2831 if (ret) {
2837 (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
2838 ext4_msg(sb, KERN_ERR, "filesystem" 2832 ext4_msg(sb, KERN_ERR, "filesystem"
2839 " too large to mount safely on this system"); 2833 " too large to mount safely on this system");
2840 if (sizeof(sector_t) < 8) 2834 if (sizeof(sector_t) < 8)
2841 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 2835 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
2842 ret = -EFBIG;
2843 goto failed_mount; 2836 goto failed_mount;
2844 } 2837 }
2845 2838
@@ -3166,7 +3159,6 @@ no_journal:
3166 if (es->s_error_count) 3159 if (es->s_error_count)
3167 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ 3160 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3168 3161
3169 lock_kernel();
3170 kfree(orig_data); 3162 kfree(orig_data);
3171 return 0; 3163 return 0;
3172 3164
@@ -3213,7 +3205,6 @@ out_fail:
3213 sb->s_fs_info = NULL; 3205 sb->s_fs_info = NULL;
3214 kfree(sbi->s_blockgroup_lock); 3206 kfree(sbi->s_blockgroup_lock);
3215 kfree(sbi); 3207 kfree(sbi);
3216 lock_kernel();
3217out_free_orig: 3208out_free_orig:
3218 kfree(orig_data); 3209 kfree(orig_data);
3219 return ret; 3210 return ret;
@@ -3722,8 +3713,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3722#endif 3713#endif
3723 char *orig_data = kstrdup(data, GFP_KERNEL); 3714 char *orig_data = kstrdup(data, GFP_KERNEL);
3724 3715
3725 lock_kernel();
3726
3727 /* Store the original options */ 3716 /* Store the original options */
3728 lock_super(sb); 3717 lock_super(sb);
3729 old_sb_flags = sb->s_flags; 3718 old_sb_flags = sb->s_flags;
@@ -3858,7 +3847,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3858 kfree(old_opts.s_qf_names[i]); 3847 kfree(old_opts.s_qf_names[i]);
3859#endif 3848#endif
3860 unlock_super(sb); 3849 unlock_super(sb);
3861 unlock_kernel();
3862 if (enable_quota) 3850 if (enable_quota)
3863 dquot_resume(sb, -1); 3851 dquot_resume(sb, -1);
3864 3852
@@ -3884,7 +3872,6 @@ restore_opts:
3884 } 3872 }
3885#endif 3873#endif
3886 unlock_super(sb); 3874 unlock_super(sb);
3887 unlock_kernel();
3888 kfree(orig_data); 3875 kfree(orig_data);
3889 return err; 3876 return err;
3890} 3877}
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 81184d3b75a3..b47d2c9f4fa1 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -577,7 +577,8 @@ int fat_free_clusters(struct inode *inode, int cluster)
577 577
578 sb_issue_discard(sb, 578 sb_issue_discard(sb,
579 fat_clus_to_blknr(sbi, first_cl), 579 fat_clus_to_blknr(sbi, first_cl),
580 nr_clus * sbi->sec_per_clus); 580 nr_clus * sbi->sec_per_clus,
581 GFP_NOFS, 0);
581 582
582 first_cl = cluster; 583 first_cl = cluster;
583 } 584 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 830058057d33..ad6998a92c30 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -14,7 +14,6 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/smp_lock.h>
18#include <linux/seq_file.h> 17#include <linux/seq_file.h>
19#include <linux/pagemap.h> 18#include <linux/pagemap.h>
20#include <linux/mpage.h> 19#include <linux/mpage.h>
@@ -489,8 +488,6 @@ static void fat_put_super(struct super_block *sb)
489{ 488{
490 struct msdos_sb_info *sbi = MSDOS_SB(sb); 489 struct msdos_sb_info *sbi = MSDOS_SB(sb);
491 490
492 lock_kernel();
493
494 if (sb->s_dirt) 491 if (sb->s_dirt)
495 fat_write_super(sb); 492 fat_write_super(sb);
496 493
@@ -504,8 +501,6 @@ static void fat_put_super(struct super_block *sb)
504 501
505 sb->s_fs_info = NULL; 502 sb->s_fs_info = NULL;
506 kfree(sbi); 503 kfree(sbi);
507
508 unlock_kernel();
509} 504}
510 505
511static struct kmem_cache *fat_inode_cachep; 506static struct kmem_cache *fat_inode_cachep;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1736f2356388..970e682ea754 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -255,10 +255,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
255 255
256 for (i = 0; i < nr_bhs; i++) { 256 for (i = 0; i < nr_bhs; i++) {
257 wait_on_buffer(bhs[i]); 257 wait_on_buffer(bhs[i]);
258 if (buffer_eopnotsupp(bhs[i])) { 258 if (!err && !buffer_uptodate(bhs[i]))
259 clear_buffer_eopnotsupp(bhs[i]);
260 err = -EOPNOTSUPP;
261 } else if (!err && !buffer_uptodate(bhs[i]))
262 err = -EIO; 259 err = -EIO;
263 } 260 }
264 return err; 261 return err;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index bbc94ae4fd77..bbca5c186ae7 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -662,12 +662,16 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
662{ 662{
663 int res; 663 int res;
664 664
665 lock_super(sb);
665 res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0); 666 res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
666 if (res) 667 if (res) {
668 unlock_super(sb);
667 return res; 669 return res;
670 }
668 671
669 sb->s_flags |= MS_NOATIME; 672 sb->s_flags |= MS_NOATIME;
670 sb->s_root->d_op = &msdos_dentry_operations; 673 sb->s_root->d_op = &msdos_dentry_operations;
674 unlock_super(sb);
671 return 0; 675 return 0;
672} 676}
673 677
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6fcc7e71fbaa..6f0f6c9a0152 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1055,15 +1055,19 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
1055{ 1055{
1056 int res; 1056 int res;
1057 1057
1058 lock_super(sb);
1058 res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1); 1059 res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
1059 if (res) 1060 if (res) {
1061 unlock_super(sb);
1060 return res; 1062 return res;
1063 }
1061 1064
1062 if (MSDOS_SB(sb)->options.name_check != 's') 1065 if (MSDOS_SB(sb)->options.name_check != 's')
1063 sb->s_root->d_op = &vfat_ci_dentry_ops; 1066 sb->s_root->d_op = &vfat_ci_dentry_ops;
1064 else 1067 else
1065 sb->s_root->d_op = &vfat_dentry_ops; 1068 sb->s_root->d_op = &vfat_dentry_ops;
1066 1069
1070 unlock_super(sb);
1067 return 0; 1071 return 0;
1068} 1072}
1069 1073
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f8cc34f542c3..ecc8b3954ed6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -640,7 +640,7 @@ static void fasync_free_rcu(struct rcu_head *head)
640 * match the state "is the filp on a fasync list". 640 * match the state "is the filp on a fasync list".
641 * 641 *
642 */ 642 */
643static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) 643int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
644{ 644{
645 struct fasync_struct *fa, **fp; 645 struct fasync_struct *fa, **fp;
646 int result = 0; 646 int result = 0;
@@ -666,21 +666,31 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
666 return result; 666 return result;
667} 667}
668 668
669struct fasync_struct *fasync_alloc(void)
670{
671 return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
672}
673
669/* 674/*
670 * Add a fasync entry. Return negative on error, positive if 675 * NOTE! This can be used only for unused fasync entries:
671 * added, and zero if did nothing but change an existing one. 676 * entries that actually got inserted on the fasync list
677 * need to be released by rcu - see fasync_remove_entry.
678 */
679void fasync_free(struct fasync_struct *new)
680{
681 kmem_cache_free(fasync_cache, new);
682}
683
684/*
685 * Insert a new entry into the fasync list. Return the pointer to the
686 * old one if we didn't use the new one.
672 * 687 *
673 * NOTE! It is very important that the FASYNC flag always 688 * NOTE! It is very important that the FASYNC flag always
674 * match the state "is the filp on a fasync list". 689 * match the state "is the filp on a fasync list".
675 */ 690 */
676static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) 691struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
677{ 692{
678 struct fasync_struct *new, *fa, **fp; 693 struct fasync_struct *fa, **fp;
679 int result = 0;
680
681 new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
682 if (!new)
683 return -ENOMEM;
684 694
685 spin_lock(&filp->f_lock); 695 spin_lock(&filp->f_lock);
686 spin_lock(&fasync_lock); 696 spin_lock(&fasync_lock);
@@ -691,8 +701,6 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
691 spin_lock_irq(&fa->fa_lock); 701 spin_lock_irq(&fa->fa_lock);
692 fa->fa_fd = fd; 702 fa->fa_fd = fd;
693 spin_unlock_irq(&fa->fa_lock); 703 spin_unlock_irq(&fa->fa_lock);
694
695 kmem_cache_free(fasync_cache, new);
696 goto out; 704 goto out;
697 } 705 }
698 706
@@ -702,13 +710,39 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
702 new->fa_fd = fd; 710 new->fa_fd = fd;
703 new->fa_next = *fapp; 711 new->fa_next = *fapp;
704 rcu_assign_pointer(*fapp, new); 712 rcu_assign_pointer(*fapp, new);
705 result = 1;
706 filp->f_flags |= FASYNC; 713 filp->f_flags |= FASYNC;
707 714
708out: 715out:
709 spin_unlock(&fasync_lock); 716 spin_unlock(&fasync_lock);
710 spin_unlock(&filp->f_lock); 717 spin_unlock(&filp->f_lock);
711 return result; 718 return fa;
719}
720
721/*
722 * Add a fasync entry. Return negative on error, positive if
723 * added, and zero if did nothing but change an existing one.
724 */
725static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
726{
727 struct fasync_struct *new;
728
729 new = fasync_alloc();
730 if (!new)
731 return -ENOMEM;
732
733 /*
734 * fasync_insert_entry() returns the old (update) entry if
735 * it existed.
736 *
737 * So free the (unused) new entry and return 0 to let the
738 * caller know that we didn't add any new fasync entries.
739 */
740 if (fasync_insert_entry(fd, filp, fapp, new)) {
741 fasync_free(new);
742 return 0;
743 }
744
745 return 1;
712} 746}
713 747
714/* 748/*
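
fasync_add_entry() is now a trivial client of three exported building blocks: fasync_alloc(), fasync_insert_entry() and fasync_free(). The point of the split is that a caller holding its own lock can allocate outside it and insert inside it. A hypothetical caller showing the pattern (example_fasync and its locking comments are ours, not from the patch):

#include <linux/fs.h>

static int example_fasync(int fd, struct file *filp, int on,
                          struct fasync_struct **fapp)
{
        struct fasync_struct *new;

        if (!on)
                return fasync_remove_entry(filp, fapp);

        new = fasync_alloc();   /* GFP_KERNEL: may sleep, so done early */
        if (!new)
                return -ENOMEM;

        /* ... take the subsystem lock that protects *fapp here ... */
        if (fasync_insert_entry(fd, filp, fapp, new))
                fasync_free(new);       /* fd already had an entry; it was reused */
        /* ... drop the subsystem lock ... */

        return 0;
}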
diff --git a/fs/fifo.c b/fs/fifo.c
index 5d6606ffc2d2..4e303c22d5ee 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -151,4 +151,5 @@ err_nocleanup:
151 */ 151 */
152const struct file_operations def_fifo_fops = { 152const struct file_operations def_fifo_fops = {
153 .open = fifo_open, /* will set read_ or write_pipefifo_fops */ 153 .open = fifo_open, /* will set read_ or write_pipefifo_fops */
154 .llseek = noop_llseek,
154}; 155};
diff --git a/fs/file_table.c b/fs/file_table.c
index a04bdd81c11c..c3dee381f1b4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -60,7 +60,7 @@ static inline void file_free(struct file *f)
60/* 60/*
61 * Return the total number of open files in the system 61 * Return the total number of open files in the system
62 */ 62 */
63static int get_nr_files(void) 63static long get_nr_files(void)
64{ 64{
65 return percpu_counter_read_positive(&nr_files); 65 return percpu_counter_read_positive(&nr_files);
66} 66}
@@ -68,7 +68,7 @@ static int get_nr_files(void)
68/* 68/*
69 * Return the maximum number of open files in the system 69 * Return the maximum number of open files in the system
70 */ 70 */
71int get_max_files(void) 71unsigned long get_max_files(void)
72{ 72{
73 return files_stat.max_files; 73 return files_stat.max_files;
74} 74}
@@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write,
82 void __user *buffer, size_t *lenp, loff_t *ppos) 82 void __user *buffer, size_t *lenp, loff_t *ppos)
83{ 83{
84 files_stat.nr_files = get_nr_files(); 84 files_stat.nr_files = get_nr_files();
85 return proc_dointvec(table, write, buffer, lenp, ppos); 85 return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
86} 86}
87#else 87#else
88int proc_nr_files(ctl_table *table, int write, 88int proc_nr_files(ctl_table *table, int write,
@@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write,
105struct file *get_empty_filp(void) 105struct file *get_empty_filp(void)
106{ 106{
107 const struct cred *cred = current_cred(); 107 const struct cred *cred = current_cred();
108 static int old_max; 108 static long old_max;
109 struct file * f; 109 struct file * f;
110 110
111 /* 111 /*
@@ -140,8 +140,7 @@ struct file *get_empty_filp(void)
140over: 140over:
141 /* Ran out of filps - report that */ 141 /* Ran out of filps - report that */
142 if (get_nr_files() > old_max) { 142 if (get_nr_files() > old_max) {
143 printk(KERN_INFO "VFS: file-max limit %d reached\n", 143 pr_info("VFS: file-max limit %lu reached\n", get_max_files());
144 get_max_files());
145 old_max = get_nr_files(); 144 old_max = get_nr_files();
146 } 145 }
147 goto fail; 146 goto fail;
@@ -487,7 +486,7 @@ retry:
487 486
488void __init files_init(unsigned long mempages) 487void __init files_init(unsigned long mempages)
489{ 488{
490 int n; 489 unsigned long n;
491 490
492 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, 491 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
493 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 492 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages)
498 */ 497 */
499 498
500 n = (mempages * (PAGE_SIZE / 1024)) / 10; 499 n = (mempages * (PAGE_SIZE / 1024)) / 10;
501 files_stat.max_files = n; 500 files_stat.max_files = max_t(unsigned long, n, NR_FILE);
502 if (files_stat.max_files < NR_FILE)
503 files_stat.max_files = NR_FILE;
504 files_defer_init(); 501 files_defer_init();
505 lg_lock_init(files_lglock); 502 lg_lock_init(files_lglock);
506 percpu_counter_init(&nr_files, 0); 503 percpu_counter_init(&nr_files, 0);
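
The int -> long conversion exists because max_files is a memory-scaled heuristic (and a sysctl) that can be pushed past what an int holds on very large machines. For a feel of the default — roughly one file object per 10 KB of RAM — a standalone calculation:

#include <stdio.h>

int main(void)
{
        /* mirrors files_init(): n = mempages * (PAGE_SIZE / 1024) / 10 */
        unsigned long page_size = 4096;
        unsigned long mempages = 1048576UL;     /* 4 GiB in 4 KiB pages */
        unsigned long n = mempages * (page_size / 1024) / 10;

        printf("default max_files: %lu\n", n);  /* prints 419430 */
        return 0;
}

The max_t(unsigned long, n, NR_FILE) in the new code then clamps tiny results up to the compile-time floor.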
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 79d1b4ea13e7..8c04eac5079d 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -260,6 +260,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
260 struct inode *ip = NULL; 260 struct inode *ip = NULL;
261 261
262 if ((ip = new_inode(sbp))) { 262 if ((ip = new_inode(sbp))) {
263 ip->i_ino = get_next_ino();
263 vxfs_iinit(ip, vip); 264 vxfs_iinit(ip, vip);
264 ip->i_mapping->a_ops = &vxfs_aops; 265 ip->i_mapping->a_ops = &vxfs_aops;
265 } 266 }
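The single added line matters because new_inode() no longer hands out a default inode number in this series; code that fabricates internal/fake inodes must claim one explicitly with get_next_ino(). The resulting idiom, as the hunk leaves it:

	struct inode *ip = new_inode(sbp);

	if (ip) {
		ip->i_ino = get_next_ino();	/* new_inode() no longer fills this in */
		vxfs_iinit(ip, vip);
		ip->i_mapping->a_ops = &vxfs_aops;
	}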
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 0ec7bb2c95c6..6c5131d592f0 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -36,7 +36,6 @@
36#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/kernel.h> 37#include <linux/kernel.h>
38#include <linux/pagemap.h> 38#include <linux/pagemap.h>
39#include <linux/smp_lock.h>
40 39
41#include "vxfs.h" 40#include "vxfs.h"
42#include "vxfs_dir.h" 41#include "vxfs_dir.h"
@@ -212,16 +211,12 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd)
212 if (dp->d_name.len > VXFS_NAMELEN) 211 if (dp->d_name.len > VXFS_NAMELEN)
213 return ERR_PTR(-ENAMETOOLONG); 212 return ERR_PTR(-ENAMETOOLONG);
214 213
215 lock_kernel();
216 ino = vxfs_inode_by_name(dip, dp); 214 ino = vxfs_inode_by_name(dip, dp);
217 if (ino) { 215 if (ino) {
218 ip = vxfs_iget(dip->i_sb, ino); 216 ip = vxfs_iget(dip->i_sb, ino);
219 if (IS_ERR(ip)) { 217 if (IS_ERR(ip))
220 unlock_kernel();
221 return ERR_CAST(ip); 218 return ERR_CAST(ip);
222 }
223 } 219 }
224 unlock_kernel();
225 d_add(dp, ip); 220 d_add(dp, ip);
226 return NULL; 221 return NULL;
227} 222}
@@ -248,8 +243,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
248 u_long page, npages, block, pblocks, nblocks, offset; 243 u_long page, npages, block, pblocks, nblocks, offset;
249 loff_t pos; 244 loff_t pos;
250 245
251 lock_kernel();
252
253 switch ((long)fp->f_pos) { 246 switch ((long)fp->f_pos) {
254 case 0: 247 case 0:
255 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) 248 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
@@ -265,10 +258,8 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
265 258
266 pos = fp->f_pos - 2; 259 pos = fp->f_pos - 2;
267 260
268 if (pos > VXFS_DIRROUND(ip->i_size)) { 261 if (pos > VXFS_DIRROUND(ip->i_size))
269 unlock_kernel();
270 return 0; 262 return 0;
271 }
272 263
273 npages = dir_pages(ip); 264 npages = dir_pages(ip);
274 nblocks = dir_blocks(ip); 265 nblocks = dir_blocks(ip);
@@ -327,6 +318,5 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
327done: 318done:
328 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; 319 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
329out: 320out:
330 unlock_kernel();
331 return 0; 321 return 0;
332} 322}
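With the BKL calls gone, vxfs_lookup() reduces to straight-line code: vxfs_inode_by_name() and vxfs_iget() on this read-only filesystem need no outside serialization, so the error path no longer has to remember to unlock. For reference, a sketch of the function as it reads after the hunk (declarations abridged):

static struct dentry *
vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd)
{
	struct inode *ip = NULL;
	ino_t ino;

	if (dp->d_name.len > VXFS_NAMELEN)
		return ERR_PTR(-ENAMETOOLONG);

	ino = vxfs_inode_by_name(dip, dp);
	if (ino) {
		ip = vxfs_iget(dip->i_sb, ino);
		if (IS_ERR(ip))
			return ERR_CAST(ip);
	}
	d_add(dp, ip);
	return NULL;
}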
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index dc0c041e85cb..71b0148b8784 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -38,7 +38,6 @@
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/kernel.h> 39#include <linux/kernel.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/smp_lock.h>
42#include <linux/stat.h> 41#include <linux/stat.h>
43#include <linux/vfs.h> 42#include <linux/vfs.h>
44#include <linux/mount.h> 43#include <linux/mount.h>
@@ -81,16 +80,12 @@ vxfs_put_super(struct super_block *sbp)
81{ 80{
82 struct vxfs_sb_info *infp = VXFS_SBI(sbp); 81 struct vxfs_sb_info *infp = VXFS_SBI(sbp);
83 82
84 lock_kernel();
85
86 vxfs_put_fake_inode(infp->vsi_fship); 83 vxfs_put_fake_inode(infp->vsi_fship);
87 vxfs_put_fake_inode(infp->vsi_ilist); 84 vxfs_put_fake_inode(infp->vsi_ilist);
88 vxfs_put_fake_inode(infp->vsi_stilist); 85 vxfs_put_fake_inode(infp->vsi_stilist);
89 86
90 brelse(infp->vsi_bp); 87 brelse(infp->vsi_bp);
91 kfree(infp); 88 kfree(infp);
92
93 unlock_kernel();
94} 89}
95 90
96/** 91/**
@@ -148,7 +143,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
148 * The superblock on success, else %NULL. 143 * The superblock on success, else %NULL.
149 * 144 *
150 * Locking: 145 * Locking:
151 * We are under the bkl and @sbp->s_lock. 146 * We are under @sbp->s_lock.
152 */ 147 */
153static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) 148static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
154{ 149{
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 81e086d8aa57..aed881a76b22 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -52,8 +52,6 @@ struct wb_writeback_work {
52#define CREATE_TRACE_POINTS 52#define CREATE_TRACE_POINTS
53#include <trace/events/writeback.h> 53#include <trace/events/writeback.h>
54 54
55#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
56
57/* 55/*
58 * We don't actually have pdflush, but this one is exported through /proc... 56 * We don't actually have pdflush, but this one is exported through /proc...
59 */ 57 */
@@ -71,6 +69,21 @@ int writeback_in_progress(struct backing_dev_info *bdi)
71 return test_bit(BDI_writeback_running, &bdi->state); 69 return test_bit(BDI_writeback_running, &bdi->state);
72} 70}
73 71
72static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
73{
74 struct super_block *sb = inode->i_sb;
75
76 if (strcmp(sb->s_type->name, "bdev") == 0)
77 return inode->i_mapping->backing_dev_info;
78
79 return sb->s_bdi;
80}
81
82static inline struct inode *wb_inode(struct list_head *head)
83{
84 return list_entry(head, struct inode, i_wb_list);
85}
86
74static void bdi_queue_work(struct backing_dev_info *bdi, 87static void bdi_queue_work(struct backing_dev_info *bdi,
75 struct wb_writeback_work *work) 88 struct wb_writeback_work *work)
76{ 89{
@@ -164,11 +177,11 @@ static void redirty_tail(struct inode *inode)
164 if (!list_empty(&wb->b_dirty)) { 177 if (!list_empty(&wb->b_dirty)) {
165 struct inode *tail; 178 struct inode *tail;
166 179
167 tail = list_entry(wb->b_dirty.next, struct inode, i_list); 180 tail = wb_inode(wb->b_dirty.next);
168 if (time_before(inode->dirtied_when, tail->dirtied_when)) 181 if (time_before(inode->dirtied_when, tail->dirtied_when))
169 inode->dirtied_when = jiffies; 182 inode->dirtied_when = jiffies;
170 } 183 }
171 list_move(&inode->i_list, &wb->b_dirty); 184 list_move(&inode->i_wb_list, &wb->b_dirty);
172} 185}
173 186
174/* 187/*
@@ -178,7 +191,7 @@ static void requeue_io(struct inode *inode)
178{ 191{
179 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 192 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
180 193
181 list_move(&inode->i_list, &wb->b_more_io); 194 list_move(&inode->i_wb_list, &wb->b_more_io);
182} 195}
183 196
184static void inode_sync_complete(struct inode *inode) 197static void inode_sync_complete(struct inode *inode)
@@ -219,14 +232,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
219 int do_sb_sort = 0; 232 int do_sb_sort = 0;
220 233
221 while (!list_empty(delaying_queue)) { 234 while (!list_empty(delaying_queue)) {
222 inode = list_entry(delaying_queue->prev, struct inode, i_list); 235 inode = wb_inode(delaying_queue->prev);
223 if (older_than_this && 236 if (older_than_this &&
224 inode_dirtied_after(inode, *older_than_this)) 237 inode_dirtied_after(inode, *older_than_this))
225 break; 238 break;
226 if (sb && sb != inode->i_sb) 239 if (sb && sb != inode->i_sb)
227 do_sb_sort = 1; 240 do_sb_sort = 1;
228 sb = inode->i_sb; 241 sb = inode->i_sb;
229 list_move(&inode->i_list, &tmp); 242 list_move(&inode->i_wb_list, &tmp);
230 } 243 }
231 244
232 /* just one sb in list, splice to dispatch_queue and we're done */ 245 /* just one sb in list, splice to dispatch_queue and we're done */
@@ -237,12 +250,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
237 250
238 /* Move inodes from one superblock together */ 251 /* Move inodes from one superblock together */
239 while (!list_empty(&tmp)) { 252 while (!list_empty(&tmp)) {
240 inode = list_entry(tmp.prev, struct inode, i_list); 253 sb = wb_inode(tmp.prev)->i_sb;
241 sb = inode->i_sb;
242 list_for_each_prev_safe(pos, node, &tmp) { 254 list_for_each_prev_safe(pos, node, &tmp) {
243 inode = list_entry(pos, struct inode, i_list); 255 inode = wb_inode(pos);
244 if (inode->i_sb == sb) 256 if (inode->i_sb == sb)
245 list_move(&inode->i_list, dispatch_queue); 257 list_move(&inode->i_wb_list, dispatch_queue);
246 } 258 }
247 } 259 }
248} 260}
@@ -400,16 +412,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
400 * completion. 412 * completion.
401 */ 413 */
402 redirty_tail(inode); 414 redirty_tail(inode);
403 } else if (atomic_read(&inode->i_count)) {
404 /*
405 * The inode is clean, inuse
406 */
407 list_move(&inode->i_list, &inode_in_use);
408 } else { 415 } else {
409 /* 416 /*
410 * The inode is clean, unused 417 * The inode is clean. At this point we either have
418 * a reference to the inode or it's on its way out.
419 * No need to add it back to the LRU.
411 */ 420 */
412 list_move(&inode->i_list, &inode_unused); 421 list_del_init(&inode->i_wb_list);
413 } 422 }
414 } 423 }
415 inode_sync_complete(inode); 424 inode_sync_complete(inode);
@@ -457,8 +466,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
457{ 466{
458 while (!list_empty(&wb->b_io)) { 467 while (!list_empty(&wb->b_io)) {
459 long pages_skipped; 468 long pages_skipped;
460 struct inode *inode = list_entry(wb->b_io.prev, 469 struct inode *inode = wb_inode(wb->b_io.prev);
461 struct inode, i_list);
462 470
463 if (inode->i_sb != sb) { 471 if (inode->i_sb != sb) {
464 if (only_this_sb) { 472 if (only_this_sb) {
@@ -479,10 +487,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
479 return 0; 487 return 0;
480 } 488 }
481 489
482 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 490 /*
491 * Don't bother with new inodes or inodes being freed; the first
492 * kind does not need periodic writeout yet, and for the latter
493 * kind writeout is handled by the freer.
494 */
495 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
483 requeue_io(inode); 496 requeue_io(inode);
484 continue; 497 continue;
485 } 498 }
499
486 /* 500 /*
487 * Was this inode dirtied after sync_sb_inodes was called? 501 * Was this inode dirtied after sync_sb_inodes was called?
488 * This keeps sync from extra jobs and livelock. 502 * This keeps sync from extra jobs and livelock.
@@ -490,7 +504,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
490 if (inode_dirtied_after(inode, wbc->wb_start)) 504 if (inode_dirtied_after(inode, wbc->wb_start))
491 return 1; 505 return 1;
492 506
493 BUG_ON(inode->i_state & I_FREEING);
494 __iget(inode); 507 __iget(inode);
495 pages_skipped = wbc->pages_skipped; 508 pages_skipped = wbc->pages_skipped;
496 writeback_single_inode(inode, wbc); 509 writeback_single_inode(inode, wbc);
@@ -528,8 +541,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
528 queue_io(wb, wbc->older_than_this); 541 queue_io(wb, wbc->older_than_this);
529 542
530 while (!list_empty(&wb->b_io)) { 543 while (!list_empty(&wb->b_io)) {
531 struct inode *inode = list_entry(wb->b_io.prev, 544 struct inode *inode = wb_inode(wb->b_io.prev);
532 struct inode, i_list);
533 struct super_block *sb = inode->i_sb; 545 struct super_block *sb = inode->i_sb;
534 546
535 if (!pin_sb_for_writeback(sb)) { 547 if (!pin_sb_for_writeback(sb)) {
@@ -574,7 +586,7 @@ static inline bool over_bground_thresh(void)
574 global_dirty_limits(&background_thresh, &dirty_thresh); 586 global_dirty_limits(&background_thresh, &dirty_thresh);
575 587
576 return (global_page_state(NR_FILE_DIRTY) + 588 return (global_page_state(NR_FILE_DIRTY) +
577 global_page_state(NR_UNSTABLE_NFS) >= background_thresh); 589 global_page_state(NR_UNSTABLE_NFS) > background_thresh);
578} 590}
579 591
580/* 592/*
@@ -667,8 +679,7 @@ static long wb_writeback(struct bdi_writeback *wb,
667 */ 679 */
668 spin_lock(&inode_lock); 680 spin_lock(&inode_lock);
669 if (!list_empty(&wb->b_more_io)) { 681 if (!list_empty(&wb->b_more_io)) {
670 inode = list_entry(wb->b_more_io.prev, 682 inode = wb_inode(wb->b_more_io.prev);
671 struct inode, i_list);
672 trace_wbc_writeback_wait(&wbc, wb->bdi); 683 trace_wbc_writeback_wait(&wbc, wb->bdi);
673 inode_wait_for_writeback(inode); 684 inode_wait_for_writeback(inode);
674 } 685 }
@@ -713,9 +724,13 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
713 return 0; 724 return 0;
714 725
715 wb->last_old_flush = jiffies; 726 wb->last_old_flush = jiffies;
727 /*
728 * Add in the number of potentially dirty inodes, because each inode
729 * write can dirty pagecache in the underlying blockdev.
730 */
716 nr_pages = global_page_state(NR_FILE_DIRTY) + 731 nr_pages = global_page_state(NR_FILE_DIRTY) +
717 global_page_state(NR_UNSTABLE_NFS) + 732 global_page_state(NR_UNSTABLE_NFS) +
718 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 733 get_nr_dirty_inodes();
719 734
720 if (nr_pages) { 735 if (nr_pages) {
721 struct wb_writeback_work work = { 736 struct wb_writeback_work work = {
@@ -782,7 +797,7 @@ int bdi_writeback_thread(void *data)
782 struct backing_dev_info *bdi = wb->bdi; 797 struct backing_dev_info *bdi = wb->bdi;
783 long pages_written; 798 long pages_written;
784 799
785 current->flags |= PF_FLUSHER | PF_SWAPWRITE; 800 current->flags |= PF_SWAPWRITE;
786 set_freezable(); 801 set_freezable();
787 wb->last_active = jiffies; 802 wb->last_active = jiffies;
788 803
@@ -954,7 +969,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
954 * dirty list. Add blockdev inodes as well. 969 * dirty list. Add blockdev inodes as well.
955 */ 970 */
956 if (!S_ISBLK(inode->i_mode)) { 971 if (!S_ISBLK(inode->i_mode)) {
957 if (hlist_unhashed(&inode->i_hash)) 972 if (inode_unhashed(inode))
958 goto out; 973 goto out;
959 } 974 }
960 if (inode->i_state & I_FREEING) 975 if (inode->i_state & I_FREEING)
@@ -982,7 +997,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
982 } 997 }
983 998
984 inode->dirtied_when = jiffies; 999 inode->dirtied_when = jiffies;
985 list_move(&inode->i_list, &bdi->wb.b_dirty); 1000 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
986 } 1001 }
987 } 1002 }
988out: 1003out:
@@ -1082,8 +1097,7 @@ void writeback_inodes_sb(struct super_block *sb)
1082 1097
1083 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1098 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1084 1099
1085 work.nr_pages = nr_dirty + nr_unstable + 1100 work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes();
1086 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1087 1101
1088 bdi_queue_work(sb->s_bdi, &work); 1102 bdi_queue_work(sb->s_bdi, &work);
1089 wait_for_completion(&done); 1103 wait_for_completion(&done);
@@ -1190,3 +1204,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1190 return ret; 1204 return ret;
1191} 1205}
1192EXPORT_SYMBOL(sync_inode); 1206EXPORT_SYMBOL(sync_inode);
1207
1208/**
1209 * sync_inode_metadata - write an inode to disk
1210 * @inode: the inode to sync
1211 * @wait: wait for I/O to complete.
1212 *
1213 * Write an inode to disk and adjust its dirty state after completion.
1214 *
1215 * Note: only writes the actual inode, no associated data or other metadata.
1216 */
1217int sync_inode_metadata(struct inode *inode, int wait)
1218{
1219 struct writeback_control wbc = {
1220 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
1221 .nr_to_write = 0, /* metadata-only */
1222 };
1223
1224 return sync_inode(inode, &wbc);
1225}
1226EXPORT_SYMBOL(sync_inode_metadata);
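The new helper gives filesystems a one-liner for "flush the inode itself, nothing else": nr_to_write = 0 keeps data pages out of it, and the wait flag selects WB_SYNC_ALL versus WB_SYNC_NONE. A hypothetical caller (example_fsync_tail is illustrative; only sync_inode_metadata() comes from the patch):

/* Finish an fsync once the data pages are already on disk, pushing
 * the dinode out synchronously. */
static int example_fsync_tail(struct inode *inode)
{
	return sync_inode_metadata(inode, 1);	/* 1 => WB_SYNC_ALL */
}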
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 3773fd63d2f9..4eba07661e5c 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -179,23 +179,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
179static const struct file_operations fuse_ctl_abort_ops = { 179static const struct file_operations fuse_ctl_abort_ops = {
180 .open = nonseekable_open, 180 .open = nonseekable_open,
181 .write = fuse_conn_abort_write, 181 .write = fuse_conn_abort_write,
182 .llseek = no_llseek,
182}; 183};
183 184
184static const struct file_operations fuse_ctl_waiting_ops = { 185static const struct file_operations fuse_ctl_waiting_ops = {
185 .open = nonseekable_open, 186 .open = nonseekable_open,
186 .read = fuse_conn_waiting_read, 187 .read = fuse_conn_waiting_read,
188 .llseek = no_llseek,
187}; 189};
188 190
189static const struct file_operations fuse_conn_max_background_ops = { 191static const struct file_operations fuse_conn_max_background_ops = {
190 .open = nonseekable_open, 192 .open = nonseekable_open,
191 .read = fuse_conn_max_background_read, 193 .read = fuse_conn_max_background_read,
192 .write = fuse_conn_max_background_write, 194 .write = fuse_conn_max_background_write,
195 .llseek = no_llseek,
193}; 196};
194 197
195static const struct file_operations fuse_conn_congestion_threshold_ops = { 198static const struct file_operations fuse_conn_congestion_threshold_ops = {
196 .open = nonseekable_open, 199 .open = nonseekable_open,
197 .read = fuse_conn_congestion_threshold_read, 200 .read = fuse_conn_congestion_threshold_read,
198 .write = fuse_conn_congestion_threshold_write, 201 .write = fuse_conn_congestion_threshold_write,
202 .llseek = no_llseek,
199}; 203};
200 204
201static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, 205static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
@@ -218,6 +222,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
218 if (!inode) 222 if (!inode)
219 return NULL; 223 return NULL;
220 224
225 inode->i_ino = get_next_ino();
221 inode->i_mode = mode; 226 inode->i_mode = mode;
222 inode->i_uid = fc->user_id; 227 inode->i_uid = fc->user_id;
223 inode->i_gid = fc->group_id; 228 inode->i_gid = fc->group_id;
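These hunks are part of the tree-wide llseek sweep: every file_operations must now state explicitly what seeking means for it. no_llseek fails lseek() with -ESPIPE, the right answer for these transient control files, while noop_llseek (used for CUSE just below) accepts the seek without moving anything. The shape, with a hypothetical read handler:

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t len, loff_t *ppos);

static const struct file_operations example_ctl_ops = {
	.open	= nonseekable_open,	/* marks the file non-seekable */
	.read	= example_read,
	.llseek	= no_llseek,		/* lseek() returns -ESPIPE */
};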
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e1f8171278bd..3e87cce5837d 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -182,6 +182,7 @@ static const struct file_operations cuse_frontend_fops = {
182 .unlocked_ioctl = cuse_file_ioctl, 182 .unlocked_ioctl = cuse_file_ioctl,
183 .compat_ioctl = cuse_file_compat_ioctl, 183 .compat_ioctl = cuse_file_compat_ioctl,
184 .poll = fuse_file_poll, 184 .poll = fuse_file_poll,
185 .llseek = noop_llseek,
185}; 186};
186 187
187 188
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index d367af1514ef..6e07696308dc 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -809,11 +809,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
809 int err; 809 int err;
810 struct page *page = *pagep; 810 struct page *page = *pagep;
811 811
812 if (page && zeroing && count < PAGE_SIZE) { 812 if (page && zeroing && count < PAGE_SIZE)
813 void *mapaddr = kmap_atomic(page, KM_USER1); 813 clear_highpage(page);
814 memset(mapaddr, 0, PAGE_SIZE); 814
815 kunmap_atomic(mapaddr, KM_USER1);
816 }
817 while (count) { 815 while (count) {
818 if (cs->write && cs->pipebufs && page) { 816 if (cs->write && cs->pipebufs && page) {
819 return fuse_ref_page(cs, page, offset, count); 817 return fuse_ref_page(cs, page, offset, count);
@@ -830,10 +828,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
830 } 828 }
831 } 829 }
832 if (page) { 830 if (page) {
833 void *mapaddr = kmap_atomic(page, KM_USER1); 831 void *mapaddr = kmap_atomic(page, KM_USER0);
834 void *buf = mapaddr + offset; 832 void *buf = mapaddr + offset;
835 offset += fuse_copy_do(cs, &buf, &count); 833 offset += fuse_copy_do(cs, &buf, &count);
836 kunmap_atomic(mapaddr, KM_USER1); 834 kunmap_atomic(mapaddr, KM_USER0);
837 } else 835 } else
838 offset += fuse_copy_do(cs, NULL, &count); 836 offset += fuse_copy_do(cs, NULL, &count);
839 } 837 }
@@ -1336,12 +1334,7 @@ out_finish:
1336 1334
1337static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) 1335static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
1338{ 1336{
1339 int i; 1337 release_pages(req->pages, req->num_pages, 0);
1340
1341 for (i = 0; i < req->num_pages; i++) {
1342 struct page *page = req->pages[i];
1343 page_cache_release(page);
1344 }
1345} 1338}
1346 1339
1347static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, 1340static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
@@ -1354,7 +1347,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1354 loff_t file_size; 1347 loff_t file_size;
1355 unsigned int num; 1348 unsigned int num;
1356 unsigned int offset; 1349 unsigned int offset;
1357 size_t total_len; 1350 size_t total_len = 0;
1358 1351
1359 req = fuse_get_req(fc); 1352 req = fuse_get_req(fc);
1360 if (IS_ERR(req)) 1353 if (IS_ERR(req))
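clear_highpage() is the canonical spelling of the map/memset/unmap triple the first hunk deletes; it chooses its own atomic kmap slot, which is presumably also why the second hunk moves the remaining mapping from KM_USER1 to KM_USER0. The equivalence, assuming a struct page *page in scope:

	/* open-coded, what the old code did (modulo the kmap slot): */
	void *addr = kmap_atomic(page, KM_USER0);
	memset(addr, 0, PAGE_SIZE);
	kunmap_atomic(addr, KM_USER0);

	/* helper, what the patch uses: */
	clear_highpage(page);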
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index cc9665522148..c465ae066c62 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || LBDAF) 3 depends on (64BIT || LBDAF)
4 select DLM if GFS2_FS_LOCKING_DLM 4 select DLM if GFS2_FS_LOCKING_DLM
5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM 5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
6 select SYSFS if GFS2_FS_LOCKING_DLM 6 select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 194fe16d8418..4f36f8832b9b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
36#include "glops.h" 36#include "glops.h"
37 37
38 38
39static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, 39void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
40 unsigned int from, unsigned int to) 40 unsigned int from, unsigned int to)
41{ 41{
42 struct buffer_head *head = page_buffers(page); 42 struct buffer_head *head = page_buffers(page);
43 unsigned int bsize = head->b_size; 43 unsigned int bsize = head->b_size;
@@ -615,10 +615,9 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
616 int alloc_required; 616 int alloc_required;
617 int error = 0; 617 int error = 0;
618 struct gfs2_alloc *al; 618 struct gfs2_alloc *al = NULL;
619 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 619 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
620 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 620 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
621 unsigned to = from + len;
622 struct page *page; 621 struct page *page;
623 622
624 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); 623 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -663,6 +662,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
663 rblocks += RES_STATFS + RES_QUOTA; 662 rblocks += RES_STATFS + RES_QUOTA;
664 if (&ip->i_inode == sdp->sd_rindex) 663 if (&ip->i_inode == sdp->sd_rindex)
665 rblocks += 2 * RES_STATFS; 664 rblocks += 2 * RES_STATFS;
665 if (alloc_required)
666 rblocks += gfs2_rg_blocks(al);
666 667
667 error = gfs2_trans_begin(sdp, rblocks, 668 error = gfs2_trans_begin(sdp, rblocks,
668 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 669 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -689,20 +690,18 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
689 } 690 }
690 691
691prepare_write: 692prepare_write:
692 error = block_prepare_write(page, from, to, gfs2_block_map); 693 error = __block_write_begin(page, from, len, gfs2_block_map);
693out: 694out:
694 if (error == 0) 695 if (error == 0)
695 return 0; 696 return 0;
696 697
697 page_cache_release(page); 698 page_cache_release(page);
698 699
699 /* 700 gfs2_trans_end(sdp);
700 * XXX(truncate): the call below should probably be replaced with
701 * a call to the gfs2-specific truncate blocks helper to actually
702 * release disk blocks..
703 */
704 if (pos + len > ip->i_inode.i_size) 701 if (pos + len > ip->i_inode.i_size)
705 truncate_setsize(&ip->i_inode, ip->i_inode.i_size); 702 gfs2_trim_blocks(&ip->i_inode);
703 goto out_trans_fail;
704
706out_endtrans: 705out_endtrans:
707 gfs2_trans_end(sdp); 706 gfs2_trans_end(sdp);
708out_trans_fail: 707out_trans_fail:
@@ -802,10 +801,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
802 page_cache_release(page); 801 page_cache_release(page);
803 802
804 if (copied) { 803 if (copied) {
805 if (inode->i_size < to) { 804 if (inode->i_size < to)
806 i_size_write(inode, to); 805 i_size_write(inode, to);
807 ip->i_disksize = inode->i_size;
808 }
809 gfs2_dinode_out(ip, di); 806 gfs2_dinode_out(ip, di);
810 mark_inode_dirty(inode); 807 mark_inode_dirty(inode);
811 } 808 }
@@ -876,8 +873,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
876 873
877 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 874 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
878 if (ret > 0) { 875 if (ret > 0) {
879 if (inode->i_size > ip->i_disksize)
880 ip->i_disksize = inode->i_size;
881 gfs2_dinode_out(ip, dibh->b_data); 876 gfs2_dinode_out(ip, dibh->b_data);
882 mark_inode_dirty(inode); 877 mark_inode_dirty(inode);
883 } 878 }
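block_prepare_write(page, from, to, get_block) took start and end byte offsets within the page; its replacement __block_write_begin(page, pos, len, get_block) takes a position plus a length (the position is masked down to the page internally), so converting a caller is mechanical. With to == from + len, as in the old gfs2 code:

	/* old */
	error = block_prepare_write(page, from, to, gfs2_block_map);
	/* new: the same range expressed as offset + length */
	error = __block_write_begin(page, from, to - from, gfs2_block_map);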
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6f482809d1a3..5476c066d4ee 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
50 * @ip: the inode 50 * @ip: the inode
51 * @dibh: the dinode buffer 51 * @dibh: the dinode buffer
52 * @block: the block number that was allocated 52 * @block: the block number that was allocated
53 * @private: any locked page held by the caller process 53 * @page: The (optional) page. This is looked up if @page is NULL
54 * 54 *
55 * Returns: errno 55 * Returns: errno
56 */ 56 */
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
109/** 109/**
110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big 110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
111 * @ip: The GFS2 inode to unstuff 111 * @ip: The GFS2 inode to unstuff
112 * @unstuffer: the routine that handles unstuffing a non-zero length file 112 * @page: The (optional) page. This is looked up if the @page is NULL
113 * @private: private data for the unstuffer
114 * 113 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such 114 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way. 115 * that the height can be grown in the traditional way.
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
132 if (error) 131 if (error)
133 goto out; 132 goto out;
134 133
135 if (ip->i_disksize) { 134 if (i_size_read(&ip->i_inode)) {
136 /* Get a free block, fill it with the stuffed data, 135 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 136 and write it out to disk */
138 137
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
161 di = (struct gfs2_dinode *)dibh->b_data; 160 di = (struct gfs2_dinode *)dibh->b_data;
162 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 161 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
163 162
164 if (ip->i_disksize) { 163 if (i_size_read(&ip->i_inode)) {
165 *(__be64 *)(di + 1) = cpu_to_be64(block); 164 *(__be64 *)(di + 1) = cpu_to_be64(block);
166 gfs2_add_inode_blocks(&ip->i_inode, 1); 165 gfs2_add_inode_blocks(&ip->i_inode, 1);
167 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 166 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -885,83 +884,14 @@ out:
885} 884}
886 885
887/** 886/**
888 * do_grow - Make a file look bigger than it is
889 * @ip: the inode
890 * @size: the size to set the file to
891 *
892 * Called with an exclusive lock on @ip.
893 *
894 * Returns: errno
895 */
896
897static int do_grow(struct gfs2_inode *ip, u64 size)
898{
899 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
900 struct gfs2_alloc *al;
901 struct buffer_head *dibh;
902 int error;
903
904 al = gfs2_alloc_get(ip);
905 if (!al)
906 return -ENOMEM;
907
908 error = gfs2_quota_lock_check(ip);
909 if (error)
910 goto out;
911
912 al->al_requested = sdp->sd_max_height + RES_DATA;
913
914 error = gfs2_inplace_reserve(ip);
915 if (error)
916 goto out_gunlock_q;
917
918 error = gfs2_trans_begin(sdp,
919 sdp->sd_max_height + al->al_rgd->rd_length +
920 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
921 if (error)
922 goto out_ipres;
923
924 error = gfs2_meta_inode_buffer(ip, &dibh);
925 if (error)
926 goto out_end_trans;
927
928 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
929 if (gfs2_is_stuffed(ip)) {
930 error = gfs2_unstuff_dinode(ip, NULL);
931 if (error)
932 goto out_brelse;
933 }
934 }
935
936 ip->i_disksize = size;
937 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
938 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
939 gfs2_dinode_out(ip, dibh->b_data);
940
941out_brelse:
942 brelse(dibh);
943out_end_trans:
944 gfs2_trans_end(sdp);
945out_ipres:
946 gfs2_inplace_release(ip);
947out_gunlock_q:
948 gfs2_quota_unlock(ip);
949out:
950 gfs2_alloc_put(ip);
951 return error;
952}
953
954
955/**
956 * gfs2_block_truncate_page - Deal with zeroing out data for truncate 887 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
957 * 888 *
958 * This is partly borrowed from ext3. 889 * This is partly borrowed from ext3.
959 */ 890 */
960static int gfs2_block_truncate_page(struct address_space *mapping) 891static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
961{ 892{
962 struct inode *inode = mapping->host; 893 struct inode *inode = mapping->host;
963 struct gfs2_inode *ip = GFS2_I(inode); 894 struct gfs2_inode *ip = GFS2_I(inode);
964 loff_t from = inode->i_size;
965 unsigned long index = from >> PAGE_CACHE_SHIFT; 895 unsigned long index = from >> PAGE_CACHE_SHIFT;
966 unsigned offset = from & (PAGE_CACHE_SIZE-1); 896 unsigned offset = from & (PAGE_CACHE_SIZE-1);
967 unsigned blocksize, iblock, length, pos; 897 unsigned blocksize, iblock, length, pos;
@@ -1023,9 +953,11 @@ unlock:
1023 return err; 953 return err;
1024} 954}
1025 955
1026static int trunc_start(struct gfs2_inode *ip, u64 size) 956static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1027{ 957{
1028 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 958 struct gfs2_inode *ip = GFS2_I(inode);
959 struct gfs2_sbd *sdp = GFS2_SB(inode);
960 struct address_space *mapping = inode->i_mapping;
1029 struct buffer_head *dibh; 961 struct buffer_head *dibh;
1030 int journaled = gfs2_is_jdata(ip); 962 int journaled = gfs2_is_jdata(ip);
1031 int error; 963 int error;
@@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1039 if (error) 971 if (error)
1040 goto out; 972 goto out;
1041 973
974 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
975
1042 if (gfs2_is_stuffed(ip)) { 976 if (gfs2_is_stuffed(ip)) {
1043 u64 dsize = size + sizeof(struct gfs2_dinode); 977 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1044 ip->i_disksize = size;
1045 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1046 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1047 gfs2_dinode_out(ip, dibh->b_data);
1048 if (dsize > dibh->b_size)
1049 dsize = dibh->b_size;
1050 gfs2_buffer_clear_tail(dibh, dsize);
1051 error = 1;
1052 } else { 978 } else {
1053 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 979 if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
1054 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 980 error = gfs2_block_truncate_page(mapping, newsize);
1055 981 if (error)
1056 if (!error) { 982 goto out_brelse;
1057 ip->i_disksize = size;
1058 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1059 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1060 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1061 gfs2_dinode_out(ip, dibh->b_data);
1062 } 983 }
984 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1063 } 985 }
1064 986
1065 brelse(dibh); 987 i_size_write(inode, newsize);
988 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
989 gfs2_dinode_out(ip, dibh->b_data);
1066 990
991 truncate_pagecache(inode, oldsize, newsize);
992out_brelse:
993 brelse(dibh);
1067out: 994out:
1068 gfs2_trans_end(sdp); 995 gfs2_trans_end(sdp);
1069 return error; 996 return error;
@@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip)
1123 if (error) 1050 if (error)
1124 goto out; 1051 goto out;
1125 1052
1126 if (!ip->i_disksize) { 1053 if (!i_size_read(&ip->i_inode)) {
1127 ip->i_height = 0; 1054 ip->i_height = 0;
1128 ip->i_goal = ip->i_no_addr; 1055 ip->i_goal = ip->i_no_addr;
1129 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1056 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1143,92 +1070,154 @@ out:
1143 1070
1144/** 1071/**
1145 * do_shrink - make a file smaller 1072 * do_shrink - make a file smaller
1146 * @ip: the inode 1073 * @inode: the inode
1147 * @size: the size to make the file 1074 * @oldsize: the current inode size
1148 * @truncator: function to truncate the last partial block 1075 * @newsize: the size to make the file
1149 * 1076 *
1150 * Called with an exclusive lock on @ip. 1077 * Called with an exclusive lock on @inode. The @newsize must
1078 * be equal to or smaller than the current inode size.
1151 * 1079 *
1152 * Returns: errno 1080 * Returns: errno
1153 */ 1081 */
1154 1082
1155static int do_shrink(struct gfs2_inode *ip, u64 size) 1083static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
1156{ 1084{
1085 struct gfs2_inode *ip = GFS2_I(inode);
1157 int error; 1086 int error;
1158 1087
1159 error = trunc_start(ip, size); 1088 error = trunc_start(inode, oldsize, newsize);
1160 if (error < 0) 1089 if (error < 0)
1161 return error; 1090 return error;
1162 if (error > 0) 1091 if (gfs2_is_stuffed(ip))
1163 return 0; 1092 return 0;
1164 1093
1165 error = trunc_dealloc(ip, size); 1094 error = trunc_dealloc(ip, newsize);
1166 if (!error) 1095 if (error == 0)
1167 error = trunc_end(ip); 1096 error = trunc_end(ip);
1168 1097
1169 return error; 1098 return error;
1170} 1099}
1171 1100
1172static int do_touch(struct gfs2_inode *ip, u64 size) 1101void gfs2_trim_blocks(struct inode *inode)
1173{ 1102{
1174 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1103 u64 size = inode->i_size;
1104 int ret;
1105
1106 ret = do_shrink(inode, size, size);
1107 WARN_ON(ret != 0);
1108}
1109
1110/**
1111 * do_grow - Touch and update inode size
1112 * @inode: The inode
1113 * @size: The new size
1114 *
1115 * This function updates the timestamps on the inode and
1116 * may also increase the size of the inode. This function
1117 * must not be called with @size any smaller than the current
1118 * inode size.
1119 *
1120 * Although it is not strictly required to unstuff files here,
1121 * earlier versions of GFS2 have a bug in the stuffed file reading
1122 * code which will result in a buffer overrun if the size is larger
1123 * than the max stuffed file size. In order to prevent this from
1124 * occurring, such files are unstuffed, but in other cases we can
1125 * just update the inode size directly.
1126 *
1127 * Returns: 0 on success, or -ve on error
1128 */
1129
1130static int do_grow(struct inode *inode, u64 size)
1131{
1132 struct gfs2_inode *ip = GFS2_I(inode);
1133 struct gfs2_sbd *sdp = GFS2_SB(inode);
1175 struct buffer_head *dibh; 1134 struct buffer_head *dibh;
1135 struct gfs2_alloc *al = NULL;
1176 int error; 1136 int error;
1177 1137
1178 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 1138 if (gfs2_is_stuffed(ip) &&
1139 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1140 al = gfs2_alloc_get(ip);
1141 if (al == NULL)
1142 return -ENOMEM;
1143
1144 error = gfs2_quota_lock_check(ip);
1145 if (error)
1146 goto do_grow_alloc_put;
1147
1148 al->al_requested = 1;
1149 error = gfs2_inplace_reserve(ip);
1150 if (error)
1151 goto do_grow_qunlock;
1152 }
1153
1154 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
1179 if (error) 1155 if (error)
1180 return error; 1156 goto do_grow_release;
1181 1157
1182 down_write(&ip->i_rw_mutex); 1158 if (al) {
1159 error = gfs2_unstuff_dinode(ip, NULL);
1160 if (error)
1161 goto do_end_trans;
1162 }
1183 1163
1184 error = gfs2_meta_inode_buffer(ip, &dibh); 1164 error = gfs2_meta_inode_buffer(ip, &dibh);
1185 if (error) 1165 if (error)
1186 goto do_touch_out; 1166 goto do_end_trans;
1187 1167
1168 i_size_write(inode, size);
1188 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1169 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1189 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1170 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1190 gfs2_dinode_out(ip, dibh->b_data); 1171 gfs2_dinode_out(ip, dibh->b_data);
1191 brelse(dibh); 1172 brelse(dibh);
1192 1173
1193do_touch_out: 1174do_end_trans:
1194 up_write(&ip->i_rw_mutex);
1195 gfs2_trans_end(sdp); 1175 gfs2_trans_end(sdp);
1176do_grow_release:
1177 if (al) {
1178 gfs2_inplace_release(ip);
1179do_grow_qunlock:
1180 gfs2_quota_unlock(ip);
1181do_grow_alloc_put:
1182 gfs2_alloc_put(ip);
1183 }
1196 return error; 1184 return error;
1197} 1185}
1198 1186
1199/** 1187/**
1200 * gfs2_truncatei - make a file a given size 1188 * gfs2_setattr_size - make a file a given size
1201 * @ip: the inode 1189 * @inode: the inode
1202 * @size: the size to make the file 1190 * @newsize: the size to make the file
1203 * @truncator: function to truncate the last partial block
1204 * 1191 *
1205 * The file size can grow, shrink, or stay the same size. 1192 * The file size can grow, shrink, or stay the same size. This
1193 * is called holding i_mutex and an exclusive glock on the inode
1194 * in question.
1206 * 1195 *
1207 * Returns: errno 1196 * Returns: errno
1208 */ 1197 */
1209 1198
1210int gfs2_truncatei(struct gfs2_inode *ip, u64 size) 1199int gfs2_setattr_size(struct inode *inode, u64 newsize)
1211{ 1200{
1212 int error; 1201 int ret;
1202 u64 oldsize;
1213 1203
1214 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) 1204 BUG_ON(!S_ISREG(inode->i_mode));
1215 return -EINVAL;
1216 1205
1217 if (size > ip->i_disksize) 1206 ret = inode_newsize_ok(inode, newsize);
1218 error = do_grow(ip, size); 1207 if (ret)
1219 else if (size < ip->i_disksize) 1208 return ret;
1220 error = do_shrink(ip, size);
1221 else
1222 /* update time stamps */
1223 error = do_touch(ip, size);
1224 1209
1225 return error; 1210 oldsize = inode->i_size;
1211 if (newsize >= oldsize)
1212 return do_grow(inode, newsize);
1213
1214 return do_shrink(inode, oldsize, newsize);
1226} 1215}
1227 1216
1228int gfs2_truncatei_resume(struct gfs2_inode *ip) 1217int gfs2_truncatei_resume(struct gfs2_inode *ip)
1229{ 1218{
1230 int error; 1219 int error;
1231 error = trunc_dealloc(ip, ip->i_disksize); 1220 error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1232 if (!error) 1221 if (!error)
1233 error = trunc_end(ip); 1222 error = trunc_end(ip);
1234 return error; 1223 return error;
@@ -1269,7 +1258,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1269 1258
1270 shift = sdp->sd_sb.sb_bsize_shift; 1259 shift = sdp->sd_sb.sb_bsize_shift;
1271 BUG_ON(gfs2_is_dir(ip)); 1260 BUG_ON(gfs2_is_dir(ip));
1272 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; 1261 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1273 lblock = offset >> shift; 1262 lblock = offset >> shift;
1274 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1263 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1275 if (lblock_stop > end_of_file) 1264 if (lblock_stop > end_of_file)
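After this refactor every size change funnels through gfs2_setattr_size(): it validates with inode_newsize_ok(), then dispatches to do_grow() (which also covers the pure timestamp-touch case, replacing do_touch()) or do_shrink(). A hypothetical ->setattr fragment showing where it slots in (only gfs2_setattr_size() is from the patch):

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;

	if (attr->ia_valid & ATTR_SIZE)
		return gfs2_setattr_size(inode, attr->ia_size);

	return 0;	/* other attributes handled elsewhere */
}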
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index a20a5213135a..42fea03e2bd9 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
44 } 44 }
45} 45}
46 46
47int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 47extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
48int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); 48extern int gfs2_block_map(struct inode *inode, sector_t lblock,
49int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 49 struct buffer_head *bh, int create);
50 50extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
51int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 51 u64 *dblock, unsigned *extlen);
52int gfs2_truncatei_resume(struct gfs2_inode *ip); 52extern int gfs2_setattr_size(struct inode *inode, u64 size);
53int gfs2_file_dealloc(struct gfs2_inode *ip); 53extern void gfs2_trim_blocks(struct inode *inode);
54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
55 unsigned int len); 55extern int gfs2_file_dealloc(struct gfs2_inode *ip);
56extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
57 unsigned int len);
56 58
57#endif /* __BMAP_DOT_H__ */ 59#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index bb7907bde3d8..6798755b3858 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
49 ip = GFS2_I(inode); 49 ip = GFS2_I(inode);
50 } 50 }
51 51
52 if (sdp->sd_args.ar_localcaching) 52 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
53 goto valid; 53 goto valid;
54 54
55 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); 55 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b9dd88a78dd4..5c356d09c321 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) 79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) 80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
81 81
82struct qstr gfs2_qdot __read_mostly;
83struct qstr gfs2_qdotdot __read_mostly;
84
82typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, 85typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
83 u64 leaf_no, void *data); 86 u64 leaf_no, void *data);
84typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, 87typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
127 130
128 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 131 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
129 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 132 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
130 if (ip->i_disksize < offset + size) 133 if (ip->i_inode.i_size < offset + size)
131 ip->i_disksize = offset + size; 134 i_size_write(&ip->i_inode, offset + size);
132 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 135 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
133 gfs2_dinode_out(ip, dibh->b_data); 136 gfs2_dinode_out(ip, dibh->b_data);
134 137
@@ -225,8 +228,8 @@ out:
225 if (error) 228 if (error)
226 return error; 229 return error;
227 230
228 if (ip->i_disksize < offset + copied) 231 if (ip->i_inode.i_size < offset + copied)
229 ip->i_disksize = offset + copied; 232 i_size_write(&ip->i_inode, offset + copied);
230 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 233 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
231 234
232 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 235 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
275 unsigned int o; 278 unsigned int o;
276 int copied = 0; 279 int copied = 0;
277 int error = 0; 280 int error = 0;
281 u64 disksize = i_size_read(&ip->i_inode);
278 282
279 if (offset >= ip->i_disksize) 283 if (offset >= disksize)
280 return 0; 284 return 0;
281 285
282 if (offset + size > ip->i_disksize) 286 if (offset + size > disksize)
283 size = ip->i_disksize - offset; 287 size = disksize - offset;
284 288
285 if (!size) 289 if (!size)
286 return 0; 290 return 0;
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
727 unsigned hsize = 1 << ip->i_depth; 731 unsigned hsize = 1 << ip->i_depth;
728 unsigned index; 732 unsigned index;
729 u64 ln; 733 u64 ln;
730 if (hsize * sizeof(u64) != ip->i_disksize) { 734 if (hsize * sizeof(u64) != i_size_read(inode)) {
731 gfs2_consist_inode(ip); 735 gfs2_consist_inode(ip);
732 return ERR_PTR(-EIO); 736 return ERR_PTR(-EIO);
733 } 737 }
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode)
879 for (x = sdp->sd_hash_ptrs; x--; lp++) 883 for (x = sdp->sd_hash_ptrs; x--; lp++)
880 *lp = cpu_to_be64(bn); 884 *lp = cpu_to_be64(bn);
881 885
882 dip->i_disksize = sdp->sd_sb.sb_bsize / 2; 886 i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
883 gfs2_add_inode_blocks(&dip->i_inode, 1); 887 gfs2_add_inode_blocks(&dip->i_inode, 1);
884 dip->i_diskflags |= GFS2_DIF_EXHASH; 888 dip->i_diskflags |= GFS2_DIF_EXHASH;
885 889
@@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1057 u64 *buf; 1061 u64 *buf;
1058 u64 *from, *to; 1062 u64 *from, *to;
1059 u64 block; 1063 u64 block;
1064 u64 disksize = i_size_read(&dip->i_inode);
1060 int x; 1065 int x;
1061 int error = 0; 1066 int error = 0;
1062 1067
1063 hsize = 1 << dip->i_depth; 1068 hsize = 1 << dip->i_depth;
1064 if (hsize * sizeof(u64) != dip->i_disksize) { 1069 if (hsize * sizeof(u64) != disksize) {
1065 gfs2_consist_inode(dip); 1070 gfs2_consist_inode(dip);
1066 return -EIO; 1071 return -EIO;
1067 } 1072 }
@@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1072 if (!buf) 1077 if (!buf)
1073 return -ENOMEM; 1078 return -ENOMEM;
1074 1079
1075 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { 1080 for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
1076 error = gfs2_dir_read_data(dip, (char *)buf, 1081 error = gfs2_dir_read_data(dip, (char *)buf,
1077 block * sdp->sd_hash_bsize, 1082 block * sdp->sd_hash_bsize,
1078 sdp->sd_hash_bsize, 1); 1083 sdp->sd_hash_bsize, 1);
@@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1370 unsigned depth = 0; 1375 unsigned depth = 0;
1371 1376
1372 hsize = 1 << dip->i_depth; 1377 hsize = 1 << dip->i_depth;
1373 if (hsize * sizeof(u64) != dip->i_disksize) { 1378 if (hsize * sizeof(u64) != i_size_read(inode)) {
1374 gfs2_consist_inode(dip); 1379 gfs2_consist_inode(dip);
1375 return -EIO; 1380 return -EIO;
1376 } 1381 }
@@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1784 int error = 0; 1789 int error = 0;
1785 1790
1786 hsize = 1 << dip->i_depth; 1791 hsize = 1 << dip->i_depth;
1787 if (hsize * sizeof(u64) != dip->i_disksize) { 1792 if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
1788 gfs2_consist_inode(dip); 1793 gfs2_consist_inode(dip);
1789 return -EIO; 1794 return -EIO;
1790 } 1795 }
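With i_disksize removed, the directory code reads the size through i_size_read(), which is safe against torn reads of the 64-bit i_size on 32-bit SMP. Snapshotting it once per function, as gfs2_dir_read_data() now does, also keeps the bounds checks self-consistent:

	u64 disksize = i_size_read(&ip->i_inode);	/* one coherent snapshot */

	if (offset >= disksize)
		return 0;
	if (offset + size > disksize)
		size = disksize - offset;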
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f919440c3be..a98f644bd3df 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
17struct gfs2_inode; 17struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19 19
20struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename); 20extern struct inode *gfs2_dir_search(struct inode *dir,
21int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 21 const struct qstr *filename);
22 const struct gfs2_inode *ip); 22extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
23int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 23 const struct gfs2_inode *ip);
24 const struct gfs2_inode *ip, unsigned int type); 24extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
25int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); 25 const struct gfs2_inode *ip, unsigned int type);
26int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 26extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
27 filldir_t filldir); 27extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
28int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 28 filldir_t filldir);
29 const struct gfs2_inode *nip, unsigned int new_type); 29extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
30 const struct gfs2_inode *nip, unsigned int new_type);
30 31
31int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 32extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
32 33
33int gfs2_diradd_alloc_required(struct inode *dir, 34extern int gfs2_diradd_alloc_required(struct inode *dir,
34 const struct qstr *filename); 35 const struct qstr *filename);
35int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 36extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
36 struct buffer_head **bhp); 37 struct buffer_head **bhp);
37 38
38static inline u32 gfs2_disk_hash(const char *data, int len) 39static inline u32 gfs2_disk_hash(const char *data, int len)
39{ 40{
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
61 memcpy(dent + 1, name->name, name->len); 62 memcpy(dent + 1, name->name, name->len);
62} 63}
63 64
65extern struct qstr gfs2_qdot;
66extern struct qstr gfs2_qdotdot;
67
64#endif /* __DIR_DOT_H__ */ 68#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index dfe237a3f8ad..06d582732d34 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name,
126 126
127static struct dentry *gfs2_get_parent(struct dentry *child) 127static struct dentry *gfs2_get_parent(struct dentry *child)
128{ 128{
129 struct qstr dotdot;
130 struct dentry *dentry; 129 struct dentry *dentry;
131 130
132 /* 131 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
133 * XXX(hch): it would be a good idea to keep this around as a
134 * static variable.
135 */
136 gfs2_str2qstr(&dotdot, "..");
137
138 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
139 if (!IS_ERR(dentry)) 132 if (!IS_ERR(dentry))
140 dentry->d_op = &gfs2_dops; 133 dentry->d_op = &gfs2_dops;
141 return dentry; 134 return dentry;
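gfs2_get_parent() previously rebuilt the ".." qstr on every call; the new gfs2_qdot/gfs2_qdotdot globals only work if something fills them in once. That setup is presumably the usual gfs2_str2qstr() pair run at module init; the placement is an assumption, since the patch context here does not show it:

	/* assumed one-time initialization, e.g. on the module init path: */
	gfs2_str2qstr(&gfs2_qdot, ".");
	gfs2_str2qstr(&gfs2_qdotdot, "..");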
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4edd662c8232..aa996471ec5c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -382,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
382 rblocks = RES_DINODE + ind_blocks; 382 rblocks = RES_DINODE + ind_blocks;
383 if (gfs2_is_jdata(ip)) 383 if (gfs2_is_jdata(ip))
384 rblocks += data_blocks ? data_blocks : 1; 384 rblocks += data_blocks ? data_blocks : 1;
385 if (ind_blocks || data_blocks) 385 if (ind_blocks || data_blocks) {
386 rblocks += RES_STATFS + RES_QUOTA; 386 rblocks += RES_STATFS + RES_QUOTA;
387 rblocks += gfs2_rg_blocks(al);
388 }
387 ret = gfs2_trans_begin(sdp, rblocks, 0); 389 ret = gfs2_trans_begin(sdp, rblocks, 0);
388 if (ret) 390 if (ret)
389 goto out_trans_fail; 391 goto out_trans_fail;
@@ -491,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
491 goto fail; 493 goto fail;
492 494
493 if (!(file->f_flags & O_LARGEFILE) && 495 if (!(file->f_flags & O_LARGEFILE) &&
494 ip->i_disksize > MAX_NON_LFS) { 496 i_size_read(inode) > MAX_NON_LFS) {
495 error = -EOVERFLOW; 497 error = -EOVERFLOW;
496 goto fail_gunlock; 498 goto fail_gunlock;
497 } 499 }
@@ -620,6 +622,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
620 * cluster; until we do, disable leases (by just returning -EINVAL), 622 * cluster; until we do, disable leases (by just returning -EINVAL),
621 * unless the administrator has requested purely local locking. 623 * unless the administrator has requested purely local locking.
622 * 624 *
625 * Locking: called under lock_flocks
626 *
623 * Returns: errno 627 * Returns: errno
624 */ 628 */
625 629
@@ -771,6 +775,7 @@ const struct file_operations gfs2_dir_fops = {
771 .fsync = gfs2_fsync, 775 .fsync = gfs2_fsync,
772 .lock = gfs2_lock, 776 .lock = gfs2_lock,
773 .flock = gfs2_flock, 777 .flock = gfs2_flock,
778 .llseek = default_llseek,
774}; 779};
775 780
776#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ 781#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
@@ -797,5 +802,6 @@ const struct file_operations gfs2_dir_fops_nolock = {
797 .open = gfs2_open, 802 .open = gfs2_open,
798 .release = gfs2_close, 803 .release = gfs2_close,
799 .fsync = gfs2_fsync, 804 .fsync = gfs2_fsync,
805 .llseek = default_llseek,
800}; 806};
801 807
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9adf8f924e08..87778857f099 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
441 else 441 else
442 gfs2_glock_put_nolock(gl); 442 gfs2_glock_put_nolock(gl);
443 } 443 }
444 if (held1 && held2 && list_empty(&gl->gl_holders))
445 clear_bit(GLF_QUEUED, &gl->gl_flags);
444 446
445 gl->gl_state = new_state; 447 gl->gl_state = new_state;
446 gl->gl_tchange = jiffies; 448 gl->gl_tchange = jiffies;
@@ -1012,6 +1014,7 @@ fail:
1012 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) 1014 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
1013 insert_pt = &gh2->gh_list; 1015 insert_pt = &gh2->gh_list;
1014 } 1016 }
1017 set_bit(GLF_QUEUED, &gl->gl_flags);
1015 if (likely(insert_pt == NULL)) { 1018 if (likely(insert_pt == NULL)) {
1016 list_add_tail(&gh->gh_list, &gl->gl_holders); 1019 list_add_tail(&gh->gh_list, &gl->gl_holders);
1017 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) 1020 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1310,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1310 1313
1311 gfs2_glock_hold(gl); 1314 gfs2_glock_hold(gl);
1312 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; 1315 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1313 if (time_before(now, holdtime)) 1316 if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
1314 delay = holdtime - now; 1317 if (time_before(now, holdtime))
1315 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) 1318 delay = holdtime - now;
1316 delay = gl->gl_ops->go_min_hold_time; 1319 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1320 delay = gl->gl_ops->go_min_hold_time;
1321 }
1317 1322
1318 spin_lock(&gl->gl_spin); 1323 spin_lock(&gl->gl_spin);
1319 handle_callback(gl, state, delay); 1324 handle_callback(gl, state, delay);
@@ -1512,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl)
1512 spin_unlock(&lru_lock); 1517 spin_unlock(&lru_lock);
1513 1518
1514 spin_lock(&gl->gl_spin); 1519 spin_lock(&gl->gl_spin);
1515 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) 1520 if (gl->gl_state != LM_ST_UNLOCKED)
1516 handle_callback(gl, LM_ST_UNLOCKED, 0); 1521 handle_callback(gl, LM_ST_UNLOCKED, 0);
1517 spin_unlock(&gl->gl_spin); 1522 spin_unlock(&gl->gl_spin);
1518 gfs2_glock_hold(gl); 1523 gfs2_glock_hold(gl);
@@ -1660,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1660 *p++ = 'I'; 1665 *p++ = 'I';
1661 if (test_bit(GLF_FROZEN, gflags)) 1666 if (test_bit(GLF_FROZEN, gflags))
1662 *p++ = 'F'; 1667 *p++ = 'F';
1668 if (test_bit(GLF_QUEUED, gflags))
1669 *p++ = 'q';
1663 *p = 0; 1670 *p = 0;
1664 return buf; 1671 return buf;
1665} 1672}
@@ -1776,10 +1783,12 @@ int __init gfs2_glock_init(void)
1776 } 1783 }
1777#endif 1784#endif
1778 1785
1779 glock_workqueue = create_workqueue("glock_workqueue"); 1786 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
1787 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1780 if (IS_ERR(glock_workqueue)) 1788 if (IS_ERR(glock_workqueue))
1781 return PTR_ERR(glock_workqueue); 1789 return PTR_ERR(glock_workqueue);
1782 gfs2_delete_workqueue = create_workqueue("delete_workqueue"); 1790 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
1791 WQ_FREEZEABLE, 0);
1783 if (IS_ERR(gfs2_delete_workqueue)) { 1792 if (IS_ERR(gfs2_delete_workqueue)) {
1784 destroy_workqueue(glock_workqueue); 1793 destroy_workqueue(glock_workqueue);
1785 return PTR_ERR(gfs2_delete_workqueue); 1794 return PTR_ERR(gfs2_delete_workqueue);
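create_workqueue() could not express per-queue behaviour; alloc_workqueue(name, flags, max_active) can, with a max_active of 0 meaning the default. One caveat: alloc_workqueue() reports failure by returning NULL, not an ERR_PTR, so the IS_ERR() checks retained above will not actually catch an allocation failure. The defensive form would be:

	glock_workqueue = alloc_workqueue("glock_workqueue",
					  WQ_RESCUER | WQ_HIGHPRI | WQ_FREEZEABLE, 0);
	if (!glock_workqueue)		/* NULL on failure, not ERR_PTR */
		return -ENOMEM;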
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2bda1911b156..db1c26d6d220 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
216 216
217/** 217/**
218 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock 218 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
219 * @gl: the glock 219 * @gl: the glock
220 * @state: the state we're requesting 220 * @state: the state we're requesting
221 * @flags: the modifier flags 221 * @flags: the modifier flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49f97d3bb690..0d149dcc04e5 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 	const struct gfs2_inode *ip = gl->gl_object;
 	if (ip == NULL)
 		return 0;
-	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
+	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
 		  (unsigned long long)ip->i_no_formal_ino,
 		  (unsigned long long)ip->i_no_addr,
 		  IF2DT(ip->i_inode.i_mode), ip->i_flags,
 		  (unsigned int)ip->i_diskflags,
-		  (unsigned long long)ip->i_inode.i_size,
-		  (unsigned long long)ip->i_disksize);
+		  (unsigned long long)i_size_read(&ip->i_inode));
 	return 0;
 }
 
@@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
 	[LM_TYPE_META] = &gfs2_meta_glops,
 	[LM_TYPE_INODE] = &gfs2_inode_glops,
 	[LM_TYPE_RGRP] = &gfs2_rgrp_glops,
-	[LM_TYPE_NONDISK] = &gfs2_trans_glops,
 	[LM_TYPE_IOPEN] = &gfs2_iopen_glops,
 	[LM_TYPE_FLOCK] = &gfs2_flock_glops,
 	[LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
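The entry deleted above was dead code: with C99 designated initializers a
repeated index silently overrides the earlier one, so [LM_TYPE_NONDISK] =
&gfs2_trans_glops never took effect (the later &gfs2_nondisk_glops entry won).
A standalone illustration of the pitfall, unrelated to the gfs2 sources:

#include <stdio.h>

static const char *glops[4] = {
	[1] = "trans",		/* silently discarded ...           */
	[1] = "nondisk",	/* ... because the later entry wins */
};

int main(void)
{
	printf("%s\n", glops[1]);	/* prints "nondisk" */
	return 0;
}

GCC only flags this with -Woverride-init (part of -Wextra), which is why the
duplicate survived until now; the demote_rq_store() hunk in sys.c further down
restores an explicit way to reach gfs2_trans_glops.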
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index fdbf4b366fa5..764fbb49efc8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -196,6 +196,7 @@ enum {
 	GLF_REPLY_PENDING		= 9,
 	GLF_INITIAL			= 10,
 	GLF_FROZEN			= 11,
+	GLF_QUEUED			= 12,
 };
 
 struct gfs2_glock {
@@ -267,7 +268,6 @@ struct gfs2_inode {
 	u64 i_no_formal_ino;
 	u64 i_generation;
 	u64 i_eattr;
-	loff_t i_disksize;
 	unsigned long i_flags;		/* GIF_... */
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
@@ -416,11 +416,8 @@ struct gfs2_args {
 	char ar_locktable[GFS2_LOCKNAME_LEN];	/* Name of the Lock Table */
 	char ar_hostdata[GFS2_LOCKNAME_LEN];	/* Host specific data */
 	unsigned int ar_spectator:1;		/* Don't get a journal */
-	unsigned int ar_ignore_local_fs:1;	/* Ignore optimisations */
 	unsigned int ar_localflocks:1;		/* Let the VFS do flock|fcntl */
-	unsigned int ar_localcaching:1;		/* Local caching */
 	unsigned int ar_debug:1;		/* Oops on errors */
-	unsigned int ar_upgrade:1;		/* Upgrade ondisk format */
 	unsigned int ar_posix_acl:1;		/* Enable posix acls */
 	unsigned int ar_quota:2;		/* off/account/on */
 	unsigned int ar_suiddir:1;		/* suiddir support */
@@ -497,7 +494,7 @@ struct gfs2_sb_host {
  */
 
 struct lm_lockstruct {
-	unsigned int ls_jid;
+	int ls_jid;
 	unsigned int ls_first;
 	unsigned int ls_first_done;
 	unsigned int ls_nodir;
@@ -572,6 +569,7 @@ struct gfs2_sbd {
 	struct list_head sd_rindex_mru_list;
 	struct gfs2_rgrpd *sd_rindex_forward;
 	unsigned int sd_rgrps;
+	unsigned int sd_max_rg_data;
 
 	/* Journal index stuff */
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 08140f185a37..06370f8bd8cf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	 * to do that.
 	 */
 	ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
-	ip->i_disksize = be64_to_cpu(str->di_size);
-	i_size_write(&ip->i_inode, ip->i_disksize);
+	i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
 	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
 	atime.tv_sec = be64_to_cpu(str->di_atime);
 	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -1055,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
 	str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
 	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
-	str->di_size = cpu_to_be64(ip->i_disksize);
+	str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
 	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 	str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
 	str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1085,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	       (unsigned long long)ip->i_no_formal_ino);
 	printk(KERN_INFO "  no_addr = %llu\n",
 	       (unsigned long long)ip->i_no_addr);
-	printk(KERN_INFO "  i_disksize = %llu\n",
-	       (unsigned long long)ip->i_disksize);
+	printk(KERN_INFO "  i_size = %llu\n",
+	       (unsigned long long)i_size_read(&ip->i_inode));
 	printk(KERN_INFO "  blocks = %llu\n",
 	       (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
 	printk(KERN_INFO "  i_goal = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 300ada3f21de..6720d7d5fbc6 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
 extern int gfs2_internal_read(struct gfs2_inode *ip,
 			      struct file_ra_state *ra_state,
 			      char *buf, loff_t *pos, unsigned size);
+extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+				   unsigned int from, unsigned int to);
 extern void gfs2_set_aops(struct inode *inode);
 
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 	dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
 }
 
+static inline int gfs2_check_internal_file_size(struct inode *inode,
+						u64 minsize, u64 maxsize)
+{
+	u64 size = i_size_read(inode);
+	if (size < minsize || size > maxsize)
+		goto err;
+	if (size & ((1 << inode->i_blkbits) - 1))
+		goto err;
+	return 0;
+err:
+	gfs2_consist_inode(GFS2_I(inode));
+	return -EIO;
+}
 
 extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
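gfs2_check_internal_file_size() centralises a sanity check that quota.c and
super.c previously open-coded against i_disksize: an internal file must fall
within [minsize, maxsize] and occupy a whole number of filesystem blocks. A
standalone sketch of the bitmask test, with illustrative values (12 block
bits, i.e. 4k blocks):

#include <stdint.h>
#include <stdio.h>

static int size_ok(uint64_t size, uint64_t minsize, uint64_t maxsize,
		   unsigned int blkbits)
{
	if (size < minsize || size > maxsize)
		return 0;
	/* non-zero low bits mean a partial trailing block */
	return (size & ((1ULL << blkbits) - 1)) == 0;
}

int main(void)
{
	printf("%d\n", size_ok(8 << 20, 8 << 20, 1 << 30, 12));	/* 1: aligned */
	printf("%d\n", size_ok((8 << 20) + 512, 8 << 20, 1 << 30, 12));	/* 0 */
	return 0;
}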
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e0470ed34c2..1c09425b45fd 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
 		ret |= LM_OUT_CANCELED;
 		goto out;
 	case -EAGAIN: /* Try lock fails */
+	case -EDEADLK: /* Deadlock detected */
 		goto out;
-	case -EINVAL: /* Invalid */
-	case -ENOMEM: /* Out of memory */
+	case -ETIMEDOUT: /* Canceled due to timeout */
 		ret |= LM_OUT_ERROR;
 		goto out;
 	case 0: /* Success */
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ac750bd31a6f..eb01f3575e10 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -592,22 +592,13 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	lh->lh_hash = cpu_to_be32(hash);
 
 	bh->b_end_io = end_buffer_write_sync;
-	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
-		goto skip_barrier;
-	get_bh(bh);
-	submit_bh(WRITE_BARRIER | REQ_META, bh);
-	wait_on_buffer(bh);
-	if (buffer_eopnotsupp(bh)) {
-		clear_buffer_eopnotsupp(bh);
-		set_buffer_uptodate(bh);
-		fs_info(sdp, "barrier sync failed - disabling barriers\n");
-		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
-		lock_buffer(bh);
-skip_barrier:
-		get_bh(bh);
-		submit_bh(WRITE_SYNC | REQ_META, bh);
-		wait_on_buffer(bh);
-	}
+	get_bh(bh);
+	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
+		submit_bh(WRITE_SYNC | REQ_META, bh);
+	else
+		submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
+	wait_on_buffer(bh);
+
 	if (!buffer_uptodate(bh))
 		gfs2_io_error_bh(sdp, bh);
 	brelse(bh);
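The EOPNOTSUPP fallback could be deleted because WRITE_FLUSH_FUA (in this era
WRITE_SYNC | REQ_FLUSH | REQ_FUA) is emulated by the block layer when the
device lacks native cache-flush or forced-unit-access support, so the request
can no longer fail with "barriers unsupported" at completion time. A condensed
sketch of the resulting submission pattern; the helper name is illustrative:

static int write_log_header_sketch(struct gfs2_sbd *sdp,
				   struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
		submit_bh(WRITE_SYNC | REQ_META, bh);	/* user opted out */
	else
		submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); /* preflush + FUA */
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}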
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index b1e9630eb46a..ebef7ab6e17e 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,6 +24,7 @@
 #include "glock.h"
 #include "quota.h"
 #include "recovery.h"
+#include "dir.h"
 
 static struct shrinker qd_shrinker = {
 	.shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
 {
 	int error;
 
+	gfs2_str2qstr(&gfs2_qdot, ".");
+	gfs2_str2qstr(&gfs2_qdotdot, "..");
+
 	error = gfs2_sys_init();
 	if (error)
 		return error;
@@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void)
 
 	error = -ENOMEM;
 	gfs_recovery_wq = alloc_workqueue("gfs_recovery",
-					  WQ_NON_REENTRANT | WQ_RESCUER, 0);
+					  WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
 	if (!gfs_recovery_wq)
 		goto fail_wq;
 
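Building gfs2_qdot and gfs2_qdotdot once at module init lets the ops_inode.c
hunks further down drop the per-call gfs2_str2qstr(&str, ".") boilerplate from
the mkdir, rmdir, rename and lookup paths. A sketch of what the helper fills
in, reproduced from memory of the gfs2 headers of this era, so treat the hash
call as an assumption rather than a verified quote:

struct qstr gfs2_qdot __read_mostly;
struct qstr gfs2_qdotdot __read_mostly;

static void str2qstr_sketch(struct qstr *name, const char *fname)
{
	name->name = fname;		/* borrowed pointer, not copied */
	name->len = strlen(fname);
	name->hash = gfs2_disk_hash(fname, name->len); /* assumed: on-disk
							  dirent hash */
}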
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index f3b071f921aa..939739c7b3f9 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
 	 * activity, but those code paths have their own higher-level
 	 * throttling.
 	 */
-	if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+	if (wbc->sync_mode != WB_SYNC_NONE) {
 		lock_buffer(bh);
 	} else if (!trylock_buffer(bh)) {
 		redirty_page_for_writepage(wbc, page);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4d4b1e8ac64c..cade1acbcea9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -38,14 +38,6 @@
 #define DO 0
 #define UNDO 1
 
-static const u32 gfs2_old_fs_formats[] = {
-	0
-};
-
-static const u32 gfs2_old_multihost_formats[] = {
-	0
-};
-
 /**
  * gfs2_tune_init - Fill a gfs2_tune structure with default values
  * @gt: tune
@@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
 {
-	unsigned int x;
-
 	if (sb->sb_magic != GFS2_MAGIC ||
 	    sb->sb_type != GFS2_METATYPE_SB) {
 		if (!silent)
@@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
 	    sb->sb_multihost_format == GFS2_FORMAT_MULTI)
 		return 0;
 
-	if (sb->sb_fs_format != GFS2_FORMAT_FS) {
-		for (x = 0; gfs2_old_fs_formats[x]; x++)
-			if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
-				break;
+	fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
 
-		if (!gfs2_old_fs_formats[x]) {
-			printk(KERN_WARNING
-			       "GFS2: code version (%u, %u) is incompatible "
-			       "with ondisk format (%u, %u)\n",
-			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-			       sb->sb_fs_format, sb->sb_multihost_format);
-			printk(KERN_WARNING
-			       "GFS2: I don't know how to upgrade this FS\n");
-			return -EINVAL;
-		}
-	}
-
-	if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
-		for (x = 0; gfs2_old_multihost_formats[x]; x++)
-			if (gfs2_old_multihost_formats[x] ==
-			    sb->sb_multihost_format)
-				break;
-
-		if (!gfs2_old_multihost_formats[x]) {
-			printk(KERN_WARNING
-			       "GFS2: code version (%u, %u) is incompatible "
-			       "with ondisk format (%u, %u)\n",
-			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-			       sb->sb_fs_format, sb->sb_multihost_format);
-			printk(KERN_WARNING
-			       "GFS2: I don't know how to upgrade this FS\n");
-			return -EINVAL;
-		}
-	}
-
-	if (!sdp->sd_args.ar_upgrade) {
-		printk(KERN_WARNING
-		       "GFS2: code version (%u, %u) is incompatible "
-		       "with ondisk format (%u, %u)\n",
-		       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-		       sb->sb_fs_format, sb->sb_multihost_format);
-		printk(KERN_INFO
-		       "GFS2: Use the \"upgrade\" mount option to upgrade "
-		       "the FS\n");
-		printk(KERN_INFO "GFS2: See the manual for more details\n");
-		return -EINVAL;
-	}
-
-	return 0;
+	return -EINVAL;
 }
 
 static void end_bio_io_page(struct bio *bio, int error)
@@ -586,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 
 	prev_db = 0;
 
-	for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
+	for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
 		bh.b_state = 0;
 		bh.b_blocknr = 0;
 		bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 	if (!strcmp("lock_nolock", proto)) {
 		lm = &nolock_ops;
 		sdp->sd_args.ar_localflocks = 1;
-		sdp->sd_args.ar_localcaching = 1;
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
 	} else if (!strcmp("lock_dlm", proto)) {
 		lm = &gfs2_dlm_ops;
@@ -1113,8 +1056,6 @@ static int gfs2_journalid_wait(void *word)
 
 static int wait_on_journal(struct gfs2_sbd *sdp)
 {
-	if (sdp->sd_args.ar_spectator)
-		return 0;
 	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
 		return 0;
 
@@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent)
 	if (error)
 		goto fail_sb;
 
+	/*
+	 * If user space has failed to join the cluster or some similar
+	 * failure has occurred, then the journal id will contain a
+	 * negative (error) number. This will then be returned to the
+	 * caller (of the mount syscall). We do this even for spectator
+	 * mounts (which just write a jid of 0 to indicate "ok" even though
+	 * the jid is unused in the spectator case)
+	 */
+	if (sdp->sd_lockstruct.ls_jid < 0) {
+		error = sdp->sd_lockstruct.ls_jid;
+		sdp->sd_lockstruct.ls_jid = 0;
+		goto fail_sb;
+	}
+
 	error = init_inodes(sdp, DO);
 	if (error)
 		goto fail_sb;
@@ -1264,7 +1219,6 @@ fail_sb:
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
-	invalidate_inodes(sb);
 	gfs2_gl_hash_clear(sdp);
 	gfs2_lm_unmount(sdp);
 fail_sys:
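The new check in fill_super() closes the loop on the mount handshake: the
kernel blocks in wait_on_journal() until cluster user space writes a journal
id into sysfs, and a negative value is now an in-band errno that aborts the
mount. A minimal sketch of that convention; the helper name is illustrative:

static int check_jid_sketch(int *jid)
{
	if (*jid < 0) {
		int error = *jid;	/* e.g. -EINVAL from user space */
		*jid = 0;		/* don't leave the errno in sysfs */
		return error;
	}
	return 0;			/* 0 or a real jid: proceed */
}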
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1009be2c9737..12cbea7502c2 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,6 +18,8 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/fiemap.h>
+#include <linux/swap.h>
+#include <linux/falloc.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
@@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 		goto out_gunlock_q;
 
 	error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-				 al->al_rgd->rd_length +
+				 gfs2_rg_blocks(al) +
 				 2 * RES_DINODE + RES_STATFS +
 				 RES_QUOTA, 0);
 	if (error)
@@ -253,7 +255,7 @@ out_parent:
 	gfs2_holder_uninit(ghs);
 	gfs2_holder_uninit(ghs + 1);
 	if (!error) {
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 		d_instantiate(dentry, inode);
 		mark_inode_dirty(inode);
 	}
@@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 
 	ip = ghs[1].gh_gl->gl_object;
 
-	ip->i_disksize = size;
 	i_size_write(inode, size);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ip = ghs[1].gh_gl->gl_object;
 
 	ip->i_inode.i_nlink = 2;
-	ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+	i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
 	ip->i_diskflags |= GFS2_DIF_JDATA;
 	ip->i_entries = 2;
 
@@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (!gfs2_assert_withdraw(sdp, !error)) {
 		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
 		struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
-		struct qstr str;
 
-		gfs2_str2qstr(&str, ".");
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
+		gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
 		dent->de_inum = di->di_num; /* already GFS2 endian */
 		dent->de_type = cpu_to_be16(DT_DIR);
 		di->di_entries = cpu_to_be32(1);
 
-		gfs2_str2qstr(&str, "..");
 		dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
-		gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+		gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
 
 		gfs2_inum_out(dip, dent);
 		dent->de_type = cpu_to_be16(DT_DIR);
@@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 		       struct gfs2_inode *ip)
 {
-	struct qstr dotname;
 	int error;
 
 	if (ip->i_entries != 2) {
@@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 	if (error)
 		return error;
 
-	gfs2_str2qstr(&dotname, ".");
-	error = gfs2_dir_del(ip, &dotname);
+	error = gfs2_dir_del(ip, &gfs2_qdot);
 	if (error)
 		return error;
 
-	gfs2_str2qstr(&dotname, "..");
-	error = gfs2_dir_del(ip, &dotname);
+	error = gfs2_dir_del(ip, &gfs2_qdotdot);
 	if (error)
 		return error;
 
@@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
 	struct inode *dir = &to->i_inode;
 	struct super_block *sb = dir->i_sb;
 	struct inode *tmp;
-	struct qstr dotdot;
 	int error = 0;
 
-	gfs2_str2qstr(&dotdot, "..");
-
 	igrab(dir);
 
 	for (;;) {
@@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
 			break;
 		}
 
-		tmp = gfs2_lookupi(dir, &dotdot, 1);
+		tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
 		if (IS_ERR(tmp)) {
 			error = PTR_ERR(tmp);
 			break;
@@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
 	struct gfs2_inode *nip = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(odir);
-	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
+	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
 	struct gfs2_rgrpd *nrgd;
 	unsigned int num_gh;
 	int dir_rename = 0;
@@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		return 0;
 	}
 
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		return error;
 
 	if (odip != ndip) {
 		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 
 	al->al_requested = sdp->sd_max_dirres;
 
-	error = gfs2_inplace_reserve(ndip);
+	error = gfs2_inplace_reserve_ri(ndip);
 	if (error)
 		goto out_gunlock_q;
 
 	error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-				 al->al_rgd->rd_length +
+				 gfs2_rg_blocks(al) +
 				 4 * RES_DINODE + 4 * RES_LEAF +
 				 RES_STATFS + RES_QUOTA + 4, 0);
 	if (error)
@@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	}
 
 	if (dir_rename) {
-		struct qstr name;
-		gfs2_str2qstr(&name, "..");
-
 		error = gfs2_change_nlink(ndip, +1);
 		if (error)
 			goto out_end_trans;
@@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		if (error)
 			goto out_end_trans;
 
-		error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR);
+		error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
 		if (error)
 			goto out_end_trans;
 	} else {
@@ -972,6 +964,7 @@ out_gunlock_r:
 	if (r_gh.gh_gl)
 		gfs2_glock_dq_uninit(&r_gh);
 out:
+	gfs2_glock_dq_uninit(&ri_gh);
 	return error;
 }
 
@@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
 	struct gfs2_holder i_gh;
 	struct buffer_head *dibh;
-	unsigned int x;
+	unsigned int x, size;
 	char *buf;
 	int error;
 
@@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 		return NULL;
 	}
 
-	if (!ip->i_disksize) {
+	size = (unsigned int)i_size_read(&ip->i_inode);
+	if (size == 0) {
 		gfs2_consist_inode(ip);
 		buf = ERR_PTR(-EIO);
 		goto out;
@@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 		goto out;
 	}
 
-	x = ip->i_disksize + 1;
+	x = size + 1;
 	buf = kmalloc(x, GFP_NOFS);
 	if (!buf)
 		buf = ERR_PTR(-ENOMEM);
@@ -1071,30 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask)
 	return error;
 }
 
-/*
- * XXX(truncate): the truncate_setsize calls should be moved to the end.
- */
-static int setattr_size(struct inode *inode, struct iattr *attr)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	int error;
-
-	if (attr->ia_size != ip->i_disksize) {
-		error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
-		if (error)
-			return error;
-		truncate_setsize(inode, attr->ia_size);
-		gfs2_trans_end(sdp);
-	}
-
-	error = gfs2_truncatei(ip, attr->ia_size);
-	if (error && (inode->i_size != ip->i_disksize))
-		i_size_write(inode, ip->i_disksize);
-
-	return error;
-}
-
 static int setattr_chown(struct inode *inode, struct iattr *attr)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -1195,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		goto out;
 
 	if (attr->ia_valid & ATTR_SIZE)
-		error = setattr_size(inode, attr);
+		error = gfs2_setattr_size(inode, attr->ia_size);
 	else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
 		error = setattr_chown(inode, attr);
 	else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
@@ -1301,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
 	return ret;
 }
 
+static void empty_write_end(struct page *page, unsigned from,
+			    unsigned to)
+{
+	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+
+	page_zero_new_buffers(page, from, to);
+	flush_dcache_page(page);
+	mark_page_accessed(page);
+
+	if (!gfs2_is_writeback(ip))
+		gfs2_page_add_databufs(ip, page, from, to);
+
+	block_commit_write(page, from, to);
+}
+
+
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+{
+	unsigned start, end, next;
+	struct buffer_head *bh, *head;
+	int error;
+
+	if (!page_has_buffers(page)) {
+		error = __block_write_begin(page, from, to - from, gfs2_block_map);
+		if (unlikely(error))
+			return error;
+
+		empty_write_end(page, from, to);
+		return 0;
+	}
+
+	bh = head = page_buffers(page);
+	next = end = 0;
+	while (next < from) {
+		next += bh->b_size;
+		bh = bh->b_this_page;
+	}
+	start = next;
+	do {
+		next += bh->b_size;
+		if (buffer_mapped(bh)) {
+			if (end) {
+				error = __block_write_begin(page, start, end - start,
+							    gfs2_block_map);
+				if (unlikely(error))
+					return error;
+				empty_write_end(page, start, end);
+				end = 0;
+			}
+			start = next;
+		}
+		else
+			end = next;
+		bh = bh->b_this_page;
+	} while (next < to);
+
+	if (end) {
+		error = __block_write_begin(page, start, end - start, gfs2_block_map);
+		if (unlikely(error))
+			return error;
+		empty_write_end(page, start, end);
+	}
+
+	return 0;
+}
+
+static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
+			   int mode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *dibh;
+	int error;
+	u64 start = offset >> PAGE_CACHE_SHIFT;
+	unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
+	u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+	pgoff_t curr;
+	struct page *page;
+	unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
+	unsigned int from, to;
+
+	if (!end_offset)
+		end_offset = PAGE_CACHE_SIZE;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (unlikely(error))
+		goto out;
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+	if (gfs2_is_stuffed(ip)) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (unlikely(error))
+			goto out;
+	}
+
+	curr = start;
+	offset = start << PAGE_CACHE_SHIFT;
+	from = start_offset;
+	to = PAGE_CACHE_SIZE;
+	while (curr <= end) {
+		page = grab_cache_page_write_begin(inode->i_mapping, curr,
+						   AOP_FLAG_NOFS);
+		if (unlikely(!page)) {
+			error = -ENOMEM;
+			goto out;
+		}
+
+		if (curr == end)
+			to = end_offset;
+		error = write_empty_blocks(page, from, to);
+		if (!error && offset + to > inode->i_size &&
+		    !(mode & FALLOC_FL_KEEP_SIZE)) {
+			i_size_write(inode, offset + to);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+		if (error)
+			goto out;
+		curr++;
+		offset += PAGE_CACHE_SIZE;
+		from = 0;
+	}
+
+	gfs2_dinode_out(ip, dibh->b_data);
+	mark_inode_dirty(inode);
+
+	brelse(dibh);
+
+out:
+	return error;
+}
+
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
+			    unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
+	unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
+
+	for (tmp = max_data; tmp > sdp->sd_diptrs;) {
+		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+		max_data -= tmp;
+	}
+	/* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
+	   so it might end up with fewer data blocks */
+	if (max_data <= *data_blocks)
+		return;
+	*data_blocks = max_data;
+	*ind_blocks = max_blocks - max_data;
+	*len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
+	if (*len > max) {
+		*len = max;
+		gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
+	}
+}
+
+static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+			   loff_t len)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+	loff_t bytes, max_bytes;
+	struct gfs2_alloc *al;
+	int error;
+	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+
+	offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
+		 sdp->sd_sb.sb_bsize_shift;
+
+	len = next - offset;
+	bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
+	if (!bytes)
+		bytes = UINT_MAX;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	error = gfs2_glock_nq(&ip->i_gh);
+	if (unlikely(error))
+		goto out_uninit;
+
+	if (!gfs2_write_alloc_required(ip, offset, len))
+		goto out_unlock;
+
+	while (len > 0) {
+		if (len < bytes)
+			bytes = len;
+		al = gfs2_alloc_get(ip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_unlock;
+		}
+
+		error = gfs2_quota_lock_check(ip);
+		if (error)
+			goto out_alloc_put;
+
+retry:
+		gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+
+		al->al_requested = data_blocks + ind_blocks;
+		error = gfs2_inplace_reserve(ip);
+		if (error) {
+			if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
+				bytes >>= 1;
+				goto retry;
+			}
+			goto out_qunlock;
+		}
+		max_bytes = bytes;
+		calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+		al->al_requested = data_blocks + ind_blocks;
+
+		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
+			  RES_RG_HDR + gfs2_rg_blocks(al);
+		if (gfs2_is_jdata(ip))
+			rblocks += data_blocks ? data_blocks : 1;
+
+		error = gfs2_trans_begin(sdp, rblocks,
+					 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+		if (error)
+			goto out_trans_fail;
+
+		error = fallocate_chunk(inode, offset, max_bytes, mode);
+		gfs2_trans_end(sdp);
+
+		if (error)
+			goto out_trans_fail;
+
+		len -= max_bytes;
+		offset += max_bytes;
+		gfs2_inplace_release(ip);
+		gfs2_quota_unlock(ip);
+		gfs2_alloc_put(ip);
+	}
+	goto out_unlock;
+
+out_trans_fail:
+	gfs2_inplace_release(ip);
+out_qunlock:
+	gfs2_quota_unlock(ip);
+out_alloc_put:
+	gfs2_alloc_put(ip);
+out_unlock:
+	gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+	gfs2_holder_uninit(&ip->i_gh);
+	return error;
+}
+
+
 static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len)
 {
@@ -1351,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = {
 	.getxattr = gfs2_getxattr,
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
+	.fallocate = gfs2_fallocate,
 	.fiemap = gfs2_fiemap,
 };
 
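Two details of the new gfs2_fallocate() are worth calling out: each pass is
capped at half of the largest resource group (sd_max_rg_data, computed in the
rgrp.c hunks below), and an -ENOSPC reservation failure halves the chunk
instead of failing the whole call. A standalone sketch of that retry shape,
where reserve() stands in for gfs2_inplace_reserve() and is purely
illustrative:

#include <errno.h>
#include <stdint.h>

extern int reserve(uint64_t bytes);	/* hypothetical backend */

static int fallocate_loop(uint64_t len, uint64_t chunk, uint64_t bsize)
{
	int error;

	while (len > 0) {
		if (len < chunk)
			chunk = len;
retry:
		error = reserve(chunk);
		if (error == -ENOSPC && chunk > bsize) {
			chunk >>= 1;	/* shrink the chunk, try again */
			goto retry;
		}
		if (error)
			return error;
		len -= chunk;
	}
	return 0;
}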
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 1bc6b5695e6d..58a9b9998b42 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,10 +735,8 @@ get_a_page:
 		goto out;
 
 	size = loc + sizeof(struct gfs2_quota);
-	if (size > inode->i_size) {
-		ip->i_disksize = size;
+	if (size > inode->i_size)
 		i_size_write(inode, size);
-	}
 	inode->i_mtime = inode->i_atime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -817,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 		goto out_alloc;
 
 	if (nalloc)
-		blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS;
+		blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
 
 	error = gfs2_trans_begin(sdp, blocks, 0);
 	if (error)
@@ -1190,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
 int gfs2_quota_init(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
-	unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
+	u64 size = i_size_read(sdp->sd_qc_inode);
+	unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
 	unsigned int x, slot = 0;
 	unsigned int found = 0;
 	u64 dblock;
 	u32 extlen = 0;
 	int error;
 
-	if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
-	    ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
-		gfs2_consist_inode(ip);
+	if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
 		return -EIO;
-	}
+
 	sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
 	sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
 
@@ -1589,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 		error = gfs2_inplace_reserve(ip);
 		if (error)
 			goto out_alloc;
+		blocks += gfs2_rg_blocks(al);
 	}
 
 	error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f7f89a94a5a4..f2a02edcac8f 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work)
 	int ro = 0;
 	unsigned int pass;
 	int error;
+	int jlocked = 0;
 
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+	if (sdp->sd_args.ar_spectator ||
+	    (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
 		fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
 			jd->jd_jid);
-
+		jlocked = 1;
 		/* Acquire the journal lock so we can do recovery */
 
 		error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work)
 			jd->jd_jid, t);
 	}
 
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
-		gfs2_glock_dq_uninit(&ji_gh);
-
 	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
 
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+	if (jlocked) {
+		gfs2_glock_dq_uninit(&ji_gh);
 		gfs2_glock_dq_uninit(&j_gh);
+	}
 
 	fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
 	goto done;
@@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work)
 fail_gunlock_tr:
 	gfs2_glock_dq_uninit(&t_gh);
 fail_gunlock_ji:
-	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+	if (jlocked) {
 		gfs2_glock_dq_uninit(&ji_gh);
 fail_gunlock_j:
 		gfs2_glock_dq_uninit(&j_gh);
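The jlocked flag replaces three separate jd_jid comparisons with a record of
whether the journal locks were actually taken, which also lets spectator
mounts (which never own a journal id) recover other nodes' journals and still
unlock correctly on every exit path. The generic shape of the pattern, with
hypothetical helpers:

extern int take_lock(void);	/* hypothetical */
extern int recover(void);	/* hypothetical */
extern void drop_lock(void);	/* hypothetical */

static int do_recovery(int need_lock)
{
	int locked = 0;
	int error;

	if (need_lock) {
		error = take_lock();
		if (error)
			return error;
		locked = 1;	/* remember it; don't re-derive it later */
	}

	error = recover();

	if (locked)		/* matches exactly one take_lock() */
		drop_lock();
	return error;
}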
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e45..bef3ab6cf5c1 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
 	for (rgrps = 0;; rgrps++) {
 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
 
-		if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
+		if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
 			break;
 		error = gfs2_internal_read(ip, &ra_state, buf, &pos,
 					   sizeof(struct gfs2_rindex));
@@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
 	struct file_ra_state ra_state;
-	u64 rgrp_count = ip->i_disksize;
+	u64 rgrp_count = i_size_read(inode);
+	struct gfs2_rgrpd *rgd;
+	unsigned int max_data = 0;
 	int error;
 
 	do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 		}
 	}
 
+	list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+		if (rgd->rd_data > max_data)
+			max_data = rgd->rd_data;
+	sdp->sd_max_rg_data = max_data;
 	sdp->sd_rindex_uptodate = 1;
 	return 0;
 }
@@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
 	struct file_ra_state ra_state;
+	struct gfs2_rgrpd *rgd;
+	unsigned int max_data = 0;
 	int error;
 
 	file_ra_state_init(&ra_state, inode->i_mapping);
 	for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
 		/* Ignore partials */
 		if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-		    ip->i_disksize)
+		    i_size_read(inode))
 			break;
 		error = read_rindex_entry(ip, &ra_state);
 		if (error) {
@@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
 			return error;
 		}
 	}
+	list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+		if (rgd->rd_data > max_data)
+			max_data = rgd->rd_data;
+	sdp->sd_max_rg_data = max_data;
 
 	sdp->sd_rindex_uptodate = 1;
 	return 0;
@@ -854,8 +866,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 			if ((start + nr_sects) != blk) {
 				rv = blkdev_issue_discard(bdev, start,
 							  nr_sects, GFP_NOFS,
-							  BLKDEV_IFL_WAIT |
-							  BLKDEV_IFL_BARRIER);
+							  0);
 				if (rv)
 					goto fail;
 				nr_sects = 0;
@@ -869,8 +880,7 @@ start_new_extent:
 		}
 	}
 	if (nr_sects) {
-		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-					  BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
 		if (rv)
 			goto fail;
 	}
@@ -1188,7 +1198,8 @@ out:
  * Returns: errno
  */
 
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+			   char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
@@ -1199,12 +1210,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 		return -EINVAL;
 
 try_again:
-	/* We need to hold the rindex unless the inode we're using is
-	   the rindex itself, in which case it's already held. */
-	if (ip != GFS2_I(sdp->sd_rindex))
-		error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
-	else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
-		error = gfs2_ri_update_special(ip);
+	if (hold_rindex) {
+		/* We need to hold the rindex unless the inode we're using is
+		   the rindex itself, in which case it's already held. */
+		if (ip != GFS2_I(sdp->sd_rindex))
+			error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+		else if (!sdp->sd_rgrps) /* We may not have the rindex read
+					    in, so: */
+			error = gfs2_ri_update_special(ip);
+	}
 
 	if (error)
 		return error;
@@ -1215,7 +1229,7 @@ try_again:
 	   try to free it, and try the allocation again. */
 	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
 	if (error) {
-		if (ip != GFS2_I(sdp->sd_rindex))
+		if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
 			gfs2_glock_dq_uninit(&al->al_ri_gh);
 		if (error != -EAGAIN)
 			return error;
@@ -1257,7 +1271,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 	al->al_rgd = NULL;
 	if (al->al_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_rgd_gh);
-	if (ip != GFS2_I(sdp->sd_rindex))
+	if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
 		gfs2_glock_dq_uninit(&al->al_ri_gh);
 }
 
@@ -1496,11 +1510,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct gfs2_rgrpd *rgd = al->al_rgd;
+	struct gfs2_rgrpd *rgd;
 	u32 goal, blk;
 	u64 block;
 	int error;
 
+	/* Only happens if there is a bug in gfs2, return something distinctive
+	 * to ensure that it is noticed.
+	 */
+	if (al == NULL)
+		return -ECANCELED;
+
+	rgd = al->al_rgd;
+
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
 	else
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index f07119d89557..0e35c0466f9a 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 	ip->i_alloc = NULL;
 }
 
-extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
-				  unsigned int line);
+extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+				  char *file, unsigned int line);
 #define gfs2_inplace_reserve(ip) \
-gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
+	gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
+#define gfs2_inplace_reserve_ri(ip) \
+	gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
 
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
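The wrappers stay macros rather than inline functions so that __FILE__ and
__LINE__ expand at the call site; gfs2_rename() can pass hold_rindex == 0 via
gfs2_inplace_reserve_ri() because it already holds the rindex glock. A
standalone illustration of the call-site-capture idiom:

#include <stdio.h>

static int do_reserve_i(int hold_rindex, const char *file, unsigned int line)
{
	fprintf(stderr, "reserve(hold=%d) called from %s:%u\n",
		hold_rindex, file, line);
	return 0;
}

#define do_reserve()	do_reserve_i(1, __FILE__, __LINE__)
#define do_reserve_ri()	do_reserve_i(0, __FILE__, __LINE__)

int main(void)
{
	return do_reserve() | do_reserve_ri(); /* logs two distinct call sites */
}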
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 77cb9f830ee4..2b2c4997430b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
 	{Opt_locktable, "locktable=%s"},
 	{Opt_hostdata, "hostdata=%s"},
 	{Opt_spectator, "spectator"},
+	{Opt_spectator, "norecovery"},
 	{Opt_ignore_local_fs, "ignore_local_fs"},
 	{Opt_localflocks, "localflocks"},
 	{Opt_localcaching, "localcaching"},
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_spectator = 1;
 			break;
 		case Opt_ignore_local_fs:
-			args->ar_ignore_local_fs = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_localflocks:
 			args->ar_localflocks = 1;
 			break;
 		case Opt_localcaching:
-			args->ar_localcaching = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_debug:
 			if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 			args->ar_debug = 0;
 			break;
 		case Opt_upgrade:
-			args->ar_upgrade = 1;
+			/* Retained for backwards compat only */
 			break;
 		case Opt_acl:
 			args->ar_posix_acl = 1;
@@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	u64 size = i_size_read(jd->jd_inode);
 
-	if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
-	    (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
-		gfs2_consist_inode(ip);
+	if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
 		return -EIO;
-	}
-	jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
 
-	if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) {
+	jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
+
+	if (gfs2_write_alloc_required(ip, 0, size)) {
 		gfs2_consist_inode(ip);
 		return -EIO;
 	}
@@ -857,7 +857,6 @@ restart:
 	gfs2_clear_rgrpd(sdp);
 	gfs2_jindex_free(sdp);
 	/* Take apart glock structures and buffer lists */
-	invalidate_inodes(sdp->sd_vfs);
 	gfs2_gl_hash_clear(sdp);
 	/* Unmount the locking protocol */
 	gfs2_lm_unmount(sdp);
@@ -1129,9 +1128,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 
 	/* Some flags must not be changed */
 	if (args_neq(&args, &sdp->sd_args, spectator) ||
-	    args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
 	    args_neq(&args, &sdp->sd_args, localflocks) ||
-	    args_neq(&args, &sdp->sd_args, localcaching) ||
 	    args_neq(&args, &sdp->sd_args, meta))
 		return -EINVAL;
 
@@ -1234,16 +1231,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
 	if (args->ar_spectator)
 		seq_printf(s, ",spectator");
-	if (args->ar_ignore_local_fs)
-		seq_printf(s, ",ignore_local_fs");
 	if (args->ar_localflocks)
 		seq_printf(s, ",localflocks");
-	if (args->ar_localcaching)
-		seq_printf(s, ",localcaching");
 	if (args->ar_debug)
 		seq_printf(s, ",debug");
-	if (args->ar_upgrade)
-		seq_printf(s, ",upgrade");
 	if (args->ar_posix_acl)
 		seq_printf(s, ",acl");
 	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ccacffd2faaa..748ccb557c18 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
230 230
231 if (gltype > LM_TYPE_JOURNAL) 231 if (gltype > LM_TYPE_JOURNAL)
232 return -EINVAL; 232 return -EINVAL;
233 glops = gfs2_glops_list[gltype]; 233 if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
234 glops = &gfs2_trans_glops;
235 else
236 glops = gfs2_glops_list[gltype];
234 if (glops == NULL) 237 if (glops == NULL)
235 return -EINVAL; 238 return -EINVAL;
236 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) 239 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
@@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
399 402
400static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) 403static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
401{ 404{
402 return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); 405 return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
403} 406}
404 407
405static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) 408static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
406{ 409{
407 unsigned jid; 410 int jid;
408 int rv; 411 int rv;
409 412
410 rv = sscanf(buf, "%u", &jid); 413 rv = sscanf(buf, "%d", &jid);
411 if (rv != 1) 414 if (rv != 1)
412 return -EINVAL; 415 return -EINVAL;
413 416
414 spin_lock(&sdp->sd_jindex_spin); 417 spin_lock(&sdp->sd_jindex_spin);
415 rv = -EINVAL; 418 rv = -EINVAL;
416 if (sdp->sd_args.ar_spectator)
417 goto out;
418 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) 419 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
419 goto out; 420 goto out;
420 rv = -EBUSY; 421 rv = -EBUSY;
421 if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) 422 if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
422 goto out; 423 goto out;
424 rv = 0;
425 if (sdp->sd_args.ar_spectator && jid > 0)
426 rv = jid = -EINVAL;
423 sdp->sd_lockstruct.ls_jid = jid; 427 sdp->sd_lockstruct.ls_jid = jid;
428 clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
424 smp_mb__after_clear_bit(); 429 smp_mb__after_clear_bit();
425 wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); 430 wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
426 rv = 0;
427out: 431out:
428 spin_unlock(&sdp->sd_jindex_spin); 432 spin_unlock(&sdp->sd_jindex_spin);
429 return rv ? rv : len; 433 return rv ? rv : len;
@@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
 	if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
-		add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
+		add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
 	if (gfs2_uuid_valid(uuid))
 		add_uevent_var(env, "UUID=%pUB", uuid);
 	return 0;
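The reworked jid_store() publishes the journal id before clearing SDF_NOJOURNALID, with a barrier in between, so a waiter woken by wake_up_bit() never sees the flag cleared while ls_jid is still stale. An illustration of that publish order with C11 atomics (stand-in names, illustrative only, not the kernel API):

	#include <stdatomic.h>
	#include <stdio.h>

	static int ls_jid;
	static atomic_int no_journal_id = 1;

	static void store_jid(int jid)
	{
		ls_jid = jid;					/* ls_jid = jid   */
		atomic_store_explicit(&no_journal_id, 0,	/* clear_bit plus */
				      memory_order_release);	/* memory barrier */
		/* wake_up_bit(&sd_flags, SDF_NOJOURNALID) would follow here */
	}

	int main(void)
	{
		store_jid(0);
		printf("jid=%d pending=%d\n", ls_jid,
		       atomic_load(&no_journal_id));
		return 0;
	}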
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 148d55c14171..cedb0bb96d96 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
 	{(1UL << GLF_INVALIDATE_IN_PROGRESS),	"i" },		\
 	{(1UL << GLF_REPLY_PENDING),		"r" },		\
 	{(1UL << GLF_INITIAL),			"I" },		\
-	{(1UL << GLF_FROZEN),			"F" })
+	{(1UL << GLF_FROZEN),			"F" },		\
+	{(1UL << GLF_QUEUED),			"q" })
 
 #ifndef NUMPTY
 #define NUMPTY
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index edf9d4bd908e..fb56b783e028 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,11 +20,20 @@ struct gfs2_glock;
 #define RES_JDATA	1
 #define RES_DATA	1
 #define RES_LEAF	1
+#define RES_RG_HDR	1
 #define RES_RG_BIT	2
 #define RES_EATTR	1
 #define RES_STATFS	1
 #define RES_QUOTA	2
 
+/* reserve either the number of blocks to be allocated plus the rg header
+ * block, or all of the blocks in the rg, whichever is smaller */
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
+{
+	return (al->al_requested < al->al_rgd->rd_length) ?
+	       al->al_requested + 1 : al->al_rgd->rd_length;
+}
+
 int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 		     unsigned int revokes);
 
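gfs2_rg_blocks() reserves the requested blocks plus one resource-group header block, clamped to the size of the whole resource group. A self-contained sketch with simplified stand-in structs (not the kernel's gfs2_alloc/gfs2_rgrpd) showing the clamping:

	#include <stdio.h>

	struct rgrpd { unsigned int rd_length; };	/* blocks in the rg */
	struct alloc { unsigned int al_requested; struct rgrpd *al_rgd; };

	/* Mirror of the helper added above: request + 1 header block,
	 * capped at the whole resource group. */
	static unsigned int rg_blocks(const struct alloc *al)
	{
		return (al->al_requested < al->al_rgd->rd_length) ?
			al->al_requested + 1 : al->al_rgd->rd_length;
	}

	int main(void)
	{
		struct rgrpd rgd = { .rd_length = 100 };
		struct alloc small = { .al_requested = 10,  .al_rgd = &rgd };
		struct alloc large = { .al_requested = 500, .al_rgd = &rgd };

		printf("%u\n", rg_blocks(&small)); /* 11: blocks + rg header */
		printf("%u\n", rg_blocks(&large)); /* 100: capped at the rg  */
		return 0;
	}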
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 776af6eb4bcb..30b58f07c8a6 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		goto out_gunlock_q;
 
 	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-				 blks + al->al_rgd->rd_length +
+				 blks + gfs2_rg_blocks(al) +
 				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
 	if (error)
 		goto out_ipres;
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 4129cdb3f0d8..571abe97b42a 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
 	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-	down(&tree->tree_lock);
+	mutex_lock(&tree->tree_lock);
 	return 0;
 }
 
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
 	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-	up(&fd->tree->tree_lock);
+	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
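Both hunks replace a binary semaphore used as a lock (down()/up()) with a proper mutex, which gives the kernel owner tracking and lockdep coverage. A userspace analogue of the same pairing with pthreads, illustrative only:

	#include <pthread.h>
	#include <stdio.h>

	/* One lock per tree, as with tree->tree_lock above. */
	static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

	static void find_init(void) { pthread_mutex_lock(&tree_lock); }
	static void find_exit(void) { pthread_mutex_unlock(&tree_lock); }

	int main(void)
	{
		find_init();
		/* ... b-tree search would run here, serialized per tree ... */
		find_exit();
		puts("lock/unlock paired as in hfs_find_init/hfs_find_exit");
		return 0;
	}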
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 38a0a9917d7f..3ebc437736fe 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	if (!tree)
 		return NULL;
 
-	init_MUTEX(&tree->tree_lock);
+	mutex_init(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
 	/* Set the correct compare function */
 	tree->sb = sb;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index cc51905ac21d..2a1d712f85dc 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -33,7 +33,7 @@ struct hfs_btree {
 	unsigned int depth;
 
 	//unsigned int map1_size, map_size;
-	struct semaphore tree_lock;
+	struct mutex tree_lock;
 
 	unsigned int pages_per_bnode;
 	spinlock_t hash_lock;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 4f55651aaa51..c8cffb81e849 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -147,8 +147,6 @@ struct hfs_sb_info {
 	u16 blockoffset;
 
 	int fs_div;
-
-	struct hlist_head rsrc_inodes;
 };
 
 #define HFS_FLG_BITMAP_DIRTY	0
@@ -254,17 +252,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb)
 	sb->s_dirt = 1;
 }
 
-static inline void hfs_buffer_sync(struct buffer_head *bh)
-{
-	while (buffer_locked(bh)) {
-		wait_on_buffer(bh);
-	}
-	if (buffer_dirty(bh)) {
-		ll_rw_block(WRITE, 1, &bh);
-		wait_on_buffer(bh);
-	}
-}
-
 #define sb_bread512(sb, sec, data) ({			\
 	struct buffer_head *__bh;			\
 	sector_t __block;				\
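The deleted hfs_buffer_sync() open-coded "wait for in-flight I/O, then write back if dirty and wait again", which is what the generic sync_dirty_buffer() already provides (see the fs/hfs/mdb.c hunks below). A toy model of that sequence, with plain ints standing in for buffer_head state, illustration only:

	#include <stdio.h>

	struct buffer {
		int locked;	/* I/O in flight   */
		int dirty;	/* needs writeback */
	};

	static void wait_on(struct buffer *bh)    { bh->locked = 0; }
	static void write_back(struct buffer *bh) { bh->dirty = 0; bh->locked = 1; }

	/* What the removed helper did, and what sync_dirty_buffer() covers. */
	static void sync_dirty(struct buffer *bh)
	{
		while (bh->locked)
			wait_on(bh);
		if (bh->dirty) {
			write_back(bh);
			wait_on(bh);
		}
	}

	int main(void)
	{
		struct buffer bh = { .locked = 1, .dirty = 1 };
		sync_dirty(&bh);
		printf("locked=%d dirty=%d\n", bh.locked, bh.dirty); /* 0 0 */
		return 0;
	}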
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 397b7adc7ce6..dffb4e996643 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -524,7 +524,7 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
 	HFS_I(inode)->rsrc_inode = dir;
 	HFS_I(dir)->rsrc_inode = inode;
 	igrab(dir);
-	hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes);
+	hlist_add_fake(&inode->i_hash);
 	mark_inode_dirty(inode);
 out:
 	d_add(dentry, inode);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 86428f5ac991..1563d5ce5764 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb)
 		mdb->drLsMod = hfs_mtime();
 
 		mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
-		hfs_buffer_sync(HFS_SB(sb)->mdb_bh);
+		sync_dirty_buffer(HFS_SB(sb)->mdb_bh);
 	}
 
 	return 0;
@@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb)
 		HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
 		HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
 		mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh);
-		hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh);
+		sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh);
 	}
 
 	if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) {
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 34235d4bf08b..6ee1586f2334 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -20,7 +20,6 @@
 #include <linux/parser.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 
 #include "hfs_fs.h"
@@ -79,15 +78,11 @@ static int hfs_sync_fs(struct super_block *sb, int wait)
  */
 static void hfs_put_super(struct super_block *sb)
 {
-	lock_kernel();
-
 	if (sb->s_dirt)
 		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
-
-	unlock_kernel();
 }
 
 /*
@@ -385,8 +380,8 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
 	sb->s_fs_info = sbi;
-	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
 
 	res = -EINVAL;
 	if (!parse_options((char *)data, sbi)) {
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5007a41f1be9..d182438c7ae4 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
 	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-	down(&tree->tree_lock);
+	mutex_lock(&tree->tree_lock);
 	return 0;
 }
 
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
 	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-	up(&fd->tree->tree_lock);
+	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 		rec = (e + b) / 2;
 		len = hfs_brec_lenoff(bnode, rec, &off);
 		keylen = hfs_brec_keylen(bnode, rec);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 		cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
 		if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 	if (rec != e && e >= 0) {
 		len = hfs_brec_lenoff(bnode, e, &off);
 		keylen = hfs_brec_keylen(bnode, e);
+		if (keylen == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 	}
 done:
@@ -75,6 +83,7 @@ done:
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
 	fd->entrylength = len - keylen;
+fail:
 	return res;
 }
 
@@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
 
 	len = hfs_brec_lenoff(bnode, fd->record, &off);
 	keylen = hfs_brec_keylen(bnode, fd->record);
+	if (keylen == 0) {
+		res = -EINVAL;
+		goto out;
+	}
 	fd->keyoffset = off;
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
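hfs_brec_keylen() can now return 0 for a corrupted record, and each caller above converts that into -EINVAL rather than reading a bogus key. A stand-alone sketch of the added error path (made-up values, not the real b-tree):

	#include <stdio.h>

	#define EINVAL 22

	static unsigned brec_keylen(int rec)
	{
		return rec < 0 ? 0 : 6;	/* 0 signals a corrupted record */
	}

	static int brec_find(int rec)
	{
		int res = 0;
		unsigned keylen = brec_keylen(rec);

		if (keylen == 0) {
			res = -EINVAL;	/* refuse to touch the record */
			goto fail;
		}
		/* ... hfs_bnode_read() and key compare would go here ... */
	fail:
		return res;
	}

	int main(void)
	{
		printf("%d %d\n", brec_find(1), brec_find(-1)); /* 0 -22 */
		return 0;
	}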
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ea30afc2a03c..ad57f5991eb1 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -17,6 +17,7 @@
 
 int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
 	struct address_space *mapping;
 	__be32 *pptr, *curr, *end;
@@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		return size;
 
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
-	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
 	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
 	if (IS_ERR(page)) {
 		start = size;
@@ -150,16 +151,17 @@ done:
 	set_page_dirty(page);
 	kunmap(page);
 	*max = offset + (curr - pptr) * 32 + i - start;
-	HFSPLUS_SB(sb).free_blocks -= *max;
+	sbi->free_blocks -= *max;
 	sb->s_dirt = 1;
 	dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
 out:
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 	return start;
 }
 
 int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
 	struct address_space *mapping;
 	__be32 *pptr, *curr, *end;
@@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 
 	dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
 	/* are all of the bits in range? */
-	if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
+	if ((offset + count) > sbi->total_blocks)
 		return -2;
 
-	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+	mutex_lock(&sbi->alloc_mutex);
+	mapping = sbi->alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
 	page = read_mapping_page(mapping, pnr, NULL);
 	pptr = kmap(page);
@@ -224,9 +226,9 @@ done:
 out:
 	set_page_dirty(page);
 	kunmap(page);
-	HFSPLUS_SB(sb).free_blocks += len;
+	sbi->free_blocks += len;
 	sb->s_dirt = 1;
-	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+	mutex_unlock(&sbi->alloc_mutex);
 
 	return 0;
 }
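The allocator now serializes on a dedicated sbi->alloc_mutex instead of overloading the allocation file's i_mutex, keeping lock ordering against page I/O simple. A pthread sketch of the same dedicated-lock pattern (hypothetical fields, illustration only):

	#include <pthread.h>
	#include <stdio.h>

	struct sb_info {
		pthread_mutex_t alloc_mutex;	/* guards only the allocator */
		unsigned free_blocks;
	};

	static unsigned block_allocate(struct sb_info *sbi, unsigned want)
	{
		pthread_mutex_lock(&sbi->alloc_mutex);
		if (want > sbi->free_blocks)
			want = sbi->free_blocks;
		sbi->free_blocks -= want;	/* bitmap scan elided */
		pthread_mutex_unlock(&sbi->alloc_mutex);
		return want;
	}

	int main(void)
	{
		struct sb_info sbi = { PTHREAD_MUTEX_INITIALIZER, 100 };
		printf("got %u, left %u\n", block_allocate(&sbi, 30),
		       sbi.free_blocks);
		return 0;
	}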
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index c88e5d72a402..2f39d05443e1 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
 		recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
 		if (!recoff)
 			return 0;
-		if (node->tree->attributes & HFS_TREE_BIGKEYS)
-			retval = hfs_bnode_read_u16(node, recoff) + 2;
-		else
-			retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+
+		retval = hfs_bnode_read_u16(node, recoff) + 2;
+		if (retval > node->tree->max_key_len + 2) {
+			printk(KERN_ERR "hfs: keylen %d too large\n",
+			       retval);
+			retval = 0;
+		}
 	}
 	return retval;
 }
@@ -216,7 +219,7 @@ skip:
 static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 {
 	struct hfs_btree *tree;
-	struct hfs_bnode *node, *new_node;
+	struct hfs_bnode *node, *new_node, *next_node;
 	struct hfs_bnode_desc node_desc;
 	int num_recs, new_rec_off, new_off, old_rec_off;
 	int data_start, data_end, size;
@@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	new_node->type = node->type;
 	new_node->height = node->height;
 
+	if (node->next)
+		next_node = hfs_bnode_find(tree, node->next);
+	else
+		next_node = NULL;
+
+	if (IS_ERR(next_node)) {
+		hfs_bnode_put(node);
+		hfs_bnode_put(new_node);
+		return next_node;
+	}
+
 	size = tree->node_size / 2 - node->num_recs * 2 - 14;
 	old_rec_off = tree->node_size - 4;
 	num_recs = 1;
@@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 			/* panic? */
 			hfs_bnode_put(node);
 			hfs_bnode_put(new_node);
+			if (next_node)
+				hfs_bnode_put(next_node);
 			return ERR_PTR(-ENOSPC);
 		}
 
@@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
 
 	/* update next bnode header */
-	if (new_node->next) {
-		struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
+	if (next_node) {
 		next_node->prev = new_node->this;
 		hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
 		node_desc.prev = cpu_to_be32(next_node->prev);
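The new bounds check in hfs_brec_keylen() trusts an on-disk key length only if it fits within the tree's max_key_len; anything larger is treated as corruption and mapped to 0 (which callers turn into -EINVAL, as above). A stand-alone sketch of the check:

	#include <stdio.h>

	/* Stand-in values, not the real b-tree layout. */
	static unsigned keylen_checked(unsigned on_disk_len, unsigned max_key_len)
	{
		unsigned retval = on_disk_len + 2;	/* length word included */

		if (retval > max_key_len + 2) {
			fprintf(stderr, "keylen %u too large\n", retval);
			retval = 0;		/* callers map 0 to -EINVAL */
		}
		return retval;
	}

	int main(void)
	{
		printf("%u\n", keylen_checked(6, 10));	 /* 8: accepted */
		printf("%u\n", keylen_checked(500, 10)); /* 0: rejected */
		return 0;
	}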
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index e49fcee1e293..22e4d4e32999 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	if (!tree)
 		return NULL;
 
-	init_MUTEX(&tree->tree_lock);
+	mutex_init(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
 	tree->sb = sb;
 	tree->cnid = id;
@@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 		goto free_tree;
 	tree->inode = inode;
 
+	if (!HFSPLUS_I(tree->inode)->first_blocks) {
+		printk(KERN_ERR
+		       "hfs: invalid btree extent records (0 size).\n");
+		goto free_inode;
+	}
+
 	mapping = tree->inode->i_mapping;
 	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
-		goto free_tree;
+		goto free_inode;
 
 	/* Load the header */
 	head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	tree->max_key_len = be16_to_cpu(head->max_key_len);
 	tree->depth = be16_to_cpu(head->depth);
 
-	/* Set the correct compare function */
-	if (id == HFSPLUS_EXT_CNID) {
+	/* Verify the tree and set the correct compare function */
+	switch (id) {
+	case HFSPLUS_EXT_CNID:
+		if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
+			printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
+			       tree->max_key_len);
+			goto fail_page;
+		}
+		if (tree->attributes & HFS_TREE_VARIDXKEYS) {
+			printk(KERN_ERR "hfs: invalid extent btree flag\n");
+			goto fail_page;
+		}
+
 		tree->keycmp = hfsplus_ext_cmp_key;
-	} else if (id == HFSPLUS_CAT_CNID) {
-		if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
+		break;
+	case HFSPLUS_CAT_CNID:
+		if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
+			printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
+			       tree->max_key_len);
+			goto fail_page;
+		}
+		if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
+			printk(KERN_ERR "hfs: invalid catalog btree flag\n");
+			goto fail_page;
+		}
+
+		if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
 		    (head->key_type == HFSPLUS_KEY_BINARY))
 			tree->keycmp = hfsplus_cat_bin_cmp_key;
 		else {
 			tree->keycmp = hfsplus_cat_case_cmp_key;
-			HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD;
+			set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
 		}
-	} else {
+		break;
+	default:
 		printk(KERN_ERR "hfs: unknown B*Tree requested\n");
 		goto fail_page;
 	}
 
+	if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
+		printk(KERN_ERR "hfs: invalid btree flag\n");
+		goto fail_page;
+	}
+
 	size = tree->node_size;
 	if (!is_power_of_2(size))
 		goto fail_page;
 	if (!tree->node_count)
 		goto fail_page;
+
 	tree->node_size_shift = ffs(size) - 1;
 
 	tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	return tree;
 
  fail_page:
-	tree->inode->i_mapping->a_ops = &hfsplus_aops;
 	page_cache_release(page);
- free_tree:
+ free_inode:
+	tree->inode->i_mapping->a_ops = &hfsplus_aops;
 	iput(tree->inode);
+ free_tree:
 	kfree(tree);
 	return NULL;
 }
@@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 
 	while (!tree->free_nodes) {
 		struct inode *inode = tree->inode;
+		struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 		u32 count;
 		int res;
 
 		res = hfsplus_file_extend(inode);
 		if (res)
 			return ERR_PTR(res);
-		HFSPLUS_I(inode).phys_size = inode->i_size =
-				(loff_t)HFSPLUS_I(inode).alloc_blocks <<
-				HFSPLUS_SB(tree->sb).alloc_blksz_shift;
-		HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks <<
-				HFSPLUS_SB(tree->sb).fs_shift;
+		hip->phys_size = inode->i_size =
+			(loff_t)hip->alloc_blocks <<
+				HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
+		hip->fs_blocks =
+			hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
 		inode_set_bytes(inode, inode->i_size);
 		count = inode->i_size >> tree->node_size_shift;
 		tree->free_nodes = count - tree->node_count;
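hfs_btree_open() now validates the on-disk header before trusting it: each CNID implies a fixed max_key_len and a required VARIDXKEYS setting. A sketch of that verification with made-up constants standing in for HFSPLUS_EXT_KEYLEN and friends:

	#include <stdio.h>

	enum { EXT_CNID = 3, CAT_CNID = 4 };
	#define EXT_KEYLEN	10
	#define CAT_KEYLEN	516
	#define VARIDXKEYS	0x04

	static int verify_tree(unsigned id, unsigned max_key_len, unsigned attr)
	{
		switch (id) {
		case EXT_CNID:	/* extent keys are fixed size, never variable */
			if (max_key_len != EXT_KEYLEN - 2 || (attr & VARIDXKEYS))
				return -1;
			return 0;
		case CAT_CNID:	/* catalog keys must be variable size */
			if (max_key_len != CAT_KEYLEN - 2 || !(attr & VARIDXKEYS))
				return -1;
			return 0;
		default:
			return -1;	/* unknown B*Tree */
		}
	}

	int main(void)
	{
		printf("%d\n", verify_tree(EXT_CNID, EXT_KEYLEN - 2, 0)); /* 0  */
		printf("%d\n", verify_tree(CAT_CNID, 8, VARIDXKEYS));     /* -1 */
		return 0;
	}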
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index f6874acb2cf2..8af45fc5b051 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
 	key->key_len = cpu_to_be16(6 + ustrlen);
 }
 
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 {
 	if (inode->i_flags & S_IMMUTABLE)
 		perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 		perms->rootflags |= HFSPLUS_FLG_APPEND;
 	else
 		perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-	HFSPLUS_I(inode).rootflags = perms->rootflags;
-	HFSPLUS_I(inode).userflags = perms->userflags;
+
+	perms->userflags = HFSPLUS_I(inode)->userflags;
 	perms->mode = cpu_to_be16(inode->i_mode);
 	perms->owner = cpu_to_be32(inode->i_uid);
 	perms->group = cpu_to_be32(inode->i_gid);
+
+	if (S_ISREG(inode->i_mode))
+		perms->dev = cpu_to_be32(inode->i_nlink);
+	else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
+		perms->dev = cpu_to_be32(inode->i_rdev);
+	else
+		perms->dev = 0;
 }
 
 static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+
 	if (S_ISDIR(inode->i_mode)) {
 		struct hfsplus_cat_folder *folder;
 
@@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 		memset(folder, 0, sizeof(*folder));
 		folder->type = cpu_to_be16(HFSPLUS_FOLDER);
 		folder->id = cpu_to_be32(inode->i_ino);
-		HFSPLUS_I(inode).create_date =
+		HFSPLUS_I(inode)->create_date =
 			folder->create_date =
 			folder->content_mod_date =
 			folder->attribute_mod_date =
 			folder->access_date = hfsp_now2mt();
-		hfsplus_set_perms(inode, &folder->permissions);
-		if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir)
+		hfsplus_cat_set_perms(inode, &folder->permissions);
+		if (inode == sbi->hidden_dir)
 			/* invisible and namelocked */
 			folder->user_info.frFlags = cpu_to_be16(0x5000);
 		return sizeof(*folder);
@@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 		file->type = cpu_to_be16(HFSPLUS_FILE);
 		file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
 		file->id = cpu_to_be32(cnid);
-		HFSPLUS_I(inode).create_date =
+		HFSPLUS_I(inode)->create_date =
 			file->create_date =
 			file->content_mod_date =
 			file->attribute_mod_date =
 			file->access_date = hfsp_now2mt();
 		if (cnid == inode->i_ino) {
-			hfsplus_set_perms(inode, &file->permissions);
+			hfsplus_cat_set_perms(inode, &file->permissions);
 			if (S_ISLNK(inode->i_mode)) {
 				file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
 				file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
 			} else {
-				file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type);
-				file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator);
+				file->user_info.fdType = cpu_to_be32(sbi->type);
+				file->user_info.fdCreator = cpu_to_be32(sbi->creator);
 			}
 			if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
 				file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
@@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 			file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
 			file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
 			file->user_info.fdFlags = cpu_to_be16(0x100);
-			file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date;
-			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev);
+			file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
+			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
 		}
 		return sizeof(*file);
 	}
@@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
 
 int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
 {
+	struct super_block *sb = dir->i_sb;
 	struct hfs_find_data fd;
-	struct super_block *sb;
 	hfsplus_cat_entry entry;
 	int entry_size;
 	int err;
 
 	dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
-	sb = dir->i_sb;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 
 	hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
 	entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -234,7 +242,7 @@ err2:
 
 int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 {
-	struct super_block *sb;
+	struct super_block *sb = dir->i_sb;
 	struct hfs_find_data fd;
 	struct hfsplus_fork_raw fork;
 	struct list_head *pos;
@@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 	u16 type;
 
 	dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
-	sb = dir->i_sb;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 
 	if (!str) {
 		int len;
@@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 		hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
 	}
 
-	list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) {
+	list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
 		struct hfsplus_readdir_data *rd =
 			list_entry(pos, struct hfsplus_readdir_data, list);
 		if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
@@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid,
 		       struct inode *src_dir, struct qstr *src_name,
 		       struct inode *dst_dir, struct qstr *dst_name)
 {
-	struct super_block *sb;
+	struct super_block *sb = src_dir->i_sb;
 	struct hfs_find_data src_fd, dst_fd;
 	hfsplus_cat_entry entry;
 	int entry_size, type;
@@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid,
 
 	dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
 		dst_dir->i_ino, dst_name->name);
-	sb = src_dir->i_sb;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
 	dst_fd = src_fd;
 
 	/* find the old dir entry and read the data */
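hfsplus_cat_set_perms() now fills the on-disk dev slot per file type: link count for regular files, device number for block/char nodes, zero otherwise. A small sketch of just that dispatch:

	#include <stdio.h>
	#include <sys/stat.h>

	/* Stand-in for the perms->dev assignment added above. */
	static unsigned perms_dev(mode_t mode, unsigned nlink, unsigned rdev)
	{
		if (S_ISREG(mode))
			return nlink;
		else if (S_ISBLK(mode) || S_ISCHR(mode))
			return rdev;
		else
			return 0;
	}

	int main(void)
	{
		printf("%u\n", perms_dev(S_IFREG | 0644, 2, 0));   /* 2: nlink */
		printf("%u\n", perms_dev(S_IFCHR | 0600, 1, 259)); /* 259: rdev */
		printf("%u\n", perms_dev(S_IFDIR | 0755, 3, 0));   /* 0 */
		return 0;
	}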
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 764fd1bdca88..e318bbc0daf6 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
 
 	dentry->d_op = &hfsplus_dentry_operations;
 	dentry->d_fsdata = NULL;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
 again:
 	err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -68,9 +68,9 @@ again:
 		cnid = be32_to_cpu(entry.file.id);
 		if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
 		    entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
-		    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date ||
-		     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) &&
-		    HFSPLUS_SB(sb).hidden_dir) {
+		    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
+		     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
+		    HFSPLUS_SB(sb)->hidden_dir) {
 			struct qstr str;
 			char name[32];
 
@@ -86,7 +86,8 @@ again:
 			linkid = be32_to_cpu(entry.file.permissions.dev);
 			str.len = sprintf(name, "iNode%d", linkid);
 			str.name = name;
-			hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str);
+			hfsplus_cat_build_key(sb, fd.search_key,
+				HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
 			goto again;
 		}
 	} else if (!dentry->d_fsdata)
@@ -101,7 +102,7 @@ again:
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 	if (S_ISREG(inode->i_mode))
-		HFSPLUS_I(inode).dev = linkid;
+		HFSPLUS_I(inode)->linkid = linkid;
 out:
 	d_add(dentry, inode);
 	return NULL;
@@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (filp->f_pos >= inode->i_size)
 		return 0;
 
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
 	err = hfs_brec_find(&fd);
 	if (err)
@@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			err = -EIO;
 			goto out;
 		}
-		if (HFSPLUS_SB(sb).hidden_dir &&
-		    HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id))
+		if (HFSPLUS_SB(sb)->hidden_dir &&
+		    HFSPLUS_SB(sb)->hidden_dir->i_ino ==
+				be32_to_cpu(entry.folder.id))
 			goto next;
 		if (filldir(dirent, strbuf, len, filp->f_pos,
 			    be32_to_cpu(entry.folder.id), DT_DIR))
@@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		filp->private_data = rd;
 		rd->file = filp;
-		list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list);
+		list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
 	}
 	memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
 out:
@@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
 {
 	struct hfsplus_readdir_data *rd = file->private_data;
 	if (rd) {
+		mutex_lock(&inode->i_mutex);
 		list_del(&rd->list);
+		mutex_unlock(&inode->i_mutex);
 		kfree(rd);
 	}
 	return 0;
 }
 
-static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
-			  struct nameidata *nd)
-{
-	struct inode *inode;
-	int res;
-
-	inode = hfsplus_new_inode(dir->i_sb, mode);
-	if (!inode)
-		return -ENOSPC;
-
-	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-	if (res) {
-		inode->i_nlink = 0;
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		return res;
-	}
-	hfsplus_instantiate(dentry, inode, inode->i_ino);
-	mark_inode_dirty(inode);
-	return 0;
-}
-
 static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 			struct dentry *dst_dentry)
 {
-	struct super_block *sb = dst_dir->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
 	struct inode *inode = src_dentry->d_inode;
 	struct inode *src_dir = src_dentry->d_parent->d_inode;
 	struct qstr str;
@@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 
 	if (HFSPLUS_IS_RSRC(inode))
 		return -EPERM;
+	if (!S_ISREG(inode->i_mode))
+		return -EPERM;
 
+	mutex_lock(&sbi->vh_mutex);
 	if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
 		for (;;) {
 			get_random_bytes(&id, sizeof(cnid));
@@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 			str.len = sprintf(name, "iNode%d", id);
 			res = hfsplus_rename_cat(inode->i_ino,
 						 src_dir, &src_dentry->d_name,
-						 HFSPLUS_SB(sb).hidden_dir, &str);
+						 sbi->hidden_dir, &str);
 			if (!res)
 				break;
 			if (res != -EEXIST)
-				return res;
+				goto out;
 		}
-		HFSPLUS_I(inode).dev = id;
-		cnid = HFSPLUS_SB(sb).next_cnid++;
+		HFSPLUS_I(inode)->linkid = id;
+		cnid = sbi->next_cnid++;
 		src_dentry->d_fsdata = (void *)(unsigned long)cnid;
 		res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
 		if (res)
 			/* panic? */
-			return res;
-		HFSPLUS_SB(sb).file_count++;
+			goto out;
+		sbi->file_count++;
 	}
-	cnid = HFSPLUS_SB(sb).next_cnid++;
+	cnid = sbi->next_cnid++;
 	res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
 	if (res)
-		return res;
+		goto out;
 
 	inc_nlink(inode);
 	hfsplus_instantiate(dst_dentry, inode, cnid);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	HFSPLUS_SB(sb).file_count++;
-	sb->s_dirt = 1;
-
-	return 0;
+	sbi->file_count++;
+	dst_dir->i_sb->s_dirt = 1;
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
 }
 
 static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct super_block *sb = dir->i_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode = dentry->d_inode;
 	struct qstr str;
 	char name[32];
@@ -322,21 +308,22 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 	if (HFSPLUS_IS_RSRC(inode))
 		return -EPERM;
 
+	mutex_lock(&sbi->vh_mutex);
 	cnid = (u32)(unsigned long)dentry->d_fsdata;
 	if (inode->i_ino == cnid &&
-	    atomic_read(&HFSPLUS_I(inode).opencnt)) {
+	    atomic_read(&HFSPLUS_I(inode)->opencnt)) {
 		str.name = name;
 		str.len = sprintf(name, "temp%lu", inode->i_ino);
 		res = hfsplus_rename_cat(inode->i_ino,
 					 dir, &dentry->d_name,
-					 HFSPLUS_SB(sb).hidden_dir, &str);
+					 sbi->hidden_dir, &str);
 		if (!res)
 			inode->i_flags |= S_DEAD;
-		return res;
+		goto out;
 	}
 	res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
 	if (res)
-		return res;
+		goto out;
 
 	if (inode->i_nlink > 0)
 		drop_nlink(inode);
@@ -344,10 +331,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		clear_nlink(inode);
 	if (!inode->i_nlink) {
 		if (inode->i_ino != cnid) {
-			HFSPLUS_SB(sb).file_count--;
-			if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
+			sbi->file_count--;
+			if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
 				res = hfsplus_delete_cat(inode->i_ino,
-							 HFSPLUS_SB(sb).hidden_dir,
+							 sbi->hidden_dir,
 							 NULL);
 				if (!res)
 					hfsplus_delete_inode(inode);
@@ -356,107 +343,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		} else
 			hfsplus_delete_inode(inode);
 	} else
-		HFSPLUS_SB(sb).file_count--;
+		sbi->file_count--;
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-
+out:
+	mutex_unlock(&sbi->vh_mutex);
 	return res;
 }
 
-static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct inode *inode;
-	int res;
-
-	inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
-	if (!inode)
-		return -ENOSPC;
-
-	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-	if (res) {
-		inode->i_nlink = 0;
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		return res;
-	}
-	hfsplus_instantiate(dentry, inode, inode->i_ino);
-	mark_inode_dirty(inode);
-	return 0;
-}
-
 static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	struct inode *inode;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+	struct inode *inode = dentry->d_inode;
 	int res;
 
-	inode = dentry->d_inode;
 	if (inode->i_size != 2)
 		return -ENOTEMPTY;
+
+	mutex_lock(&sbi->vh_mutex);
 	res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
 	if (res)
-		return res;
+		goto out;
 	clear_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	hfsplus_delete_inode(inode);
 	mark_inode_dirty(inode);
-	return 0;
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
 }
 
 static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
 			   const char *symname)
 {
-	struct super_block *sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode;
-	int res;
+	int res = -ENOSPC;
 
-	sb = dir->i_sb;
-	inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO);
+	mutex_lock(&sbi->vh_mutex);
+	inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
 	if (!inode)
-		return -ENOSPC;
+		goto out;
 
 	res = page_symlink(inode, symname, strlen(symname) + 1);
-	if (res) {
-		inode->i_nlink = 0;
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		return res;
-	}
+	if (res)
+		goto out_err;
 
-	mark_inode_dirty(inode);
 	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
+	if (res)
+		goto out_err;
 
-	if (!res) {
-		hfsplus_instantiate(dentry, inode, inode->i_ino);
-		mark_inode_dirty(inode);
-	}
+	hfsplus_instantiate(dentry, inode, inode->i_ino);
+	mark_inode_dirty(inode);
+	goto out;
 
+out_err:
+	inode->i_nlink = 0;
+	hfsplus_delete_inode(inode);
+	iput(inode);
+out:
+	mutex_unlock(&sbi->vh_mutex);
 	return res;
 }
 
 static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
 			 int mode, dev_t rdev)
 {
-	struct super_block *sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode;
-	int res;
+	int res = -ENOSPC;
 
-	sb = dir->i_sb;
-	inode = hfsplus_new_inode(sb, mode);
+	mutex_lock(&sbi->vh_mutex);
+	inode = hfsplus_new_inode(dir->i_sb, mode);
 	if (!inode)
-		return -ENOSPC;
+		goto out;
+
+	if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
+		init_special_inode(inode, mode, rdev);
 
 	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
 	if (res) {
 		inode->i_nlink = 0;
 		hfsplus_delete_inode(inode);
		iput(inode);
-		return res;
+		goto out;
 	}
-	init_special_inode(inode, mode, rdev);
+
 	hfsplus_instantiate(dentry, inode, inode->i_ino);
 	mark_inode_dirty(inode);
+out:
+	mutex_unlock(&sbi->vh_mutex);
+	return res;
+}
 
-	return 0;
+static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
+			  struct nameidata *nd)
+{
+	return hfsplus_mknod(dir, dentry, mode, 0);
+}
+
+static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
 }
 
 static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -466,7 +454,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Unlink destination if it already exists */
 	if (new_dentry->d_inode) {
-		res = hfsplus_unlink(new_dir, new_dentry);
+		if (S_ISDIR(new_dentry->d_inode->i_mode))
+			res = hfsplus_rmdir(new_dir, new_dentry);
+		else
+			res = hfsplus_unlink(new_dir, new_dentry);
 		if (res)
 			return res;
 	}
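hfsplus_create() and hfsplus_mkdir() collapse into wrappers around hfsplus_mknod(), which calls init_special_inode() only for device, fifo and socket modes. A sketch of the wrapper structure (hypothetical names, not the kernel entry points):

	#include <stdio.h>
	#include <sys/stat.h>

	static int do_mknod(mode_t mode, unsigned rdev)
	{
		if (S_ISBLK(mode) || S_ISCHR(mode) ||
		    S_ISFIFO(mode) || S_ISSOCK(mode))
			printf("special inode, rdev=%u\n", rdev);
		else
			printf("ordinary inode, mode=%o\n", (unsigned)mode);
		return 0;
	}

	/* The two wrappers mirror the consolidation above. */
	static int do_create(mode_t mode) { return do_mknod(mode, 0); }
	static int do_mkdir(mode_t mode)  { return do_mknod(mode | S_IFDIR, 0); }

	int main(void)
	{
		do_create(S_IFREG | 0644);
		do_mkdir(0755);
		do_mknod(S_IFCHR | 0600, 259);
		return 0;
	}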
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0022eec63cda..0c9cb1820a52 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
 
 static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
 {
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res;
 
-	hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start,
-			      HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+	WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+	hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
+			      HFSPLUS_IS_RSRC(inode) ?
+				HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+
 	res = hfs_brec_find(fd);
-	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) {
+	if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
 		if (res != -ENOENT)
 			return;
-		hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec));
-		HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hfs_brec_insert(fd, hip->cached_extents,
+				sizeof(hfsplus_extent_rec));
+		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
 	} else {
 		if (res)
 			return;
-		hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength);
-		HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY;
+		hfs_bnode_write(fd->bnode, hip->cached_extents,
+				fd->entryoffset, fd->entrylength);
+		hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
 	}
 }
 
-void hfsplus_ext_write_extent(struct inode *inode)
+static void hfsplus_ext_write_extent_locked(struct inode *inode)
 {
-	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) {
+	if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
 		struct hfs_find_data fd;
 
-		hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+		hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
 		__hfsplus_ext_write_extent(inode, &fd);
 		hfs_find_exit(&fd);
 	}
 }
 
+void hfsplus_ext_write_extent(struct inode *inode)
+{
+	mutex_lock(&HFSPLUS_I(inode)->extents_lock);
+	hfsplus_ext_write_extent_locked(inode);
+	mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
+}
+
 static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
 					    struct hfsplus_extent *extent,
 					    u32 cnid, u32 block, u8 type)
@@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
 
 static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
 {
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res;
 
-	if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY)
+	WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+	if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
 		__hfsplus_ext_write_extent(inode, fd);
 
-	res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino,
-					block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+	res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
+					block, HFSPLUS_IS_RSRC(inode) ?
+						HFSPLUS_TYPE_RSRC :
+						HFSPLUS_TYPE_DATA);
 	if (!res) {
-		HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block);
-		HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents);
+		hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
+		hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
 	} else {
-		HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
-		HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hip->cached_start = hip->cached_blocks = 0;
+		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
 	}
 	return res;
 }
 
 static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
 {
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	struct hfs_find_data fd;
 	int res;
 
-	if (block >= HFSPLUS_I(inode).cached_start &&
-	    block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks)
+	if (block >= hip->cached_start &&
+	    block < hip->cached_start + hip->cached_blocks)
 		return 0;
 
-	hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+	hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
 	res = __hfsplus_ext_cache_extent(&fd, inode, block);
 	hfs_find_exit(&fd);
 	return res;
@@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
172int hfsplus_get_block(struct inode *inode, sector_t iblock, 192int hfsplus_get_block(struct inode *inode, sector_t iblock,
173 struct buffer_head *bh_result, int create) 193 struct buffer_head *bh_result, int create)
174{ 194{
175 struct super_block *sb; 195 struct super_block *sb = inode->i_sb;
196 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
197 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
176 int res = -EIO; 198 int res = -EIO;
177 u32 ablock, dblock, mask; 199 u32 ablock, dblock, mask;
178 int shift; 200 int shift;
179 201
180 sb = inode->i_sb;
181
182 /* Convert inode block to disk allocation block */ 202 /* Convert inode block to disk allocation block */
183 shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits; 203 shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
184 ablock = iblock >> HFSPLUS_SB(sb).fs_shift; 204 ablock = iblock >> sbi->fs_shift;
185 205
186 if (iblock >= HFSPLUS_I(inode).fs_blocks) { 206 if (iblock >= hip->fs_blocks) {
187 if (iblock > HFSPLUS_I(inode).fs_blocks || !create) 207 if (iblock > hip->fs_blocks || !create)
188 return -EIO; 208 return -EIO;
189 if (ablock >= HFSPLUS_I(inode).alloc_blocks) { 209 if (ablock >= hip->alloc_blocks) {
190 res = hfsplus_file_extend(inode); 210 res = hfsplus_file_extend(inode);
191 if (res) 211 if (res)
192 return res; 212 return res;
@@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
194 } else 214 } else
195 create = 0; 215 create = 0;
196 216
197 if (ablock < HFSPLUS_I(inode).first_blocks) { 217 if (ablock < hip->first_blocks) {
198 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock); 218 dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
199 goto done; 219 goto done;
200 } 220 }
201 221
202 if (inode->i_ino == HFSPLUS_EXT_CNID) 222 if (inode->i_ino == HFSPLUS_EXT_CNID)
203 return -EIO; 223 return -EIO;
204 224
205 mutex_lock(&HFSPLUS_I(inode).extents_lock); 225 mutex_lock(&hip->extents_lock);
206 res = hfsplus_ext_read_extent(inode, ablock); 226 res = hfsplus_ext_read_extent(inode, ablock);
207 if (!res) { 227 if (!res) {
208 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - 228 dblock = hfsplus_ext_find_block(hip->cached_extents,
209 HFSPLUS_I(inode).cached_start); 229 ablock - hip->cached_start);
210 } else { 230 } else {
211 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 231 mutex_unlock(&hip->extents_lock);
212 return -EIO; 232 return -EIO;
213 } 233 }
214 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 234 mutex_unlock(&hip->extents_lock);
215 235
216done: 236done:
217 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); 237 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
218 mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1; 238 mask = (1 << sbi->fs_shift) - 1;
219 map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask)); 239 map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
220 if (create) { 240 if (create) {
221 set_buffer_new(bh_result); 241 set_buffer_new(bh_result);
222 HFSPLUS_I(inode).phys_size += sb->s_blocksize; 242 hip->phys_size += sb->s_blocksize;
223 HFSPLUS_I(inode).fs_blocks++; 243 hip->fs_blocks++;
224 inode_add_bytes(inode, sb->s_blocksize); 244 inode_add_bytes(inode, sb->s_blocksize);
225 mark_inode_dirty(inode); 245 mark_inode_dirty(inode);
226 } 246 }
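Note: once fs_shift (log2 of the allocation-block size over the sb block size, as the surrounding code suggests) and blockoffset are known, the mapping above is pure shift-and-mask arithmetic. A standalone user-space sketch of the same computation, with all values assumed rather than read from disk:

	#include <stdint.h>

	/* Mirrors the map_bh() math in hfsplus_get_block(): start of allocation
	 * block dblock, scaled to sb-sized blocks, plus the volume offset, plus
	 * the position of iblock within its allocation block. Illustrative only. */
	static uint64_t hfsplus_map_sector(uint64_t iblock, uint32_t dblock,
					   int fs_shift, uint32_t blockoffset)
	{
		uint32_t mask = (1u << fs_shift) - 1;

		return ((uint64_t)dblock << fs_shift) + blockoffset + (iblock & mask);
	}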
@@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
327 if (total_blocks == blocks) 347 if (total_blocks == blocks)
328 return 0; 348 return 0;
329 349
330 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 350 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
331 do { 351 do {
332 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, 352 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
333 total_blocks, type); 353 total_blocks, type);
@@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
348int hfsplus_file_extend(struct inode *inode) 368int hfsplus_file_extend(struct inode *inode)
349{ 369{
350 struct super_block *sb = inode->i_sb; 370 struct super_block *sb = inode->i_sb;
371 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
372 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
351 u32 start, len, goal; 373 u32 start, len, goal;
352 int res; 374 int res;
353 375
354 if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) { 376 if (sbi->alloc_file->i_size * 8 <
377 sbi->total_blocks - sbi->free_blocks + 8) {
355 // extend alloc file 378 // extend alloc file
356 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8, 379 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
357 HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks); 380 sbi->alloc_file->i_size * 8,
381 sbi->total_blocks, sbi->free_blocks);
358 return -ENOSPC; 382 return -ENOSPC;
359 } 383 }
360 384
361 mutex_lock(&HFSPLUS_I(inode).extents_lock); 385 mutex_lock(&hip->extents_lock);
362 if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) 386 if (hip->alloc_blocks == hip->first_blocks)
363 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); 387 goal = hfsplus_ext_lastblock(hip->first_extents);
364 else { 388 else {
365 res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks); 389 res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
366 if (res) 390 if (res)
367 goto out; 391 goto out;
368 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents); 392 goal = hfsplus_ext_lastblock(hip->cached_extents);
369 } 393 }
370 394
371 len = HFSPLUS_I(inode).clump_blocks; 395 len = hip->clump_blocks;
372 start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len); 396 start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
373 if (start >= HFSPLUS_SB(sb).total_blocks) { 397 if (start >= sbi->total_blocks) {
374 start = hfsplus_block_allocate(sb, goal, 0, &len); 398 start = hfsplus_block_allocate(sb, goal, 0, &len);
375 if (start >= goal) { 399 if (start >= goal) {
376 res = -ENOSPC; 400 res = -ENOSPC;
@@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode)
379 } 403 }
380 404
381 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); 405 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
382 if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) { 406
383 if (!HFSPLUS_I(inode).first_blocks) { 407 if (hip->alloc_blocks <= hip->first_blocks) {
408 if (!hip->first_blocks) {
384 dprint(DBG_EXTENT, "first extents\n"); 409 dprint(DBG_EXTENT, "first extents\n");
385 /* no extents yet */ 410 /* no extents yet */
386 HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start); 411 hip->first_extents[0].start_block = cpu_to_be32(start);
387 HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len); 412 hip->first_extents[0].block_count = cpu_to_be32(len);
388 res = 0; 413 res = 0;
389 } else { 414 } else {
390 /* try to append to extents in inode */ 415 /* try to append to extents in inode */
391 res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents, 416 res = hfsplus_add_extent(hip->first_extents,
392 HFSPLUS_I(inode).alloc_blocks, 417 hip->alloc_blocks,
393 start, len); 418 start, len);
394 if (res == -ENOSPC) 419 if (res == -ENOSPC)
395 goto insert_extent; 420 goto insert_extent;
396 } 421 }
397 if (!res) { 422 if (!res) {
398 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 423 hfsplus_dump_extent(hip->first_extents);
399 HFSPLUS_I(inode).first_blocks += len; 424 hip->first_blocks += len;
400 } 425 }
401 } else { 426 } else {
402 res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents, 427 res = hfsplus_add_extent(hip->cached_extents,
403 HFSPLUS_I(inode).alloc_blocks - 428 hip->alloc_blocks - hip->cached_start,
404 HFSPLUS_I(inode).cached_start,
405 start, len); 429 start, len);
406 if (!res) { 430 if (!res) {
407 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 431 hfsplus_dump_extent(hip->cached_extents);
408 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 432 hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
409 HFSPLUS_I(inode).cached_blocks += len; 433 hip->cached_blocks += len;
410 } else if (res == -ENOSPC) 434 } else if (res == -ENOSPC)
411 goto insert_extent; 435 goto insert_extent;
412 } 436 }
413out: 437out:
414 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 438 mutex_unlock(&hip->extents_lock);
415 if (!res) { 439 if (!res) {
416 HFSPLUS_I(inode).alloc_blocks += len; 440 hip->alloc_blocks += len;
417 mark_inode_dirty(inode); 441 mark_inode_dirty(inode);
418 } 442 }
419 return res; 443 return res;
420 444
421insert_extent: 445insert_extent:
422 dprint(DBG_EXTENT, "insert new extent\n"); 446 dprint(DBG_EXTENT, "insert new extent\n");
423 hfsplus_ext_write_extent(inode); 447 hfsplus_ext_write_extent_locked(inode);
424 448
425 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 449 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
426 HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start); 450 hip->cached_extents[0].start_block = cpu_to_be32(start);
427 HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len); 451 hip->cached_extents[0].block_count = cpu_to_be32(len);
428 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 452 hfsplus_dump_extent(hip->cached_extents);
429 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; 453 hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
430 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks; 454 hip->cached_start = hip->alloc_blocks;
431 HFSPLUS_I(inode).cached_blocks = len; 455 hip->cached_blocks = len;
432 456
433 res = 0; 457 res = 0;
434 goto out; 458 goto out;
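Note: hfsplus_add_extent() is not shown in this diff, but its call sites here suggest the shape: grow the last extent in the record when the new allocation is contiguous, otherwise start the next of the record's eight slots, and return -ENOSPC when the record is full, which is what routes hfsplus_file_extend() to insert_extent above. A rough user-space sketch of that decision, using a CPU-order stand-in for the big-endian on-disk record:

	#include <stdint.h>
	#include <errno.h>

	struct ext { uint32_t start, count; };	/* CPU-order model of one slot */

	/* Append [start, start+len) at logical block offset 'off'. Sketch only. */
	static int add_extent(struct ext rec[8], uint32_t off,
			      uint32_t start, uint32_t len)
	{
		int i;

		for (i = 0; i < 8; i++) {
			uint32_t count = rec[i].count;

			if (off == count) {		/* 'off' ends at slot i */
				if (start == rec[i].start + count) {
					rec[i].count += len;	/* contiguous */
				} else {
					if (i + 1 >= 8)
						return -ENOSPC;	/* record full */
					rec[i + 1].start = start;
					rec[i + 1].count = len;
				}
				return 0;
			} else if (off < count) {
				break;
			}
			off -= count;
		}
		return -EIO;	/* offset not on an extent boundary */
	}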
@@ -437,13 +461,15 @@ insert_extent:
437void hfsplus_file_truncate(struct inode *inode) 461void hfsplus_file_truncate(struct inode *inode)
438{ 462{
439 struct super_block *sb = inode->i_sb; 463 struct super_block *sb = inode->i_sb;
464 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
440 struct hfs_find_data fd; 465 struct hfs_find_data fd;
441 u32 alloc_cnt, blk_cnt, start; 466 u32 alloc_cnt, blk_cnt, start;
442 int res; 467 int res;
443 468
444 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, 469 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
445 (long long)HFSPLUS_I(inode).phys_size, inode->i_size); 470 inode->i_ino, (long long)hip->phys_size, inode->i_size);
446 if (inode->i_size > HFSPLUS_I(inode).phys_size) { 471
472 if (inode->i_size > hip->phys_size) {
447 struct address_space *mapping = inode->i_mapping; 473 struct address_space *mapping = inode->i_mapping;
448 struct page *page; 474 struct page *page;
449 void *fsdata; 475 void *fsdata;
@@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode)
460 return; 486 return;
461 mark_inode_dirty(inode); 487 mark_inode_dirty(inode);
462 return; 488 return;
463 } else if (inode->i_size == HFSPLUS_I(inode).phys_size) 489 } else if (inode->i_size == hip->phys_size)
464 return; 490 return;
465 491
466 blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift; 492 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
467 alloc_cnt = HFSPLUS_I(inode).alloc_blocks; 493 HFSPLUS_SB(sb)->alloc_blksz_shift;
494 alloc_cnt = hip->alloc_blocks;
468 if (blk_cnt == alloc_cnt) 495 if (blk_cnt == alloc_cnt)
469 goto out; 496 goto out;
470 497
471 mutex_lock(&HFSPLUS_I(inode).extents_lock); 498 mutex_lock(&hip->extents_lock);
472 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 499 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
473 while (1) { 500 while (1) {
474 if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { 501 if (alloc_cnt == hip->first_blocks) {
475 hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents, 502 hfsplus_free_extents(sb, hip->first_extents,
476 alloc_cnt, alloc_cnt - blk_cnt); 503 alloc_cnt, alloc_cnt - blk_cnt);
477 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 504 hfsplus_dump_extent(hip->first_extents);
478 HFSPLUS_I(inode).first_blocks = blk_cnt; 505 hip->first_blocks = blk_cnt;
479 break; 506 break;
480 } 507 }
481 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); 508 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
482 if (res) 509 if (res)
483 break; 510 break;
484 start = HFSPLUS_I(inode).cached_start; 511 start = hip->cached_start;
485 hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents, 512 hfsplus_free_extents(sb, hip->cached_extents,
486 alloc_cnt - start, alloc_cnt - blk_cnt); 513 alloc_cnt - start, alloc_cnt - blk_cnt);
487 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 514 hfsplus_dump_extent(hip->cached_extents);
488 if (blk_cnt > start) { 515 if (blk_cnt > start) {
489 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 516 hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
490 break; 517 break;
491 } 518 }
492 alloc_cnt = start; 519 alloc_cnt = start;
493 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; 520 hip->cached_start = hip->cached_blocks = 0;
494 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 521 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
495 hfs_brec_remove(&fd); 522 hfs_brec_remove(&fd);
496 } 523 }
497 hfs_find_exit(&fd); 524 hfs_find_exit(&fd);
498 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 525 mutex_unlock(&hip->extents_lock);
499 526
500 HFSPLUS_I(inode).alloc_blocks = blk_cnt; 527 hip->alloc_blocks = blk_cnt;
501out: 528out:
502 HFSPLUS_I(inode).phys_size = inode->i_size; 529 hip->phys_size = inode->i_size;
503 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 530 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
504 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 531 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
505 mark_inode_dirty(inode); 532 mark_inode_dirty(inode);
506} 533}
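Note: blk_cnt in the truncate path is i_size rounded up to whole allocation blocks, after which the loop walks extent records backwards until it reaches the record containing the new end. The rounding in isolation (user-space sketch, power-of-two block size assumed):

	#include <stdint.h>

	/* As in hfsplus_file_truncate(): bytes to allocation blocks, rounded up.
	 * alloc_blksz must be a power of two and alloc_blksz_shift its log2. */
	static uint32_t size_to_alloc_blocks(uint64_t size, uint32_t alloc_blksz,
					     int alloc_blksz_shift)
	{
		return (uint32_t)((size + alloc_blksz - 1) >> alloc_blksz_shift);
	}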
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index dc856be3c2b0..cb3653efb57a 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -62,7 +62,7 @@ struct hfs_btree {
62 unsigned int depth; 62 unsigned int depth;
63 63
64 //unsigned int map1_size, map_size; 64 //unsigned int map1_size, map_size;
65 struct semaphore tree_lock; 65 struct mutex tree_lock;
66 66
67 unsigned int pages_per_bnode; 67 unsigned int pages_per_bnode;
68 spinlock_t hash_lock; 68 spinlock_t hash_lock;
@@ -121,16 +121,21 @@ struct hfsplus_sb_info {
121 u32 sect_count; 121 u32 sect_count;
122 int fs_shift; 122 int fs_shift;
123 123
124 /* Stuff in host order from Vol Header */ 124 /* immutable data from the volume header */
125 u32 alloc_blksz; 125 u32 alloc_blksz;
126 int alloc_blksz_shift; 126 int alloc_blksz_shift;
127 u32 total_blocks; 127 u32 total_blocks;
128 u32 data_clump_blocks, rsrc_clump_blocks;
129
130 /* mutable data from the volume header, protected by alloc_mutex */
128 u32 free_blocks; 131 u32 free_blocks;
129 u32 next_alloc; 132 struct mutex alloc_mutex;
133
134 /* mutable data from the volume header, protected by vh_mutex */
130 u32 next_cnid; 135 u32 next_cnid;
131 u32 file_count; 136 u32 file_count;
132 u32 folder_count; 137 u32 folder_count;
133 u32 data_clump_blocks, rsrc_clump_blocks; 138 struct mutex vh_mutex;
134 139
135 /* Config options */ 140 /* Config options */
136 u32 creator; 141 u32 creator;
@@ -143,40 +148,50 @@ struct hfsplus_sb_info {
143 int part, session; 148 int part, session;
144 149
145 unsigned long flags; 150 unsigned long flags;
146
147 struct hlist_head rsrc_inodes;
148}; 151};
149 152
150#define HFSPLUS_SB_WRITEBACKUP 0x0001 153#define HFSPLUS_SB_WRITEBACKUP 0
151#define HFSPLUS_SB_NODECOMPOSE 0x0002 154#define HFSPLUS_SB_NODECOMPOSE 1
152#define HFSPLUS_SB_FORCE 0x0004 155#define HFSPLUS_SB_FORCE 2
153#define HFSPLUS_SB_HFSX 0x0008 156#define HFSPLUS_SB_HFSX 3
154#define HFSPLUS_SB_CASEFOLD 0x0010 157#define HFSPLUS_SB_CASEFOLD 4
155 158
156 159
157struct hfsplus_inode_info { 160struct hfsplus_inode_info {
158 struct mutex extents_lock;
159 u32 clump_blocks, alloc_blocks;
160 sector_t fs_blocks;
161 /* Allocation extents from catalog record or volume header */
162 hfsplus_extent_rec first_extents;
163 u32 first_blocks;
164 hfsplus_extent_rec cached_extents;
165 u32 cached_start, cached_blocks;
166 atomic_t opencnt; 161 atomic_t opencnt;
167 162
168 struct inode *rsrc_inode; 163 /*
164 * Extent allocation information, protected by extents_lock.
165 */
166 u32 first_blocks;
167 u32 clump_blocks;
168 u32 alloc_blocks;
169 u32 cached_start;
170 u32 cached_blocks;
171 hfsplus_extent_rec first_extents;
172 hfsplus_extent_rec cached_extents;
169 unsigned long flags; 173 unsigned long flags;
174 struct mutex extents_lock;
170 175
176 /*
177 * Immutable data.
178 */
179 struct inode *rsrc_inode;
171 __be32 create_date; 180 __be32 create_date;
172 /* Device number in hfsplus_permissions in catalog */
173 u32 dev;
174 /* BSD system and user file flags */
175 u8 rootflags;
176 u8 userflags;
177 181
182 /*
183 * Protected by sbi->vh_mutex.
184 */
185 u32 linkid;
186
187 /*
188 * Protected by i_mutex.
189 */
190 sector_t fs_blocks;
191 u8 userflags; /* BSD user file flags */
178 struct list_head open_dir_list; 192 struct list_head open_dir_list;
179 loff_t phys_size; 193 loff_t phys_size;
194
180 struct inode vfs_inode; 195 struct inode vfs_inode;
181}; 196};
182 197
@@ -184,8 +199,8 @@ struct hfsplus_inode_info {
184#define HFSPLUS_FLG_EXT_DIRTY 0x0002 199#define HFSPLUS_FLG_EXT_DIRTY 0x0002
185#define HFSPLUS_FLG_EXT_NEW 0x0004 200#define HFSPLUS_FLG_EXT_NEW 0x0004
186 201
187#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)) 202#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
188#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC) 203#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
189 204
190struct hfs_find_data { 205struct hfs_find_data {
191 /* filled by caller */ 206 /* filled by caller */
@@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
311int hfsplus_delete_cat(u32, struct inode *, struct qstr *); 326int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
312int hfsplus_rename_cat(u32, struct inode *, struct qstr *, 327int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
313 struct inode *, struct qstr *); 328 struct inode *, struct qstr *);
329void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
314 330
315/* dir.c */ 331/* dir.c */
316extern const struct inode_operations hfsplus_dir_inode_operations; 332extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -372,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *);
372int hfs_part_find(struct super_block *, sector_t *, sector_t *); 388int hfs_part_find(struct super_block *, sector_t *, sector_t *);
373 389
374/* access macros */ 390/* access macros */
375/*
376static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) 391static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
377{ 392{
378 return sb->s_fs_info; 393 return sb->s_fs_info;
379} 394}
395
380static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) 396static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
381{ 397{
382 return list_entry(inode, struct hfsplus_inode_info, vfs_inode); 398 return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
383} 399}
384*/
385#define HFSPLUS_SB(super) (*(struct hfsplus_sb_info *)(super)->s_fs_info)
386#define HFSPLUS_I(inode) (*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
387
388#if 1
389#define hfsplus_kmap(p) ({ struct page *__p = (p); kmap(__p); })
390#define hfsplus_kunmap(p) ({ struct page *__p = (p); kunmap(__p); __p; })
391#else
392#define hfsplus_kmap(p) kmap(p)
393#define hfsplus_kunmap(p) kunmap(p)
394#endif
395 400
396#define sb_bread512(sb, sec, data) ({ \ 401#define sb_bread512(sb, sec, data) ({ \
397 struct buffer_head *__bh; \ 402 struct buffer_head *__bh; \
@@ -419,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
419#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) 424#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
420#define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) 425#define hfsp_now2mt() __hfsp_ut2mt(get_seconds())
421 426
422#define kdev_t_to_nr(x) (x)
423
424#endif 427#endif
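Note: the old macros dereferenced s_fs_info in place, which is why every user had "." field access; un-commenting the inline helpers makes HFSPLUS_SB()/HFSPLUS_I() return pointers, hence the global "." to "->" conversion throughout this patch. A user-space model of the HFSPLUS_I() accessor (illustrative names; the kernel spells the same pointer arithmetic via list_entry()/container_of()):

	#include <stddef.h>

	struct inode_model { int i_dummy; };

	struct hfsplus_inode_info_model {
		unsigned long flags;
		struct inode_model vfs_inode;	/* VFS inode embedded last */
	};

	/* Back up from the embedded member to its containing structure. */
	static struct hfsplus_inode_info_model *HFSPLUS_I_model(struct inode_model *inode)
	{
		return (struct hfsplus_inode_info_model *)((char *)inode -
			offsetof(struct hfsplus_inode_info_model, vfs_inode));
	}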
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index fe99fe8db61a..6892899fd6fb 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -200,6 +200,7 @@ struct hfsplus_cat_key {
200 struct hfsplus_unistr name; 200 struct hfsplus_unistr name;
201} __packed; 201} __packed;
202 202
203#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key))
203 204
204/* Structs from hfs.h */ 205/* Structs from hfs.h */
205struct hfsp_point { 206struct hfsp_point {
@@ -323,7 +324,7 @@ struct hfsplus_ext_key {
323 __be32 start_block; 324 __be32 start_block;
324} __packed; 325} __packed;
325 326
326#define HFSPLUS_EXT_KEYLEN 12 327#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key)
327 328
328/* HFS+ generic BTree key */ 329/* HFS+ generic BTree key */
329typedef union { 330typedef union {
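Note: deriving the key lengths from sizeof() ties the constants to the packed on-disk layout instead of a magic number; for the extent key that is 2 + 1 + 1 + 4 + 4 = 12 bytes. A compile-time check of the same assumption (C11 static_assert over a model struct, names illustrative):

	#include <stdint.h>
	#include <assert.h>

	struct hfsplus_ext_key_model {
		uint16_t key_len;
		uint8_t  fork_type;
		uint8_t  pad;
		uint32_t cnid;
		uint32_t start_block;
	} __attribute__((packed));	/* GCC/Clang spelling of __packed */

	static_assert(sizeof(struct hfsplus_ext_key_model) == 12,
		      "on-disk extent key must stay 12 bytes");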
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c5a979d62c65..8afd7e84f98d 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -36,7 +36,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
36 *pagep = NULL; 36 *pagep = NULL;
37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
38 hfsplus_get_block, 38 hfsplus_get_block,
39 &HFSPLUS_I(mapping->host).phys_size); 39 &HFSPLUS_I(mapping->host)->phys_size);
40 if (unlikely(ret)) { 40 if (unlikely(ret)) {
41 loff_t isize = mapping->host->i_size; 41 loff_t isize = mapping->host->i_size;
42 if (pos + len > isize) 42 if (pos + len > isize)
@@ -62,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
62 62
63 switch (inode->i_ino) { 63 switch (inode->i_ino) {
64 case HFSPLUS_EXT_CNID: 64 case HFSPLUS_EXT_CNID:
65 tree = HFSPLUS_SB(sb).ext_tree; 65 tree = HFSPLUS_SB(sb)->ext_tree;
66 break; 66 break;
67 case HFSPLUS_CAT_CNID: 67 case HFSPLUS_CAT_CNID:
68 tree = HFSPLUS_SB(sb).cat_tree; 68 tree = HFSPLUS_SB(sb)->cat_tree;
69 break; 69 break;
70 case HFSPLUS_ATTR_CNID: 70 case HFSPLUS_ATTR_CNID:
71 tree = HFSPLUS_SB(sb).attr_tree; 71 tree = HFSPLUS_SB(sb)->attr_tree;
72 break; 72 break;
73 default: 73 default:
74 BUG(); 74 BUG();
@@ -172,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
172 struct hfs_find_data fd; 172 struct hfs_find_data fd;
173 struct super_block *sb = dir->i_sb; 173 struct super_block *sb = dir->i_sb;
174 struct inode *inode = NULL; 174 struct inode *inode = NULL;
175 struct hfsplus_inode_info *hip;
175 int err; 176 int err;
176 177
177 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) 178 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
178 goto out; 179 goto out;
179 180
180 inode = HFSPLUS_I(dir).rsrc_inode; 181 inode = HFSPLUS_I(dir)->rsrc_inode;
181 if (inode) 182 if (inode)
182 goto out; 183 goto out;
183 184
@@ -185,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
185 if (!inode) 186 if (!inode)
186 return ERR_PTR(-ENOMEM); 187 return ERR_PTR(-ENOMEM);
187 188
189 hip = HFSPLUS_I(inode);
188 inode->i_ino = dir->i_ino; 190 inode->i_ino = dir->i_ino;
189 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 191 INIT_LIST_HEAD(&hip->open_dir_list);
190 mutex_init(&HFSPLUS_I(inode).extents_lock); 192 mutex_init(&hip->extents_lock);
191 HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; 193 hip->flags = HFSPLUS_FLG_RSRC;
192 194
193 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 195 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
194 err = hfsplus_find_cat(sb, dir->i_ino, &fd); 196 err = hfsplus_find_cat(sb, dir->i_ino, &fd);
195 if (!err) 197 if (!err)
196 err = hfsplus_cat_read_inode(inode, &fd); 198 err = hfsplus_cat_read_inode(inode, &fd);
@@ -199,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
199 iput(inode); 201 iput(inode);
200 return ERR_PTR(err); 202 return ERR_PTR(err);
201 } 203 }
202 HFSPLUS_I(inode).rsrc_inode = dir; 204 hip->rsrc_inode = dir;
203 HFSPLUS_I(dir).rsrc_inode = inode; 205 HFSPLUS_I(dir)->rsrc_inode = inode;
204 igrab(dir); 206 igrab(dir);
205 hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes); 207
208 /*
209 * __mark_inode_dirty expects inodes to be hashed. Since we don't
210 * want resource fork inodes in the regular inode space, we make them
211 * appear hashed, but do not put on any lists. hlist_del()
212 * will work fine and require no locking.
213 */
214 hlist_add_fake(&inode->i_hash);
215
206 mark_inode_dirty(inode); 216 mark_inode_dirty(inode);
207out: 217out:
208 d_add(dentry, inode); 218 d_add(dentry, inode);
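Note: hlist_add_fake() is the whole trick behind the comment above: it makes the inode look hashed (hlist_unhashed() returns false) without putting it on any bucket chain, so a later hlist_del() degenerates into harmless self-updates. Its implementation in <linux/list.h> is a single assignment:

	static inline void hlist_add_fake(struct hlist_node *n)
	{
		n->pprev = &n->next;
	}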
@@ -211,30 +221,27 @@ out:
211 221
212static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) 222static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
213{ 223{
214 struct super_block *sb = inode->i_sb; 224 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
215 u16 mode; 225 u16 mode;
216 226
217 mode = be16_to_cpu(perms->mode); 227 mode = be16_to_cpu(perms->mode);
218 228
219 inode->i_uid = be32_to_cpu(perms->owner); 229 inode->i_uid = be32_to_cpu(perms->owner);
220 if (!inode->i_uid && !mode) 230 if (!inode->i_uid && !mode)
221 inode->i_uid = HFSPLUS_SB(sb).uid; 231 inode->i_uid = sbi->uid;
222 232
223 inode->i_gid = be32_to_cpu(perms->group); 233 inode->i_gid = be32_to_cpu(perms->group);
224 if (!inode->i_gid && !mode) 234 if (!inode->i_gid && !mode)
225 inode->i_gid = HFSPLUS_SB(sb).gid; 235 inode->i_gid = sbi->gid;
226 236
227 if (dir) { 237 if (dir) {
228 mode = mode ? (mode & S_IALLUGO) : 238 mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
229 (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
230 mode |= S_IFDIR; 239 mode |= S_IFDIR;
231 } else if (!mode) 240 } else if (!mode)
232 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & 241 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
233 ~(HFSPLUS_SB(sb).umask));
234 inode->i_mode = mode; 242 inode->i_mode = mode;
235 243
236 HFSPLUS_I(inode).rootflags = perms->rootflags; 244 HFSPLUS_I(inode)->userflags = perms->userflags;
237 HFSPLUS_I(inode).userflags = perms->userflags;
238 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) 245 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
239 inode->i_flags |= S_IMMUTABLE; 246 inode->i_flags |= S_IMMUTABLE;
240 else 247 else
@@ -245,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
245 inode->i_flags &= ~S_APPEND; 252 inode->i_flags &= ~S_APPEND;
246} 253}
247 254
248static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
249{
250 if (inode->i_flags & S_IMMUTABLE)
251 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
252 else
253 perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
254 if (inode->i_flags & S_APPEND)
255 perms->rootflags |= HFSPLUS_FLG_APPEND;
256 else
257 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
258 perms->userflags = HFSPLUS_I(inode).userflags;
259 perms->mode = cpu_to_be16(inode->i_mode);
260 perms->owner = cpu_to_be32(inode->i_uid);
261 perms->group = cpu_to_be32(inode->i_gid);
262 perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
263}
264
265static int hfsplus_file_open(struct inode *inode, struct file *file) 255static int hfsplus_file_open(struct inode *inode, struct file *file)
266{ 256{
267 if (HFSPLUS_IS_RSRC(inode)) 257 if (HFSPLUS_IS_RSRC(inode))
268 inode = HFSPLUS_I(inode).rsrc_inode; 258 inode = HFSPLUS_I(inode)->rsrc_inode;
269 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 259 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
270 return -EOVERFLOW; 260 return -EOVERFLOW;
271 atomic_inc(&HFSPLUS_I(inode).opencnt); 261 atomic_inc(&HFSPLUS_I(inode)->opencnt);
272 return 0; 262 return 0;
273} 263}
274 264
@@ -277,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
277 struct super_block *sb = inode->i_sb; 267 struct super_block *sb = inode->i_sb;
278 268
279 if (HFSPLUS_IS_RSRC(inode)) 269 if (HFSPLUS_IS_RSRC(inode))
280 inode = HFSPLUS_I(inode).rsrc_inode; 270 inode = HFSPLUS_I(inode)->rsrc_inode;
281 if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { 271 if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
282 mutex_lock(&inode->i_mutex); 272 mutex_lock(&inode->i_mutex);
283 hfsplus_file_truncate(inode); 273 hfsplus_file_truncate(inode);
284 if (inode->i_flags & S_DEAD) { 274 if (inode->i_flags & S_DEAD) {
285 hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); 275 hfsplus_delete_cat(inode->i_ino,
276 HFSPLUS_SB(sb)->hidden_dir, NULL);
286 hfsplus_delete_inode(inode); 277 hfsplus_delete_inode(inode);
287 } 278 }
288 mutex_unlock(&inode->i_mutex); 279 mutex_unlock(&inode->i_mutex);
@@ -361,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = {
361 352
362struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 353struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
363{ 354{
355 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
364 struct inode *inode = new_inode(sb); 356 struct inode *inode = new_inode(sb);
357 struct hfsplus_inode_info *hip;
358
365 if (!inode) 359 if (!inode)
366 return NULL; 360 return NULL;
367 361
368 inode->i_ino = HFSPLUS_SB(sb).next_cnid++; 362 inode->i_ino = sbi->next_cnid++;
369 inode->i_mode = mode; 363 inode->i_mode = mode;
370 inode->i_uid = current_fsuid(); 364 inode->i_uid = current_fsuid();
371 inode->i_gid = current_fsgid(); 365 inode->i_gid = current_fsgid();
372 inode->i_nlink = 1; 366 inode->i_nlink = 1;
373 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 367 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
374 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 368
375 mutex_init(&HFSPLUS_I(inode).extents_lock); 369 hip = HFSPLUS_I(inode);
376 atomic_set(&HFSPLUS_I(inode).opencnt, 0); 370 INIT_LIST_HEAD(&hip->open_dir_list);
377 HFSPLUS_I(inode).flags = 0; 371 mutex_init(&hip->extents_lock);
378 memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); 372 atomic_set(&hip->opencnt, 0);
379 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 373 hip->flags = 0;
380 HFSPLUS_I(inode).alloc_blocks = 0; 374 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
381 HFSPLUS_I(inode).first_blocks = 0; 375 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
382 HFSPLUS_I(inode).cached_start = 0; 376 hip->alloc_blocks = 0;
383 HFSPLUS_I(inode).cached_blocks = 0; 377 hip->first_blocks = 0;
384 HFSPLUS_I(inode).phys_size = 0; 378 hip->cached_start = 0;
385 HFSPLUS_I(inode).fs_blocks = 0; 379 hip->cached_blocks = 0;
386 HFSPLUS_I(inode).rsrc_inode = NULL; 380 hip->phys_size = 0;
381 hip->fs_blocks = 0;
382 hip->rsrc_inode = NULL;
387 if (S_ISDIR(inode->i_mode)) { 383 if (S_ISDIR(inode->i_mode)) {
388 inode->i_size = 2; 384 inode->i_size = 2;
389 HFSPLUS_SB(sb).folder_count++; 385 sbi->folder_count++;
390 inode->i_op = &hfsplus_dir_inode_operations; 386 inode->i_op = &hfsplus_dir_inode_operations;
391 inode->i_fop = &hfsplus_dir_operations; 387 inode->i_fop = &hfsplus_dir_operations;
392 } else if (S_ISREG(inode->i_mode)) { 388 } else if (S_ISREG(inode->i_mode)) {
393 HFSPLUS_SB(sb).file_count++; 389 sbi->file_count++;
394 inode->i_op = &hfsplus_file_inode_operations; 390 inode->i_op = &hfsplus_file_inode_operations;
395 inode->i_fop = &hfsplus_file_operations; 391 inode->i_fop = &hfsplus_file_operations;
396 inode->i_mapping->a_ops = &hfsplus_aops; 392 inode->i_mapping->a_ops = &hfsplus_aops;
397 HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks; 393 hip->clump_blocks = sbi->data_clump_blocks;
398 } else if (S_ISLNK(inode->i_mode)) { 394 } else if (S_ISLNK(inode->i_mode)) {
399 HFSPLUS_SB(sb).file_count++; 395 sbi->file_count++;
400 inode->i_op = &page_symlink_inode_operations; 396 inode->i_op = &page_symlink_inode_operations;
401 inode->i_mapping->a_ops = &hfsplus_aops; 397 inode->i_mapping->a_ops = &hfsplus_aops;
402 HFSPLUS_I(inode).clump_blocks = 1; 398 hip->clump_blocks = 1;
403 } else 399 } else
404 HFSPLUS_SB(sb).file_count++; 400 sbi->file_count++;
405 insert_inode_hash(inode); 401 insert_inode_hash(inode);
406 mark_inode_dirty(inode); 402 mark_inode_dirty(inode);
407 sb->s_dirt = 1; 403 sb->s_dirt = 1;
@@ -414,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode)
414 struct super_block *sb = inode->i_sb; 410 struct super_block *sb = inode->i_sb;
415 411
416 if (S_ISDIR(inode->i_mode)) { 412 if (S_ISDIR(inode->i_mode)) {
417 HFSPLUS_SB(sb).folder_count--; 413 HFSPLUS_SB(sb)->folder_count--;
418 sb->s_dirt = 1; 414 sb->s_dirt = 1;
419 return; 415 return;
420 } 416 }
421 HFSPLUS_SB(sb).file_count--; 417 HFSPLUS_SB(sb)->file_count--;
422 if (S_ISREG(inode->i_mode)) { 418 if (S_ISREG(inode->i_mode)) {
423 if (!inode->i_nlink) { 419 if (!inode->i_nlink) {
424 inode->i_size = 0; 420 inode->i_size = 0;
@@ -434,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode)
434void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 430void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
435{ 431{
436 struct super_block *sb = inode->i_sb; 432 struct super_block *sb = inode->i_sb;
433 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
434 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
437 u32 count; 435 u32 count;
438 int i; 436 int i;
439 437
440 memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents, 438 memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
441 sizeof(hfsplus_extent_rec));
442 for (count = 0, i = 0; i < 8; i++) 439 for (count = 0, i = 0; i < 8; i++)
443 count += be32_to_cpu(fork->extents[i].block_count); 440 count += be32_to_cpu(fork->extents[i].block_count);
444 HFSPLUS_I(inode).first_blocks = count; 441 hip->first_blocks = count;
445 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 442 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
446 HFSPLUS_I(inode).cached_start = 0; 443 hip->cached_start = 0;
447 HFSPLUS_I(inode).cached_blocks = 0; 444 hip->cached_blocks = 0;
448 445
449 HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks); 446 hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
450 inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size); 447 hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
451 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 448 hip->fs_blocks =
452 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 449 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
453 HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift; 450 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
454 if (!HFSPLUS_I(inode).clump_blocks) 451 hip->clump_blocks =
455 HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks : 452 be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
456 HFSPLUS_SB(sb).data_clump_blocks; 453 if (!hip->clump_blocks) {
454 hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
455 sbi->rsrc_clump_blocks :
456 sbi->data_clump_blocks;
457 }
457} 458}
458 459
459void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 460void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
460{ 461{
461 memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents, 462 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
462 sizeof(hfsplus_extent_rec)); 463 sizeof(hfsplus_extent_rec));
463 fork->total_size = cpu_to_be64(inode->i_size); 464 fork->total_size = cpu_to_be64(inode->i_size);
464 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks); 465 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
465} 466}
466 467
467int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) 468int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
@@ -472,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
472 473
473 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); 474 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
474 475
475 HFSPLUS_I(inode).dev = 0; 476 HFSPLUS_I(inode)->linkid = 0;
476 if (type == HFSPLUS_FOLDER) { 477 if (type == HFSPLUS_FOLDER) {
477 struct hfsplus_cat_folder *folder = &entry.folder; 478 struct hfsplus_cat_folder *folder = &entry.folder;
478 479
@@ -486,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
486 inode->i_atime = hfsp_mt2ut(folder->access_date); 487 inode->i_atime = hfsp_mt2ut(folder->access_date);
487 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); 488 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
488 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); 489 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
489 HFSPLUS_I(inode).create_date = folder->create_date; 490 HFSPLUS_I(inode)->create_date = folder->create_date;
490 HFSPLUS_I(inode).fs_blocks = 0; 491 HFSPLUS_I(inode)->fs_blocks = 0;
491 inode->i_op = &hfsplus_dir_inode_operations; 492 inode->i_op = &hfsplus_dir_inode_operations;
492 inode->i_fop = &hfsplus_dir_operations; 493 inode->i_fop = &hfsplus_dir_operations;
493 } else if (type == HFSPLUS_FILE) { 494 } else if (type == HFSPLUS_FILE) {
@@ -518,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
518 inode->i_atime = hfsp_mt2ut(file->access_date); 519 inode->i_atime = hfsp_mt2ut(file->access_date);
519 inode->i_mtime = hfsp_mt2ut(file->content_mod_date); 520 inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
520 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); 521 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
521 HFSPLUS_I(inode).create_date = file->create_date; 522 HFSPLUS_I(inode)->create_date = file->create_date;
522 } else { 523 } else {
523 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); 524 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
524 res = -EIO; 525 res = -EIO;
@@ -533,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode)
533 hfsplus_cat_entry entry; 534 hfsplus_cat_entry entry;
534 535
535 if (HFSPLUS_IS_RSRC(inode)) 536 if (HFSPLUS_IS_RSRC(inode))
536 main_inode = HFSPLUS_I(inode).rsrc_inode; 537 main_inode = HFSPLUS_I(inode)->rsrc_inode;
537 538
538 if (!main_inode->i_nlink) 539 if (!main_inode->i_nlink)
539 return 0; 540 return 0;
540 541
541 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd)) 542 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
542 /* panic? */ 543 /* panic? */
543 return -EIO; 544 return -EIO;
544 545
@@ -554,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
554 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 555 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
555 sizeof(struct hfsplus_cat_folder)); 556 sizeof(struct hfsplus_cat_folder));
556 /* simple node checks? */ 557 /* simple node checks? */
557 hfsplus_set_perms(inode, &folder->permissions); 558 hfsplus_cat_set_perms(inode, &folder->permissions);
558 folder->access_date = hfsp_ut2mt(inode->i_atime); 559 folder->access_date = hfsp_ut2mt(inode->i_atime);
559 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); 560 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
560 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 561 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -576,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
576 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 577 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
577 sizeof(struct hfsplus_cat_file)); 578 sizeof(struct hfsplus_cat_file));
578 hfsplus_inode_write_fork(inode, &file->data_fork); 579 hfsplus_inode_write_fork(inode, &file->data_fork);
579 if (S_ISREG(inode->i_mode)) 580 hfsplus_cat_set_perms(inode, &file->permissions);
580 HFSPLUS_I(inode).dev = inode->i_nlink;
581 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
582 HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev);
583 hfsplus_set_perms(inode, &file->permissions);
584 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) 581 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
585 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 582 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
586 else 583 else
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ac405f099026..5b4667e08ef7 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,83 +17,98 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
22#include "hfsplus_fs.h" 21#include "hfsplus_fs.h"
23 22
24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 23static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
25{ 24{
26 struct inode *inode = filp->f_path.dentry->d_inode; 25 struct inode *inode = file->f_path.dentry->d_inode;
26 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags = 0;
28
29 if (inode->i_flags & S_IMMUTABLE)
30 flags |= FS_IMMUTABLE_FL;
 31 if (inode->i_flags & S_APPEND)
32 flags |= FS_APPEND_FL;
33 if (hip->userflags & HFSPLUS_FLG_NODUMP)
34 flags |= FS_NODUMP_FL;
35
36 return put_user(flags, user_flags);
37}
38
39static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
40{
41 struct inode *inode = file->f_path.dentry->d_inode;
42 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags; 43 unsigned int flags;
44 int err = 0;
28 45
29 lock_kernel(); 46 err = mnt_want_write(file->f_path.mnt);
30 switch (cmd) { 47 if (err)
31 case HFSPLUS_IOC_EXT2_GETFLAGS: 48 goto out;
32 flags = 0;
33 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
34 flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
35 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
36 flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
37 if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
38 flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
39 return put_user(flags, (int __user *)arg);
40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
41 int err = 0;
42 err = mnt_want_write(filp->f_path.mnt);
43 if (err) {
44 unlock_kernel();
45 return err;
46 }
47 49
48 if (!is_owner_or_cap(inode)) { 50 if (!is_owner_or_cap(inode)) {
49 err = -EACCES; 51 err = -EACCES;
50 goto setflags_out; 52 goto out_drop_write;
51 } 53 }
52 if (get_user(flags, (int __user *)arg)) {
53 err = -EFAULT;
54 goto setflags_out;
55 }
56 if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
57 HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
58 if (!capable(CAP_LINUX_IMMUTABLE)) {
59 err = -EPERM;
60 goto setflags_out;
61 }
62 }
63 54
64 /* don't silently ignore unsupported ext2 flags */ 55 if (get_user(flags, user_flags)) {
65 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { 56 err = -EFAULT;
66 err = -EOPNOTSUPP; 57 goto out_drop_write;
67 goto setflags_out; 58 }
68 } 59
69 if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ 60 mutex_lock(&inode->i_mutex);
70 inode->i_flags |= S_IMMUTABLE; 61
71 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; 62 if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
72 } else { 63 inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
73 inode->i_flags &= ~S_IMMUTABLE; 64 if (!capable(CAP_LINUX_IMMUTABLE)) {
74 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE; 65 err = -EPERM;
75 } 66 goto out_unlock_inode;
76 if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
77 inode->i_flags |= S_APPEND;
78 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
79 } else {
80 inode->i_flags &= ~S_APPEND;
81 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
82 } 67 }
83 if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
84 HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
85 else
86 HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
87
88 inode->i_ctime = CURRENT_TIME_SEC;
89 mark_inode_dirty(inode);
90setflags_out:
91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
93 return err;
94 } 68 }
69
70 /* don't silently ignore unsupported ext2 flags */
71 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
72 err = -EOPNOTSUPP;
73 goto out_unlock_inode;
74 }
75
76 if (flags & FS_IMMUTABLE_FL)
77 inode->i_flags |= S_IMMUTABLE;
78 else
79 inode->i_flags &= ~S_IMMUTABLE;
80
81 if (flags & FS_APPEND_FL)
82 inode->i_flags |= S_APPEND;
83 else
84 inode->i_flags &= ~S_APPEND;
85
86 if (flags & FS_NODUMP_FL)
87 hip->userflags |= HFSPLUS_FLG_NODUMP;
88 else
89 hip->userflags &= ~HFSPLUS_FLG_NODUMP;
90
91 inode->i_ctime = CURRENT_TIME_SEC;
92 mark_inode_dirty(inode);
93
94out_unlock_inode:
 95 mutex_unlock(&inode->i_mutex);
96out_drop_write:
97 mnt_drop_write(file->f_path.mnt);
98out:
99 return err;
100}
101
102long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
103{
104 void __user *argp = (void __user *)arg;
105
106 switch (cmd) {
107 case HFSPLUS_IOC_EXT2_GETFLAGS:
108 return hfsplus_ioctl_getflags(file, argp);
109 case HFSPLUS_IOC_EXT2_SETFLAGS:
110 return hfsplus_ioctl_setflags(file, argp);
95 default: 111 default:
96 unlock_kernel();
97 return -ENOTTY; 112 return -ENOTTY;
98 } 113 }
99} 114}
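Note: with lock_kernel() gone, the entry point is a pure dispatcher and each command takes only the locks it actually needs (write access, then i_mutex). A minimal user-space caller, as a sketch; it assumes HFSPLUS_IOC_EXT2_GETFLAGS/SETFLAGS alias the generic FS_IOC_GETFLAGS/SETFLAGS numbers:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		int fd, flags;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
			perror("getflags");
			return 1;
		}
		flags |= FS_NODUMP_FL;	/* matches HFSPLUS_FLG_NODUMP above */
		if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
			perror("setflags");
		close(fd);
		return 0;
	}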
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
110 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) 125 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
111 return -EOPNOTSUPP; 126 return -EOPNOTSUPP;
112 127
113 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 128 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
114 if (res) 129 if (res)
115 return res; 130 return res;
116 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 131 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
153 return -EOPNOTSUPP; 168 return -EOPNOTSUPP;
154 169
155 if (size) { 170 if (size) {
156 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 171 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
157 if (res) 172 if (res)
158 return res; 173 return res;
159 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 174 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
177 } else 192 } else
178 res = size ? -ERANGE : 4; 193 res = size ? -ERANGE : 4;
179 } else 194 } else
180 res = -ENODATA; 195 res = -EOPNOTSUPP;
181out: 196out:
182 if (size) 197 if (size)
183 hfs_find_exit(&fd); 198 hfs_find_exit(&fd);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 572628b4b07d..f9ab276a4d8d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
143 kfree(p); 143 kfree(p);
144 break; 144 break;
145 case opt_decompose: 145 case opt_decompose:
146 sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE; 146 clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
147 break; 147 break;
148 case opt_nodecompose: 148 case opt_nodecompose:
149 sbi->flags |= HFSPLUS_SB_NODECOMPOSE; 149 set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
150 break; 150 break;
151 case opt_force: 151 case opt_force:
152 sbi->flags |= HFSPLUS_SB_FORCE; 152 set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
153 break; 153 break;
154 default: 154 default:
155 return 0; 155 return 0;
@@ -171,7 +171,7 @@ done:
171 171
172int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) 172int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
173{ 173{
174 struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb); 174 struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
175 175
176 if (sbi->creator != HFSPLUS_DEF_CR_TYPE) 176 if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); 177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
@@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
184 seq_printf(seq, ",session=%u", sbi->session); 184 seq_printf(seq, ",session=%u", sbi->session);
185 if (sbi->nls) 185 if (sbi->nls)
186 seq_printf(seq, ",nls=%s", sbi->nls->charset); 186 seq_printf(seq, ",nls=%s", sbi->nls->charset);
187 if (sbi->flags & HFSPLUS_SB_NODECOMPOSE) 187 if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
188 seq_printf(seq, ",nodecompose"); 188 seq_printf(seq, ",nodecompose");
189 return 0; 189 return 0;
190} 190}
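Note: the HFSPLUS_SB_* constants change from OR-able masks to bit numbers because set_bit()/clear_bit()/test_bit() take a bit index into an unsigned long, which is what makes these flag updates atomic without a spinlock. A user-space model of the difference (illustrative):

	#include <stdio.h>

	enum { SB_WRITEBACKUP, SB_NODECOMPOSE, SB_FORCE };	/* bit numbers */

	static void set_flag(unsigned long *flags, int bit)
	{
		*flags |= 1UL << bit;	/* the kernel's set_bit() is atomic */
	}

	static int test_flag(unsigned long flags, int bit)
	{
		return !!(flags & (1UL << bit));
	}

	int main(void)
	{
		unsigned long flags = 0;

		set_flag(&flags, SB_NODECOMPOSE);
		printf("nodecompose=%d force=%d\n",
		       test_flag(flags, SB_NODECOMPOSE),
		       test_flag(flags, SB_FORCE));
		return 0;
	}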
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 1528a6fd0299..208b16c645cc 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -74,6 +74,7 @@ struct old_pmap {
74int hfs_part_find(struct super_block *sb, 74int hfs_part_find(struct super_block *sb,
75 sector_t *part_start, sector_t *part_size) 75 sector_t *part_start, sector_t *part_size)
76{ 76{
77 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
77 struct buffer_head *bh; 78 struct buffer_head *bh;
78 __be16 *data; 79 __be16 *data;
79 int i, size, res; 80 int i, size, res;
@@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb,
95 for (i = 0; i < size; p++, i++) { 96 for (i = 0; i < size; p++, i++) {
96 if (p->pdStart && p->pdSize && 97 if (p->pdStart && p->pdSize &&
97 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && 98 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
98 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { 99 (sbi->part < 0 || sbi->part == i)) {
99 *part_start += be32_to_cpu(p->pdStart); 100 *part_start += be32_to_cpu(p->pdStart);
100 *part_size = be32_to_cpu(p->pdSize); 101 *part_size = be32_to_cpu(p->pdSize);
101 res = 0; 102 res = 0;
@@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb,
111 size = be32_to_cpu(pm->pmMapBlkCnt); 112 size = be32_to_cpu(pm->pmMapBlkCnt);
112 for (i = 0; i < size;) { 113 for (i = 0; i < size;) {
113 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && 114 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
114 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { 115 (sbi->part < 0 || sbi->part == i)) {
115 *part_start += be32_to_cpu(pm->pmPyPartStart); 116 *part_start += be32_to_cpu(pm->pmPyPartStart);
116 *part_size = be32_to_cpu(pm->pmPartBlkCnt); 117 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
117 res = 0; 118 res = 0;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3b55c050c742..9a88d7536103 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,7 +12,6 @@
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/smp_lock.h>
16#include <linux/vfs.h> 15#include <linux/vfs.h>
17#include <linux/nls.h> 16#include <linux/nls.h>
18 17
@@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode);
21 20
22#include "hfsplus_fs.h" 21#include "hfsplus_fs.h"
23 22
24struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) 23static int hfsplus_system_read_inode(struct inode *inode)
25{ 24{
26 struct hfs_find_data fd; 25 struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
27 struct hfsplus_vh *vhdr;
28 struct inode *inode;
29 long err = -EIO;
30
31 inode = iget_locked(sb, ino);
32 if (!inode)
33 return ERR_PTR(-ENOMEM);
34 if (!(inode->i_state & I_NEW))
35 return inode;
36 26
37 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 27 switch (inode->i_ino) {
38 mutex_init(&HFSPLUS_I(inode).extents_lock);
39 HFSPLUS_I(inode).flags = 0;
40 HFSPLUS_I(inode).rsrc_inode = NULL;
41 atomic_set(&HFSPLUS_I(inode).opencnt, 0);
42
43 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
44 read_inode:
45 hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
46 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
47 if (!err)
48 err = hfsplus_cat_read_inode(inode, &fd);
49 hfs_find_exit(&fd);
50 if (err)
51 goto bad_inode;
52 goto done;
53 }
54 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
55 switch(inode->i_ino) {
56 case HFSPLUS_ROOT_CNID:
57 goto read_inode;
58 case HFSPLUS_EXT_CNID: 28 case HFSPLUS_EXT_CNID:
59 hfsplus_inode_read_fork(inode, &vhdr->ext_file); 29 hfsplus_inode_read_fork(inode, &vhdr->ext_file);
60 inode->i_mapping->a_ops = &hfsplus_btree_aops; 30 inode->i_mapping->a_ops = &hfsplus_btree_aops;
@@ -75,74 +45,101 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
75 inode->i_mapping->a_ops = &hfsplus_btree_aops; 45 inode->i_mapping->a_ops = &hfsplus_btree_aops;
76 break; 46 break;
77 default: 47 default:
78 goto bad_inode; 48 return -EIO;
49 }
50
51 return 0;
52}
53
54struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
55{
56 struct hfs_find_data fd;
57 struct inode *inode;
58 int err;
59
60 inode = iget_locked(sb, ino);
61 if (!inode)
62 return ERR_PTR(-ENOMEM);
63 if (!(inode->i_state & I_NEW))
64 return inode;
65
66 INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
67 mutex_init(&HFSPLUS_I(inode)->extents_lock);
68 HFSPLUS_I(inode)->flags = 0;
69 HFSPLUS_I(inode)->rsrc_inode = NULL;
70 atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
71
72 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
73 inode->i_ino == HFSPLUS_ROOT_CNID) {
74 hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
75 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
76 if (!err)
77 err = hfsplus_cat_read_inode(inode, &fd);
78 hfs_find_exit(&fd);
79 } else {
80 err = hfsplus_system_read_inode(inode);
81 }
82
83 if (err) {
84 iget_failed(inode);
85 return ERR_PTR(err);
79 } 86 }
80 87
81done:
82 unlock_new_inode(inode); 88 unlock_new_inode(inode);
83 return inode; 89 return inode;
84
85bad_inode:
86 iget_failed(inode);
87 return ERR_PTR(err);
88} 90}
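Note: the rewritten hfsplus_iget() follows the standard iget_locked() contract: a cache hit returns a ready inode, a miss returns an I_NEW inode that must end in either unlock_new_inode() or iget_failed(). The skeleton, with the fill routine left hypothetical:

	struct inode *example_iget(struct super_block *sb, unsigned long ino)
	{
		struct inode *inode = iget_locked(sb, ino);
		int err;

		if (!inode)
			return ERR_PTR(-ENOMEM);
		if (!(inode->i_state & I_NEW))
			return inode;		/* cache hit, already set up */

		err = fill_from_disk(inode);	/* hypothetical fill routine */
		if (err) {
			iget_failed(inode);	/* unlocks and drops the inode */
			return ERR_PTR(err);
		}
		unlock_new_inode(inode);
		return inode;
	}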
89 91
90static int hfsplus_write_inode(struct inode *inode, 92static int hfsplus_system_write_inode(struct inode *inode)
91 struct writeback_control *wbc)
92{ 93{
93 struct hfsplus_vh *vhdr; 94 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
94 int ret = 0; 95 struct hfsplus_vh *vhdr = sbi->s_vhdr;
96 struct hfsplus_fork_raw *fork;
97 struct hfs_btree *tree = NULL;
95 98
96 dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
97 hfsplus_ext_write_extent(inode);
98 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
99 return hfsplus_cat_write_inode(inode);
100 }
101 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
102 switch (inode->i_ino) { 99 switch (inode->i_ino) {
103 case HFSPLUS_ROOT_CNID:
104 ret = hfsplus_cat_write_inode(inode);
105 break;
106 case HFSPLUS_EXT_CNID: 100 case HFSPLUS_EXT_CNID:
107 if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) { 101 fork = &vhdr->ext_file;
108 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 102 tree = sbi->ext_tree;
109 inode->i_sb->s_dirt = 1;
110 }
111 hfsplus_inode_write_fork(inode, &vhdr->ext_file);
112 hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
113 break; 103 break;
114 case HFSPLUS_CAT_CNID: 104 case HFSPLUS_CAT_CNID:
115 if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) { 105 fork = &vhdr->cat_file;
116 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 106 tree = sbi->cat_tree;
117 inode->i_sb->s_dirt = 1;
118 }
119 hfsplus_inode_write_fork(inode, &vhdr->cat_file);
120 hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
121 break; 107 break;
122 case HFSPLUS_ALLOC_CNID: 108 case HFSPLUS_ALLOC_CNID:
123 if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) { 109 fork = &vhdr->alloc_file;
124 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
125 inode->i_sb->s_dirt = 1;
126 }
127 hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
128 break; 110 break;
129 case HFSPLUS_START_CNID: 111 case HFSPLUS_START_CNID:
130 if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) { 112 fork = &vhdr->start_file;
131 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
132 inode->i_sb->s_dirt = 1;
133 }
134 hfsplus_inode_write_fork(inode, &vhdr->start_file);
135 break; 113 break;
136 case HFSPLUS_ATTR_CNID: 114 case HFSPLUS_ATTR_CNID:
137 if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) { 115 fork = &vhdr->attr_file;
138 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 116 tree = sbi->attr_tree;
139 inode->i_sb->s_dirt = 1; 117 break;
 118 default:
140 } 118 return -EIO;
141 hfsplus_inode_write_fork(inode, &vhdr->attr_file); 119 }
142 hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree); 120
143 break; 121 if (fork->total_size != cpu_to_be64(inode->i_size)) {
122 set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
123 inode->i_sb->s_dirt = 1;
144 } 124 }
145 return ret; 125 hfsplus_inode_write_fork(inode, fork);
126 if (tree)
127 hfs_btree_write(tree);
128 return 0;
129}
130
131static int hfsplus_write_inode(struct inode *inode,
132 struct writeback_control *wbc)
133{
134 dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
135
136 hfsplus_ext_write_extent(inode);
137
138 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
139 inode->i_ino == HFSPLUS_ROOT_CNID)
140 return hfsplus_cat_write_inode(inode);
141 else
142 return hfsplus_system_write_inode(inode);
146} 143}
147 144
148static void hfsplus_evict_inode(struct inode *inode) 145static void hfsplus_evict_inode(struct inode *inode)
@@ -151,51 +148,53 @@ static void hfsplus_evict_inode(struct inode *inode)
         truncate_inode_pages(&inode->i_data, 0);
         end_writeback(inode);
         if (HFSPLUS_IS_RSRC(inode)) {
-                HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL;
-                iput(HFSPLUS_I(inode).rsrc_inode);
+                HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
+                iput(HFSPLUS_I(inode)->rsrc_inode);
         }
 }
 
 int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
-        struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+        struct hfsplus_vh *vhdr = sbi->s_vhdr;
 
         dprint(DBG_SUPER, "hfsplus_write_super\n");
 
-        lock_super(sb);
+        mutex_lock(&sbi->vh_mutex);
+        mutex_lock(&sbi->alloc_mutex);
         sb->s_dirt = 0;
 
-        vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
-        vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
-        vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid);
-        vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count);
-        vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count);
+        vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
+        vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
+        vhdr->folder_count = cpu_to_be32(sbi->folder_count);
+        vhdr->file_count = cpu_to_be32(sbi->file_count);
 
-        mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-        if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) {
-                if (HFSPLUS_SB(sb).sect_count) {
+        mark_buffer_dirty(sbi->s_vhbh);
+        if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
+                if (sbi->sect_count) {
                         struct buffer_head *bh;
                         u32 block, offset;
 
-                        block = HFSPLUS_SB(sb).blockoffset;
-                        block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9);
-                        offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1);
-                        printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset,
-                                HFSPLUS_SB(sb).sect_count, block, offset);
+                        block = sbi->blockoffset;
+                        block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
+                        offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
+                        printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
+                                sbi->blockoffset, sbi->sect_count,
+                                block, offset);
                         bh = sb_bread(sb, block);
                         if (bh) {
                                 vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
                                 if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
-                                        memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr));
+                                        memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
                                         mark_buffer_dirty(bh);
                                         brelse(bh);
                                 } else
                                         printk(KERN_WARNING "hfs: backup not found!\n");
                         }
                 }
-                HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
         }
-        unlock_super(sb);
+        mutex_unlock(&sbi->alloc_mutex);
+        mutex_unlock(&sbi->vh_mutex);
         return 0;
 }
 
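The test_and_clear_bit() call above reads and clears HFSPLUS_SB_WRITEBACKUP in a single atomic step, which is why the separate flags &= ~HFSPLUS_SB_WRITEBACKUP line disappears. A minimal sketch of the bitop pattern, assuming flags is an unsigned long as the kernel helpers require:

    unsigned long flags = 0;

    set_bit(HFSPLUS_SB_WRITEBACKUP, &flags);        /* atomic set */
    if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &flags)) {
            /* the flag was observed and cleared with no window in between */
    }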
@@ -209,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb)
 
 static void hfsplus_put_super(struct super_block *sb)
 {
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+
         dprint(DBG_SUPER, "hfsplus_put_super\n");
+
         if (!sb->s_fs_info)
                 return;
 
-        lock_kernel();
-
         if (sb->s_dirt)
                 hfsplus_write_super(sb);
-        if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
-                struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+        if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
+                struct hfsplus_vh *vhdr = sbi->s_vhdr;
 
                 vhdr->modify_date = hfsp_now2mt();
                 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
                 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
-                mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-                sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+                mark_buffer_dirty(sbi->s_vhbh);
+                sync_dirty_buffer(sbi->s_vhbh);
         }
 
-        hfs_btree_close(HFSPLUS_SB(sb).cat_tree);
-        hfs_btree_close(HFSPLUS_SB(sb).ext_tree);
-        iput(HFSPLUS_SB(sb).alloc_file);
-        iput(HFSPLUS_SB(sb).hidden_dir);
-        brelse(HFSPLUS_SB(sb).s_vhbh);
-        unload_nls(HFSPLUS_SB(sb).nls);
+        hfs_btree_close(sbi->cat_tree);
+        hfs_btree_close(sbi->ext_tree);
+        iput(sbi->alloc_file);
+        iput(sbi->hidden_dir);
+        brelse(sbi->s_vhbh);
+        unload_nls(sbi->nls);
         kfree(sb->s_fs_info);
         sb->s_fs_info = NULL;
-
-        unlock_kernel();
 }
 
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
         struct super_block *sb = dentry->d_sb;
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
         buf->f_type = HFSPLUS_SUPER_MAGIC;
         buf->f_bsize = sb->s_blocksize;
-        buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
-        buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift;
+        buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
+        buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
         buf->f_bavail = buf->f_bfree;
         buf->f_files = 0xFFFFFFFF;
-        buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+        buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
         buf->f_fsid.val[0] = (u32)id;
         buf->f_fsid.val[1] = (u32)(id >> 32);
         buf->f_namelen = HFSPLUS_MAX_STRLEN;
@@ -263,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
         if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                 return 0;
         if (!(*flags & MS_RDONLY)) {
-                struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+                struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
                 struct hfsplus_sb_info sbi;
 
                 memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
-                sbi.nls = HFSPLUS_SB(sb).nls;
+                sbi.nls = HFSPLUS_SB(sb)->nls;
                 if (!hfsplus_parse_options(data, &sbi))
                         return -EINVAL;
 
@@ -276,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
                         "running fsck.hfsplus is recommended. leaving read-only.\n");
                         sb->s_flags |= MS_RDONLY;
                         *flags |= MS_RDONLY;
-                } else if (sbi.flags & HFSPLUS_SB_FORCE) {
+                } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
                         /* nothing */
                 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
                         printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
@@ -320,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 return -ENOMEM;
 
         sb->s_fs_info = sbi;
-        INIT_HLIST_HEAD(&sbi->rsrc_inodes);
+        mutex_init(&sbi->alloc_mutex);
+        mutex_init(&sbi->vh_mutex);
         hfsplus_fill_defaults(sbi);
         if (!hfsplus_parse_options(data, sbi)) {
                 printk(KERN_ERR "hfs: unable to parse mount options\n");
@@ -344,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 err = -EINVAL;
                 goto cleanup;
         }
-        vhdr = HFSPLUS_SB(sb).s_vhdr;
+        vhdr = sbi->s_vhdr;
 
         /* Copy parts of the volume header into the superblock */
         sb->s_magic = HFSPLUS_VOLHEAD_SIG;
@@ -353,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 printk(KERN_ERR "hfs: wrong filesystem version\n");
                 goto cleanup;
         }
-        HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks);
-        HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks);
-        HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc);
-        HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid);
-        HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count);
-        HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count);
-        HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-        if (!HFSPLUS_SB(sb).data_clump_blocks)
-                HFSPLUS_SB(sb).data_clump_blocks = 1;
-        HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-        if (!HFSPLUS_SB(sb).rsrc_clump_blocks)
-                HFSPLUS_SB(sb).rsrc_clump_blocks = 1;
+        sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
+        sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
+        sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
+        sbi->file_count = be32_to_cpu(vhdr->file_count);
+        sbi->folder_count = be32_to_cpu(vhdr->folder_count);
+        sbi->data_clump_blocks =
+                be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
+        if (!sbi->data_clump_blocks)
+                sbi->data_clump_blocks = 1;
+        sbi->rsrc_clump_blocks =
+                be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
+        if (!sbi->rsrc_clump_blocks)
+                sbi->rsrc_clump_blocks = 1;
 
         /* Set up operations so we can load metadata */
         sb->s_op = &hfsplus_sops;
@@ -374,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
                        "running fsck.hfsplus is recommended. mounting read-only.\n");
                 sb->s_flags |= MS_RDONLY;
-        } else if (sbi->flags & HFSPLUS_SB_FORCE) {
+        } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
                 /* nothing */
         } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
                 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
@@ -384,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                        "use the force option at your own risk, mounting read-only.\n");
                 sb->s_flags |= MS_RDONLY;
         }
-        sbi->flags &= ~HFSPLUS_SB_FORCE;
 
         /* Load metadata objects (B*Trees) */
-        HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
-        if (!HFSPLUS_SB(sb).ext_tree) {
+        sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
+        if (!sbi->ext_tree) {
                 printk(KERN_ERR "hfs: failed to load extents file\n");
                 goto cleanup;
         }
-        HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
-        if (!HFSPLUS_SB(sb).cat_tree) {
+        sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
+        if (!sbi->cat_tree) {
                 printk(KERN_ERR "hfs: failed to load catalog file\n");
                 goto cleanup;
         }
@@ -404,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 err = PTR_ERR(inode);
                 goto cleanup;
         }
-        HFSPLUS_SB(sb).alloc_file = inode;
+        sbi->alloc_file = inode;
 
         /* Load the root directory */
         root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
@@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 
         str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
         str.name = HFSP_HIDDENDIR_NAME;
-        hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+        hfs_find_init(sbi->cat_tree, &fd);
         hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
         if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
                 hfs_find_exit(&fd);
@@ -434,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                         err = PTR_ERR(inode);
                         goto cleanup;
                 }
-                HFSPLUS_SB(sb).hidden_dir = inode;
+                sbi->hidden_dir = inode;
         } else
                 hfs_find_exit(&fd);
 
@@ -449,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 be32_add_cpu(&vhdr->write_count, 1);
                 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
                 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
-                mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-                sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+                mark_buffer_dirty(sbi->s_vhbh);
+                sync_dirty_buffer(sbi->s_vhbh);
 
-                if (!HFSPLUS_SB(sb).hidden_dir) {
+                if (!sbi->hidden_dir) {
                         printk(KERN_DEBUG "hfs: create hidden dir...\n");
-                        HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
-                        hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode,
-                                           &str, HFSPLUS_SB(sb).hidden_dir);
-                        mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir);
+
+                        mutex_lock(&sbi->vh_mutex);
+                        sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
+                        hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
+                                           &str, sbi->hidden_dir);
+                        mutex_unlock(&sbi->vh_mutex);
+
+                        mark_inode_dirty(sbi->hidden_dir);
                 }
 out:
         unload_nls(sbi->nls);
@@ -486,7 +490,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
 
 static void hfsplus_destroy_inode(struct inode *inode)
 {
-        kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode));
+        kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
 }
 
 #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 628ccf6fa402..b66d67de882c 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
 int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
 {
         const hfsplus_unichr *ip;
-        struct nls_table *nls = HFSPLUS_SB(sb).nls;
+        struct nls_table *nls = HFSPLUS_SB(sb)->nls;
         u8 *op;
         u16 cc, c0, c1;
         u16 *ce1, *ce2;
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
         ustrlen = be16_to_cpu(ustr->length);
         len = *len_p;
         ce1 = NULL;
-        compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+        compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 
         while (ustrlen > 0) {
                 c0 = be16_to_cpu(*ip++);
@@ -246,7 +246,7 @@ out:
 static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
                               wchar_t *uc)
 {
-        int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc);
+        int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
         if (size <= 0) {
                 *uc = '?';
                 size = 1;
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
         u16 *dstr, outlen = 0;
         wchar_t c;
 
-        decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+        decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
         while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
                 size = asc2unichar(sb, astr, len, &c);
 
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
         wchar_t c;
         u16 c2;
 
-        casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-        decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+        casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+        decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
         hash = init_name_hash();
         astr = str->name;
         len = str->len;
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
         u16 c1, c2;
         wchar_t c;
 
-        casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-        decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+        casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+        decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
         astr1 = s1->name;
         len1 = s1->len;
         astr2 = s2->name;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index bed78ac8f6d1..8972c20b3216 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
         *start = 0;
         *size = sb->s_bdev->bd_inode->i_size >> 9;
 
-        if (HFSPLUS_SB(sb).session >= 0) {
-                te.cdte_track = HFSPLUS_SB(sb).session;
+        if (HFSPLUS_SB(sb)->session >= 0) {
+                te.cdte_track = HFSPLUS_SB(sb)->session;
                 te.cdte_format = CDROM_LBA;
                 res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
                 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
@@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
 /* Takes in super block, returns true if good data read */
 int hfsplus_read_wrapper(struct super_block *sb)
 {
+        struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct buffer_head *bh;
         struct hfsplus_vh *vhdr;
         struct hfsplus_wd wd;
@@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
                 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
                         break;
                 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
-                        HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX;
+                        set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
                         break;
                 }
                 brelse(bh);
@@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb)
         if (blocksize < HFSPLUS_SECTOR_SIZE ||
             ((blocksize - 1) & blocksize))
                 return -EINVAL;
-        HFSPLUS_SB(sb).alloc_blksz = blocksize;
-        HFSPLUS_SB(sb).alloc_blksz_shift = 0;
+        sbi->alloc_blksz = blocksize;
+        sbi->alloc_blksz_shift = 0;
         while ((blocksize >>= 1) != 0)
-                HFSPLUS_SB(sb).alloc_blksz_shift++;
-        blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE);
+                sbi->alloc_blksz_shift++;
+        blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
 
         /* align block size to block offset */
         while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
@@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb)
                 return -EINVAL;
         }
 
-        HFSPLUS_SB(sb).blockoffset = part_start >>
-                        (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
-        HFSPLUS_SB(sb).sect_count = part_size;
-        HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift -
-                        sb->s_blocksize_bits;
+        sbi->blockoffset =
+                part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
+        sbi->sect_count = part_size;
+        sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
 
         bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
         if (!bh)
                 return -EIO;
 
         /* should still be the same... */
-        if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ?
-                                cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) :
-                                cpu_to_be16(HFSPLUS_VOLHEAD_SIG)))
-                goto error;
-        HFSPLUS_SB(sb).s_vhbh = bh;
-        HFSPLUS_SB(sb).s_vhdr = vhdr;
+        if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
+                if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
+                        goto error;
+        } else {
+                if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
+                        goto error;
+        }
+
+        sbi->s_vhbh = bh;
+        sbi->s_vhdr = vhdr;
 
         return 0;
 error:
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 6bbd75c5589b..bf15a43016b9 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -28,12 +28,7 @@
  * #define ATTR_KILL_SUID 2048
  * #define ATTR_KILL_SGID 4096
  *
- * and this is because they were added in 2.5 development in this patch:
- *
- * http://linux.bkbits.net:8080/linux-2.5/
- * cset@3caf4a12k4XgDzK7wyK-TGpSZ9u2Ww?nav=index.html
- * |src/.|src/include|src/include/linux|related/include/linux/fs.h
- *
+ * and this is because they were added in 2.5 development.
  * Actually, they are not needed by most ->setattr() methods - they are set by
  * callers of notify_change() to notify that the setuid/setgid bits must be
  * dropped.
@@ -96,7 +91,6 @@ extern int rename_file(char *from, char *to);
 extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
                      long long *bfree_out, long long *bavail_out,
                      long long *files_out, long long *ffree_out,
-                     void *fsid_out, int fsid_size, long *namelen_out,
-                     long *spare_out);
+                     void *fsid_out, int fsid_size, long *namelen_out);
 
 #endif
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index f7dc9b5f9ef8..cd7c93917cc7 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -217,7 +217,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
         err = do_statfs(dentry->d_sb->s_fs_info,
                         &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
                         &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
-                        &sf->f_namelen, sf->f_spare);
+                        &sf->f_namelen);
         if (err)
                 return err;
         sf->f_blocks = f_blocks;
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 6777aa06ce2c..d51a98384bc0 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -94,8 +94,7 @@ void *open_dir(char *path, int *err_out)
 
         dir = opendir(path);
         *err_out = errno;
-        if (dir == NULL)
-                return NULL;
+
         return dir;
 }
 
@@ -205,7 +204,7 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
         if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
                 if (fd >= 0) {
                         if (fchmod(fd, attrs->ia_mode) != 0)
-                                return (-errno);
+                                return -errno;
                 } else if (chmod(file, attrs->ia_mode) != 0) {
                         return -errno;
                 }
@@ -364,8 +363,7 @@ int rename_file(char *from, char *to)
 int do_statfs(char *root, long *bsize_out, long long *blocks_out,
               long long *bfree_out, long long *bavail_out,
               long long *files_out, long long *ffree_out,
-              void *fsid_out, int fsid_size, long *namelen_out,
-              long *spare_out)
+              void *fsid_out, int fsid_size, long *namelen_out)
 {
         struct statfs64 buf;
         int err;
@@ -384,10 +382,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out,
                sizeof(buf.f_fsid) > fsid_size ? fsid_size :
                sizeof(buf.f_fsid));
         *namelen_out = buf.f_namelen;
-        spare_out[0] = buf.f_spare[0];
-        spare_out[1] = buf.f_spare[1];
-        spare_out[2] = buf.f_spare[2];
-        spare_out[3] = buf.f_spare[3];
-        spare_out[4] = buf.f_spare[4];
+
         return 0;
 }
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 56bd15c5bf6c..63b6f5632318 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,6 +1,7 @@
 config HPFS_FS
         tristate "OS/2 HPFS file system support"
         depends on BLOCK
+        depends on BKL # nontrivial to fix
         help
           OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
           is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 2607010be2fe..c969a1aa163a 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -477,11 +477,15 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 
         int o;
 
+        lock_kernel();
+
         save_mount_options(s, options);
 
         sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-        if (!sbi)
+        if (!sbi) {
+                unlock_kernel();
                 return -ENOMEM;
+        }
         s->s_fs_info = sbi;
 
         sbi->sb_bmp_dir = NULL;
@@ -666,6 +670,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
                 root->i_blocks = 5;
                 hpfs_brelse4(&qbh);
         }
+        unlock_kernel();
         return 0;
 
 bail4:  brelse(bh2);
@@ -677,6 +682,7 @@ bail0:
         kfree(sbi->sb_cp_table);
         s->s_fs_info = NULL;
         kfree(sbi);
+        unlock_kernel();
         return -EINVAL;
 }
 
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 7b027720d820..4e2a45ea6140 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -598,6 +598,7 @@ static const struct file_operations hppfs_dir_fops = {
         .readdir        = hppfs_readdir,
         .open           = hppfs_dir_open,
         .fsync          = hppfs_fsync,
+        .llseek         = default_llseek,
 };
 
 static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6e5bd42f3860..b14be3f781c7 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/magic.h>
+#include <linux/migrate.h>
 
 #include <asm/uaccess.h>
 
@@ -455,6 +456,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
         inode = new_inode(sb);
         if (inode) {
                 struct hugetlbfs_inode_info *info;
+                inode->i_ino = get_next_ino();
                 inode->i_mode = mode;
                 inode->i_uid = uid;
                 inode->i_gid = gid;
@@ -573,6 +575,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
         return 0;
 }
 
+static int hugetlbfs_migrate_page(struct address_space *mapping,
+                                struct page *newpage, struct page *page)
+{
+        int rc;
+
+        rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+        if (rc)
+                return rc;
+        migrate_page_copy(newpage, page);
+
+        return 0;
+}
+
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
         struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -659,6 +674,7 @@ static const struct address_space_operations hugetlbfs_aops = {
         .write_begin    = hugetlbfs_write_begin,
         .write_end      = hugetlbfs_write_end,
         .set_page_dirty = hugetlbfs_set_page_dirty,
+        .migratepage    = hugetlbfs_migrate_page,
 };
 
 
@@ -674,6 +690,7 @@ const struct file_operations hugetlbfs_file_operations = {
         .mmap                   = hugetlbfs_file_mmap,
         .fsync                  = noop_fsync,
         .get_unmapped_area      = hugetlb_get_unmapped_area,
+        .llseek                 = default_llseek,
 };
 
 static const struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/fs/inode.c b/fs/inode.c
index 86464332e590..ae2727ab0c3a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -24,11 +24,11 @@
 #include <linux/mount.h>
 #include <linux/async.h>
 #include <linux/posix_acl.h>
+#include <linux/ima.h>
 
 /*
  * This is needed for the following functions:
  * - inode_has_buffers
- * - invalidate_inode_buffers
  * - invalidate_bdev
  *
  * FIXME: remove all knowledge of the buffer layer from this file
@@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
  * allowing for low-overhead inode sync() operations.
  */
 
-LIST_HEAD(inode_in_use);
-LIST_HEAD(inode_unused);
+static LIST_HEAD(inode_lru);
 static struct hlist_head *inode_hashtable __read_mostly;
 
 /*
@@ -103,8 +102,41 @@ static DECLARE_RWSEM(iprune_sem);
  */
 struct inodes_stat_t inodes_stat;
 
+static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
+
 static struct kmem_cache *inode_cachep __read_mostly;
 
+static inline int get_nr_inodes(void)
+{
+        return percpu_counter_sum_positive(&nr_inodes);
+}
+
+static inline int get_nr_inodes_unused(void)
+{
+        return percpu_counter_sum_positive(&nr_inodes_unused);
+}
+
+int get_nr_dirty_inodes(void)
+{
+        int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+        return nr_dirty > 0 ? nr_dirty : 0;
+
+}
+
+/*
+ * Handle nr_inode sysctl
+ */
+#ifdef CONFIG_SYSCTL
+int proc_nr_inodes(ctl_table *table, int write,
+                   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+        inodes_stat.nr_inodes = get_nr_inodes();
+        inodes_stat.nr_unused = get_nr_inodes_unused();
+        return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
+
 static void wake_up_inode(struct inode *inode)
 {
         /*
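proc_nr_inodes() folds the per-cpu sums back into inodes_stat immediately before proc_dointvec() formats them, so userspace keeps seeing coherent totals. A small standalone reader, assuming the conventional /proc/sys/fs/inode-nr layout of nr_inodes followed by nr_unused:

    #include <stdio.h>

    int main(void)
    {
            long nr_inodes, nr_unused;
            FILE *f = fopen("/proc/sys/fs/inode-nr", "r");

            if (!f) {
                    perror("inode-nr");
                    return 1;
            }
            if (fscanf(f, "%ld %ld", &nr_inodes, &nr_unused) != 2) {
                    fclose(f);
                    return 1;
            }
            /* a dirty estimate mirroring get_nr_dirty_inodes() above */
            printf("inodes=%ld unused=%ld dirty<=%ld\n",
                   nr_inodes, nr_unused, nr_inodes - nr_unused);
            fclose(f);
            return 0;
    }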
@@ -192,6 +224,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
         inode->i_fsnotify_mask = 0;
 #endif
 
+        percpu_counter_inc(&nr_inodes);
+
         return 0;
 out:
         return -ENOMEM;
@@ -232,11 +266,13 @@ void __destroy_inode(struct inode *inode)
         if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
                 posix_acl_release(inode->i_default_acl);
 #endif
+        percpu_counter_dec(&nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
 
-void destroy_inode(struct inode *inode)
+static void destroy_inode(struct inode *inode)
 {
+        BUG_ON(!list_empty(&inode->i_lru));
         __destroy_inode(inode);
         if (inode->i_sb->s_op->destroy_inode)
                 inode->i_sb->s_op->destroy_inode(inode);
@@ -255,6 +291,8 @@ void inode_init_once(struct inode *inode)
         INIT_HLIST_NODE(&inode->i_hash);
         INIT_LIST_HEAD(&inode->i_dentry);
         INIT_LIST_HEAD(&inode->i_devices);
+        INIT_LIST_HEAD(&inode->i_wb_list);
+        INIT_LIST_HEAD(&inode->i_lru);
         INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
         spin_lock_init(&inode->i_data.tree_lock);
         spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -281,14 +319,109 @@ static void init_once(void *foo)
  */
 void __iget(struct inode *inode)
 {
-        if (atomic_inc_return(&inode->i_count) != 1)
-                return;
+        atomic_inc(&inode->i_count);
+}
+
+/*
+ * get additional reference to inode; caller must already hold one.
+ */
+void ihold(struct inode *inode)
+{
+        WARN_ON(atomic_inc_return(&inode->i_count) < 2);
+}
+EXPORT_SYMBOL(ihold);
+
+static void inode_lru_list_add(struct inode *inode)
+{
+        if (list_empty(&inode->i_lru)) {
+                list_add(&inode->i_lru, &inode_lru);
+                percpu_counter_inc(&nr_inodes_unused);
+        }
+}
 
-        if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-                list_move(&inode->i_list, &inode_in_use);
-        inodes_stat.nr_unused--;
+static void inode_lru_list_del(struct inode *inode)
+{
+        if (!list_empty(&inode->i_lru)) {
+                list_del_init(&inode->i_lru);
+                percpu_counter_dec(&nr_inodes_unused);
+        }
+}
+
+static inline void __inode_sb_list_add(struct inode *inode)
+{
+        list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
 }
 
+/**
+ * inode_sb_list_add - add inode to the superblock list of inodes
+ * @inode: inode to add
+ */
+void inode_sb_list_add(struct inode *inode)
+{
+        spin_lock(&inode_lock);
+        __inode_sb_list_add(inode);
+        spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL_GPL(inode_sb_list_add);
+
+static inline void __inode_sb_list_del(struct inode *inode)
+{
+        list_del_init(&inode->i_sb_list);
+}
+
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+        unsigned long tmp;
+
+        tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+                        L1_CACHE_BYTES;
+        tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+        return tmp & I_HASHMASK;
+}
+
+/**
+ * __insert_inode_hash - hash an inode
+ * @inode: unhashed inode
+ * @hashval: unsigned long value used to locate this object in the
+ *           inode_hashtable.
+ *
+ * Add an inode to the inode hash for this superblock.
+ */
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+        struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
+
+        spin_lock(&inode_lock);
+        hlist_add_head(&inode->i_hash, b);
+        spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(__insert_inode_hash);
+
+/**
+ * __remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock.
+ */
+static void __remove_inode_hash(struct inode *inode)
+{
+        hlist_del_init(&inode->i_hash);
+}
+
+/**
+ * remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock.
+ */
+void remove_inode_hash(struct inode *inode)
+{
+        spin_lock(&inode_lock);
+        hlist_del_init(&inode->i_hash);
+        spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(remove_inode_hash);
+
 void end_writeback(struct inode *inode)
 {
         might_sleep();
@@ -327,101 +460,113 @@ static void evict(struct inode *inode)
  */
 static void dispose_list(struct list_head *head)
 {
-        int nr_disposed = 0;
-
         while (!list_empty(head)) {
                 struct inode *inode;
 
-                inode = list_first_entry(head, struct inode, i_list);
-                list_del(&inode->i_list);
+                inode = list_first_entry(head, struct inode, i_lru);
+                list_del_init(&inode->i_lru);
 
                 evict(inode);
 
                 spin_lock(&inode_lock);
-                hlist_del_init(&inode->i_hash);
-                list_del_init(&inode->i_sb_list);
+                __remove_inode_hash(inode);
+                __inode_sb_list_del(inode);
                 spin_unlock(&inode_lock);
 
                 wake_up_inode(inode);
                 destroy_inode(inode);
-                nr_disposed++;
         }
-        spin_lock(&inode_lock);
-        inodes_stat.nr_inodes -= nr_disposed;
-        spin_unlock(&inode_lock);
 }
 
-/*
- * Invalidate all inodes for a device.
+/**
+ * evict_inodes - evict all evictable inodes for a superblock
+ * @sb: superblock to operate on
+ *
+ * Make sure that no inodes with zero refcount are retained. This is
+ * called by superblock shutdown after having MS_ACTIVE flag removed,
+ * so any inode reaching zero refcount during or after that call will
+ * be immediately evicted.
  */
-static int invalidate_list(struct list_head *head, struct list_head *dispose)
+void evict_inodes(struct super_block *sb)
 {
-        struct list_head *next;
-        int busy = 0, count = 0;
-
-        next = head->next;
-        for (;;) {
-                struct list_head *tmp = next;
-                struct inode *inode;
+        struct inode *inode, *next;
+        LIST_HEAD(dispose);
 
-                /*
-                 * We can reschedule here without worrying about the list's
-                 * consistency because the per-sb list of inodes must not
-                 * change during umount anymore, and because iprune_sem keeps
-                 * shrink_icache_memory() away.
-                 */
-                cond_resched_lock(&inode_lock);
+        down_write(&iprune_sem);
 
-                next = next->next;
-                if (tmp == head)
-                        break;
-                inode = list_entry(tmp, struct inode, i_sb_list);
-                if (inode->i_state & I_NEW)
+        spin_lock(&inode_lock);
+        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+                if (atomic_read(&inode->i_count))
                         continue;
-                invalidate_inode_buffers(inode);
-                if (!atomic_read(&inode->i_count)) {
-                        list_move(&inode->i_list, dispose);
-                        WARN_ON(inode->i_state & I_NEW);
-                        inode->i_state |= I_FREEING;
-                        count++;
+
+                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+                        WARN_ON(1);
                         continue;
                 }
-                busy = 1;
+
+                inode->i_state |= I_FREEING;
+
+                /*
+                 * Move the inode off the IO lists and LRU once I_FREEING is
+                 * set so that it won't get moved back on there if it is dirty.
+                 */
+                list_move(&inode->i_lru, &dispose);
+                list_del_init(&inode->i_wb_list);
+                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+                        percpu_counter_dec(&nr_inodes_unused);
         }
-        /* only unused inodes may be cached with i_count zero */
-        inodes_stat.nr_unused -= count;
-        return busy;
+        spin_unlock(&inode_lock);
+
+        dispose_list(&dispose);
+        up_write(&iprune_sem);
 }
 
 /**
- * invalidate_inodes - discard the inodes on a device
- * @sb: superblock
+ * invalidate_inodes - attempt to free all inodes on a superblock
+ * @sb: superblock to operate on
  *
- * Discard all of the inodes for a given superblock. If the discard
- * fails because there are busy inodes then a non zero value is returned.
- * If the discard is successful all the inodes have been discarded.
+ * Attempts to free all inodes for a given superblock. If there were any
+ * busy inodes return a non-zero value, else zero.
  */
 int invalidate_inodes(struct super_block *sb)
 {
-        int busy;
-        LIST_HEAD(throw_away);
+        int busy = 0;
+        struct inode *inode, *next;
+        LIST_HEAD(dispose);
 
         down_write(&iprune_sem);
+
         spin_lock(&inode_lock);
-        fsnotify_unmount_inodes(&sb->s_inodes);
-        busy = invalidate_list(&sb->s_inodes, &throw_away);
+        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+                        continue;
+                if (atomic_read(&inode->i_count)) {
+                        busy = 1;
+                        continue;
+                }
+
+                inode->i_state |= I_FREEING;
+
+                /*
+                 * Move the inode off the IO lists and LRU once I_FREEING is
+                 * set so that it won't get moved back on there if it is dirty.
+                 */
+                list_move(&inode->i_lru, &dispose);
+                list_del_init(&inode->i_wb_list);
+                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+                        percpu_counter_dec(&nr_inodes_unused);
+        }
         spin_unlock(&inode_lock);
 
-        dispose_list(&throw_away);
+        dispose_list(&dispose);
         up_write(&iprune_sem);
 
         return busy;
 }
-EXPORT_SYMBOL(invalidate_inodes);
 
 static int can_unuse(struct inode *inode)
 {
-        if (inode->i_state)
+        if (inode->i_state & ~I_REFERENCED)
                 return 0;
         if (inode_has_buffers(inode))
                 return 0;
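evict_inodes() and invalidate_inodes() above share one shape: victims are unlinked onto a private dispose list while inode_lock is held, and the expensive teardown runs only after the lock is dropped. A self-contained userspace model of that pattern, illustrative only:

    #include <pthread.h>
    #include <stdlib.h>

    struct node { struct node *next; int busy; };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct node *cache;

    static void shrink_cache(void)
    {
            struct node *dispose = NULL, **pp, *n;

            pthread_mutex_lock(&lock);
            for (pp = &cache; (n = *pp) != NULL; ) {
                    if (n->busy) {
                            pp = &n->next;
                            continue;
                    }
                    *pp = n->next;          /* unlink under the lock */
                    n->next = dispose;      /* park on the private list */
                    dispose = n;
            }
            pthread_mutex_unlock(&lock);

            while (dispose) {               /* teardown with the lock dropped */
                    n = dispose;
                    dispose = n->next;
                    free(n);
            }
    }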
@@ -433,22 +578,24 @@ static int can_unuse(struct inode *inode)
 }
 
 /*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to
- * a temporary list and then are freed outside inode_lock by dispose_list().
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
+ * temporary list and then are freed outside inode_lock by dispose_list().
  *
  * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed. We expect the final iput() on that inode to add it to
- * the front of the inode_unused list. So look for it there and if the
- * inode is still freeable, proceed. The right inode is found 99.9% of the
- * time in testing on a 4-way.
+ * pagecache removed. If the inode has metadata buffers attached to
+ * mapping->private_list then try to remove them.
  *
- * If the inode has metadata buffers attached to mapping->private_list then
- * try to remove them.
+ * If the inode has the I_REFERENCED flag set, then it means that it has been
+ * used recently - the flag is set in iput_final(). When we encounter such an
+ * inode, clear the flag and move it to the back of the LRU so it gets another
+ * pass through the LRU before it gets reclaimed. This is necessary because of
+ * the fact we are doing lazy LRU updates to minimise lock contention so the
+ * LRU does not have strict ordering. Hence we don't want to reclaim inodes
+ * with this flag set because they are the inodes that are out of order.
  */
 static void prune_icache(int nr_to_scan)
 {
         LIST_HEAD(freeable);
-        int nr_pruned = 0;
         int nr_scanned;
         unsigned long reap = 0;
 
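The policy the comment above describes is a second-chance (clock-style) scan: a referenced entry loses its flag and earns one more trip through the list before it can be reclaimed. A compact userspace model, illustrative only (the kernel variant walks the LRU under inode_lock with the extra dirty/pinned checks seen below):

    #include <stdbool.h>
    #include <stddef.h>

    struct entry { struct entry *next; bool referenced; };

    /* pop entries until an unreferenced victim appears; referenced
     * entries are cleared and requeued at the tail */
    static struct entry *reclaim_one(struct entry **head, struct entry **tail)
    {
            struct entry *e;

            while ((e = *head) != NULL) {
                    *head = e->next;
                    if (!e->referenced)
                            return e;               /* victim */
                    e->referenced = false;          /* second chance */
                    e->next = NULL;
                    if (*head)
                            (*tail)->next = e;
                    else
                            *head = e;
                    *tail = e;
            }
            return NULL;
    }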
@@ -457,13 +604,26 @@ static void prune_icache(int nr_to_scan)
         for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
                 struct inode *inode;
 
-                if (list_empty(&inode_unused))
+                if (list_empty(&inode_lru))
                         break;
 
-                inode = list_entry(inode_unused.prev, struct inode, i_list);
+                inode = list_entry(inode_lru.prev, struct inode, i_lru);
 
-                if (inode->i_state || atomic_read(&inode->i_count)) {
-                        list_move(&inode->i_list, &inode_unused);
+                /*
+                 * Referenced or dirty inodes are still in use. Give them
+                 * another pass through the LRU as we cannot reclaim them now.
+                 */
+                if (atomic_read(&inode->i_count) ||
+                    (inode->i_state & ~I_REFERENCED)) {
+                        list_del_init(&inode->i_lru);
+                        percpu_counter_dec(&nr_inodes_unused);
+                        continue;
+                }
+
+                /* recently referenced inodes get one more pass */
+                if (inode->i_state & I_REFERENCED) {
+                        list_move(&inode->i_lru, &inode_lru);
+                        inode->i_state &= ~I_REFERENCED;
                         continue;
                 }
                 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -475,18 +635,23 @@ static void prune_icache(int nr_to_scan)
                         iput(inode);
                         spin_lock(&inode_lock);
 
-                        if (inode != list_entry(inode_unused.next,
-                                        struct inode, i_list))
+                        if (inode != list_entry(inode_lru.next,
+                                        struct inode, i_lru))
                                 continue;       /* wrong inode or list_empty */
                         if (!can_unuse(inode))
                                 continue;
                 }
-                list_move(&inode->i_list, &freeable);
                 WARN_ON(inode->i_state & I_NEW);
                 inode->i_state |= I_FREEING;
-                nr_pruned++;
+
+                /*
+                 * Move the inode off the IO lists and LRU once I_FREEING is
+                 * set so that it won't get moved back on there if it is dirty.
+                 */
+                list_move(&inode->i_lru, &freeable);
+                list_del_init(&inode->i_wb_list);
+                percpu_counter_dec(&nr_inodes_unused);
         }
-        inodes_stat.nr_unused -= nr_pruned;
         if (current_is_kswapd())
                 __count_vm_events(KSWAPD_INODESTEAL, reap);
         else
@@ -518,7 +683,7 @@ static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
                         return -1;
                 prune_icache(nr);
         }
-        return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+        return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker icache_shrinker = {
@@ -529,9 +694,6 @@ static struct shrinker icache_shrinker = {
 static void __wait_on_freeing_inode(struct inode *inode);
 /*
  * Called with the inode lock held.
- * NOTE: we are not increasing the inode-refcount, you must call __iget()
- * by hand after calling find_inode now! This simplifies iunique and won't
- * add any additional branch in the common code.
  */
 static struct inode *find_inode(struct super_block *sb,
                                 struct hlist_head *head,
@@ -551,9 +713,10 @@ repeat:
                         __wait_on_freeing_inode(inode);
                         goto repeat;
                 }
-                break;
+                __iget(inode);
+                return inode;
         }
-        return node ? inode : NULL;
+        return NULL;
 }
 
559/* 722/*
@@ -576,53 +739,49 @@ repeat:
576 __wait_on_freeing_inode(inode); 739 __wait_on_freeing_inode(inode);
577 goto repeat; 740 goto repeat;
578 } 741 }
579 break; 742 __iget(inode);
743 return inode;
580 } 744 }
581 return node ? inode : NULL; 745 return NULL;
582}
583
584static unsigned long hash(struct super_block *sb, unsigned long hashval)
585{
586 unsigned long tmp;
587
588 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
589 L1_CACHE_BYTES;
590 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
591 return tmp & I_HASHMASK;
592}
593
594static inline void
595__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
596 struct inode *inode)
597{
598 inodes_stat.nr_inodes++;
599 list_add(&inode->i_list, &inode_in_use);
600 list_add(&inode->i_sb_list, &sb->s_inodes);
601 if (head)
602 hlist_add_head(&inode->i_hash, head);
603} 746}
604 747
605/** 748/*
606 * inode_add_to_lists - add a new inode to relevant lists 749 * Each cpu owns a range of LAST_INO_BATCH numbers.
607 * @sb: superblock inode belongs to 750 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
608 * @inode: inode to mark in use 751 * to renew the exhausted range.
609 * 752 *
610 * When an inode is allocated it needs to be accounted for, added to the in use 753 * This does not significantly increase overflow rate because every CPU can
611 * list, the owning superblock and the inode hash. This needs to be done under 754 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
612 * the inode_lock, so export a function to do this rather than the inode lock 755 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
613 * itself. We calculate the hash list to add to here so it is all internal 756 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
614 * which requires the caller to have already set up the inode number in the 757 * overflow rate by 2x, which does not seem too significant.
615 * inode to add. 758 *
759 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
760 * error if st_ino won't fit in target struct field. Use 32bit counter
761 * here to attempt to avoid that.
616 */ 762 */
617void inode_add_to_lists(struct super_block *sb, struct inode *inode) 763#define LAST_INO_BATCH 1024
764static DEFINE_PER_CPU(unsigned int, last_ino);
765
766unsigned int get_next_ino(void)
618{ 767{
619 struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino); 768 unsigned int *p = &get_cpu_var(last_ino);
769 unsigned int res = *p;
620 770
621 spin_lock(&inode_lock); 771#ifdef CONFIG_SMP
622 __inode_add_to_lists(sb, head, inode); 772 if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
623 spin_unlock(&inode_lock); 773 static atomic_t shared_last_ino;
774 int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
775
776 res = next - LAST_INO_BATCH;
777 }
778#endif
779
780 *p = ++res;
781 put_cpu_var(last_ino);
782 return res;
624} 783}
625EXPORT_SYMBOL_GPL(inode_add_to_lists); 784EXPORT_SYMBOL(get_next_ino);
626 785
627/** 786/**
628 * new_inode - obtain an inode 787 * new_inode - obtain an inode
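A userspace model of the batching scheme the comment describes, assuming C11 atomics and thread-local storage in place of the per-cpu variable (illustrative only):

    #include <stdatomic.h>
    #include <stdio.h>

    #define LAST_INO_BATCH 1024

    static atomic_uint shared_last_ino;
    static _Thread_local unsigned int last_ino;

    static unsigned int get_next_ino_model(void)
    {
            unsigned int res = last_ino;

            if ((res & (LAST_INO_BATCH - 1)) == 0) {
                    /* local range exhausted: reserve the next batch */
                    unsigned int next = atomic_fetch_add(&shared_last_ino,
                                            LAST_INO_BATCH) + LAST_INO_BATCH;
                    res = next - LAST_INO_BATCH;
            }
            last_ino = ++res;
            return res;
    }

    int main(void)
    {
            for (int i = 0; i < 4; i++)
                    printf("%u\n", get_next_ino_model());
            return 0;
    }

In-kernel callers simply assign the result, as the hugetlbfs hunk earlier in this diff does with inode->i_ino = get_next_ino().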
@@ -638,12 +797,6 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
  */
 struct inode *new_inode(struct super_block *sb)
 {
-        /*
-         * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
-         * error if st_ino won't fit in target struct field. Use 32bit counter
-         * here to attempt to avoid that.
-         */
-        static unsigned int last_ino;
         struct inode *inode;
 
         spin_lock_prefetch(&inode_lock);
@@ -651,8 +804,7 @@ struct inode *new_inode(struct super_block *sb)
         inode = alloc_inode(sb);
         if (inode) {
                 spin_lock(&inode_lock);
-                __inode_add_to_lists(sb, NULL, inode);
-                inode->i_ino = ++last_ino;
+                __inode_sb_list_add(inode);
                 inode->i_state = 0;
                 spin_unlock(&inode_lock);
         }
@@ -663,7 +815,7 @@ EXPORT_SYMBOL(new_inode);
 void unlock_new_inode(struct inode *inode)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-        if (inode->i_mode & S_IFDIR) {
+        if (S_ISDIR(inode->i_mode)) {
                 struct file_system_type *type = inode->i_sb->s_type;
 
                 /* Set new key only if filesystem hasn't already changed it */
@@ -720,7 +872,8 @@ static struct inode *get_new_inode(struct super_block *sb,
                 if (set(inode, data))
                         goto set_failed;
 
-                __inode_add_to_lists(sb, head, inode);
+                hlist_add_head(&inode->i_hash, head);
+                __inode_sb_list_add(inode);
                 inode->i_state = I_NEW;
                 spin_unlock(&inode_lock);
 
@@ -735,7 +888,6 @@ static struct inode *get_new_inode(struct super_block *sb,
                  * us. Use the old inode instead of the one we just
                  * allocated.
                  */
-                __iget(old);
                 spin_unlock(&inode_lock);
                 destroy_inode(inode);
                 inode = old;
@@ -767,7 +919,8 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
767 old = find_inode_fast(sb, head, ino); 919 old = find_inode_fast(sb, head, ino);
768 if (!old) { 920 if (!old) {
769 inode->i_ino = ino; 921 inode->i_ino = ino;
770 __inode_add_to_lists(sb, head, inode); 922 hlist_add_head(&inode->i_hash, head);
923 __inode_sb_list_add(inode);
771 inode->i_state = I_NEW; 924 inode->i_state = I_NEW;
772 spin_unlock(&inode_lock); 925 spin_unlock(&inode_lock);
773 926
@@ -782,7 +935,6 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
782 * us. Use the old inode instead of the one we just 935 * us. Use the old inode instead of the one we just
783 * allocated. 936 * allocated.
784 */ 937 */
785 __iget(old);
786 spin_unlock(&inode_lock); 938 spin_unlock(&inode_lock);
787 destroy_inode(inode); 939 destroy_inode(inode);
788 inode = old; 940 inode = old;
@@ -791,6 +943,27 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
791 return inode; 943 return inode;
792} 944}
793 945
946/*
947 * search the inode cache for a matching inode number.
948 * If we find one, then the inode number we are trying to
949 * allocate is not unique and so we should not use it.
950 *
951 * Returns 1 if the inode number is unique, 0 if it is not.
952 */
953static int test_inode_iunique(struct super_block *sb, unsigned long ino)
954{
955 struct hlist_head *b = inode_hashtable + hash(sb, ino);
956 struct hlist_node *node;
957 struct inode *inode;
958
959 hlist_for_each_entry(inode, node, b, i_hash) {
960 if (inode->i_ino == ino && inode->i_sb == sb)
961 return 0;
962 }
963
964 return 1;
965}
966
794/** 967/**
795 * iunique - get a unique inode number 968 * iunique - get a unique inode number
796 * @sb: superblock 969 * @sb: superblock
@@ -812,19 +985,18 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
812 * error if st_ino won't fit in target struct field. Use 32bit counter 985 * error if st_ino won't fit in target struct field. Use 32bit counter
813 * here to attempt to avoid that. 986 * here to attempt to avoid that.
814 */ 987 */
988 static DEFINE_SPINLOCK(iunique_lock);
815 static unsigned int counter; 989 static unsigned int counter;
816 struct inode *inode;
817 struct hlist_head *head;
818 ino_t res; 990 ino_t res;
819 991
820 spin_lock(&inode_lock); 992 spin_lock(&inode_lock);
993 spin_lock(&iunique_lock);
821 do { 994 do {
822 if (counter <= max_reserved) 995 if (counter <= max_reserved)
823 counter = max_reserved + 1; 996 counter = max_reserved + 1;
824 res = counter++; 997 res = counter++;
825 head = inode_hashtable + hash(sb, res); 998 } while (!test_inode_iunique(sb, res));
826 inode = find_inode_fast(sb, head, res); 999 spin_unlock(&iunique_lock);
827 } while (inode != NULL);
828 spin_unlock(&inode_lock); 1000 spin_unlock(&inode_lock);
829 1001
830 return res; 1002 return res;
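iunique() keeps its contract of returning an inode number above @max_reserved that no cached inode is using; only the hash walk moved into test_inode_iunique() under the new iunique_lock. A hedged usage sketch (hypothetical caller, not from this patch):

    static void examplefs_assign_ino(struct super_block *sb, struct inode *inode)
    {
            /* numbers 1..255 reserved for fixed internal objects */
            inode->i_ino = iunique(sb, 255);
    }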
@@ -876,7 +1048,6 @@ static struct inode *ifind(struct super_block *sb,
876 spin_lock(&inode_lock); 1048 spin_lock(&inode_lock);
877 inode = find_inode(sb, head, test, data); 1049 inode = find_inode(sb, head, test, data);
878 if (inode) { 1050 if (inode) {
879 __iget(inode);
880 spin_unlock(&inode_lock); 1051 spin_unlock(&inode_lock);
881 if (likely(wait)) 1052 if (likely(wait))
882 wait_on_inode(inode); 1053 wait_on_inode(inode);
@@ -909,7 +1080,6 @@ static struct inode *ifind_fast(struct super_block *sb,
909 spin_lock(&inode_lock); 1080 spin_lock(&inode_lock);
910 inode = find_inode_fast(sb, head, ino); 1081 inode = find_inode_fast(sb, head, ino);
911 if (inode) { 1082 if (inode) {
912 __iget(inode);
913 spin_unlock(&inode_lock); 1083 spin_unlock(&inode_lock);
914 wait_on_inode(inode); 1084 wait_on_inode(inode);
915 return inode; 1085 return inode;
@@ -1095,7 +1265,7 @@ int insert_inode_locked(struct inode *inode)
1095 __iget(old); 1265 __iget(old);
1096 spin_unlock(&inode_lock); 1266 spin_unlock(&inode_lock);
1097 wait_on_inode(old); 1267 wait_on_inode(old);
1098 if (unlikely(!hlist_unhashed(&old->i_hash))) { 1268 if (unlikely(!inode_unhashed(old))) {
1099 iput(old); 1269 iput(old);
1100 return -EBUSY; 1270 return -EBUSY;
1101 } 1271 }
@@ -1134,7 +1304,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1134 __iget(old); 1304 __iget(old);
1135 spin_unlock(&inode_lock); 1305 spin_unlock(&inode_lock);
1136 wait_on_inode(old); 1306 wait_on_inode(old);
1137 if (unlikely(!hlist_unhashed(&old->i_hash))) { 1307 if (unlikely(!inode_unhashed(old))) {
1138 iput(old); 1308 iput(old);
1139 return -EBUSY; 1309 return -EBUSY;
1140 } 1310 }
@@ -1143,36 +1313,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1143} 1313}
1144EXPORT_SYMBOL(insert_inode_locked4); 1314EXPORT_SYMBOL(insert_inode_locked4);
1145 1315
1146/**
1147 * __insert_inode_hash - hash an inode
1148 * @inode: unhashed inode
1149 * @hashval: unsigned long value used to locate this object in the
1150 * inode_hashtable.
1151 *
1152 * Add an inode to the inode hash for this superblock.
1153 */
1154void __insert_inode_hash(struct inode *inode, unsigned long hashval)
1155{
1156 struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
1157 spin_lock(&inode_lock);
1158 hlist_add_head(&inode->i_hash, head);
1159 spin_unlock(&inode_lock);
1160}
1161EXPORT_SYMBOL(__insert_inode_hash);
1162
1163/**
1164 * remove_inode_hash - remove an inode from the hash
1165 * @inode: inode to unhash
1166 *
1167 * Remove an inode from the superblock.
1168 */
1169void remove_inode_hash(struct inode *inode)
1170{
1171 spin_lock(&inode_lock);
1172 hlist_del_init(&inode->i_hash);
1173 spin_unlock(&inode_lock);
1174}
1175EXPORT_SYMBOL(remove_inode_hash);
1176 1316
1177int generic_delete_inode(struct inode *inode) 1317int generic_delete_inode(struct inode *inode)
1178{ 1318{
@@ -1187,7 +1327,7 @@ EXPORT_SYMBOL(generic_delete_inode);
1187 */ 1327 */
1188int generic_drop_inode(struct inode *inode) 1328int generic_drop_inode(struct inode *inode)
1189{ 1329{
1190 return !inode->i_nlink || hlist_unhashed(&inode->i_hash); 1330 return !inode->i_nlink || inode_unhashed(inode);
1191} 1331}
1192EXPORT_SYMBOL_GPL(generic_drop_inode); 1332EXPORT_SYMBOL_GPL(generic_drop_inode);
1193 1333
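inode_unhashed() replaces the open-coded hlist_unhashed(&inode->i_hash) tests here and in insert_inode_locked*() above; presumably just a wrapper (sketch, defined in the VFS headers rather than this file):

    static inline int inode_unhashed(struct inode *inode)
    {
            return hlist_unhashed(&inode->i_hash);
    }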
@@ -1213,10 +1353,11 @@ static void iput_final(struct inode *inode)
1213 drop = generic_drop_inode(inode); 1353 drop = generic_drop_inode(inode);
1214 1354
1215 if (!drop) { 1355 if (!drop) {
1216 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1217 list_move(&inode->i_list, &inode_unused);
1218 inodes_stat.nr_unused++;
1219 if (sb->s_flags & MS_ACTIVE) { 1356 if (sb->s_flags & MS_ACTIVE) {
1357 inode->i_state |= I_REFERENCED;
1358 if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
1359 inode_lru_list_add(inode);
1360 }
1220 spin_unlock(&inode_lock); 1361 spin_unlock(&inode_lock);
1221 return; 1362 return;
1222 } 1363 }
@@ -1227,19 +1368,23 @@ static void iput_final(struct inode *inode)
1227 spin_lock(&inode_lock); 1368 spin_lock(&inode_lock);
1228 WARN_ON(inode->i_state & I_NEW); 1369 WARN_ON(inode->i_state & I_NEW);
1229 inode->i_state &= ~I_WILL_FREE; 1370 inode->i_state &= ~I_WILL_FREE;
1230 inodes_stat.nr_unused--; 1371 __remove_inode_hash(inode);
1231 hlist_del_init(&inode->i_hash);
1232 } 1372 }
1233 list_del_init(&inode->i_list); 1373
1234 list_del_init(&inode->i_sb_list);
1235 WARN_ON(inode->i_state & I_NEW); 1374 WARN_ON(inode->i_state & I_NEW);
1236 inode->i_state |= I_FREEING; 1375 inode->i_state |= I_FREEING;
1237 inodes_stat.nr_inodes--; 1376
1377 /*
1378 * Move the inode off the IO lists and LRU once I_FREEING is
1379 * set so that it won't get moved back on there if it is dirty.
1380 */
1381 inode_lru_list_del(inode);
1382 list_del_init(&inode->i_wb_list);
1383
1384 __inode_sb_list_del(inode);
1238 spin_unlock(&inode_lock); 1385 spin_unlock(&inode_lock);
1239 evict(inode); 1386 evict(inode);
1240 spin_lock(&inode_lock); 1387 remove_inode_hash(inode);
1241 hlist_del_init(&inode->i_hash);
1242 spin_unlock(&inode_lock);
1243 wake_up_inode(inode); 1388 wake_up_inode(inode);
1244 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); 1389 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1245 destroy_inode(inode); 1390 destroy_inode(inode);
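inode_lru_list_add()/inode_lru_list_del() absorb the inodes_stat.nr_unused bookkeeping that iput_final() used to do by hand, feeding the per-cpu counters initialized at the end of this file. Roughly (sketch; list and counter names assumed from the removed lines):

    static void inode_lru_list_add(struct inode *inode)
    {
            if (list_empty(&inode->i_list)) {
                    list_add(&inode->i_list, &inode_unused);
                    percpu_counter_inc(&nr_inodes_unused);
            }
    }

    static void inode_lru_list_del(struct inode *inode)
    {
            if (!list_empty(&inode->i_list)) {
                    list_del_init(&inode->i_list);
                    percpu_counter_dec(&nr_inodes_unused);
            }
    }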
@@ -1503,6 +1648,8 @@ void __init inode_init(void)
1503 SLAB_MEM_SPREAD), 1648 SLAB_MEM_SPREAD),
1504 init_once); 1649 init_once);
1505 register_shrinker(&icache_shrinker); 1650 register_shrinker(&icache_shrinker);
1651 percpu_counter_init(&nr_inodes, 0);
1652 percpu_counter_init(&nr_inodes_unused, 0);
1506 1653
1507 /* Hash may have been set up in inode_init_early */ 1654 /* Hash may have been set up in inode_init_early */
1508 if (!hashdist) 1655 if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h
index a6910e91cee8..ebad3b90752d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -101,3 +101,10 @@ extern void put_super(struct super_block *sb);
101struct nameidata; 101struct nameidata;
102extern struct file *nameidata_to_filp(struct nameidata *); 102extern struct file *nameidata_to_filp(struct nameidata *);
103extern void release_open_intent(struct nameidata *); 103extern void release_open_intent(struct nameidata *);
104
105/*
106 * inode.c
107 */
108extern int get_nr_dirty_inodes(void);
109extern int evict_inodes(struct super_block *);
110extern int invalidate_inodes(struct super_block *);
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index e0aca9a0ac68..0542b6eedf80 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -10,7 +10,6 @@
10 * 10 *
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h>
14#include <linux/gfp.h> 13#include <linux/gfp.h>
15#include "isofs.h" 14#include "isofs.h"
16 15
@@ -255,18 +254,19 @@ static int isofs_readdir(struct file *filp,
255 char *tmpname; 254 char *tmpname;
256 struct iso_directory_record *tmpde; 255 struct iso_directory_record *tmpde;
257 struct inode *inode = filp->f_path.dentry->d_inode; 256 struct inode *inode = filp->f_path.dentry->d_inode;
257 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
258 258
259 tmpname = (char *)__get_free_page(GFP_KERNEL); 259 tmpname = (char *)__get_free_page(GFP_KERNEL);
260 if (tmpname == NULL) 260 if (tmpname == NULL)
261 return -ENOMEM; 261 return -ENOMEM;
262 262
263 lock_kernel(); 263 mutex_lock(&sbi->s_mutex);
264 tmpde = (struct iso_directory_record *) (tmpname+1024); 264 tmpde = (struct iso_directory_record *) (tmpname+1024);
265 265
266 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); 266 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
267 267
268 free_page((unsigned long) tmpname); 268 free_page((unsigned long) tmpname);
269 unlock_kernel(); 269 mutex_unlock(&sbi->s_mutex);
270 return result; 270 return result;
271} 271}
272 272
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 5a44811b5027..79cf7f616bbe 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -17,7 +17,6 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/nls.h> 18#include <linux/nls.h>
19#include <linux/ctype.h> 19#include <linux/ctype.h>
20#include <linux/smp_lock.h>
21#include <linux/statfs.h> 20#include <linux/statfs.h>
22#include <linux/cdrom.h> 21#include <linux/cdrom.h>
23#include <linux/parser.h> 22#include <linux/parser.h>
@@ -44,11 +43,7 @@ static void isofs_put_super(struct super_block *sb)
44 struct isofs_sb_info *sbi = ISOFS_SB(sb); 43 struct isofs_sb_info *sbi = ISOFS_SB(sb);
45 44
46#ifdef CONFIG_JOLIET 45#ifdef CONFIG_JOLIET
47 lock_kernel();
48
49 unload_nls(sbi->s_nls_iocharset); 46 unload_nls(sbi->s_nls_iocharset);
50
51 unlock_kernel();
52#endif 47#endif
53 48
54 kfree(sbi); 49 kfree(sbi);
@@ -549,6 +544,34 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
549} 544}
550 545
551/* 546/*
547 * Check if root directory is empty (has less than 3 files).
548 *
549 * Used to detect broken CDs where ISO root directory is empty but Joliet root
550 * directory is OK. If such a CD has Rock Ridge extensions, they will be disabled

551 * (and Joliet used instead) or else no files would be visible.
552 */
553static bool rootdir_empty(struct super_block *sb, unsigned long block)
554{
555 int offset = 0, files = 0, de_len;
556 struct iso_directory_record *de;
557 struct buffer_head *bh;
558
559 bh = sb_bread(sb, block);
560 if (!bh)
561 return true;
562 while (files < 3) {
563 de = (struct iso_directory_record *) (bh->b_data + offset);
564 de_len = *(unsigned char *) de;
565 if (de_len == 0)
566 break;
567 files++;
568 offset += de_len;
569 }
570 brelse(bh);
571 return files < 3;
572}
573
574/*
552 * Initialize the superblock and read the root inode. 575 * Initialize the superblock and read the root inode.
553 * 576 *
554 * Note: a check_disk_change() has been done immediately prior 577 * Note: a check_disk_change() has been done immediately prior
@@ -823,6 +846,7 @@ root_found:
823 sbi->s_utf8 = opt.utf8; 846 sbi->s_utf8 = opt.utf8;
824 sbi->s_nocompress = opt.nocompress; 847 sbi->s_nocompress = opt.nocompress;
825 sbi->s_overriderockperm = opt.overriderockperm; 848 sbi->s_overriderockperm = opt.overriderockperm;
849 mutex_init(&sbi->s_mutex);
826 /* 850 /*
827 * It would be incredibly stupid to allow people to mark every file 851 * It would be incredibly stupid to allow people to mark every file
828 * on the disk as suid, so we merely allow them to set the default 852 * on the disk as suid, so we merely allow them to set the default
@@ -847,6 +871,18 @@ root_found:
847 goto out_no_root; 871 goto out_no_root;
848 872
849 /* 873 /*
874 * Fix for broken CDs with Rock Ridge and empty ISO root directory but
875 * correct Joliet root directory.
876 */
877 if (sbi->s_rock == 1 && joliet_level &&
878 rootdir_empty(s, sbi->s_firstdatazone)) {
879 printk(KERN_NOTICE
880 "ISOFS: primary root directory is empty. "
881 "Disabling Rock Ridge and switching to Joliet.");
882 sbi->s_rock = 0;
883 }
884
885 /*
850 * If this disk has both Rock Ridge and Joliet on it, then we 886 * If this disk has both Rock Ridge and Joliet on it, then we
851 * want to use Rock Ridge by default. This can be overridden 887 * want to use Rock Ridge by default. This can be overridden
852 * by using the norock mount option. There is still one other 888 * by using the norock mount option. There is still one other
@@ -966,27 +1002,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
966 * or getblk() if they are not. Returns the number of blocks inserted 1002 * or getblk() if they are not. Returns the number of blocks inserted
967 * (-ve == error.) 1003 * (-ve == error.)
968 */ 1004 */
969int isofs_get_blocks(struct inode *inode, sector_t iblock_s, 1005int isofs_get_blocks(struct inode *inode, sector_t iblock,
970 struct buffer_head **bh, unsigned long nblocks) 1006 struct buffer_head **bh, unsigned long nblocks)
971{ 1007{
972 unsigned long b_off; 1008 unsigned long b_off = iblock;
973 unsigned offset, sect_size; 1009 unsigned offset, sect_size;
974 unsigned int firstext; 1010 unsigned int firstext;
975 unsigned long nextblk, nextoff; 1011 unsigned long nextblk, nextoff;
976 long iblock = (long)iblock_s;
977 int section, rv, error; 1012 int section, rv, error;
978 struct iso_inode_info *ei = ISOFS_I(inode); 1013 struct iso_inode_info *ei = ISOFS_I(inode);
979 1014
980 lock_kernel();
981
982 error = -EIO; 1015 error = -EIO;
983 rv = 0; 1016 rv = 0;
984 if (iblock < 0 || iblock != iblock_s) { 1017 if (iblock != b_off) {
985 printk(KERN_DEBUG "%s: block number too large\n", __func__); 1018 printk(KERN_DEBUG "%s: block number too large\n", __func__);
986 goto abort; 1019 goto abort;
987 } 1020 }
988 1021
989 b_off = iblock;
990 1022
991 offset = 0; 1023 offset = 0;
992 firstext = ei->i_first_extent; 1024 firstext = ei->i_first_extent;
@@ -1004,8 +1036,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1004 * I/O errors. 1036 * I/O errors.
1005 */ 1037 */
1006 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { 1038 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
1007 printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n", 1039 printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
1008 __func__, iblock, (unsigned long) inode->i_size); 1040 __func__, b_off,
1041 (unsigned long long)inode->i_size);
1009 goto abort; 1042 goto abort;
1010 } 1043 }
1011 1044
@@ -1031,9 +1064,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1031 if (++section > 100) { 1064 if (++section > 100) {
1032 printk(KERN_DEBUG "%s: More than 100 file sections ?!?" 1065 printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
1033 " aborting...\n", __func__); 1066 " aborting...\n", __func__);
1034 printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u " 1067 printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
1035 "nextblk=%lu nextoff=%lu\n", __func__, 1068 "nextblk=%lu nextoff=%lu\n", __func__,
1036 iblock, firstext, (unsigned) sect_size, 1069 b_off, firstext, (unsigned) sect_size,
1037 nextblk, nextoff); 1070 nextblk, nextoff);
1038 goto abort; 1071 goto abort;
1039 } 1072 }
@@ -1054,7 +1087,6 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
1054 1087
1055 error = 0; 1088 error = 0;
1056abort: 1089abort:
1057 unlock_kernel();
1058 return rv != 0 ? rv : error; 1090 return rv != 0 ? rv : error;
1059} 1091}
1060 1092
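The rewritten bounds check relies on assignment truncation: b_off is an unsigned long copy of the sector_t block number, so on a 32-bit kernel any value that does not fit compares unequal and the old signed cast is unnecessary. The idiom in isolation (illustrative sketch only):

    static int block_fits_in_ulong(sector_t iblock)
    {
            unsigned long b_off = iblock;   /* may truncate on 32-bit */

            return (sector_t)b_off == iblock;
    }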
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 7d33de84f52a..2882dc089f87 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -55,6 +55,7 @@ struct isofs_sb_info {
55 gid_t s_gid; 55 gid_t s_gid;
56 uid_t s_uid; 56 uid_t s_uid;
57 struct nls_table *s_nls_iocharset; /* Native language support table */ 57 struct nls_table *s_nls_iocharset; /* Native language support table */
58 struct mutex s_mutex; /* replaces BKL, please remove if possible */
58}; 59};
59 60
60#define ISOFS_INVALID_MODE ((mode_t) -1) 61#define ISOFS_INVALID_MODE ((mode_t) -1)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index ab438beb867c..0d23abfd4280 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -6,7 +6,6 @@
6 * (C) 1991 Linus Torvalds - minix filesystem 6 * (C) 1991 Linus Torvalds - minix filesystem
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/gfp.h> 9#include <linux/gfp.h>
11#include "isofs.h" 10#include "isofs.h"
12 11
@@ -168,6 +167,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
168 int found; 167 int found;
169 unsigned long uninitialized_var(block); 168 unsigned long uninitialized_var(block);
170 unsigned long uninitialized_var(offset); 169 unsigned long uninitialized_var(offset);
170 struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
171 struct inode *inode; 171 struct inode *inode;
172 struct page *page; 172 struct page *page;
173 173
@@ -177,7 +177,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
177 if (!page) 177 if (!page)
178 return ERR_PTR(-ENOMEM); 178 return ERR_PTR(-ENOMEM);
179 179
180 lock_kernel(); 180 mutex_lock(&sbi->s_mutex);
181 found = isofs_find_entry(dir, dentry, 181 found = isofs_find_entry(dir, dentry,
182 &block, &offset, 182 &block, &offset,
183 page_address(page), 183 page_address(page),
@@ -188,10 +188,10 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
188 if (found) { 188 if (found) {
189 inode = isofs_iget(dir->i_sb, block, offset); 189 inode = isofs_iget(dir->i_sb, block, offset);
190 if (IS_ERR(inode)) { 190 if (IS_ERR(inode)) {
191 unlock_kernel(); 191 mutex_unlock(&sbi->s_mutex);
192 return ERR_CAST(inode); 192 return ERR_CAST(inode);
193 } 193 }
194 } 194 }
195 unlock_kernel(); 195 mutex_unlock(&sbi->s_mutex);
196 return d_splice_alias(inode, dentry); 196 return d_splice_alias(inode, dentry);
197} 197}
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 96a685c550fd..f9cd04db6eab 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -8,7 +8,6 @@
8 8
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/pagemap.h> 10#include <linux/pagemap.h>
11#include <linux/smp_lock.h>
12 11
13#include "isofs.h" 12#include "isofs.h"
14#include "rock.h" 13#include "rock.h"
@@ -661,6 +660,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
661{ 660{
662 struct inode *inode = page->mapping->host; 661 struct inode *inode = page->mapping->host;
663 struct iso_inode_info *ei = ISOFS_I(inode); 662 struct iso_inode_info *ei = ISOFS_I(inode);
663 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
664 char *link = kmap(page); 664 char *link = kmap(page);
665 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 665 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
666 struct buffer_head *bh; 666 struct buffer_head *bh;
@@ -673,12 +673,12 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
673 struct rock_state rs; 673 struct rock_state rs;
674 int ret; 674 int ret;
675 675
676 if (!ISOFS_SB(inode->i_sb)->s_rock) 676 if (!sbi->s_rock)
677 goto error; 677 goto error;
678 678
679 init_rock_state(&rs, inode); 679 init_rock_state(&rs, inode);
680 block = ei->i_iget5_block; 680 block = ei->i_iget5_block;
681 lock_kernel(); 681 mutex_lock(&sbi->s_mutex);
682 bh = sb_bread(inode->i_sb, block); 682 bh = sb_bread(inode->i_sb, block);
683 if (!bh) 683 if (!bh)
684 goto out_noread; 684 goto out_noread;
@@ -748,7 +748,7 @@ repeat:
748 goto fail; 748 goto fail;
749 brelse(bh); 749 brelse(bh);
750 *rpnt = '\0'; 750 *rpnt = '\0';
751 unlock_kernel(); 751 mutex_unlock(&sbi->s_mutex);
752 SetPageUptodate(page); 752 SetPageUptodate(page);
753 kunmap(page); 753 kunmap(page);
754 unlock_page(page); 754 unlock_page(page);
@@ -765,7 +765,7 @@ out_bad_span:
765 printk("symlink spans iso9660 blocks\n"); 765 printk("symlink spans iso9660 blocks\n");
766fail: 766fail:
767 brelse(bh); 767 brelse(bh);
768 unlock_kernel(); 768 mutex_unlock(&sbi->s_mutex);
769error: 769error:
770 SetPageError(page); 770 SetPageError(page);
771 kunmap(page); 771 kunmap(page);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 95d8c11c929e..85a6883c0aca 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -137,34 +137,10 @@ static int journal_write_commit_record(journal_t *journal,
137 JBUFFER_TRACE(descriptor, "write commit block"); 137 JBUFFER_TRACE(descriptor, "write commit block");
138 set_buffer_dirty(bh); 138 set_buffer_dirty(bh);
139 139
140 if (journal->j_flags & JFS_BARRIER) { 140 if (journal->j_flags & JFS_BARRIER)
141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER); 141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
142 142 else
143 /*
144 * Is it possible for another commit to fail at roughly
145 * the same time as this one? If so, we don't want to
146 * trust the barrier flag in the super, but instead want
147 * to remember if we sent a barrier request
148 */
149 if (ret == -EOPNOTSUPP) {
150 char b[BDEVNAME_SIZE];
151
152 printk(KERN_WARNING
153 "JBD: barrier-based sync failed on %s - "
154 "disabling barriers\n",
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JFS_BARRIER;
158 spin_unlock(&journal->j_state_lock);
159
160 /* And try again, without the barrier */
161 set_buffer_uptodate(bh);
162 set_buffer_dirty(bh);
163 ret = sync_dirty_buffer(bh);
164 }
165 } else {
166 ret = sync_dirty_buffer(bh); 143 ret = sync_dirty_buffer(bh);
167 }
168 144
169 put_bh(bh); /* One for getblk() */ 145 put_bh(bh); /* One for getblk() */
170 journal_put_journal_head(descriptor); 146 journal_put_journal_head(descriptor);
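The -EOPNOTSUPP retry dance can go because flush/FUA requests, unlike the old barriers, never fail on devices without cache-flush support; the block layer silently degrades them. WRITE_FLUSH_FUA is assumed to expand along these lines (sketch of the contemporary definition):

    #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
                             REQ_FLUSH | REQ_FUA)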
@@ -318,7 +294,7 @@ void journal_commit_transaction(journal_t *journal)
318 int first_tag = 0; 294 int first_tag = 0;
319 int tag_flag; 295 int tag_flag;
320 int i; 296 int i;
321 int write_op = WRITE; 297 int write_op = WRITE_SYNC;
322 298
323 /* 299 /*
324 * First job: lock down the current transaction and wait for 300 * First job: lock down the current transaction and wait for
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5247e7ffdcb4..6571a056e55d 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -532,8 +532,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
532 */ 532 */
533 if ((journal->j_fs_dev != journal->j_dev) && 533 if ((journal->j_fs_dev != journal->j_dev) &&
534 (journal->j_flags & JBD2_BARRIER)) 534 (journal->j_flags & JBD2_BARRIER))
535 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, 535 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
536 BLKDEV_IFL_WAIT);
537 if (!(journal->j_flags & JBD2_ABORT)) 536 if (!(journal->j_flags & JBD2_ABORT))
538 jbd2_journal_update_superblock(journal, 1); 537 jbd2_journal_update_superblock(journal, 1);
539 return 0; 538 return 0;
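The dropped BLKDEV_IFL_WAIT argument reflects that blkdev_issue_flush() now always waits for completion. Assumed updated prototype (sketch):

    int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
                           sector_t *error_sector);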
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7c068c189d80..bc6be8bda1cc 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -134,25 +134,11 @@ static int journal_submit_commit_record(journal_t *journal,
134 134
135 if (journal->j_flags & JBD2_BARRIER && 135 if (journal->j_flags & JBD2_BARRIER &&
136 !JBD2_HAS_INCOMPAT_FEATURE(journal, 136 !JBD2_HAS_INCOMPAT_FEATURE(journal,
137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
138 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh); 138 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
139 if (ret == -EOPNOTSUPP) { 139 else
140 printk(KERN_WARNING
141 "JBD2: Disabling barriers on %s, "
142 "not supported by device\n", journal->j_devname);
143 write_lock(&journal->j_state_lock);
144 journal->j_flags &= ~JBD2_BARRIER;
145 write_unlock(&journal->j_state_lock);
146
147 /* And try again, without the barrier */
148 lock_buffer(bh);
149 set_buffer_uptodate(bh);
150 clear_buffer_dirty(bh);
151 ret = submit_bh(WRITE_SYNC_PLUG, bh);
152 }
153 } else {
154 ret = submit_bh(WRITE_SYNC_PLUG, bh); 140 ret = submit_bh(WRITE_SYNC_PLUG, bh);
155 } 141
156 *cbh = bh; 142 *cbh = bh;
157 return ret; 143 return ret;
158} 144}
@@ -166,29 +152,8 @@ static int journal_wait_on_commit_record(journal_t *journal,
166{ 152{
167 int ret = 0; 153 int ret = 0;
168 154
169retry:
170 clear_buffer_dirty(bh); 155 clear_buffer_dirty(bh);
171 wait_on_buffer(bh); 156 wait_on_buffer(bh);
172 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
173 printk(KERN_WARNING
174 "JBD2: %s: disabling barries on %s - not supported "
175 "by device\n", __func__, journal->j_devname);
176 write_lock(&journal->j_state_lock);
177 journal->j_flags &= ~JBD2_BARRIER;
178 write_unlock(&journal->j_state_lock);
179
180 lock_buffer(bh);
181 clear_buffer_dirty(bh);
182 set_buffer_uptodate(bh);
183 bh->b_end_io = journal_end_buffer_io_sync;
184
185 ret = submit_bh(WRITE_SYNC_PLUG, bh);
186 if (ret) {
187 unlock_buffer(bh);
188 return ret;
189 }
190 goto retry;
191 }
192 157
193 if (unlikely(!buffer_uptodate(bh))) 158 if (unlikely(!buffer_uptodate(bh)))
194 ret = -EIO; 159 ret = -EIO;
@@ -360,7 +325,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
360 int tag_bytes = journal_tag_bytes(journal); 325 int tag_bytes = journal_tag_bytes(journal);
361 struct buffer_head *cbh = NULL; /* For transactional checksums */ 326 struct buffer_head *cbh = NULL; /* For transactional checksums */
362 __u32 crc32_sum = ~0; 327 __u32 crc32_sum = ~0;
363 int write_op = WRITE; 328 int write_op = WRITE_SYNC;
364 329
365 /* 330 /*
366 * First job: lock down the current transaction and wait for 331 * First job: lock down the current transaction and wait for
@@ -701,6 +666,16 @@ start_journal_io:
701 } 666 }
702 } 667 }
703 668
669 err = journal_finish_inode_data_buffers(journal, commit_transaction);
670 if (err) {
671 printk(KERN_WARNING
672 "JBD2: Detected IO errors while flushing file data "
673 "on %s\n", journal->j_devname);
674 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
675 jbd2_journal_abort(journal, err);
676 err = 0;
677 }
678
704 /* 679 /*
705 * If the journal is not located on the file system device, 680 * If the journal is not located on the file system device,
706 * then we must flush the file system device before we issue 681 * then we must flush the file system device before we issue
@@ -709,8 +684,7 @@ start_journal_io:
709 if (commit_transaction->t_flushed_data_blocks && 684 if (commit_transaction->t_flushed_data_blocks &&
710 (journal->j_fs_dev != journal->j_dev) && 685 (journal->j_fs_dev != journal->j_dev) &&
711 (journal->j_flags & JBD2_BARRIER)) 686 (journal->j_flags & JBD2_BARRIER))
712 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, 687 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
713 BLKDEV_IFL_WAIT);
714 688
715 /* Done it all: now write the commit record asynchronously. */ 689 /* Done it all: now write the commit record asynchronously. */
716 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 690 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -719,19 +693,6 @@ start_journal_io:
719 &cbh, crc32_sum); 693 &cbh, crc32_sum);
720 if (err) 694 if (err)
721 __jbd2_journal_abort_hard(journal); 695 __jbd2_journal_abort_hard(journal);
722 if (journal->j_flags & JBD2_BARRIER)
723 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
724 BLKDEV_IFL_WAIT);
725 }
726
727 err = journal_finish_inode_data_buffers(journal, commit_transaction);
728 if (err) {
729 printk(KERN_WARNING
730 "JBD2: Detected IO errors while flushing file data "
731 "on %s\n", journal->j_devname);
732 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
733 jbd2_journal_abort(journal, err);
734 err = 0;
735 } 696 }
736 697
737 /* Lo and behold: we have just managed to send a transaction to 698 /* Lo and behold: we have just managed to send a transaction to
@@ -845,6 +806,11 @@ wait_for_iobuf:
845 } 806 }
846 if (!err && !is_journal_aborted(journal)) 807 if (!err && !is_journal_aborted(journal))
847 err = journal_wait_on_commit_record(journal, cbh); 808 err = journal_wait_on_commit_record(journal, cbh);
809 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
810 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
811 journal->j_flags & JBD2_BARRIER) {
812 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
813 }
848 814
849 if (err) 815 if (err)
850 jbd2_journal_abort(journal, err); 816 jbd2_journal_abort(journal, err);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0e8014ea6b94..262419f83d80 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1371,6 +1371,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1371 1371
1372 if (!compat && !ro && !incompat) 1372 if (!compat && !ro && !incompat)
1373 return 1; 1373 return 1;
1374 /* Load journal superblock if it is not loaded yet. */
1375 if (journal->j_format_version == 0 &&
1376 journal_get_superblock(journal) != 0)
1377 return 0;
1374 if (journal->j_format_version == 1) 1378 if (journal->j_format_version == 1)
1375 return 0; 1379 return 0;
1376 1380
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ed78a3cf3cb0..79121aa5858b 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -289,7 +289,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
289 mutex_unlock(&f->sem); 289 mutex_unlock(&f->sem);
290 d_instantiate(dentry, old_dentry->d_inode); 290 d_instantiate(dentry, old_dentry->d_inode);
291 dir_i->i_mtime = dir_i->i_ctime = ITIME(now); 291 dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
292 atomic_inc(&old_dentry->d_inode->i_count); 292 ihold(old_dentry->d_inode);
293 } 293 }
294 return ret; 294 return ret;
295} 295}
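ihold() replaces the bare atomic_inc() on i_count here and in the jfs and libfs hunks below; presumably a wrapper that also sanity-checks that the inode was already referenced (sketch, defined in the VFS part of this series, not in this diff):

    void ihold(struct inode *inode)
    {
            WARN_ON(atomic_inc_return(&inode->i_count) < 2);
    }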
@@ -864,7 +864,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
864 printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret); 864 printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret);
865 /* Might as well let the VFS know */ 865 /* Might as well let the VFS know */
866 d_instantiate(new_dentry, old_dentry->d_inode); 866 d_instantiate(new_dentry, old_dentry->d_inode);
867 atomic_inc(&old_dentry->d_inode->i_count); 867 ihold(old_dentry->d_inode);
868 new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); 868 new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
869 return ret; 869 return ret;
870 } 870 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 6b2964a19850..d9beb06e6fca 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -21,7 +21,6 @@
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/vfs.h> 22#include <linux/vfs.h>
23#include <linux/crc32.h> 23#include <linux/crc32.h>
24#include <linux/smp_lock.h>
25#include "nodelist.h" 24#include "nodelist.h"
26 25
27static int jffs2_flash_setup(struct jffs2_sb_info *c); 26static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -391,7 +390,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
391 This also catches the case where it was stopped and this 390 This also catches the case where it was stopped and this
392 is just a remount to restart it. 391 is just a remount to restart it.
393 Flush the writebuffer, if necessary, else we lose it */ 392
394 lock_kernel();
395 if (!(sb->s_flags & MS_RDONLY)) { 393 if (!(sb->s_flags & MS_RDONLY)) {
396 jffs2_stop_garbage_collect_thread(c); 394 jffs2_stop_garbage_collect_thread(c);
397 mutex_lock(&c->alloc_sem); 395 mutex_lock(&c->alloc_sem);
@@ -403,8 +401,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
403 jffs2_start_garbage_collect_thread(c); 401 jffs2_start_garbage_collect_thread(c);
404 402
405 *flags |= MS_NOATIME; 403 *flags |= MS_NOATIME;
406
407 unlock_kernel();
408 return 0; 404 return 0;
409} 405}
410 406
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 662bba099501..d1ae5dfc22b9 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/smp_lock.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/list.h> 16#include <linux/list.h>
18#include <linux/fs.h> 17#include <linux/fs.h>
@@ -146,6 +145,7 @@ static const struct super_operations jffs2_super_operations =
146static int jffs2_fill_super(struct super_block *sb, void *data, int silent) 145static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
147{ 146{
148 struct jffs2_sb_info *c; 147 struct jffs2_sb_info *c;
148 int ret;
149 149
150 D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():" 150 D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():"
151 " New superblock for device %d (\"%s\")\n", 151 " New superblock for device %d (\"%s\")\n",
@@ -175,7 +175,8 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
175#ifdef CONFIG_JFFS2_FS_POSIX_ACL 175#ifdef CONFIG_JFFS2_FS_POSIX_ACL
176 sb->s_flags |= MS_POSIXACL; 176 sb->s_flags |= MS_POSIXACL;
177#endif 177#endif
178 return jffs2_do_fill_super(sb, data, silent); 178 ret = jffs2_do_fill_super(sb, data, silent);
179 return ret;
179} 180}
180 181
181static int jffs2_get_sb(struct file_system_type *fs_type, 182static int jffs2_get_sb(struct file_system_type *fs_type,
@@ -192,8 +193,6 @@ static void jffs2_put_super (struct super_block *sb)
192 193
193 D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); 194 D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
194 195
195 lock_kernel();
196
197 if (sb->s_dirt) 196 if (sb->s_dirt)
198 jffs2_write_super(sb); 197 jffs2_write_super(sb);
199 198
@@ -215,8 +214,6 @@ static void jffs2_put_super (struct super_block *sb)
215 if (c->mtd->sync) 214 if (c->mtd->sync)
216 c->mtd->sync(c->mtd); 215 c->mtd->sync(c->mtd);
217 216
218 unlock_kernel();
219
220 D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); 217 D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
221} 218}
222 219
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f8332dc8eeb2..3a09423b6c22 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -497,7 +497,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
497 * appear hashed, but do not put on any lists. hlist_del() 497 * appear hashed, but do not put on any lists. hlist_del()
498 * will work fine and require no locking. 498 * will work fine and require no locking.
499 */ 499 */
500 ip->i_hash.pprev = &ip->i_hash.next; 500 hlist_add_fake(&ip->i_hash);
501 501
502 return (ip); 502 return (ip);
503} 503}
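hlist_add_fake() names the self-pointing pprev trick that makes a never-hashed node safe to pass to hlist_del(); presumably (sketch):

    static inline void hlist_add_fake(struct hlist_node *n)
    {
            n->pprev = &n->next;
    }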
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index c51af2a14516..e1b8493b9aaa 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1010,15 +1010,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
1010 * option 2 - shutdown file systems 1010 * option 2 - shutdown file systems
1011 * associated with log ? 1011 * associated with log ?
1012 * option 3 - extend log ? 1012 * option 3 - extend log ?
1013 */
1014 /*
1015 * option 4 - second chance 1013 * option 4 - second chance
1016 * 1014 *
1017 * mark log wrapped, and continue. 1015 * mark log wrapped, and continue.
1018 * when all active transactions are completed, 1016 * when all active transactions are completed,
1019 * mark log vaild for recovery. 1017 * mark log valid for recovery.
1020 * if crashed during invalid state, log state 1018 * if crashed during invalid state, log state
1021 * implies invald log, forcing fsck(). 1019 * implies invalid log, forcing fsck().
1022 */ 1020 */
1023 /* mark log state log wrap in log superblock */ 1021 /* mark log state log wrap in log superblock */
1024 /* log->state = LOGWRAP; */ 1022 /* log->state = LOGWRAP; */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 7b698f2ec45a..9895595fd2f2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -97,7 +97,7 @@ int jfs_mount(struct super_block *sb)
97 97
98 ipaimap = diReadSpecial(sb, AGGREGATE_I, 0); 98 ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
99 if (ipaimap == NULL) { 99 if (ipaimap == NULL) {
100 jfs_err("jfs_mount: Faild to read AGGREGATE_I"); 100 jfs_err("jfs_mount: Failed to read AGGREGATE_I");
101 rc = -EIO; 101 rc = -EIO;
102 goto errout20; 102 goto errout20;
103 } 103 }
@@ -148,7 +148,7 @@ int jfs_mount(struct super_block *sb)
148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { 148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); 149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
150 if (!ipaimap2) { 150 if (!ipaimap2) {
151 jfs_err("jfs_mount: Faild to read AGGREGATE_I"); 151 jfs_err("jfs_mount: Failed to read AGGREGATE_I");
152 rc = -EIO; 152 rc = -EIO;
153 goto errout35; 153 goto errout35;
154 } 154 }
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d945ea76b445..9466957ec841 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1279,7 +1279,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1279 * lazy commit thread finishes processing 1279 * lazy commit thread finishes processing
1280 */ 1280 */
1281 if (tblk->xflag & COMMIT_DELETE) { 1281 if (tblk->xflag & COMMIT_DELETE) {
1282 atomic_inc(&tblk->u.ip->i_count); 1282 ihold(tblk->u.ip);
1283 /* 1283 /*
1284 * Avoid a rare deadlock 1284 * Avoid a rare deadlock
1285 * 1285 *
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a9cf8e8675be..231ca4af9bce 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -839,7 +839,7 @@ static int jfs_link(struct dentry *old_dentry,
839 ip->i_ctime = CURRENT_TIME; 839 ip->i_ctime = CURRENT_TIME;
840 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 840 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
841 mark_inode_dirty(dir); 841 mark_inode_dirty(dir);
842 atomic_inc(&ip->i_count); 842 ihold(ip);
843 843
844 iplist[0] = ip; 844 iplist[0] = ip;
845 iplist[1] = dir; 845 iplist[1] = dir;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index ec8c3e4baca3..68eee2bf629e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -33,7 +33,6 @@
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <asm/uaccess.h> 34#include <asm/uaccess.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/smp_lock.h>
37 36
38#include "jfs_incore.h" 37#include "jfs_incore.h"
39#include "jfs_filsys.h" 38#include "jfs_filsys.h"
@@ -176,8 +175,6 @@ static void jfs_put_super(struct super_block *sb)
176 175
177 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 176 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
178 177
179 lock_kernel();
180
181 rc = jfs_umount(sb); 178 rc = jfs_umount(sb);
182 if (rc) 179 if (rc)
183 jfs_err("jfs_umount failed with return code %d", rc); 180 jfs_err("jfs_umount failed with return code %d", rc);
@@ -188,8 +185,6 @@ static void jfs_put_super(struct super_block *sb)
188 iput(sbi->direct_inode); 185 iput(sbi->direct_inode);
189 186
190 kfree(sbi); 187 kfree(sbi);
191
192 unlock_kernel();
193} 188}
194 189
195enum { 190enum {
@@ -369,19 +364,16 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
369 if (!parse_options(data, sb, &newLVSize, &flag)) { 364 if (!parse_options(data, sb, &newLVSize, &flag)) {
370 return -EINVAL; 365 return -EINVAL;
371 } 366 }
372 lock_kernel(); 367
373 if (newLVSize) { 368 if (newLVSize) {
374 if (sb->s_flags & MS_RDONLY) { 369 if (sb->s_flags & MS_RDONLY) {
375 printk(KERN_ERR 370 printk(KERN_ERR
376 "JFS: resize requires volume to be mounted read-write\n"); 371 "JFS: resize requires volume to be mounted read-write\n");
377 unlock_kernel();
378 return -EROFS; 372 return -EROFS;
379 } 373 }
380 rc = jfs_extendfs(sb, newLVSize, 0); 374 rc = jfs_extendfs(sb, newLVSize, 0);
381 if (rc) { 375 if (rc)
382 unlock_kernel();
383 return rc; 376 return rc;
384 }
385 } 377 }
386 378
387 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 379 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -397,36 +389,30 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
397 /* mark the fs r/w for quota activity */ 389 /* mark the fs r/w for quota activity */
398 sb->s_flags &= ~MS_RDONLY; 390 sb->s_flags &= ~MS_RDONLY;
399 391
400 unlock_kernel();
401 dquot_resume(sb, -1); 392 dquot_resume(sb, -1);
402 return ret; 393 return ret;
403 } 394 }
404 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 395 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
405 rc = dquot_suspend(sb, -1); 396 rc = dquot_suspend(sb, -1);
406 if (rc < 0) { 397 if (rc < 0) {
407 unlock_kernel();
408 return rc; 398 return rc;
409 } 399 }
410 rc = jfs_umount_rw(sb); 400 rc = jfs_umount_rw(sb);
411 JFS_SBI(sb)->flag = flag; 401 JFS_SBI(sb)->flag = flag;
412 unlock_kernel();
413 return rc; 402 return rc;
414 } 403 }
415 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) 404 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
416 if (!(sb->s_flags & MS_RDONLY)) { 405 if (!(sb->s_flags & MS_RDONLY)) {
417 rc = jfs_umount_rw(sb); 406 rc = jfs_umount_rw(sb);
418 if (rc) { 407 if (rc)
419 unlock_kernel();
420 return rc; 408 return rc;
421 } 409
422 JFS_SBI(sb)->flag = flag; 410 JFS_SBI(sb)->flag = flag;
423 ret = jfs_mount_rw(sb, 1); 411 ret = jfs_mount_rw(sb, 1);
424 unlock_kernel();
425 return ret; 412 return ret;
426 } 413 }
427 JFS_SBI(sb)->flag = flag; 414 JFS_SBI(sb)->flag = flag;
428 415
429 unlock_kernel();
430 return 0; 416 return 0;
431} 417}
432 418
@@ -446,6 +432,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
446 sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL); 432 sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
447 if (!sbi) 433 if (!sbi)
448 return -ENOMEM; 434 return -ENOMEM;
435
449 sb->s_fs_info = sbi; 436 sb->s_fs_info = sbi;
450 sbi->sb = sb; 437 sbi->sb = sb;
451 sbi->uid = sbi->gid = sbi->umask = -1; 438 sbi->uid = sbi->gid = sbi->umask = -1;
diff --git a/fs/libfs.c b/fs/libfs.c
index 0a9da95317f7..304a5132ca27 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -255,7 +255,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
255 255
256 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 256 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
257 inc_nlink(inode); 257 inc_nlink(inode);
258 atomic_inc(&inode->i_count); 258 ihold(inode);
259 dget(dentry); 259 dget(dentry);
260 d_instantiate(dentry, inode); 260 d_instantiate(dentry, inode);
261 return 0; 261 return 0;
@@ -892,10 +892,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent);
892 */ 892 */
893int generic_file_fsync(struct file *file, int datasync) 893int generic_file_fsync(struct file *file, int datasync)
894{ 894{
895 struct writeback_control wbc = {
896 .sync_mode = WB_SYNC_ALL,
897 .nr_to_write = 0, /* metadata-only; caller takes care of data */
898 };
899 struct inode *inode = file->f_mapping->host; 895 struct inode *inode = file->f_mapping->host;
900 int err; 896 int err;
901 int ret; 897 int ret;
@@ -906,13 +902,42 @@ int generic_file_fsync(struct file *file, int datasync)
906 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 902 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
907 return ret; 903 return ret;
908 904
909 err = sync_inode(inode, &wbc); 905 err = sync_inode_metadata(inode, 1);
910 if (ret == 0) 906 if (ret == 0)
911 ret = err; 907 ret = err;
912 return ret; 908 return ret;
913} 909}
914EXPORT_SYMBOL(generic_file_fsync); 910EXPORT_SYMBOL(generic_file_fsync);
915 911
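sync_inode_metadata() absorbs the open-coded writeback_control that generic_file_fsync() built; assuming it mirrors the removed lines, roughly (sketch, defined in the writeback code rather than here):

    int sync_inode_metadata(struct inode *inode, int wait)
    {
            struct writeback_control wbc = {
                    .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
                    .nr_to_write = 0,       /* metadata-only */
            };

            return sync_inode(inode, &wbc);
    }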
912/**
913 * generic_check_addressable - Check addressability of file system
914 * @blocksize_bits: log of file system block size
915 * @num_blocks: number of blocks in file system
916 *
917 * Determine whether a file system with @num_blocks blocks (and a
918 * block size of 2**@blocksize_bits) is addressable by the sector_t
919 * and page cache of the system. Return 0 if so and -EFBIG otherwise.
920 */
921int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
922{
923 u64 last_fs_block = num_blocks - 1;
924 u64 last_fs_page =
925 last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
926
927 if (unlikely(num_blocks == 0))
928 return 0;
929
930 if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
931 return -EINVAL;
932
933 if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
934 (last_fs_page > (pgoff_t)(~0ULL))) {
935 return -EFBIG;
936 }
937 return 0;
938}
939EXPORT_SYMBOL(generic_check_addressable);
940
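A hedged usage sketch for a fill_super path (hypothetical filesystem, names assumed):

    static int examplefs_check_size(struct super_block *sb, u64 nr_blocks)
    {
            int err = generic_check_addressable(sb->s_blocksize_bits,
                                                nr_blocks);
            if (err)
                    printk(KERN_ERR "examplefs: too large for this kernel "
                           "to address\n");
            return err;
    }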
916/* 941/*
917 * No-op implementation of ->fsync for in-memory filesystems. 942 * No-op implementation of ->fsync for in-memory filesystems.
918 */ 943 */
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 64fd427c993c..d5bb86866e6c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -42,6 +42,7 @@ struct nlm_wait {
42}; 42};
43 43
44static LIST_HEAD(nlm_blocked); 44static LIST_HEAD(nlm_blocked);
45static DEFINE_SPINLOCK(nlm_blocked_lock);
45 46
46/** 47/**
47 * nlmclnt_init - Set up per-NFS mount point lockd data structures 48 * nlmclnt_init - Set up per-NFS mount point lockd data structures
@@ -97,7 +98,10 @@ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *
97 block->b_lock = fl; 98 block->b_lock = fl;
98 init_waitqueue_head(&block->b_wait); 99 init_waitqueue_head(&block->b_wait);
99 block->b_status = nlm_lck_blocked; 100 block->b_status = nlm_lck_blocked;
101
102 spin_lock(&nlm_blocked_lock);
100 list_add(&block->b_list, &nlm_blocked); 103 list_add(&block->b_list, &nlm_blocked);
104 spin_unlock(&nlm_blocked_lock);
101 } 105 }
102 return block; 106 return block;
103} 107}
@@ -106,7 +110,9 @@ void nlmclnt_finish_block(struct nlm_wait *block)
106{ 110{
107 if (block == NULL) 111 if (block == NULL)
108 return; 112 return;
113 spin_lock(&nlm_blocked_lock);
109 list_del(&block->b_list); 114 list_del(&block->b_list);
115 spin_unlock(&nlm_blocked_lock);
110 kfree(block); 116 kfree(block);
111} 117}
112 118
@@ -154,6 +160,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
154 * Look up blocked request based on arguments. 160 * Look up blocked request based on arguments.
155 * Warning: must not use cookie to match it! 161 * Warning: must not use cookie to match it!
156 */ 162 */
163 spin_lock(&nlm_blocked_lock);
157 list_for_each_entry(block, &nlm_blocked, b_list) { 164 list_for_each_entry(block, &nlm_blocked, b_list) {
158 struct file_lock *fl_blocked = block->b_lock; 165 struct file_lock *fl_blocked = block->b_lock;
159 166
@@ -178,6 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
178 wake_up(&block->b_wait); 185 wake_up(&block->b_wait);
179 res = nlm_granted; 186 res = nlm_granted;
180 } 187 }
188 spin_unlock(&nlm_blocked_lock);
181 return res; 189 return res;
182} 190}
183 191
@@ -216,10 +224,6 @@ reclaimer(void *ptr)
216 allow_signal(SIGKILL); 224 allow_signal(SIGKILL);
217 225
218 down_write(&host->h_rwsem); 226 down_write(&host->h_rwsem);
219
220 /* This one ensures that our parent doesn't terminate while the
221 * reclaim is in progress */
222 lock_kernel();
223 lockd_up(); /* note: this cannot fail as lockd is already running */ 227 lockd_up(); /* note: this cannot fail as lockd is already running */
224 228
225 dprintk("lockd: reclaiming locks for host %s\n", host->h_name); 229 dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
@@ -260,16 +264,17 @@ restart:
260 dprintk("NLM: done reclaiming locks for host %s\n", host->h_name); 264 dprintk("NLM: done reclaiming locks for host %s\n", host->h_name);
261 265
262 /* Now, wake up all processes that sleep on a blocked lock */ 266 /* Now, wake up all processes that sleep on a blocked lock */
267 spin_lock(&nlm_blocked_lock);
263 list_for_each_entry(block, &nlm_blocked, b_list) { 268 list_for_each_entry(block, &nlm_blocked, b_list) {
264 if (block->b_host == host) { 269 if (block->b_host == host) {
265 block->b_status = nlm_lck_denied_grace_period; 270 block->b_status = nlm_lck_denied_grace_period;
266 wake_up(&block->b_wait); 271 wake_up(&block->b_wait);
267 } 272 }
268 } 273 }
274 spin_unlock(&nlm_blocked_lock);
269 275
270 /* Release host handle after use */ 276 /* Release host handle after use */
271 nlm_release_host(host); 277 nlm_release_host(host);
272 lockd_down(); 278 lockd_down();
273 unlock_kernel();
274 return 0; 279 return 0;
275} 280}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 7932c399fab4..47ea1e1925b8 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -166,7 +166,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
166 /* Set up the argument struct */ 166 /* Set up the argument struct */
167 nlmclnt_setlockargs(call, fl); 167 nlmclnt_setlockargs(call, fl);
168 168
169 lock_kernel();
170 if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { 169 if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
171 if (fl->fl_type != F_UNLCK) { 170 if (fl->fl_type != F_UNLCK) {
172 call->a_args.block = IS_SETLKW(cmd) ? 1 : 0; 171 call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -177,10 +176,8 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
177 status = nlmclnt_test(call, fl); 176 status = nlmclnt_test(call, fl);
178 else 177 else
179 status = -EINVAL; 178 status = -EINVAL;
180
181 fl->fl_ops->fl_release_private(fl); 179 fl->fl_ops->fl_release_private(fl);
182 fl->fl_ops = NULL; 180 fl->fl_ops = NULL;
183 unlock_kernel();
184 181
185 dprintk("lockd: clnt proc returns %d\n", status); 182 dprintk("lockd: clnt proc returns %d\n", status);
186 return status; 183 return status;
@@ -226,9 +223,7 @@ void nlm_release_call(struct nlm_rqst *call)
226 223
227static void nlmclnt_rpc_release(void *data) 224static void nlmclnt_rpc_release(void *data)
228{ 225{
229 lock_kernel();
230 nlm_release_call(data); 226 nlm_release_call(data);
231 unlock_kernel();
232} 227}
233 228
234static int nlm_wait_on_grace(wait_queue_head_t *queue) 229static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -448,14 +443,18 @@ out:
448 443
449static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl) 444static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
450{ 445{
446 spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
451 new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state; 447 new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state;
452 new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner); 448 new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner);
453 list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted); 449 list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted);
450 spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
454} 451}
455 452
456static void nlmclnt_locks_release_private(struct file_lock *fl) 453static void nlmclnt_locks_release_private(struct file_lock *fl)
457{ 454{
455 spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
458 list_del(&fl->fl_u.nfs_fl.list); 456 list_del(&fl->fl_u.nfs_fl.list);
457 spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
459 nlm_put_lockowner(fl->fl_u.nfs_fl.owner); 458 nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
460} 459}
461 460
@@ -721,9 +720,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
721die: 720die:
722 return; 721 return;
723 retry_rebind: 722 retry_rebind:
724 lock_kernel();
725 nlm_rebind_host(req->a_host); 723 nlm_rebind_host(req->a_host);
726 unlock_kernel();
727 retry_unlock: 724 retry_unlock:
728 rpc_restart_call(task); 725 rpc_restart_call(task);
729} 726}
@@ -801,9 +798,7 @@ retry_cancel:
801 /* Don't ever retry more than 3 times */ 798 /* Don't ever retry more than 3 times */
802 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) 799 if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
803 goto die; 800 goto die;
804 lock_kernel();
805 nlm_rebind_host(req->a_host); 801 nlm_rebind_host(req->a_host);
806 unlock_kernel();
807 rpc_restart_call(task); 802 rpc_restart_call(task);
808 rpc_delay(task, 30 * HZ); 803 rpc_delay(task, 30 * HZ);
809} 804}
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index bb464d12104c..25e21e4023b2 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -353,6 +353,7 @@ nlm_bind_host(struct nlm_host *host)
353 .to_retries = 5U, 353 .to_retries = 5U,
354 }; 354 };
355 struct rpc_create_args args = { 355 struct rpc_create_args args = {
356 .net = &init_net,
356 .protocol = host->h_proto, 357 .protocol = host->h_proto,
357 .address = nlm_addr(host), 358 .address = nlm_addr(host),
358 .addrsize = host->h_addrlen, 359 .addrsize = host->h_addrlen,
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e3015464fbab..e0c918949644 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -69,6 +69,7 @@ static struct rpc_clnt *nsm_create(void)
69 .sin_addr.s_addr = htonl(INADDR_LOOPBACK), 69 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
70 }; 70 };
71 struct rpc_create_args args = { 71 struct rpc_create_args args = {
72 .net = &init_net,
72 .protocol = XPRT_TRANSPORT_UDP, 73 .protocol = XPRT_TRANSPORT_UDP,
73 .address = (struct sockaddr *)&sin, 74 .address = (struct sockaddr *)&sin,
74 .addrsize = sizeof(sin), 75 .addrsize = sizeof(sin),
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index f1bacf1a0391..abfff9d7979d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -22,7 +22,6 @@
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/uio.h> 23#include <linux/uio.h>
24#include <linux/smp.h> 24#include <linux/smp.h>
25#include <linux/smp_lock.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/kthread.h> 26#include <linux/kthread.h>
28#include <linux/freezer.h> 27#include <linux/freezer.h>
@@ -130,15 +129,6 @@ lockd(void *vrqstp)
130 129
131 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); 130 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
132 131
133 /*
134 * FIXME: it would be nice if lockd didn't spend its entire life
135 * running under the BKL. At the very least, it would be good to
136 * have someone clarify what it's intended to protect here. I've
137 * seen some handwavy posts about posix locking needing to be
138 * done under the BKL, but it's far from clear.
139 */
140 lock_kernel();
141
142 if (!nlm_timeout) 132 if (!nlm_timeout)
143 nlm_timeout = LOCKD_DFLT_TIMEO; 133 nlm_timeout = LOCKD_DFLT_TIMEO;
144 nlmsvc_timeout = nlm_timeout * HZ; 134 nlmsvc_timeout = nlm_timeout * HZ;
@@ -195,7 +185,6 @@ lockd(void *vrqstp)
195 if (nlmsvc_ops) 185 if (nlmsvc_ops)
196 nlmsvc_invalidate_all(); 186 nlmsvc_invalidate_all();
197 nlm_shutdown_hosts(); 187 nlm_shutdown_hosts();
198 unlock_kernel();
199 return 0; 188 return 0;
200} 189}
201 190
@@ -206,7 +195,7 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
206 195
207 xprt = svc_find_xprt(serv, name, family, 0); 196 xprt = svc_find_xprt(serv, name, family, 0);
208 if (xprt == NULL) 197 if (xprt == NULL)
209 return svc_create_xprt(serv, name, family, port, 198 return svc_create_xprt(serv, name, &init_net, family, port,
210 SVC_SOCK_DEFAULTS); 199 SVC_SOCK_DEFAULTS);
211 svc_xprt_put(xprt); 200 svc_xprt_put(xprt);
212 return 0; 201 return 0;
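svc_create_xprt() and struct rpc_create_args grow a network-namespace argument as part of the sunrpc containerization work; lockd simply passes init_net for now. Assumed updated prototype (sketch):

    int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
                        struct net *net, const int family,
                        const unsigned short port, int flags);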
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 031c6569a134..a336e832475d 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -230,9 +230,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
 
 static void nlm4svc_callback_release(void *data)
 {
-	lock_kernel();
 	nlm_release_call(data);
-	unlock_kernel();
 }
 
 static const struct rpc_call_ops nlm4svc_callback_ops = {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 84055d31bfc5..c462d346acbd 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -52,12 +52,13 @@ static const struct rpc_call_ops nlmsvc_grant_ops;
  * The list of blocked locks to retry
  */
 static LIST_HEAD(nlm_blocked);
+static DEFINE_SPINLOCK(nlm_blocked_lock);
 
 /*
  * Insert a blocked lock into the global list
  */
 static void
-nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
+nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when)
 {
 	struct nlm_block *b;
 	struct list_head *pos;
@@ -87,6 +88,13 @@ nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
 	block->b_when = when;
 }
 
+static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
+{
+	spin_lock(&nlm_blocked_lock);
+	nlmsvc_insert_block_locked(block, when);
+	spin_unlock(&nlm_blocked_lock);
+}
+
 /*
  * Remove a block from the global list
  */
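
Splitting nlmsvc_insert_block() into a _locked worker plus a locking wrapper is the standard idiom when some callers already hold the new nlm_blocked_lock: the wrapper stays the public entry point, the worker simply assumes the lock. In generic form (identifiers hypothetical, not from this patch):

    static LIST_HEAD(pending);
    static DEFINE_SPINLOCK(pending_lock);

    struct work_item {
            struct list_head list;
    };

    /* Caller must hold pending_lock. */
    static void queue_item_locked(struct work_item *it)
    {
            list_add_tail(&it->list, &pending);
    }

    static void queue_item(struct work_item *it)    /* unlocked entry point */
    {
            spin_lock(&pending_lock);
            queue_item_locked(it);
            spin_unlock(&pending_lock);
    }
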
@@ -94,7 +102,9 @@ static inline void
 nlmsvc_remove_block(struct nlm_block *block)
 {
 	if (!list_empty(&block->b_list)) {
+		spin_lock(&nlm_blocked_lock);
 		list_del_init(&block->b_list);
+		spin_unlock(&nlm_blocked_lock);
 		nlmsvc_release_block(block);
 	}
 }
@@ -651,7 +661,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
 	struct nlm_block *block;
 	int rc = -ENOENT;
 
-	lock_kernel();
+	spin_lock(&nlm_blocked_lock);
 	list_for_each_entry(block, &nlm_blocked, b_list) {
 		if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
 			dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n",
@@ -665,13 +675,13 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
 			} else if (result == 0)
 				block->b_granted = 1;
 
-			nlmsvc_insert_block(block, 0);
+			nlmsvc_insert_block_locked(block, 0);
 			svc_wake_up(block->b_daemon);
 			rc = 0;
 			break;
 		}
 	}
-	unlock_kernel();
+	spin_unlock(&nlm_blocked_lock);
 	if (rc == -ENOENT)
 		printk(KERN_WARNING "lockd: grant for unknown block\n");
 	return rc;
@@ -690,14 +700,16 @@ nlmsvc_notify_blocked(struct file_lock *fl)
 	struct nlm_block *block;
 
 	dprintk("lockd: VFS unblock notification for block %p\n", fl);
+	spin_lock(&nlm_blocked_lock);
 	list_for_each_entry(block, &nlm_blocked, b_list) {
 		if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
-			nlmsvc_insert_block(block, 0);
+			nlmsvc_insert_block_locked(block, 0);
+			spin_unlock(&nlm_blocked_lock);
 			svc_wake_up(block->b_daemon);
 			return;
 		}
 	}
-
+	spin_unlock(&nlm_blocked_lock);
 	printk(KERN_WARNING "lockd: notification for unknown block!\n");
 }
 
@@ -803,7 +815,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 
 	dprintk("lockd: GRANT_MSG RPC callback\n");
 
-	lock_kernel();
+	spin_lock(&nlm_blocked_lock);
 	/* if the block is not on a list at this point then it has
 	 * been invalidated. Don't try to requeue it.
 	 *
@@ -825,19 +837,20 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 		/* Call was successful, now wait for client callback */
 		timeout = 60 * HZ;
 	}
-	nlmsvc_insert_block(block, timeout);
+	nlmsvc_insert_block_locked(block, timeout);
 	svc_wake_up(block->b_daemon);
 out:
-	unlock_kernel();
+	spin_unlock(&nlm_blocked_lock);
 }
 
+/*
+ * FIXME: nlmsvc_release_block() grabs a mutex. This is not allowed for an
+ * .rpc_release rpc_call_op
+ */
 static void nlmsvc_grant_release(void *data)
 {
 	struct nlm_rqst *call = data;
-
-	lock_kernel();
 	nlmsvc_release_block(call->a_block);
-	unlock_kernel();
 }
 
 static const struct rpc_call_ops nlmsvc_grant_ops = {
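
The new FIXME records a real constraint: per that comment, an .rpc_release callback runs from the RPC workqueue context and must not take sleeping locks such as the mutex hidden inside nlmsvc_release_block(). Sketched in rpc_call_ops terms (hypothetical example, not lockd code):

    /* Sketch: a release hook that stays non-blocking, as the FIXME requires. */
    static void sample_rpc_release(void *data)
    {
            kfree(data);            /* no mutexes, no blocking work here */
    }

    static const struct rpc_call_ops sample_call_ops = {
            .rpc_release    = sample_rpc_release,
    };
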
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0f2ab741ae7c..c3069f38d602 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -260,9 +260,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
 
 static void nlmsvc_callback_release(void *data)
 {
-	lock_kernel();
 	nlm_release_call(data);
-	unlock_kernel();
 }
 
 static const struct rpc_call_ops nlmsvc_callback_ops = {
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d0ef94cfb3da..1ca0679c80bf 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -170,6 +170,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
 
 again:
 	file->f_locks = 0;
+	lock_flocks(); /* protects i_flock list */
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops != &nlmsvc_lock_operations)
 			continue;
@@ -181,6 +182,7 @@ again:
 		if (match(lockhost, host)) {
 			struct file_lock lock = *fl;
 
+			unlock_flocks();
 			lock.fl_type = F_UNLCK;
 			lock.fl_start = 0;
 			lock.fl_end = OFFSET_MAX;
@@ -192,6 +194,7 @@ again:
 			goto again;
 		}
 	}
+	unlock_flocks();
 
 	return 0;
 }
@@ -226,10 +229,14 @@ nlm_file_inuse(struct nlm_file *file)
 	if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
 		return 1;
 
+	lock_flocks();
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
-		if (fl->fl_lmops == &nlmsvc_lock_operations)
+		if (fl->fl_lmops == &nlmsvc_lock_operations) {
+			unlock_flocks();
 			return 1;
+		}
 	}
+	unlock_flocks();
 	file->f_locks = 0;
 	return 0;
 }
diff --git a/fs/locks.c b/fs/locks.c
index ab24d49fc048..50ec15927aab 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -142,14 +142,32 @@ int lease_break_time = 45;
 
 static LIST_HEAD(file_lock_list);
 static LIST_HEAD(blocked_list);
+static DEFINE_SPINLOCK(file_lock_lock);
+
+/*
+ * Protects the two list heads above, plus the inode->i_flock list
+ * FIXME: should use a spinlock, once lockd and ceph are ready.
+ */
+void lock_flocks(void)
+{
+	spin_lock(&file_lock_lock);
+}
+EXPORT_SYMBOL_GPL(lock_flocks);
+
+void unlock_flocks(void)
+{
+	spin_unlock(&file_lock_lock);
+}
+EXPORT_SYMBOL_GPL(unlock_flocks);
 
 static struct kmem_cache *filelock_cache __read_mostly;
 
 /* Allocate an empty lock structure. */
-static struct file_lock *locks_alloc_lock(void)
+struct file_lock *locks_alloc_lock(void)
 {
 	return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
 }
+EXPORT_SYMBOL_GPL(locks_alloc_lock);
 
 void locks_release_private(struct file_lock *fl)
 {
@@ -511,9 +529,9 @@ static void __locks_delete_block(struct file_lock *waiter)
  */
 static void locks_delete_block(struct file_lock *waiter)
 {
-	lock_kernel();
+	lock_flocks();
 	__locks_delete_block(waiter);
-	unlock_kernel();
+	unlock_flocks();
 }
 
 /* Insert waiter into blocker's block list.
@@ -644,7 +662,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 {
 	struct file_lock *cfl;
 
-	lock_kernel();
+	lock_flocks();
 	for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
 		if (!IS_POSIX(cfl))
 			continue;
@@ -657,7 +675,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 			fl->fl_pid = pid_vnr(cfl->fl_nspid);
 	} else
 		fl->fl_type = F_UNLCK;
-	unlock_kernel();
+	unlock_flocks();
 	return;
 }
 EXPORT_SYMBOL(posix_test_lock);
@@ -730,18 +748,16 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	int error = 0;
 	int found = 0;
 
-	lock_kernel();
-	if (request->fl_flags & FL_ACCESS)
-		goto find_conflict;
-
-	if (request->fl_type != F_UNLCK) {
-		error = -ENOMEM;
+	if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
 		new_fl = locks_alloc_lock();
-		if (new_fl == NULL)
-			goto out;
-		error = 0;
+		if (!new_fl)
+			return -ENOMEM;
 	}
 
+	lock_flocks();
+	if (request->fl_flags & FL_ACCESS)
+		goto find_conflict;
+
 	for_each_lock(inode, before) {
 		struct file_lock *fl = *before;
 		if (IS_POSIX(fl))
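
The reshuffled entry of flock_lock_file() exists because locks_alloc_lock() uses GFP_KERNEL, which may sleep, and sleeping under a spinlock is illegal; the BKL tolerated it, lock_flocks() does not. The pattern in isolation (generic sketch, names hypothetical):

    /* Sketch: do the sleeping allocation before taking the spinlock. */
    static int add_node(struct list_head *head, spinlock_t *lock)
    {
            struct list_head *node = kmalloc(sizeof(*node), GFP_KERNEL);

            if (!node)
                    return -ENOMEM;         /* fail while no lock is held */
            spin_lock(lock);
            list_add_tail(node, head);
            spin_unlock(lock);
            return 0;
    }
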
@@ -767,8 +783,11 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	 * If a higher-priority process was blocked on the old file lock,
 	 * give it the opportunity to lock the file.
 	 */
-	if (found)
+	if (found) {
+		unlock_flocks();
 		cond_resched();
+		lock_flocks();
+	}
 
 find_conflict:
 	for_each_lock(inode, before) {
@@ -794,7 +813,7 @@ find_conflict:
 	error = 0;
 
 out:
-	unlock_kernel();
+	unlock_flocks();
 	if (new_fl)
 		locks_free_lock(new_fl);
 	return error;
@@ -823,7 +842,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		new_fl2 = locks_alloc_lock();
 	}
 
-	lock_kernel();
+	lock_flocks();
 	if (request->fl_type != F_UNLCK) {
 		for_each_lock(inode, before) {
 			fl = *before;
@@ -991,7 +1010,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		locks_wake_up_blocks(left);
 	}
  out:
-	unlock_kernel();
+	unlock_flocks();
 	/*
 	 * Free any unused locks.
 	 */
@@ -1066,14 +1085,14 @@ int locks_mandatory_locked(struct inode *inode)
 	/*
 	 * Search the lock list for this inode for any POSIX locks.
 	 */
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!IS_POSIX(fl))
 			continue;
 		if (fl->fl_owner != owner)
 			break;
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return fl ? -EAGAIN : 0;
 }
 
@@ -1186,7 +1205,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
 
 	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
 
-	lock_kernel();
+	lock_flocks();
 
 	time_out_leases(inode);
 
@@ -1247,8 +1266,10 @@ restart:
 		break_time++;
 	}
 	locks_insert_block(flock, new_fl);
+	unlock_flocks();
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
 						 !new_fl->fl_next, break_time);
+	lock_flocks();
 	__locks_delete_block(new_fl);
 	if (error >= 0) {
 		if (error == 0)
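
The explicit unlock/lock bracket around the wait is another BKL difference: the big kernel lock was dropped automatically when a task slept, a spinlock is not, so __break_lease must now release lock_flocks() by hand before blocking. The idiom, sketched with hypothetical names:

    /* Sketch: never hold a spinlock across a blocking wait. */
    static long wait_for_flag(wait_queue_head_t *wq, int *flag, long timeout)
    {
            long ret;

            lock_flocks();
            /* ... publish the waiter in shared state ... */
            unlock_flocks();                /* drop before sleeping */
            ret = wait_event_interruptible_timeout(*wq, *flag, timeout);
            lock_flocks();
            /* ... re-validate shared state after reacquiring ... */
            unlock_flocks();
            return ret;
    }
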
@@ -1263,7 +1284,7 @@ restart:
 	}
 
 out:
-	unlock_kernel();
+	unlock_flocks();
 	if (!IS_ERR(new_fl))
 		locks_free_lock(new_fl);
 	return error;
@@ -1319,7 +1340,7 @@ int fcntl_getlease(struct file *filp)
 	struct file_lock *fl;
 	int type = F_UNLCK;
 
-	lock_kernel();
+	lock_flocks();
 	time_out_leases(filp->f_path.dentry->d_inode);
 	for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
 	     fl = fl->fl_next) {
@@ -1328,7 +1349,7 @@ int fcntl_getlease(struct file *filp)
 			break;
 		}
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return type;
 }
 
@@ -1341,12 +1362,11 @@ int fcntl_getlease(struct file *filp)
  * The (input) flp->fl_lmops->fl_break function is required
  * by break_lease().
  *
- * Called with kernel lock held.
+ * Called with file_lock_lock held.
  */
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
-	struct file_lock *new_fl = NULL;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	int error, rdlease_count = 0, wrlease_count = 0;
@@ -1366,11 +1386,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	lease = *flp;
 
 	if (arg != F_UNLCK) {
-		error = -ENOMEM;
-		new_fl = locks_alloc_lock();
-		if (new_fl == NULL)
-			goto out;
-
 		error = -EAGAIN;
 		if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
 			goto out;
@@ -1415,7 +1430,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 		goto out;
 	}
 
-	error = 0;
 	if (arg == F_UNLCK)
 		goto out;
 
@@ -1423,20 +1437,24 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	if (!leases_enable)
 		goto out;
 
-	locks_copy_lock(new_fl, lease);
-	locks_insert_lock(before, new_fl);
-
-	*flp = new_fl;
+	locks_insert_lock(before, lease);
 	return 0;
 
 out:
-	if (new_fl != NULL)
-		locks_free_lock(new_fl);
+	locks_free_lock(lease);
 	return error;
 }
 EXPORT_SYMBOL(generic_setlease);
 
- /**
+static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
+{
+	if (filp->f_op && filp->f_op->setlease)
+		return filp->f_op->setlease(filp, arg, lease);
+	else
+		return generic_setlease(filp, arg, lease);
+}
+
+/**
  * vfs_setlease - sets a lease on an open file
  * @filp: file pointer
  * @arg: type of lease to obtain
@@ -1467,12 +1485,9 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 {
 	int error;
 
-	lock_kernel();
-	if (filp->f_op && filp->f_op->setlease)
-		error = filp->f_op->setlease(filp, arg, lease);
-	else
-		error = generic_setlease(filp, arg, lease);
-	unlock_kernel();
+	lock_flocks();
+	error = __vfs_setlease(filp, arg, lease);
+	unlock_flocks();
 
 	return error;
 }
@@ -1490,33 +1505,47 @@ EXPORT_SYMBOL_GPL(vfs_setlease);
  */
 int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 {
-	struct file_lock fl, *flp = &fl;
+	struct file_lock *fl;
+	struct fasync_struct *new;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int error;
 
-	locks_init_lock(&fl);
-	error = lease_init(filp, arg, &fl);
-	if (error)
-		return error;
+	fl = lease_alloc(filp, arg);
+	if (IS_ERR(fl))
+		return PTR_ERR(fl);
 
-	lock_kernel();
-
-	error = vfs_setlease(filp, arg, &flp);
+	new = fasync_alloc();
+	if (!new) {
+		locks_free_lock(fl);
+		return -ENOMEM;
+	}
+	lock_flocks();
+	error = __vfs_setlease(filp, arg, &fl);
 	if (error || arg == F_UNLCK)
 		goto out_unlock;
 
-	error = fasync_helper(fd, filp, 1, &flp->fl_fasync);
+	/*
+	 * fasync_insert_entry() returns the old entry if any.
+	 * If there was no old entry, then it used 'new' and
+	 * inserted it into the fasync list. Clear new so that
+	 * we don't release it here.
+	 */
+	if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new))
+		new = NULL;
+
 	if (error < 0) {
 		/* remove lease just inserted by setlease */
-		flp->fl_type = F_UNLCK | F_INPROGRESS;
-		flp->fl_break_time = jiffies - 10;
+		fl->fl_type = F_UNLCK | F_INPROGRESS;
+		fl->fl_break_time = jiffies - 10;
 		time_out_leases(inode);
 		goto out_unlock;
 	}
 
 	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 out_unlock:
-	unlock_kernel();
+	unlock_flocks();
+	if (new)
+		fasync_free(new);
 	return error;
 }
 
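
fcntl_setlease() now follows the same cannot-sleep discipline twice over: both the lease and the fasync entry are allocated before lock_flocks(), and fasync_insert_entry() either consumes the preallocated entry or returns the existing one, leaving the caller to free an unused allocation after unlocking. Compressed to its core (error paths trimmed; fd, filp and fl as in the function above):

    struct fasync_struct *new = fasync_alloc();     /* may sleep: done first */
    if (!new)
            return -ENOMEM;
    lock_flocks();
    if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new))
            new = NULL;                             /* the list took ownership */
    unlock_flocks();
    if (new)
            fasync_free(new);                       /* an entry already existed */
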
@@ -2020,7 +2049,7 @@ void locks_remove_flock(struct file *filp)
 		fl.fl_ops->fl_release_private(&fl);
 	}
 
-	lock_kernel();
+	lock_flocks();
 	before = &inode->i_flock;
 
 	while ((fl = *before) != NULL) {
@@ -2038,7 +2067,7 @@ void locks_remove_flock(struct file *filp)
 		}
 		before = &fl->fl_next;
 	}
-	unlock_kernel();
+	unlock_flocks();
 }
 
 /**
@@ -2053,12 +2082,12 @@ posix_unblock_lock(struct file *filp, struct file_lock *waiter)
 {
 	int status = 0;
 
-	lock_kernel();
+	lock_flocks();
 	if (waiter->fl_next)
 		__locks_delete_block(waiter);
 	else
 		status = -ENOENT;
-	unlock_kernel();
+	unlock_flocks();
 	return status;
 }
 
@@ -2085,7 +2114,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 #include <linux/seq_file.h>
 
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
-			    int id, char *pfx)
+			    loff_t id, char *pfx)
 {
 	struct inode *inode = NULL;
 	unsigned int fl_pid;
@@ -2098,7 +2127,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 	if (fl->fl_file != NULL)
 		inode = fl->fl_file->f_path.dentry->d_inode;
 
-	seq_printf(f, "%d:%s ", id, pfx);
+	seq_printf(f, "%lld:%s ", id, pfx);
 	if (IS_POSIX(fl)) {
 		seq_printf(f, "%6s %s ",
 			     (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
@@ -2161,30 +2190,33 @@ static int locks_show(struct seq_file *f, void *v)
 
 	fl = list_entry(v, struct file_lock, fl_link);
 
-	lock_get_status(f, fl, (long)f->private, "");
+	lock_get_status(f, fl, *((loff_t *)f->private), "");
 
 	list_for_each_entry(bfl, &fl->fl_block, fl_block)
-		lock_get_status(f, bfl, (long)f->private, " ->");
+		lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
 
-	f->private++;
 	return 0;
 }
 
 static void *locks_start(struct seq_file *f, loff_t *pos)
 {
-	lock_kernel();
-	f->private = (void *)1;
+	loff_t *p = f->private;
+
+	lock_flocks();
+	*p = (*pos + 1);
 	return seq_list_start(&file_lock_list, *pos);
 }
 
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 {
+	loff_t *p = f->private;
+	++*p;
 	return seq_list_next(v, &file_lock_list, pos);
 }
 
 static void locks_stop(struct seq_file *f, void *v)
 {
-	unlock_kernel();
+	unlock_flocks();
 }
 
 static const struct seq_operations locks_seq_operations = {
@@ -2196,14 +2228,14 @@ static const struct seq_operations locks_seq_operations = {
 
 static int locks_open(struct inode *inode, struct file *filp)
 {
-	return seq_open(filp, &locks_seq_operations);
+	return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
 }
 
 static const struct file_operations proc_locks_operations = {
 	.open		= locks_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
 
 static int __init proc_locks_init(void)
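
seq_open_private() replaces the old trick of storing a counter directly in the f->private pointer: each open of /proc/locks now gets its own zeroed loff_t, and seq_release_private() frees it. Usage in miniature (mirrors the hunk above; the fops name is illustrative):

    /* Sketch: per-reader private state for a seq_file. */
    static int sample_open(struct inode *inode, struct file *filp)
    {
            /* allocates and zeroes sizeof(loff_t), reachable via seq->private */
            return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
    }

    static const struct file_operations sample_fops = {
            .open           = sample_open,
            .read           = seq_read,
            .llseek         = seq_lseek,
            .release        = seq_release_private, /* frees what open allocated */
    };
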
@@ -2231,7 +2263,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if (fl->fl_type == F_RDLCK)
@@ -2248,7 +2280,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 		result = 0;
 		break;
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return result;
 }
 
@@ -2271,7 +2303,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_kernel();
+	lock_flocks();
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2286,7 +2318,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 		result = 0;
 		break;
 	}
-	unlock_kernel();
+	unlock_flocks();
 	return result;
 }
 
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9777eb5b5522..409dfd65e9a1 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -569,7 +569,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
 		return -EMLINK;
 
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	inode->i_nlink++;
 	mark_inode_dirty_sync(inode);
 
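
This is the first of several identical conversions in this section: open-coded atomic_inc(&inode->i_count) becomes ihold(), part of the 2.6.37 inode-refcount rework; ihold() additionally warns if the inode was not already referenced. The resulting link-path shape, sketched:

    /* Sketch: taking an extra inode reference for a new hard link. */
    static void bump_for_link(struct inode *inode)
    {
            ihold(inode);                   /* WARNs if i_count was zero */
            inode->i_nlink++;
            mark_inode_dirty_sync(inode);
    }
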
@@ -827,4 +827,5 @@ const struct file_operations logfs_dir_fops = {
 	.unlocked_ioctl	= logfs_ioctl,
 	.readdir	= logfs_readdir,
 	.read		= generic_read_dir,
+	.llseek		= default_llseek,
 };
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f3f3578393a4..c0d35a3accef 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -101,7 +101,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inode_inc_link_count(inode);
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 	return add_nondir(dentry, inode);
 }
 
diff --git a/fs/namei.c b/fs/namei.c
index 24896e833565..f7dbc06857ab 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1121,11 +1121,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 static struct dentry *__lookup_hash(struct qstr *name,
 		struct dentry *base, struct nameidata *nd)
 {
+	struct inode *inode = base->d_inode;
 	struct dentry *dentry;
-	struct inode *inode;
 	int err;
 
-	inode = base->d_inode;
+	err = exec_permission(inode);
+	if (err)
+		return ERR_PTR(err);
 
 	/*
 	 * See if the low-level filesystem might want
@@ -1161,11 +1163,6 @@ out:
  */
 static struct dentry *lookup_hash(struct nameidata *nd)
 {
-	int err;
-
-	err = exec_permission(nd->path.dentry->d_inode);
-	if (err)
-		return ERR_PTR(err);
 	return __lookup_hash(&nd->last, nd->path.dentry, nd);
 }
 
@@ -1213,9 +1210,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	if (err)
 		return ERR_PTR(err);
 
-	err = exec_permission(base->d_inode);
-	if (err)
-		return ERR_PTR(err);
 	return __lookup_hash(&this, base, NULL);
 }
 
@@ -2291,7 +2285,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 		goto slashes;
 	inode = dentry->d_inode;
 	if (inode)
-		atomic_inc(&inode->i_count);
+		ihold(inode);
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto exit2;
diff --git a/fs/namespace.c b/fs/namespace.c
index a72eaabfe8f2..8a415c9c5e55 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -595,7 +595,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 		goto out_free;
 	}
 
-	mnt->mnt_flags = old->mnt_flags;
+	mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
 	atomic_inc(&sb->s_active);
 	mnt->mnt_sb = sb;
 	mnt->mnt_root = dget(root);
@@ -1744,9 +1744,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	lock_kernel();
 	mnt = do_kern_mount(type, flags, name, data);
-	unlock_kernel();
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9578cbe0cd58..aac8832e919e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -95,6 +95,34 @@ const struct dentry_operations ncp_root_dentry_operations =
 };
 
 
+#define ncp_namespace(i)	(NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
+
+static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
+{
+#ifdef CONFIG_NCPFS_SMALLDOS
+	int ns = ncp_namespace(i);
+
+	if ((ns == NW_NS_DOS)
+#ifdef CONFIG_NCPFS_OS2_NS
+	    || ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS))
+#endif /* CONFIG_NCPFS_OS2_NS */
+	   )
+		return 0;
+#endif /* CONFIG_NCPFS_SMALLDOS */
+	return 1;
+}
+
+#define ncp_preserve_case(i)	(ncp_namespace(i) != NW_NS_DOS)
+
+static inline int ncp_case_sensitive(struct dentry *dentry)
+{
+#ifdef CONFIG_NCPFS_NFS_NS
+	return ncp_namespace(dentry->d_inode) == NW_NS_NFS;
+#else
+	return 0;
+#endif /* CONFIG_NCPFS_NFS_NS */
+}
+
 /*
  * Note: leave the hash unchanged if the directory
  * is case-sensitive.
@@ -102,13 +130,12 @@ const struct dentry_operations ncp_root_dentry_operations =
 static int
 ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
 {
-	struct nls_table *t;
-	unsigned long hash;
-	int i;
-
-	t = NCP_IO_TABLE(dentry);
+	if (!ncp_case_sensitive(dentry)) {
+		struct nls_table *t;
+		unsigned long hash;
+		int i;
 
-	if (!ncp_case_sensitive(dentry->d_inode)) {
+		t = NCP_IO_TABLE(dentry);
 		hash = init_name_hash();
 		for (i=0; i<this->len ; i++)
 			hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -124,7 +151,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
 	if (a->len != b->len)
 		return 1;
 
-	if (ncp_case_sensitive(dentry->d_inode))
+	if (ncp_case_sensitive(dentry))
 		return strncmp(a->name, b->name, a->len);
 
 	return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len);
@@ -266,7 +293,7 @@ leave_me:;
 
 
 static int
-__ncp_lookup_validate(struct dentry *dentry)
+ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct ncp_server *server;
 	struct dentry *parent;
@@ -283,9 +310,6 @@ __ncp_lookup_validate(struct dentry *dentry)
 
 	server = NCP_SERVER(dir);
 
-	if (!ncp_conn_valid(server))
-		goto finished;
-
 	/*
 	 * Inspired by smbfs:
 	 * The default validation is based on dentry age:
@@ -304,8 +328,11 @@ __ncp_lookup_validate(struct dentry *dentry)
 	if (ncp_is_server_root(dir)) {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, 1);
-		if (!res)
+		if (!res) {
 			res = ncp_lookup_volume(server, __name, &(finfo.i));
+			if (!res)
+				ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
+		}
 	} else {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -320,13 +347,17 @@ __ncp_lookup_validate(struct dentry *dentry)
 	 * what we remember, it's not valid any more.
 	 */
 	if (!res) {
-		if (finfo.i.dirEntNum == NCP_FINFO(dentry->d_inode)->dirEntNum) {
+		struct inode *inode = dentry->d_inode;
+
+		mutex_lock(&inode->i_mutex);
+		if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
 			ncp_new_dentry(dentry);
 			val=1;
 		} else
 			DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n");
 
-		ncp_update_inode2(dentry->d_inode, &finfo);
+		ncp_update_inode2(inode, &finfo);
+		mutex_unlock(&inode->i_mutex);
 	}
 
 finished:
@@ -335,16 +366,6 @@ finished:
 	return val;
 }
 
-static int
-ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd)
-{
-	int res;
-	lock_kernel();
-	res = __ncp_lookup_validate(dentry);
-	unlock_kernel();
-	return res;
-}
-
 static struct dentry *
 ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 {
@@ -411,8 +432,6 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int result, mtime_valid = 0;
 	time_t mtime = 0;
 
-	lock_kernel();
-
 	ctl.page  = NULL;
 	ctl.cache = NULL;
 
@@ -421,6 +440,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		(int) filp->f_pos);
 
 	result = -EIO;
+	/* Do not generate '.' and '..' when server is dead. */
 	if (!ncp_conn_valid(server))
 		goto out;
 
@@ -532,6 +552,12 @@ read_really:
 	ctl.head.end = ctl.fpos - 1;
 	ctl.head.eof = ctl.valid;
 finished:
+	if (ctl.page) {
+		kunmap(ctl.page);
+		SetPageUptodate(ctl.page);
+		unlock_page(ctl.page);
+		page_cache_release(ctl.page);
+	}
 	if (page) {
 		cache->head = ctl.head;
 		kunmap(page);
@@ -539,23 +565,17 @@ finished:
 		unlock_page(page);
 		page_cache_release(page);
 	}
-	if (ctl.page) {
-		kunmap(ctl.page);
-		SetPageUptodate(ctl.page);
-		unlock_page(ctl.page);
-		page_cache_release(ctl.page);
-	}
 out:
-	unlock_kernel();
 	return result;
 }
 
 static int
 ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	       struct ncp_cache_control *ctrl, struct ncp_entry_info *entry)
+	       struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
+	       int inval_childs)
 {
 	struct dentry *newdent, *dentry = filp->f_path.dentry;
-	struct inode *newino, *inode = dentry->d_inode;
+	struct inode *dir = dentry->d_inode;
 	struct ncp_cache_control ctl = *ctrl;
 	struct qstr qname;
 	int valid = 0;
@@ -564,9 +584,9 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
 	qname.len = sizeof(__name);
-	if (ncp_vol2io(NCP_SERVER(inode), __name, &qname.len,
+	if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len,
 			entry->i.entryName, entry->i.nameLen,
-			!ncp_preserve_entry_case(inode, entry->i.NSCreator)))
+			!ncp_preserve_entry_case(dir, entry->i.NSCreator)))
 		return 1; /* I'm not sure */
 
 	qname.name = __name;
@@ -584,22 +604,64 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 		goto end_advance;
 	} else {
 		hashed = 1;
-		memcpy((char *) newdent->d_name.name, qname.name,
-						newdent->d_name.len);
+
+		/* If case sensitivity changed for this volume, all entries below this one
+		   should be thrown away. This entry itself is not affected, as its case
+		   sensitivity is controlled by its own parent. */
+		if (inval_childs)
+			shrink_dcache_parent(newdent);
+
+		/*
+		 * It is not as dangerous as it looks. NetWare's OS2 namespace is
+		 * case preserving yet case insensitive. So we update dentry's name
+		 * as received from server. We found dentry via d_lookup with our
+		 * hash, so we know that hash does not change, and so replacing name
+		 * should be reasonably safe.
+		 */
+		if (qname.len == newdent->d_name.len &&
+		    memcmp(newdent->d_name.name, qname.name, newdent->d_name.len)) {
+			struct inode *inode = newdent->d_inode;
+
+			/*
+			 * Inside ncpfs all uses of d_name are either for debugging,
+			 * or on functions which acquire inode mutex (mknod, creat,
+			 * lookup). So grab i_mutex here, to be sure. d_path
+			 * uses dcache_lock when generating path, so we should too.
+			 * And finally d_compare is protected by dentry's d_lock, so
+			 * here we go.
+			 */
+			if (inode)
+				mutex_lock(&inode->i_mutex);
+			spin_lock(&dcache_lock);
+			spin_lock(&newdent->d_lock);
+			memcpy((char *) newdent->d_name.name, qname.name,
+			       newdent->d_name.len);
+			spin_unlock(&newdent->d_lock);
+			spin_unlock(&dcache_lock);
+			if (inode)
+				mutex_unlock(&inode->i_mutex);
+		}
 	}
 
 	if (!newdent->d_inode) {
+		struct inode *inode;
+
 		entry->opened = 0;
-		entry->ino = iunique(inode->i_sb, 2);
-		newino = ncp_iget(inode->i_sb, entry);
-		if (newino) {
+		entry->ino = iunique(dir->i_sb, 2);
+		inode = ncp_iget(dir->i_sb, entry);
+		if (inode) {
 			newdent->d_op = &ncp_dentry_operations;
-			d_instantiate(newdent, newino);
+			d_instantiate(newdent, inode);
 			if (!hashed)
 				d_rehash(newdent);
 		}
-	} else
-		ncp_update_inode2(newdent->d_inode, entry);
+	} else {
+		struct inode *inode = newdent->d_inode;
+
+		mutex_lock(&inode->i_mutex);
+		ncp_update_inode2(inode, entry);
+		mutex_unlock(&inode->i_mutex);
+	}
 
 	if (newdent->d_inode) {
 		ino = newdent->d_inode->i_ino;
@@ -617,7 +679,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 			ctl.cache = NULL;
 			ctl.idx  -= NCP_DIRCACHE_SIZE;
 			ctl.ofs  += 1;
-			ctl.page  = grab_cache_page(&inode->i_data, ctl.ofs);
+			ctl.page  = grab_cache_page(&dir->i_data, ctl.ofs);
 			if (ctl.page)
 				ctl.cache = kmap(ctl.page);
 		}
@@ -633,7 +695,7 @@ end_advance:
 	if (!ino)
 		ino = find_inode_number(dentry, &qname);
 	if (!ino)
-		ino = iunique(inode->i_sb, 2);
+		ino = iunique(dir->i_sb, 2);
 	ctl.filled = filldir(dirent, qname.name, qname.len,
 			     filp->f_pos, ino, DT_UNKNOWN);
 	if (!ctl.filled)
@@ -660,6 +722,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
 		(unsigned long) filp->f_pos);
 
 	for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
+		int inval_dentry;
 
 		if (ncp_get_volume_info_with_number(server, i, &info) != 0)
 			return;
@@ -675,8 +738,9 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
 				info.volume_name);
 			continue;
 		}
+		inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
 		entry.volume = entry.i.volNumber;
-		if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry))
+		if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry))
 			return;
 	}
 }
@@ -739,7 +803,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
 			rpl += onerpl;
 			rpls -= onerpl;
 			entry.volume = entry.i.volNumber;
-			if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry))
+			if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0))
 				break;
 		}
 	} while (more);
@@ -775,17 +839,19 @@ int ncp_conn_logged_in(struct super_block *sb)
 		if (dent) {
 			struct inode* ino = dent->d_inode;
 			if (ino) {
+				ncp_update_known_namespace(server, volNumber, NULL);
 				NCP_FINFO(ino)->volNumber = volNumber;
 				NCP_FINFO(ino)->dirEntNum = dirEntNum;
 				NCP_FINFO(ino)->DosDirNum = DosDirNum;
+				result = 0;
 			} else {
 				DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n");
 			}
 		} else {
 			DPRINTK("ncpfs: sb->s_root == NULL!\n");
 		}
-	}
-	result = 0;
+	} else
+		result = 0;
 
 out:
 	return result;
@@ -799,7 +865,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 	int error, res, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
-	lock_kernel();
 	error = -EIO;
 	if (!ncp_conn_valid(server))
 		goto finished;
@@ -813,6 +878,8 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 				dentry->d_name.len, 1);
 		if (!res)
 			res = ncp_lookup_volume(server, __name, &(finfo.i));
+		if (!res)
+			ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
 	} else {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -846,7 +913,6 @@ add_entry:
 
 finished:
 	PPRINTK("ncp_lookup: result=%d\n", error);
-	unlock_kernel();
 	return ERR_PTR(error);
 }
 
@@ -887,11 +953,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 	PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name, mode);
 
-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -917,6 +978,8 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 	if (result) {
 		if (result == 0x87)
 			error = -ENAMETOOLONG;
+		else if (result < 0)
+			error = result;
 		DPRINTK("ncp_create: %s/%s failed\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name);
 		goto out;
@@ -935,7 +998,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
 
 	error = ncp_instantiate(dir, dentry, &finfo);
 out:
-	unlock_kernel();
 	return error;
 }
 
@@ -955,11 +1017,6 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	DPRINTK("ncp_mkdir: making %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -967,12 +1024,11 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (error)
 		goto out;
 
-	error = -EACCES;
-	if (ncp_open_create_file_or_subdir(server, dir, __name,
+	error = ncp_open_create_file_or_subdir(server, dir, __name,
 				OC_MODE_CREATE, aDIR,
 				cpu_to_le16(0xffff),
-				&finfo) == 0)
-	{
+				&finfo);
+	if (error == 0) {
 		if (ncp_is_nfs_extras(server, finfo.volume)) {
 			mode |= S_IFDIR;
 			finfo.i.nfs.mode = mode;
@@ -983,9 +1039,10 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 			goto out;
 		}
 		error = ncp_instantiate(dir, dentry, &finfo);
+	} else if (error > 0) {
+		error = -EACCES;
 	}
 out:
-	unlock_kernel();
 	return error;
 }
 
@@ -998,11 +1055,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 	DPRINTK("ncp_rmdir: removing %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	error = -EBUSY;
 	if (!d_unhashed(dentry))
 		goto out;
@@ -1036,11 +1088,10 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 		error = -ENOENT;
 		break;
 	default:
-		error = -EACCES;
+		error = result < 0 ? result : -EACCES;
 		break;
 	}
 out:
-	unlock_kernel();
 	return error;
 }
 
@@ -1050,15 +1101,10 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 	struct ncp_server *server;
 	int error;
 
-	lock_kernel();
 	server = NCP_SERVER(dir);
 	DPRINTK("ncp_unlink: unlinking %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	error = -EIO;
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	/*
 	 * Check whether to close the file ...
 	 */
@@ -1097,12 +1143,9 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 		error = -ENOENT;
 		break;
 	default:
-		error = -EACCES;
+		error = error < 0 ? error : -EACCES;
 		break;
 	}
-
-out:
-	unlock_kernel();
 	return error;
 }
 
@@ -1118,11 +1161,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 		old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 		new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
 
-	error = -EIO;
-	lock_kernel();
-	if (!ncp_conn_valid(server))
-		goto out;
-
 	ncp_age_dentry(server, old_dentry);
 	ncp_age_dentry(server, new_dentry);
 
@@ -1161,11 +1199,10 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 		error = -ENOENT;
 		break;
 	default:
-		error = -EACCES;
+		error = error < 0 ? error : -EACCES;
 		break;
 	}
 out:
-	unlock_kernel();
 	return error;
 }
 
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 3639cc5cbdae..6c754f70c529 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -113,9 +113,6 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	DPRINTK("ncp_file_read: enter %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	if (!ncp_conn_valid(NCP_SERVER(inode)))
-		return -EIO;
-
 	pos = *ppos;
 
 	if ((ssize_t) count < 0) {
@@ -192,13 +189,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 
 	DPRINTK("ncp_file_write: enter %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
-	if (!ncp_conn_valid(NCP_SERVER(inode)))
-		return -EIO;
 	if ((ssize_t) count < 0)
 		return -EINVAL;
 	pos = *ppos;
 	if (file->f_flags & O_APPEND) {
-		pos = inode->i_size;
+		pos = i_size_read(inode);
 	}
 
 	if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
@@ -264,8 +259,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 
 	*ppos = pos;
 
-	if (pos > inode->i_size) {
-		inode->i_size = pos;
+	if (pos > i_size_read(inode)) {
+		mutex_lock(&inode->i_mutex);
+		if (pos > i_size_read(inode))
+			i_size_write(inode, pos);
+		mutex_unlock(&inode->i_mutex);
 	}
 	DPRINTK("ncp_file_write: exit %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
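
The ncpfs conversions in this file swap bare inode->i_size accesses for i_size_read()/i_size_write(), the seqcount-based protocol that makes 64-bit file sizes safe to read locklessly on 32-bit SMP; i_size_write() itself still requires a serialized writer, which is what the i_mutex double-check above supplies. The idiom in general form:

    /* Sketch: grow i_size safely; writers must be serialized by i_mutex. */
    static void maybe_extend(struct inode *inode, loff_t pos)
    {
            if (pos > i_size_read(inode)) {         /* cheap lockless check */
                    mutex_lock(&inode->i_mutex);
                    if (pos > i_size_read(inode))   /* re-check under the mutex */
                            i_size_write(inode, pos);
                    mutex_unlock(&inode->i_mutex);
            }
    }
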
@@ -281,18 +279,9 @@ static int ncp_release(struct inode *inode, struct file *file) {
 	return 0;
 }
 
-static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
-{
-	loff_t ret;
-	lock_kernel();
-	ret = generic_file_llseek_unlocked(file, offset, origin);
-	unlock_kernel();
-	return ret;
-}
-
 const struct file_operations ncp_file_operations =
 {
-	.llseek		= ncp_remote_llseek,
+	.llseek		= generic_file_llseek,
 	.read		= ncp_file_read,
 	.write		= ncp_file_write,
 	.unlocked_ioctl	= ncp_ioctl,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index b4de38cf49f5..985fabb26aca 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -139,7 +139,7 @@ static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
 		inode->i_mode = nwi->nfs.mode;
 	}
 
-	inode->i_blocks = (inode->i_size + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
+	inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
 
 	inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate);
 	inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate);
@@ -158,18 +158,21 @@ static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo)
158 inode->i_mode = server->m.dir_mode; 158 inode->i_mode = server->m.dir_mode;
159 /* for directories dataStreamSize seems to be some 159 /* for directories dataStreamSize seems to be some
160 Object ID ??? */ 160 Object ID ??? */
161 inode->i_size = NCP_BLOCK_SIZE; 161 i_size_write(inode, NCP_BLOCK_SIZE);
162 } else { 162 } else {
163 u32 size;
164
163 inode->i_mode = server->m.file_mode; 165 inode->i_mode = server->m.file_mode;
164 inode->i_size = le32_to_cpu(nwi->dataStreamSize); 166 size = le32_to_cpu(nwi->dataStreamSize);
167 i_size_write(inode, size);
165#ifdef CONFIG_NCPFS_EXTRAS 168#ifdef CONFIG_NCPFS_EXTRAS
166 if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS)) 169 if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS))
167 && (nwi->attributes & aSHARED)) { 170 && (nwi->attributes & aSHARED)) {
168 switch (nwi->attributes & (aHIDDEN|aSYSTEM)) { 171 switch (nwi->attributes & (aHIDDEN|aSYSTEM)) {
169 case aHIDDEN: 172 case aHIDDEN:
170 if (server->m.flags & NCP_MOUNT_SYMLINKS) { 173 if (server->m.flags & NCP_MOUNT_SYMLINKS) {
171 if (/* (inode->i_size >= NCP_MIN_SYMLINK_SIZE) 174 if (/* (size >= NCP_MIN_SYMLINK_SIZE)
172 && */ (inode->i_size <= NCP_MAX_SYMLINK_SIZE)) { 175 && */ (size <= NCP_MAX_SYMLINK_SIZE)) {
173 inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK; 176 inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK;
174 NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK; 177 NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK;
175 break; 178 break;
@@ -208,7 +211,7 @@ void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo)
208} 211}
209 212
210/* 213/*
211 * Fill in the inode based on the ncp_entry_info structure. 214 * Fill in the inode based on the ncp_entry_info structure. Used only for brand new inodes.
212 */ 215 */
213static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) 216static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
214{ 217{
@@ -254,6 +257,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
254 if (inode) { 257 if (inode) {
255 atomic_set(&NCP_FINFO(inode)->opened, info->opened); 258 atomic_set(&NCP_FINFO(inode)->opened, info->opened);
256 259
260 inode->i_mapping->backing_dev_info = sb->s_bdi;
257 inode->i_ino = info->ino; 261 inode->i_ino = info->ino;
258 ncp_set_attr(inode, info); 262 ncp_set_attr(inode, info);
259 if (S_ISREG(inode->i_mode)) { 263 if (S_ISREG(inode->i_mode)) {
@@ -299,10 +303,12 @@ ncp_evict_inode(struct inode *inode)
299 303
300static void ncp_stop_tasks(struct ncp_server *server) { 304static void ncp_stop_tasks(struct ncp_server *server) {
301 struct sock* sk = server->ncp_sock->sk; 305 struct sock* sk = server->ncp_sock->sk;
302 306
307 lock_sock(sk);
303 sk->sk_error_report = server->error_report; 308 sk->sk_error_report = server->error_report;
304 sk->sk_data_ready = server->data_ready; 309 sk->sk_data_ready = server->data_ready;
305 sk->sk_write_space = server->write_space; 310 sk->sk_write_space = server->write_space;
311 release_sock(sk);
306 del_timer_sync(&server->timeout_tm); 312 del_timer_sync(&server->timeout_tm);
307 flush_scheduled_work(); 313 flush_scheduled_work();
308} 314}
@@ -565,10 +571,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
565/* server->conn_status = 0; */ 571/* server->conn_status = 0; */
566/* server->root_dentry = NULL; */ 572/* server->root_dentry = NULL; */
567/* server->root_setuped = 0; */ 573/* server->root_setuped = 0; */
574 mutex_init(&server->root_setup_lock);
568#ifdef CONFIG_NCPFS_PACKET_SIGNING 575#ifdef CONFIG_NCPFS_PACKET_SIGNING
569/* server->sign_wanted = 0; */ 576/* server->sign_wanted = 0; */
570/* server->sign_active = 0; */ 577/* server->sign_active = 0; */
571#endif 578#endif
579 init_rwsem(&server->auth_rwsem);
572 server->auth.auth_type = NCP_AUTH_NONE; 580 server->auth.auth_type = NCP_AUTH_NONE;
573/* server->auth.object_name_len = 0; */ 581/* server->auth.object_name_len = 0; */
574/* server->auth.object_name = NULL; */ 582/* server->auth.object_name = NULL; */
@@ -593,16 +601,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
593 server->nls_io = load_nls_default(); 601 server->nls_io = load_nls_default();
594#endif /* CONFIG_NCPFS_NLS */ 602#endif /* CONFIG_NCPFS_NLS */
595 603
596 server->dentry_ttl = 0; /* no caching */ 604 atomic_set(&server->dentry_ttl, 0); /* no caching */
597 605
598 INIT_LIST_HEAD(&server->tx.requests); 606 INIT_LIST_HEAD(&server->tx.requests);
599 mutex_init(&server->rcv.creq_mutex); 607 mutex_init(&server->rcv.creq_mutex);
600 server->tx.creq = NULL; 608 server->tx.creq = NULL;
601 server->rcv.creq = NULL; 609 server->rcv.creq = NULL;
602 server->data_ready = sock->sk->sk_data_ready;
603 server->write_space = sock->sk->sk_write_space;
604 server->error_report = sock->sk->sk_error_report;
605 sock->sk->sk_user_data = server;
606 610
607 init_timer(&server->timeout_tm); 611 init_timer(&server->timeout_tm);
608#undef NCP_PACKET_SIZE 612#undef NCP_PACKET_SIZE
@@ -619,6 +623,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
619 if (server->rxbuf == NULL) 623 if (server->rxbuf == NULL)
620 goto out_txbuf; 624 goto out_txbuf;
621 625
626 lock_sock(sock->sk);
627 server->data_ready = sock->sk->sk_data_ready;
628 server->write_space = sock->sk->sk_write_space;
629 server->error_report = sock->sk->sk_error_report;
630 sock->sk->sk_user_data = server;
622 sock->sk->sk_data_ready = ncp_tcp_data_ready; 631 sock->sk->sk_data_ready = ncp_tcp_data_ready;
623 sock->sk->sk_error_report = ncp_tcp_error_report; 632 sock->sk->sk_error_report = ncp_tcp_error_report;
624 if (sock->type == SOCK_STREAM) { 633 if (sock->type == SOCK_STREAM) {
@@ -634,6 +643,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
634 server->timeout_tm.data = (unsigned long)server; 643 server->timeout_tm.data = (unsigned long)server;
635 server->timeout_tm.function = ncpdgram_timeout_call; 644 server->timeout_tm.function = ncpdgram_timeout_call;
636 } 645 }
646 release_sock(sock->sk);
637 647
638 ncp_lock_server(server); 648 ncp_lock_server(server);
639 error = ncp_connect(server); 649 error = ncp_connect(server);
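Both here and in ncp_stop_tasks() above, the socket callbacks are now swapped only while holding the socket lock. A hypothetical sketch of that pattern (2.6.36-era sk_data_ready signature; the names are made up):

    #include <net/sock.h>

    struct saved_callbacks {
            void (*data_ready)(struct sock *sk, int bytes);
            void (*error_report)(struct sock *sk);
    };

    /* Save the old callbacks and install replacements atomically with
     * respect to the socket's own users; restoring works the same way. */
    static void install_callbacks(struct sock *sk, struct saved_callbacks *old,
                                  void (*data_ready)(struct sock *, int),
                                  void (*error_report)(struct sock *))
    {
            lock_sock(sk);
            old->data_ready = sk->sk_data_ready;
            old->error_report = sk->sk_error_report;
            sk->sk_data_ready = data_ready;
            sk->sk_error_report = error_report;
            release_sock(sk);
    }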
@@ -658,8 +668,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
658 goto out_disconnect; 668 goto out_disconnect;
659 } 669 }
660 } 670 }
671 ncp_lock_server(server);
661 if (options & 2) 672 if (options & 2)
662 server->sign_wanted = 1; 673 server->sign_wanted = 1;
674 ncp_unlock_server(server);
663 } 675 }
664 else 676 else
665#endif /* CONFIG_NCPFS_PACKET_SIGNING */ 677#endif /* CONFIG_NCPFS_PACKET_SIGNING */
@@ -720,6 +732,9 @@ out_nls:
720 unload_nls(server->nls_io); 732 unload_nls(server->nls_io);
721 unload_nls(server->nls_vol); 733 unload_nls(server->nls_vol);
722#endif 734#endif
735 mutex_destroy(&server->rcv.creq_mutex);
736 mutex_destroy(&server->root_setup_lock);
737 mutex_destroy(&server->mutex);
723out_fput2: 738out_fput2:
724 if (server->info_filp) 739 if (server->info_filp)
725 fput(server->info_filp); 740 fput(server->info_filp);
@@ -743,8 +758,6 @@ static void ncp_put_super(struct super_block *sb)
743{ 758{
744 struct ncp_server *server = NCP_SBP(sb); 759 struct ncp_server *server = NCP_SBP(sb);
745 760
746 lock_kernel();
747
748 ncp_lock_server(server); 761 ncp_lock_server(server);
749 ncp_disconnect(server); 762 ncp_disconnect(server);
750 ncp_unlock_server(server); 763 ncp_unlock_server(server);
@@ -756,6 +769,9 @@ static void ncp_put_super(struct super_block *sb)
756 unload_nls(server->nls_vol); 769 unload_nls(server->nls_vol);
757 unload_nls(server->nls_io); 770 unload_nls(server->nls_io);
758#endif /* CONFIG_NCPFS_NLS */ 771#endif /* CONFIG_NCPFS_NLS */
772 mutex_destroy(&server->rcv.creq_mutex);
773 mutex_destroy(&server->root_setup_lock);
774 mutex_destroy(&server->mutex);
759 775
760 if (server->info_filp) 776 if (server->info_filp)
761 fput(server->info_filp); 777 fput(server->info_filp);
@@ -771,8 +787,6 @@ static void ncp_put_super(struct super_block *sb)
771 vfree(server->packet); 787 vfree(server->packet);
772 sb->s_fs_info = NULL; 788 sb->s_fs_info = NULL;
773 kfree(server); 789 kfree(server);
774
775 unlock_kernel();
776} 790}
777 791
778static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) 792static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -851,10 +865,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
851 865
852 result = -EIO; 866 result = -EIO;
853 867
854 lock_kernel();
855
856 server = NCP_SERVER(inode); 868 server = NCP_SERVER(inode);
857 if ((!server) || !ncp_conn_valid(server)) 869 if (!server) /* How could this happen? */
858 goto out; 870 goto out;
859 871
860 /* ageing the dentry to force validation */ 872 /* ageing the dentry to force validation */
@@ -981,8 +993,6 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
981 result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode), 993 result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode),
982 inode, info_mask, &info); 994 inode, info_mask, &info);
983 if (result != 0) { 995 if (result != 0) {
984 result = -EACCES;
985
986 if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) { 996 if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) {
987 /* NetWare seems not to allow this. I 997 /* NetWare seems not to allow this. I
988 do not know why. So, just tell the 998 do not know why. So, just tell the
@@ -1005,7 +1015,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
1005 mark_inode_dirty(inode); 1015 mark_inode_dirty(inode);
1006 1016
1007out: 1017out:
1008 unlock_kernel(); 1018 if (result > 0)
1019 result = -EACCES;
1009 return result; 1020 return result;
1010} 1021}
1011 1022
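The new exit code above folds any leftover positive value into -EACCES. A hypothetical helper showing the convention (not part of the patch):

    /*
     * ncpfs request functions return 0 on success, a negative errno on
     * transport failure, or a positive NetWare completion code when the
     * server refuses the operation; the positive codes have no errno
     * mapping, so they degrade to -EACCES at the syscall boundary.
     */
    static int ncp_result_to_errno(int result)
    {
            if (result > 0)         /* NetWare completion code */
                    return -EACCES;
            return result;          /* 0 or -errno, pass through */
    }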
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 84a8cfc4e38e..c2a1f9a155c3 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -35,16 +35,11 @@
35#define NCP_PACKET_SIZE_INTERNAL 65536 35#define NCP_PACKET_SIZE_INTERNAL 65536
36 36
37static int 37static int
38ncp_get_fs_info(struct ncp_server * server, struct file *file, 38ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
39 struct ncp_fs_info __user *arg) 39 struct ncp_fs_info __user *arg)
40{ 40{
41 struct inode *inode = file->f_path.dentry->d_inode;
42 struct ncp_fs_info info; 41 struct ncp_fs_info info;
43 42
44 if (file_permission(file, MAY_WRITE) != 0
45 && current_uid() != server->m.mounted_uid)
46 return -EACCES;
47
48 if (copy_from_user(&info, arg, sizeof(info))) 43 if (copy_from_user(&info, arg, sizeof(info)))
49 return -EFAULT; 44 return -EFAULT;
50 45
@@ -65,16 +60,11 @@ ncp_get_fs_info(struct ncp_server * server, struct file *file,
65} 60}
66 61
67static int 62static int
68ncp_get_fs_info_v2(struct ncp_server * server, struct file *file, 63ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
69 struct ncp_fs_info_v2 __user * arg) 64 struct ncp_fs_info_v2 __user * arg)
70{ 65{
71 struct inode *inode = file->f_path.dentry->d_inode;
72 struct ncp_fs_info_v2 info2; 66 struct ncp_fs_info_v2 info2;
73 67
74 if (file_permission(file, MAY_WRITE) != 0
75 && current_uid() != server->m.mounted_uid)
76 return -EACCES;
77
78 if (copy_from_user(&info2, arg, sizeof(info2))) 68 if (copy_from_user(&info2, arg, sizeof(info2)))
79 return -EFAULT; 69 return -EFAULT;
80 70
@@ -136,16 +126,11 @@ struct compat_ncp_privatedata_ioctl
136#define NCP_IOC_SETPRIVATEDATA_32 _IOR('n', 10, struct compat_ncp_privatedata_ioctl) 126#define NCP_IOC_SETPRIVATEDATA_32 _IOR('n', 10, struct compat_ncp_privatedata_ioctl)
137 127
138static int 128static int
139ncp_get_compat_fs_info_v2(struct ncp_server * server, struct file *file, 129ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
140 struct compat_ncp_fs_info_v2 __user * arg) 130 struct compat_ncp_fs_info_v2 __user * arg)
141{ 131{
142 struct inode *inode = file->f_path.dentry->d_inode;
143 struct compat_ncp_fs_info_v2 info2; 132 struct compat_ncp_fs_info_v2 info2;
144 133
145 if (file_permission(file, MAY_WRITE) != 0
146 && current_uid() != server->m.mounted_uid)
147 return -EACCES;
148
149 if (copy_from_user(&info2, arg, sizeof(info2))) 134 if (copy_from_user(&info2, arg, sizeof(info2)))
150 return -EFAULT; 135 return -EFAULT;
151 136
@@ -182,11 +167,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
182 struct nls_table *iocharset; 167 struct nls_table *iocharset;
183 struct nls_table *oldset_io; 168 struct nls_table *oldset_io;
184 struct nls_table *oldset_cp; 169 struct nls_table *oldset_cp;
185 170 int utf8;
186 if (!capable(CAP_SYS_ADMIN)) 171 int err;
187 return -EACCES;
188 if (server->root_setuped)
189 return -EBUSY;
190 172
191 if (copy_from_user(&user, arg, sizeof(user))) 173 if (copy_from_user(&user, arg, sizeof(user)))
192 return -EFAULT; 174 return -EFAULT;
@@ -206,28 +188,40 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
206 user.iocharset[NCP_IOCSNAME_LEN] = 0; 188 user.iocharset[NCP_IOCSNAME_LEN] = 0;
207 if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) { 189 if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) {
208 iocharset = load_nls_default(); 190 iocharset = load_nls_default();
209 NCP_CLR_FLAG(server, NCP_FLAG_UTF8); 191 utf8 = 0;
210 } else if (!strcmp(user.iocharset, "utf8")) { 192 } else if (!strcmp(user.iocharset, "utf8")) {
211 iocharset = load_nls_default(); 193 iocharset = load_nls_default();
212 NCP_SET_FLAG(server, NCP_FLAG_UTF8); 194 utf8 = 1;
213 } else { 195 } else {
214 iocharset = load_nls(user.iocharset); 196 iocharset = load_nls(user.iocharset);
215 if (!iocharset) { 197 if (!iocharset) {
216 unload_nls(codepage); 198 unload_nls(codepage);
217 return -EBADRQC; 199 return -EBADRQC;
218 } 200 }
219 NCP_CLR_FLAG(server, NCP_FLAG_UTF8); 201 utf8 = 0;
220 } 202 }
221 203
222 oldset_cp = server->nls_vol; 204 mutex_lock(&server->root_setup_lock);
223 server->nls_vol = codepage; 205 if (server->root_setuped) {
224 oldset_io = server->nls_io; 206 oldset_cp = codepage;
225 server->nls_io = iocharset; 207 oldset_io = iocharset;
226 208 err = -EBUSY;
209 } else {
210 if (utf8)
211 NCP_SET_FLAG(server, NCP_FLAG_UTF8);
212 else
213 NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
214 oldset_cp = server->nls_vol;
215 server->nls_vol = codepage;
216 oldset_io = server->nls_io;
217 server->nls_io = iocharset;
218 err = 0;
219 }
220 mutex_unlock(&server->root_setup_lock);
227 unload_nls(oldset_cp); 221 unload_nls(oldset_cp);
228 unload_nls(oldset_io); 222 unload_nls(oldset_io);
229 223
230 return 0; 224 return err;
231} 225}
232 226
233static int 227static int
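The reworked ncp_set_charsets() above follows a common locking shape: do the sleeping work (load_nls()) before taking the lock, publish under the lock, free afterwards. A sketch assuming the ncp_fs_sb.h definitions, with a made-up function name:

    #include <linux/nls.h>

    static int publish_nls(struct ncp_server *server,
                           struct nls_table *vol, struct nls_table *io)
    {
            struct nls_table *old_vol, *old_io;
            int err = 0;

            mutex_lock(&server->root_setup_lock);
            if (server->root_setuped) {
                    old_vol = vol;          /* too late: discard new tables */
                    old_io = io;
                    err = -EBUSY;
            } else {
                    old_vol = server->nls_vol;
                    server->nls_vol = vol;
                    old_io = server->nls_io;
                    server->nls_io = io;
            }
            mutex_unlock(&server->root_setup_lock);
            unload_nls(old_vol);            /* may sleep; lock already dropped */
            unload_nls(old_io);
            return err;
    }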
@@ -237,6 +231,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
237 int len; 231 int len;
238 232
239 memset(&user, 0, sizeof(user)); 233 memset(&user, 0, sizeof(user));
234 mutex_lock(&server->root_setup_lock);
240 if (server->nls_vol && server->nls_vol->charset) { 235 if (server->nls_vol && server->nls_vol->charset) {
241 len = strlen(server->nls_vol->charset); 236 len = strlen(server->nls_vol->charset);
242 if (len > NCP_IOCSNAME_LEN) 237 if (len > NCP_IOCSNAME_LEN)
@@ -254,6 +249,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
254 strncpy(user.iocharset, server->nls_io->charset, len); 249 strncpy(user.iocharset, server->nls_io->charset, len);
255 user.iocharset[len] = 0; 250 user.iocharset[len] = 0;
256 } 251 }
252 mutex_unlock(&server->root_setup_lock);
257 253
258 if (copy_to_user(arg, &user, sizeof(user))) 254 if (copy_to_user(arg, &user, sizeof(user)))
259 return -EFAULT; 255 return -EFAULT;
@@ -261,25 +257,19 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
261} 257}
262#endif /* CONFIG_NCPFS_NLS */ 258#endif /* CONFIG_NCPFS_NLS */
263 259
264static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 260static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg)
265{ 261{
266 struct inode *inode = filp->f_dentry->d_inode;
267 struct ncp_server *server = NCP_SERVER(inode); 262 struct ncp_server *server = NCP_SERVER(inode);
268 int result; 263 int result;
269 struct ncp_ioctl_request request; 264 struct ncp_ioctl_request request;
270 char* bouncebuffer; 265 char* bouncebuffer;
271 void __user *argp = (void __user *)arg; 266 void __user *argp = (void __user *)arg;
272 uid_t uid = current_uid();
273 267
274 switch (cmd) { 268 switch (cmd) {
275#ifdef CONFIG_COMPAT 269#ifdef CONFIG_COMPAT
276 case NCP_IOC_NCPREQUEST_32: 270 case NCP_IOC_NCPREQUEST_32:
277#endif 271#endif
278 case NCP_IOC_NCPREQUEST: 272 case NCP_IOC_NCPREQUEST:
279 if (file_permission(filp, MAY_WRITE) != 0
280 && uid != server->m.mounted_uid)
281 return -EACCES;
282
283#ifdef CONFIG_COMPAT 273#ifdef CONFIG_COMPAT
284 if (cmd == NCP_IOC_NCPREQUEST_32) { 274 if (cmd == NCP_IOC_NCPREQUEST_32) {
285 struct compat_ncp_ioctl_request request32; 275 struct compat_ncp_ioctl_request request32;
@@ -314,7 +304,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
314 server->current_size = request.size; 304 server->current_size = request.size;
315 memcpy(server->packet, bouncebuffer, request.size); 305 memcpy(server->packet, bouncebuffer, request.size);
316 306
317 result = ncp_request2(server, request.function, 307 result = ncp_request2(server, request.function,
318 bouncebuffer, NCP_PACKET_SIZE_INTERNAL); 308 bouncebuffer, NCP_PACKET_SIZE_INTERNAL);
319 if (result < 0) 309 if (result < 0)
320 result = -EIO; 310 result = -EIO;
@@ -331,69 +321,69 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
331 321
332 case NCP_IOC_CONN_LOGGED_IN: 322 case NCP_IOC_CONN_LOGGED_IN:
333 323
334 if (!capable(CAP_SYS_ADMIN))
335 return -EACCES;
336 if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE)) 324 if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE))
337 return -EINVAL; 325 return -EINVAL;
326 mutex_lock(&server->root_setup_lock);
338 if (server->root_setuped) 327 if (server->root_setuped)
339 return -EBUSY; 328 result = -EBUSY;
340 server->root_setuped = 1; 329 else {
341 return ncp_conn_logged_in(inode->i_sb); 330 result = ncp_conn_logged_in(inode->i_sb);
331 if (result == 0)
332 server->root_setuped = 1;
333 }
334 mutex_unlock(&server->root_setup_lock);
335 return result;
342 336
343 case NCP_IOC_GET_FS_INFO: 337 case NCP_IOC_GET_FS_INFO:
344 return ncp_get_fs_info(server, filp, argp); 338 return ncp_get_fs_info(server, inode, argp);
345 339
346 case NCP_IOC_GET_FS_INFO_V2: 340 case NCP_IOC_GET_FS_INFO_V2:
347 return ncp_get_fs_info_v2(server, filp, argp); 341 return ncp_get_fs_info_v2(server, inode, argp);
348 342
349#ifdef CONFIG_COMPAT 343#ifdef CONFIG_COMPAT
350 case NCP_IOC_GET_FS_INFO_V2_32: 344 case NCP_IOC_GET_FS_INFO_V2_32:
351 return ncp_get_compat_fs_info_v2(server, filp, argp); 345 return ncp_get_compat_fs_info_v2(server, inode, argp);
352#endif 346#endif
353 /* we have too many combinations of CONFIG_COMPAT, 347 /* we have too many combinations of CONFIG_COMPAT,
354 * CONFIG_64BIT and CONFIG_UID16, so just handle 348 * CONFIG_64BIT and CONFIG_UID16, so just handle
355 * any of the possible ioctls */ 349 * any of the possible ioctls */
356 case NCP_IOC_GETMOUNTUID16: 350 case NCP_IOC_GETMOUNTUID16:
357 case NCP_IOC_GETMOUNTUID32: 351 {
358 case NCP_IOC_GETMOUNTUID64:
359 if (file_permission(filp, MAY_READ) != 0
360 && uid != server->m.mounted_uid)
361 return -EACCES;
362
363 if (cmd == NCP_IOC_GETMOUNTUID16) {
364 u16 uid; 352 u16 uid;
353
365 SET_UID(uid, server->m.mounted_uid); 354 SET_UID(uid, server->m.mounted_uid);
366 if (put_user(uid, (u16 __user *)argp)) 355 if (put_user(uid, (u16 __user *)argp))
367 return -EFAULT; 356 return -EFAULT;
368 } else if (cmd == NCP_IOC_GETMOUNTUID32) { 357 return 0;
369 if (put_user(server->m.mounted_uid,
370 (u32 __user *)argp))
371 return -EFAULT;
372 } else {
373 if (put_user(server->m.mounted_uid,
374 (u64 __user *)argp))
375 return -EFAULT;
376 } 358 }
359 case NCP_IOC_GETMOUNTUID32:
360 if (put_user(server->m.mounted_uid,
361 (u32 __user *)argp))
362 return -EFAULT;
363 return 0;
364 case NCP_IOC_GETMOUNTUID64:
365 if (put_user(server->m.mounted_uid,
366 (u64 __user *)argp))
367 return -EFAULT;
377 return 0; 368 return 0;
378 369
379 case NCP_IOC_GETROOT: 370 case NCP_IOC_GETROOT:
380 { 371 {
381 struct ncp_setroot_ioctl sr; 372 struct ncp_setroot_ioctl sr;
382 373
383 if (file_permission(filp, MAY_READ) != 0 374 result = -EACCES;
384 && uid != server->m.mounted_uid) 375 mutex_lock(&server->root_setup_lock);
385 return -EACCES;
386
387 if (server->m.mounted_vol[0]) { 376 if (server->m.mounted_vol[0]) {
388 struct dentry* dentry = inode->i_sb->s_root; 377 struct dentry* dentry = inode->i_sb->s_root;
389 378
390 if (dentry) { 379 if (dentry) {
391 struct inode* s_inode = dentry->d_inode; 380 struct inode* s_inode = dentry->d_inode;
392 381
393 if (s_inode) { 382 if (s_inode) {
394 sr.volNumber = NCP_FINFO(s_inode)->volNumber; 383 sr.volNumber = NCP_FINFO(s_inode)->volNumber;
395 sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum; 384 sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum;
396 sr.namespace = server->name_space[sr.volNumber]; 385 sr.namespace = server->name_space[sr.volNumber];
386 result = 0;
397 } else 387 } else
398 DPRINTK("ncpfs: s_root->d_inode==NULL\n"); 388 DPRINTK("ncpfs: s_root->d_inode==NULL\n");
399 } else 389 } else
@@ -402,10 +392,12 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
402 sr.volNumber = -1; 392 sr.volNumber = -1;
403 sr.namespace = 0; 393 sr.namespace = 0;
404 sr.dirEntNum = 0; 394 sr.dirEntNum = 0;
395 result = 0;
405 } 396 }
406 if (copy_to_user(argp, &sr, sizeof(sr))) 397 mutex_unlock(&server->root_setup_lock);
407 return -EFAULT; 398 if (!result && copy_to_user(argp, &sr, sizeof(sr)))
408 return 0; 399 result = -EFAULT;
400 return result;
409 } 401 }
410 402
411 case NCP_IOC_SETROOT: 403 case NCP_IOC_SETROOT:
@@ -416,103 +408,114 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
416 __le32 dosde; 408 __le32 dosde;
417 struct dentry* dentry; 409 struct dentry* dentry;
418 410
419 if (!capable(CAP_SYS_ADMIN))
420 {
421 return -EACCES;
422 }
423 if (server->root_setuped) return -EBUSY;
424 if (copy_from_user(&sr, argp, sizeof(sr))) 411 if (copy_from_user(&sr, argp, sizeof(sr)))
425 return -EFAULT; 412 return -EFAULT;
426 if (sr.volNumber < 0) { 413 mutex_lock(&server->root_setup_lock);
427 server->m.mounted_vol[0] = 0; 414 if (server->root_setuped)
428 vnum = NCP_NUMBER_OF_VOLUMES; 415 result = -EBUSY;
429 de = 0; 416 else {
430 dosde = 0; 417 if (sr.volNumber < 0) {
431 } else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) { 418 server->m.mounted_vol[0] = 0;
432 return -EINVAL; 419 vnum = NCP_NUMBER_OF_VOLUMES;
433 } else if (ncp_mount_subdir(server, sr.volNumber, 420 de = 0;
434 sr.namespace, sr.dirEntNum, 421 dosde = 0;
435 &vnum, &de, &dosde)) { 422 result = 0;
436 return -ENOENT; 423 } else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
437 } 424 result = -EINVAL;
438 425 } else if (ncp_mount_subdir(server, sr.volNumber,
439 dentry = inode->i_sb->s_root; 426 sr.namespace, sr.dirEntNum,
440 server->root_setuped = 1; 427 &vnum, &de, &dosde)) {
441 if (dentry) { 428 result = -ENOENT;
442 struct inode* s_inode = dentry->d_inode;
443
444 if (s_inode) {
445 NCP_FINFO(s_inode)->volNumber = vnum;
446 NCP_FINFO(s_inode)->dirEntNum = de;
447 NCP_FINFO(s_inode)->DosDirNum = dosde;
448 } else 429 } else
449 DPRINTK("ncpfs: s_root->d_inode==NULL\n"); 430 result = 0;
450 } else 431
451 DPRINTK("ncpfs: s_root==NULL\n"); 432 if (result == 0) {
433 dentry = inode->i_sb->s_root;
434 if (dentry) {
435 struct inode* s_inode = dentry->d_inode;
436
437 if (s_inode) {
438 NCP_FINFO(s_inode)->volNumber = vnum;
439 NCP_FINFO(s_inode)->dirEntNum = de;
440 NCP_FINFO(s_inode)->DosDirNum = dosde;
441 server->root_setuped = 1;
442 } else {
443 DPRINTK("ncpfs: s_root->d_inode==NULL\n");
444 result = -EIO;
445 }
446 } else {
447 DPRINTK("ncpfs: s_root==NULL\n");
448 result = -EIO;
449 }
450 }
452 }
453 mutex_unlock(&server->root_setup_lock);
452 454
453 return 0; 455 return result;
454 } 456 }
455 457
456#ifdef CONFIG_NCPFS_PACKET_SIGNING 458#ifdef CONFIG_NCPFS_PACKET_SIGNING
457 case NCP_IOC_SIGN_INIT: 459 case NCP_IOC_SIGN_INIT:
458 if (file_permission(filp, MAY_WRITE) != 0 460 {
459 && uid != server->m.mounted_uid) 461 struct ncp_sign_init sign;
460 return -EACCES;
461
462 if (argp) {
463 if (server->sign_wanted)
464 {
465 struct ncp_sign_init sign;
466 462
463 if (argp)
467 if (copy_from_user(&sign, argp, sizeof(sign))) 464 if (copy_from_user(&sign, argp, sizeof(sign)))
468 return -EFAULT; 465 return -EFAULT;
469 memcpy(server->sign_root,sign.sign_root,8); 466 ncp_lock_server(server);
470 memcpy(server->sign_last,sign.sign_last,16); 467 mutex_lock(&server->rcv.creq_mutex);
471 server->sign_active = 1; 468 if (argp) {
469 if (server->sign_wanted) {
470 memcpy(server->sign_root,sign.sign_root,8);
471 memcpy(server->sign_last,sign.sign_last,16);
472 server->sign_active = 1;
473 }
474 /* ignore when signatures not wanted */
475 } else {
476 server->sign_active = 0;
472 } 477 }
473 /* ignore when signatures not wanted */ 478 mutex_unlock(&server->rcv.creq_mutex);
474 } else { 479 ncp_unlock_server(server);
475 server->sign_active = 0; 480 return 0;
476 } 481 }
477 return 0; 482
478
479 case NCP_IOC_SIGN_WANTED: 483 case NCP_IOC_SIGN_WANTED:
480 if (file_permission(filp, MAY_READ) != 0 484 {
481 && uid != server->m.mounted_uid) 485 int state;
482 return -EACCES; 486
483 487 ncp_lock_server(server);
484 if (put_user(server->sign_wanted, (int __user *)argp)) 488 state = server->sign_wanted;
485 return -EFAULT; 489 ncp_unlock_server(server);
486 return 0; 490 if (put_user(state, (int __user *)argp))
491 return -EFAULT;
492 return 0;
493 }
487 494
488 case NCP_IOC_SET_SIGN_WANTED: 495 case NCP_IOC_SET_SIGN_WANTED:
489 { 496 {
490 int newstate; 497 int newstate;
491 498
492 if (file_permission(filp, MAY_WRITE) != 0
493 && uid != server->m.mounted_uid)
494 return -EACCES;
495
496 /* get only low 8 bits... */ 499 /* get only low 8 bits... */
497 if (get_user(newstate, (unsigned char __user *)argp)) 500 if (get_user(newstate, (unsigned char __user *)argp))
498 return -EFAULT; 501 return -EFAULT;
502 result = 0;
503 ncp_lock_server(server);
499 if (server->sign_active) { 504 if (server->sign_active) {
500 /* cannot turn signatures OFF when active */ 505 /* cannot turn signatures OFF when active */
501 if (!newstate) return -EINVAL; 506 if (!newstate)
507 result = -EINVAL;
502 } else { 508 } else {
503 server->sign_wanted = newstate != 0; 509 server->sign_wanted = newstate != 0;
504 } 510 }
505 return 0; 511 ncp_unlock_server(server);
512 return result;
506 } 513 }
507 514
508#endif /* CONFIG_NCPFS_PACKET_SIGNING */ 515#endif /* CONFIG_NCPFS_PACKET_SIGNING */
509 516
510#ifdef CONFIG_NCPFS_IOCTL_LOCKING 517#ifdef CONFIG_NCPFS_IOCTL_LOCKING
511 case NCP_IOC_LOCKUNLOCK: 518 case NCP_IOC_LOCKUNLOCK:
512 if (file_permission(filp, MAY_WRITE) != 0
513 && uid != server->m.mounted_uid)
514 return -EACCES;
515
516 { 519 {
517 struct ncp_lock_ioctl rqdata; 520 struct ncp_lock_ioctl rqdata;
518 521
@@ -541,16 +544,13 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
541 { 544 {
542 return result; 545 return result;
543 } 546 }
544 result = -EIO;
545 if (!ncp_conn_valid(server))
546 goto outrel;
547 result = -EISDIR; 547 result = -EISDIR;
548 if (!S_ISREG(inode->i_mode)) 548 if (!S_ISREG(inode->i_mode))
549 goto outrel; 549 goto outrel;
550 if (rqdata.cmd == NCP_LOCK_CLEAR) 550 if (rqdata.cmd == NCP_LOCK_CLEAR)
551 { 551 {
552 result = ncp_ClearPhysicalRecord(NCP_SERVER(inode), 552 result = ncp_ClearPhysicalRecord(NCP_SERVER(inode),
553 NCP_FINFO(inode)->file_handle, 553 NCP_FINFO(inode)->file_handle,
554 rqdata.offset, 554 rqdata.offset,
555 rqdata.length); 555 rqdata.length);
556 if (result > 0) result = 0; /* no such lock */ 556 if (result > 0) result = 0; /* no such lock */
@@ -573,7 +573,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
573 rqdata.timeout); 573 rqdata.timeout);
574 if (result > 0) result = -EAGAIN; 574 if (result > 0) result = -EAGAIN;
575 } 575 }
576outrel: 576outrel:
577 ncp_inode_close(inode); 577 ncp_inode_close(inode);
578 return result; 578 return result;
579 } 579 }
@@ -581,60 +581,62 @@ outrel:
581 581
582#ifdef CONFIG_COMPAT 582#ifdef CONFIG_COMPAT
583 case NCP_IOC_GETOBJECTNAME_32: 583 case NCP_IOC_GETOBJECTNAME_32:
584 if (uid != server->m.mounted_uid)
585 return -EACCES;
586 { 584 {
587 struct compat_ncp_objectname_ioctl user; 585 struct compat_ncp_objectname_ioctl user;
588 size_t outl; 586 size_t outl;
589 587
590 if (copy_from_user(&user, argp, sizeof(user))) 588 if (copy_from_user(&user, argp, sizeof(user)))
591 return -EFAULT; 589 return -EFAULT;
590 down_read(&server->auth_rwsem);
592 user.auth_type = server->auth.auth_type; 591 user.auth_type = server->auth.auth_type;
593 outl = user.object_name_len; 592 outl = user.object_name_len;
594 user.object_name_len = server->auth.object_name_len; 593 user.object_name_len = server->auth.object_name_len;
595 if (outl > user.object_name_len) 594 if (outl > user.object_name_len)
596 outl = user.object_name_len; 595 outl = user.object_name_len;
596 result = 0;
597 if (outl) { 597 if (outl) {
598 if (copy_to_user(compat_ptr(user.object_name), 598 if (copy_to_user(compat_ptr(user.object_name),
599 server->auth.object_name, 599 server->auth.object_name,
600 outl)) return -EFAULT; 600 outl))
601 result = -EFAULT;
601 } 602 }
602 if (copy_to_user(argp, &user, sizeof(user))) 603 up_read(&server->auth_rwsem);
603 return -EFAULT; 604 if (!result && copy_to_user(argp, &user, sizeof(user)))
604 return 0; 605 result = -EFAULT;
606 return result;
605 } 607 }
606#endif 608#endif
607 609
608 case NCP_IOC_GETOBJECTNAME: 610 case NCP_IOC_GETOBJECTNAME:
609 if (uid != server->m.mounted_uid)
610 return -EACCES;
611 { 611 {
612 struct ncp_objectname_ioctl user; 612 struct ncp_objectname_ioctl user;
613 size_t outl; 613 size_t outl;
614 614
615 if (copy_from_user(&user, argp, sizeof(user))) 615 if (copy_from_user(&user, argp, sizeof(user)))
616 return -EFAULT; 616 return -EFAULT;
617 down_read(&server->auth_rwsem);
617 user.auth_type = server->auth.auth_type; 618 user.auth_type = server->auth.auth_type;
618 outl = user.object_name_len; 619 outl = user.object_name_len;
619 user.object_name_len = server->auth.object_name_len; 620 user.object_name_len = server->auth.object_name_len;
620 if (outl > user.object_name_len) 621 if (outl > user.object_name_len)
621 outl = user.object_name_len; 622 outl = user.object_name_len;
623 result = 0;
622 if (outl) { 624 if (outl) {
623 if (copy_to_user(user.object_name, 625 if (copy_to_user(user.object_name,
624 server->auth.object_name, 626 server->auth.object_name,
625 outl)) return -EFAULT; 627 outl))
628 result = -EFAULT;
626 } 629 }
627 if (copy_to_user(argp, &user, sizeof(user))) 630 up_read(&server->auth_rwsem);
628 return -EFAULT; 631 if (!result && copy_to_user(argp, &user, sizeof(user)))
629 return 0; 632 result = -EFAULT;
633 return result;
630 } 634 }
631 635
632#ifdef CONFIG_COMPAT 636#ifdef CONFIG_COMPAT
633 case NCP_IOC_SETOBJECTNAME_32: 637 case NCP_IOC_SETOBJECTNAME_32:
634#endif 638#endif
635 case NCP_IOC_SETOBJECTNAME: 639 case NCP_IOC_SETOBJECTNAME:
636 if (uid != server->m.mounted_uid)
637 return -EACCES;
638 { 640 {
639 struct ncp_objectname_ioctl user; 641 struct ncp_objectname_ioctl user;
640 void* newname; 642 void* newname;
@@ -666,9 +668,7 @@ outrel:
666 } else { 668 } else {
667 newname = NULL; 669 newname = NULL;
668 } 670 }
669 /* enter critical section */ 671 down_write(&server->auth_rwsem);
670 /* maybe that kfree can sleep so do that this way */
671 /* it is at least more SMP friendly (in future...) */
672 oldname = server->auth.object_name; 672 oldname = server->auth.object_name;
673 oldnamelen = server->auth.object_name_len; 673 oldnamelen = server->auth.object_name_len;
674 oldprivate = server->priv.data; 674 oldprivate = server->priv.data;
@@ -678,7 +678,7 @@ outrel:
678 server->auth.object_name = newname; 678 server->auth.object_name = newname;
679 server->priv.len = 0; 679 server->priv.len = 0;
680 server->priv.data = NULL; 680 server->priv.data = NULL;
681 /* leave critical section */ 681 up_write(&server->auth_rwsem);
682 kfree(oldprivate); 682 kfree(oldprivate);
683 kfree(oldname); 683 kfree(oldname);
684 return 0; 684 return 0;
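The GETOBJECTNAME/SETOBJECTNAME hunks above establish the auth_rwsem discipline: readers copy credentials out under down_read(), the writer swaps pointers under down_write() and frees the old buffers only after releasing the semaphore. A sketch with a hypothetical helper:

    #include <linux/slab.h>

    static void set_auth_object_name(struct ncp_server *server,
                                     void *newname, size_t newlen)
    {
            void *oldname;

            down_write(&server->auth_rwsem);
            oldname = server->auth.object_name;
            server->auth.object_name = newname;
            server->auth.object_name_len = newlen;
            up_write(&server->auth_rwsem);
            kfree(oldname);         /* kfree() needs no lock */
    }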
@@ -688,8 +688,6 @@ outrel:
688 case NCP_IOC_GETPRIVATEDATA_32: 688 case NCP_IOC_GETPRIVATEDATA_32:
689#endif 689#endif
690 case NCP_IOC_GETPRIVATEDATA: 690 case NCP_IOC_GETPRIVATEDATA:
691 if (uid != server->m.mounted_uid)
692 return -EACCES;
693 { 691 {
694 struct ncp_privatedata_ioctl user; 692 struct ncp_privatedata_ioctl user;
695 size_t outl; 693 size_t outl;
@@ -706,14 +704,20 @@ outrel:
706 if (copy_from_user(&user, argp, sizeof(user))) 704 if (copy_from_user(&user, argp, sizeof(user)))
707 return -EFAULT; 705 return -EFAULT;
708 706
707 down_read(&server->auth_rwsem);
709 outl = user.len; 708 outl = user.len;
710 user.len = server->priv.len; 709 user.len = server->priv.len;
711 if (outl > user.len) outl = user.len; 710 if (outl > user.len) outl = user.len;
711 result = 0;
712 if (outl) { 712 if (outl) {
713 if (copy_to_user(user.data, 713 if (copy_to_user(user.data,
714 server->priv.data, 714 server->priv.data,
715 outl)) return -EFAULT; 715 outl))
716 result = -EFAULT;
716 } 717 }
718 up_read(&server->auth_rwsem);
719 if (result)
720 return result;
717#ifdef CONFIG_COMPAT 721#ifdef CONFIG_COMPAT
718 if (cmd == NCP_IOC_GETPRIVATEDATA_32) { 722 if (cmd == NCP_IOC_GETPRIVATEDATA_32) {
719 struct compat_ncp_privatedata_ioctl user32; 723 struct compat_ncp_privatedata_ioctl user32;
@@ -733,8 +737,6 @@ outrel:
733 case NCP_IOC_SETPRIVATEDATA_32: 737 case NCP_IOC_SETPRIVATEDATA_32:
734#endif 738#endif
735 case NCP_IOC_SETPRIVATEDATA: 739 case NCP_IOC_SETPRIVATEDATA:
736 if (uid != server->m.mounted_uid)
737 return -EACCES;
738 { 740 {
739 struct ncp_privatedata_ioctl user; 741 struct ncp_privatedata_ioctl user;
740 void* new; 742 void* new;
@@ -762,12 +764,12 @@ outrel:
762 } else { 764 } else {
763 new = NULL; 765 new = NULL;
764 } 766 }
765 /* enter critical section */ 767 down_write(&server->auth_rwsem);
766 old = server->priv.data; 768 old = server->priv.data;
767 oldlen = server->priv.len; 769 oldlen = server->priv.len;
768 server->priv.len = user.len; 770 server->priv.len = user.len;
769 server->priv.data = new; 771 server->priv.data = new;
770 /* leave critical section */ 772 up_write(&server->auth_rwsem);
771 kfree(old); 773 kfree(old);
772 return 0; 774 return 0;
773 } 775 }
@@ -775,17 +777,13 @@ outrel:
775#ifdef CONFIG_NCPFS_NLS 777#ifdef CONFIG_NCPFS_NLS
776 case NCP_IOC_SETCHARSETS: 778 case NCP_IOC_SETCHARSETS:
777 return ncp_set_charsets(server, argp); 779 return ncp_set_charsets(server, argp);
778 780
779 case NCP_IOC_GETCHARSETS: 781 case NCP_IOC_GETCHARSETS:
780 return ncp_get_charsets(server, argp); 782 return ncp_get_charsets(server, argp);
781 783
782#endif /* CONFIG_NCPFS_NLS */ 784#endif /* CONFIG_NCPFS_NLS */
783 785
784 case NCP_IOC_SETDENTRYTTL: 786 case NCP_IOC_SETDENTRYTTL:
785 if (file_permission(filp, MAY_WRITE) != 0 &&
786 uid != server->m.mounted_uid)
787 return -EACCES;
788
789 { 787 {
790 u_int32_t user; 788 u_int32_t user;
791 789
@@ -795,13 +793,13 @@ outrel:
795 if (user > 20000) 793 if (user > 20000)
796 return -EINVAL; 794 return -EINVAL;
797 user = (user * HZ) / 1000; 795 user = (user * HZ) / 1000;
798 server->dentry_ttl = user; 796 atomic_set(&server->dentry_ttl, user);
799 return 0; 797 return 0;
800 } 798 }
801 799
802 case NCP_IOC_GETDENTRYTTL: 800 case NCP_IOC_GETDENTRYTTL:
803 { 801 {
804 u_int32_t user = (server->dentry_ttl * 1000) / HZ; 802 u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ;
805 if (copy_to_user(argp, &user, sizeof(user))) 803 if (copy_to_user(argp, &user, sizeof(user)))
806 return -EFAULT; 804 return -EFAULT;
807 return 0; 805 return 0;
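dentry_ttl is written rarely (this ioctl) but read on every dentry revalidation, which is why it became an atomic_t rather than gaining a lock. A sketch of both sides, with made-up helper names:

    /* userspace speaks milliseconds; the stored value is jiffies */
    static void ncp_set_ttl_msecs(struct ncp_server *server, u32 msecs)
    {
            atomic_set(&server->dentry_ttl, (msecs * HZ) / 1000);
    }

    static u32 ncp_get_ttl_jiffies(struct ncp_server *server)
    {
            return atomic_read(&server->dentry_ttl);
    }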
@@ -811,59 +809,103 @@ outrel:
811 return -EINVAL; 809 return -EINVAL;
812} 810}
813 811
814static int ncp_ioctl_need_write(unsigned int cmd) 812long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
815{ 813{
814 struct inode *inode = filp->f_dentry->d_inode;
815 struct ncp_server *server = NCP_SERVER(inode);
816 uid_t uid = current_uid();
817 int need_drop_write = 0;
818 long ret;
819
816 switch (cmd) { 820 switch (cmd) {
817 case NCP_IOC_GET_FS_INFO:
818 case NCP_IOC_GET_FS_INFO_V2:
819 case NCP_IOC_NCPREQUEST:
820 case NCP_IOC_SETDENTRYTTL:
821 case NCP_IOC_SIGN_INIT:
822 case NCP_IOC_LOCKUNLOCK:
823 case NCP_IOC_SET_SIGN_WANTED:
824 return 1;
825 case NCP_IOC_GETOBJECTNAME:
826 case NCP_IOC_SETOBJECTNAME:
827 case NCP_IOC_GETPRIVATEDATA:
828 case NCP_IOC_SETPRIVATEDATA:
829 case NCP_IOC_SETCHARSETS: 821 case NCP_IOC_SETCHARSETS:
830 case NCP_IOC_GETCHARSETS:
831 case NCP_IOC_CONN_LOGGED_IN: 822 case NCP_IOC_CONN_LOGGED_IN:
832 case NCP_IOC_GETDENTRYTTL:
833 case NCP_IOC_GETMOUNTUID2:
834 case NCP_IOC_SIGN_WANTED:
835 case NCP_IOC_GETROOT:
836 case NCP_IOC_SETROOT: 823 case NCP_IOC_SETROOT:
837 return 0; 824 if (!capable(CAP_SYS_ADMIN)) {
838 default: 825 ret = -EACCES;
839 /* unknown IOCTL command, assume write */ 826 goto out;
840 return 1; 827 }
828 break;
841 } 829 }
842} 830 if (server->m.mounted_uid != uid) {
843 831 switch (cmd) {
844long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
845{
846 long ret;
847
848 lock_kernel();
849 if (ncp_ioctl_need_write(cmd)) {
850 /* 832 /*
851 * inside the ioctl(), any failures which 833 * Only mount owner can issue these ioctls. Information
852 * are because of file_permission() are 834 * necessary to authenticate to other NDS servers is
853 * -EACCESS, so it seems consistent to keep 835 * stored here.
854 * that here.
855 */ 836 */
856 if (mnt_want_write(filp->f_path.mnt)) { 837 case NCP_IOC_GETOBJECTNAME:
838 case NCP_IOC_SETOBJECTNAME:
839 case NCP_IOC_GETPRIVATEDATA:
840 case NCP_IOC_SETPRIVATEDATA:
841#ifdef CONFIG_COMPAT
842 case NCP_IOC_GETOBJECTNAME_32:
843 case NCP_IOC_SETOBJECTNAME_32:
844 case NCP_IOC_GETPRIVATEDATA_32:
845 case NCP_IOC_SETPRIVATEDATA_32:
846#endif
857 ret = -EACCES; 847 ret = -EACCES;
858 goto out; 848 goto out;
849 /*
850 * These require write access on the inode if user id
851 * does not match. Note that they do not write to the
852 * file... But old code did mnt_want_write, so I keep
853 * it as is. Of course this does not apply to the mount owner,
854 * as that would break read-only mounts altogether, since ncpmount
855 * needs working NCP_IOC_NCPREQUEST and
856 * NCP_IOC_GET_FS_INFO. Some of these codes (setdentryttl,
857 * signinit, setsignwanted) should probably be restricted
858 * to owner only, or even to CAP_SYS_ADMIN.
859 */
860 case NCP_IOC_GET_FS_INFO:
861 case NCP_IOC_GET_FS_INFO_V2:
862 case NCP_IOC_NCPREQUEST:
863 case NCP_IOC_SETDENTRYTTL:
864 case NCP_IOC_SIGN_INIT:
865 case NCP_IOC_LOCKUNLOCK:
866 case NCP_IOC_SET_SIGN_WANTED:
867#ifdef CONFIG_COMPAT
868 case NCP_IOC_GET_FS_INFO_V2_32:
869 case NCP_IOC_NCPREQUEST_32:
870#endif
871 ret = mnt_want_write_file(filp);
872 if (ret)
873 goto out;
874 need_drop_write = 1;
875 ret = inode_permission(inode, MAY_WRITE);
876 if (ret)
877 goto outDropWrite;
878 break;
879 /*
880 * Read access required.
881 */
882 case NCP_IOC_GETMOUNTUID16:
883 case NCP_IOC_GETMOUNTUID32:
884 case NCP_IOC_GETMOUNTUID64:
885 case NCP_IOC_GETROOT:
886 case NCP_IOC_SIGN_WANTED:
887 ret = inode_permission(inode, MAY_READ);
888 if (ret)
889 goto out;
890 break;
891 /*
892 * Anybody can read these.
893 */
894 case NCP_IOC_GETCHARSETS:
895 case NCP_IOC_GETDENTRYTTL:
896 default:
897 /* The three codes below are protected by the CAP_SYS_ADMIN check above. */
898 case NCP_IOC_SETCHARSETS:
899 case NCP_IOC_CONN_LOGGED_IN:
900 case NCP_IOC_SETROOT:
901 break;
859 } 902 }
860 } 903 }
861 ret = __ncp_ioctl(filp, cmd, arg); 904 ret = __ncp_ioctl(inode, cmd, arg);
862 if (ncp_ioctl_need_write(cmd)) 905outDropWrite:
906 if (need_drop_write)
863 mnt_drop_write(filp->f_path.mnt); 907 mnt_drop_write(filp->f_path.mnt);
864
865out: 908out:
866 unlock_kernel();
867 return ret; 909 return ret;
868} 910}
869 911
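The rewritten ncp_ioctl() above concentrates all permission policy in one switch before calling the worker. A condensed, hypothetical sketch of that structure -- the MY_IOC_* codes, is_mount_owner() and do_ioctl() are stand-ins, not ncpfs names:

    #define MY_IOC_ADMIN    _IO('n', 100)   /* stand-in code */
    #define MY_IOC_WRITEISH _IO('n', 101)   /* stand-in code */

    static int is_mount_owner(struct file *filp);   /* assumed helper */
    static long do_ioctl(struct inode *inode, unsigned int cmd,
                         unsigned long arg);        /* assumed worker */

    static long my_ioctl(struct file *filp, unsigned int cmd,
                         unsigned long arg)
    {
            struct inode *inode = filp->f_dentry->d_inode;
            long ret;

            if (cmd == MY_IOC_ADMIN && !capable(CAP_SYS_ADMIN))
                    return -EACCES;

            if (cmd == MY_IOC_WRITEISH && !is_mount_owner(filp)) {
                    ret = mnt_want_write_file(filp); /* pin writable mount */
                    if (ret)
                            return ret;
                    ret = inode_permission(inode, MAY_WRITE);
                    if (!ret)
                            ret = do_ioctl(inode, cmd, arg);
                    mnt_drop_write(filp->f_path.mnt);
                    return ret;
            }
            return do_ioctl(inode, cmd, arg);
    }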
@@ -872,10 +914,8 @@ long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
872{ 914{
873 long ret; 915 long ret;
874 916
875 lock_kernel();
876 arg = (unsigned long) compat_ptr(arg); 917 arg = (unsigned long) compat_ptr(arg);
877 ret = ncp_ioctl(file, cmd, arg); 918 ret = ncp_ioctl(file, cmd, arg);
878 unlock_kernel();
879 return ret; 919 return ret;
880} 920}
881#endif 921#endif
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 0ec6237a5970..a95615a0b6ac 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -107,17 +107,17 @@ ncp_reply_data(struct ncp_server *server, int offset)
107 return &(server->packet[sizeof(struct ncp_reply_header) + offset]); 107 return &(server->packet[sizeof(struct ncp_reply_header) + offset]);
108} 108}
109 109
110static inline u8 BVAL(void *data) 110static inline u8 BVAL(const void *data)
111{ 111{
112 return *(u8 *)data; 112 return *(const u8 *)data;
113} 113}
114 114
115static u8 ncp_reply_byte(struct ncp_server *server, int offset) 115static u8 ncp_reply_byte(struct ncp_server *server, int offset)
116{ 116{
117 return *(u8 *)ncp_reply_data(server, offset); 117 return *(const u8 *)ncp_reply_data(server, offset);
118} 118}
119 119
120static inline u16 WVAL_LH(void *data) 120static inline u16 WVAL_LH(const void *data)
121{ 121{
122 return get_unaligned_le16(data); 122 return get_unaligned_le16(data);
123} 123}
@@ -134,7 +134,7 @@ ncp_reply_be16(struct ncp_server *server, int offset)
134 return get_unaligned_be16(ncp_reply_data(server, offset)); 134 return get_unaligned_be16(ncp_reply_data(server, offset));
135} 135}
136 136
137static inline u32 DVAL_LH(void *data) 137static inline u32 DVAL_LH(const void *data)
138{ 138{
139 return get_unaligned_le32(data); 139 return get_unaligned_le32(data);
140} 140}
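The const-ification above goes together with the get_unaligned_le*() helpers: NCP reply fields are little-endian and can sit at any offset in the packet buffer. A minimal sketch of why that matters (made-up wrapper names):

    #include <linux/types.h>
    #include <asm/unaligned.h>

    /* A plain cast-and-dereference of a misaligned pointer would fault
     * on strict-alignment architectures, so the wire fields are read
     * through the unaligned little-endian helpers instead. */
    static inline u16 wire_le16(const void *p)
    {
            return get_unaligned_le16(p);
    }

    static inline u32 wire_le32(const void *p)
    {
            return get_unaligned_le32(p);
    }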
@@ -349,9 +349,9 @@ int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) {
349 return result; 349 return result;
350} 350}
351 351
352void ncp_extract_file_info(void *structure, struct nw_info_struct *target) 352void ncp_extract_file_info(const void *structure, struct nw_info_struct *target)
353{ 353{
354 __u8 *name_len; 354 const __u8 *name_len;
355 const int info_struct_size = offsetof(struct nw_info_struct, nameLen); 355 const int info_struct_size = offsetof(struct nw_info_struct, nameLen);
356 356
357 memcpy(target, structure, info_struct_size); 357 memcpy(target, structure, info_struct_size);
@@ -364,7 +364,7 @@ void ncp_extract_file_info(void *structure, struct nw_info_struct *target)
364} 364}
365 365
366#ifdef CONFIG_NCPFS_NFS_NS 366#ifdef CONFIG_NCPFS_NFS_NS
367static inline void ncp_extract_nfs_info(unsigned char *structure, 367static inline void ncp_extract_nfs_info(const unsigned char *structure,
368 struct nw_nfs_info *target) 368 struct nw_nfs_info *target)
369{ 369{
370 target->mode = DVAL_LH(structure); 370 target->mode = DVAL_LH(structure);
@@ -417,7 +417,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server,
417 * Returns information for a (one-component) name relative to 417 * Returns information for a (one-component) name relative to
418 * the specified directory. 418 * the specified directory.
419 */ 419 */
420int ncp_obtain_info(struct ncp_server *server, struct inode *dir, char *path, 420int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path,
421 struct nw_info_struct *target) 421 struct nw_info_struct *target)
422{ 422{
423 __u8 volnum = NCP_FINFO(dir)->volNumber; 423 __u8 volnum = NCP_FINFO(dir)->volNumber;
@@ -452,16 +452,16 @@ out:
452#ifdef CONFIG_NCPFS_NFS_NS 452#ifdef CONFIG_NCPFS_NFS_NS
453static int 453static int
454ncp_obtain_DOS_dir_base(struct ncp_server *server, 454ncp_obtain_DOS_dir_base(struct ncp_server *server,
455 __u8 volnum, __le32 dirent, 455 __u8 ns, __u8 volnum, __le32 dirent,
456 char *path, /* At most 1 component */ 456 const char *path, /* At most 1 component */
457 __le32 *DOS_dir_base) 457 __le32 *DOS_dir_base)
458{ 458{
459 int result; 459 int result;
460 460
461 ncp_init_request(server); 461 ncp_init_request(server);
462 ncp_add_byte(server, 6); /* subfunction */ 462 ncp_add_byte(server, 6); /* subfunction */
463 ncp_add_byte(server, server->name_space[volnum]); 463 ncp_add_byte(server, ns);
464 ncp_add_byte(server, server->name_space[volnum]); 464 ncp_add_byte(server, ns);
465 ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */ 465 ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */
466 ncp_add_dword(server, RIM_DIRECTORY); 466 ncp_add_dword(server, RIM_DIRECTORY);
467 ncp_add_handle_path(server, volnum, dirent, 1, path); 467 ncp_add_handle_path(server, volnum, dirent, 1, path);
@@ -523,10 +523,27 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
523#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */ 523#endif /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */
524} 524}
525 525
526int
527ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
528{
529 int ns = ncp_get_known_namespace(server, volume);
530
531 if (ret_ns)
532 *ret_ns = ns;
533
534 DPRINTK("lookup_vol: namespace[%d] = %d\n",
535 volume, server->name_space[volume]);
536
537 if (server->name_space[volume] == ns)
538 return 0;
539 server->name_space[volume] = ns;
540 return 1;
541}
542
526static int 543static int
527ncp_ObtainSpecificDirBase(struct ncp_server *server, 544ncp_ObtainSpecificDirBase(struct ncp_server *server,
528 __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base, 545 __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base,
529 char *path, /* At most 1 component */ 546 const char *path, /* At most 1 component */
530 __le32 *dirEntNum, __le32 *DosDirNum) 547 __le32 *dirEntNum, __le32 *DosDirNum)
531{ 548{
532 int result; 549 int result;
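The new ncp_update_known_namespace() above centralizes the "query and cache the volume namespace" logic and returns 1 only when the cached value actually changed. A usage sketch (hypothetical caller):

    static void refresh_volume_ns(struct ncp_server *server, __u8 volume)
    {
            int ns;

            if (ncp_update_known_namespace(server, volume, &ns))
                    DPRINTK("ncpfs: vol %u switched to namespace %d\n",
                            volume, ns);
    }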
@@ -560,14 +577,13 @@ ncp_mount_subdir(struct ncp_server *server,
560{ 577{
561 int dstNS; 578 int dstNS;
562 int result; 579 int result;
563 580
564 dstNS = ncp_get_known_namespace(server, volNumber); 581 ncp_update_known_namespace(server, volNumber, &dstNS);
565 if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, 582 if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber,
566 dirEntNum, NULL, newDirEnt, newDosEnt)) != 0) 583 dirEntNum, NULL, newDirEnt, newDosEnt)) != 0)
567 { 584 {
568 return result; 585 return result;
569 } 586 }
570 server->name_space[volNumber] = dstNS;
571 *volume = volNumber; 587 *volume = volNumber;
572 server->m.mounted_vol[1] = 0; 588 server->m.mounted_vol[1] = 0;
573 server->m.mounted_vol[0] = 'X'; 589 server->m.mounted_vol[0] = 'X';
@@ -575,11 +591,10 @@ ncp_mount_subdir(struct ncp_server *server,
575} 591}
576 592
577int 593int
578ncp_get_volume_root(struct ncp_server *server, const char *volname, 594ncp_get_volume_root(struct ncp_server *server,
579 __u32* volume, __le32* dirent, __le32* dosdirent) 595 const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent)
580{ 596{
581 int result; 597 int result;
582 __u8 volnum;
583 598
584 DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname); 599 DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname);
585 600
@@ -601,21 +616,14 @@ ncp_get_volume_root(struct ncp_server *server, const char *volname,
601 return result; 616 return result;
602 } 617 }
603 *dirent = *dosdirent = ncp_reply_dword(server, 4); 618 *dirent = *dosdirent = ncp_reply_dword(server, 4);
604 volnum = ncp_reply_byte(server, 8); 619 *volume = ncp_reply_byte(server, 8);
605 ncp_unlock_server(server); 620 ncp_unlock_server(server);
606 *volume = volnum;
607
608 server->name_space[volnum] = ncp_get_known_namespace(server, volnum);
609
610 DPRINTK("lookup_vol: namespace[%d] = %d\n",
611 volnum, server->name_space[volnum]);
612
613 return 0; 621 return 0;
614} 622}
615 623
616int 624int
617ncp_lookup_volume(struct ncp_server *server, const char *volname, 625ncp_lookup_volume(struct ncp_server *server,
618 struct nw_info_struct *target) 626 const char *volname, struct nw_info_struct *target)
619{ 627{
620 int result; 628 int result;
621 629
@@ -625,6 +633,7 @@ ncp_lookup_volume(struct ncp_server *server, const char *volname,
625 if (result) { 633 if (result) {
626 return result; 634 return result;
627 } 635 }
636 ncp_update_known_namespace(server, target->volNumber, NULL);
628 target->nameLen = strlen(volname); 637 target->nameLen = strlen(volname);
629 memcpy(target->entryName, volname, target->nameLen+1); 638 memcpy(target->entryName, volname, target->nameLen+1);
630 target->attributes = aDIR; 639 target->attributes = aDIR;
@@ -676,8 +685,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
676{ 685{
677 int result = 0; 686 int result = 0;
678 687
688 ncp_init_request(server);
679 if (server->name_space[volnum] == NW_NS_NFS) { 689 if (server->name_space[volnum] == NW_NS_NFS) {
680 ncp_init_request(server);
681 ncp_add_byte(server, 25); /* subfunction */ 690 ncp_add_byte(server, 25); /* subfunction */
682 ncp_add_byte(server, server->name_space[volnum]); 691 ncp_add_byte(server, server->name_space[volnum]);
683 ncp_add_byte(server, NW_NS_NFS); 692 ncp_add_byte(server, NW_NS_NFS);
@@ -690,8 +699,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
690 ncp_add_dword_lh(server, 1); /* nlinks */ 699 ncp_add_dword_lh(server, 1); /* nlinks */
691 ncp_add_dword_lh(server, rdev); 700 ncp_add_dword_lh(server, rdev);
692 result = ncp_request(server, 87); 701 result = ncp_request(server, 87);
693 ncp_unlock_server(server);
694 } 702 }
703 ncp_unlock_server(server);
695 return result; 704 return result;
696} 705}
697#endif 706#endif
@@ -700,7 +709,7 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
700static int 709static int
701ncp_DeleteNSEntry(struct ncp_server *server, 710ncp_DeleteNSEntry(struct ncp_server *server,
702 __u8 have_dir_base, __u8 volnum, __le32 dirent, 711 __u8 have_dir_base, __u8 volnum, __le32 dirent,
703 char* name, __u8 ns, __le16 attr) 712 const char* name, __u8 ns, __le16 attr)
704{ 713{
705 int result; 714 int result;
706 715
@@ -734,23 +743,25 @@ ncp_del_file_or_subdir2(struct ncp_server *server,
734 743
735int 744int
736ncp_del_file_or_subdir(struct ncp_server *server, 745ncp_del_file_or_subdir(struct ncp_server *server,
737 struct inode *dir, char *name) 746 struct inode *dir, const char *name)
738{ 747{
739 __u8 volnum = NCP_FINFO(dir)->volNumber; 748 __u8 volnum = NCP_FINFO(dir)->volNumber;
740 __le32 dirent = NCP_FINFO(dir)->dirEntNum; 749 __le32 dirent = NCP_FINFO(dir)->dirEntNum;
750 int name_space;
741 751
752 name_space = server->name_space[volnum];
742#ifdef CONFIG_NCPFS_NFS_NS 753#ifdef CONFIG_NCPFS_NFS_NS
743 if (server->name_space[volnum]==NW_NS_NFS) 754 if (name_space == NW_NS_NFS)
744 { 755 {
745 int result; 756 int result;
746 757
747 result=ncp_obtain_DOS_dir_base(server, volnum, dirent, name, &dirent); 758 result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent);
748 if (result) return result; 759 if (result) return result;
749 return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006)); 760 name = NULL;
761 name_space = NW_NS_DOS;
750 } 762 }
751 else
752#endif /* CONFIG_NCPFS_NFS_NS */ 763#endif /* CONFIG_NCPFS_NFS_NS */
753 return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, server->name_space[volnum], cpu_to_le16(0x8006)); 764 return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006));
754} 765}
755 766
756static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6]) 767static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
@@ -765,7 +776,7 @@ static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
765/* If both dir and name are NULL, then in target there's already a 776/* If both dir and name are NULL, then in target there's already a
766 looked-up entry that wants to be opened. */ 777 looked-up entry that wants to be opened. */
767int ncp_open_create_file_or_subdir(struct ncp_server *server, 778int ncp_open_create_file_or_subdir(struct ncp_server *server,
768 struct inode *dir, char *name, 779 struct inode *dir, const char *name,
769 int open_create_mode, 780 int open_create_mode,
770 __le32 create_attributes, 781 __le32 create_attributes,
771 __le16 desired_acc_rights, 782 __le16 desired_acc_rights,
@@ -890,8 +901,8 @@ int ncp_search_for_fileset(struct ncp_server *server,
890 901
891static int 902static int
892ncp_RenameNSEntry(struct ncp_server *server, 903ncp_RenameNSEntry(struct ncp_server *server,
893 struct inode *old_dir, char *old_name, __le16 old_type, 904 struct inode *old_dir, const char *old_name, __le16 old_type,
894 struct inode *new_dir, char *new_name) 905 struct inode *new_dir, const char *new_name)
895{ 906{
896 int result = -EINVAL; 907 int result = -EINVAL;
897 908
@@ -929,8 +940,8 @@ out:
929} 940}
930 941
931int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, 942int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
932 struct inode *old_dir, char *old_name, 943 struct inode *old_dir, const char *old_name,
933 struct inode *new_dir, char *new_name) 944 struct inode *new_dir, const char *new_name)
934{ 945{
935 int result; 946 int result;
936 __le16 old_type = cpu_to_le16(0x06); 947 __le16 old_type = cpu_to_le16(0x06);
@@ -958,7 +969,7 @@ int
958ncp_read_kernel(struct ncp_server *server, const char *file_id, 969ncp_read_kernel(struct ncp_server *server, const char *file_id,
959 __u32 offset, __u16 to_read, char *target, int *bytes_read) 970 __u32 offset, __u16 to_read, char *target, int *bytes_read)
960{ 971{
961 char *source; 972 const char *source;
962 int result; 973 int result;
963 974
964 ncp_init_request(server); 975 ncp_init_request(server);
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 2441d1ab57dc..3c57eca634ce 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -65,10 +65,11 @@ static inline void ncp_inode_close(struct inode *inode) {
65 atomic_dec(&NCP_FINFO(inode)->opened); 65 atomic_dec(&NCP_FINFO(inode)->opened);
66} 66}
67 67
68void ncp_extract_file_info(void* src, struct nw_info_struct* target); 68void ncp_extract_file_info(const void* src, struct nw_info_struct* target);
69int ncp_obtain_info(struct ncp_server *server, struct inode *, char *, 69int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *,
70 struct nw_info_struct *target); 70 struct nw_info_struct *target);
71int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target); 71int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target);
72int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns);
72int ncp_get_volume_root(struct ncp_server *server, const char *volname, 73int ncp_get_volume_root(struct ncp_server *server, const char *volname,
73 __u32 *volume, __le32 *dirent, __le32 *dosdirent); 74 __u32 *volume, __le32 *dirent, __le32 *dosdirent);
74int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *); 75int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *);
@@ -80,8 +81,8 @@ int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent,
80 __u32 mode, __u32 rdev); 81 __u32 mode, __u32 rdev);
81 82
82int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*); 83int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*);
83int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, char *); 84int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *);
84int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, char *, 85int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *,
85 int, __le32, __le16, struct ncp_entry_info *); 86 int, __le32, __le16, struct ncp_entry_info *);
86 87
87int ncp_initialize_search(struct ncp_server *, struct inode *, 88int ncp_initialize_search(struct ncp_server *, struct inode *,
@@ -93,7 +94,7 @@ int ncp_search_for_fileset(struct ncp_server *server,
93 char** rbuf, size_t* rsize); 94 char** rbuf, size_t* rsize);
94 95
95int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server, 96int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
96 struct inode *, char *, struct inode *, char *); 97 struct inode *, const char *, struct inode *, const char *);
97 98
98 99
99int 100int
@@ -170,13 +171,13 @@ static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1,
170#endif /* CONFIG_NCPFS_NLS */ 171#endif /* CONFIG_NCPFS_NLS */
171 172
172#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time) 173#define NCP_GET_AGE(dentry) (jiffies - (dentry)->d_time)
173#define NCP_MAX_AGE(server) ((server)->dentry_ttl) 174#define NCP_MAX_AGE(server) atomic_read(&(server)->dentry_ttl)
174#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server)) 175#define NCP_TEST_AGE(server,dentry) (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server))
175 176
176static inline void 177static inline void
177ncp_age_dentry(struct ncp_server* server, struct dentry* dentry) 178ncp_age_dentry(struct ncp_server* server, struct dentry* dentry)
178{ 179{
179 dentry->d_time = jiffies - server->dentry_ttl; 180 dentry->d_time = jiffies - NCP_MAX_AGE(server);
180} 181}
181 182
182static inline void 183static inline void
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index 7c0b5c21e6cf..d8b2d7e6910b 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -15,21 +15,21 @@
15 15
16/* i386: 32-bit, little endian, handles mis-alignment */ 16/* i386: 32-bit, little endian, handles mis-alignment */
17#ifdef __i386__ 17#ifdef __i386__
18#define GET_LE32(p) (*(int *)(p)) 18#define GET_LE32(p) (*(const int *)(p))
19#define PUT_LE32(p,v) { *(int *)(p)=v; } 19#define PUT_LE32(p,v) { *(int *)(p)=v; }
20#else 20#else
21/* from include/ncplib.h */ 21/* from include/ncplib.h */
22#define BVAL(buf,pos) (((__u8 *)(buf))[pos]) 22#define BVAL(buf,pos) (((const __u8 *)(buf))[pos])
23#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos)) 23#define PVAL(buf,pos) ((unsigned)BVAL(buf,pos))
24#define BSET(buf,pos,val) (BVAL(buf,pos) = (val)) 24#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val))
25 25
26static inline __u16 26static inline __u16
27WVAL_LH(__u8 * buf, int pos) 27WVAL_LH(const __u8 * buf, int pos)
28{ 28{
29 return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8; 29 return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8;
30} 30}
31static inline __u32 31static inline __u32
32DVAL_LH(__u8 * buf, int pos) 32DVAL_LH(const __u8 * buf, int pos)
33{ 33{
34 return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16; 34 return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16;
35} 35}
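The constified helpers above assemble little-endian values one byte at a time, which is also why the generic (non-i386) path copes with misaligned buffers. A runnable userspace sketch of the same composition; the test values are made up:

    #include <stdio.h>

    typedef unsigned char __u8;
    typedef unsigned short __u16;
    typedef unsigned int __u32;

    #define PVAL(buf, pos) ((unsigned)((const __u8 *)(buf))[pos])

    static __u16 WVAL_LH(const __u8 *buf, int pos)
    {
            return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8;
    }

    static __u32 DVAL_LH(const __u8 *buf, int pos)
    {
            return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16;
    }

    int main(void)
    {
            /* bytes of 0x04030201, stored little-endian at an odd offset */
            const __u8 raw[] = { 0xff, 0x01, 0x02, 0x03, 0x04 };

            printf("0x%08x\n", DVAL_LH(raw, 1));   /* prints 0x04030201 */
            return 0;
    }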
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index c7ff6c700a6e..668bd267346e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -746,7 +746,6 @@ static int ncp_do_request(struct ncp_server *server, int size,
746 return -EIO; 746 return -EIO;
747 } 747 }
748 if (!ncp_conn_valid(server)) { 748 if (!ncp_conn_valid(server)) {
749 printk(KERN_ERR "ncpfs: Connection invalid!\n");
750 return -EIO; 749 return -EIO;
751 } 750 }
752 { 751 {
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index f7e13db613cb..ba306658a6db 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -76,13 +76,17 @@ config NFS_V4
76 76
77config NFS_V4_1 77config NFS_V4_1
78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" 78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
79 depends on NFS_V4 && EXPERIMENTAL 79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL
80 select PNFS_FILE_LAYOUT
80 help 81 help
81 This option enables support for minor version 1 of the NFSv4 protocol 82 This option enables support for minor version 1 of the NFSv4 protocol
82 (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. 83 (RFC 5661) in the kernel's NFS client.
83 84
84 If unsure, say N. 85 If unsure, say N.
85 86
87config PNFS_FILE_LAYOUT
88 tristate
89
86config ROOT_NFS 90config ROOT_NFS
87 bool "Root file system on NFS" 91 bool "Root file system on NFS"
88 depends on NFS_FS=y && IP_PNP 92 depends on NFS_FS=y && IP_PNP
@@ -117,3 +121,14 @@ config NFS_USE_KERNEL_DNS
117 select DNS_RESOLVER 121 select DNS_RESOLVER
118 select KEYS 122 select KEYS
119 default y 123 default y
124
125config NFS_USE_NEW_IDMAPPER
126 bool "Use the new idmapper upcall routine"
127 depends on NFS_V4 && KEYS
128 help
129 Say Y here if you want NFS to use the new idmapper upcall functions.
130 You will need /sbin/request-key (usually provided by the keyutils
131 package). For details, read
132 <file:Documentation/filesystems/nfs/idmapper.txt>.
133
134 If you are unsure, say N.
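Taken together, these Kconfig hunks mean that enabling NFSv4.1 now pulls in the pNFS file layout driver via select, while the keyring-based idmapper remains an independent opt-in. A hedged sketch of the resulting .config fragment (option names are taken from the hunks above; exact dependencies depend on the rest of the tree):

    CONFIG_EXPERIMENTAL=y
    CONFIG_NFS_FS=y
    CONFIG_NFS_V4=y
    CONFIG_NFS_V4_1=y
    # selected automatically by NFS_V4_1:
    CONFIG_PNFS_FILE_LAYOUT=y
    # optional keyring-based idmapper (needs /sbin/request-key):
    CONFIG_NFS_USE_NEW_IDMAPPER=y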
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index da7fda639eac..4776ff9e3814 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,5 +15,9 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 delegation.o idmap.o \ 15 delegation.o idmap.o \
16 callback.o callback_xdr.o callback_proc.o \ 16 callback.o callback_xdr.o callback_proc.o \
17 nfs4namespace.o 17 nfs4namespace.o
18nfs-$(CONFIG_NFS_V4_1) += pnfs.o
18nfs-$(CONFIG_SYSCTL) += sysctl.o 19nfs-$(CONFIG_SYSCTL) += sysctl.o
19nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 20nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
21
22obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
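The last two lines use kbuild's composite-object idiom: obj-$(CONFIG_...) names the module to build, and <module>-y lists the objects linked into it. The same pattern with hypothetical names:

    obj-$(CONFIG_FOO) += foo.o
    foo-y := foo_main.o foo_helper.o

With CONFIG_FOO=m this links foo_main.o and foo_helper.o into foo.ko; with CONFIG_FOO=y both go into the kernel image.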
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e17b49e2eabd..aeec017fe814 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -109,7 +109,7 @@ nfs4_callback_up(struct svc_serv *serv)
109{ 109{
110 int ret; 110 int ret;
111 111
112 ret = svc_create_xprt(serv, "tcp", PF_INET, 112 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
113 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 113 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
114 if (ret <= 0) 114 if (ret <= 0)
115 goto out_err; 115 goto out_err;
@@ -117,7 +117,7 @@ nfs4_callback_up(struct svc_serv *serv)
117 dprintk("NFS: Callback listener port = %u (af %u)\n", 117 dprintk("NFS: Callback listener port = %u (af %u)\n",
118 nfs_callback_tcpport, PF_INET); 118 nfs_callback_tcpport, PF_INET);
119 119
120 ret = svc_create_xprt(serv, "tcp", PF_INET6, 120 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
121 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 121 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
122 if (ret > 0) { 122 if (ret > 0) {
123 nfs_callback_tcpport6 = ret; 123 nfs_callback_tcpport6 = ret;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 930d10fecdaf..2950fca0c61b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -118,11 +118,11 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
118 if (delegation == NULL) 118 if (delegation == NULL)
119 return 0; 119 return 0;
120 120
121 /* seqid is 4-bytes long */ 121 if (stateid->stateid.seqid != 0)
122 if (((u32 *) &stateid->data)[0] != 0)
123 return 0; 122 return 0;
124 if (memcmp(&delegation->stateid.data[4], &stateid->data[4], 123 if (memcmp(&delegation->stateid.stateid.other,
125 sizeof(stateid->data)-4)) 124 &stateid->stateid.other,
125 NFS4_STATEID_OTHER_SIZE))
126 return 0; 126 return 0;
127 127
128 return 1; 128 return 1;
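The rewritten check compares named stateid fields rather than poking at raw byte offsets. A runnable sketch of the layout it relies on (the struct below is a hypothetical stand-in; per RFC 5661 a stateid is a 4-byte seqid followed by 12 invariant "other" bytes, and NFS4_STATEID_OTHER_SIZE is assumed to be 12):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define NFS4_STATEID_OTHER_SIZE 12

    struct stateid {
            uint32_t seqid;                        /* bumped on state changes */
            char other[NFS4_STATEID_OTHER_SIZE];   /* fixed for the state's life */
    };

    static int validate(const struct stateid *deleg, const struct stateid *recv)
    {
            if (recv->seqid != 0)   /* delegation stateids carry seqid 0 */
                    return 0;
            if (memcmp(deleg->other, recv->other, NFS4_STATEID_OTHER_SIZE))
                    return 0;
            return 1;
    }

    int main(void)
    {
            struct stateid d = { 7, "delegation!" }, r = { 0, "delegation!" };

            printf("%d\n", validate(&d, &r));   /* 1: the 12 bytes match */
            return 0;
    }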
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e7340729af89..0870d0d4efc0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -48,6 +48,7 @@
48#include "iostat.h" 48#include "iostat.h"
49#include "internal.h" 49#include "internal.h"
50#include "fscache.h" 50#include "fscache.h"
51#include "pnfs.h"
51 52
52#define NFSDBG_FACILITY NFSDBG_CLIENT 53#define NFSDBG_FACILITY NFSDBG_CLIENT
53 54
@@ -155,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
155 cred = rpc_lookup_machine_cred(); 156 cred = rpc_lookup_machine_cred();
156 if (!IS_ERR(cred)) 157 if (!IS_ERR(cred))
157 clp->cl_machine_cred = cred; 158 clp->cl_machine_cred = cred;
158 159#if defined(CONFIG_NFS_V4_1)
160 INIT_LIST_HEAD(&clp->cl_layouts);
161#endif
159 nfs_fscache_get_client_cookie(clp); 162 nfs_fscache_get_client_cookie(clp);
160 163
161 return clp; 164 return clp;
@@ -252,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp)
252 nfs_free_client(clp); 255 nfs_free_client(clp);
253 } 256 }
254} 257}
258EXPORT_SYMBOL_GPL(nfs_put_client);
255 259
256#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 260#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
257/* 261/*
@@ -601,6 +605,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
601{ 605{
602 struct rpc_clnt *clnt = NULL; 606 struct rpc_clnt *clnt = NULL;
603 struct rpc_create_args args = { 607 struct rpc_create_args args = {
608 .net = &init_net,
604 .protocol = clp->cl_proto, 609 .protocol = clp->cl_proto,
605 .address = (struct sockaddr *)&clp->cl_addr, 610 .address = (struct sockaddr *)&clp->cl_addr,
606 .addrsize = clp->cl_addrlen, 611 .addrsize = clp->cl_addrlen,
@@ -635,7 +640,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
635 */ 640 */
636static void nfs_destroy_server(struct nfs_server *server) 641static void nfs_destroy_server(struct nfs_server *server)
637{ 642{
638 if (!(server->flags & NFS_MOUNT_NONLM)) 643 if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) ||
644 !(server->flags & NFS_MOUNT_LOCAL_FCNTL))
639 nlmclnt_done(server->nlm_host); 645 nlmclnt_done(server->nlm_host);
640} 646}
641 647
@@ -657,7 +663,8 @@ static int nfs_start_lockd(struct nfs_server *server)
657 663
658 if (nlm_init.nfs_version > 3) 664 if (nlm_init.nfs_version > 3)
659 return 0; 665 return 0;
660 if (server->flags & NFS_MOUNT_NONLM) 666 if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
667 (server->flags & NFS_MOUNT_LOCAL_FCNTL))
661 return 0; 668 return 0;
662 669
663 switch (clp->cl_proto) { 670 switch (clp->cl_proto) {
@@ -898,11 +905,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
898 if (server->wsize > NFS_MAX_FILE_IO_SIZE) 905 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
899 server->wsize = NFS_MAX_FILE_IO_SIZE; 906 server->wsize = NFS_MAX_FILE_IO_SIZE;
900 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 907 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
908 set_pnfs_layoutdriver(server, fsinfo->layouttype);
909
901 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); 910 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
902 911
903 server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); 912 server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
904 if (server->dtsize > PAGE_CACHE_SIZE) 913 if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
905 server->dtsize = PAGE_CACHE_SIZE; 914 server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
906 if (server->dtsize > server->rsize) 915 if (server->dtsize > server->rsize)
907 server->dtsize = server->rsize; 916 server->dtsize = server->rsize;
908 917
@@ -913,6 +922,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
913 922
914 server->maxfilesize = fsinfo->maxfilesize; 923 server->maxfilesize = fsinfo->maxfilesize;
915 924
925 server->time_delta = fsinfo->time_delta;
926
916 /* We're airborne. Set socket buffersize */ 927 /* We're airborne. Set socket buffersize */
917 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); 928 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
918} 929}
@@ -935,6 +946,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
935 } 946 }
936 947
937 fsinfo.fattr = fattr; 948 fsinfo.fattr = fattr;
949 fsinfo.layouttype = 0;
938 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); 950 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
939 if (error < 0) 951 if (error < 0)
940 goto out_error; 952 goto out_error;
@@ -1017,6 +1029,7 @@ void nfs_free_server(struct nfs_server *server)
1017{ 1029{
1018 dprintk("--> nfs_free_server()\n"); 1030 dprintk("--> nfs_free_server()\n");
1019 1031
1032 unset_pnfs_layoutdriver(server);
1020 spin_lock(&nfs_client_lock); 1033 spin_lock(&nfs_client_lock);
1021 list_del(&server->client_link); 1034 list_del(&server->client_link);
1022 list_del(&server->master_link); 1035 list_del(&server->master_link);
@@ -1356,8 +1369,9 @@ static int nfs4_init_server(struct nfs_server *server,
1356 1369
1357 /* Initialise the client representation from the mount data */ 1370 /* Initialise the client representation from the mount data */
1358 server->flags = data->flags; 1371 server->flags = data->flags;
1359 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR| 1372 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK;
1360 NFS_CAP_POSIX_LOCK; 1373 if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
1374 server->caps |= NFS_CAP_READDIRPLUS;
1361 server->options = data->options; 1375 server->options = data->options;
1362 1376
1363 /* Get a client record */ 1377 /* Get a client record */
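The new dtsize clamp is easiest to see with numbers. Assuming 4 KiB pages and NFS_MAX_READDIR_PAGES == 8 (its value is not shown in this diff), the readdir transfer size may now grow to 32 KiB instead of a single page, still bounded by rsize. A runnable sketch of the arithmetic:

    /* Hypothetical check of the new dtsize clamp
     * (assumes 4 KiB pages and NFS_MAX_READDIR_PAGES == 8). */
    #include <stdio.h>

    #define PAGE_CACHE_SIZE       4096UL
    #define NFS_MAX_READDIR_PAGES 8

    int main(void)
    {
            unsigned long dtsize = 65536;   /* server-preferred dtpref */
            unsigned long rsize  = 32768;

            if (dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
                    dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
            if (dtsize > rsize)
                    dtsize = rsize;

            printf("dtsize = %lu\n", dtsize);   /* 32768 */
            return 0;
    }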
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index b9c3c43cea1d..232a7eead33a 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -71,20 +71,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
71 if (inode->i_flock == NULL) 71 if (inode->i_flock == NULL)
72 goto out; 72 goto out;
73 73
74 /* Protect inode->i_flock using the BKL */ 74 /* Protect inode->i_flock using the file locks lock */
75 lock_kernel(); 75 lock_flocks();
76 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 76 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
77 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 77 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
78 continue; 78 continue;
79 if (nfs_file_open_context(fl->fl_file) != ctx) 79 if (nfs_file_open_context(fl->fl_file) != ctx)
80 continue; 80 continue;
81 unlock_kernel(); 81 unlock_flocks();
82 status = nfs4_lock_delegation_recall(state, fl); 82 status = nfs4_lock_delegation_recall(state, fl);
83 if (status < 0) 83 if (status < 0)
84 goto out; 84 goto out;
85 lock_kernel(); 85 lock_flocks();
86 } 86 }
87 unlock_kernel(); 87 unlock_flocks();
88out: 88out:
89 return status; 89 return status;
90} 90}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e257172d438c..07ac3847e562 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,11 +33,12 @@
33#include <linux/namei.h> 33#include <linux/namei.h>
34#include <linux/mount.h> 34#include <linux/mount.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/vmalloc.h>
36 37
37#include "nfs4_fs.h"
38#include "delegation.h" 38#include "delegation.h"
39#include "iostat.h" 39#include "iostat.h"
40#include "internal.h" 40#include "internal.h"
41#include "fscache.h"
41 42
42/* #define NFS_DEBUG_VERBOSE 1 */ 43/* #define NFS_DEBUG_VERBOSE 1 */
43 44
@@ -55,6 +56,7 @@ static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 56 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, int); 57static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 58static loff_t nfs_llseek_dir(struct file *, loff_t, int);
59static int nfs_readdir_clear_array(struct page*, gfp_t);
58 60
59const struct file_operations nfs_dir_operations = { 61const struct file_operations nfs_dir_operations = {
60 .llseek = nfs_llseek_dir, 62 .llseek = nfs_llseek_dir,
@@ -80,6 +82,10 @@ const struct inode_operations nfs_dir_inode_operations = {
80 .setattr = nfs_setattr, 82 .setattr = nfs_setattr,
81}; 83};
82 84
85const struct address_space_operations nfs_dir_addr_space_ops = {
86 .releasepage = nfs_readdir_clear_array,
87};
88
83#ifdef CONFIG_NFS_V3 89#ifdef CONFIG_NFS_V3
84const struct inode_operations nfs3_dir_inode_operations = { 90const struct inode_operations nfs3_dir_inode_operations = {
85 .create = nfs_create, 91 .create = nfs_create,
@@ -104,8 +110,9 @@ const struct inode_operations nfs3_dir_inode_operations = {
104#ifdef CONFIG_NFS_V4 110#ifdef CONFIG_NFS_V4
105 111
106static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); 112static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
113static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd);
107const struct inode_operations nfs4_dir_inode_operations = { 114const struct inode_operations nfs4_dir_inode_operations = {
108 .create = nfs_create, 115 .create = nfs_open_create,
109 .lookup = nfs_atomic_lookup, 116 .lookup = nfs_atomic_lookup,
110 .link = nfs_link, 117 .link = nfs_link,
111 .unlink = nfs_unlink, 118 .unlink = nfs_unlink,
@@ -150,51 +157,197 @@ nfs_opendir(struct inode *inode, struct file *filp)
150 return res; 157 return res;
151} 158}
152 159
153typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int); 160struct nfs_cache_array_entry {
161 u64 cookie;
162 u64 ino;
163 struct qstr string;
164};
165
166struct nfs_cache_array {
167 unsigned int size;
168 int eof_index;
169 u64 last_cookie;
170 struct nfs_cache_array_entry array[0];
171};
172
173#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
174
175typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
154typedef struct { 176typedef struct {
155 struct file *file; 177 struct file *file;
156 struct page *page; 178 struct page *page;
157 unsigned long page_index; 179 unsigned long page_index;
158 __be32 *ptr;
159 u64 *dir_cookie; 180 u64 *dir_cookie;
160 loff_t current_index; 181 loff_t current_index;
161 struct nfs_entry *entry;
162 decode_dirent_t decode; 182 decode_dirent_t decode;
163 int plus; 183
164 unsigned long timestamp; 184 unsigned long timestamp;
165 unsigned long gencount; 185 unsigned long gencount;
166 int timestamp_valid; 186 unsigned int cache_entry_index;
187 unsigned int plus:1;
188 unsigned int eof:1;
167} nfs_readdir_descriptor_t; 189} nfs_readdir_descriptor_t;
168 190
169/* Now we cache directories properly, by stuffing the dirent 191/*
170 * data directly in the page cache. 192 * The caller is responsible for calling nfs_readdir_release_array(page)
171 *
172 * Inode invalidation due to refresh etc. takes care of
173 * _everything_, no sloppy entry flushing logic, no extraneous
174 * copying, network direct to page cache, the way it was meant
175 * to be.
176 *
177 * NOTE: Dirent information verification is done always by the
178 * page-in of the RPC reply, nowhere else, this simplifies
179 * things substantially.
180 */ 193 */
181static 194static
182int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) 195struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
196{
197 if (page == NULL)
198 return ERR_PTR(-EIO);
199 return (struct nfs_cache_array *)kmap(page);
200}
201
202static
203void nfs_readdir_release_array(struct page *page)
204{
205 kunmap(page);
206}
207
208/*
209 * we are freeing strings created by nfs_add_to_readdir_array()
210 */
211static
212int nfs_readdir_clear_array(struct page *page, gfp_t mask)
213{
214 struct nfs_cache_array *array = nfs_readdir_get_array(page);
215 int i;
216 for (i = 0; i < array->size; i++)
217 kfree(array->array[i].string.name);
218 nfs_readdir_release_array(page);
219 return 0;
220}
221
222/*
223 * the caller is responsible for freeing qstr.name
224 * when called by nfs_readdir_add_to_array, the strings will be freed in
225 * nfs_readdir_clear_array()
226 */
227static
228int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
229{
230 string->len = len;
231 string->name = kmemdup(name, len, GFP_KERNEL);
232 if (string->name == NULL)
233 return -ENOMEM;
234 string->hash = full_name_hash(name, len);
235 return 0;
236}
237
238static
239int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
240{
241 struct nfs_cache_array *array = nfs_readdir_get_array(page);
242 struct nfs_cache_array_entry *cache_entry;
243 int ret;
244
245 if (IS_ERR(array))
246 return PTR_ERR(array);
247 ret = -EIO;
248 if (array->size >= MAX_READDIR_ARRAY)
249 goto out;
250
251 cache_entry = &array->array[array->size];
252 cache_entry->cookie = entry->prev_cookie;
253 cache_entry->ino = entry->ino;
254 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
255 if (ret)
256 goto out;
257 array->last_cookie = entry->cookie;
258 if (entry->eof == 1)
259 array->eof_index = array->size;
260 array->size++;
261out:
262 nfs_readdir_release_array(page);
263 return ret;
264}
265
266static
267int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
268{
269 loff_t diff = desc->file->f_pos - desc->current_index;
270 unsigned int index;
271
272 if (diff < 0)
273 goto out_eof;
274 if (diff >= array->size) {
275 if (array->eof_index > 0)
276 goto out_eof;
277 desc->current_index += array->size;
278 return -EAGAIN;
279 }
280
281 index = (unsigned int)diff;
282 *desc->dir_cookie = array->array[index].cookie;
283 desc->cache_entry_index = index;
284 if (index == array->eof_index)
285 desc->eof = 1;
286 return 0;
287out_eof:
288 desc->eof = 1;
289 return -EBADCOOKIE;
290}
291
292static
293int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
294{
295 int i;
296 int status = -EAGAIN;
297
298 for (i = 0; i < array->size; i++) {
299 if (i == array->eof_index) {
300 desc->eof = 1;
301 status = -EBADCOOKIE;
302 }
303 if (array->array[i].cookie == *desc->dir_cookie) {
304 desc->cache_entry_index = i;
305 status = 0;
306 break;
307 }
308 }
309
310 return status;
311}
312
313static
314int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
315{
316 struct nfs_cache_array *array;
317 int status = -EBADCOOKIE;
318
319 if (desc->dir_cookie == NULL)
320 goto out;
321
322 array = nfs_readdir_get_array(desc->page);
323 if (IS_ERR(array)) {
324 status = PTR_ERR(array);
325 goto out;
326 }
327
328 if (*desc->dir_cookie == 0)
329 status = nfs_readdir_search_for_pos(array, desc);
330 else
331 status = nfs_readdir_search_for_cookie(array, desc);
332
333 nfs_readdir_release_array(desc->page);
334out:
335 return status;
336}
337
338/* Fill a page with xdr information before transferring to the cache page */
339static
340int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
341 struct nfs_entry *entry, struct file *file, struct inode *inode)
183{ 342{
184 struct file *file = desc->file;
185 struct inode *inode = file->f_path.dentry->d_inode;
186 struct rpc_cred *cred = nfs_file_cred(file); 343 struct rpc_cred *cred = nfs_file_cred(file);
187 unsigned long timestamp, gencount; 344 unsigned long timestamp, gencount;
188 int error; 345 int error;
189 346
190 dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
191 __func__, (long long)desc->entry->cookie,
192 page->index);
193
194 again: 347 again:
195 timestamp = jiffies; 348 timestamp = jiffies;
196 gencount = nfs_inc_attr_generation_counter(); 349 gencount = nfs_inc_attr_generation_counter();
197 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, 350 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
198 NFS_SERVER(inode)->dtsize, desc->plus); 351 NFS_SERVER(inode)->dtsize, desc->plus);
199 if (error < 0) { 352 if (error < 0) {
200 /* We requested READDIRPLUS, but the server doesn't grok it */ 353 /* We requested READDIRPLUS, but the server doesn't grok it */
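MAX_READDIR_ARRAY above is simply "however many entries fit in one page after the header". A runnable back-of-the-envelope check (64-bit figures and 4 KiB pages assumed; the qstr below is a simplified stand-in for the kernel's):

    #include <stdio.h>
    #include <stdint.h>

    struct qstr {                 /* simplified: 16 bytes on LP64 */
            uint32_t hash, len;
            const unsigned char *name;
    };

    struct nfs_cache_array_entry {   /* 32 bytes on LP64 */
            uint64_t cookie;
            uint64_t ino;
            struct qstr string;
    };

    struct nfs_cache_array {         /* 16-byte header on LP64 */
            unsigned int size;
            int eof_index;
            uint64_t last_cookie;
            struct nfs_cache_array_entry array[];
    };

    int main(void)
    {
            size_t page = 4096;
            size_t max = (page - sizeof(struct nfs_cache_array)) /
                         sizeof(struct nfs_cache_array_entry);

            printf("%zu entries per page\n", max);   /* ~127 on LP64 */
            return 0;
    }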
@@ -208,190 +361,292 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
208 } 361 }
209 desc->timestamp = timestamp; 362 desc->timestamp = timestamp;
210 desc->gencount = gencount; 363 desc->gencount = gencount;
211 desc->timestamp_valid = 1; 364error:
212 SetPageUptodate(page); 365 return error;
213 /* Ensure consistent page alignment of the data.
214 * Note: assumes we have exclusive access to this mapping either
215 * through inode->i_mutex or some other mechanism.
216 */
217 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
218 /* Should never happen */
219 nfs_zap_mapping(inode, inode->i_mapping);
220 }
221 unlock_page(page);
222 return 0;
223 error:
224 unlock_page(page);
225 return -EIO;
226} 366}
227 367
228static inline 368/* Fill in an entry based on the xdr code stored in desc->page */
229int dir_decode(nfs_readdir_descriptor_t *desc) 369static
370int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
230{ 371{
231 __be32 *p = desc->ptr; 372 __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
232 p = desc->decode(p, desc->entry, desc->plus);
233 if (IS_ERR(p)) 373 if (IS_ERR(p))
234 return PTR_ERR(p); 374 return PTR_ERR(p);
235 desc->ptr = p; 375
236 if (desc->timestamp_valid) { 376 entry->fattr->time_start = desc->timestamp;
237 desc->entry->fattr->time_start = desc->timestamp; 377 entry->fattr->gencount = desc->gencount;
238 desc->entry->fattr->gencount = desc->gencount;
239 } else
240 desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
241 return 0; 378 return 0;
242} 379}
243 380
244static inline 381static
245void dir_page_release(nfs_readdir_descriptor_t *desc) 382int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
246{ 383{
247 kunmap(desc->page); 384 struct nfs_inode *node;
248 page_cache_release(desc->page); 385 if (dentry->d_inode == NULL)
249 desc->page = NULL; 386 goto different;
250 desc->ptr = NULL; 387 node = NFS_I(dentry->d_inode);
388 if (node->fh.size != entry->fh->size)
389 goto different;
390 if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
391 goto different;
392 return 1;
393different:
394 return 0;
251} 395}
252 396
253/* 397static
254 * Given a pointer to a buffer that has already been filled by a call 398void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
255 * to readdir, find the next entry with cookie '*desc->dir_cookie'.
256 *
257 * If the end of the buffer has been reached, return -EAGAIN, if not,
258 * return the offset within the buffer of the next entry to be
259 * read.
260 */
261static inline
262int find_dirent(nfs_readdir_descriptor_t *desc)
263{ 399{
264 struct nfs_entry *entry = desc->entry; 400 struct qstr filename = {
265 int loop_count = 0, 401 .len = entry->len,
266 status; 402 .name = entry->name,
403 };
404 struct dentry *dentry;
405 struct dentry *alias;
406 struct inode *dir = parent->d_inode;
407 struct inode *inode;
267 408
268 while((status = dir_decode(desc)) == 0) { 409 if (filename.name[0] == '.') {
269 dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n", 410 if (filename.len == 1)
270 __func__, (unsigned long long)entry->cookie); 411 return;
271 if (entry->prev_cookie == *desc->dir_cookie) 412 if (filename.len == 2 && filename.name[1] == '.')
272 break; 413 return;
273 if (loop_count++ > 200) { 414 }
274 loop_count = 0; 415 filename.hash = full_name_hash(filename.name, filename.len);
275 schedule(); 416
417 dentry = d_lookup(parent, &filename);
418 if (dentry != NULL) {
419 if (nfs_same_file(dentry, entry)) {
420 nfs_refresh_inode(dentry->d_inode, entry->fattr);
421 goto out;
422 } else {
423 d_drop(dentry);
424 dput(dentry);
276 } 425 }
277 } 426 }
278 return status; 427
428 dentry = d_alloc(parent, &filename);
429 if (dentry == NULL)
430 return;
431
432 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
433 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
434 if (IS_ERR(inode))
435 goto out;
436
437 alias = d_materialise_unique(dentry, inode);
438 if (IS_ERR(alias))
439 goto out;
440 else if (alias) {
441 nfs_set_verifier(alias, nfs_save_change_attribute(dir));
442 dput(alias);
443 } else
444 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
445
446out:
447 dput(dentry);
448}
449
450/* Perform conversion from xdr to cache array */
451static
452void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
453 void *xdr_page, struct page *page, unsigned int buflen)
454{
455 struct xdr_stream stream;
456 struct xdr_buf buf;
457 __be32 *ptr = xdr_page;
458 int status;
459 struct nfs_cache_array *array;
460
461 buf.head->iov_base = xdr_page;
462 buf.head->iov_len = buflen;
463 buf.tail->iov_len = 0;
464 buf.page_base = 0;
465 buf.page_len = 0;
466 buf.buflen = buf.head->iov_len;
467 buf.len = buf.head->iov_len;
468
469 xdr_init_decode(&stream, &buf, ptr);
470
471
472 do {
473 status = xdr_decode(desc, entry, &stream);
474 if (status != 0)
475 break;
476
477 if (nfs_readdir_add_to_array(entry, page) != 0)
478 break;
479 if (desc->plus == 1)
480 nfs_prime_dcache(desc->file->f_path.dentry, entry);
481 } while (!entry->eof);
482
483 if (status == -EBADCOOKIE && entry->eof) {
484 array = nfs_readdir_get_array(page);
485 array->eof_index = array->size - 1;
486 status = 0;
487 nfs_readdir_release_array(page);
488 }
489}
490
491static
492void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
493{
494 unsigned int i;
495 for (i = 0; i < npages; i++)
496 put_page(pages[i]);
497}
498
499static
500void nfs_readdir_free_large_page(void *ptr, struct page **pages,
501 unsigned int npages)
502{
503 vm_unmap_ram(ptr, npages);
504 nfs_readdir_free_pagearray(pages, npages);
279} 505}
280 506
281/* 507/*
282 * Given a pointer to a buffer that has already been filled by a call 508 * nfs_readdir_large_page will allocate pages that must be freed with a call
283 * to readdir, find the entry at offset 'desc->file->f_pos'. 509 * to nfs_readdir_free_large_page
284 *
285 * If the end of the buffer has been reached, return -EAGAIN, if not,
286 * return the offset within the buffer of the next entry to be
287 * read.
288 */ 510 */
289static inline 511static
290int find_dirent_index(nfs_readdir_descriptor_t *desc) 512void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
291{ 513{
292 struct nfs_entry *entry = desc->entry; 514 void *ptr;
293 int loop_count = 0, 515 unsigned int i;
294 status; 516
517 for (i = 0; i < npages; i++) {
518 struct page *page = alloc_page(GFP_KERNEL);
519 if (page == NULL)
520 goto out_freepages;
521 pages[i] = page;
522 }
295 523
296 for(;;) { 524 ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
297 status = dir_decode(desc); 525 if (!IS_ERR_OR_NULL(ptr))
298 if (status) 526 return ptr;
299 break; 527out_freepages:
528 nfs_readdir_free_pagearray(pages, i);
529 return NULL;
530}
531
532static
533int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
534{
535 struct page *pages[NFS_MAX_READDIR_PAGES];
536 void *pages_ptr = NULL;
537 struct nfs_entry entry;
538 struct file *file = desc->file;
539 struct nfs_cache_array *array;
540 int status = 0;
541 unsigned int array_size = ARRAY_SIZE(pages);
542
543 entry.prev_cookie = 0;
544 entry.cookie = *desc->dir_cookie;
545 entry.eof = 0;
546 entry.fh = nfs_alloc_fhandle();
547 entry.fattr = nfs_alloc_fattr();
548 if (entry.fh == NULL || entry.fattr == NULL)
549 goto out;
300 550
301 dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n", 551 array = nfs_readdir_get_array(page);
302 (unsigned long long)entry->cookie, desc->current_index); 552 memset(array, 0, sizeof(struct nfs_cache_array));
553 array->eof_index = -1;
303 554
304 if (desc->file->f_pos == desc->current_index) { 555 pages_ptr = nfs_readdir_large_page(pages, array_size);
305 *desc->dir_cookie = entry->cookie; 556 if (!pages_ptr)
557 goto out_release_array;
558 do {
559 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
560
561 if (status < 0)
306 break; 562 break;
307 } 563 nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE);
308 desc->current_index++; 564 } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
309 if (loop_count++ > 200) { 565
310 loop_count = 0; 566 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
311 schedule(); 567out_release_array:
312 } 568 nfs_readdir_release_array(page);
313 } 569out:
570 nfs_free_fattr(entry.fattr);
571 nfs_free_fhandle(entry.fh);
314 return status; 572 return status;
315} 573}
316 574
317/* 575/*
318 * Find the given page, and call find_dirent() or find_dirent_index in 576 * Now we cache directories properly, by converting xdr information
319 * order to try to return the next entry. 577 * to an array that can be used for lookups later. This results in
578 * fewer cache pages, since we can store more information on each page.
579 * We only need to convert from xdr once, so future lookups are much simpler.
320 */ 580 */
321static inline 581static
322int find_dirent_page(nfs_readdir_descriptor_t *desc) 582int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
323{ 583{
324 struct inode *inode = desc->file->f_path.dentry->d_inode; 584 struct inode *inode = desc->file->f_path.dentry->d_inode;
325 struct page *page;
326 int status;
327 585
328 dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n", 586 if (nfs_readdir_xdr_to_array(desc, page, inode) < 0)
329 __func__, desc->page_index, 587 goto error;
330 (long long) *desc->dir_cookie); 588 SetPageUptodate(page);
331 589
332 /* If we find the page in the page_cache, we cannot be sure 590 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
333 * how fresh the data is, so we will ignore readdir_plus attributes. 591 /* Should never happen */
334 */ 592 nfs_zap_mapping(inode, inode->i_mapping);
335 desc->timestamp_valid = 0;
336 page = read_cache_page(inode->i_mapping, desc->page_index,
337 (filler_t *)nfs_readdir_filler, desc);
338 if (IS_ERR(page)) {
339 status = PTR_ERR(page);
340 goto out;
341 } 593 }
594 unlock_page(page);
595 return 0;
596 error:
597 unlock_page(page);
598 return -EIO;
599}
342 600
343 /* NOTE: Someone else may have changed the READDIRPLUS flag */ 601static
344 desc->page = page; 602void cache_page_release(nfs_readdir_descriptor_t *desc)
345 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ 603{
346 if (*desc->dir_cookie != 0) 604 page_cache_release(desc->page);
347 status = find_dirent(desc); 605 desc->page = NULL;
348 else 606}
349 status = find_dirent_index(desc); 607
350 if (status < 0) 608static
351 dir_page_release(desc); 609struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
352 out: 610{
353 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); 611 struct page *page;
354 return status; 612 page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
613 desc->page_index, (filler_t *)nfs_readdir_filler, desc);
614 if (IS_ERR(page))
615 desc->eof = 1;
616 return page;
355} 617}
356 618
357/* 619/*
358 * Recurse through the page cache pages, and return a 620 * Returns 0 if desc->dir_cookie was found on page desc->page_index
359 * filled nfs_entry structure of the next directory entry if possible.
360 *
361 * The target for the search is '*desc->dir_cookie' if non-0,
362 * 'desc->file->f_pos' otherwise
363 */ 621 */
622static
623int find_cache_page(nfs_readdir_descriptor_t *desc)
624{
625 int res;
626
627 desc->page = get_cache_page(desc);
628 if (IS_ERR(desc->page))
629 return PTR_ERR(desc->page);
630
631 res = nfs_readdir_search_array(desc);
632 if (res == 0)
633 return 0;
634 cache_page_release(desc);
635 return res;
636}
637
638/* Search for desc->dir_cookie from the beginning of the page cache */
364static inline 639static inline
365int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) 640int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
366{ 641{
367 int loop_count = 0; 642 int res = -EAGAIN;
368 int res;
369
370 /* Always search-by-index from the beginning of the cache */
371 if (*desc->dir_cookie == 0) {
372 dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n",
373 (long long)desc->file->f_pos);
374 desc->page_index = 0;
375 desc->entry->cookie = desc->entry->prev_cookie = 0;
376 desc->entry->eof = 0;
377 desc->current_index = 0;
378 } else
379 dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n",
380 (unsigned long long)*desc->dir_cookie);
381 643
382 for (;;) { 644 while (1) {
383 res = find_dirent_page(desc); 645 res = find_cache_page(desc);
384 if (res != -EAGAIN) 646 if (res != -EAGAIN)
385 break; 647 break;
386 /* Align to beginning of next page */ 648 desc->page_index++;
387 desc->page_index ++;
388 if (loop_count++ > 200) {
389 loop_count = 0;
390 schedule();
391 }
392 } 649 }
393
394 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res);
395 return res; 650 return res;
396} 651}
397 652
@@ -400,8 +655,6 @@ static inline unsigned int dt_type(struct inode *inode)
400 return (inode->i_mode >> 12) & 15; 655 return (inode->i_mode >> 12) & 15;
401} 656}
402 657
403static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc);
404
405/* 658/*
406 * Once we've found the start of the dirent within a page: fill 'er up... 659 * Once we've found the start of the dirent within a page: fill 'er up...
407 */ 660 */
@@ -410,49 +663,36 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
410 filldir_t filldir) 663 filldir_t filldir)
411{ 664{
412 struct file *file = desc->file; 665 struct file *file = desc->file;
413 struct nfs_entry *entry = desc->entry; 666 int i = 0;
414 struct dentry *dentry = NULL; 667 int res = 0;
415 u64 fileid; 668 struct nfs_cache_array *array = NULL;
416 int loop_count = 0, 669 unsigned int d_type = DT_UNKNOWN;
417 res; 670 struct dentry *dentry = NULL;
418
419 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n",
420 (unsigned long long)entry->cookie);
421
422 for(;;) {
423 unsigned d_type = DT_UNKNOWN;
424 /* Note: entry->prev_cookie contains the cookie for
425 * retrieving the current dirent on the server */
426 fileid = entry->ino;
427
428 /* Get a dentry if we have one */
429 if (dentry != NULL)
430 dput(dentry);
431 dentry = nfs_readdir_lookup(desc);
432 671
433 /* Use readdirplus info */ 672 array = nfs_readdir_get_array(desc->page);
434 if (dentry != NULL && dentry->d_inode != NULL) {
435 d_type = dt_type(dentry->d_inode);
436 fileid = NFS_FILEID(dentry->d_inode);
437 }
438 673
439 res = filldir(dirent, entry->name, entry->len, 674 for (i = desc->cache_entry_index; i < array->size; i++) {
440 file->f_pos, nfs_compat_user_ino64(fileid), 675 d_type = DT_UNKNOWN;
441 d_type); 676
677 res = filldir(dirent, array->array[i].string.name,
678 array->array[i].string.len, file->f_pos,
679 nfs_compat_user_ino64(array->array[i].ino), d_type);
442 if (res < 0) 680 if (res < 0)
443 break; 681 break;
444 file->f_pos++; 682 file->f_pos++;
445 *desc->dir_cookie = entry->cookie; 683 desc->cache_entry_index = i;
446 if (dir_decode(desc) != 0) { 684 if (i < (array->size-1))
447 desc->page_index ++; 685 *desc->dir_cookie = array->array[i+1].cookie;
686 else
687 *desc->dir_cookie = array->last_cookie;
688 if (i == array->eof_index) {
689 desc->eof = 1;
448 break; 690 break;
449 } 691 }
450 if (loop_count++ > 200) {
451 loop_count = 0;
452 schedule();
453 }
454 } 692 }
455 dir_page_release(desc); 693
694 nfs_readdir_release_array(desc->page);
695 cache_page_release(desc);
456 if (dentry != NULL) 696 if (dentry != NULL)
457 dput(dentry); 697 dput(dentry);
458 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", 698 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
@@ -476,12 +716,9 @@ static inline
476int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, 716int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
477 filldir_t filldir) 717 filldir_t filldir)
478{ 718{
479 struct file *file = desc->file;
480 struct inode *inode = file->f_path.dentry->d_inode;
481 struct rpc_cred *cred = nfs_file_cred(file);
482 struct page *page = NULL; 719 struct page *page = NULL;
483 int status; 720 int status;
484 unsigned long timestamp, gencount; 721 struct inode *inode = desc->file->f_path.dentry->d_inode;
485 722
486 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 723 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
487 (unsigned long long)*desc->dir_cookie); 724 (unsigned long long)*desc->dir_cookie);
@@ -491,38 +728,22 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
491 status = -ENOMEM; 728 status = -ENOMEM;
492 goto out; 729 goto out;
493 } 730 }
494 timestamp = jiffies; 731
495 gencount = nfs_inc_attr_generation_counter(); 732 if (nfs_readdir_xdr_to_array(desc, page, inode) < 0) {
496 status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
497 *desc->dir_cookie, page,
498 NFS_SERVER(inode)->dtsize,
499 desc->plus);
500 desc->page = page;
501 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
502 if (status >= 0) {
503 desc->timestamp = timestamp;
504 desc->gencount = gencount;
505 desc->timestamp_valid = 1;
506 if ((status = dir_decode(desc)) == 0)
507 desc->entry->prev_cookie = *desc->dir_cookie;
508 } else
509 status = -EIO; 733 status = -EIO;
510 if (status < 0)
511 goto out_release; 734 goto out_release;
735 }
512 736
737 desc->page_index = 0;
738 desc->page = page;
513 status = nfs_do_filldir(desc, dirent, filldir); 739 status = nfs_do_filldir(desc, dirent, filldir);
514 740
515 /* Reset read descriptor so it searches the page cache from
516 * the start upon the next call to readdir_search_pagecache() */
517 desc->page_index = 0;
518 desc->entry->cookie = desc->entry->prev_cookie = 0;
519 desc->entry->eof = 0;
520 out: 741 out:
521 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", 742 dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
522 __func__, status); 743 __func__, status);
523 return status; 744 return status;
524 out_release: 745 out_release:
525 dir_page_release(desc); 746 cache_page_release(desc);
526 goto out; 747 goto out;
527} 748}
528 749
@@ -536,7 +757,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
536 struct inode *inode = dentry->d_inode; 757 struct inode *inode = dentry->d_inode;
537 nfs_readdir_descriptor_t my_desc, 758 nfs_readdir_descriptor_t my_desc,
538 *desc = &my_desc; 759 *desc = &my_desc;
539 struct nfs_entry my_entry;
540 int res = -ENOMEM; 760 int res = -ENOMEM;
541 761
542 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 762 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
@@ -557,26 +777,17 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
557 desc->decode = NFS_PROTO(inode)->decode_dirent; 777 desc->decode = NFS_PROTO(inode)->decode_dirent;
558 desc->plus = NFS_USE_READDIRPLUS(inode); 778 desc->plus = NFS_USE_READDIRPLUS(inode);
559 779
560 my_entry.cookie = my_entry.prev_cookie = 0;
561 my_entry.eof = 0;
562 my_entry.fh = nfs_alloc_fhandle();
563 my_entry.fattr = nfs_alloc_fattr();
564 if (my_entry.fh == NULL || my_entry.fattr == NULL)
565 goto out_alloc_failed;
566
567 desc->entry = &my_entry;
568
569 nfs_block_sillyrename(dentry); 780 nfs_block_sillyrename(dentry);
570 res = nfs_revalidate_mapping(inode, filp->f_mapping); 781 res = nfs_revalidate_mapping(inode, filp->f_mapping);
571 if (res < 0) 782 if (res < 0)
572 goto out; 783 goto out;
573 784
574 while(!desc->entry->eof) { 785 while (desc->eof != 1) {
575 res = readdir_search_pagecache(desc); 786 res = readdir_search_pagecache(desc);
576 787
577 if (res == -EBADCOOKIE) { 788 if (res == -EBADCOOKIE) {
578 /* This means either end of directory */ 789 /* This means either end of directory */
579 if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) { 790 if (*desc->dir_cookie && desc->eof == 0) {
580 /* Or that the server has 'lost' a cookie */ 791 /* Or that the server has 'lost' a cookie */
581 res = uncached_readdir(desc, dirent, filldir); 792 res = uncached_readdir(desc, dirent, filldir);
582 if (res >= 0) 793 if (res >= 0)
@@ -588,8 +799,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
588 if (res == -ETOOSMALL && desc->plus) { 799 if (res == -ETOOSMALL && desc->plus) {
589 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 800 clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
590 nfs_zap_caches(inode); 801 nfs_zap_caches(inode);
802 desc->page_index = 0;
591 desc->plus = 0; 803 desc->plus = 0;
592 desc->entry->eof = 0; 804 desc->eof = 0;
593 continue; 805 continue;
594 } 806 }
595 if (res < 0) 807 if (res < 0)
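The ETOOSMALL branch is a feature-degradation retry: clear the READDIRPLUS hint, rewind to the first cache page, and run the loop again with plain READDIR. A minimal standalone sketch of that control flow (do_readdir() is a made-up stand-in; 525 matches the kernel's ETOOSMALL value):

    #include <stdio.h>

    #define ETOOSMALL 525

    static int do_readdir(int plus)
    {
            return plus ? -ETOOSMALL : 0;   /* pretend the server rejects plus */
    }

    int main(void)
    {
            int plus = 1, page_index = 0, eof = 0, res;

            while (!eof) {
                    res = do_readdir(plus);
                    if (res == -ETOOSMALL && plus) {
                            plus = 0;          /* fall back to plain READDIR */
                            page_index = 0;    /* and refill from the start */
                            continue;
                    }
                    eof = 1;                   /* normal completion path */
            }
            printf("res=%d plus=%d page=%d\n", res, plus, page_index);
            return 0;
    }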
@@ -605,9 +817,6 @@ out:
605 nfs_unblock_sillyrename(dentry); 817 nfs_unblock_sillyrename(dentry);
606 if (res > 0) 818 if (res > 0)
607 res = 0; 819 res = 0;
608out_alloc_failed:
609 nfs_free_fattr(my_entry.fattr);
610 nfs_free_fhandle(my_entry.fh);
611 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", 820 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
612 dentry->d_parent->d_name.name, dentry->d_name.name, 821 dentry->d_parent->d_name.name, dentry->d_name.name,
613 res); 822 res);
@@ -1029,10 +1238,63 @@ static int is_atomic_open(struct nameidata *nd)
1029 return 1; 1238 return 1;
1030} 1239}
1031 1240
1241static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd)
1242{
1243 struct path path = {
1244 .mnt = nd->path.mnt,
1245 .dentry = dentry,
1246 };
1247 struct nfs_open_context *ctx;
1248 struct rpc_cred *cred;
1249 fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
1250
1251 cred = rpc_lookup_cred();
1252 if (IS_ERR(cred))
1253 return ERR_CAST(cred);
1254 ctx = alloc_nfs_open_context(&path, cred, fmode);
1255 put_rpccred(cred);
1256 if (ctx == NULL)
1257 return ERR_PTR(-ENOMEM);
1258 return ctx;
1259}
1260
1261static int do_open(struct inode *inode, struct file *filp)
1262{
1263 nfs_fscache_set_inode_cookie(inode, filp);
1264 return 0;
1265}
1266
1267static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx)
1268{
1269 struct file *filp;
1270 int ret = 0;
1271
1272 /* If the open_intent is for execute, we have an extra check to make */
1273 if (ctx->mode & FMODE_EXEC) {
1274 ret = nfs_may_open(ctx->path.dentry->d_inode,
1275 ctx->cred,
1276 nd->intent.open.flags);
1277 if (ret < 0)
1278 goto out;
1279 }
1280 filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open);
1281 if (IS_ERR(filp))
1282 ret = PTR_ERR(filp);
1283 else
1284 nfs_file_set_open_context(filp, ctx);
1285out:
1286 put_nfs_open_context(ctx);
1287 return ret;
1288}
1289
1032static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 1290static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1033{ 1291{
1292 struct nfs_open_context *ctx;
1293 struct iattr attr;
1034 struct dentry *res = NULL; 1294 struct dentry *res = NULL;
1035 int error; 1295 struct inode *inode;
1296 int open_flags;
1297 int err;
1036 1298
1037 dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", 1299 dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
1038 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1300 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
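Note the reference handoff in nameidata_to_nfs_open_context(): alloc_nfs_open_context() takes its own reference on the cred, so the lookup reference from rpc_lookup_cred() is dropped immediately with put_rpccred(), leaving exactly one owner on both the success and error paths. A runnable sketch of the pattern (all names hypothetical):

    #include <stdio.h>

    struct cred { int refs; };

    static void get_cred(struct cred *c) { c->refs++; }
    static void put_cred(struct cred *c) { c->refs--; }

    struct ctx { struct cred *cred; };

    static void ctx_init(struct ctx *ctx, struct cred *c)
    {
            get_cred(c);          /* the context owns its own reference */
            ctx->cred = c;
    }

    int main(void)
    {
            struct cred c = { .refs = 1 };   /* reference from lookup */
            struct ctx ctx;

            ctx_init(&ctx, &c);
            put_cred(&c);         /* drop the lookup reference right away */

            printf("refs = %d\n", c.refs);   /* 1: owned by ctx alone */
            return 0;
    }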
@@ -1054,13 +1316,32 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1054 goto out; 1316 goto out;
1055 } 1317 }
1056 1318
1319 ctx = nameidata_to_nfs_open_context(dentry, nd);
1320 res = ERR_CAST(ctx);
1321 if (IS_ERR(ctx))
1322 goto out;
1323
1324 open_flags = nd->intent.open.flags;
1325 if (nd->flags & LOOKUP_CREATE) {
1326 attr.ia_mode = nd->intent.open.create_mode;
1327 attr.ia_valid = ATTR_MODE;
1328 if (!IS_POSIXACL(dir))
1329 attr.ia_mode &= ~current_umask();
1330 } else {
1331 open_flags &= ~(O_EXCL | O_CREAT);
1332 attr.ia_valid = 0;
1333 }
1334
1057 /* Open the file on the server */ 1335 /* Open the file on the server */
1058 res = nfs4_atomic_open(dir, dentry, nd); 1336 nfs_block_sillyrename(dentry->d_parent);
1059 if (IS_ERR(res)) { 1337 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
1060 error = PTR_ERR(res); 1338 if (IS_ERR(inode)) {
1061 switch (error) { 1339 nfs_unblock_sillyrename(dentry->d_parent);
1340 put_nfs_open_context(ctx);
1341 switch (PTR_ERR(inode)) {
1062 /* Make a negative dentry */ 1342 /* Make a negative dentry */
1063 case -ENOENT: 1343 case -ENOENT:
1344 d_add(dentry, NULL);
1064 res = NULL; 1345 res = NULL;
1065 goto out; 1346 goto out;
1066 /* This turned out not to be a regular file */ 1347 /* This turned out not to be a regular file */
@@ -1072,11 +1353,25 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1072 goto no_open; 1353 goto no_open;
1073 /* case -EINVAL: */ 1354 /* case -EINVAL: */
1074 default: 1355 default:
1356 res = ERR_CAST(inode);
1075 goto out; 1357 goto out;
1076 } 1358 }
1077 } else if (res != NULL) 1359 }
1360 res = d_add_unique(dentry, inode);
1361 nfs_unblock_sillyrename(dentry->d_parent);
1362 if (res != NULL) {
1363 dput(ctx->path.dentry);
1364 ctx->path.dentry = dget(res);
1078 dentry = res; 1365 dentry = res;
1366 }
1367 err = nfs_intent_set_file(nd, ctx);
1368 if (err < 0) {
1369 if (res != NULL)
1370 dput(res);
1371 return ERR_PTR(err);
1372 }
1079out: 1373out:
1374 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1080 return res; 1375 return res;
1081no_open: 1376no_open:
1082 return nfs_lookup(dir, dentry, nd); 1377 return nfs_lookup(dir, dentry, nd);
@@ -1087,12 +1382,15 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1087 struct dentry *parent = NULL; 1382 struct dentry *parent = NULL;
1088 struct inode *inode = dentry->d_inode; 1383 struct inode *inode = dentry->d_inode;
1089 struct inode *dir; 1384 struct inode *dir;
1385 struct nfs_open_context *ctx;
1090 int openflags, ret = 0; 1386 int openflags, ret = 0;
1091 1387
1092 if (!is_atomic_open(nd) || d_mountpoint(dentry)) 1388 if (!is_atomic_open(nd) || d_mountpoint(dentry))
1093 goto no_open; 1389 goto no_open;
1390
1094 parent = dget_parent(dentry); 1391 parent = dget_parent(dentry);
1095 dir = parent->d_inode; 1392 dir = parent->d_inode;
1393
1096 /* We can't create new files in nfs_open_revalidate(), so we 1394 /* We can't create new files in nfs_open_revalidate(), so we
1097 * optimize away revalidation of negative dentries. 1395 * optimize away revalidation of negative dentries.
1098 */ 1396 */
@@ -1112,99 +1410,96 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1112 /* We can't create new files, or truncate existing ones here */ 1410 /* We can't create new files, or truncate existing ones here */
1113 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); 1411 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
1114 1412
1413 ctx = nameidata_to_nfs_open_context(dentry, nd);
1414 ret = PTR_ERR(ctx);
1415 if (IS_ERR(ctx))
1416 goto out;
1115 /* 1417 /*
1116 * Note: we're not holding inode->i_mutex and so may be racing with 1418 * Note: we're not holding inode->i_mutex and so may be racing with
1117 * operations that change the directory. We therefore save the 1419 * operations that change the directory. We therefore save the
1118 * change attribute *before* we do the RPC call. 1420 * change attribute *before* we do the RPC call.
1119 */ 1421 */
1120 ret = nfs4_open_revalidate(dir, dentry, openflags, nd); 1422 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
1423 if (IS_ERR(inode)) {
1424 ret = PTR_ERR(inode);
1425 switch (ret) {
1426 case -EPERM:
1427 case -EACCES:
1428 case -EDQUOT:
1429 case -ENOSPC:
1430 case -EROFS:
1431 goto out_put_ctx;
1432 default:
1433 goto out_drop;
1434 }
1435 }
1436 iput(inode);
1437 if (inode != dentry->d_inode)
1438 goto out_drop;
1439
1440 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1441 ret = nfs_intent_set_file(nd, ctx);
1442 if (ret >= 0)
1443 ret = 1;
1121out: 1444out:
1122 dput(parent); 1445 dput(parent);
1123 if (!ret)
1124 d_drop(dentry);
1125 return ret; 1446 return ret;
1447out_drop:
1448 d_drop(dentry);
1449 ret = 0;
1450out_put_ctx:
1451 put_nfs_open_context(ctx);
1452 goto out;
1453
1126no_open_dput: 1454no_open_dput:
1127 dput(parent); 1455 dput(parent);
1128no_open: 1456no_open:
1129 return nfs_lookup_revalidate(dentry, nd); 1457 return nfs_lookup_revalidate(dentry, nd);
1130} 1458}
1131#endif /* CONFIG_NFSV4 */
1132 1459
1133static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) 1460static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
1461 struct nameidata *nd)
1134{ 1462{
1135 struct dentry *parent = desc->file->f_path.dentry; 1463 struct nfs_open_context *ctx = NULL;
1136 struct inode *dir = parent->d_inode; 1464 struct iattr attr;
1137 struct nfs_entry *entry = desc->entry; 1465 int error;
1138 struct dentry *dentry, *alias; 1466 int open_flags = 0;
1139 struct qstr name = {
1140 .name = entry->name,
1141 .len = entry->len,
1142 };
1143 struct inode *inode;
1144 unsigned long verf = nfs_save_change_attribute(dir);
1145 1467
1146 switch (name.len) { 1468 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1147 case 2: 1469 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1148 if (name.name[0] == '.' && name.name[1] == '.')
1149 return dget_parent(parent);
1150 break;
1151 case 1:
1152 if (name.name[0] == '.')
1153 return dget(parent);
1154 }
1155 1470
1156 spin_lock(&dir->i_lock); 1471 attr.ia_mode = mode;
1157 if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) { 1472 attr.ia_valid = ATTR_MODE;
1158 spin_unlock(&dir->i_lock);
1159 return NULL;
1160 }
1161 spin_unlock(&dir->i_lock);
1162 1473
1163 name.hash = full_name_hash(name.name, name.len); 1474 if ((nd->flags & LOOKUP_CREATE) != 0) {
1164 dentry = d_lookup(parent, &name); 1475 open_flags = nd->intent.open.flags;
1165 if (dentry != NULL) {
1166 /* Is this a positive dentry that matches the readdir info? */
1167 if (dentry->d_inode != NULL &&
1168 (NFS_FILEID(dentry->d_inode) == entry->ino ||
1169 d_mountpoint(dentry))) {
1170 if (!desc->plus || entry->fh->size == 0)
1171 return dentry;
1172 if (nfs_compare_fh(NFS_FH(dentry->d_inode),
1173 entry->fh) == 0)
1174 goto out_renew;
1175 }
1176 /* No, so d_drop to allow one to be created */
1177 d_drop(dentry);
1178 dput(dentry);
1179 }
1180 if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
1181 return NULL;
1182 if (name.len > NFS_SERVER(dir)->namelen)
1183 return NULL;
1184 /* Note: caller is already holding the dir->i_mutex! */
1185 dentry = d_alloc(parent, &name);
1186 if (dentry == NULL)
1187 return NULL;
1188 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
1189 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
1190 if (IS_ERR(inode)) {
1191 dput(dentry);
1192 return NULL;
1193 }
1194 1476
1195 alias = d_materialise_unique(dentry, inode); 1477 ctx = nameidata_to_nfs_open_context(dentry, nd);
1196 if (alias != NULL) { 1478 error = PTR_ERR(ctx);
1197 dput(dentry); 1479 if (IS_ERR(ctx))
1198 if (IS_ERR(alias)) 1480 goto out_err_drop;
1199 return NULL;
1200 dentry = alias;
1201 } 1481 }
1202 1482
1203out_renew: 1483 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
1204 nfs_set_verifier(dentry, verf); 1484 if (error != 0)
1205 return dentry; 1485 goto out_put_ctx;
1486 if (ctx != NULL) {
1487 error = nfs_intent_set_file(nd, ctx);
1488 if (error < 0)
1489 goto out_err;
1490 }
1491 return 0;
1492out_put_ctx:
1493 if (ctx != NULL)
1494 put_nfs_open_context(ctx);
1495out_err_drop:
1496 d_drop(dentry);
1497out_err:
1498 return error;
1206} 1499}
1207 1500
1501#endif /* CONFIG_NFS_V4 */
1502
1208/* 1503/*
1209 * Code common to create, mkdir, and mknod. 1504 * Code common to create, mkdir, and mknod.
1210 */ 1505 */
@@ -1258,7 +1553,6 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1258{ 1553{
1259 struct iattr attr; 1554 struct iattr attr;
1260 int error; 1555 int error;
1261 int open_flags = 0;
1262 1556
1263 dfprintk(VFS, "NFS: create(%s/%ld), %s\n", 1557 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1264 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1558 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1266,10 +1560,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1266 attr.ia_mode = mode; 1560 attr.ia_mode = mode;
1267 attr.ia_valid = ATTR_MODE; 1561 attr.ia_valid = ATTR_MODE;
1268 1562
1269 if ((nd->flags & LOOKUP_CREATE) != 0) 1563 error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
1270 open_flags = nd->intent.open.flags;
1271
1272 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
1273 if (error != 0) 1564 if (error != 0)
1274 goto out_err; 1565 goto out_err;
1275 return 0; 1566 return 0;
@@ -1351,76 +1642,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1351 return error; 1642 return error;
1352} 1643}
1353 1644
1354static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
1355{
1356 static unsigned int sillycounter;
1357 const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
1358 const int countersize = sizeof(sillycounter)*2;
1359 const int slen = sizeof(".nfs")+fileidsize+countersize-1;
1360 char silly[slen+1];
1361 struct qstr qsilly;
1362 struct dentry *sdentry;
1363 int error = -EIO;
1364
1365 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
1366 dentry->d_parent->d_name.name, dentry->d_name.name,
1367 atomic_read(&dentry->d_count));
1368 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
1369
1370 /*
1371 * We don't allow a dentry to be silly-renamed twice.
1372 */
1373 error = -EBUSY;
1374 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1375 goto out;
1376
1377 sprintf(silly, ".nfs%*.*Lx",
1378 fileidsize, fileidsize,
1379 (unsigned long long)NFS_FILEID(dentry->d_inode));
1380
1381 /* Return delegation in anticipation of the rename */
1382 nfs_inode_return_delegation(dentry->d_inode);
1383
1384 sdentry = NULL;
1385 do {
1386 char *suffix = silly + slen - countersize;
1387
1388 dput(sdentry);
1389 sillycounter++;
1390 sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
1391
1392 dfprintk(VFS, "NFS: trying to rename %s to %s\n",
1393 dentry->d_name.name, silly);
1394
1395 sdentry = lookup_one_len(silly, dentry->d_parent, slen);
1396 /*
1397 * N.B. Better to return EBUSY here ... it could be
1398 * dangerous to delete the file while it's in use.
1399 */
1400 if (IS_ERR(sdentry))
1401 goto out;
1402 } while(sdentry->d_inode != NULL); /* need negative lookup */
1403
1404 qsilly.name = silly;
1405 qsilly.len = strlen(silly);
1406 if (dentry->d_inode) {
1407 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
1408 dir, &qsilly);
1409 nfs_mark_for_revalidate(dentry->d_inode);
1410 } else
1411 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
1412 dir, &qsilly);
1413 if (!error) {
1414 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1415 d_move(dentry, sdentry);
1416 error = nfs_async_unlink(dir, dentry);
1417 /* If we return 0 we don't unlink */
1418 }
1419 dput(sdentry);
1420out:
1421 return error;
1422}
1423
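
A note on the removal above: nfs_sillyrename() built its temporary names as ".nfs" followed by the file id and a retry counter, both rendered as fixed-width hex, and looped until lookup_one_len() came back negative so the silly name could not collide with an existing entry. Its job is evidently taken over by an asynchronous rename path built on the rename_setup/rename_done callbacks added in the fs/nfs/nfs3proc.c hunks later in this diff. A standalone sketch of the name format the removed code produced (the width arithmetic mirrors the removed lines; nothing here is kernel API):

#include <stdio.h>

int main(void)
{
        unsigned long long fileid = 0x1234abcdULL; /* stand-in for NFS_FILEID() */
        unsigned int counter = 7;                  /* stand-in for sillycounter */
        char silly[4 + 16 + 8 + 1];                /* ".nfs" + 16 + 8 hex digits + NUL */

        snprintf(silly, sizeof(silly), ".nfs%016llx%08x", fileid, counter);
        printf("%s\n", silly); /* .nfs000000001234abcd00000007 */
        return 0;
}
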
1424/* 1645/*
1425 * Remove a file after making sure there are no pending writes, 1646 * Remove a file after making sure there are no pending writes,
1426 * and after checking that the file has only one user. 1647 * and after checking that the file has only one user.
@@ -1580,7 +1801,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1580 d_drop(dentry); 1801 d_drop(dentry);
1581 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1802 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1582 if (error == 0) { 1803 if (error == 0) {
1583 atomic_inc(&inode->i_count); 1804 ihold(inode);
1584 d_add(dentry, inode); 1805 d_add(dentry, inode);
1585 } 1806 }
1586 return error; 1807 return error;
@@ -1711,14 +1932,14 @@ static void nfs_access_free_list(struct list_head *head)
1711int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 1932int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
1712{ 1933{
1713 LIST_HEAD(head); 1934 LIST_HEAD(head);
1714 struct nfs_inode *nfsi; 1935 struct nfs_inode *nfsi, *next;
1715 struct nfs_access_entry *cache; 1936 struct nfs_access_entry *cache;
1716 1937
1717 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) 1938 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1718 return (nr_to_scan == 0) ? 0 : -1; 1939 return (nr_to_scan == 0) ? 0 : -1;
1719 1940
1720 spin_lock(&nfs_access_lru_lock); 1941 spin_lock(&nfs_access_lru_lock);
1721 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1942 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
1722 struct inode *inode; 1943 struct inode *inode;
1723 1944
1724 if (nr_to_scan-- == 0) 1945 if (nr_to_scan-- == 0)
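
The shrinker hunk above switches to list_for_each_entry_safe() because the loop body can remove entries from nfs_access_lru_list while walking it; the _safe variant caches the next element before the body runs, so freeing the current one is harmless. A standalone sketch of the same pattern in plain C (no kernel list API):

#include <stdio.h>
#include <stdlib.h>

struct node {
        int entries;
        struct node *next;
};

/* Delete empty nodes while walking the list: the next pointer is cached
 * before the body may free the current node, which is exactly what
 * list_for_each_entry_safe() does for kernel lists. */
static void prune(struct node **head)
{
        struct node **link = head, *n, *next;

        for (n = *head; n != NULL; n = next) {
                next = n->next;       /* saved before n can be freed */
                if (n->entries == 0) {
                        *link = next; /* unlink n */
                        free(n);
                } else {
                        link = &n->next;
                }
        }
}

int main(void)
{
        struct node *a = calloc(1, sizeof(*a));
        struct node *b = calloc(1, sizeof(*b));

        if (!a || !b)
                return 1;
        b->entries = 1;
        a->next = b;
        prune(&a);                    /* frees the empty head; b survives */
        printf("surviving entries: %d\n", a->entries); /* 1 */
        free(a);
        return 0;
}
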
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index dba50a5625db..a6e711ad130f 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -167,7 +167,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
167 return 0; 167 return 0;
168 } 168 }
169 item = container_of(h, struct nfs_dns_ent, h); 169 item = container_of(h, struct nfs_dns_ent, h);
170 ttl = (long)item->h.expiry_time - (long)get_seconds(); 170 ttl = item->h.expiry_time - seconds_since_boot();
171 if (ttl < 0) 171 if (ttl < 0)
172 ttl = 0; 172 ttl = 0;
173 173
@@ -239,7 +239,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
239 ttl = get_expiry(&buf); 239 ttl = get_expiry(&buf);
240 if (ttl == 0) 240 if (ttl == 0)
241 goto out; 241 goto out;
242 key.h.expiry_time = ttl + get_seconds(); 242 key.h.expiry_time = ttl + seconds_since_boot();
243 243
244 ret = -ENOMEM; 244 ret = -ENOMEM;
245 item = nfs_dns_lookup(cd, &key); 245 item = nfs_dns_lookup(cd, &key);
@@ -301,7 +301,7 @@ static int do_cache_lookup_nowait(struct cache_detail *cd,
301 goto out_err; 301 goto out_err;
302 ret = -ETIMEDOUT; 302 ret = -ETIMEDOUT;
303 if (!test_bit(CACHE_VALID, &(*item)->h.flags) 303 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
304 || (*item)->h.expiry_time < get_seconds() 304 || (*item)->h.expiry_time < seconds_since_boot()
305 || cd->flush_time > (*item)->h.last_refresh) 305 || cd->flush_time > (*item)->h.last_refresh)
306 goto out_put; 306 goto out_put;
307 ret = -ENOENT; 307 ret = -ENOENT;
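
The dns_resolve hunks above replace get_seconds() (wall-clock time, which jumps when the clock is set) with seconds_since_boot() (monotonic) in the cache-expiry arithmetic, so DNS entry TTLs can no longer be stretched or truncated by a clock change. A userspace analogue of the same idea using CLOCK_MONOTONIC:

#include <stdio.h>
#include <time.h>

static long mono_seconds(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec;
}

int main(void)
{
        long ttl = 30;                            /* like the value from get_expiry() */
        long expiry = mono_seconds() + ttl;       /* like key.h.expiry_time */
        long remaining = expiry - mono_seconds(); /* like the nfs_dns_show hunk */

        if (remaining < 0)
                remaining = 0;
        printf("ttl remaining: %ld\n", remaining);
        return 0;
}
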
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05bf3c0dc751..e756075637b0 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h" 38#include "fscache.h"
39#include "pnfs.h"
39 40
40#define NFSDBG_FACILITY NFSDBG_FILE 41#define NFSDBG_FACILITY NFSDBG_FILE
41 42
@@ -386,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
386 file->f_path.dentry->d_name.name, 387 file->f_path.dentry->d_name.name,
387 mapping->host->i_ino, len, (long long) pos); 388 mapping->host->i_ino, len, (long long) pos);
388 389
390 pnfs_update_layout(mapping->host,
391 nfs_file_open_context(file),
392 IOMODE_RW);
393
389start: 394start:
390 /* 395 /*
391 * Prevent starvation issues if someone is doing a consistency 396 * Prevent starvation issues if someone is doing a consistency
@@ -551,7 +556,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
551 struct file *filp = vma->vm_file; 556 struct file *filp = vma->vm_file;
552 struct dentry *dentry = filp->f_path.dentry; 557 struct dentry *dentry = filp->f_path.dentry;
553 unsigned pagelen; 558 unsigned pagelen;
554 int ret = -EINVAL; 559 int ret = VM_FAULT_NOPAGE;
555 struct address_space *mapping; 560 struct address_space *mapping;
556 561
557 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", 562 dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
@@ -567,21 +572,20 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
567 if (mapping != dentry->d_inode->i_mapping) 572 if (mapping != dentry->d_inode->i_mapping)
568 goto out_unlock; 573 goto out_unlock;
569 574
570 ret = 0;
571 pagelen = nfs_page_length(page); 575 pagelen = nfs_page_length(page);
572 if (pagelen == 0) 576 if (pagelen == 0)
573 goto out_unlock; 577 goto out_unlock;
574 578
575 ret = nfs_flush_incompatible(filp, page); 579 ret = VM_FAULT_LOCKED;
576 if (ret != 0) 580 if (nfs_flush_incompatible(filp, page) == 0 &&
577 goto out_unlock; 581 nfs_updatepage(filp, page, 0, pagelen) == 0)
582 goto out;
578 583
579 ret = nfs_updatepage(filp, page, 0, pagelen); 584 ret = VM_FAULT_SIGBUS;
580out_unlock: 585out_unlock:
581 if (!ret)
582 return VM_FAULT_LOCKED;
583 unlock_page(page); 586 unlock_page(page);
584 return VM_FAULT_SIGBUS; 587out:
588 return ret;
585} 589}
586 590
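
The nfs_vm_page_mkwrite() hunk above fixes the return-value convention: a ->page_mkwrite() handler must report VM_FAULT_* status codes to the fault path, not negative errnos, which is why -EINVAL becomes VM_FAULT_NOPAGE and the success and failure paths become VM_FAULT_LOCKED and VM_FAULT_SIGBUS. An illustrative sketch of that contract (the enum names and values below are stand-ins, not the kernel's definitions):

#include <stdio.h>

enum fault_status {
        FAULT_LOCKED = 1, /* page is writable and left locked */
        FAULT_NOPAGE = 2, /* mapping changed; caller should retry the fault */
        FAULT_SIGBUS = 4, /* the write cannot be honored */
};

static enum fault_status page_mkwrite_sketch(int page_still_mapped, int write_ok)
{
        if (!page_still_mapped)
                return FAULT_NOPAGE;
        return write_ok ? FAULT_LOCKED : FAULT_SIGBUS;
}

int main(void)
{
        printf("%d\n", page_mkwrite_sketch(1, 1)); /* FAULT_LOCKED */
        printf("%d\n", page_mkwrite_sketch(0, 1)); /* FAULT_NOPAGE */
        return 0;
}
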
587static const struct vm_operations_struct nfs_file_vm_ops = { 591static const struct vm_operations_struct nfs_file_vm_ops = {
@@ -684,7 +688,8 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
684 return ret; 688 return ret;
685} 689}
686 690
687static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) 691static int
692do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
688{ 693{
689 struct inode *inode = filp->f_mapping->host; 694 struct inode *inode = filp->f_mapping->host;
690 int status = 0; 695 int status = 0;
@@ -699,7 +704,7 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
699 if (nfs_have_delegation(inode, FMODE_READ)) 704 if (nfs_have_delegation(inode, FMODE_READ))
700 goto out_noconflict; 705 goto out_noconflict;
701 706
702 if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) 707 if (is_local)
703 goto out_noconflict; 708 goto out_noconflict;
704 709
705 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 710 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
@@ -726,7 +731,8 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
726 return res; 731 return res;
727} 732}
728 733
729static int do_unlk(struct file *filp, int cmd, struct file_lock *fl) 734static int
735do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
730{ 736{
731 struct inode *inode = filp->f_mapping->host; 737 struct inode *inode = filp->f_mapping->host;
732 int status; 738 int status;
@@ -741,15 +747,24 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
741 * If we're signalled while cleaning up locks on process exit, we 747 * If we're signalled while cleaning up locks on process exit, we
742 * still need to complete the unlock. 748 * still need to complete the unlock.
743 */ 749 */
744 /* Use local locking if mounted with "-onolock" */ 750 /*
745 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 751 * Use local locking if mounted with "-onolock" or with appropriate
752 * "-olocal_lock="
753 */
754 if (!is_local)
746 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 755 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
747 else 756 else
748 status = do_vfs_lock(filp, fl); 757 status = do_vfs_lock(filp, fl);
749 return status; 758 return status;
750} 759}
751 760
752static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) 761static int
762is_time_granular(struct timespec *ts) {
763 return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
764}
765
766static int
767do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
753{ 768{
754 struct inode *inode = filp->f_mapping->host; 769 struct inode *inode = filp->f_mapping->host;
755 int status; 770 int status;
@@ -762,20 +777,31 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
762 if (status != 0) 777 if (status != 0)
763 goto out; 778 goto out;
764 779
765 /* Use local locking if mounted with "-onolock" */ 780 /*
766 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 781 * Use local locking if mounted with "-onolock" or with appropriate
782 * "-olocal_lock="
783 */
784 if (!is_local)
767 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 785 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
768 else 786 else
769 status = do_vfs_lock(filp, fl); 787 status = do_vfs_lock(filp, fl);
770 if (status < 0) 788 if (status < 0)
771 goto out; 789 goto out;
790
772 /* 791 /*
773 * Make sure we clear the cache whenever we try to get the lock. 792 * Revalidate the cache if the server has time stamps granular
793 * enough to detect subsecond changes. Otherwise, clear the
794 * cache to prevent missing any changes.
795 *
774 * This makes locking act as a cache coherency point. 796 * This makes locking act as a cache coherency point.
775 */ 797 */
776 nfs_sync_mapping(filp->f_mapping); 798 nfs_sync_mapping(filp->f_mapping);
777 if (!nfs_have_delegation(inode, FMODE_READ)) 799 if (!nfs_have_delegation(inode, FMODE_READ)) {
778 nfs_zap_caches(inode); 800 if (is_time_granular(&NFS_SERVER(inode)->time_delta))
801 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
802 else
803 nfs_zap_caches(inode);
804 }
779out: 805out:
780 return status; 806 return status;
781} 807}
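
The do_setlk() hunk above keeps lock acquisition as a cache-coherency point but makes it cheaper when it can: if the server's reported time_delta is fine-grained enough to expose sub-second attribute changes, revalidating the inode suffices; otherwise the caches are zapped as before. The added is_time_granular() treats a resolution of one microsecond or better (tv_sec == 0, tv_nsec <= 1000) as granular; a standalone restatement:

#include <stdio.h>

struct ts { long tv_sec; long tv_nsec; };

/* Mirrors the added is_time_granular(): resolution of 1us or better. */
static int is_time_granular(const struct ts *t)
{
        return t->tv_sec == 0 && t->tv_nsec <= 1000;
}

int main(void)
{
        struct ts fine = { 0, 1 };   /* 1ns resolution: revalidate */
        struct ts coarse = { 1, 0 }; /* 1s resolution: zap caches */

        printf("%d %d\n", is_time_granular(&fine), is_time_granular(&coarse));
        return 0;
}
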
@@ -787,6 +813,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
787{ 813{
788 struct inode *inode = filp->f_mapping->host; 814 struct inode *inode = filp->f_mapping->host;
789 int ret = -ENOLCK; 815 int ret = -ENOLCK;
816 int is_local = 0;
790 817
791 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", 818 dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
792 filp->f_path.dentry->d_parent->d_name.name, 819 filp->f_path.dentry->d_parent->d_name.name,
@@ -800,6 +827,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
800 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) 827 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
801 goto out_err; 828 goto out_err;
802 829
830 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
831 is_local = 1;
832
803 if (NFS_PROTO(inode)->lock_check_bounds != NULL) { 833 if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
804 ret = NFS_PROTO(inode)->lock_check_bounds(fl); 834 ret = NFS_PROTO(inode)->lock_check_bounds(fl);
805 if (ret < 0) 835 if (ret < 0)
@@ -807,11 +837,11 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
807 } 837 }
808 838
809 if (IS_GETLK(cmd)) 839 if (IS_GETLK(cmd))
810 ret = do_getlk(filp, cmd, fl); 840 ret = do_getlk(filp, cmd, fl, is_local);
811 else if (fl->fl_type == F_UNLCK) 841 else if (fl->fl_type == F_UNLCK)
812 ret = do_unlk(filp, cmd, fl); 842 ret = do_unlk(filp, cmd, fl, is_local);
813 else 843 else
814 ret = do_setlk(filp, cmd, fl); 844 ret = do_setlk(filp, cmd, fl, is_local);
815out_err: 845out_err:
816 return ret; 846 return ret;
817} 847}
@@ -821,6 +851,9 @@ out_err:
821 */ 851 */
822static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) 852static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
823{ 853{
854 struct inode *inode = filp->f_mapping->host;
855 int is_local = 0;
856
824 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", 857 dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
825 filp->f_path.dentry->d_parent->d_name.name, 858 filp->f_path.dentry->d_parent->d_name.name,
826 filp->f_path.dentry->d_name.name, 859 filp->f_path.dentry->d_name.name,
@@ -829,14 +862,17 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
829 if (!(fl->fl_flags & FL_FLOCK)) 862 if (!(fl->fl_flags & FL_FLOCK))
830 return -ENOLCK; 863 return -ENOLCK;
831 864
865 if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
866 is_local = 1;
867
832 /* We're simulating flock() locks using posix locks on the server */ 868 /* We're simulating flock() locks using posix locks on the server */
833 fl->fl_owner = (fl_owner_t)filp; 869 fl->fl_owner = (fl_owner_t)filp;
834 fl->fl_start = 0; 870 fl->fl_start = 0;
835 fl->fl_end = OFFSET_MAX; 871 fl->fl_end = OFFSET_MAX;
836 872
837 if (fl->fl_type == F_UNLCK) 873 if (fl->fl_type == F_UNLCK)
838 return do_unlk(filp, cmd, fl); 874 return do_unlk(filp, cmd, fl, is_local);
839 return do_setlk(filp, cmd, fl); 875 return do_setlk(filp, cmd, fl, is_local);
840} 876}
841 877
842/* 878/*
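
Across the locking hunks above, the single NFS_MOUNT_NONLM test is replaced by an is_local flag computed once per call from two separate mount flags, NFS_MOUNT_LOCAL_FCNTL (in nfs_lock) and NFS_MOUNT_LOCAL_FLOCK (in nfs_flock), so "-olocal_lock=" can force either lock family, or both, to be handled locally. A standalone sketch of the dispatch (the flag values here are illustrative, not the kernel's):

#include <stdio.h>

#define NFS_MOUNT_LOCAL_FLOCK 0x1
#define NFS_MOUNT_LOCAL_FCNTL 0x2

static int is_local_lock(unsigned int mount_flags, int is_flock)
{
        unsigned int bit = is_flock ? NFS_MOUNT_LOCAL_FLOCK
                                    : NFS_MOUNT_LOCAL_FCNTL;

        return (mount_flags & bit) != 0;
}

int main(void)
{
        unsigned int flags = NFS_MOUNT_LOCAL_FLOCK; /* -olocal_lock=flock */

        printf("flock local? %d\n", is_local_lock(flags, 1)); /* 1 */
        printf("fcntl local? %d\n", is_local_lock(flags, 0)); /* 0 */
        return 0;
}
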
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index a70e446e1605..ac7b814ce162 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -54,8 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
54 iput(inode); 54 iput(inode);
55 return -ENOMEM; 55 return -ENOMEM;
56 } 56 }
57 /* Circumvent igrab(): we know the inode is not being freed */ 57 ihold(inode);
58 atomic_inc(&inode->i_count);
59 /* 58 /*
60 * Ensure that this dentry is invisible to d_find_alias(). 59 * Ensure that this dentry is invisible to d_find_alias().
61 * Otherwise, it may be spliced into the tree by 60 * Otherwise, it may be spliced into the tree by
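
Both here and in the nfs_link() hunk earlier, the open-coded atomic_inc(&inode->i_count) becomes ihold(), the VFS helper for taking an extra reference on an inode the caller already holds; besides centralizing the operation, the helper can sanity-check that the count was nonzero. A userspace analogue of the invariant it encodes:

#include <assert.h>
#include <stdio.h>

struct obj { int refcount; };

/* Taking an extra reference is only legal while one is already held;
 * this is the rule ihold() exists to enforce in one place. */
static void obj_hold(struct obj *o)
{
        assert(o->refcount >= 1);
        o->refcount++;
}

int main(void)
{
        struct obj o = { 1 };

        obj_hold(&o);
        printf("refcount = %d\n", o.refcount); /* 2 */
        return 0;
}
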
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 21a84d45916f..dec47ed8b6b9 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -34,6 +34,212 @@
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36 36
37#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
38
39#include <linux/slab.h>
40#include <linux/cred.h>
41#include <linux/nfs_idmap.h>
42#include <linux/keyctl.h>
43#include <linux/key-type.h>
44#include <linux/rcupdate.h>
45#include <linux/kernel.h>
46#include <linux/err.h>
47
48#include <keys/user-type.h>
49
50#define NFS_UINT_MAXLEN 11
51
52const struct cred *id_resolver_cache;
53
54struct key_type key_type_id_resolver = {
55 .name = "id_resolver",
56 .instantiate = user_instantiate,
57 .match = user_match,
58 .revoke = user_revoke,
59 .destroy = user_destroy,
60 .describe = user_describe,
61 .read = user_read,
62};
63
64int nfs_idmap_init(void)
65{
66 struct cred *cred;
67 struct key *keyring;
68 int ret = 0;
69
70 printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name);
71
72 cred = prepare_kernel_cred(NULL);
73 if (!cred)
74 return -ENOMEM;
75
76 keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred,
77 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
78 KEY_USR_VIEW | KEY_USR_READ,
79 KEY_ALLOC_NOT_IN_QUOTA);
80 if (IS_ERR(keyring)) {
81 ret = PTR_ERR(keyring);
82 goto failed_put_cred;
83 }
84
85 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
86 if (ret < 0)
87 goto failed_put_key;
88
89 ret = register_key_type(&key_type_id_resolver);
90 if (ret < 0)
91 goto failed_put_key;
92
93 cred->thread_keyring = keyring;
94 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
95 id_resolver_cache = cred;
96 return 0;
97
98failed_put_key:
99 key_put(keyring);
100failed_put_cred:
101 put_cred(cred);
102 return ret;
103}
104
105void nfs_idmap_quit(void)
106{
107 key_revoke(id_resolver_cache->thread_keyring);
108 unregister_key_type(&key_type_id_resolver);
109 put_cred(id_resolver_cache);
110}
111
112/*
113 * Assemble the description to pass to request_key()
114 * This function will allocate a new string and update dest to point
115 * at it. The caller is responsible for freeing dest.
116 *
117 * On error 0 is returned. Otherwise, the length of dest is returned.
118 */
119static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
120 const char *type, size_t typelen, char **desc)
121{
122 char *cp;
123 size_t desclen = typelen + namelen + 2;
124
125 *desc = kmalloc(desclen, GFP_KERNEL);
126 if (!*desc)
127 return -ENOMEM;
128
129 cp = *desc;
130 memcpy(cp, type, typelen);
131 cp += typelen;
132 *cp++ = ':';
133
134 memcpy(cp, name, namelen);
135 cp += namelen;
136 *cp = '\0';
137 return desclen;
138}
139
140static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
141 const char *type, void *data, size_t data_size)
142{
143 const struct cred *saved_cred;
144 struct key *rkey;
145 char *desc;
146 struct user_key_payload *payload;
147 ssize_t ret;
148
149 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
150 if (ret <= 0)
151 goto out;
152
153 saved_cred = override_creds(id_resolver_cache);
154 rkey = request_key(&key_type_id_resolver, desc, "");
155 revert_creds(saved_cred);
156 kfree(desc);
157 if (IS_ERR(rkey)) {
158 ret = PTR_ERR(rkey);
159 goto out;
160 }
161
162 rcu_read_lock();
163 rkey->perm |= KEY_USR_VIEW;
164
165 ret = key_validate(rkey);
166 if (ret < 0)
167 goto out_up;
168
169 payload = rcu_dereference(rkey->payload.data);
170 if (IS_ERR_OR_NULL(payload)) {
171 ret = PTR_ERR(payload);
172 goto out_up;
173 }
174
175 ret = payload->datalen;
176 if (ret > 0 && ret <= data_size)
177 memcpy(data, payload->data, ret);
178 else
179 ret = -EINVAL;
180
181out_up:
182 rcu_read_unlock();
183 key_put(rkey);
184out:
185 return ret;
186}
187
188
189/* ID -> Name */
190static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen)
191{
192 char id_str[NFS_UINT_MAXLEN];
193 int id_len;
194 ssize_t ret;
195
196 id_len = snprintf(id_str, sizeof(id_str), "%u", id);
197 ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen);
198 if (ret < 0)
199 return -EINVAL;
200 return ret;
201}
202
203/* Name -> ID */
204static int nfs_idmap_lookup_id(const char *name, size_t namelen,
205 const char *type, __u32 *id)
206{
207 char id_str[NFS_UINT_MAXLEN];
208 long id_long;
209 ssize_t data_size;
210 int ret = 0;
211
212 data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN);
213 if (data_size <= 0) {
214 ret = -EINVAL;
215 } else {
216 ret = strict_strtol(id_str, 10, &id_long);
217 *id = (__u32)id_long;
218 }
219 return ret;
220}
221
222int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
223{
224 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
225}
226
227int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
228{
229 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
230}
231
232int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
233{
234 return nfs_idmap_lookup_name(uid, "user", buf, buflen);
235}
236int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
237{
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen);
239}
240
241#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
242
37#include <linux/module.h> 243#include <linux/module.h>
38#include <linux/mutex.h> 244#include <linux/mutex.h>
39#include <linux/init.h> 245#include <linux/init.h>
@@ -503,16 +709,17 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele
503 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 709 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
504} 710}
505 711
506int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf) 712int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
507{ 713{
508 struct idmap *idmap = clp->cl_idmap; 714 struct idmap *idmap = clp->cl_idmap;
509 715
510 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 716 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
511} 717}
512int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf) 718int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
513{ 719{
514 struct idmap *idmap = clp->cl_idmap; 720 struct idmap *idmap = clp->cl_idmap;
515 721
516 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 722 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
517} 723}
518 724
725#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
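
The new keyring-based idmapper above resolves mappings by handing request_key() a description of the form "<type>:<name>": "uid:bob" or "gid:staff" for name-to-id lookups, "user:1000" or "group:1000" for the reverse, with userspace expected to instantiate the key's payload with the answer. A standalone sketch of the description construction (it mirrors nfs_idmap_get_desc(); nothing below is kernel API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build "<type>:<name>", as passed to request_key(). */
static char *idmap_desc(const char *type, const char *name)
{
        size_t len = strlen(type) + strlen(name) + 2; /* ':' plus NUL */
        char *desc = malloc(len);

        if (!desc)
                return NULL;
        snprintf(desc, len, "%s:%s", type, name);
        return desc;
}

int main(void)
{
        char *d = idmap_desc("uid", "bob");

        if (d) {
                printf("%s\n", d); /* uid:bob */
                free(d);
        }
        return 0;
}
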
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7d2d6c72aa78..314f57164602 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,6 +48,7 @@
48#include "internal.h" 48#include "internal.h"
49#include "fscache.h" 49#include "fscache.h"
50#include "dns_resolve.h" 50#include "dns_resolve.h"
51#include "pnfs.h"
51 52
52#define NFSDBG_FACILITY NFSDBG_VFS 53#define NFSDBG_FACILITY NFSDBG_VFS
53 54
@@ -234,9 +235,6 @@ nfs_init_locked(struct inode *inode, void *opaque)
234 return 0; 235 return 0;
235} 236}
236 237
237/* Don't use READDIRPLUS on directories that we believe are too large */
238#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE)
239
240/* 238/*
241 * This is our front-end to iget that looks up inodes by file handle 239 * This is our front-end to iget that looks up inodes by file handle
242 * instead of inode number. 240 * instead of inode number.
@@ -291,8 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
291 } else if (S_ISDIR(inode->i_mode)) { 289 } else if (S_ISDIR(inode->i_mode)) {
292 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; 290 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
293 inode->i_fop = &nfs_dir_operations; 291 inode->i_fop = &nfs_dir_operations;
294 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) 292 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
295 && fattr->size <= NFS_LIMIT_READDIRPLUS)
296 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 293 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
297 /* Deal with crossing mountpoints */ 294 /* Deal with crossing mountpoints */
298 if ((fattr->valid & NFS_ATTR_FATTR_FSID) 295 if ((fattr->valid & NFS_ATTR_FATTR_FSID)
@@ -623,7 +620,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
623 nfs_revalidate_inode(server, inode); 620 nfs_revalidate_inode(server, inode);
624} 621}
625 622
626static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred) 623struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode)
627{ 624{
628 struct nfs_open_context *ctx; 625 struct nfs_open_context *ctx;
629 626
@@ -633,11 +630,13 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
633 path_get(&ctx->path); 630 path_get(&ctx->path);
634 ctx->cred = get_rpccred(cred); 631 ctx->cred = get_rpccred(cred);
635 ctx->state = NULL; 632 ctx->state = NULL;
633 ctx->mode = f_mode;
636 ctx->flags = 0; 634 ctx->flags = 0;
637 ctx->error = 0; 635 ctx->error = 0;
638 ctx->dir_cookie = 0; 636 ctx->dir_cookie = 0;
639 nfs_init_lock_context(&ctx->lock_context); 637 nfs_init_lock_context(&ctx->lock_context);
640 ctx->lock_context.open_context = ctx; 638 ctx->lock_context.open_context = ctx;
639 INIT_LIST_HEAD(&ctx->list);
641 } 640 }
642 return ctx; 641 return ctx;
643} 642}
@@ -653,11 +652,15 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
653{ 652{
654 struct inode *inode = ctx->path.dentry->d_inode; 653 struct inode *inode = ctx->path.dentry->d_inode;
655 654
656 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) 655 if (!list_empty(&ctx->list)) {
656 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
657 return;
658 list_del(&ctx->list);
659 spin_unlock(&inode->i_lock);
660 } else if (!atomic_dec_and_test(&ctx->lock_context.count))
657 return; 661 return;
658 list_del(&ctx->list); 662 if (inode != NULL)
659 spin_unlock(&inode->i_lock); 663 NFS_PROTO(inode)->close_context(ctx, is_sync);
660 NFS_PROTO(inode)->close_context(ctx, is_sync);
661 if (ctx->cred != NULL) 664 if (ctx->cred != NULL)
662 put_rpccred(ctx->cred); 665 put_rpccred(ctx->cred);
663 path_put(&ctx->path); 666 path_put(&ctx->path);
@@ -673,7 +676,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
673 * Ensure that mmap has a recent RPC credential for use when writing out 676 * Ensure that mmap has a recent RPC credential for use when writing out
674 * shared pages 677 * shared pages
675 */ 678 */
676static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) 679void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
677{ 680{
678 struct inode *inode = filp->f_path.dentry->d_inode; 681 struct inode *inode = filp->f_path.dentry->d_inode;
679 struct nfs_inode *nfsi = NFS_I(inode); 682 struct nfs_inode *nfsi = NFS_I(inode);
@@ -730,11 +733,10 @@ int nfs_open(struct inode *inode, struct file *filp)
730 cred = rpc_lookup_cred(); 733 cred = rpc_lookup_cred();
731 if (IS_ERR(cred)) 734 if (IS_ERR(cred))
732 return PTR_ERR(cred); 735 return PTR_ERR(cred);
733 ctx = alloc_nfs_open_context(&filp->f_path, cred); 736 ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode);
734 put_rpccred(cred); 737 put_rpccred(cred);
735 if (ctx == NULL) 738 if (ctx == NULL)
736 return -ENOMEM; 739 return -ENOMEM;
737 ctx->mode = filp->f_mode;
738 nfs_file_set_open_context(filp, ctx); 740 nfs_file_set_open_context(filp, ctx);
739 put_nfs_open_context(ctx); 741 put_nfs_open_context(ctx);
740 nfs_fscache_set_inode_cookie(inode, filp); 742 nfs_fscache_set_inode_cookie(inode, filp);
@@ -1409,6 +1411,7 @@ void nfs4_evict_inode(struct inode *inode)
1409{ 1411{
1410 truncate_inode_pages(&inode->i_data, 0); 1412 truncate_inode_pages(&inode->i_data, 0);
1411 end_writeback(inode); 1413 end_writeback(inode);
1414 pnfs_destroy_layout(NFS_I(inode));
1412 /* If we are holding a delegation, return it! */ 1415 /* If we are holding a delegation, return it! */
1413 nfs_inode_return_delegation_noreclaim(inode); 1416 nfs_inode_return_delegation_noreclaim(inode);
1414 /* First call standard NFS clear_inode() code */ 1417 /* First call standard NFS clear_inode() code */
@@ -1446,6 +1449,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
1446 nfsi->delegation = NULL; 1449 nfsi->delegation = NULL;
1447 nfsi->delegation_state = 0; 1450 nfsi->delegation_state = 0;
1448 init_rwsem(&nfsi->rwsem); 1451 init_rwsem(&nfsi->rwsem);
1452 nfsi->layout = NULL;
1449#endif 1453#endif
1450} 1454}
1451 1455
@@ -1493,7 +1497,7 @@ static int nfsiod_start(void)
1493{ 1497{
1494 struct workqueue_struct *wq; 1498 struct workqueue_struct *wq;
1495 dprintk("RPC: creating workqueue nfsiod\n"); 1499 dprintk("RPC: creating workqueue nfsiod\n");
1496 wq = create_singlethread_workqueue("nfsiod"); 1500 wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);
1497 if (wq == NULL) 1501 if (wq == NULL)
1498 return -ENOMEM; 1502 return -ENOMEM;
1499 nfsiod_workqueue = wq; 1503 nfsiod_workqueue = wq;
@@ -1521,6 +1525,10 @@ static int __init init_nfs_fs(void)
1521{ 1525{
1522 int err; 1526 int err;
1523 1527
1528 err = nfs_idmap_init();
1529 if (err < 0)
1530 goto out9;
1531
1524 err = nfs_dns_resolver_init(); 1532 err = nfs_dns_resolver_init();
1525 if (err < 0) 1533 if (err < 0)
1526 goto out8; 1534 goto out8;
@@ -1585,6 +1593,8 @@ out6:
1585out7: 1593out7:
1586 nfs_dns_resolver_destroy(); 1594 nfs_dns_resolver_destroy();
1587out8: 1595out8:
1596 nfs_idmap_quit();
1597out9:
1588 return err; 1598 return err;
1589} 1599}
1590 1600
@@ -1597,6 +1607,7 @@ static void __exit exit_nfs_fs(void)
1597 nfs_destroy_nfspagecache(); 1607 nfs_destroy_nfspagecache();
1598 nfs_fscache_unregister(); 1608 nfs_fscache_unregister();
1599 nfs_dns_resolver_destroy(); 1609 nfs_dns_resolver_destroy();
1610 nfs_idmap_quit();
1600#ifdef CONFIG_PROC_FS 1611#ifdef CONFIG_PROC_FS
1601 rpc_proc_unregister("nfs"); 1612 rpc_proc_unregister("nfs");
1602#endif 1613#endif
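
The init_nfs_fs() hunks above slot nfs_idmap_init() in as the first initialization step and extend the error ladder with a matching out9 label; each successful step gets a teardown that runs in reverse order when a later step fails. A standalone sketch of the goto-unwind idiom (the step names are made up):

#include <stdio.h>

static int step_a(void) { return 0; }
static void undo_a(void) { }
static int step_b(void) { return -1; } /* pretend this step fails */

static int init_all(void)
{
        int err;

        err = step_a();
        if (err < 0)
                goto out_a;
        err = step_b();
        if (err < 0)
                goto out_b;     /* unwind everything done so far */
        return 0;
out_b:
        undo_a();
out_a:
        return err;
}

int main(void)
{
        printf("init_all: %d\n", init_all()); /* -1 */
        return 0;
}
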
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c961bc92c107..db08ff3ff454 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,12 @@ struct nfs_clone_mount {
63#define NFS_UNSPEC_PORT (-1) 63#define NFS_UNSPEC_PORT (-1)
64 64
65/* 65/*
66 * Maximum number of pages that readdir can use for creating
67 * a vmapped array of pages.
68 */
69#define NFS_MAX_READDIR_PAGES 8
70
71/*
66 * In-kernel mount arguments 72 * In-kernel mount arguments
67 */ 73 */
68struct nfs_parsed_mount_data { 74struct nfs_parsed_mount_data {
@@ -181,15 +187,15 @@ extern void nfs_destroy_directcache(void);
181/* nfs2xdr.c */ 187/* nfs2xdr.c */
182extern int nfs_stat_to_errno(int); 188extern int nfs_stat_to_errno(int);
183extern struct rpc_procinfo nfs_procedures[]; 189extern struct rpc_procinfo nfs_procedures[];
184extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int); 190extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
185 191
186/* nfs3xdr.c */ 192/* nfs3xdr.c */
187extern struct rpc_procinfo nfs3_procedures[]; 193extern struct rpc_procinfo nfs3_procedures[];
188extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); 194extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
189 195
190/* nfs4xdr.c */ 196/* nfs4xdr.c */
191#ifdef CONFIG_NFS_V4 197#ifdef CONFIG_NFS_V4
192extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); 198extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
193#endif 199#endif
194#ifdef CONFIG_NFS_V4_1 200#ifdef CONFIG_NFS_V4_1
195extern const u32 nfs41_maxread_overhead; 201extern const u32 nfs41_maxread_overhead;
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 59047f8d7d72..eceafe74f473 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -153,6 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)
153 .rpc_resp = &result, 153 .rpc_resp = &result,
154 }; 154 };
155 struct rpc_create_args args = { 155 struct rpc_create_args args = {
156 .net = &init_net,
156 .protocol = info->protocol, 157 .protocol = info->protocol,
157 .address = info->sap, 158 .address = info->sap,
158 .addrsize = info->salen, 159 .addrsize = info->salen,
@@ -224,6 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)
224 .to_retries = 2, 225 .to_retries = 2,
225 }; 226 };
226 struct rpc_create_args args = { 227 struct rpc_create_args args = {
228 .net = &init_net,
227 .protocol = IPPROTO_UDP, 229 .protocol = IPPROTO_UDP,
228 .address = info->sap, 230 .address = info->sap,
229 .addrsize = info->salen, 231 .addrsize = info->salen,
@@ -436,7 +438,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
436 438
437 for (i = 0; i < entries; i++) { 439 for (i = 0; i < entries; i++) {
438 flavors[i] = ntohl(*p++); 440 flavors[i] = ntohl(*p++);
439 dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]); 441 dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]);
440 } 442 }
441 *count = i; 443 *count = i;
442 444
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index db8846a0e82e..e6bf45710cc7 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -337,10 +337,10 @@ nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
337static int 337static int
338nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) 338nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
339{ 339{
340 p = xdr_encode_fhandle(p, args->fromfh); 340 p = xdr_encode_fhandle(p, args->old_dir);
341 p = xdr_encode_array(p, args->fromname, args->fromlen); 341 p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
342 p = xdr_encode_fhandle(p, args->tofh); 342 p = xdr_encode_fhandle(p, args->new_dir);
343 p = xdr_encode_array(p, args->toname, args->tolen); 343 p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
344 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 344 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
345 return 0; 345 return 0;
346} 346}
@@ -423,9 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
423 struct page **page; 423 struct page **page;
424 size_t hdrlen; 424 size_t hdrlen;
425 unsigned int pglen, recvd; 425 unsigned int pglen, recvd;
426 u32 len;
427 int status, nr = 0; 426 int status, nr = 0;
428 __be32 *end, *entry, *kaddr;
429 427
430 if ((status = ntohl(*p++))) 428 if ((status = ntohl(*p++)))
431 return nfs_stat_to_errno(status); 429 return nfs_stat_to_errno(status);
@@ -445,80 +443,59 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
445 if (pglen > recvd) 443 if (pglen > recvd)
446 pglen = recvd; 444 pglen = recvd;
447 page = rcvbuf->pages; 445 page = rcvbuf->pages;
448 kaddr = p = kmap_atomic(*page, KM_USER0);
449 end = (__be32 *)((char *)p + pglen);
450 entry = p;
451
452 /* Make sure the packet actually has a value_follows and EOF entry */
453 if ((entry + 1) > end)
454 goto short_pkt;
455
456 for (; *p++; nr++) {
457 if (p + 2 > end)
458 goto short_pkt;
459 p++; /* fileid */
460 len = ntohl(*p++);
461 p += XDR_QUADLEN(len) + 1; /* name plus cookie */
462 if (len > NFS2_MAXNAMLEN) {
463 dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
464 len);
465 goto err_unmap;
466 }
467 if (p + 2 > end)
468 goto short_pkt;
469 entry = p;
470 }
471
472 /*
473 * Apparently some server sends responses that are a valid size, but
474 * contain no entries, and have value_follows==0 and EOF==0. For
475 * those, just set the EOF marker.
476 */
477 if (!nr && entry[1] == 0) {
478 dprintk("NFS: readdir reply truncated!\n");
479 entry[1] = 1;
480 }
481 out:
482 kunmap_atomic(kaddr, KM_USER0);
483 return nr; 446 return nr;
484 short_pkt: 447}
485 /* 448
486 * When we get a short packet there are 2 possibilities. We can 449static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
487 * return an error, or fix up the response to look like a valid 450{
488 * response and return what we have so far. If there are no 451 dprintk("nfs: %s: prematurely hit end of receive buffer. "
489 * entries and the packet was short, then return -EIO. If there 452 "Remaining buffer length is %tu words.\n",
490 * are valid entries in the response, return them and pretend that 453 func, xdr->end - xdr->p);
491 * the call was successful, but incomplete. The caller can retry the
492 * readdir starting at the last cookie.
493 */
494 entry[0] = entry[1] = 0;
495 if (!nr)
496 nr = -errno_NFSERR_IO;
497 goto out;
498err_unmap:
499 nr = -errno_NFSERR_IO;
500 goto out;
501} 454}
502 455
503__be32 * 456__be32 *
504nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 457nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
505{ 458{
506 if (!*p++) { 459 __be32 *p;
507 if (!*p) 460 p = xdr_inline_decode(xdr, 4);
461 if (unlikely(!p))
462 goto out_overflow;
463 if (!ntohl(*p++)) {
464 p = xdr_inline_decode(xdr, 4);
465 if (unlikely(!p))
466 goto out_overflow;
467 if (!ntohl(*p++))
508 return ERR_PTR(-EAGAIN); 468 return ERR_PTR(-EAGAIN);
509 entry->eof = 1; 469 entry->eof = 1;
510 return ERR_PTR(-EBADCOOKIE); 470 return ERR_PTR(-EBADCOOKIE);
511 } 471 }
512 472
473 p = xdr_inline_decode(xdr, 8);
474 if (unlikely(!p))
475 goto out_overflow;
476
513 entry->ino = ntohl(*p++); 477 entry->ino = ntohl(*p++);
514 entry->len = ntohl(*p++); 478 entry->len = ntohl(*p++);
479
480 p = xdr_inline_decode(xdr, entry->len + 4);
481 if (unlikely(!p))
482 goto out_overflow;
515 entry->name = (const char *) p; 483 entry->name = (const char *) p;
516 p += XDR_QUADLEN(entry->len); 484 p += XDR_QUADLEN(entry->len);
517 entry->prev_cookie = entry->cookie; 485 entry->prev_cookie = entry->cookie;
518 entry->cookie = ntohl(*p++); 486 entry->cookie = ntohl(*p++);
519 entry->eof = !p[0] && p[1]; 487
488 p = xdr_inline_peek(xdr, 8);
489 if (p != NULL)
490 entry->eof = !p[0] && p[1];
491 else
492 entry->eof = 0;
520 493
521 return p; 494 return p;
495
496out_overflow:
497 print_overflow_msg(__func__, xdr);
498 return ERR_PTR(-EIO);
522} 499}
523 500
524/* 501/*
@@ -596,7 +573,6 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
596 struct kvec *iov = rcvbuf->head; 573 struct kvec *iov = rcvbuf->head;
597 size_t hdrlen; 574 size_t hdrlen;
598 u32 len, recvd; 575 u32 len, recvd;
599 char *kaddr;
600 int status; 576 int status;
601 577
602 if ((status = ntohl(*p++))) 578 if ((status = ntohl(*p++)))
@@ -623,10 +599,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
623 return -EIO; 599 return -EIO;
624 } 600 }
625 601
626 /* NULL terminate the string we got */ 602 xdr_terminate_string(rcvbuf, len);
627 kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
628 kaddr[len+rcvbuf->page_base] = '\0';
629 kunmap_atomic(kaddr, KM_USER0);
630 return 0; 603 return 0;
631} 604}
632 605
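
The rewritten decoders above stop walking raw kmap()ed reply pages and instead pull every field through xdr_inline_decode(), which returns NULL whenever the requested bytes would run past the end of the receive buffer; a truncated or malicious reply now fails each step's bounds check instead of being over-read. A standalone sketch of the cursor pattern:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

struct xdr_cursor {
        const uint8_t *p;
        const uint8_t *end;
};

/* Hand out nbytes from the buffer, or NULL if that would overrun it;
 * the same contract as xdr_inline_decode(). */
static const uint8_t *cursor_decode(struct xdr_cursor *c, size_t nbytes)
{
        const uint8_t *ret;

        if ((size_t)(c->end - c->p) < nbytes)
                return NULL;
        ret = c->p;
        c->p += nbytes;
        return ret;
}

int main(void)
{
        uint8_t buf[8];
        uint32_t v = htonl(42);
        struct xdr_cursor c = { buf, buf + 4 }; /* only 4 valid bytes */
        const uint8_t *p;

        memcpy(buf, &v, 4);
        p = cursor_decode(&c, 4);
        if (p) {
                uint32_t out;

                memcpy(&out, p, 4);
                printf("decoded %u\n", ntohl(out)); /* 42 */
        }
        if (!cursor_decode(&c, 4))
                printf("second decode correctly refused\n");
        return 0;
}
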
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index fabb4f2849a1..ce939c062a52 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -313,7 +313,7 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
313 */ 313 */
314static int 314static int
315nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 315nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
316 int flags, struct nameidata *nd) 316 int flags, struct nfs_open_context *ctx)
317{ 317{
318 struct nfs3_createdata *data; 318 struct nfs3_createdata *data;
319 mode_t mode = sattr->ia_mode; 319 mode_t mode = sattr->ia_mode;
@@ -438,19 +438,38 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
438 return 1; 438 return 1;
439} 439}
440 440
441static void
442nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
443{
444 msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
445}
446
447static int
448nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
449 struct inode *new_dir)
450{
451 struct nfs_renameres *res;
452
453 if (nfs3_async_handle_jukebox(task, old_dir))
454 return 0;
455 res = task->tk_msg.rpc_resp;
456
457 nfs_post_op_update_inode(old_dir, res->old_fattr);
458 nfs_post_op_update_inode(new_dir, res->new_fattr);
459 return 1;
460}
461
441static int 462static int
442nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, 463nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
443 struct inode *new_dir, struct qstr *new_name) 464 struct inode *new_dir, struct qstr *new_name)
444{ 465{
445 struct nfs3_renameargs arg = { 466 struct nfs_renameargs arg = {
446 .fromfh = NFS_FH(old_dir), 467 .old_dir = NFS_FH(old_dir),
447 .fromname = old_name->name, 468 .old_name = old_name,
448 .fromlen = old_name->len, 469 .new_dir = NFS_FH(new_dir),
449 .tofh = NFS_FH(new_dir), 470 .new_name = new_name,
450 .toname = new_name->name,
451 .tolen = new_name->len
452 }; 471 };
453 struct nfs3_renameres res; 472 struct nfs_renameres res;
454 struct rpc_message msg = { 473 struct rpc_message msg = {
455 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], 474 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
456 .rpc_argp = &arg, 475 .rpc_argp = &arg,
@@ -460,17 +479,17 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
460 479
461 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 480 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
462 481
463 res.fromattr = nfs_alloc_fattr(); 482 res.old_fattr = nfs_alloc_fattr();
464 res.toattr = nfs_alloc_fattr(); 483 res.new_fattr = nfs_alloc_fattr();
465 if (res.fromattr == NULL || res.toattr == NULL) 484 if (res.old_fattr == NULL || res.new_fattr == NULL)
466 goto out; 485 goto out;
467 486
468 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); 487 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
469 nfs_post_op_update_inode(old_dir, res.fromattr); 488 nfs_post_op_update_inode(old_dir, res.old_fattr);
470 nfs_post_op_update_inode(new_dir, res.toattr); 489 nfs_post_op_update_inode(new_dir, res.new_fattr);
471out: 490out:
472 nfs_free_fattr(res.toattr); 491 nfs_free_fattr(res.old_fattr);
473 nfs_free_fattr(res.fromattr); 492 nfs_free_fattr(res.new_fattr);
474 dprintk("NFS reply rename: %d\n", status); 493 dprintk("NFS reply rename: %d\n", status);
475 return status; 494 return status;
476} 495}
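
The rename_setup/rename_done pair added above extends the per-version nfs_rpc_ops table so a generic, asynchronous rename path can drive NFSv3 RENAME: setup selects the procedure, done handles jukebox retries and refreshes both directories' post-op attributes. A standalone sketch of the function-pointer dispatch involved (all types and names below are illustrative, not the kernel's):

#include <stdio.h>

struct proto_ops {
        void (*rename_setup)(const char **proc);
        int  (*rename_done)(int task_status);
};

static void v3_rename_setup(const char **proc)
{
        *proc = "NFS3PROC_RENAME";
}

static int v3_rename_done(int task_status)
{
        /* the real callback also updates cached directory attributes */
        return task_status == 0;
}

static const struct proto_ops nfs_v3_ops = {
        .rename_setup = v3_rename_setup,
        .rename_done  = v3_rename_done,
};

int main(void)
{
        const char *proc = NULL;

        nfs_v3_ops.rename_setup(&proc);
        printf("%s -> done ok? %d\n", proc, nfs_v3_ops.rename_done(0));
        return 0;
}
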
@@ -611,7 +630,7 @@ out:
611 */ 630 */
612static int 631static int
613nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 632nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
614 u64 cookie, struct page *page, unsigned int count, int plus) 633 u64 cookie, struct page **pages, unsigned int count, int plus)
615{ 634{
616 struct inode *dir = dentry->d_inode; 635 struct inode *dir = dentry->d_inode;
617 __be32 *verf = NFS_COOKIEVERF(dir); 636 __be32 *verf = NFS_COOKIEVERF(dir);
@@ -621,7 +640,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
621 .verf = {verf[0], verf[1]}, 640 .verf = {verf[0], verf[1]},
622 .plus = plus, 641 .plus = plus,
623 .count = count, 642 .count = count,
624 .pages = &page 643 .pages = pages
625 }; 644 };
626 struct nfs3_readdirres res = { 645 struct nfs3_readdirres res = {
627 .verf = verf, 646 .verf = verf,
@@ -652,7 +671,8 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
652 671
653 nfs_free_fattr(res.dir_attr); 672 nfs_free_fattr(res.dir_attr);
654out: 673out:
655 dprintk("NFS reply readdir: %d\n", status); 674 dprintk("NFS reply readdir%s: %d\n",
675 plus ? "plus" : "", status);
656 return status; 676 return status;
657} 677}
658 678
@@ -722,7 +742,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
722 dprintk("NFS call fsstat\n"); 742 dprintk("NFS call fsstat\n");
723 nfs_fattr_init(stat->fattr); 743 nfs_fattr_init(stat->fattr);
724 status = rpc_call_sync(server->client, &msg, 0); 744 status = rpc_call_sync(server->client, &msg, 0);
725 dprintk("NFS reply statfs: %d\n", status); 745 dprintk("NFS reply fsstat: %d\n", status);
726 return status; 746 return status;
727} 747}
728 748
@@ -844,6 +864,8 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
844 .unlink_setup = nfs3_proc_unlink_setup, 864 .unlink_setup = nfs3_proc_unlink_setup,
845 .unlink_done = nfs3_proc_unlink_done, 865 .unlink_done = nfs3_proc_unlink_done,
846 .rename = nfs3_proc_rename, 866 .rename = nfs3_proc_rename,
867 .rename_setup = nfs3_proc_rename_setup,
868 .rename_done = nfs3_proc_rename_done,
847 .link = nfs3_proc_link, 869 .link = nfs3_proc_link,
848 .symlink = nfs3_proc_symlink, 870 .symlink = nfs3_proc_symlink,
849 .mkdir = nfs3_proc_mkdir, 871 .mkdir = nfs3_proc_mkdir,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9769704f8ce6..d9a5e832c257 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -100,6 +100,13 @@ static const umode_t nfs_type2fmt[] = {
100 [NF3FIFO] = S_IFIFO, 100 [NF3FIFO] = S_IFIFO,
101}; 101};
102 102
103static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
104{
105 dprintk("nfs: %s: prematurely hit end of receive buffer. "
106 "Remaining buffer length is %tu words.\n",
107 func, xdr->end - xdr->p);
108}
109
103/* 110/*
104 * Common NFS XDR functions as inlines 111 * Common NFS XDR functions as inlines
105 */ 112 */
@@ -119,6 +126,29 @@ xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
119 return NULL; 126 return NULL;
120} 127}
121 128
129static inline __be32 *
130xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh)
131{
132 __be32 *p;
133 p = xdr_inline_decode(xdr, 4);
134 if (unlikely(!p))
135 goto out_overflow;
136 fh->size = ntohl(*p++);
137
138 if (fh->size <= NFS3_FHSIZE) {
139 p = xdr_inline_decode(xdr, fh->size);
140 if (unlikely(!p))
141 goto out_overflow;
142 memcpy(fh->data, p, fh->size);
143 return p + XDR_QUADLEN(fh->size);
144 }
145 return NULL;
146
147out_overflow:
148 print_overflow_msg(__func__, xdr);
149 return ERR_PTR(-EIO);
150}
151
122/* 152/*
123 * Encode/decode time. 153 * Encode/decode time.
124 */ 154 */
@@ -241,6 +271,26 @@ xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
241} 271}
242 272
243static inline __be32 * 273static inline __be32 *
274xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr)
275{
276 __be32 *p;
277
278 p = xdr_inline_decode(xdr, 4);
279 if (unlikely(!p))
280 goto out_overflow;
281 if (ntohl(*p++)) {
282 p = xdr_inline_decode(xdr, 84);
283 if (unlikely(!p))
284 goto out_overflow;
285 p = xdr_decode_fattr(p, fattr);
286 }
287 return p;
288out_overflow:
289 print_overflow_msg(__func__, xdr);
290 return ERR_PTR(-EIO);
291}
292
293static inline __be32 *
244xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr) 294xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
245{ 295{
246 if (*p++) 296 if (*p++)
@@ -442,12 +492,12 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
442 * Encode RENAME arguments 492 * Encode RENAME arguments
443 */ 493 */
444static int 494static int
445nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args) 495nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
446{ 496{
447 p = xdr_encode_fhandle(p, args->fromfh); 497 p = xdr_encode_fhandle(p, args->old_dir);
448 p = xdr_encode_array(p, args->fromname, args->fromlen); 498 p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
449 p = xdr_encode_fhandle(p, args->tofh); 499 p = xdr_encode_fhandle(p, args->new_dir);
450 p = xdr_encode_array(p, args->toname, args->tolen); 500 p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
451 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 501 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
452 return 0; 502 return 0;
453} 503}
@@ -504,9 +554,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
504 struct kvec *iov = rcvbuf->head; 554 struct kvec *iov = rcvbuf->head;
505 struct page **page; 555 struct page **page;
506 size_t hdrlen; 556 size_t hdrlen;
507 u32 len, recvd, pglen; 557 u32 recvd, pglen;
508 int status, nr = 0; 558 int status, nr = 0;
509 __be32 *entry, *end, *kaddr;
510 559
511 status = ntohl(*p++); 560 status = ntohl(*p++);
512 /* Decode post_op_attrs */ 561 /* Decode post_op_attrs */
@@ -536,99 +585,38 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
536 if (pglen > recvd) 585 if (pglen > recvd)
537 pglen = recvd; 586 pglen = recvd;
538 page = rcvbuf->pages; 587 page = rcvbuf->pages;
539 kaddr = p = kmap_atomic(*page, KM_USER0);
540 end = (__be32 *)((char *)p + pglen);
541 entry = p;
542
543 /* Make sure the packet actually has a value_follows and EOF entry */
544 if ((entry + 1) > end)
545 goto short_pkt;
546
547 for (; *p++; nr++) {
548 if (p + 3 > end)
549 goto short_pkt;
550 p += 2; /* inode # */
551 len = ntohl(*p++); /* string length */
552 p += XDR_QUADLEN(len) + 2; /* name + cookie */
553 if (len > NFS3_MAXNAMLEN) {
554 dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
555 len);
556 goto err_unmap;
557 }
558 588
559 if (res->plus) {
560 /* post_op_attr */
561 if (p + 2 > end)
562 goto short_pkt;
563 if (*p++) {
564 p += 21;
565 if (p + 1 > end)
566 goto short_pkt;
567 }
568 /* post_op_fh3 */
569 if (*p++) {
570 if (p + 1 > end)
571 goto short_pkt;
572 len = ntohl(*p++);
573 if (len > NFS3_FHSIZE) {
574 dprintk("NFS: giant filehandle in "
575 "readdir (len 0x%x)!\n", len);
576 goto err_unmap;
577 }
578 p += XDR_QUADLEN(len);
579 }
580 }
581
582 if (p + 2 > end)
583 goto short_pkt;
584 entry = p;
585 }
586
587 /*
588 * Apparently some server sends responses that are a valid size, but
589 * contain no entries, and have value_follows==0 and EOF==0. For
590 * those, just set the EOF marker.
591 */
592 if (!nr && entry[1] == 0) {
593 dprintk("NFS: readdir reply truncated!\n");
594 entry[1] = 1;
595 }
596 out:
597 kunmap_atomic(kaddr, KM_USER0);
598 return nr; 589 return nr;
599 short_pkt:
600 /*
601 * When we get a short packet there are 2 possibilities. We can
602 * return an error, or fix up the response to look like a valid
603 * response and return what we have so far. If there are no
604 * entries and the packet was short, then return -EIO. If there
605 * are valid entries in the response, return them and pretend that
606 * the call was successful, but incomplete. The caller can retry the
607 * readdir starting at the last cookie.
608 */
609 entry[0] = entry[1] = 0;
610 if (!nr)
611 nr = -errno_NFSERR_IO;
612 goto out;
613err_unmap:
614 nr = -errno_NFSERR_IO;
615 goto out;
616} 590}
617 591
618__be32 * 592__be32 *
619nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) 593nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
620{ 594{
595 __be32 *p;
621 struct nfs_entry old = *entry; 596 struct nfs_entry old = *entry;
622 597
623 if (!*p++) { 598 p = xdr_inline_decode(xdr, 4);
624 if (!*p) 599 if (unlikely(!p))
600 goto out_overflow;
601 if (!ntohl(*p++)) {
602 p = xdr_inline_decode(xdr, 4);
603 if (unlikely(!p))
604 goto out_overflow;
605 if (!ntohl(*p++))
625 return ERR_PTR(-EAGAIN); 606 return ERR_PTR(-EAGAIN);
626 entry->eof = 1; 607 entry->eof = 1;
627 return ERR_PTR(-EBADCOOKIE); 608 return ERR_PTR(-EBADCOOKIE);
628 } 609 }
629 610
611 p = xdr_inline_decode(xdr, 12);
612 if (unlikely(!p))
613 goto out_overflow;
630 p = xdr_decode_hyper(p, &entry->ino); 614 p = xdr_decode_hyper(p, &entry->ino);
631 entry->len = ntohl(*p++); 615 entry->len = ntohl(*p++);
616
617 p = xdr_inline_decode(xdr, entry->len + 8);
618 if (unlikely(!p))
619 goto out_overflow;
632 entry->name = (const char *) p; 620 entry->name = (const char *) p;
633 p += XDR_QUADLEN(entry->len); 621 p += XDR_QUADLEN(entry->len);
634 entry->prev_cookie = entry->cookie; 622 entry->prev_cookie = entry->cookie;
@@ -636,10 +624,17 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
636 624
637 if (plus) { 625 if (plus) {
638 entry->fattr->valid = 0; 626 entry->fattr->valid = 0;
639 p = xdr_decode_post_op_attr(p, entry->fattr); 627 p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
628 if (IS_ERR(p))
629 goto out_overflow_exit;
640 /* In fact, a post_op_fh3: */ 630 /* In fact, a post_op_fh3: */
631 p = xdr_inline_decode(xdr, 4);
632 if (unlikely(!p))
633 goto out_overflow;
641 if (*p++) { 634 if (*p++) {
642 p = xdr_decode_fhandle(p, entry->fh); 635 p = xdr_decode_fhandle_stream(xdr, entry->fh);
636 if (IS_ERR(p))
637 goto out_overflow_exit;
643 /* Ugh -- server reply was truncated */ 638 /* Ugh -- server reply was truncated */
644 if (p == NULL) { 639 if (p == NULL) {
645 dprintk("NFS: FH truncated\n"); 640 dprintk("NFS: FH truncated\n");
@@ -650,8 +645,18 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
650 memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); 645 memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
651 } 646 }
652 647
653 entry->eof = !p[0] && p[1]; 648 p = xdr_inline_peek(xdr, 8);
649 if (p != NULL)
650 entry->eof = !p[0] && p[1];
651 else
652 entry->eof = 0;
653
654 return p; 654 return p;
655
656out_overflow:
657 print_overflow_msg(__func__, xdr);
658out_overflow_exit:
659 return ERR_PTR(-EIO);
655} 660}
656 661
657/* 662/*
@@ -824,7 +829,6 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
824 struct kvec *iov = rcvbuf->head; 829 struct kvec *iov = rcvbuf->head;
825 size_t hdrlen; 830 size_t hdrlen;
826 u32 len, recvd; 831 u32 len, recvd;
827 char *kaddr;
828 int status; 832 int status;
829 833
830 status = ntohl(*p++); 834 status = ntohl(*p++);
@@ -857,10 +861,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
857 return -EIO; 861 return -EIO;
858 } 862 }
859 863
860 /* NULL terminate the string we got */ 864 xdr_terminate_string(rcvbuf, len);
861 kaddr = (char*)kmap_atomic(rcvbuf->pages[0], KM_USER0);
862 kaddr[len+rcvbuf->page_base] = '\0';
863 kunmap_atomic(kaddr, KM_USER0);
864 return 0; 865 return 0;
865} 866}
866 867
@@ -970,14 +971,14 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
970 * Decode RENAME reply 971 * Decode RENAME reply
971 */ 972 */
972static int 973static int
973nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res) 974nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res)
974{ 975{
975 int status; 976 int status;
976 977
977 if ((status = ntohl(*p++)) != 0) 978 if ((status = ntohl(*p++)) != 0)
978 status = nfs_stat_to_errno(status); 979 status = nfs_stat_to_errno(status);
979 p = xdr_decode_wcc_data(p, res->fromattr); 980 p = xdr_decode_wcc_data(p, res->old_fattr);
980 p = xdr_decode_wcc_data(p, res->toattr); 981 p = xdr_decode_wcc_data(p, res->new_fattr);
981 return status; 982 return status;
982} 983}
983 984
@@ -1043,8 +1044,9 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
1043 res->wtmult = ntohl(*p++); 1044 res->wtmult = ntohl(*p++);
1044 res->dtpref = ntohl(*p++); 1045 res->dtpref = ntohl(*p++);
1045 p = xdr_decode_hyper(p, &res->maxfilesize); 1046 p = xdr_decode_hyper(p, &res->maxfilesize);
1047 p = xdr_decode_time3(p, &res->time_delta);
1046 1048
1047 /* ignore time_delta and properties */ 1049 /* ignore properties */
1048 res->lease_time = 0; 1050 res->lease_time = 0;
1049 return 0; 1051 return 0;
1050} 1052}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 311e15cc8af0..9fa496387fdf 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -242,8 +242,6 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); 244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
245extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
246extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
247extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 245extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
248extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 246extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
249 struct nfs4_fs_locations *fs_locations, struct page *page); 247 struct nfs4_fs_locations *fs_locations, struct page *page);
@@ -333,7 +331,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
333extern const nfs4_stateid zero_stateid; 331extern const nfs4_stateid zero_stateid;
334 332
335/* nfs4xdr.c */ 333/* nfs4xdr.c */
336extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); 334extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
337extern struct rpc_procinfo nfs4_procedures[]; 335extern struct rpc_procinfo nfs4_procedures[];
338 336
339struct nfs4_mount_data; 337struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
new file mode 100644
index 000000000000..2e92f0d8d654
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.c
@@ -0,0 +1,280 @@
1/*
2 * Module for the pnfs nfs4 file layout driver.
3 * Defines all I/O and Policy interface operations, plus code
4 * to register itself with the pNFS client.
5 *
6 * Copyright (c) 2002
7 * The Regents of the University of Michigan
8 * All Rights Reserved
9 *
10 * Dean Hildebrand <dhildebz@umich.edu>
11 *
12 * Permission is granted to use, copy, create derivative works, and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the University of Michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. If
17 * the above copyright notice or any other identification of the
18 * University of Michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * This software is provided as is, without representation or warranty
22 * of any kind either express or implied, including without limitation
23 * the implied warranties of merchantability, fitness for a particular
24 * purpose, or noninfringement. The Regents of the University of
25 * Michigan shall not be liable for any damages, including special,
26 * indirect, incidental, or consequential damages, with respect to any
27 * claim arising out of or in connection with the use of the software,
28 * even if it has been or is hereafter advised of the possibility of
29 * such damages.
30 */
31
32#include <linux/nfs_fs.h>
33
34#include "internal.h"
35#include "nfs4filelayout.h"
36
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38
39MODULE_LICENSE("GPL");
40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
41MODULE_DESCRIPTION("The NFSv4 file layout driver");
42
43static int
44filelayout_set_layoutdriver(struct nfs_server *nfss)
45{
46 int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
47 nfs4_fl_free_deviceid_callback);
48 if (status) {
49 printk(KERN_WARNING "%s: deviceid cache could not be "
50 "initialized\n", __func__);
51 return status;
52 }
53 dprintk("%s: deviceid cache has been initialized successfully\n",
54 __func__);
55 return 0;
56}
57
58/* Clear out the layout by destroying its device list */
59static int
60filelayout_clear_layoutdriver(struct nfs_server *nfss)
61{
62 dprintk("--> %s\n", __func__);
63
64 if (nfss->nfs_client->cl_devid_cache)
65 pnfs_put_deviceid_cache(nfss->nfs_client);
66 return 0;
67}
68
69/*
70 * filelayout_check_layout()
71 *
72 * Make sure layout segment parameters are sane WRT the device.
73 * At this point no generic layer initialization of the lseg has occurred,
74 * and nothing has been added to the layout_hdr cache.
75 *
76 */
77static int
78filelayout_check_layout(struct pnfs_layout_hdr *lo,
79 struct nfs4_filelayout_segment *fl,
80 struct nfs4_layoutget_res *lgr,
81 struct nfs4_deviceid *id)
82{
83 struct nfs4_file_layout_dsaddr *dsaddr;
84 int status = -EINVAL;
85 struct nfs_server *nfss = NFS_SERVER(lo->inode);
86
87 dprintk("--> %s\n", __func__);
88
89 if (fl->pattern_offset > lgr->range.offset) {
90 dprintk("%s pattern_offset %lld to large\n",
91 __func__, fl->pattern_offset);
92 goto out;
93 }
94
95 if (fl->stripe_unit % PAGE_SIZE) {
96 dprintk("%s Stripe unit (%u) not page aligned\n",
97 __func__, fl->stripe_unit);
98 goto out;
99 }
100
101 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
103 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->inode, id);
105 if (dsaddr == NULL)
106 goto out;
107 }
108 fl->dsaddr = dsaddr;
109
110 if (fl->first_stripe_index < 0 ||
111 fl->first_stripe_index >= dsaddr->stripe_count) {
112 dprintk("%s Bad first_stripe_index %d\n",
113 __func__, fl->first_stripe_index);
114 goto out_put;
115 }
116
117 if ((fl->stripe_type == STRIPE_SPARSE &&
118 fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
119 (fl->stripe_type == STRIPE_DENSE &&
120 fl->num_fh != dsaddr->stripe_count)) {
121 dprintk("%s num_fh %u not valid for given packing\n",
122 __func__, fl->num_fh);
123 goto out_put;
124 }
125
126 if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
127 dprintk("%s Stripe unit (%u) not aligned with rsize %u "
128 "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
129 nfss->wsize);
130 }
131
132 status = 0;
133out:
134 dprintk("--> %s returns %d\n", __func__, status);
135 return status;
136out_put:
137 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
138 goto out;
139}
140
141static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
142{
143 int i;
144
145 for (i = 0; i < fl->num_fh; i++) {
146 if (!fl->fh_array[i])
147 break;
148 kfree(fl->fh_array[i]);
149 }
150 kfree(fl->fh_array);
151 fl->fh_array = NULL;
152}
153
154static void
155_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
156{
157 filelayout_free_fh_array(fl);
158 kfree(fl);
159}
160
161static int
162filelayout_decode_layout(struct pnfs_layout_hdr *flo,
163 struct nfs4_filelayout_segment *fl,
164 struct nfs4_layoutget_res *lgr,
165 struct nfs4_deviceid *id)
166{
167 uint32_t *p = (uint32_t *)lgr->layout.buf;
168 uint32_t nfl_util;
169 int i;
170
171 dprintk("%s: set_layout_map Begin\n", __func__);
172
173 memcpy(id, p, sizeof(*id));
174 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
175 print_deviceid(id);
176
177 nfl_util = be32_to_cpup(p++);
178 if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
179 fl->commit_through_mds = 1;
180 if (nfl_util & NFL4_UFLG_DENSE)
181 fl->stripe_type = STRIPE_DENSE;
182 else
183 fl->stripe_type = STRIPE_SPARSE;
184 fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
185
186 fl->first_stripe_index = be32_to_cpup(p++);
187 p = xdr_decode_hyper(p, &fl->pattern_offset);
188 fl->num_fh = be32_to_cpup(p++);
189
190 dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
191 __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
192 fl->pattern_offset);
193
194 fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
195 GFP_KERNEL);
196 if (!fl->fh_array)
197 return -ENOMEM;
198
199 for (i = 0; i < fl->num_fh; i++) {
200 /* Do we want to use a mempool here? */
201 fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
202 if (!fl->fh_array[i]) {
203 filelayout_free_fh_array(fl);
204 return -ENOMEM;
205 }
206 fl->fh_array[i]->size = be32_to_cpup(p++);
207 if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
208 printk(KERN_ERR "Too big fh %d received %d\n",
209 i, fl->fh_array[i]->size);
210 filelayout_free_fh_array(fl);
211 return -EIO;
212 }
213 memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
214 p += XDR_QUADLEN(fl->fh_array[i]->size);
215 dprintk("DEBUG: %s: fh len %d\n", __func__,
216 fl->fh_array[i]->size);
217 }
218
219 return 0;
220}
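
filelayout_decode_layout() above unpacks nfl_util, which carries both the stripe unit size and the layout flags in a single 32-bit word (the low six bits are flags, per the NFL4_UFLG_* constants of RFC 5661). A worked example, assuming the usual values NFL4_UFLG_MASK = 0x3f, NFL4_UFLG_DENSE = 0x1, NFL4_UFLG_COMMIT_THRU_MDS = 0x2:

	/* nfl_util = 0x00010001:
	 *   nfl_util & NFL4_UFLG_COMMIT_THRU_MDS == 0 -> commit goes to the DSes
	 *   nfl_util & NFL4_UFLG_DENSE != 0           -> STRIPE_DENSE packing
	 *   nfl_util & ~NFL4_UFLG_MASK == 0x00010000  -> 64 KiB stripe unit
	 */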
221
222static struct pnfs_layout_segment *
223filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
224 struct nfs4_layoutget_res *lgr)
225{
226 struct nfs4_filelayout_segment *fl;
227 int rc;
228 struct nfs4_deviceid id;
229
230 dprintk("--> %s\n", __func__);
231 fl = kzalloc(sizeof(*fl), GFP_KERNEL);
232 if (!fl)
233 return NULL;
234
235 rc = filelayout_decode_layout(layoutid, fl, lgr, &id);
236 if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) {
237 _filelayout_free_lseg(fl);
238 return NULL;
239 }
240 return &fl->generic_hdr;
241}
242
243static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{
246 struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248
249 dprintk("--> %s\n", __func__);
250 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
251 &fl->dsaddr->deviceid);
252 _filelayout_free_lseg(fl);
253}
254
255static struct pnfs_layoutdriver_type filelayout_type = {
256 .id = LAYOUT_NFSV4_1_FILES,
257 .name = "LAYOUT_NFSV4_1_FILES",
258 .owner = THIS_MODULE,
259 .set_layoutdriver = filelayout_set_layoutdriver,
260 .clear_layoutdriver = filelayout_clear_layoutdriver,
261 .alloc_lseg = filelayout_alloc_lseg,
262 .free_lseg = filelayout_free_lseg,
263};
264
265static int __init nfs4filelayout_init(void)
266{
267 printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
268 __func__);
269 return pnfs_register_layoutdriver(&filelayout_type);
270}
271
272static void __exit nfs4filelayout_exit(void)
273{
274 printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
275 __func__);
276 pnfs_unregister_layoutdriver(&filelayout_type);
277}
278
279module_init(nfs4filelayout_init);
280module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
new file mode 100644
index 000000000000..bbf60dd2ab9d
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.h
@@ -0,0 +1,94 @@
1/*
2 * NFSv4 file layout driver data structures.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#ifndef FS_NFS_NFS4FILELAYOUT_H
31#define FS_NFS_NFS4FILELAYOUT_H
32
33#include "pnfs.h"
34
35/*
36 * Field testing shows we need to support up to 4096 stripe indices.
37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
38 * reasonable. This in turn means we support a maximum of 256
39 * RFC 5661 multipath_list4 structures.
40 */
41#define NFS4_PNFS_MAX_STRIPE_CNT 4096
42#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
43
44enum stripetype4 {
45 STRIPE_SPARSE = 1,
46 STRIPE_DENSE = 2
47};
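
The two packings differ in how file data is addressed on a data server: with STRIPE_SPARSE a DS is read and written at the raw file offset, while with STRIPE_DENSE each DS stores only its own stripes, packed contiguously. Either way the stripe-to-DS mapping follows RFC 5661; a sketch of it under those semantics (the helper name is illustrative, not part of this patch):

	static u32 fl_stripe_to_ds_index(struct nfs4_filelayout_segment *fl,
					 u64 offset)
	{
		struct nfs4_file_layout_dsaddr *dsaddr = fl->dsaddr;
		u64 stripe_no = div_u64(offset - fl->pattern_offset,
					fl->stripe_unit);
		u32 idx;

		stripe_no += fl->first_stripe_index;
		idx = do_div(stripe_no, dsaddr->stripe_count);	/* remainder */
		return dsaddr->stripe_indices[idx];	/* index into ds_list[] */
	}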
48
49/* Individual ip address */
50struct nfs4_pnfs_ds {
51 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
52 u32 ds_ip_addr;
53 u32 ds_port;
54 struct nfs_client *ds_clp;
55 atomic_t ds_count;
56};
57
58struct nfs4_file_layout_dsaddr {
59 struct pnfs_deviceid_node deviceid;
60 u32 stripe_count;
61 u8 *stripe_indices;
62 u32 ds_num;
63 struct nfs4_pnfs_ds *ds_list[1];
64};
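
ds_list[1] is the old-style trailing flexible-array idiom: the struct is allocated with room for ds_num pointers, as decode_device() in nfs4filelayoutdev.c does:

	dsaddr = kzalloc(sizeof(*dsaddr) +
			 sizeof(struct nfs4_pnfs_ds *) * (num - 1), GFP_KERNEL);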
65
66struct nfs4_filelayout_segment {
67 struct pnfs_layout_segment generic_hdr;
68 u32 stripe_type;
69 u32 commit_through_mds;
70 u32 stripe_unit;
71 u32 first_stripe_index;
72 u64 pattern_offset;
73 struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
74 unsigned int num_fh;
75 struct nfs_fh **fh_array;
76};
77
78static inline struct nfs4_filelayout_segment *
79FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
80{
81 return container_of(lseg,
82 struct nfs4_filelayout_segment,
83 generic_hdr);
84}
85
86extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
87extern void print_ds(struct nfs4_pnfs_ds *ds);
88extern void print_deviceid(struct nfs4_deviceid *dev_id);
89extern struct nfs4_file_layout_dsaddr *
90nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
91struct nfs4_file_layout_dsaddr *
92get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
93
94#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
new file mode 100644
index 000000000000..51fe64ace55a
--- /dev/null
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -0,0 +1,448 @@
1/*
2 * Device operations for the pnfs nfs4 file layout driver.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 * Garth Goodson <Garth.Goodson@netapp.com>
10 *
11 * Permission is granted to use, copy, create derivative works, and
12 * redistribute this software and such derivative works for any purpose,
13 * so long as the name of the University of Michigan is not used in
14 * any advertising or publicity pertaining to the use or distribution
15 * of this software without specific, written prior authorization. If
16 * the above copyright notice or any other identification of the
17 * University of Michigan is included in any copy of any portion of
18 * this software, then the disclaimer below must also be included.
19 *
20 * This software is provided as is, without representation or warranty
21 * of any kind either express or implied, including without limitation
22 * the implied warranties of merchantability, fitness for a particular
23 * purpose, or noninfringement. The Regents of the University of
24 * Michigan shall not be liable for any damages, including special,
25 * indirect, incidental, or consequential damages, with respect to any
26 * claim arising out of or in connection with the use of the software,
27 * even if it has been or is hereafter advised of the possibility of
28 * such damages.
29 */
30
31#include <linux/nfs_fs.h>
32#include <linux/vmalloc.h>
33
34#include "internal.h"
35#include "nfs4filelayout.h"
36
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38
39/*
40 * Data server cache
41 *
42 * Data servers can be mapped to different device ids.
43 * nfs4_pnfs_ds reference counting
44 * - set to 1 on allocation
45 * - incremented when a device id maps a data server already in the cache.
46 * - decremented when deviceid is removed from the cache.
47 */
48DEFINE_SPINLOCK(nfs4_ds_cache_lock);
49static LIST_HEAD(nfs4_data_server_cache);
50
51/* Debug routines */
52void
53print_ds(struct nfs4_pnfs_ds *ds)
54{
55 if (ds == NULL) {
56 printk("%s NULL device\n", __func__);
57 return;
58 }
59 printk(" ip_addr %x port %hu\n"
60 " ref count %d\n"
61 " client %p\n"
62 " cl_exchange_flags %x\n",
63 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
64 atomic_read(&ds->ds_count), ds->ds_clp,
65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
66}
67
68void
69print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
70{
71 int i;
72
73 ifdebug(FACILITY) {
74 printk("%s dsaddr->ds_num %d\n", __func__,
75 dsaddr->ds_num);
76 for (i = 0; i < dsaddr->ds_num; i++)
77 print_ds(dsaddr->ds_list[i]);
78 }
79}
80
81void print_deviceid(struct nfs4_deviceid *id)
82{
83 u32 *p = (u32 *)id;
84
85 dprintk("%s: device id= [%x%x%x%x]\n", __func__,
86 p[0], p[1], p[2], p[3]);
87}
88
89/* nfs4_ds_cache_lock is held */
90static struct nfs4_pnfs_ds *
91_data_server_lookup_locked(u32 ip_addr, u32 port)
92{
93 struct nfs4_pnfs_ds *ds;
94
95 dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
96 ntohl(ip_addr), ntohs(port));
97
98 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
99 if (ds->ds_ip_addr == ip_addr &&
100 ds->ds_port == port) {
101 return ds;
102 }
103 }
104 return NULL;
105}
106
107static void
108destroy_ds(struct nfs4_pnfs_ds *ds)
109{
110 dprintk("--> %s\n", __func__);
111 ifdebug(FACILITY)
112 print_ds(ds);
113
114 if (ds->ds_clp)
115 nfs_put_client(ds->ds_clp);
116 kfree(ds);
117}
118
119static void
120nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
121{
122 struct nfs4_pnfs_ds *ds;
123 int i;
124
125 print_deviceid(&dsaddr->deviceid.de_id);
126
127 for (i = 0; i < dsaddr->ds_num; i++) {
128 ds = dsaddr->ds_list[i];
129 if (ds != NULL) {
130 if (atomic_dec_and_lock(&ds->ds_count,
131 &nfs4_ds_cache_lock)) {
132 list_del_init(&ds->ds_node);
133 spin_unlock(&nfs4_ds_cache_lock);
134 destroy_ds(ds);
135 }
136 }
137 }
138 kfree(dsaddr->stripe_indices);
139 kfree(dsaddr);
140}
141
142void
143nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
144{
145 struct nfs4_file_layout_dsaddr *dsaddr =
146 container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
147
148 nfs4_fl_free_deviceid(dsaddr);
149}
150
151static struct nfs4_pnfs_ds *
152nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
153{
154 struct nfs4_pnfs_ds *tmp_ds, *ds;
155
156 ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL);
157 if (!ds)
158 goto out;
159
160 spin_lock(&nfs4_ds_cache_lock);
161 tmp_ds = _data_server_lookup_locked(ip_addr, port);
162 if (tmp_ds == NULL) {
163 ds->ds_ip_addr = ip_addr;
164 ds->ds_port = port;
165 atomic_set(&ds->ds_count, 1);
166 INIT_LIST_HEAD(&ds->ds_node);
167 ds->ds_clp = NULL;
168 list_add(&ds->ds_node, &nfs4_data_server_cache);
169 dprintk("%s add new data server ip 0x%x\n", __func__,
170 ds->ds_ip_addr);
171 } else {
172 kfree(ds);
173 atomic_inc(&tmp_ds->ds_count);
174 dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
175 __func__, tmp_ds->ds_ip_addr,
176 atomic_read(&tmp_ds->ds_count));
177 ds = tmp_ds;
178 }
179 spin_unlock(&nfs4_ds_cache_lock);
180out:
181 return ds;
182}
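
Note the allocate-then-lock pattern here: kzalloc(GFP_KERNEL) may sleep, so it happens before nfs4_ds_cache_lock is taken, and the speculative allocation is simply freed again on a cache hit. Callers pair the reference taken here (or via nfs4_fl_find_get_deviceid()) with a put, roughly:

	dsaddr = nfs4_fl_find_get_deviceid(clp, id);	/* takes a reference */
	if (dsaddr) {
		/* ... use dsaddr->ds_list[...] ... */
		pnfs_put_deviceid(clp->cl_devid_cache, &dsaddr->deviceid);
	}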
183
184/*
185 * Currently we support only IPv4 and a single multipath address.
186 */
187static struct nfs4_pnfs_ds *
188decode_and_add_ds(__be32 **pp, struct inode *inode)
189{
190 struct nfs4_pnfs_ds *ds = NULL;
191 char *buf;
192 const char *ipend, *pstr;
193 u32 ip_addr, port;
194 int nlen, rlen, i;
195 int tmp[2];
196 __be32 *r_netid, *r_addr, *p = *pp;
197
198 /* r_netid */
199 nlen = be32_to_cpup(p++);
200 r_netid = p;
201 p += XDR_QUADLEN(nlen);
202
203 /* r_addr */
204 rlen = be32_to_cpup(p++);
205 r_addr = p;
206 p += XDR_QUADLEN(rlen);
207 *pp = p;
208
209 /* Check that netid is "tcp" */
210 if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) {
211 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
212 goto out_err;
213 }
214
215 /* ipv6 length plus port is legal */
216 if (rlen > INET6_ADDRSTRLEN + 8) {
217 dprintk("%s Invalid address, length %d\n", __func__,
218 rlen);
219 goto out_err;
220 }
221 buf = kmalloc(rlen + 1, GFP_KERNEL);
222 buf[rlen] = '\0';
223 memcpy(buf, r_addr, rlen);
224
225 /* replace the port dots with dashes for the in4_pton() delimiter */
226 for (i = 0; i < 2; i++) {
227 char *res = strrchr(buf, '.');
228 *res = '-';
229 }
230
231 /* Currently only support ipv4 address */
232 if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
233 dprintk("%s: Only ipv4 addresses supported\n", __func__);
234 goto out_free;
235 }
236
237 /* port */
238 pstr = ipend;
239 sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
240 port = htons((tmp[0] << 8) | (tmp[1]));
241
242 ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
243 dprintk("%s Decoded address and port %s\n", __func__, buf);
244out_free:
245 kfree(buf);
246out_err:
247 return ds;
248}
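
The r_addr parsed above is the universal address form "h1.h2.h3.h4.p1.p2", where the last two dot-separated fields are the high and low bytes of the port. A worked example (values illustrative; a hardened version would also check the kmalloc() return before writing into buf):

	/* r_addr = "10.1.2.3.8.1"
	 *   after the dot-to-dash rewrite: "10.1.2.3-8-1"
	 *   in4_pton() parses "10.1.2.3"  -> ip_addr; ipend points at "-8-1"
	 *   sscanf(ipend, "-%d-%d", ...)  -> tmp[0] = 8, tmp[1] = 1
	 *   port = htons(8 << 8 | 1)      -> 2049, the standard NFS port
	 */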
249
250/* Decode opaque device data and return the result */
251static struct nfs4_file_layout_dsaddr*
252decode_device(struct inode *ino, struct pnfs_device *pdev)
253{
254 int i, dummy;
255 u32 cnt, num;
256 u8 *indexp;
257 __be32 *p = (__be32 *)pdev->area, *indicesp;
258 struct nfs4_file_layout_dsaddr *dsaddr;
259
260 /* Get the stripe count (number of stripe indices) */
261 cnt = be32_to_cpup(p++);
262 dprintk("%s stripe count %d\n", __func__, cnt);
263 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
264 printk(KERN_WARNING "%s: stripe count %d greater than "
265 "supported maximum %d\n", __func__,
266 cnt, NFS4_PNFS_MAX_STRIPE_CNT);
267 goto out_err;
268 }
269
270 /* Check the multipath list count */
271 indicesp = p;
272 p += XDR_QUADLEN(cnt << 2);
273 num = be32_to_cpup(p++);
274 dprintk("%s ds_num %u\n", __func__, num);
275 if (num > NFS4_PNFS_MAX_MULTI_CNT) {
276 printk(KERN_WARNING "%s: multipath count %d greater than "
277 "supported maximum %d\n", __func__,
278 num, NFS4_PNFS_MAX_MULTI_CNT);
279 goto out_err;
280 }
281 dsaddr = kzalloc(sizeof(*dsaddr) +
282 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
283 GFP_KERNEL);
284 if (!dsaddr)
285 goto out_err;
286
287 dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
288 if (!dsaddr->stripe_indices)
289 goto out_err_free;
290
291 dsaddr->stripe_count = cnt;
292 dsaddr->ds_num = num;
293
294 memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
295
296 /* Go back and read the stripe indices */
297 p = indicesp;
298 indexp = &dsaddr->stripe_indices[0];
299 for (i = 0; i < dsaddr->stripe_count; i++) {
300 *indexp = be32_to_cpup(p++);
301 if (*indexp >= num)
302 goto out_err_free;
303 indexp++;
304 }
305 /* Skip already read multipath list count */
306 p++;
307
308 for (i = 0; i < dsaddr->ds_num; i++) {
309 int j;
310
311 dummy = be32_to_cpup(p++); /* multipath count */
312 if (dummy > 1) {
313 printk(KERN_WARNING
314 "%s: Multipath count %d not supported, "
315 "skipping all greater than 1\n", __func__,
316 dummy);
317 }
318 for (j = 0; j < dummy; j++) {
319 if (j == 0) {
320 dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
321 if (dsaddr->ds_list[i] == NULL)
322 goto out_err_free;
323 } else {
324 u32 len;
325 /* skip extra multipath */
326 len = be32_to_cpup(p++);
327 p += XDR_QUADLEN(len);
328 len = be32_to_cpup(p++);
329 p += XDR_QUADLEN(len);
330 continue;
331 }
332 }
333 }
334 return dsaddr;
335
336out_err_free:
337 nfs4_fl_free_deviceid(dsaddr);
338out_err:
339 dprintk("%s ERROR: returning NULL\n", __func__);
340 return NULL;
341}
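
For reference, the opaque device body that decode_device() walks looks like this, assuming the nfsv4_1_file_layout_ds_addr4 encoding from RFC 5661:

	/* uint32  stripe index count (cnt)
	 * uint32  stripe_indices[cnt]        each an index into the DS list
	 * uint32  multipath DS list count (num)
	 * for each of the num data servers:
	 *     uint32  multipath address count  (only the first entry is used)
	 *     opaque  r_netid<>                ("tcp")
	 *     opaque  r_addr<>                 ("h1.h2.h3.h4.p1.p2")
	 */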
342
343/*
344 * Decode the opaque device specified in 'dev'
345 * and add it to the list of available devices.
346 * If the deviceid is already cached, nfs4_add_deviceid will return
347 * a pointer to the cached struct and throw away the new one.
348 */
349static struct nfs4_file_layout_dsaddr*
350decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
351{
352 struct nfs4_file_layout_dsaddr *dsaddr;
353 struct pnfs_deviceid_node *d;
354
355 dsaddr = decode_device(inode, dev);
356 if (!dsaddr) {
357 printk(KERN_WARNING "%s: Could not decode or add device\n",
358 __func__);
359 return NULL;
360 }
361
362 d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
363 &dsaddr->deviceid);
364
365 return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
366}
367
368/*
369 * Retrieve the information for dev_id, add it to the list
370 * of available devices, and return it.
371 */
372struct nfs4_file_layout_dsaddr *
373get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
374{
375 struct pnfs_device *pdev = NULL;
376 u32 max_resp_sz;
377 int max_pages;
378 struct page **pages = NULL;
379 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
380 int rc, i;
381 struct nfs_server *server = NFS_SERVER(inode);
382
383 /*
384 * Use the session max response size as the basis for setting
385 * GETDEVICEINFO's maxcount
386 */
387 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
388 max_pages = max_resp_sz >> PAGE_SHIFT;
389 dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
390 __func__, inode, max_resp_sz, max_pages);
391
392 pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
393 if (pdev == NULL)
394 return NULL;
395
396 pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
397 if (pages == NULL) {
398 kfree(pdev);
399 return NULL;
400 }
401 for (i = 0; i < max_pages; i++) {
402 pages[i] = alloc_page(GFP_KERNEL);
403 if (!pages[i])
404 goto out_free;
405 }
406
407 /* set pdev->area */
408 pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
409 if (!pdev->area)
410 goto out_free;
411
412 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
413 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
414 pdev->pages = pages;
415 pdev->pgbase = 0;
416 pdev->pglen = PAGE_SIZE * max_pages;
417 pdev->mincount = 0;
418
419 rc = nfs4_proc_getdeviceinfo(server, pdev);
420 dprintk("%s getdevice info returns %d\n", __func__, rc);
421 if (rc)
422 goto out_free;
423
424 /*
425 * Found new device, need to decode it and then add it to the
426 * list of known devices for this mountpoint.
427 */
428 dsaddr = decode_and_add_device(inode, pdev);
429out_free:
430 if (pdev->area != NULL)
431 vunmap(pdev->area);
432 for (i = 0; i < max_pages; i++)
433 __free_page(pages[i]);
434 kfree(pages);
435 kfree(pdev);
436 dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
437 return dsaddr;
438}
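
The reply buffer is sized from the session's negotiated fore-channel maximum response size and then vmap()ed so the decoder can treat the page array as one contiguous region. For example, with a 1 MiB max_resp_sz and 4 KiB pages:

	max_pages = max_resp_sz >> PAGE_SHIFT;	/* 1048576 >> 12 = 256 pages */
	pdev->pglen = PAGE_SIZE * max_pages;	/* 1 MiB of reply space */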
439
440struct nfs4_file_layout_dsaddr *
441nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
442{
443 struct pnfs_deviceid_node *d;
444
445 d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
446 return (d == NULL) ? NULL :
447 container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
448}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 089da5b5d20a..32c8758c99fd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -55,6 +55,7 @@
55#include "internal.h" 55#include "internal.h"
56#include "iostat.h" 56#include "iostat.h"
57#include "callback.h" 57#include "callback.h"
58#include "pnfs.h"
58 59
59#define NFSDBG_FACILITY NFSDBG_PROC 60#define NFSDBG_FACILITY NFSDBG_PROC
60 61
@@ -129,7 +130,8 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
129 | FATTR4_WORD0_MAXREAD 130 | FATTR4_WORD0_MAXREAD
130 | FATTR4_WORD0_MAXWRITE 131 | FATTR4_WORD0_MAXWRITE
131 | FATTR4_WORD0_LEASE_TIME, 132 | FATTR4_WORD0_LEASE_TIME,
132 0 133 FATTR4_WORD1_TIME_DELTA
134 | FATTR4_WORD1_FS_LAYOUT_TYPES
133}; 135};
134 136
135const u32 nfs4_fs_locations_bitmap[2] = { 137const u32 nfs4_fs_locations_bitmap[2] = {
@@ -255,9 +257,6 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
255 nfs4_state_mark_reclaim_nograce(clp, state); 257 nfs4_state_mark_reclaim_nograce(clp, state);
256 goto do_state_recovery; 258 goto do_state_recovery;
257 case -NFS4ERR_STALE_STATEID: 259 case -NFS4ERR_STALE_STATEID:
258 if (state == NULL)
259 break;
260 nfs4_state_mark_reclaim_reboot(clp, state);
261 case -NFS4ERR_STALE_CLIENTID: 260 case -NFS4ERR_STALE_CLIENTID:
262 case -NFS4ERR_EXPIRED: 261 case -NFS4ERR_EXPIRED:
263 goto do_state_recovery; 262 goto do_state_recovery;
@@ -334,10 +333,12 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
334 * Must be called while holding tbl->slot_tbl_lock 333 * Must be called while holding tbl->slot_tbl_lock
335 */ 334 */
336static void 335static void
337nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) 336nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
338{ 337{
338 int free_slotid = free_slot - tbl->slots;
339 int slotid = free_slotid; 339 int slotid = free_slotid;
340 340
341 BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
341 /* clear used bit in bitmap */ 342 /* clear used bit in bitmap */
342 __clear_bit(slotid, tbl->used_slots); 343 __clear_bit(slotid, tbl->used_slots);
343 344
@@ -379,7 +380,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
379 struct nfs4_slot_table *tbl; 380 struct nfs4_slot_table *tbl;
380 381
381 tbl = &res->sr_session->fc_slot_table; 382 tbl = &res->sr_session->fc_slot_table;
382 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { 383 if (!res->sr_slot) {
383 /* just wake up the next guy waiting since 384 /* just wake up the next guy waiting since
384 * we may have not consumed a slot after all */ 385 * we may have not consumed a slot after all */
385 dprintk("%s: No slot\n", __func__); 386 dprintk("%s: No slot\n", __func__);
@@ -387,17 +388,15 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
387 } 388 }
388 389
389 spin_lock(&tbl->slot_tbl_lock); 390 spin_lock(&tbl->slot_tbl_lock);
390 nfs4_free_slot(tbl, res->sr_slotid); 391 nfs4_free_slot(tbl, res->sr_slot);
391 nfs41_check_drain_session_complete(res->sr_session); 392 nfs41_check_drain_session_complete(res->sr_session);
392 spin_unlock(&tbl->slot_tbl_lock); 393 spin_unlock(&tbl->slot_tbl_lock);
393 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 394 res->sr_slot = NULL;
394} 395}
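
These hunks convert struct nfs4_sequence_res from carrying a u8 slot index (with NFS4_MAX_SLOT_TABLE as the "no slot" sentinel) to carrying a struct nfs4_slot pointer (with NULL as the sentinel). The two representations interconvert with plain pointer arithmetic, which is what nfs4_free_slot() above now relies on:

	/* index -> pointer */	slot = tbl->slots + slotid;
	/* pointer -> index */	slotid = slot - tbl->slots;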
395 396
396static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) 397static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
397{ 398{
398 unsigned long timestamp; 399 unsigned long timestamp;
399 struct nfs4_slot_table *tbl;
400 struct nfs4_slot *slot;
401 struct nfs_client *clp; 400 struct nfs_client *clp;
402 401
403 /* 402 /*
@@ -410,17 +409,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
410 res->sr_status = NFS_OK; 409 res->sr_status = NFS_OK;
411 410
412 /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */ 411 /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */
413 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) 412 if (!res->sr_slot)
414 goto out; 413 goto out;
415 414
416 tbl = &res->sr_session->fc_slot_table;
417 slot = tbl->slots + res->sr_slotid;
418
419 /* Check the SEQUENCE operation status */ 415 /* Check the SEQUENCE operation status */
420 switch (res->sr_status) { 416 switch (res->sr_status) {
421 case 0: 417 case 0:
422 /* Update the slot's sequence and clientid lease timer */ 418 /* Update the slot's sequence and clientid lease timer */
423 ++slot->seq_nr; 419 ++res->sr_slot->seq_nr;
424 timestamp = res->sr_renewal_time; 420 timestamp = res->sr_renewal_time;
425 clp = res->sr_session->clp; 421 clp = res->sr_session->clp;
426 do_renew_lease(clp, timestamp); 422 do_renew_lease(clp, timestamp);
@@ -433,12 +429,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
433 * returned NFS4ERR_DELAY as per Section 2.10.6.2 429 * returned NFS4ERR_DELAY as per Section 2.10.6.2
434 * of RFC5661. 430 * of RFC5661.
435 */ 431 */
436 dprintk("%s: slot=%d seq=%d: Operation in progress\n", 432 dprintk("%s: slot=%ld seq=%d: Operation in progress\n",
437 __func__, res->sr_slotid, slot->seq_nr); 433 __func__,
434 res->sr_slot - res->sr_session->fc_slot_table.slots,
435 res->sr_slot->seq_nr);
438 goto out_retry; 436 goto out_retry;
439 default: 437 default:
440 /* Just update the slot sequence no. */ 438 /* Just update the slot sequence no. */
441 ++slot->seq_nr; 439 ++res->sr_slot->seq_nr;
442 } 440 }
443out: 441out:
444 /* The session may be reset by one of the error handlers. */ 442 /* The session may be reset by one of the error handlers. */
@@ -505,10 +503,9 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
505 503
506 dprintk("--> %s\n", __func__); 504 dprintk("--> %s\n", __func__);
507 /* slot already allocated? */ 505 /* slot already allocated? */
508 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) 506 if (res->sr_slot != NULL)
509 return 0; 507 return 0;
510 508
511 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
512 tbl = &session->fc_slot_table; 509 tbl = &session->fc_slot_table;
513 510
514 spin_lock(&tbl->slot_tbl_lock); 511 spin_lock(&tbl->slot_tbl_lock);
@@ -550,7 +547,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
550 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); 547 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
551 548
552 res->sr_session = session; 549 res->sr_session = session;
553 res->sr_slotid = slotid; 550 res->sr_slot = slot;
554 res->sr_renewal_time = jiffies; 551 res->sr_renewal_time = jiffies;
555 res->sr_status_flags = 0; 552 res->sr_status_flags = 0;
556 /* 553 /*
@@ -576,8 +573,9 @@ int nfs4_setup_sequence(const struct nfs_server *server,
576 goto out; 573 goto out;
577 } 574 }
578 575
579 dprintk("--> %s clp %p session %p sr_slotid %d\n", 576 dprintk("--> %s clp %p session %p sr_slot %ld\n",
580 __func__, session->clp, session, res->sr_slotid); 577 __func__, session->clp, session, res->sr_slot ?
578 res->sr_slot - session->fc_slot_table.slots : -1);
581 579
582 ret = nfs41_setup_sequence(session, args, res, cache_reply, 580 ret = nfs41_setup_sequence(session, args, res, cache_reply,
583 task); 581 task);
@@ -650,7 +648,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
650 .callback_data = &data 648 .callback_data = &data
651 }; 649 };
652 650
653 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 651 res->sr_slot = NULL;
654 if (privileged) 652 if (privileged)
655 task_setup.callback_ops = &nfs41_call_priv_sync_ops; 653 task_setup.callback_ops = &nfs41_call_priv_sync_ops;
656 task = rpc_run_task(&task_setup); 654 task = rpc_run_task(&task_setup);
@@ -735,7 +733,6 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
735 p->o_res.server = p->o_arg.server; 733 p->o_res.server = p->o_arg.server;
736 nfs_fattr_init(&p->f_attr); 734 nfs_fattr_init(&p->f_attr);
737 nfs_fattr_init(&p->dir_attr); 735 nfs_fattr_init(&p->dir_attr);
738 p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
739} 736}
740 737
741static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 738static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
@@ -1120,6 +1117,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1120 clear_bit(NFS_DELEGATED_STATE, &state->flags); 1117 clear_bit(NFS_DELEGATED_STATE, &state->flags);
1121 smp_rmb(); 1118 smp_rmb();
1122 if (state->n_rdwr != 0) { 1119 if (state->n_rdwr != 0) {
1120 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1123 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); 1121 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
1124 if (ret != 0) 1122 if (ret != 0)
1125 return ret; 1123 return ret;
@@ -1127,6 +1125,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1127 return -ESTALE; 1125 return -ESTALE;
1128 } 1126 }
1129 if (state->n_wronly != 0) { 1127 if (state->n_wronly != 0) {
1128 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1130 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); 1129 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
1131 if (ret != 0) 1130 if (ret != 0)
1132 return ret; 1131 return ret;
@@ -1134,6 +1133,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1134 return -ESTALE; 1133 return -ESTALE;
1135 } 1134 }
1136 if (state->n_rdonly != 0) { 1135 if (state->n_rdonly != 0) {
1136 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1137 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); 1137 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
1138 if (ret != 0) 1138 if (ret != 0)
1139 return ret; 1139 return ret;
@@ -1188,7 +1188,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
1188 int err; 1188 int err;
1189 do { 1189 do {
1190 err = _nfs4_do_open_reclaim(ctx, state); 1190 err = _nfs4_do_open_reclaim(ctx, state);
1191 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) 1191 if (err != -NFS4ERR_DELAY)
1192 break; 1192 break;
1193 nfs4_handle_exception(server, err, &exception); 1193 nfs4_handle_exception(server, err, &exception);
1194 } while (exception.retry); 1194 } while (exception.retry);
@@ -1258,6 +1258,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1258 case -NFS4ERR_ADMIN_REVOKED: 1258 case -NFS4ERR_ADMIN_REVOKED:
1259 case -NFS4ERR_BAD_STATEID: 1259 case -NFS4ERR_BAD_STATEID:
1260 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 1260 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
1261 case -EKEYEXPIRED:
1262 /*
1263 * User RPCSEC_GSS context has expired.
1264 * We cannot recover this stateid now, so
1265 * skip it and allow recovery thread to
1266 * proceed.
1267 */
1261 case -ENOMEM: 1268 case -ENOMEM:
1262 err = 0; 1269 err = 0;
1263 goto out; 1270 goto out;
@@ -1605,7 +1612,6 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
1605 goto out; 1612 goto out;
1606 case -NFS4ERR_GRACE: 1613 case -NFS4ERR_GRACE:
1607 case -NFS4ERR_DELAY: 1614 case -NFS4ERR_DELAY:
1608 case -EKEYEXPIRED:
1609 nfs4_handle_exception(server, err, &exception); 1615 nfs4_handle_exception(server, err, &exception);
1610 err = 0; 1616 err = 0;
1611 } 1617 }
@@ -1975,7 +1981,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
1975 calldata->res.fattr = &calldata->fattr; 1981 calldata->res.fattr = &calldata->fattr;
1976 calldata->res.seqid = calldata->arg.seqid; 1982 calldata->res.seqid = calldata->arg.seqid;
1977 calldata->res.server = server; 1983 calldata->res.server = server;
1978 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
1979 path_get(path); 1984 path_get(path);
1980 calldata->path = *path; 1985 calldata->path = *path;
1981 1986
@@ -1998,120 +2003,17 @@ out:
1998 return status; 2003 return status;
1999} 2004}
2000 2005
2001static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode) 2006static struct inode *
2007nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
2002{ 2008{
2003 struct file *filp;
2004 int ret;
2005
2006 /* If the open_intent is for execute, we have an extra check to make */
2007 if (fmode & FMODE_EXEC) {
2008 ret = nfs_may_open(state->inode,
2009 state->owner->so_cred,
2010 nd->intent.open.flags);
2011 if (ret < 0)
2012 goto out_close;
2013 }
2014 filp = lookup_instantiate_filp(nd, path->dentry, NULL);
2015 if (!IS_ERR(filp)) {
2016 struct nfs_open_context *ctx;
2017 ctx = nfs_file_open_context(filp);
2018 ctx->state = state;
2019 return 0;
2020 }
2021 ret = PTR_ERR(filp);
2022out_close:
2023 nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
2024 return ret;
2025}
2026
2027struct dentry *
2028nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2029{
2030 struct path path = {
2031 .mnt = nd->path.mnt,
2032 .dentry = dentry,
2033 };
2034 struct dentry *parent;
2035 struct iattr attr;
2036 struct rpc_cred *cred;
2037 struct nfs4_state *state; 2009 struct nfs4_state *state;
2038 struct dentry *res;
2039 int open_flags = nd->intent.open.flags;
2040 fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
2041
2042 if (nd->flags & LOOKUP_CREATE) {
2043 attr.ia_mode = nd->intent.open.create_mode;
2044 attr.ia_valid = ATTR_MODE;
2045 if (!IS_POSIXACL(dir))
2046 attr.ia_mode &= ~current_umask();
2047 } else {
2048 open_flags &= ~O_EXCL;
2049 attr.ia_valid = 0;
2050 BUG_ON(open_flags & O_CREAT);
2051 }
2052 2010
2053 cred = rpc_lookup_cred();
2054 if (IS_ERR(cred))
2055 return (struct dentry *)cred;
2056 parent = dentry->d_parent;
2057 /* Protect against concurrent sillydeletes */ 2011 /* Protect against concurrent sillydeletes */
2058 nfs_block_sillyrename(parent); 2012 state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred);
2059 state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred); 2013 if (IS_ERR(state))
2060 put_rpccred(cred); 2014 return ERR_CAST(state);
2061 if (IS_ERR(state)) { 2015 ctx->state = state;
2062 if (PTR_ERR(state) == -ENOENT) { 2016 return igrab(state->inode);
2063 d_add(dentry, NULL);
2064 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2065 }
2066 nfs_unblock_sillyrename(parent);
2067 return (struct dentry *)state;
2068 }
2069 res = d_add_unique(dentry, igrab(state->inode));
2070 if (res != NULL)
2071 path.dentry = res;
2072 nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
2073 nfs_unblock_sillyrename(parent);
2074 nfs4_intent_set_file(nd, &path, state, fmode);
2075 return res;
2076}
2077
2078int
2079nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd)
2080{
2081 struct path path = {
2082 .mnt = nd->path.mnt,
2083 .dentry = dentry,
2084 };
2085 struct rpc_cred *cred;
2086 struct nfs4_state *state;
2087 fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
2088
2089 cred = rpc_lookup_cred();
2090 if (IS_ERR(cred))
2091 return PTR_ERR(cred);
2092 state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
2093 put_rpccred(cred);
2094 if (IS_ERR(state)) {
2095 switch (PTR_ERR(state)) {
2096 case -EPERM:
2097 case -EACCES:
2098 case -EDQUOT:
2099 case -ENOSPC:
2100 case -EROFS:
2101 return PTR_ERR(state);
2102 default:
2103 goto out_drop;
2104 }
2105 }
2106 if (state->inode == dentry->d_inode) {
2107 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2108 nfs4_intent_set_file(nd, &path, state, fmode);
2109 return 1;
2110 }
2111 nfs4_close_sync(&path, state, fmode);
2112out_drop:
2113 d_drop(dentry);
2114 return 0;
2115} 2017}
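
nfs4_atomic_open() now works purely in terms of an nfs_open_context: the dentry handling, credential lookup, and verifier updates move to the caller in fs/nfs/dir.c, which is outside this excerpt. A hypothetical caller, just to show the new contract (the setup helper named here is an assumption, not from this patch):

	struct nfs_open_context *ctx;
	struct inode *inode;

	ctx = create_nfs_open_context(dentry, cred, fmode);	/* assumed helper */
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);
	inode = nfs4_atomic_open(dir, ctx, open_flags, &attr);
	if (IS_ERR(inode))
		return PTR_ERR(inode);	/* ENOENT etc. handled by the caller */
	d_add(dentry, inode);		/* caller instantiates the dentry */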
2116 2018
2117static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) 2019static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2568,36 +2470,34 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page,
2568 2470
2569static int 2471static int
2570nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 2472nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2571 int flags, struct nameidata *nd) 2473 int flags, struct nfs_open_context *ctx)
2572{ 2474{
2573 struct path path = { 2475 struct path my_path = {
2574 .mnt = nd->path.mnt,
2575 .dentry = dentry, 2476 .dentry = dentry,
2576 }; 2477 };
2478 struct path *path = &my_path;
2577 struct nfs4_state *state; 2479 struct nfs4_state *state;
2578 struct rpc_cred *cred; 2480 struct rpc_cred *cred = NULL;
2579 fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE); 2481 fmode_t fmode = 0;
2580 int status = 0; 2482 int status = 0;
2581 2483
2582 cred = rpc_lookup_cred(); 2484 if (ctx != NULL) {
2583 if (IS_ERR(cred)) { 2485 cred = ctx->cred;
2584 status = PTR_ERR(cred); 2486 path = &ctx->path;
2585 goto out; 2487 fmode = ctx->mode;
2586 } 2488 }
2587 state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred); 2489 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
2588 d_drop(dentry); 2490 d_drop(dentry);
2589 if (IS_ERR(state)) { 2491 if (IS_ERR(state)) {
2590 status = PTR_ERR(state); 2492 status = PTR_ERR(state);
2591 goto out_putcred; 2493 goto out;
2592 } 2494 }
2593 d_add(dentry, igrab(state->inode)); 2495 d_add(dentry, igrab(state->inode));
2594 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 2496 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2595 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) 2497 if (ctx != NULL)
2596 status = nfs4_intent_set_file(nd, &path, state, fmode); 2498 ctx->state = state;
2597 else 2499 else
2598 nfs4_close_sync(&path, state, fmode); 2500 nfs4_close_sync(path, state, fmode);
2599out_putcred:
2600 put_rpccred(cred);
2601out: 2501out:
2602 return status; 2502 return status;
2603} 2503}
@@ -2655,6 +2555,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
2655 2555
2656 args->bitmask = server->cache_consistency_bitmask; 2556 args->bitmask = server->cache_consistency_bitmask;
2657 res->server = server; 2557 res->server = server;
2558 res->seq_res.sr_slot = NULL;
2658 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 2559 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
2659} 2560}
2660 2561
@@ -2671,18 +2572,46 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2671 return 1; 2572 return 1;
2672} 2573}
2673 2574
2575static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
2576{
2577 struct nfs_server *server = NFS_SERVER(dir);
2578 struct nfs_renameargs *arg = msg->rpc_argp;
2579 struct nfs_renameres *res = msg->rpc_resp;
2580
2581 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
2582 arg->bitmask = server->attr_bitmask;
2583 res->server = server;
2584}
2585
2586static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
2587 struct inode *new_dir)
2588{
2589 struct nfs_renameres *res = task->tk_msg.rpc_resp;
2590
2591 if (!nfs4_sequence_done(task, &res->seq_res))
2592 return 0;
2593 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2594 return 0;
2595
2596 update_changeattr(old_dir, &res->old_cinfo);
2597 nfs_post_op_update_inode(old_dir, res->old_fattr);
2598 update_changeattr(new_dir, &res->new_cinfo);
2599 nfs_post_op_update_inode(new_dir, res->new_fattr);
2600 return 1;
2601}
2602
2674static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, 2603static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2675 struct inode *new_dir, struct qstr *new_name) 2604 struct inode *new_dir, struct qstr *new_name)
2676{ 2605{
2677 struct nfs_server *server = NFS_SERVER(old_dir); 2606 struct nfs_server *server = NFS_SERVER(old_dir);
2678 struct nfs4_rename_arg arg = { 2607 struct nfs_renameargs arg = {
2679 .old_dir = NFS_FH(old_dir), 2608 .old_dir = NFS_FH(old_dir),
2680 .new_dir = NFS_FH(new_dir), 2609 .new_dir = NFS_FH(new_dir),
2681 .old_name = old_name, 2610 .old_name = old_name,
2682 .new_name = new_name, 2611 .new_name = new_name,
2683 .bitmask = server->attr_bitmask, 2612 .bitmask = server->attr_bitmask,
2684 }; 2613 };
2685 struct nfs4_rename_res res = { 2614 struct nfs_renameres res = {
2686 .server = server, 2615 .server = server,
2687 }; 2616 };
2688 struct rpc_message msg = { 2617 struct rpc_message msg = {
@@ -2896,15 +2825,16 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
2896} 2825}
2897 2826
2898static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 2827static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2899 u64 cookie, struct page *page, unsigned int count, int plus) 2828 u64 cookie, struct page **pages, unsigned int count, int plus)
2900{ 2829{
2901 struct inode *dir = dentry->d_inode; 2830 struct inode *dir = dentry->d_inode;
2902 struct nfs4_readdir_arg args = { 2831 struct nfs4_readdir_arg args = {
2903 .fh = NFS_FH(dir), 2832 .fh = NFS_FH(dir),
2904 .pages = &page, 2833 .pages = pages,
2905 .pgbase = 0, 2834 .pgbase = 0,
2906 .count = count, 2835 .count = count,
2907 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, 2836 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
2837 .plus = plus,
2908 }; 2838 };
2909 struct nfs4_readdir_res res; 2839 struct nfs4_readdir_res res;
2910 struct rpc_message msg = { 2840 struct rpc_message msg = {
@@ -2932,14 +2862,14 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2932} 2862}
2933 2863
2934static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 2864static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2935 u64 cookie, struct page *page, unsigned int count, int plus) 2865 u64 cookie, struct page **pages, unsigned int count, int plus)
2936{ 2866{
2937 struct nfs4_exception exception = { }; 2867 struct nfs4_exception exception = { };
2938 int err; 2868 int err;
2939 do { 2869 do {
2940 err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), 2870 err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode),
2941 _nfs4_proc_readdir(dentry, cred, cookie, 2871 _nfs4_proc_readdir(dentry, cred, cookie,
2942 page, count, plus), 2872 pages, count, plus),
2943 &exception); 2873 &exception);
2944 } while (exception.retry); 2874 } while (exception.retry);
2945 return err; 2875 return err;
@@ -3490,9 +3420,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3490 nfs4_state_mark_reclaim_nograce(clp, state); 3420 nfs4_state_mark_reclaim_nograce(clp, state);
3491 goto do_state_recovery; 3421 goto do_state_recovery;
3492 case -NFS4ERR_STALE_STATEID: 3422 case -NFS4ERR_STALE_STATEID:
3493 if (state == NULL)
3494 break;
3495 nfs4_state_mark_reclaim_reboot(clp, state);
3496 case -NFS4ERR_STALE_CLIENTID: 3423 case -NFS4ERR_STALE_CLIENTID:
3497 case -NFS4ERR_EXPIRED: 3424 case -NFS4ERR_EXPIRED:
3498 goto do_state_recovery; 3425 goto do_state_recovery;
@@ -3626,7 +3553,6 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3626 case -NFS4ERR_RESOURCE: 3553 case -NFS4ERR_RESOURCE:
3627 /* The IBM lawyers misread another document! */ 3554 /* The IBM lawyers misread another document! */
3628 case -NFS4ERR_DELAY: 3555 case -NFS4ERR_DELAY:
3629 case -EKEYEXPIRED:
3630 err = nfs4_delay(clp->cl_rpcclient, &timeout); 3556 err = nfs4_delay(clp->cl_rpcclient, &timeout);
3631 } 3557 }
3632 } while (err == 0); 3558 } while (err == 0);
@@ -3721,7 +3647,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3721 memcpy(&data->stateid, stateid, sizeof(data->stateid)); 3647 memcpy(&data->stateid, stateid, sizeof(data->stateid));
3722 data->res.fattr = &data->fattr; 3648 data->res.fattr = &data->fattr;
3723 data->res.server = server; 3649 data->res.server = server;
3724 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3725 nfs_fattr_init(data->res.fattr); 3650 nfs_fattr_init(data->res.fattr);
3726 data->timestamp = jiffies; 3651 data->timestamp = jiffies;
3727 data->rpc_status = 0; 3652 data->rpc_status = 0;
@@ -3874,7 +3799,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
3874 p->arg.fl = &p->fl; 3799 p->arg.fl = &p->fl;
3875 p->arg.seqid = seqid; 3800 p->arg.seqid = seqid;
3876 p->res.seqid = seqid; 3801 p->res.seqid = seqid;
3877 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
3878 p->arg.stateid = &lsp->ls_stateid; 3802 p->arg.stateid = &lsp->ls_stateid;
3879 p->lsp = lsp; 3803 p->lsp = lsp;
3880 atomic_inc(&lsp->ls_count); 3804 atomic_inc(&lsp->ls_count);
@@ -4054,7 +3978,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
4054 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 3978 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
4055 p->arg.lock_owner.id = lsp->ls_id.id; 3979 p->arg.lock_owner.id = lsp->ls_id.id;
4056 p->res.lock_seqid = p->arg.lock_seqid; 3980 p->res.lock_seqid = p->arg.lock_seqid;
4057 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
4058 p->lsp = lsp; 3981 p->lsp = lsp;
4059 p->server = server; 3982 p->server = server;
4060 atomic_inc(&lsp->ls_count); 3983 atomic_inc(&lsp->ls_count);
@@ -4241,7 +4164,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
4241 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) 4164 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
4242 return 0; 4165 return 0;
4243 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); 4166 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
4244 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) 4167 if (err != -NFS4ERR_DELAY)
4245 break; 4168 break;
4246 nfs4_handle_exception(server, err, &exception); 4169 nfs4_handle_exception(server, err, &exception);
4247 } while (exception.retry); 4170 } while (exception.retry);
@@ -4266,7 +4189,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
4266 goto out; 4189 goto out;
4267 case -NFS4ERR_GRACE: 4190 case -NFS4ERR_GRACE:
4268 case -NFS4ERR_DELAY: 4191 case -NFS4ERR_DELAY:
4269 case -EKEYEXPIRED:
4270 nfs4_handle_exception(server, err, &exception); 4192 nfs4_handle_exception(server, err, &exception);
4271 err = 0; 4193 err = 0;
4272 } 4194 }
@@ -4412,13 +4334,21 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4412 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 4334 nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
4413 err = 0; 4335 err = 0;
4414 goto out; 4336 goto out;
4337 case -EKEYEXPIRED:
4338 /*
4339 * User RPCSEC_GSS context has expired.
4340 * We cannot recover this stateid now, so
4341 * skip it and allow recovery thread to
4342 * proceed.
4343 */
4344 err = 0;
4345 goto out;
4415 case -ENOMEM: 4346 case -ENOMEM:
4416 case -NFS4ERR_DENIED: 4347 case -NFS4ERR_DENIED:
4417 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 4348 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
4418 err = 0; 4349 err = 0;
4419 goto out; 4350 goto out;
4420 case -NFS4ERR_DELAY: 4351 case -NFS4ERR_DELAY:
4421 case -EKEYEXPIRED:
4422 break; 4352 break;
4423 } 4353 }
4424 err = nfs4_handle_exception(server, err, &exception); 4354 err = nfs4_handle_exception(server, err, &exception);
@@ -4647,7 +4577,6 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4647 switch (task->tk_status) { 4577 switch (task->tk_status) {
4648 case -NFS4ERR_DELAY: 4578 case -NFS4ERR_DELAY:
4649 case -NFS4ERR_GRACE: 4579 case -NFS4ERR_GRACE:
4650 case -EKEYEXPIRED:
4651 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); 4580 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
4652 rpc_delay(task, NFS4_POLL_RETRY_MIN); 4581 rpc_delay(task, NFS4_POLL_RETRY_MIN);
4653 task->tk_status = 0; 4582 task->tk_status = 0;
@@ -4687,7 +4616,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
4687 }; 4616 };
4688 int status; 4617 int status;
4689 4618
4690 res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
4691 dprintk("--> %s\n", __func__); 4619 dprintk("--> %s\n", __func__);
4692 task = rpc_run_task(&task_setup); 4620 task = rpc_run_task(&task_setup);
4693 4621
@@ -4914,49 +4842,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
4914 args->bc_attrs.max_reqs); 4842 args->bc_attrs.max_reqs);
4915} 4843}
4916 4844
4917static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd) 4845static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
4918{ 4846{
4919 if (rcvd <= sent) 4847 struct nfs4_channel_attrs *sent = &args->fc_attrs;
4920 return 0; 4848 struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
4921 printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. " 4849
4922 "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd); 4850 if (rcvd->headerpadsz > sent->headerpadsz)
4923 return -EINVAL; 4851 return -EINVAL;
4852 if (rcvd->max_resp_sz > sent->max_resp_sz)
4853 return -EINVAL;
4854 /*
4855 * Our requested max_ops is the minimum we need; we're not
4856 * prepared to break up compounds into smaller pieces than that.
4857 * So, no point even trying to continue if the server won't
4858 * cooperate:
4859 */
4860 if (rcvd->max_ops < sent->max_ops)
4861 return -EINVAL;
4862 if (rcvd->max_reqs == 0)
4863 return -EINVAL;
4864 return 0;
4924} 4865}
4925 4866
4926#define _verify_fore_channel_attr(_name_) \ 4867static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
4927 _verify_channel_attr("fore", #_name_, \ 4868{
4928 args->fc_attrs._name_, \ 4869 struct nfs4_channel_attrs *sent = &args->bc_attrs;
4929 session->fc_attrs._name_) 4870 struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
4930 4871
4931#define _verify_back_channel_attr(_name_) \ 4872 if (rcvd->max_rqst_sz > sent->max_rqst_sz)
4932 _verify_channel_attr("back", #_name_, \ 4873 return -EINVAL;
4933 args->bc_attrs._name_, \ 4874 if (rcvd->max_resp_sz < sent->max_resp_sz)
4934 session->bc_attrs._name_) 4875 return -EINVAL;
4876 if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
4877 return -EINVAL;
4878 /* These would render the backchannel useless: */
4879 if (rcvd->max_ops == 0)
4880 return -EINVAL;
4881 if (rcvd->max_reqs == 0)
4882 return -EINVAL;
4883 return 0;
4884}
4935 4885
4936/*
4937 * The server is not allowed to increase the fore channel header pad size,
4938 * maximum response size, or maximum number of operations.
4939 *
4940 * The back channel attributes are only negotiated down: we send what the
4941 * (back channel) server insists upon.
4942 */
4943static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, 4886static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
4944 struct nfs4_session *session) 4887 struct nfs4_session *session)
4945{ 4888{
4946 int ret = 0; 4889 int ret;
4947
4948 ret |= _verify_fore_channel_attr(headerpadsz);
4949 ret |= _verify_fore_channel_attr(max_resp_sz);
4950 ret |= _verify_fore_channel_attr(max_ops);
4951
4952 ret |= _verify_back_channel_attr(headerpadsz);
4953 ret |= _verify_back_channel_attr(max_rqst_sz);
4954 ret |= _verify_back_channel_attr(max_resp_sz);
4955 ret |= _verify_back_channel_attr(max_resp_sz_cached);
4956 ret |= _verify_back_channel_attr(max_ops);
4957 ret |= _verify_back_channel_attr(max_reqs);
4958 4890
4959 return ret; 4891 ret = nfs4_verify_fore_channel_attrs(args, session);
4892 if (ret)
4893 return ret;
4894 return nfs4_verify_back_channel_attrs(args, session);
4960} 4895}
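
Compared with the old macro-generated checks, the rules are now explicit per channel: the fore channel rejects the session only when the server tries to increase headerpadsz or max_resp_sz, lowers max_ops below what we requested, or offers zero slots. For example:

	/* sent fc_attrs: max_ops = 8, max_reqs = 64
	 *   rcvd max_ops = 4                 -> -EINVAL, compounds can't be split
	 *   rcvd max_ops = 16, max_reqs = 8  -> accepted, negotiating down is fine
	 */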
4961 4896
4962static int _nfs4_proc_create_session(struct nfs_client *clp) 4897static int _nfs4_proc_create_session(struct nfs_client *clp)
@@ -5111,7 +5046,6 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
5111{ 5046{
5112 switch(task->tk_status) { 5047 switch(task->tk_status) {
5113 case -NFS4ERR_DELAY: 5048 case -NFS4ERR_DELAY:
5114 case -EKEYEXPIRED:
5115 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5049 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5116 return -EAGAIN; 5050 return -EAGAIN;
5117 default: 5051 default:
@@ -5180,12 +5114,11 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
 
     if (!atomic_inc_not_zero(&clp->cl_count))
         return ERR_PTR(-EIO);
-    calldata = kmalloc(sizeof(*calldata), GFP_NOFS);
+    calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
     if (calldata == NULL) {
         nfs_put_client(clp);
         return ERR_PTR(-ENOMEM);
     }
-    calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE;
     msg.rpc_argp = &calldata->args;
     msg.rpc_resp = &calldata->res;
     calldata->clp = clp;
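The kzalloc() switch pairs with dropping the explicit sr_slotid initialisation: once the sequence result tracks its slot by pointer (sr_slot), a zero-filled allocation leaves that pointer NULL, the correct "no slot assigned" state. A userspace sketch of the same idiom, with stand-in fields:

#include <stdio.h>
#include <stdlib.h>

struct calldata {
    void *sr_slot;      /* stand-ins for the real members */
    int clp;
};

int main(void)
{
    struct calldata *cd = calloc(1, sizeof(*cd));   /* like kzalloc() */
    if (cd == NULL)
        return 1;       /* like returning ERR_PTR(-ENOMEM) */
    /* no per-field setup needed: cd->sr_slot is already NULL */
    printf("slot assigned: %s\n", cd->sr_slot ? "yes" : "no");
    free(cd);
    return 0;
}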
@@ -5254,7 +5187,6 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
     case -NFS4ERR_WRONG_CRED: /* What to do here? */
         break;
     case -NFS4ERR_DELAY:
-    case -EKEYEXPIRED:
         rpc_delay(task, NFS4_POLL_RETRY_MAX);
         return -EAGAIN;
     default:
@@ -5317,7 +5249,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
         goto out;
     calldata->clp = clp;
     calldata->arg.one_fs = 0;
-    calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 
     msg.rpc_argp = &calldata->arg;
     msg.rpc_resp = &calldata->res;
@@ -5333,6 +5264,147 @@ out:
     dprintk("<-- %s status=%d\n", __func__, status);
     return status;
 }
+
+static void
+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
+{
+    struct nfs4_layoutget *lgp = calldata;
+    struct inode *ino = lgp->args.inode;
+    struct nfs_server *server = NFS_SERVER(ino);
+
+    dprintk("--> %s\n", __func__);
+    if (nfs4_setup_sequence(server, &lgp->args.seq_args,
+                            &lgp->res.seq_res, 0, task))
+        return;
+    rpc_call_start(task);
+}
+
+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
+{
+    struct nfs4_layoutget *lgp = calldata;
+    struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+
+    dprintk("--> %s\n", __func__);
+
+    if (!nfs4_sequence_done(task, &lgp->res.seq_res))
+        return;
+
+    switch (task->tk_status) {
+    case 0:
+        break;
+    case -NFS4ERR_LAYOUTTRYLATER:
+    case -NFS4ERR_RECALLCONFLICT:
+        task->tk_status = -NFS4ERR_DELAY;
+        /* Fall through */
+    default:
+        if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+            rpc_restart_call_prepare(task);
+            return;
+        }
+    }
+    lgp->status = task->tk_status;
+    dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_layoutget_release(void *calldata)
+{
+    struct nfs4_layoutget *lgp = calldata;
+
+    dprintk("--> %s\n", __func__);
+    put_layout_hdr(lgp->args.inode);
+    if (lgp->res.layout.buf != NULL)
+        free_page((unsigned long) lgp->res.layout.buf);
+    put_nfs_open_context(lgp->args.ctx);
+    kfree(calldata);
+    dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutget_call_ops = {
+    .rpc_call_prepare = nfs4_layoutget_prepare,
+    .rpc_call_done = nfs4_layoutget_done,
+    .rpc_release = nfs4_layoutget_release,
+};
+
+int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
+{
+    struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+    struct rpc_task *task;
+    struct rpc_message msg = {
+        .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
+        .rpc_argp = &lgp->args,
+        .rpc_resp = &lgp->res,
+    };
+    struct rpc_task_setup task_setup_data = {
+        .rpc_client = server->client,
+        .rpc_message = &msg,
+        .callback_ops = &nfs4_layoutget_call_ops,
+        .callback_data = lgp,
+        .flags = RPC_TASK_ASYNC,
+    };
+    int status = 0;
+
+    dprintk("--> %s\n", __func__);
+
+    lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
+    if (lgp->res.layout.buf == NULL) {
+        nfs4_layoutget_release(lgp);
+        return -ENOMEM;
+    }
+
+    lgp->res.seq_res.sr_slot = NULL;
+    task = rpc_run_task(&task_setup_data);
+    if (IS_ERR(task))
+        return PTR_ERR(task);
+    status = nfs4_wait_for_completion_rpc_task(task);
+    if (status != 0)
+        goto out;
+    status = lgp->status;
+    if (status != 0)
+        goto out;
+    status = pnfs_layout_process(lgp);
+out:
+    rpc_put_task(task);
+    dprintk("<-- %s status=%d\n", __func__, status);
+    return status;
+}
+
+static int
+_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+    struct nfs4_getdeviceinfo_args args = {
+        .pdev = pdev,
+    };
+    struct nfs4_getdeviceinfo_res res = {
+        .pdev = pdev,
+    };
+    struct rpc_message msg = {
+        .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
+        .rpc_argp = &args,
+        .rpc_resp = &res,
+    };
+    int status;
+
+    dprintk("--> %s\n", __func__);
+    status = nfs4_call_sync(server, &msg, &args, &res, 0);
+    dprintk("<-- %s status=%d\n", __func__, status);
+
+    return status;
+}
+
+int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+    struct nfs4_exception exception = { };
+    int err;
+
+    do {
+        err = nfs4_handle_exception(server,
+                    _nfs4_proc_getdeviceinfo(server, pdev),
+                    &exception);
+    } while (exception.retry);
+    return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
+
 #endif /* CONFIG_NFS_V4_1 */
 
 struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
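nfs4_proc_layoutget() above follows the usual async-RPC shape: fill in an rpc_call_ops table, fire the task with rpc_run_task(), wait for completion, then drop the task reference. A very reduced userspace sketch of that callback-table pattern (all names invented; this is not the kernel RPC client):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct call_ops {
    void (*prepare)(void *data);
    void (*done)(void *data, int status);
    void (*release)(void *data);
};

static void run_task(const struct call_ops *ops, void *data, int status)
{
    ops->prepare(data);
    ops->done(data, status);    /* kernel would run this on completion */
    ops->release(data);         /* always runs, mirroring rpc_release */
}

static void lg_prepare(void *d) { printf("prepare %s\n", (char *)d); }
static void lg_done(void *d, int s) { printf("done %s: %d\n", (char *)d, s); }
static void lg_release(void *d) { free(d); }

int main(void)
{
    static const struct call_ops ops = { lg_prepare, lg_done, lg_release };
    run_task(&ops, strdup("LAYOUTGET"), 0);
    return 0;
}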
@@ -5443,6 +5515,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
     .unlink_setup   = nfs4_proc_unlink_setup,
     .unlink_done    = nfs4_proc_unlink_done,
     .rename         = nfs4_proc_rename,
+    .rename_setup   = nfs4_proc_rename_setup,
+    .rename_done    = nfs4_proc_rename_done,
     .link           = nfs4_proc_link,
     .symlink        = nfs4_proc_symlink,
     .mkdir          = nfs4_proc_mkdir,
@@ -5463,6 +5537,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
     .lock           = nfs4_proc_lock,
     .clear_acl_cache = nfs4_zap_acl_attr,
     .close_context  = nfs4_close_context,
+    .open_context   = nfs4_atomic_open,
 };
 
 /*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3e2f19b04c06..f575a3126737 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -40,12 +40,13 @@
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/fs.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_idmap.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/ratelimit.h>
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 
@@ -53,6 +54,7 @@
53#include "callback.h" 54#include "callback.h"
54#include "delegation.h" 55#include "delegation.h"
55#include "internal.h" 56#include "internal.h"
57#include "pnfs.h"
56 58
57#define OPENOWNER_POOL_SIZE 8 59#define OPENOWNER_POOL_SIZE 8
58 60
@@ -970,13 +972,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
     /* Guard against delegation returns and new lock/unlock calls */
     down_write(&nfsi->rwsem);
     /* Protect inode->i_flock using the BKL */
-    lock_kernel();
+    lock_flocks();
     for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
         if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
             continue;
         if (nfs_file_open_context(fl->fl_file)->state != state)
             continue;
-        unlock_kernel();
+        unlock_flocks();
         status = ops->recover_lock(state, fl);
         switch (status) {
         case 0:
@@ -1003,9 +1005,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
             /* kill_proc(fl->fl_pid, SIGLOST, 1); */
             status = 0;
         }
-        lock_kernel();
+        lock_flocks();
     }
-    unlock_kernel();
+    unlock_flocks();
 out:
     up_write(&nfsi->rwsem);
     return status;
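The lock_kernel()/unlock_kernel() pairs become lock_flocks()/unlock_flocks(), keeping the same discipline: hold the lock while walking inode->i_flock, but drop it around recover_lock(), which can sleep. A userspace sketch of that drop-and-retake pattern, using a pthread mutex as a stand-in:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t flock_lock = PTHREAD_MUTEX_INITIALIZER;
static int locks[3] = { 1, 2, 3 };

static void recover_lock(int id)    /* may block; must not hold the lock */
{
    printf("recovering lock %d\n", id);
}

int main(void)
{
    pthread_mutex_lock(&flock_lock);
    for (int i = 0; i < 3; i++) {
        int id = locks[i];
        pthread_mutex_unlock(&flock_lock);  /* like unlock_flocks() */
        recover_lock(id);                   /* blocking work */
        pthread_mutex_lock(&flock_lock);    /* like lock_flocks() */
    }
    pthread_mutex_unlock(&flock_lock);
    return 0;
}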
@@ -1063,6 +1065,14 @@ restart:
             /* Mark the file as being 'closed' */
             state->state = 0;
             break;
+        case -EKEYEXPIRED:
+            /*
+             * User RPCSEC_GSS context has expired.
+             * We cannot recover this stateid now, so
+             * skip it and allow recovery thread to
+             * proceed.
+             */
+            break;
         case -NFS4ERR_ADMIN_REVOKED:
         case -NFS4ERR_STALE_STATEID:
         case -NFS4ERR_BAD_STATEID:
@@ -1138,16 +1148,14 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
     (void)ops->reclaim_complete(clp);
 }
 
-static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
 {
     struct nfs4_state_owner *sp;
     struct rb_node *pos;
     struct nfs4_state *state;
 
     if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
-        return;
-
-    nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
+        return 0;
 
     for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
         sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
@@ -1161,6 +1169,14 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
     }
 
     nfs_delegation_reap_unclaimed(clp);
+    return 1;
+}
+
+static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+{
+    if (!nfs4_state_clear_reclaim_reboot(clp))
+        return;
+    nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
 }
 
 static void nfs_delegation_clear_all(struct nfs_client *clp)
@@ -1175,6 +1191,14 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
     nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
 }
 
+static void nfs4_warn_keyexpired(const char *s)
+{
+    printk_ratelimited(KERN_WARNING "Error: state manager"
+            " encountered RPCSEC_GSS session"
+            " expired against NFSv4 server %s.\n",
+            s);
+}
+
 static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 {
     switch (error) {
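printk_ratelimited() keeps a repeatedly-hit error path (here, an expired RPCSEC_GSS context) from flooding the log. A userspace sketch of the idea, with an invented 3-per-second budget rather than the kernel's actual ratelimit parameters:

#include <stdio.h>
#include <time.h>

static int ratelimit(void)
{
    static time_t window;
    static int count;
    time_t now = time(NULL);

    if (now != window) {        /* new one-second window */
        window = now;
        count = 0;
    }
    return count++ < 3;         /* at most 3 messages per window */
}

int main(void)
{
    for (int i = 0; i < 10; i++)
        if (ratelimit())
            fprintf(stderr, "state manager: key expired\n");
    return 0;
}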
@@ -1187,7 +1211,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
     case -NFS4ERR_STALE_CLIENTID:
     case -NFS4ERR_LEASE_MOVED:
         set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-        nfs4_state_end_reclaim_reboot(clp);
+        nfs4_state_clear_reclaim_reboot(clp);
         nfs4_state_start_reclaim_reboot(clp);
         break;
     case -NFS4ERR_EXPIRED:
@@ -1204,6 +1228,10 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
         set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
         /* Zero session reset errors */
         return 0;
+    case -EKEYEXPIRED:
+        /* Nothing we can do */
+        nfs4_warn_keyexpired(clp->cl_hostname);
+        return 0;
     }
     return error;
 }
@@ -1414,9 +1442,10 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
     case -NFS4ERR_DELAY:
     case -NFS4ERR_CLID_INUSE:
     case -EAGAIN:
-    case -EKEYEXPIRED:
         break;
 
+    case -EKEYEXPIRED:
+        nfs4_warn_keyexpired(clp->cl_hostname);
     case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
                              * in nfs4_exchange_id */
     default:
@@ -1447,6 +1476,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
             }
             clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
             set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+            pnfs_destroy_all_layouts(clp);
         }
 
         if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 08ef91291132..f313c4cce7e4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,6 +52,7 @@
 #include <linux/nfs_idmap.h>
 #include "nfs4_fs.h"
 #include "internal.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
 
@@ -310,6 +311,19 @@ static int nfs4_stat_to_errno(int);
                 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
+                XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
+                1 /* layout type */ + \
+                1 /* opaque devaddr4 length */ + \
+                  /* devaddr4 payload is read into page */ \
+                1 /* notification bitmap length */ + \
+                1 /* notification bitmap */)
+#define encode_layoutget_maxsz	(op_encode_hdr_maxsz + 10 + \
+                encode_stateid_maxsz)
+#define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \
+                decode_stateid_maxsz + \
+                XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz	0
 #define decode_sequence_maxsz	0
@@ -699,6 +713,20 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_reclaim_complete_sz	(compound_decode_hdr_maxsz + \
                     decode_sequence_maxsz + \
                     decode_reclaim_complete_maxsz)
+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
+                encode_sequence_maxsz +\
+                encode_getdeviceinfo_maxsz)
+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \
+                decode_sequence_maxsz + \
+                decode_getdeviceinfo_maxsz)
+#define NFS4_enc_layoutget_sz	(compound_encode_hdr_maxsz + \
+                encode_sequence_maxsz + \
+                encode_putfh_maxsz + \
+                encode_layoutget_maxsz)
+#define NFS4_dec_layoutget_sz	(compound_decode_hdr_maxsz + \
+                decode_sequence_maxsz + \
+                decode_putfh_maxsz + \
+                decode_layoutget_maxsz)
 
 const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
                 compound_encode_hdr_maxsz +
@@ -816,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
     if (iap->ia_valid & ATTR_MODE)
         len += 4;
     if (iap->ia_valid & ATTR_UID) {
-        owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name);
+        owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ);
         if (owner_namelen < 0) {
             dprintk("nfs: couldn't resolve uid %d to string\n",
                     iap->ia_uid);
@@ -828,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
         len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
     }
     if (iap->ia_valid & ATTR_GID) {
-        owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group);
+        owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ);
         if (owner_grouplen < 0) {
             dprintk("nfs: couldn't resolve gid %d to string\n",
                     iap->ia_gid);
@@ -1385,24 +1413,35 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
 {
-    uint32_t attrs[2] = {
-        FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
-        FATTR4_WORD1_MOUNTED_ON_FILEID,
-    };
+    uint32_t attrs[2] = {0, 0};
+    uint32_t dircount = readdir->count >> 1;
     __be32 *p;
 
+    if (readdir->plus) {
+        attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
+            FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE;
+        attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
+            FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
+            FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
+            FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+        dircount >>= 1;
+    }
+    attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID;
+    attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
+    /* Switch to mounted_on_fileid if the server supports it */
+    if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
+        attrs[0] &= ~FATTR4_WORD0_FILEID;
+    else
+        attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+
     p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
     *p++ = cpu_to_be32(OP_READDIR);
     p = xdr_encode_hyper(p, readdir->cookie);
     p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
-    *p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
+    *p++ = cpu_to_be32(dircount);
     *p++ = cpu_to_be32(readdir->count);
     *p++ = cpu_to_be32(2);
-    /* Switch to mounted_on_fileid if the server supports it */
-    if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
-        attrs[0] &= ~FATTR4_WORD0_FILEID;
-    else
-        attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+
     *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
     *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
     hdr->nops++;
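The dircount logic above is a heuristic: the client advertises `count` bytes for the whole reply, guesses half of that for directory data, and halves the guess again for readdirplus because per-entry attributes inflate the reply. In miniature:

#include <stdio.h>

int main(void)
{
    unsigned int count = 16384;             /* total reply budget */
    unsigned int dircount = count >> 1;     /* plain READDIR guess */
    int plus = 1;                           /* readdirplus requested */

    if (plus)
        dircount >>= 1;                     /* attrs eat more space */
    printf("count=%u dircount=%u\n", count, dircount);
    return 0;
}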
@@ -1726,6 +1765,58 @@ static void encode_sequence(struct xdr_stream *xdr,
 #endif /* CONFIG_NFS_V4_1 */
 }
 
+#ifdef CONFIG_NFS_V4_1
+static void
+encode_getdeviceinfo(struct xdr_stream *xdr,
+                     const struct nfs4_getdeviceinfo_args *args,
+                     struct compound_hdr *hdr)
+{
+    __be32 *p;
+
+    p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
+    *p++ = cpu_to_be32(OP_GETDEVICEINFO);
+    p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
+                                NFS4_DEVICEID4_SIZE);
+    *p++ = cpu_to_be32(args->pdev->layout_type);
+    *p++ = cpu_to_be32(args->pdev->pglen);      /* gdia_maxcount */
+    *p++ = cpu_to_be32(0);                      /* bitmap length 0 */
+    hdr->nops++;
+    hdr->replen += decode_getdeviceinfo_maxsz;
+}
+
+static void
+encode_layoutget(struct xdr_stream *xdr,
+                 const struct nfs4_layoutget_args *args,
+                 struct compound_hdr *hdr)
+{
+    nfs4_stateid stateid;
+    __be32 *p;
+
+    p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
+    *p++ = cpu_to_be32(OP_LAYOUTGET);
+    *p++ = cpu_to_be32(0);     /* Signal layout available */
+    *p++ = cpu_to_be32(args->type);
+    *p++ = cpu_to_be32(args->range.iomode);
+    p = xdr_encode_hyper(p, args->range.offset);
+    p = xdr_encode_hyper(p, args->range.length);
+    p = xdr_encode_hyper(p, args->minlength);
+    pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
+                            args->ctx->state);
+    p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
+    *p = cpu_to_be32(args->maxcount);
+
+    dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
+        __func__,
+        args->type,
+        args->range.iomode,
+        (unsigned long)args->range.offset,
+        (unsigned long)args->range.length,
+        args->maxcount);
+    hdr->nops++;
+    hdr->replen += decode_layoutget_maxsz;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * END OF "GENERIC" ENCODE ROUTINES.
  */
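The reserve_space() size in encode_layoutget() is just the sum of the fixed-width XDR fields; the arithmetic below reproduces the 44-byte constant (NFS4_STATEID_SIZE is 16 in this tree):

#include <stdio.h>

#define NFS4_STATEID_SIZE 16

int main(void)
{
    unsigned int n = 0;
    n += 4;             /* OP_LAYOUTGET opcode */
    n += 4;             /* loga_signal_layout_avail */
    n += 4;             /* layout type */
    n += 4;             /* iomode */
    n += 8 + 8 + 8;     /* offset, length, minlength */
    n += 4;             /* maxcount */
    printf("reserve %u + stateid %u = %u bytes\n",
           n, NFS4_STATEID_SIZE, n + NFS4_STATEID_SIZE);
    return 0;
}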
@@ -1823,7 +1914,7 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
 /*
  * Encode RENAME request
  */
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args)
+static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args)
 {
     struct xdr_stream xdr;
     struct compound_hdr hdr = {
@@ -2543,6 +2634,51 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
     return 0;
 }
 
+/*
+ * Encode GETDEVICEINFO request
+ */
+static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
+                                      struct nfs4_getdeviceinfo_args *args)
+{
+    struct xdr_stream xdr;
+    struct compound_hdr hdr = {
+        .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+    };
+
+    xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+    encode_compound_hdr(&xdr, req, &hdr);
+    encode_sequence(&xdr, &args->seq_args, &hdr);
+    encode_getdeviceinfo(&xdr, args, &hdr);
+
+    /* set up reply kvec. Subtract notification bitmap max size (2)
+     * so that notification bitmap is put in xdr_buf tail */
+    xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
+                     args->pdev->pages, args->pdev->pgbase,
+                     args->pdev->pglen);
+
+    encode_nops(&hdr);
+    return 0;
+}
+
+/*
+ * Encode LAYOUTGET request
+ */
+static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
+                                  struct nfs4_layoutget_args *args)
+{
+    struct xdr_stream xdr;
+    struct compound_hdr hdr = {
+        .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+    };
+
+    xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+    encode_compound_hdr(&xdr, req, &hdr);
+    encode_sequence(&xdr, &args->seq_args, &hdr);
+    encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
+    encode_layoutget(&xdr, args, &hdr);
+    encode_nops(&hdr);
+    return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
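The `(hdr.replen - 2) << 2` in enc_getdeviceinfo() converts an XDR size from 32-bit words to bytes while reserving the last two words (the notification bitmap) for the xdr_buf tail, so the opaque device address lands in the supplied pages. The conversion, spelled out:

#include <stdio.h>

int main(void)
{
    unsigned int replen_words = 36;     /* example decoded-reply size */
    unsigned int offset_bytes = (replen_words - 2) << 2;

    printf("page data starts at byte offset %u\n", offset_bytes);
    return 0;
}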
@@ -2676,7 +2812,10 @@ out_overflow:
 static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
 {
     if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) {
-        decode_attr_bitmap(xdr, bitmask);
+        int ret;
+        ret = decode_attr_bitmap(xdr, bitmask);
+        if (unlikely(ret < 0))
+            return ret;
         bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
     } else
         bitmask[0] = bitmask[1] = 0;
@@ -2848,6 +2987,56 @@ out_overflow:
     return -EIO;
 }
 
+static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+    __be32 *p;
+
+    if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U)))
+        return -EIO;
+    if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+            goto out_overflow;
+        bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
+    }
+    return 0;
+out_overflow:
+    print_overflow_msg(__func__, xdr);
+    return -EIO;
+}
+
+static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
+{
+    __be32 *p;
+    int len;
+
+    if (fh != NULL)
+        memset(fh, 0, sizeof(*fh));
+
+    if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
+        return -EIO;
+    if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+            goto out_overflow;
+        len = be32_to_cpup(p);
+        if (len > NFS4_FHSIZE)
+            return -EIO;
+        p = xdr_inline_decode(xdr, len);
+        if (unlikely(!p))
+            goto out_overflow;
+        if (fh != NULL) {
+            memcpy(fh->data, p, len);
+            fh->size = len;
+        }
+        bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
+    }
+    return 0;
+out_overflow:
+    print_overflow_msg(__func__, xdr);
+    return -EIO;
+}
+
 static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
     __be32 *p;
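Both new helpers open with the same guard: `bitmap[0] & (FATTR4_WORD0_X - 1U)` is non-zero when any lower-numbered attribute bit is still pending, i.e. the caller tried to decode attributes out of their mandated ascending order. A standalone sketch of the check (bit positions per the NFSv4 spec: rdattr_error is 11, filehandle is 19):

#include <stdio.h>

#define FATTR4_WORD0_RDATTR_ERROR (1U << 11)
#define FATTR4_WORD0_FILEHANDLE   (1U << 19)

int main(void)
{
    unsigned int bitmap0 = FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_FILEHANDLE;

    /* about to decode FILEHANDLE, but RDATTR_ERROR is still set */
    if (bitmap0 & (FATTR4_WORD0_FILEHANDLE - 1U))
        printf("out of order: decode lower bits first\n");
    return 0;
}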
@@ -3521,6 +3710,24 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
     return status;
 }
 
+static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
+                                  struct timespec *time)
+{
+    int status = 0;
+
+    time->tv_sec = 0;
+    time->tv_nsec = 0;
+    if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U)))
+        return -EIO;
+    if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) {
+        status = decode_attr_time(xdr, time);
+        bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
+    }
+    dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec,
+            (long)time->tv_nsec);
+    return status;
+}
+
 static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
 {
     int status = 0;
@@ -3744,29 +3951,14 @@ xdr_error:
     return status;
 }
 
-static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
+        struct nfs_fattr *fattr, struct nfs_fh *fh,
         const struct nfs_server *server, int may_sleep)
 {
-    __be32 *savep;
-    uint32_t attrlen,
-        bitmap[2] = {0},
-        type;
     int status;
     umode_t fmode = 0;
     uint64_t fileid;
-
-    status = decode_op_hdr(xdr, OP_GETATTR);
-    if (status < 0)
-        goto xdr_error;
-
-    status = decode_attr_bitmap(xdr, bitmap);
-    if (status < 0)
-        goto xdr_error;
-
-    status = decode_attr_length(xdr, &attrlen, &savep);
-    if (status < 0)
-        goto xdr_error;
+    uint32_t type;
 
     status = decode_attr_type(xdr, bitmap, &type);
     if (status < 0)
@@ -3792,6 +3984,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
         goto xdr_error;
     fattr->valid |= status;
 
+    status = decode_attr_error(xdr, bitmap);
+    if (status < 0)
+        goto xdr_error;
+
+    status = decode_attr_filehandle(xdr, bitmap, fh);
+    if (status < 0)
+        goto xdr_error;
+
     status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
     if (status < 0)
         goto xdr_error;
@@ -3862,12 +4062,101 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
         fattr->valid |= status;
     }
 
+xdr_error:
+    dprintk("%s: xdr returned %d\n", __func__, -status);
+    return status;
+}
+
+static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+        struct nfs_fh *fh, const struct nfs_server *server, int may_sleep)
+{
+    __be32 *savep;
+    uint32_t attrlen,
+             bitmap[2] = {0};
+    int status;
+
+    status = decode_op_hdr(xdr, OP_GETATTR);
+    if (status < 0)
+        goto xdr_error;
+
+    status = decode_attr_bitmap(xdr, bitmap);
+    if (status < 0)
+        goto xdr_error;
+
+    status = decode_attr_length(xdr, &attrlen, &savep);
+    if (status < 0)
+        goto xdr_error;
+
+    status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep);
+    if (status < 0)
+        goto xdr_error;
+
     status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
     dprintk("%s: xdr returned %d\n", __func__, -status);
     return status;
 }
 
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+        const struct nfs_server *server, int may_sleep)
+{
+    return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
+}
+
+/*
+ * Decode potentially multiple layout types. Currently we only support
+ * one layout driver per file system.
+ */
+static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
+                                         uint32_t *layouttype)
+{
+    uint32_t *p;
+    int num;
+
+    p = xdr_inline_decode(xdr, 4);
+    if (unlikely(!p))
+        goto out_overflow;
+    num = be32_to_cpup(p);
+
+    /* pNFS is not supported by the underlying file system */
+    if (num == 0) {
+        *layouttype = 0;
+        return 0;
+    }
+    if (num > 1)
+        printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
+            "per filesystem not supported\n", __func__);
+
+    /* Decode and set first layout type, move xdr->p past unused types */
+    p = xdr_inline_decode(xdr, num * 4);
+    if (unlikely(!p))
+        goto out_overflow;
+    *layouttype = be32_to_cpup(p);
+    return 0;
+out_overflow:
+    print_overflow_msg(__func__, xdr);
+    return -EIO;
+}
+
+/*
+ * The type of file system exported.
+ * Note we must ensure that layouttype is set in any non-error case.
+ */
+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
+                                uint32_t *layouttype)
+{
+    int status = 0;
+
+    dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
+    if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
+        return -EIO;
+    if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
+        status = decode_first_pnfs_layout_type(xdr, layouttype);
+        bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
+    } else
+        *layouttype = 0;
+    return status;
+}
 
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
@@ -3894,6 +4183,12 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
     if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
         goto xdr_error;
     fsinfo->wtpref = fsinfo->wtmax;
+    status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
+    if (status != 0)
+        goto xdr_error;
+    status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
+    if (status != 0)
+        goto xdr_error;
 
     status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
@@ -3950,13 +4245,13 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
     __be32 *p;
     uint32_t namelen, type;
 
-    p = xdr_inline_decode(xdr, 32);
+    p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
     if (unlikely(!p))
         goto out_overflow;
-    p = xdr_decode_hyper(p, &offset);
+    p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
     p = xdr_decode_hyper(p, &length);
-    type = be32_to_cpup(p++);
-    if (fl != NULL) {
+    type = be32_to_cpup(p++); /* 4 byte read */
+    if (fl != NULL) { /* manipulate file lock */
         fl->fl_start = (loff_t)offset;
         fl->fl_end = fl->fl_start + (loff_t)length - 1;
         if (length == ~(uint64_t)0)
@@ -3966,9 +4261,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
             fl->fl_type = F_RDLCK;
         fl->fl_pid = 0;
     }
-    p = xdr_decode_hyper(p, &clientid);
-    namelen = be32_to_cpup(p);
-    p = xdr_inline_decode(xdr, namelen);
+    p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
+    namelen = be32_to_cpup(p); /* read 4 bytes */  /* have read all 32 bytes now */
+    p = xdr_inline_decode(xdr, namelen); /* variable size field */
     if (likely(p))
         return -NFS4ERR_DENIED;
 out_overflow:
@@ -4200,12 +4495,9 @@ out_overflow:
 static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
 {
     struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
-    struct page *page = *rcvbuf->pages;
     struct kvec *iov = rcvbuf->head;
     size_t hdrlen;
     u32 recvd, pglen = rcvbuf->page_len;
-    __be32 *end, *entry, *p, *kaddr;
-    unsigned int nr = 0;
     int status;
 
     status = decode_op_hdr(xdr, OP_READDIR);
@@ -4225,71 +4517,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
         pglen = recvd;
     xdr_read_pages(xdr, pglen);
 
-    BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE);
-    kaddr = p = kmap_atomic(page, KM_USER0);
-    end = p + ((pglen + readdir->pgbase) >> 2);
-    entry = p;
-
-    /* Make sure the packet actually has a value_follows and EOF entry */
-    if ((entry + 1) > end)
-        goto short_pkt;
-
-    for (; *p++; nr++) {
-        u32 len, attrlen, xlen;
-        if (end - p < 3)
-            goto short_pkt;
-        dprintk("cookie = %Lu, ", *((unsigned long long *)p));
-        p += 2;                 /* cookie */
-        len = ntohl(*p++);      /* filename length */
-        if (len > NFS4_MAXNAMLEN) {
-            dprintk("NFS: giant filename in readdir (len 0x%x)\n",
-                    len);
-            goto err_unmap;
-        }
-        xlen = XDR_QUADLEN(len);
-        if (end - p < xlen + 1)
-            goto short_pkt;
-        dprintk("filename = %*s\n", len, (char *)p);
-        p += xlen;
-        len = ntohl(*p++);      /* bitmap length */
-        if (end - p < len + 1)
-            goto short_pkt;
-        p += len;
-        attrlen = XDR_QUADLEN(ntohl(*p++));
-        if (end - p < attrlen + 2)
-            goto short_pkt;
-        p += attrlen;           /* attributes */
-        entry = p;
-    }
-    /*
-     * Apparently some server sends responses that are a valid size, but
-     * contain no entries, and have value_follows==0 and EOF==0. For
-     * those, just set the EOF marker.
-     */
-    if (!nr && entry[1] == 0) {
-        dprintk("NFS: readdir reply truncated!\n");
-        entry[1] = 1;
-    }
-out:
-    kunmap_atomic(kaddr, KM_USER0);
+
     return 0;
-short_pkt:
-    /*
-     * When we get a short packet there are 2 possibilities. We can
-     * return an error, or fix up the response to look like a valid
-     * response and return what we have so far. If there are no
-     * entries and the packet was short, then return -EIO. If there
-     * are valid entries in the response, return them and pretend that
-     * the call was successful, but incomplete. The caller can retry the
-     * readdir starting at the last cookie.
-     */
-    dprintk("%s: short packet at entry %d\n", __func__, nr);
-    entry[0] = entry[1] = 0;
-    if (nr)
-        goto out;
-err_unmap:
-    kunmap_atomic(kaddr, KM_USER0);
-    return -errno_NFSERR_IO;
 }
 
 static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
@@ -4299,7 +4528,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
     size_t hdrlen;
     u32 len, recvd;
     __be32 *p;
-    char *kaddr;
     int status;
 
     status = decode_op_hdr(xdr, OP_READLINK);
@@ -4330,9 +4558,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
      * and and null-terminate the text (the VFS expects
      * null-termination).
      */
-    kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-    kaddr[len+rcvbuf->page_base] = '\0';
-    kunmap_atomic(kaddr, KM_USER0);
+    xdr_terminate_string(rcvbuf, len);
     return 0;
 out_overflow:
     print_overflow_msg(__func__, xdr);
@@ -4668,7 +4894,6 @@ static int decode_sequence(struct xdr_stream *xdr,
                            struct rpc_rqst *rqstp)
 {
 #if defined(CONFIG_NFS_V4_1)
-    struct nfs4_slot *slot;
     struct nfs4_sessionid id;
     u32 dummy;
     int status;
@@ -4700,15 +4925,14 @@ static int decode_sequence(struct xdr_stream *xdr,
         goto out_overflow;
 
     /* seqid */
-    slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
     dummy = be32_to_cpup(p++);
-    if (dummy != slot->seq_nr) {
+    if (dummy != res->sr_slot->seq_nr) {
         dprintk("%s Invalid sequence number\n", __func__);
         goto out_err;
     }
     /* slot id */
     dummy = be32_to_cpup(p++);
-    if (dummy != res->sr_slotid) {
+    if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) {
         dprintk("%s Invalid slot id\n", __func__);
         goto out_err;
     }
@@ -4731,6 +4955,134 @@ out_overflow:
 #endif /* CONFIG_NFS_V4_1 */
 }
 
+#if defined(CONFIG_NFS_V4_1)
+
+static int decode_getdeviceinfo(struct xdr_stream *xdr,
+                                struct pnfs_device *pdev)
+{
+    __be32 *p;
+    uint32_t len, type;
+    int status;
+
+    status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
+    if (status) {
+        if (status == -ETOOSMALL) {
+            p = xdr_inline_decode(xdr, 4);
+            if (unlikely(!p))
+                goto out_overflow;
+            pdev->mincount = be32_to_cpup(p);
+            dprintk("%s: Min count too small. mincnt = %u\n",
+                    __func__, pdev->mincount);
+        }
+        return status;
+    }
+
+    p = xdr_inline_decode(xdr, 8);
+    if (unlikely(!p))
+        goto out_overflow;
+    type = be32_to_cpup(p++);
+    if (type != pdev->layout_type) {
+        dprintk("%s: layout mismatch req: %u pdev: %u\n",
+                __func__, pdev->layout_type, type);
+        return -EINVAL;
+    }
+    /*
+     * Get the length of the opaque device_addr4. xdr_read_pages places
+     * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
+     * and places the remaining xdr data in xdr_buf->tail
+     */
+    pdev->mincount = be32_to_cpup(p);
+    xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
+
+    /* Parse notification bitmap, verifying that it is zero. */
+    p = xdr_inline_decode(xdr, 4);
+    if (unlikely(!p))
+        goto out_overflow;
+    len = be32_to_cpup(p);
+    if (len) {
+        int i;
+
+        p = xdr_inline_decode(xdr, 4 * len);
+        if (unlikely(!p))
+            goto out_overflow;
+        for (i = 0; i < len; i++, p++) {
+            if (be32_to_cpup(p)) {
+                dprintk("%s: notifications not supported\n",
+                        __func__);
+                return -EIO;
+            }
+        }
+    }
+    return 0;
+out_overflow:
+    print_overflow_msg(__func__, xdr);
+    return -EIO;
+}
+
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+                            struct nfs4_layoutget_res *res)
+{
+    __be32 *p;
+    int status;
+    u32 layout_count;
+
+    status = decode_op_hdr(xdr, OP_LAYOUTGET);
+    if (status)
+        return status;
+    p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
+    if (unlikely(!p))
+        goto out_overflow;
+    res->return_on_close = be32_to_cpup(p++);
+    p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
+    layout_count = be32_to_cpup(p);
+    if (!layout_count) {
+        dprintk("%s: server responded with empty layout array\n",
+                __func__);
+        return -EINVAL;
+    }
+
+    p = xdr_inline_decode(xdr, 24);
+    if (unlikely(!p))
+        goto out_overflow;
+    p = xdr_decode_hyper(p, &res->range.offset);
+    p = xdr_decode_hyper(p, &res->range.length);
+    res->range.iomode = be32_to_cpup(p++);
+    res->type = be32_to_cpup(p++);
+
+    status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
+    if (unlikely(status))
+        return status;
+
+    dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
+        __func__,
+        (unsigned long)res->range.offset,
+        (unsigned long)res->range.length,
+        res->range.iomode,
+        res->type,
+        res->layout.len);
+
+    /* nfs4_proc_layoutget allocated a single page */
+    if (res->layout.len > PAGE_SIZE)
+        return -ENOMEM;
+    memcpy(res->layout.buf, p, res->layout.len);
+
+    if (layout_count > 1) {
+        /* We only handle a length one array at the moment. Any
+         * further entries are just ignored. Note that this means
+         * the client may see a response that is less than the
+         * minimum it requested.
+         */
+        dprintk("%s: server responded with %d layouts, dropping tail\n",
+                __func__, layout_count);
+    }
+
+    return 0;
+out_overflow:
+    print_overflow_msg(__func__, xdr);
+    return -EIO;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * END OF "GENERIC" DECODE ROUTINES.
  */
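decode_getdeviceinfo() turns a server-side "buffer too small" into a renegotiation: on -ETOOSMALL it decodes the server's minimum count so the caller can retry with a larger buffer. A userspace sketch of that retry contract (the fetch function is a stand-in, not the kernel helper):

#include <stdio.h>

#define ETOOSMALL 525   /* kernel-internal errno value */

static int fetch_device_info(unsigned int buflen, unsigned int *mincount)
{
    if (buflen < 4096) {        /* pretend the server needs 4k */
        *mincount = 4096;
        return -ETOOSMALL;
    }
    return 0;
}

int main(void)
{
    unsigned int mincount = 0;
    int err = fetch_device_info(1024, &mincount);

    if (err == -ETOOSMALL)
        err = fetch_device_info(mincount, &mincount);
    printf("final status %d\n", err);
    return 0;
}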
@@ -4873,7 +5225,7 @@ out:
 /*
  * Decode RENAME response
  */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
 {
     struct xdr_stream xdr;
     struct compound_hdr hdr;
@@ -5758,25 +6110,84 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
     status = decode_reclaim_complete(&xdr, (void *)NULL);
     return status;
 }
+
+/*
+ * Decode GETDEVINFO response
+ */
+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
+                                      struct nfs4_getdeviceinfo_res *res)
+{
+    struct xdr_stream xdr;
+    struct compound_hdr hdr;
+    int status;
+
+    xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+    status = decode_compound_hdr(&xdr, &hdr);
+    if (status != 0)
+        goto out;
+    status = decode_sequence(&xdr, &res->seq_res, rqstp);
+    if (status != 0)
+        goto out;
+    status = decode_getdeviceinfo(&xdr, res->pdev);
+out:
+    return status;
+}
+
+/*
+ * Decode LAYOUTGET response
+ */
+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
+                                  struct nfs4_layoutget_res *res)
+{
+    struct xdr_stream xdr;
+    struct compound_hdr hdr;
+    int status;
+
+    xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+    status = decode_compound_hdr(&xdr, &hdr);
+    if (status)
+        goto out;
+    status = decode_sequence(&xdr, &res->seq_res, rqstp);
+    if (status)
+        goto out;
+    status = decode_putfh(&xdr);
+    if (status)
+        goto out;
+    status = decode_layoutget(&xdr, rqstp, res);
+out:
+    return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
-__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+                           struct nfs_server *server, int plus)
 {
     uint32_t bitmap[2] = {0};
     uint32_t len;
-
-    if (!*p++) {
-        if (!*p)
+    __be32 *p = xdr_inline_decode(xdr, 4);
+    if (unlikely(!p))
+        goto out_overflow;
+    if (!ntohl(*p++)) {
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+            goto out_overflow;
+        if (!ntohl(*p++))
             return ERR_PTR(-EAGAIN);
         entry->eof = 1;
         return ERR_PTR(-EBADCOOKIE);
     }
 
+    p = xdr_inline_decode(xdr, 12);
+    if (unlikely(!p))
+        goto out_overflow;
     entry->prev_cookie = entry->cookie;
     p = xdr_decode_hyper(p, &entry->cookie);
     entry->len = ntohl(*p++);
+
+    p = xdr_inline_decode(xdr, entry->len);
+    if (unlikely(!p))
+        goto out_overflow;
     entry->name = (const char *) p;
-    p += XDR_QUADLEN(entry->len);
 
     /*
      * In case the server doesn't return an inode number,
@@ -5784,32 +6195,33 @@ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
      * since glibc seems to choke on it...)
      */
     entry->ino = 1;
+    entry->fattr->valid = 0;
 
-    len = ntohl(*p++);              /* bitmap length */
-    if (len-- > 0) {
-        bitmap[0] = ntohl(*p++);
-        if (len-- > 0) {
-            bitmap[1] = ntohl(*p++);
-            p += len;
-        }
-    }
-    len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */
-    if (len > 0) {
-        if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) {
-            bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
-            /* Ignore the return value of rdattr_error for now */
-            p++;
-            len--;
-        }
-        if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID)
-            xdr_decode_hyper(p, &entry->ino);
-        else if (bitmap[0] == FATTR4_WORD0_FILEID)
-            xdr_decode_hyper(p, &entry->ino);
-        p += len;
-    }
+    if (decode_attr_bitmap(xdr, bitmap) < 0)
+        goto out_overflow;
+
+    if (decode_attr_length(xdr, &len, &p) < 0)
+        goto out_overflow;
+
+    if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0)
+        goto out_overflow;
+    if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
+        entry->ino = entry->fattr->fileid;
+
+    if (verify_attr_len(xdr, p, len) < 0)
+        goto out_overflow;
+
+    p = xdr_inline_peek(xdr, 8);
+    if (p != NULL)
+        entry->eof = !p[0] && p[1];
+    else
+        entry->eof = 0;
 
-    entry->eof = !p[0] && p[1];
     return p;
+
+out_overflow:
+    print_overflow_msg(__func__, xdr);
+    return ERR_PTR(-EIO);
 }
 
 /*
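The rewritten dirent decoder never walks raw page memory: every access goes through xdr_inline_decode(), which fails cleanly when the reply is short, and the trailing xdr_inline_peek() checks the value-follows/EOF words without consuming them. A userspace model of those two primitives (invented struct, same contract):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct stream { const uint8_t *p, *end; };

static const uint8_t *inline_decode(struct stream *s, size_t n)
{
    if ((size_t)(s->end - s->p) < n)
        return NULL;            /* caller goes to out_overflow */
    const uint8_t *ret = s->p;
    s->p += n;                  /* consumes the bytes */
    return ret;
}

static const uint8_t *inline_peek(struct stream *s, size_t n)
{
    return (size_t)(s->end - s->p) < n ? NULL : s->p;  /* no consume */
}

int main(void)
{
    uint8_t buf[12] = { 0 };
    struct stream s = { buf, buf + sizeof(buf) };

    printf("decode 8: %s\n", inline_decode(&s, 8) ? "ok" : "overflow");
    printf("peek 8:   %s\n", inline_peek(&s, 8) ? "ok" : "short");
    return 0;
}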
@@ -5936,6 +6348,8 @@ struct rpc_procinfo nfs4_procedures[] = {
     PROC(SEQUENCE,		enc_sequence,		dec_sequence),
     PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
     PROC(RECLAIM_COMPLETE,	enc_reclaim_complete,	dec_reclaim_complete),
+    PROC(GETDEVICEINFO,	enc_getdeviceinfo,	dec_getdeviceinfo),
+    PROC(LAYOUTGET,		enc_layoutget,		dec_layoutget),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index df101d9f546a..903908a20023 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -3,9 +3,10 @@
  *
  *  Allow an NFS filesystem to be mounted as root. The way this works is:
  *     (1) Use the IP autoconfig mechanism to set local IP addresses and routes.
- *     (2) Handle RPC negotiation with the system which replied to RARP or
- *	   was reported as a boot server by BOOTP or manually.
- *     (3) The actual mounting is done later, when init() is running.
+ *     (2) Construct the device string and the options string using DHCP
+ *	   option 17 and/or kernel command line options.
+ *     (3) When mount_root() sets up the root file system, pass these strings
+ *	   to the NFS client's regular mount interface via sys_mount().
  *
  *
  *	Changes:
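Under the new scheme described in this header, NFSROOT builds two plain strings, a "server:/export" device and a mount-option list, and hands them to the regular NFS mount path via sys_mount(). A userspace sketch of the device-string construction, including the documented "%s" expansion (buffer sizes and addresses invented):

#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *server = "192.168.1.1";
    const char *client = "192.168.1.77";
    const char *path = "/tftpboot/%s";      /* NFS_ROOT default */
    char expanded[256], device[300];

    if (strstr(path, "%s"))
        snprintf(expanded, sizeof(expanded), path, client);
    else
        snprintf(expanded, sizeof(expanded), "%s", path);
    snprintf(device, sizeof(device), "%s:%s", server, expanded);
    printf("root device: %s\n", device);
    return 0;
}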
@@ -65,470 +66,245 @@
  *	Hua Qin		:	Support for mounting root file system via
  *				NFS over TCP.
  *	Fabian Frederick:	Option parser rebuilt (using parser lib)
-*/
+ *	Chuck Lever	:	Use super.c's text-based mount option parsing
+ *	Chuck Lever	:	Add "nfsrootdebug".
+ */
 
 #include <linux/types.h>
 #include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/fs.h>
 #include <linux/init.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/xprtsock.h>
 #include <linux/nfs.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs_mount.h>
-#include <linux/in.h>
-#include <linux/major.h>
 #include <linux/utsname.h>
-#include <linux/inet.h>
 #include <linux/root_dev.h>
 #include <net/ipconfig.h>
-#include <linux/parser.h>
 
 #include "internal.h"
 
-/* Define this to allow debugging output */
-#undef NFSROOT_DEBUG
 #define NFSDBG_FACILITY NFSDBG_ROOT
 
-/* Default port to use if server is not running a portmapper */
-#define NFS_MNT_PORT	627
-
 /* Default path we try to mount. "%s" gets replaced by our IP address */
 #define NFS_ROOT "/tftpboot/%s"
 
 /* Parameters passed from the kernel command line */
-static char nfs_root_name[256] __initdata = "";
+static char nfs_root_parms[256] __initdata = "";
+
+/* Text-based mount options passed to super.c */
+static char nfs_root_options[256] __initdata = "";
 
 /* Address of NFS server */
-static __be32 servaddr __initdata = 0;
+static __be32 servaddr __initdata = htonl(INADDR_NONE);
 
 /* Name of directory to mount */
-static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, };
-
-/* NFS-related data */
-static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
-static int nfs_port __initdata = 0;		/* Port to connect to for NFS */
-static int mount_port __initdata = 0;		/* Mount daemon port number */
-
-
-/***************************************************************************
-
-			Parsing of options
-
- ***************************************************************************/
-
-enum {
-    /* Options that take integer arguments */
-    Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin,
-    Opt_acregmax, Opt_acdirmin, Opt_acdirmax,
-    /* Options that take no arguments */
-    Opt_soft, Opt_hard, Opt_intr,
-    Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac,
-    Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp,
-    Opt_acl, Opt_noacl,
-    /* Error token */
-    Opt_err
-};
-
-static const match_table_t tokens __initconst = {
-    {Opt_port, "port=%u"},
-    {Opt_rsize, "rsize=%u"},
-    {Opt_wsize, "wsize=%u"},
-    {Opt_timeo, "timeo=%u"},
-    {Opt_retrans, "retrans=%u"},
-    {Opt_acregmin, "acregmin=%u"},
-    {Opt_acregmax, "acregmax=%u"},
-    {Opt_acdirmin, "acdirmin=%u"},
-    {Opt_acdirmax, "acdirmax=%u"},
-    {Opt_soft, "soft"},
-    {Opt_hard, "hard"},
-    {Opt_intr, "intr"},
-    {Opt_nointr, "nointr"},
-    {Opt_posix, "posix"},
-    {Opt_noposix, "noposix"},
-    {Opt_cto, "cto"},
-    {Opt_nocto, "nocto"},
-    {Opt_ac, "ac"},
-    {Opt_noac, "noac"},
-    {Opt_lock, "lock"},
-    {Opt_nolock, "nolock"},
-    {Opt_v2, "nfsvers=2"},
-    {Opt_v2, "v2"},
-    {Opt_v3, "nfsvers=3"},
-    {Opt_v3, "v3"},
-    {Opt_udp, "proto=udp"},
-    {Opt_udp, "udp"},
-    {Opt_tcp, "proto=tcp"},
-    {Opt_tcp, "tcp"},
-    {Opt_acl, "acl"},
-    {Opt_noacl, "noacl"},
-    {Opt_err, NULL}
-
-};
+static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
 
+/* server:export path string passed to super.c */
+static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
+
+#ifdef RPC_DEBUG
 /*
- * Parse option string.
+ * When the "nfsrootdebug" kernel command line option is specified,
+ * enable debugging messages for NFSROOT.
  */
-
-static int __init root_nfs_parse(char *name, char *buf)
+static int __init nfs_root_debug(char *__unused)
 {
-
-    char *p;
-    substring_t args[MAX_OPT_ARGS];
-    int option;
-
-    if (!name)
-        return 1;
-
-    /* Set the NFS remote path */
-    p = strsep(&name, ",");
-    if (p[0] != '\0' && strcmp(p, "default") != 0)
-        strlcpy(buf, p, NFS_MAXPATHLEN);
-
-    while ((p = strsep (&name, ",")) != NULL) {
-        int token;
-        if (!*p)
-            continue;
-        token = match_token(p, tokens, args);
-
-        /* %u tokens only. Beware if you add new tokens! */
-        if (token < Opt_soft && match_int(&args[0], &option))
-            return 0;
-        switch (token) {
-            case Opt_port:
-                nfs_port = option;
-                break;
-            case Opt_rsize:
-                nfs_data.rsize = option;
-                break;
-            case Opt_wsize:
-                nfs_data.wsize = option;
-                break;
-            case Opt_timeo:
-                nfs_data.timeo = option;
-                break;
-            case Opt_retrans:
-                nfs_data.retrans = option;
-                break;
-            case Opt_acregmin:
-                nfs_data.acregmin = option;
-                break;
-            case Opt_acregmax:
-                nfs_data.acregmax = option;
-                break;
-            case Opt_acdirmin:
-                nfs_data.acdirmin = option;
-                break;
-            case Opt_acdirmax:
-                nfs_data.acdirmax = option;
-                break;
-            case Opt_soft:
-                nfs_data.flags |= NFS_MOUNT_SOFT;
-                break;
-            case Opt_hard:
-                nfs_data.flags &= ~NFS_MOUNT_SOFT;
-                break;
-            case Opt_intr:
-            case Opt_nointr:
-                break;
-            case Opt_posix:
-                nfs_data.flags |= NFS_MOUNT_POSIX;
-                break;
-            case Opt_noposix:
-                nfs_data.flags &= ~NFS_MOUNT_POSIX;
-                break;
-            case Opt_cto:
-                nfs_data.flags &= ~NFS_MOUNT_NOCTO;
-                break;
-            case Opt_nocto:
-                nfs_data.flags |= NFS_MOUNT_NOCTO;
-                break;
-            case Opt_ac:
-                nfs_data.flags &= ~NFS_MOUNT_NOAC;
-                break;
-            case Opt_noac:
-                nfs_data.flags |= NFS_MOUNT_NOAC;
-                break;
-            case Opt_lock:
-                nfs_data.flags &= ~NFS_MOUNT_NONLM;
-                break;
-            case Opt_nolock:
-                nfs_data.flags |= NFS_MOUNT_NONLM;
-                break;
-            case Opt_v2:
-                nfs_data.flags &= ~NFS_MOUNT_VER3;
-                break;
-            case Opt_v3:
-                nfs_data.flags |= NFS_MOUNT_VER3;
-                break;
-            case Opt_udp:
-                nfs_data.flags &= ~NFS_MOUNT_TCP;
-                break;
-            case Opt_tcp:
-                nfs_data.flags |= NFS_MOUNT_TCP;
-                break;
-            case Opt_acl:
-                nfs_data.flags &= ~NFS_MOUNT_NOACL;
-                break;
-            case Opt_noacl:
-                nfs_data.flags |= NFS_MOUNT_NOACL;
-                break;
-            default:
-                printk(KERN_WARNING "Root-NFS: unknown "
-                    "option: %s\n", p);
-                return 0;
-        }
-    }
-
+    nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
     return 1;
 }
 
+__setup("nfsrootdebug", nfs_root_debug);
+#endif
+
 /*
- * Prepare the NFS data structure and parse all options.
+ * Parse NFS server and directory information passed on the kernel
+ * command line.
+ *
+ *	nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
+ *
+ * If there is a "%s" token in the <root-dir> string, it is replaced
+ * by the ASCII-representation of the client's IP address.
  */
-static int __init root_nfs_name(char *name)
+static int __init nfs_root_setup(char *line)
 {
-    static char buf[NFS_MAXPATHLEN] __initdata;
-    char *cp;
-
-    /* Set some default values */
-    memset(&nfs_data, 0, sizeof(nfs_data));
+    ROOT_DEV = Root_NFS;
+
+    if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
+        strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
+    } else {
298 nfs_port = -1; 134 size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
299 nfs_data.version = NFS_MOUNT_VERSION; 135 if (n >= sizeof(nfs_root_parms))
300 nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ 136 line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0';
301 nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; 137 sprintf(nfs_root_parms, NFS_ROOT, line);
302 nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
303 nfs_data.acregmin = NFS_DEF_ACREGMIN;
304 nfs_data.acregmax = NFS_DEF_ACREGMAX;
305 nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
306 nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
307 strcpy(buf, NFS_ROOT);
308
309 /* Process options received from the remote server */
310 root_nfs_parse(root_server_path, buf);
311
312 /* Override them by options set on kernel command-line */
313 root_nfs_parse(name, buf);
314
315 cp = utsname()->nodename;
316 if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) {
317 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
318 return -1;
319 } 138 }
320 sprintf(nfs_export_path, buf, cp); 139
140 /*
141 * Extract the IP address of the NFS server containing our
142 * root file system, if one was specified.
143 *
144 * Note: root_nfs_parse_addr() removes the server-ip from
145 * nfs_root_parms, if it exists.
146 */
147 root_server_addr = root_nfs_parse_addr(nfs_root_parms);
321 148
322 return 1; 149 return 1;
323} 150}
324 151
152__setup("nfsroot=", nfs_root_setup);
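
For concreteness, a hypothetical kernel command line using this option might read as follows (the server address, export path, and mount options are invented; "%s" is replaced by the client's IP address or hostname as described above, and "nfsrootdebug" only takes effect when RPC_DEBUG is configured):

    root=/dev/nfs ip=dhcp nfsroot=192.168.1.1:/export/%s,rsize=4096,wsize=4096 nfsrootdebug

With this line, nfs_root_setup() stores "192.168.1.1:/export/%s,rsize=4096,wsize=4096" in nfs_root_parms, and root_nfs_parse_addr() then strips off the leading "192.168.1.1:".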
325 153
326/* 154static int __init root_nfs_copy(char *dest, const char *src,
327 * Get NFS server address. 155 const size_t destlen)
328 */
329static int __init root_nfs_addr(void)
330{ 156{
331 if ((servaddr = root_server_addr) == htonl(INADDR_NONE)) { 157 if (strlcpy(dest, src, destlen) > destlen)
332 printk(KERN_ERR "Root-NFS: No NFS server available, giving up.\n");
333 return -1; 158 return -1;
334 } 159 return 0;
160}
335 161
336 snprintf(nfs_data.hostname, sizeof(nfs_data.hostname), 162static int __init root_nfs_cat(char *dest, const char *src,
337 "%pI4", &servaddr); 163 const size_t destlen)
164{
165 if (strlcat(dest, src, destlen) > destlen)
166 return -1;
338 return 0; 167 return 0;
339} 168}
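
Both helpers rely on the strlcpy()/strlcat() convention of returning the total length of the string they tried to create, so a return value that reaches or exceeds the destination size indicates truncation. A minimal user-space sketch of the same check (strlcpy() is reimplemented here because glibc does not provide it; the tiny buffer is only for demonstration):

    #include <stdio.h>
    #include <string.h>

    /* minimal strlcpy(): bounded copy that reports strlen(src) */
    static size_t my_strlcpy(char *dst, const char *src, size_t size)
    {
            size_t len = strlen(src);

            if (size) {
                    size_t n = len < size - 1 ? len : size - 1;
                    memcpy(dst, src, n);
                    dst[n] = '\0';
            }
            return len;
    }

    int main(void)
    {
            char buf[8];

            if (my_strlcpy(buf, "/export/very/long/path", sizeof(buf)) >= sizeof(buf))
                    fprintf(stderr, "truncated: kept \"%s\"\n", buf);
            return 0;
    }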
340 169
341/* 170/*
342 * Tell the user what's going on. 171 * Parse out root export path and mount options from
172 * passed-in string @incoming.
173 *
174 * Copy the export path into @exppath.
343 */ 175 */
344#ifdef NFSROOT_DEBUG 176static int __init root_nfs_parse_options(char *incoming, char *exppath,
345static void __init root_nfs_print(void) 177 const size_t exppathlen)
346{ 178{
347 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", 179 char *p;
348 nfs_export_path, nfs_data.hostname);
349 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
350 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
351 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
352 nfs_data.acregmin, nfs_data.acregmax,
353 nfs_data.acdirmin, nfs_data.acdirmax);
354 printk(KERN_NOTICE "Root-NFS: nfsd port = %d, mountd port = %d, flags = %08x\n",
355 nfs_port, mount_port, nfs_data.flags);
356}
357#endif
358
359 180
360static int __init root_nfs_init(void) 181 /*
361{ 182 * Set the NFS remote path
362#ifdef NFSROOT_DEBUG 183 */
363 nfs_debug |= NFSDBG_ROOT; 184 p = strsep(&incoming, ",");
364#endif 185 if (*p != '\0' && strcmp(p, "default") != 0)
186 if (root_nfs_copy(exppath, p, exppathlen))
187 return -1;
365 188
366 /* 189 /*
367 * Decode the root directory path name and NFS options from 190 * @incoming now points to the rest of the string; if it
368 * the kernel command line. This has to go here in order to 191 * contains something, append it to our root options buffer
369 * be able to use the client IP address for the remote root
370 * directory (necessary for pure RARP booting).
371 */ 192 */
372 if (root_nfs_name(nfs_root_name) < 0 || 193 if (incoming != NULL && *incoming != '\0')
373 root_nfs_addr() < 0) 194 if (root_nfs_cat(nfs_root_options, incoming,
374 return -1; 195 sizeof(nfs_root_options)))
196 return -1;
375 197
376#ifdef NFSROOT_DEBUG 198 /*
377 root_nfs_print(); 199 * Possibly prepare for more options to be appended
378#endif 200 */
201 if (nfs_root_options[0] != '\0' &&
202 nfs_root_options[strlen(nfs_root_options)] != ',')
203 if (root_nfs_cat(nfs_root_options, ",",
204 sizeof(nfs_root_options)))
205 return -1;
379 206
380 return 0; 207 return 0;
381} 208}
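
To make the split concrete: with a hypothetical @incoming of "/export/%s,rsize=4096,wsize=4096" (and nfs_root_options initially empty), the function above leaves

    exppath          = "/export/%s"
    nfs_root_options = "rsize=4096,wsize=4096,"

with a trailing comma left in place so that mandatory options can simply be appended later. A path of "default" leaves exppath untouched.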
382 209
383
384/* 210/*
385 * Parse NFS server and directory information passed on the kernel 211 * Decode the export directory path name and NFS options from
386 * command line. 212 * the kernel command line. This has to be done late in order to
213 * use a dynamically acquired client IP address for the remote
214 * root directory path.
215 *
216 * Returns zero if successful; otherwise -1 is returned.
387 */ 217 */
388static int __init nfs_root_setup(char *line) 218static int __init root_nfs_data(char *cmdline)
389{ 219{
390 ROOT_DEV = Root_NFS; 220 char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
391 if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { 221 int len, retval = -1;
392 strlcpy(nfs_root_name, line, sizeof(nfs_root_name)); 222 char *tmp = NULL;
393 } else { 223 const size_t tmplen = sizeof(nfs_export_path);
394 int n = strlen(line) + sizeof(NFS_ROOT) - 1; 224
395 if (n >= sizeof(nfs_root_name)) 225 tmp = kzalloc(tmplen, GFP_KERNEL);
396 line[sizeof(nfs_root_name) - sizeof(NFS_ROOT) - 2] = '\0'; 226 if (tmp == NULL)
397 sprintf(nfs_root_name, NFS_ROOT, line); 227 goto out_nomem;
228 strcpy(tmp, NFS_ROOT);
229
230 if (root_server_path[0] != '\0') {
231 dprintk("Root-NFS: DHCPv4 option 17: %s\n",
232 root_server_path);
233 if (root_nfs_parse_options(root_server_path, tmp, tmplen))
234 goto out_optionstoolong;
398 } 235 }
399 root_server_addr = root_nfs_parse_addr(nfs_root_name);
400 return 1;
401}
402
403__setup("nfsroot=", nfs_root_setup);
404
405/***************************************************************************
406 236
407 Routines to actually mount the root directory 237 if (cmdline[0] != '\0') {
238 dprintk("Root-NFS: nfsroot=%s\n", cmdline);
239 if (root_nfs_parse_options(cmdline, tmp, tmplen))
240 goto out_optionstoolong;
241 }
408 242
409 ***************************************************************************/ 243 /*
244 * Append mandatory options for nfsroot so they override
245 * what has come before
246 */
247 snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4",
248 &servaddr);
249 if (root_nfs_cat(nfs_root_options, addr_option,
250 sizeof(nfs_root_options)))
251 goto out_optionstoolong;
410 252
411/* 253 /*
412 * Construct sockaddr_in from address and port number. 254 * Set up nfs_root_device. For NFS mounts, this looks like
413 */ 255 *
414static inline void 256 * server:/path
415set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) 257 *
416{ 258 * At this point, utsname()->nodename contains our local
417 sin->sin_family = AF_INET; 259 * IP address or hostname, set by ipconfig. If "%s" exists
418 sin->sin_addr.s_addr = addr; 260 * in tmp, substitute the nodename, then shovel the whole
419 sin->sin_port = port; 261 * mess into nfs_root_device.
420} 262 */
263 len = snprintf(nfs_export_path, sizeof(nfs_export_path),
264 tmp, utsname()->nodename);
265 if (len > (int)sizeof(nfs_export_path))
266 goto out_devnametoolong;
267 len = snprintf(nfs_root_device, sizeof(nfs_root_device),
268 "%pI4:%s", &servaddr, nfs_export_path);
269 if (len > (int)sizeof(nfs_root_device))
270 goto out_devnametoolong;
421 271
422/* 272 retval = 0;
423 * Query server portmapper for the port of a daemon program.
424 */
425static int __init root_nfs_getport(int program, int version, int proto)
426{
427 struct sockaddr_in sin;
428 273
429 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %pI4\n", 274out:
430 program, version, &servaddr); 275 kfree(tmp);
431 set_sockaddr(&sin, servaddr, 0); 276 return retval;
432 return rpcb_getport_sync(&sin, program, version, proto); 277out_nomem:
278 printk(KERN_ERR "Root-NFS: could not allocate memory\n");
279 goto out;
280out_optionstoolong:
281 printk(KERN_ERR "Root-NFS: mount options string too long\n");
282 goto out;
283out_devnametoolong:
284 printk(KERN_ERR "Root-NFS: root device name too long.\n");
285 goto out;
433} 286}
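
Continuing the hypothetical example: if ipconfig had recorded the server as 192.168.1.1 and our nodename as "client1" (both invented here), the buffers would end up as

    nfs_root_options = "rsize=4096,wsize=4096,nolock,addr=192.168.1.1"
    nfs_export_path  = "/export/client1"
    nfs_root_device  = "192.168.1.1:/export/client1"

with the appended "nolock,addr=" pair overriding anything the earlier option strings may have said.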
434 287
435 288/**
436/* 289 * nfs_root_data - Return prepared 'data' for NFSROOT mount
437 * Use portmapper to find mountd and nfsd port numbers if not overridden 290 * @root_device: OUT: address of string containing NFSROOT device
438 * by the user. Use defaults if portmapper is not available. 291 * @root_data: OUT: address of string containing NFSROOT mount options
439 * XXX: Is there any nfs server with no portmapper? 292 *
293 * Returns zero and sets @root_device and @root_data if successful,
294 * otherwise -1 is returned.
440 */ 295 */
441static int __init root_nfs_ports(void) 296int __init nfs_root_data(char **root_device, char **root_data)
442{ 297{
443 int port; 298 servaddr = root_server_addr;
444 int nfsd_ver, mountd_ver; 299 if (servaddr == htonl(INADDR_NONE)) {
445 int nfsd_port, mountd_port; 300 printk(KERN_ERR "Root-NFS: no NFS server address\n");
446 int proto; 301 return -1;
447
448 if (nfs_data.flags & NFS_MOUNT_VER3) {
449 nfsd_ver = NFS3_VERSION;
450 mountd_ver = NFS_MNT3_VERSION;
451 nfsd_port = NFS_PORT;
452 mountd_port = NFS_MNT_PORT;
453 } else {
454 nfsd_ver = NFS2_VERSION;
455 mountd_ver = NFS_MNT_VERSION;
456 nfsd_port = NFS_PORT;
457 mountd_port = NFS_MNT_PORT;
458 }
459
460 proto = (nfs_data.flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
461
462 if (nfs_port < 0) {
463 if ((port = root_nfs_getport(NFS_PROGRAM, nfsd_ver, proto)) < 0) {
464 printk(KERN_ERR "Root-NFS: Unable to get nfsd port "
465 "number from server, using default\n");
466 port = nfsd_port;
467 }
468 nfs_port = port;
469 dprintk("Root-NFS: Portmapper on server returned %d "
470 "as nfsd port\n", port);
471 } 302 }
472 303
473 if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) { 304 if (root_nfs_data(nfs_root_parms) < 0)
474 printk(KERN_ERR "Root-NFS: Unable to get mountd port " 305 return -1;
475 "number from server, using default\n");
476 port = mountd_port;
477 }
478 mount_port = port;
479 dprintk("Root-NFS: mountd port is %d\n", port);
480 306
307 *root_device = nfs_root_device;
308 *root_data = nfs_root_options;
481 return 0; 309 return 0;
482} 310}
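
For orientation, a sketch of how generic boot code might consume this pair of strings; the "sketch"-suffixed names are illustrative stand-ins, not the actual init/do_mounts.c code:

    static int __init mount_nfs_root_sketch(void)
    {
            char *root_dev, *root_data;

            if (nfs_root_data(&root_dev, &root_data))
                    return 0;       /* let the caller try other root devices */
            /* e.g. root_dev  = "192.168.1.1:/export/client1"
             *      root_data = "rsize=4096,wsize=4096,nolock,addr=192.168.1.1"
             */
            return do_mount_root_sketch(root_dev, "nfs", root_data) == 0;
    }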
483
484
485/*
486 * Get a file handle from the server for the directory which is to be
487 * mounted.
488 */
489static int __init root_nfs_get_handle(void)
490{
491 struct sockaddr_in sin;
492 unsigned int auth_flav_len = 0;
493 struct nfs_mount_request request = {
494 .sap = (struct sockaddr *)&sin,
495 .salen = sizeof(sin),
496 .dirpath = nfs_export_path,
497 .version = (nfs_data.flags & NFS_MOUNT_VER3) ?
498 NFS_MNT3_VERSION : NFS_MNT_VERSION,
499 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
500 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
501 .auth_flav_len = &auth_flav_len,
502 };
503 int status = -ENOMEM;
504
505 request.fh = nfs_alloc_fhandle();
506 if (!request.fh)
507 goto out;
508 set_sockaddr(&sin, servaddr, htons(mount_port));
509 status = nfs_mount(&request);
510 if (status < 0)
511 printk(KERN_ERR "Root-NFS: Server returned error %d "
512 "while mounting %s\n", status, nfs_export_path);
513 else {
514 nfs_data.root.size = request.fh->size;
515 memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
516 }
517 nfs_free_fhandle(request.fh);
518out:
519 return status;
520}
521
522/*
523 * Get the NFS port numbers and file handle, and return the prepared 'data'
524 * argument for mount() if everything went OK. Return NULL otherwise.
525 */
526void * __init nfs_root_data(void)
527{
528 if (root_nfs_init() < 0
529 || root_nfs_ports() < 0
530 || root_nfs_get_handle() < 0)
531 return NULL;
532 set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port));
533 return (void*)&nfs_data;
534}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
new file mode 100644
index 000000000000..db773428f95f
--- /dev/null
+++ b/fs/nfs/pnfs.c
@@ -0,0 +1,783 @@
1/*
2 * pNFS functions to call and manage layout drivers.
3 *
4 * Copyright (c) 2002 [year of first publication]
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#include <linux/nfs_fs.h>
31#include "internal.h"
32#include "pnfs.h"
33
34#define NFSDBG_FACILITY NFSDBG_PNFS
35
36/* Locking:
37 *
38 * pnfs_spinlock:
39 * protects pnfs_modules_tbl.
40 */
41static DEFINE_SPINLOCK(pnfs_spinlock);
42
43/*
44 * pnfs_modules_tbl holds all pnfs modules
45 */
46static LIST_HEAD(pnfs_modules_tbl);
47
48/* Return the registered pnfs layout driver module matching given id */
49static struct pnfs_layoutdriver_type *
50find_pnfs_driver_locked(u32 id)
51{
52 struct pnfs_layoutdriver_type *local;
53
54 list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
55 if (local->id == id)
56 goto out;
57 local = NULL;
58out:
59 dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
60 return local;
61}
62
63static struct pnfs_layoutdriver_type *
64find_pnfs_driver(u32 id)
65{
66 struct pnfs_layoutdriver_type *local;
67
68 spin_lock(&pnfs_spinlock);
69 local = find_pnfs_driver_locked(id);
70 spin_unlock(&pnfs_spinlock);
71 return local;
72}
73
74void
75unset_pnfs_layoutdriver(struct nfs_server *nfss)
76{
77 if (nfss->pnfs_curr_ld) {
78 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner);
80 }
81 nfss->pnfs_curr_ld = NULL;
82}
83
84/*
85 * Try to set the server's pnfs module to the pnfs layout type specified by id.
86 * Currently only one pNFS layout driver per filesystem is supported.
87 *
88 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
89 */
90void
91set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
92{
93 struct pnfs_layoutdriver_type *ld_type = NULL;
94
95 if (id == 0)
96 goto out_no_driver;
97 if (!(server->nfs_client->cl_exchange_flags &
98 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
99 printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
100 id, server->nfs_client->cl_exchange_flags);
101 goto out_no_driver;
102 }
103 ld_type = find_pnfs_driver(id);
104 if (!ld_type) {
105 request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
106 ld_type = find_pnfs_driver(id);
107 if (!ld_type) {
108 dprintk("%s: No pNFS module found for %u.\n",
109 __func__, id);
110 goto out_no_driver;
111 }
112 }
113 if (!try_module_get(ld_type->owner)) {
114 dprintk("%s: Could not grab reference on module\n", __func__);
115 goto out_no_driver;
116 }
117 server->pnfs_curr_ld = ld_type;
118 if (ld_type->set_layoutdriver(server)) {
119 printk(KERN_ERR
120 "%s: Error initializing mount point for layout driver %u.\n",
121 __func__, id);
122 module_put(ld_type->owner);
123 goto out_no_driver;
124 }
125 dprintk("%s: pNFS module for %u set\n", __func__, id);
126 return;
127
128out_no_driver:
129 dprintk("%s: Using NFSv4 I/O\n", __func__);
130 server->pnfs_curr_ld = NULL;
131}
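
The registration contract enforced here and in pnfs_register_layoutdriver() below (a non-zero id plus mandatory alloc_lseg/free_lseg hooks, with set_layoutdriver invoked at mount time) can be sketched as a minimal hypothetical driver module. Every "example" identifier and the id value 0x99 are invented; a real module would also declare an alias matching the "nfs-layouttype4-<id>" pattern used by the request_module() call above:

    #include <linux/module.h>
    #include <linux/slab.h>
    #include <linux/nfs_fs.h>
    #include "pnfs.h"

    static int example_set_layoutdriver(struct nfs_server *server)
    {
            return 0;       /* nothing to set up in this sketch */
    }

    static int example_clear_layoutdriver(struct nfs_server *server)
    {
            return 0;
    }

    static struct pnfs_layout_segment *
    example_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr)
    {
            /* a real driver decodes the layout body from lgr here */
            return kzalloc(sizeof(struct pnfs_layout_segment), GFP_KERNEL);
    }

    static void example_free_lseg(struct pnfs_layout_segment *lseg)
    {
            kfree(lseg);
    }

    static struct pnfs_layoutdriver_type example_layoutdriver = {
            .id                 = 0x99,     /* hypothetical layout type; 0 is rejected */
            .name               = "example",
            .owner              = THIS_MODULE,
            .set_layoutdriver   = example_set_layoutdriver,
            .clear_layoutdriver = example_clear_layoutdriver,
            .alloc_lseg         = example_alloc_lseg,
            .free_lseg          = example_free_lseg,
    };

    static int __init example_init(void)
    {
            return pnfs_register_layoutdriver(&example_layoutdriver);
    }

    static void __exit example_exit(void)
    {
            pnfs_unregister_layoutdriver(&example_layoutdriver);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");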
132
133int
134pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
135{
136 int status = -EINVAL;
137 struct pnfs_layoutdriver_type *tmp;
138
139 if (ld_type->id == 0) {
140 printk(KERN_ERR "%s id 0 is reserved\n", __func__);
141 return status;
142 }
143 if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
144 printk(KERN_ERR "%s Layout driver must provide "
145 "alloc_lseg and free_lseg.\n", __func__);
146 return status;
147 }
148
149 spin_lock(&pnfs_spinlock);
150 tmp = find_pnfs_driver_locked(ld_type->id);
151 if (!tmp) {
152 list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
153 status = 0;
154 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
155 ld_type->name);
156 } else {
157 printk(KERN_ERR "%s Module with id %d already loaded!\n",
158 __func__, ld_type->id);
159 }
160 spin_unlock(&pnfs_spinlock);
161
162 return status;
163}
164EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
165
166void
167pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
168{
169 dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
170 spin_lock(&pnfs_spinlock);
171 list_del(&ld_type->pnfs_tblid);
172 spin_unlock(&pnfs_spinlock);
173}
174EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
175
176/*
177 * pNFS client layout cache
178 */
179
180static void
181get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
182{
183 assert_spin_locked(&lo->inode->i_lock);
184 lo->refcount++;
185}
186
187static void
188put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
189{
190 assert_spin_locked(&lo->inode->i_lock);
191 BUG_ON(lo->refcount == 0);
192
193 lo->refcount--;
194 if (!lo->refcount) {
195 dprintk("%s: freeing layout cache %p\n", __func__, lo);
196 BUG_ON(!list_empty(&lo->layouts));
197 NFS_I(lo->inode)->layout = NULL;
198 kfree(lo);
199 }
200}
201
202void
203put_layout_hdr(struct inode *inode)
204{
205 spin_lock(&inode->i_lock);
206 put_layout_hdr_locked(NFS_I(inode)->layout);
207 spin_unlock(&inode->i_lock);
208}
209
210static void
211init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
212{
213 INIT_LIST_HEAD(&lseg->fi_list);
214 kref_init(&lseg->kref);
215 lseg->layout = lo;
216}
217
218/* Called without i_lock held, as the free_lseg call may sleep */
219static void
220destroy_lseg(struct kref *kref)
221{
222 struct pnfs_layout_segment *lseg =
223 container_of(kref, struct pnfs_layout_segment, kref);
224 struct inode *ino = lseg->layout->inode;
225
226 dprintk("--> %s\n", __func__);
227 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
228 /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
229 put_layout_hdr(ino);
230}
231
232static void
233put_lseg(struct pnfs_layout_segment *lseg)
234{
235 if (!lseg)
236 return;
237
238 dprintk("%s: lseg %p ref %d\n", __func__, lseg,
239 atomic_read(&lseg->kref.refcount));
240 kref_put(&lseg->kref, destroy_lseg);
241}
242
243static void
244pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
245{
246 struct pnfs_layout_segment *lseg, *next;
247 struct nfs_client *clp;
248
249 dprintk("%s:Begin lo %p\n", __func__, lo);
250
251 assert_spin_locked(&lo->inode->i_lock);
252 list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
253 dprintk("%s: freeing lseg %p\n", __func__, lseg);
254 list_move(&lseg->fi_list, tmp_list);
255 }
256 clp = NFS_SERVER(lo->inode)->nfs_client;
257 spin_lock(&clp->cl_lock);
258 /* List does not take a reference, so no need for put here */
259 list_del_init(&lo->layouts);
260 spin_unlock(&clp->cl_lock);
261 write_seqlock(&lo->seqlock);
262 clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
263 write_sequnlock(&lo->seqlock);
264
265 dprintk("%s:Return\n", __func__);
266}
267
268static void
269pnfs_free_lseg_list(struct list_head *tmp_list)
270{
271 struct pnfs_layout_segment *lseg;
272
273 while (!list_empty(tmp_list)) {
274 lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
275 fi_list);
276 dprintk("%s calling put_lseg on %p\n", __func__, lseg);
277 list_del(&lseg->fi_list);
278 put_lseg(lseg);
279 }
280}
281
282void
283pnfs_destroy_layout(struct nfs_inode *nfsi)
284{
285 struct pnfs_layout_hdr *lo;
286 LIST_HEAD(tmp_list);
287
288 spin_lock(&nfsi->vfs_inode.i_lock);
289 lo = nfsi->layout;
290 if (lo) {
291 pnfs_clear_lseg_list(lo, &tmp_list);
292 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
293 put_layout_hdr_locked(lo);
294 }
295 spin_unlock(&nfsi->vfs_inode.i_lock);
296 pnfs_free_lseg_list(&tmp_list);
297}
298
299/*
300 * Called by the state manager to remove all layouts established under an
301 * expired lease.
302 */
303void
304pnfs_destroy_all_layouts(struct nfs_client *clp)
305{
306 struct pnfs_layout_hdr *lo;
307 LIST_HEAD(tmp_list);
308
309 spin_lock(&clp->cl_lock);
310 list_splice_init(&clp->cl_layouts, &tmp_list);
311 spin_unlock(&clp->cl_lock);
312
313 while (!list_empty(&tmp_list)) {
314 lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
315 layouts);
316 dprintk("%s freeing layout for inode %lu\n", __func__,
317 lo->inode->i_ino);
318 pnfs_destroy_layout(NFS_I(lo->inode));
319 }
320}
321
322/* update lo->stateid with new if it is more recent
323 *
324 * lo->stateid could be the open stateid, in which case we just use what is given.
325 */
326static void
327pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
328 const nfs4_stateid *new)
329{
330 nfs4_stateid *old = &lo->stateid;
331 bool overwrite = false;
332
333 write_seqlock(&lo->seqlock);
334 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
335 memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
336 overwrite = true;
337 else {
338 u32 oldseq, newseq;
339
340 oldseq = be32_to_cpu(old->stateid.seqid);
341 newseq = be32_to_cpu(new->stateid.seqid);
342 if ((int)(newseq - oldseq) > 0)
343 overwrite = true;
344 }
345 if (overwrite)
346 memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
347 write_sequnlock(&lo->seqlock);
348}
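
The "(int)(newseq - oldseq) > 0" test above is standard serial-number arithmetic: it still picks the newer stateid when the 32-bit seqid wraps around. A small user-space demonstration with arbitrary values:

    #include <stdio.h>
    #include <stdint.h>

    /* same wraparound-safe "is newer" test as pnfs_set_layout_stateid() */
    static int seqid_is_newer(uint32_t newseq, uint32_t oldseq)
    {
            return (int32_t)(newseq - oldseq) > 0;
    }

    int main(void)
    {
            printf("%d\n", seqid_is_newer(6, 5));           /* 1: newer */
            printf("%d\n", seqid_is_newer(3, 5));           /* 0: older */
            printf("%d\n", seqid_is_newer(1, 0xffffffffu)); /* 1: newer across the wrap */
            return 0;
    }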
349
350static void
351pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
352 struct nfs4_state *state)
353{
354 int seq;
355
356 dprintk("--> %s\n", __func__);
357 write_seqlock(&lo->seqlock);
358 do {
359 seq = read_seqbegin(&state->seqlock);
360 memcpy(lo->stateid.data, state->stateid.data,
361 sizeof(state->stateid.data));
362 } while (read_seqretry(&state->seqlock, seq));
363 set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
364 write_sequnlock(&lo->seqlock);
365 dprintk("<-- %s\n", __func__);
366}
367
368void
369pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
370 struct nfs4_state *open_state)
371{
372 int seq;
373
374 dprintk("--> %s\n", __func__);
375 do {
376 seq = read_seqbegin(&lo->seqlock);
377 if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
378 /* This will trigger retry of the read */
379 pnfs_layout_from_open_stateid(lo, open_state);
380 } else
381 memcpy(dst->data, lo->stateid.data,
382 sizeof(lo->stateid.data));
383 } while (read_seqretry(&lo->seqlock, seq));
384 dprintk("<-- %s\n", __func__);
385}
386
387/*
388 * Get layout from server.
389 * For now, assume that whole-file layouts are requested:
390 * arg->offset: 0
391 * arg->length: all ones
392 */
393static struct pnfs_layout_segment *
394send_layoutget(struct pnfs_layout_hdr *lo,
395 struct nfs_open_context *ctx,
396 u32 iomode)
397{
398 struct inode *ino = lo->inode;
399 struct nfs_server *server = NFS_SERVER(ino);
400 struct nfs4_layoutget *lgp;
401 struct pnfs_layout_segment *lseg = NULL;
402
403 dprintk("--> %s\n", __func__);
404
405 BUG_ON(ctx == NULL);
406 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
407 if (lgp == NULL) {
408 put_layout_hdr(lo->inode);
409 return NULL;
410 }
411 lgp->args.minlength = NFS4_MAX_UINT64;
412 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
413 lgp->args.range.iomode = iomode;
414 lgp->args.range.offset = 0;
415 lgp->args.range.length = NFS4_MAX_UINT64;
416 lgp->args.type = server->pnfs_curr_ld->id;
417 lgp->args.inode = ino;
418 lgp->args.ctx = get_nfs_open_context(ctx);
419 lgp->lsegpp = &lseg;
420
421 /* Synchronously retrieve layout information from server and
422 * store in lseg.
423 */
424 nfs4_proc_layoutget(lgp);
425 if (!lseg) {
426 /* remember that LAYOUTGET failed and suspend trying */
427 set_bit(lo_fail_bit(iomode), &lo->state);
428 }
429 return lseg;
430}
431
432/*
433 * Compare two layout segments for sorting into layout cache.
434 * We want to preferentially return RW over RO layouts, so ensure those
435 * are seen first.
436 */
437static s64
438cmp_layout(u32 iomode1, u32 iomode2)
439{
440 /* read > read/write */
441 return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
442}
443
444static void
445pnfs_insert_layout(struct pnfs_layout_hdr *lo,
446 struct pnfs_layout_segment *lseg)
447{
448 struct pnfs_layout_segment *lp;
449 int found = 0;
450
451 dprintk("%s:Begin\n", __func__);
452
453 assert_spin_locked(&lo->inode->i_lock);
454 if (list_empty(&lo->segs)) {
455 struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
456
457 spin_lock(&clp->cl_lock);
458 BUG_ON(!list_empty(&lo->layouts));
459 list_add_tail(&lo->layouts, &clp->cl_layouts);
460 spin_unlock(&clp->cl_lock);
461 }
462 list_for_each_entry(lp, &lo->segs, fi_list) {
463 if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
464 continue;
465 list_add_tail(&lseg->fi_list, &lp->fi_list);
466 dprintk("%s: inserted lseg %p "
467 "iomode %d offset %llu length %llu before "
468 "lp %p iomode %d offset %llu length %llu\n",
469 __func__, lseg, lseg->range.iomode,
470 lseg->range.offset, lseg->range.length,
471 lp, lp->range.iomode, lp->range.offset,
472 lp->range.length);
473 found = 1;
474 break;
475 }
476 if (!found) {
477 list_add_tail(&lseg->fi_list, &lo->segs);
478 dprintk("%s: inserted lseg %p "
479 "iomode %d offset %llu length %llu at tail\n",
480 __func__, lseg, lseg->range.iomode,
481 lseg->range.offset, lseg->range.length);
482 }
483 get_layout_hdr_locked(lo);
484
485 dprintk("%s:Return\n", __func__);
486}
487
488static struct pnfs_layout_hdr *
489alloc_init_layout_hdr(struct inode *ino)
490{
491 struct pnfs_layout_hdr *lo;
492
493 lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
494 if (!lo)
495 return NULL;
496 lo->refcount = 1;
497 INIT_LIST_HEAD(&lo->layouts);
498 INIT_LIST_HEAD(&lo->segs);
499 seqlock_init(&lo->seqlock);
500 lo->inode = ino;
501 return lo;
502}
503
504static struct pnfs_layout_hdr *
505pnfs_find_alloc_layout(struct inode *ino)
506{
507 struct nfs_inode *nfsi = NFS_I(ino);
508 struct pnfs_layout_hdr *new = NULL;
509
510 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
511
512 assert_spin_locked(&ino->i_lock);
513 if (nfsi->layout)
514 return nfsi->layout;
515
516 spin_unlock(&ino->i_lock);
517 new = alloc_init_layout_hdr(ino);
518 spin_lock(&ino->i_lock);
519
520 if (likely(nfsi->layout == NULL)) /* Won the race? */
521 nfsi->layout = new;
522 else
523 kfree(new);
524 return nfsi->layout;
525}
526
527/*
528 * iomode matching rules:
529 * iomode lseg match
530 * ----- ----- -----
531 * ANY READ true
532 * ANY RW true
533 * RW READ false
534 * RW RW true
535 * READ READ true
536 * READ RW true
537 */
538static int
539is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
540{
541 return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
542}
543
544/*
545 * lookup range in layout
546 */
547static struct pnfs_layout_segment *
548pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
549{
550 struct pnfs_layout_segment *lseg, *ret = NULL;
551
552 dprintk("%s:Begin\n", __func__);
553
554 assert_spin_locked(&lo->inode->i_lock);
555 list_for_each_entry(lseg, &lo->segs, fi_list) {
556 if (is_matching_lseg(lseg, iomode)) {
557 ret = lseg;
558 break;
559 }
560 if (cmp_layout(iomode, lseg->range.iomode) > 0)
561 break;
562 }
563
564 dprintk("%s:Return lseg %p ref %d\n",
565 __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
566 return ret;
567}
568
569/*
570 * Layout segment is retrieved from the server if not cached.
571 * The appropriate layout segment is referenced and returned to the caller.
572 */
573struct pnfs_layout_segment *
574pnfs_update_layout(struct inode *ino,
575 struct nfs_open_context *ctx,
576 enum pnfs_iomode iomode)
577{
578 struct nfs_inode *nfsi = NFS_I(ino);
579 struct pnfs_layout_hdr *lo;
580 struct pnfs_layout_segment *lseg = NULL;
581
582 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
583 return NULL;
584 spin_lock(&ino->i_lock);
585 lo = pnfs_find_alloc_layout(ino);
586 if (lo == NULL) {
587 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
588 goto out_unlock;
589 }
590
591 /* Check to see if the layout for the given range already exists */
592 lseg = pnfs_has_layout(lo, iomode);
593 if (lseg) {
594 dprintk("%s: Using cached lseg %p for iomode %d)\n",
595 __func__, lseg, iomode);
596 goto out_unlock;
597 }
598
599 /* if LAYOUTGET already failed once we don't try again */
600 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
601 goto out_unlock;
602
603 get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
604 spin_unlock(&ino->i_lock);
605
606 lseg = send_layoutget(lo, ctx, iomode);
607out:
608 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
609 nfsi->layout->state, lseg);
610 return lseg;
611out_unlock:
612 spin_unlock(&ino->i_lock);
613 goto out;
614}
615
616int
617pnfs_layout_process(struct nfs4_layoutget *lgp)
618{
619 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
620 struct nfs4_layoutget_res *res = &lgp->res;
621 struct pnfs_layout_segment *lseg;
622 struct inode *ino = lo->inode;
623 int status = 0;
624
625 /* Inject layout blob into I/O device driver */
626 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
627 if (!lseg || IS_ERR(lseg)) {
628 if (!lseg)
629 status = -ENOMEM;
630 else
631 status = PTR_ERR(lseg);
632 dprintk("%s: Could not allocate layout: error %d\n",
633 __func__, status);
634 goto out;
635 }
636
637 spin_lock(&ino->i_lock);
638 init_lseg(lo, lseg);
639 lseg->range = res->range;
640 *lgp->lsegpp = lseg;
641 pnfs_insert_layout(lo, lseg);
642
643 /* Done processing layoutget. Set the layout stateid */
644 pnfs_set_layout_stateid(lo, &res->stateid);
645 spin_unlock(&ino->i_lock);
646out:
647 return status;
648}
649
650/*
651 * Device ID cache. Currently supports one layout type per struct nfs_client.
652 * Add layout type to the lookup key to expand to support multiple types.
653 */
654int
655pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
656 void (*free_callback)(struct pnfs_deviceid_node *))
657{
658 struct pnfs_deviceid_cache *c;
659
660 c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
661 if (!c)
662 return -ENOMEM;
663 spin_lock(&clp->cl_lock);
664 if (clp->cl_devid_cache != NULL) {
665 atomic_inc(&clp->cl_devid_cache->dc_ref);
666 dprintk("%s [kref [%d]]\n", __func__,
667 atomic_read(&clp->cl_devid_cache->dc_ref));
668 kfree(c);
669 } else {
670 /* kzalloc initializes hlists */
671 spin_lock_init(&c->dc_lock);
672 atomic_set(&c->dc_ref, 1);
673 c->dc_free_callback = free_callback;
674 clp->cl_devid_cache = c;
675 dprintk("%s [new]\n", __func__);
676 }
677 spin_unlock(&clp->cl_lock);
678 return 0;
679}
680EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
681
682/*
683 * Called from pnfs_layoutdriver_type->free_lseg
684 * last layout segment reference frees deviceid
685 */
686void
687pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
688 struct pnfs_deviceid_node *devid)
689{
690 struct nfs4_deviceid *id = &devid->de_id;
691 struct pnfs_deviceid_node *d;
692 struct hlist_node *n;
693 long h = nfs4_deviceid_hash(id);
694
695 dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
696 if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
697 return;
698
699 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
700 if (!memcmp(&d->de_id, id, sizeof(*id))) {
701 hlist_del_rcu(&d->de_node);
702 spin_unlock(&c->dc_lock);
703 synchronize_rcu();
704 c->dc_free_callback(devid);
705 return;
706 }
707 spin_unlock(&c->dc_lock);
708 /* Why wasn't it found in the list? */
709 BUG();
710}
711EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
712
713/* Find and reference a deviceid */
714struct pnfs_deviceid_node *
715pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
716{
717 struct pnfs_deviceid_node *d;
718 struct hlist_node *n;
719 long hash = nfs4_deviceid_hash(id);
720
721 dprintk("--> %s hash %ld\n", __func__, hash);
722 rcu_read_lock();
723 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
724 if (!memcmp(&d->de_id, id, sizeof(*id))) {
725 if (!atomic_inc_not_zero(&d->de_ref)) {
726 goto fail;
727 } else {
728 rcu_read_unlock();
729 return d;
730 }
731 }
732 }
733fail:
734 rcu_read_unlock();
735 return NULL;
736}
737EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
738
739/*
740 * Add a deviceid to the cache.
741 * GETDEVICEINFOs for the same deviceid can race. If the deviceid is already cached, discard the new node.
742 */
743struct pnfs_deviceid_node *
744pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
745{
746 struct pnfs_deviceid_node *d;
747 long hash = nfs4_deviceid_hash(&new->de_id);
748
749 dprintk("--> %s hash %ld\n", __func__, hash);
750 spin_lock(&c->dc_lock);
751 d = pnfs_find_get_deviceid(c, &new->de_id);
752 if (d) {
753 spin_unlock(&c->dc_lock);
754 dprintk("%s [discard]\n", __func__);
755 c->dc_free_callback(new);
756 return d;
757 }
758 INIT_HLIST_NODE(&new->de_node);
759 atomic_set(&new->de_ref, 1);
760 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
761 spin_unlock(&c->dc_lock);
762 dprintk("%s [new]\n", __func__);
763 return new;
764}
765EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
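
Putting the pieces together, a hypothetical layout driver would create the cache once with pnfs_alloc_init_deviceid_cache(clp, example_free_deviceid) as above, then do lookup-or-insert as below. struct example_devid and its helpers are invented for illustration; a real driver would fill the new node from a GETDEVICEINFO reply:

    struct example_devid {
            struct pnfs_deviceid_node node;
            /* driver-private device description would follow */
    };

    static void example_free_deviceid(struct pnfs_deviceid_node *d)
    {
            kfree(container_of(d, struct example_devid, node));
    }

    static struct pnfs_deviceid_node *
    example_lookup_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
    {
            struct pnfs_deviceid_node *d;
            struct example_devid *new;

            d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
            if (d)
                    return d;               /* cache hit, reference already taken */

            new = kzalloc(sizeof(*new), GFP_KERNEL);
            if (new == NULL)
                    return NULL;
            new->node.de_id = *id;
            /* if a racing GETDEVICEINFO won, this frees ours and returns the cached node */
            return pnfs_add_deviceid(clp->cl_devid_cache, &new->node);
    }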
766
767void
768pnfs_put_deviceid_cache(struct nfs_client *clp)
769{
770 struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
771
772 dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
773 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
774 int i;
775 /* Verify cache is empty */
776 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
777 BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
778 clp->cl_devid_cache = NULL;
779 spin_unlock(&clp->cl_lock);
780 kfree(local);
781 }
782}
783EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
new file mode 100644
index 000000000000..e12367d50489
--- /dev/null
+++ b/fs/nfs/pnfs.h
@@ -0,0 +1,189 @@
1/*
2 * pNFS client data structures.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H
32
33struct pnfs_layout_segment {
34 struct list_head fi_list;
35 struct pnfs_layout_range range;
36 struct kref kref;
37 struct pnfs_layout_hdr *layout;
38};
39
40#ifdef CONFIG_NFS_V4_1
41
42#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
43
44enum {
45 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed, stop trying */
46 NFS_LAYOUT_RW_FAILED, /* get rw layout failed, stop trying */
47 NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */
48};
49
50/* Per-layout driver specific registration structure */
51struct pnfs_layoutdriver_type {
52 struct list_head pnfs_tblid;
53 const u32 id;
54 const char *name;
55 struct module *owner;
56 int (*set_layoutdriver) (struct nfs_server *);
57 int (*clear_layoutdriver) (struct nfs_server *);
58 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
59 void (*free_lseg) (struct pnfs_layout_segment *lseg);
60};
61
62struct pnfs_layout_hdr {
63 unsigned long refcount;
64 struct list_head layouts; /* other client layouts */
65 struct list_head segs; /* layout segments list */
66 seqlock_t seqlock; /* Protects the stateid */
67 nfs4_stateid stateid;
68 unsigned long state;
69 struct inode *inode;
70};
71
72struct pnfs_device {
73 struct nfs4_deviceid dev_id;
74 unsigned int layout_type;
75 unsigned int mincount;
76 struct page **pages;
77 void *area;
78 unsigned int pgbase;
79 unsigned int pglen;
80};
81
82/*
83 * Device ID RCU cache. A device ID is unique per client ID and layout type.
84 */
85#define NFS4_DEVICE_ID_HASH_BITS 5
86#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
87#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
88
89static inline u32
90nfs4_deviceid_hash(struct nfs4_deviceid *id)
91{
92 unsigned char *cptr = (unsigned char *)id->data;
93 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
94 u32 x = 0;
95
96 while (nbytes--) {
97 x *= 37;
98 x += *cptr++;
99 }
100 return x & NFS4_DEVICE_ID_HASH_MASK;
101}
102
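
The hash above is a plain multiply-by-37 byte hash folded into 32 buckets. A user-space rendering (the 16-byte size mirrors NFS4_DEVICEID4_SIZE, and the sample id bytes are arbitrary):

    #include <stdio.h>
    #include <stdint.h>

    #define DEVICEID_SIZE 16                /* NFS4_DEVICEID4_SIZE */
    #define HASH_MASK     ((1u << 5) - 1)   /* 32 hash buckets */

    /* same multiply-by-37 byte hash as nfs4_deviceid_hash() */
    static uint32_t deviceid_hash(const unsigned char *id)
    {
            uint32_t x = 0;
            unsigned int i;

            for (i = 0; i < DEVICEID_SIZE; i++)
                    x = x * 37 + id[i];
            return x & HASH_MASK;
    }

    int main(void)
    {
            unsigned char id[DEVICEID_SIZE] = { 0xde, 0xad, 0xbe, 0xef };
            printf("bucket %u of 32\n", deviceid_hash(id));
            return 0;
    }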
103struct pnfs_deviceid_node {
104 struct hlist_node de_node;
105 struct nfs4_deviceid de_id;
106 atomic_t de_ref;
107};
108
109struct pnfs_deviceid_cache {
110 spinlock_t dc_lock;
111 atomic_t dc_ref;
112 void (*dc_free_callback)(struct pnfs_deviceid_node *);
113 struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
114};
115
116extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
117 void (*free_callback)(struct pnfs_deviceid_node *));
118extern void pnfs_put_deviceid_cache(struct nfs_client *);
119extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
120 struct pnfs_deviceid_cache *,
121 struct nfs4_deviceid *);
122extern struct pnfs_deviceid_node *pnfs_add_deviceid(
123 struct pnfs_deviceid_cache *,
124 struct pnfs_deviceid_node *);
125extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
126 struct pnfs_deviceid_node *devid);
127
128extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
129extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
130
131/* nfs4proc.c */
132extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
133 struct pnfs_device *dev);
134extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
135
136/* pnfs.c */
137struct pnfs_layout_segment *
138pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
139 enum pnfs_iomode access_type);
140void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
141void unset_pnfs_layoutdriver(struct nfs_server *);
142int pnfs_layout_process(struct nfs4_layoutget *lgp);
143void pnfs_destroy_layout(struct nfs_inode *);
144void pnfs_destroy_all_layouts(struct nfs_client *);
145void put_layout_hdr(struct inode *inode);
146void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
147 struct nfs4_state *open_state);
148
149
150static inline int lo_fail_bit(u32 iomode)
151{
152 return iomode == IOMODE_RW ?
153 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
154}
155
156/* Return true if a layout driver is being used for this mountpoint */
157static inline int pnfs_enabled_sb(struct nfs_server *nfss)
158{
159 return nfss->pnfs_curr_ld != NULL;
160}
161
162#else /* CONFIG_NFS_V4_1 */
163
164static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
165{
166}
167
168static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
169{
170}
171
172static inline struct pnfs_layout_segment *
173pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
174 enum pnfs_iomode access_type)
175{
176 return NULL;
177}
178
179static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
180{
181}
182
183static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
184{
185}
186
187#endif /* CONFIG_NFS_V4_1 */
188
189#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 611bec22f552..58e7f84fc1fd 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -258,7 +258,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data)
258 258
259static int 259static int
260nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 260nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
261 int flags, struct nameidata *nd) 261 int flags, struct nfs_open_context *ctx)
262{ 262{
263 struct nfs_createdata *data; 263 struct nfs_createdata *data;
264 struct rpc_message msg = { 264 struct rpc_message msg = {
@@ -365,17 +365,32 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
365 return 1; 365 return 1;
366} 366}
367 367
368static void
369nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
370{
371 msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
372}
373
374static int
375nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
376 struct inode *new_dir)
377{
378 if (nfs_async_handle_expired_key(task))
379 return 0;
380 nfs_mark_for_revalidate(old_dir);
381 nfs_mark_for_revalidate(new_dir);
382 return 1;
383}
384
368static int 385static int
369nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, 386nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
370 struct inode *new_dir, struct qstr *new_name) 387 struct inode *new_dir, struct qstr *new_name)
371{ 388{
372 struct nfs_renameargs arg = { 389 struct nfs_renameargs arg = {
373 .fromfh = NFS_FH(old_dir), 390 .old_dir = NFS_FH(old_dir),
374 .fromname = old_name->name, 391 .old_name = old_name,
375 .fromlen = old_name->len, 392 .new_dir = NFS_FH(new_dir),
376 .tofh = NFS_FH(new_dir), 393 .new_name = new_name,
377 .toname = new_name->name,
378 .tolen = new_name->len
379 }; 394 };
380 struct rpc_message msg = { 395 struct rpc_message msg = {
381 .rpc_proc = &nfs_procedures[NFSPROC_RENAME], 396 .rpc_proc = &nfs_procedures[NFSPROC_RENAME],
@@ -519,14 +534,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name)
519 */ 534 */
520static int 535static int
521nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, 536nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
522 u64 cookie, struct page *page, unsigned int count, int plus) 537 u64 cookie, struct page **pages, unsigned int count, int plus)
523{ 538{
524 struct inode *dir = dentry->d_inode; 539 struct inode *dir = dentry->d_inode;
525 struct nfs_readdirargs arg = { 540 struct nfs_readdirargs arg = {
526 .fh = NFS_FH(dir), 541 .fh = NFS_FH(dir),
527 .cookie = cookie, 542 .cookie = cookie,
528 .count = count, 543 .count = count,
529 .pages = &page, 544 .pages = pages,
530 }; 545 };
531 struct rpc_message msg = { 546 struct rpc_message msg = {
532 .rpc_proc = &nfs_procedures[NFSPROC_READDIR], 547 .rpc_proc = &nfs_procedures[NFSPROC_READDIR],
@@ -705,6 +720,8 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
705 .unlink_setup = nfs_proc_unlink_setup, 720 .unlink_setup = nfs_proc_unlink_setup,
706 .unlink_done = nfs_proc_unlink_done, 721 .unlink_done = nfs_proc_unlink_done,
707 .rename = nfs_proc_rename, 722 .rename = nfs_proc_rename,
723 .rename_setup = nfs_proc_rename_setup,
724 .rename_done = nfs_proc_rename_done,
708 .link = nfs_proc_link, 725 .link = nfs_proc_link,
709 .symlink = nfs_proc_symlink, 726 .symlink = nfs_proc_symlink,
710 .mkdir = nfs_proc_mkdir, 727 .mkdir = nfs_proc_mkdir,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 87adc2744246..e4b62c6f5a6e 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -25,6 +25,7 @@
25#include "internal.h" 25#include "internal.h"
26#include "iostat.h" 26#include "iostat.h"
27#include "fscache.h" 27#include "fscache.h"
28#include "pnfs.h"
28 29
29#define NFSDBG_FACILITY NFSDBG_PAGECACHE 30#define NFSDBG_FACILITY NFSDBG_PAGECACHE
30 31
@@ -46,7 +47,6 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
46 memset(p, 0, sizeof(*p)); 47 memset(p, 0, sizeof(*p));
47 INIT_LIST_HEAD(&p->pages); 48 INIT_LIST_HEAD(&p->pages);
48 p->npages = pagecount; 49 p->npages = pagecount;
49 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
50 if (pagecount <= ARRAY_SIZE(p->page_array)) 50 if (pagecount <= ARRAY_SIZE(p->page_array))
51 p->pagevec = p->page_array; 51 p->pagevec = p->page_array;
52 else { 52 else {
@@ -121,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
121 len = nfs_page_length(page); 121 len = nfs_page_length(page);
122 if (len == 0) 122 if (len == 0)
123 return nfs_return_empty_page(page); 123 return nfs_return_empty_page(page);
124 pnfs_update_layout(inode, ctx, IOMODE_READ);
124 new = nfs_create_request(ctx, inode, page, 0, len); 125 new = nfs_create_request(ctx, inode, page, 0, len);
125 if (IS_ERR(new)) { 126 if (IS_ERR(new)) {
126 unlock_page(page); 127 unlock_page(page);
@@ -625,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
625 if (ret == 0) 626 if (ret == 0)
626 goto read_complete; /* all pages were read */ 627 goto read_complete; /* all pages were read */
627 628
629 pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
628 if (rsize < PAGE_CACHE_SIZE) 630 if (rsize < PAGE_CACHE_SIZE)
629 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 631 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
630 else 632 else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f4cbf0c306c6..3600ec700d58 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -100,6 +100,7 @@ enum {
100 Opt_addr, Opt_mountaddr, Opt_clientaddr, 100 Opt_addr, Opt_mountaddr, Opt_clientaddr,
101 Opt_lookupcache, 101 Opt_lookupcache,
102 Opt_fscache_uniq, 102 Opt_fscache_uniq,
103 Opt_local_lock,
103 104
104 /* Special mount options */ 105 /* Special mount options */
105 Opt_userspace, Opt_deprecated, Opt_sloppy, 106 Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -171,6 +172,7 @@ static const match_table_t nfs_mount_option_tokens = {
171 172
172 { Opt_lookupcache, "lookupcache=%s" }, 173 { Opt_lookupcache, "lookupcache=%s" },
173 { Opt_fscache_uniq, "fsc=%s" }, 174 { Opt_fscache_uniq, "fsc=%s" },
175 { Opt_local_lock, "local_lock=%s" },
174 176
175 { Opt_err, NULL } 177 { Opt_err, NULL }
176}; 178};
@@ -236,6 +238,22 @@ static match_table_t nfs_lookupcache_tokens = {
236 { Opt_lookupcache_err, NULL } 238 { Opt_lookupcache_err, NULL }
237}; 239};
238 240
241enum {
242 Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix,
243 Opt_local_lock_none,
244
245 Opt_local_lock_err
246};
247
248static match_table_t nfs_local_lock_tokens = {
249 { Opt_local_lock_all, "all" },
250 { Opt_local_lock_flock, "flock" },
251 { Opt_local_lock_posix, "posix" },
252 { Opt_local_lock_none, "none" },
253
254 { Opt_local_lock_err, NULL }
255};
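
For illustration, the resulting mount invocations (server name and export path are hypothetical):

    mount -t nfs -o local_lock=flock server:/export /mnt   # flock() locks handled locally
    mount -t nfs -o local_lock=none  server:/export /mnt   # all locking goes to the server
    mount -t nfs -o nolock           server:/export /mnt   # implies local_lock=all, per the
                                                           # Opt_nolock handling further down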
256
239 257
240static void nfs_umount_begin(struct super_block *); 258static void nfs_umount_begin(struct super_block *);
241static int nfs_statfs(struct dentry *, struct kstatfs *); 259static int nfs_statfs(struct dentry *, struct kstatfs *);
@@ -622,6 +640,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
622 const struct proc_nfs_info *nfs_infop; 640 const struct proc_nfs_info *nfs_infop;
623 struct nfs_client *clp = nfss->nfs_client; 641 struct nfs_client *clp = nfss->nfs_client;
624 u32 version = clp->rpc_ops->version; 642 u32 version = clp->rpc_ops->version;
643 int local_flock, local_fcntl;
625 644
626 seq_printf(m, ",vers=%u", version); 645 seq_printf(m, ",vers=%u", version);
627 seq_printf(m, ",rsize=%u", nfss->rsize); 646 seq_printf(m, ",rsize=%u", nfss->rsize);
@@ -670,6 +689,18 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
670 else 689 else
671 seq_printf(m, ",lookupcache=pos"); 690 seq_printf(m, ",lookupcache=pos");
672 } 691 }
692
693 local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
694 local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
695
696 if (!local_flock && !local_fcntl)
697 seq_printf(m, ",local_lock=none");
698 else if (local_flock && local_fcntl)
699 seq_printf(m, ",local_lock=all");
700 else if (local_flock)
701 seq_printf(m, ",local_lock=flock");
702 else
703 seq_printf(m, ",local_lock=posix");
673} 704}
674 705
675/* 706/*
@@ -1017,9 +1048,13 @@ static int nfs_parse_mount_options(char *raw,
1017 break; 1048 break;
1018 case Opt_lock: 1049 case Opt_lock:
1019 mnt->flags &= ~NFS_MOUNT_NONLM; 1050 mnt->flags &= ~NFS_MOUNT_NONLM;
1051 mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
1052 NFS_MOUNT_LOCAL_FCNTL);
1020 break; 1053 break;
1021 case Opt_nolock: 1054 case Opt_nolock:
1022 mnt->flags |= NFS_MOUNT_NONLM; 1055 mnt->flags |= NFS_MOUNT_NONLM;
1056 mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
1057 NFS_MOUNT_LOCAL_FCNTL);
1023 break; 1058 break;
1024 case Opt_v2: 1059 case Opt_v2:
1025 mnt->flags &= ~NFS_MOUNT_VER3; 1060 mnt->flags &= ~NFS_MOUNT_VER3;
@@ -1420,6 +1455,34 @@ static int nfs_parse_mount_options(char *raw,
1420 mnt->fscache_uniq = string; 1455 mnt->fscache_uniq = string;
1421 mnt->options |= NFS_OPTION_FSCACHE; 1456 mnt->options |= NFS_OPTION_FSCACHE;
1422 break; 1457 break;
1458 case Opt_local_lock:
1459 string = match_strdup(args);
1460 if (string == NULL)
1461 goto out_nomem;
1462 token = match_token(string, nfs_local_lock_tokens,
1463 args);
1464 kfree(string);
1465 switch (token) {
1466 case Opt_local_lock_all:
1467 mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
1468 NFS_MOUNT_LOCAL_FCNTL);
1469 break;
1470 case Opt_local_lock_flock:
1471 mnt->flags |= NFS_MOUNT_LOCAL_FLOCK;
1472 break;
1473 case Opt_local_lock_posix:
1474 mnt->flags |= NFS_MOUNT_LOCAL_FCNTL;
1475 break;
1476 case Opt_local_lock_none:
1477 mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
1478 NFS_MOUNT_LOCAL_FCNTL);
1479 break;
1480 default:
1481 dfprintk(MOUNT, "NFS: invalid "
1482 "local_lock argument\n");
1483 return 0;
1484 }
1485 break;
1423 1486
1424 /* 1487 /*
1425 * Special options 1488 * Special options
@@ -1825,6 +1888,12 @@ static int nfs_validate_mount_data(void *options,
1825 if (!args->nfs_server.hostname) 1888 if (!args->nfs_server.hostname)
1826 goto out_nomem; 1889 goto out_nomem;
1827 1890
1891 if (!(data->flags & NFS_MOUNT_NONLM))
1892 args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK|
1893 NFS_MOUNT_LOCAL_FCNTL);
1894 else
1895 args->flags |= (NFS_MOUNT_LOCAL_FLOCK|
1896 NFS_MOUNT_LOCAL_FCNTL);
1828 /* 1897 /*
1829 * The legacy version 6 binary mount data from userspace has a 1898 * The legacy version 6 binary mount data from userspace has a
1830 * field used only to transport selinux information into the 1899 * field used only to transport selinux information into the
@@ -2441,7 +2510,8 @@ static void nfs4_fill_super(struct super_block *sb)
2441 2510
2442static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) 2511static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
2443{ 2512{
2444 args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); 2513 args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3|
2514 NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);
2445} 2515}
2446 2516
2447static int nfs4_validate_text_mount_data(void *options, 2517static int nfs4_validate_text_mount_data(void *options,
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index ad4d2e787b20..978aaeb8a093 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,6 +32,7 @@ static ctl_table nfs_cb_sysctls[] = {
32 .extra1 = (int *)&nfs_set_port_min, 32 .extra1 = (int *)&nfs_set_port_min,
33 .extra2 = (int *)&nfs_set_port_max, 33 .extra2 = (int *)&nfs_set_port_max,
34 }, 34 },
35#ifndef CONFIG_NFS_USE_NEW_IDMAPPER
35 { 36 {
36 .procname = "idmap_cache_timeout", 37 .procname = "idmap_cache_timeout",
37 .data = &nfs_idmap_cache_timeout, 38 .data = &nfs_idmap_cache_timeout,
@@ -39,6 +40,7 @@ static ctl_table nfs_cb_sysctls[] = {
39 .mode = 0644, 40 .mode = 0644,
40 .proc_handler = proc_dointvec_jiffies, 41 .proc_handler = proc_dointvec_jiffies,
41 }, 42 },
43#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
42#endif 44#endif
43 { 45 {
44 .procname = "nfs_mountpoint_timeout", 46 .procname = "nfs_mountpoint_timeout",
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 2f84adaad427..9a16bad5d2ea 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -13,9 +13,12 @@
13#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/wait.h> 15#include <linux/wait.h>
16#include <linux/namei.h>
16 17
17#include "internal.h" 18#include "internal.h"
18#include "nfs4_fs.h" 19#include "nfs4_fs.h"
20#include "iostat.h"
21#include "delegation.h"
19 22
20struct nfs_unlinkdata { 23struct nfs_unlinkdata {
21 struct hlist_node list; 24 struct hlist_node list;
@@ -244,7 +247,7 @@ void nfs_unblock_sillyrename(struct dentry *dentry)
244 * @dir: parent directory of dentry 247 * @dir: parent directory of dentry
245 * @dentry: dentry to unlink 248 * @dentry: dentry to unlink
246 */ 249 */
247int 250static int
248nfs_async_unlink(struct inode *dir, struct dentry *dentry) 251nfs_async_unlink(struct inode *dir, struct dentry *dentry)
249{ 252{
250 struct nfs_unlinkdata *data; 253 struct nfs_unlinkdata *data;
@@ -259,7 +262,6 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
259 status = PTR_ERR(data->cred); 262 status = PTR_ERR(data->cred);
260 goto out_free; 263 goto out_free;
261 } 264 }
262 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
263 data->res.dir_attr = &data->dir_attr; 265 data->res.dir_attr = &data->dir_attr;
264 266
265 status = -EBUSY; 267 status = -EBUSY;
@@ -303,3 +305,256 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
303 if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))) 305 if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
304 nfs_free_unlinkdata(data); 306 nfs_free_unlinkdata(data);
305} 307}
308
309/* Cancel a queued async unlink. Called when a sillyrename run fails. */
310static void
311nfs_cancel_async_unlink(struct dentry *dentry)
312{
313 spin_lock(&dentry->d_lock);
314 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
315 struct nfs_unlinkdata *data = dentry->d_fsdata;
316
317 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
318 spin_unlock(&dentry->d_lock);
319 nfs_free_unlinkdata(data);
320 return;
321 }
322 spin_unlock(&dentry->d_lock);
323}
324
325struct nfs_renamedata {
326 struct nfs_renameargs args;
327 struct nfs_renameres res;
328 struct rpc_cred *cred;
329 struct inode *old_dir;
330 struct dentry *old_dentry;
331 struct nfs_fattr old_fattr;
332 struct inode *new_dir;
333 struct dentry *new_dentry;
334 struct nfs_fattr new_fattr;
335};
336
337/**
338 * nfs_async_rename_done - Sillyrename post-processing
339 * @task: rpc_task of the sillyrename
340 * @calldata: nfs_renamedata for the sillyrename
341 *
342 * Do the directory attribute updates and the d_move
343 */
344static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
345{
346 struct nfs_renamedata *data = calldata;
347 struct inode *old_dir = data->old_dir;
348 struct inode *new_dir = data->new_dir;
349
350 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
351 nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
352 return;
353 }
354
355 if (task->tk_status != 0) {
356 nfs_cancel_async_unlink(data->old_dentry);
357 return;
358 }
359
360 nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir));
361 d_move(data->old_dentry, data->new_dentry);
362}
363
364/**
365 * nfs_async_rename_release - Release the sillyrename data.
366 * @calldata: the struct nfs_renamedata to be released
367 */
368static void nfs_async_rename_release(void *calldata)
369{
370 struct nfs_renamedata *data = calldata;
371 struct super_block *sb = data->old_dir->i_sb;
372
373 if (data->old_dentry->d_inode)
374 nfs_mark_for_revalidate(data->old_dentry->d_inode);
375
376 dput(data->old_dentry);
377 dput(data->new_dentry);
378 iput(data->old_dir);
379 iput(data->new_dir);
380 nfs_sb_deactive(sb);
381 put_rpccred(data->cred);
382 kfree(data);
383}
384
385#if defined(CONFIG_NFS_V4_1)
386static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
387{
388 struct nfs_renamedata *data = calldata;
389 struct nfs_server *server = NFS_SERVER(data->old_dir);
390
391 if (nfs4_setup_sequence(server, &data->args.seq_args,
392 &data->res.seq_res, 1, task))
393 return;
394 rpc_call_start(task);
395}
396#endif /* CONFIG_NFS_V4_1 */
397
398static const struct rpc_call_ops nfs_rename_ops = {
399 .rpc_call_done = nfs_async_rename_done,
400 .rpc_release = nfs_async_rename_release,
401#if defined(CONFIG_NFS_V4_1)
402 .rpc_call_prepare = nfs_rename_prepare,
403#endif /* CONFIG_NFS_V4_1 */
404};
405
406/**
407 * nfs_async_rename - perform an asynchronous rename operation
408 * @old_dir: directory that currently holds the dentry to be renamed
409 * @new_dir: target directory for the rename
410 * @old_dentry: original dentry to be renamed
411 * @new_dentry: dentry to which the old_dentry should be renamed
412 *
413 * It's expected that valid references to the dentries and inodes are held
414 */
415static struct rpc_task *
416nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
417 struct dentry *old_dentry, struct dentry *new_dentry)
418{
419 struct nfs_renamedata *data;
420 struct rpc_message msg = { };
421 struct rpc_task_setup task_setup_data = {
422 .rpc_message = &msg,
423 .callback_ops = &nfs_rename_ops,
424 .workqueue = nfsiod_workqueue,
425 .rpc_client = NFS_CLIENT(old_dir),
426 .flags = RPC_TASK_ASYNC,
427 };
428
429 data = kzalloc(sizeof(*data), GFP_KERNEL);
430 if (data == NULL)
431 return ERR_PTR(-ENOMEM);
 432	task_setup_data.callback_data = data;
433
434 data->cred = rpc_lookup_cred();
435 if (IS_ERR(data->cred)) {
436 struct rpc_task *task = ERR_CAST(data->cred);
437 kfree(data);
438 return task;
439 }
440
441 msg.rpc_argp = &data->args;
442 msg.rpc_resp = &data->res;
443 msg.rpc_cred = data->cred;
444
445 /* set up nfs_renamedata */
446 data->old_dir = old_dir;
447 atomic_inc(&old_dir->i_count);
448 data->new_dir = new_dir;
449 atomic_inc(&new_dir->i_count);
450 data->old_dentry = dget(old_dentry);
451 data->new_dentry = dget(new_dentry);
452 nfs_fattr_init(&data->old_fattr);
453 nfs_fattr_init(&data->new_fattr);
454
455 /* set up nfs_renameargs */
456 data->args.old_dir = NFS_FH(old_dir);
457 data->args.old_name = &old_dentry->d_name;
458 data->args.new_dir = NFS_FH(new_dir);
459 data->args.new_name = &new_dentry->d_name;
460
461 /* set up nfs_renameres */
462 data->res.old_fattr = &data->old_fattr;
463 data->res.new_fattr = &data->new_fattr;
464
465 nfs_sb_active(old_dir->i_sb);
466
467 NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir);
468
469 return rpc_run_task(&task_setup_data);
470}
471
472/**
473 * nfs_sillyrename - Perform a silly-rename of a dentry
474 * @dir: inode of directory that contains dentry
475 * @dentry: dentry to be sillyrenamed
476 *
477 * NFSv2/3 is stateless and the server doesn't know when the client is
478 * holding a file open. To prevent application problems when a file is
479 * unlinked while it's still open, the client performs a "silly-rename".
480 * That is, it renames the file to a hidden file in the same directory,
481 * and only performs the unlink once the last reference to it is put.
482 *
483 * The final cleanup is done during dentry_iput.
484 */
485int
486nfs_sillyrename(struct inode *dir, struct dentry *dentry)
487{
488 static unsigned int sillycounter;
489 const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
490 const int countersize = sizeof(sillycounter)*2;
491 const int slen = sizeof(".nfs")+fileidsize+countersize-1;
492 char silly[slen+1];
493 struct dentry *sdentry;
494 struct rpc_task *task;
495 int error = -EIO;
496
497 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
498 dentry->d_parent->d_name.name, dentry->d_name.name,
499 atomic_read(&dentry->d_count));
500 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
501
502 /*
503 * We don't allow a dentry to be silly-renamed twice.
504 */
505 error = -EBUSY;
506 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
507 goto out;
508
509 sprintf(silly, ".nfs%*.*Lx",
510 fileidsize, fileidsize,
511 (unsigned long long)NFS_FILEID(dentry->d_inode));
512
513 /* Return delegation in anticipation of the rename */
514 nfs_inode_return_delegation(dentry->d_inode);
515
516 sdentry = NULL;
517 do {
518 char *suffix = silly + slen - countersize;
519
520 dput(sdentry);
521 sillycounter++;
522 sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
523
524 dfprintk(VFS, "NFS: trying to rename %s to %s\n",
525 dentry->d_name.name, silly);
526
527 sdentry = lookup_one_len(silly, dentry->d_parent, slen);
528 /*
529 * N.B. Better to return EBUSY here ... it could be
530 * dangerous to delete the file while it's in use.
531 */
532 if (IS_ERR(sdentry))
533 goto out;
534 } while (sdentry->d_inode != NULL); /* need negative lookup */
535
536 /* queue unlink first. Can't do this from rpc_release as it
537 * has to allocate memory
538 */
539 error = nfs_async_unlink(dir, dentry);
540 if (error)
541 goto out_dput;
542
543 /* run the rename task, undo unlink if it fails */
544 task = nfs_async_rename(dir, dir, dentry, sdentry);
545 if (IS_ERR(task)) {
546 error = -EBUSY;
547 nfs_cancel_async_unlink(dentry);
548 goto out_dput;
549 }
550
551 /* wait for the RPC task to complete, unless a SIGKILL intervenes */
552 error = rpc_wait_for_completion_task(task);
553 if (error == 0)
554 error = task->tk_status;
555 rpc_put_task(task);
556out_dput:
557 dput(sdentry);
558out:
559 return error;
560}
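The loop above retries until lookup_one_len() returns a negative dentry, bumping the counter each pass; the name itself is ".nfs" followed by the file ID and the counter, each rendered as zero-padded hex (the %*.*Lx / %*.*x formats pad via the precision). A userspace illustration of the same formatting, with made-up values (the kernel's %Lx is %llx in userspace printf):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long fileid = 0x1234abcdULL;	/* example file ID */
		unsigned int counter = 7;			/* example sillycounter */
		const int fileidsize = sizeof(fileid) * 2;	/* 16 hex digits */
		const int countersize = sizeof(counter) * 2;	/* 8 hex digits */
		char silly[4 + 16 + 8 + 1];

		sprintf(silly, ".nfs%*.*llx", fileidsize, fileidsize, fileid);
		/* overwrite the trailing countersize digits, as the loop does */
		sprintf(silly + 4 + fileidsize, "%*.*x",
			countersize, countersize, counter);
		printf("%s\n", silly);	/* .nfs000000001234abcd00000007 */
		return 0;
	}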
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 874972d9427c..4c14c17a5276 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -55,7 +55,6 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
55 if (p) { 55 if (p) {
56 memset(p, 0, sizeof(*p)); 56 memset(p, 0, sizeof(*p));
57 INIT_LIST_HEAD(&p->pages); 57 INIT_LIST_HEAD(&p->pages);
58 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
59 } 58 }
60 return p; 59 return p;
61} 60}
@@ -75,7 +74,6 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
75 memset(p, 0, sizeof(*p)); 74 memset(p, 0, sizeof(*p));
76 INIT_LIST_HEAD(&p->pages); 75 INIT_LIST_HEAD(&p->pages);
77 p->npages = pagecount; 76 p->npages = pagecount;
78 p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
79 if (pagecount <= ARRAY_SIZE(p->page_array)) 77 if (pagecount <= ARRAY_SIZE(p->page_array))
80 p->pagevec = p->page_array; 78 p->pagevec = p->page_array;
81 else { 79 else {
@@ -292,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
292 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 290 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
293 291
294 nfs_pageio_cond_complete(pgio, page->index); 292 nfs_pageio_cond_complete(pgio, page->index);
295 ret = nfs_page_async_flush(pgio, page, 293 ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
296 wbc->sync_mode == WB_SYNC_NONE ||
297 wbc->nonblocking != 0);
298 if (ret == -EAGAIN) { 294 if (ret == -EAGAIN) {
299 redirty_page_for_writepage(wbc, page); 295 redirty_page_for_writepage(wbc, page);
300 ret = 0; 296 ret = 0;
@@ -1433,15 +1429,17 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
1433 int flags = FLUSH_SYNC; 1429 int flags = FLUSH_SYNC;
1434 int ret = 0; 1430 int ret = 0;
1435 1431
1436 /* Don't commit yet if this is a non-blocking flush and there are 1432 if (wbc->sync_mode == WB_SYNC_NONE) {
1437 * lots of outstanding writes for this mapping. 1433 /* Don't commit yet if this is a non-blocking flush and there
1438 */ 1434 * are a lot of outstanding writes for this mapping.
1439 if (wbc->sync_mode == WB_SYNC_NONE && 1435 */
1440 nfsi->ncommit <= (nfsi->npages >> 1)) 1436 if (nfsi->ncommit <= (nfsi->npages >> 1))
1441 goto out_mark_dirty; 1437 goto out_mark_dirty;
1442 1438
1443 if (wbc->nonblocking || wbc->for_background) 1439 /* don't wait for the COMMIT response */
1444 flags = 0; 1440 flags = 0;
1441 }
1442
1445 ret = nfs_commit_inode(inode, flags); 1443 ret = nfs_commit_inode(inode, flags);
1446 if (ret >= 0) { 1444 if (ret >= 0) {
1447 if (wbc->sync_mode == WB_SYNC_NONE) { 1445 if (wbc->sync_mode == WB_SYNC_NONE) {
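Restated, the rewritten branch above means: on a background (WB_SYNC_NONE) flush, don't COMMIT until at least half of the inode's cached pages are awaiting commit, and when we do commit, don't wait for the reply; data-integrity flushes always commit synchronously. A sketch of just that decision (the -1 "skip the commit" convention is ours, for illustration only):

	/* WB_SYNC_NONE comes from linux/writeback.h, FLUSH_SYNC from the
	 * NFS internals.  Returns the flags for nfs_commit_inode(), or -1
	 * for "don't commit yet". */
	static int commit_flags(int sync_mode, unsigned long ncommit,
				unsigned long npages)
	{
		if (sync_mode == WB_SYNC_NONE) {
			/* too few uncommitted pages to be worth a COMMIT yet */
			if (ncommit <= (npages >> 1))
				return -1;
			return 0;	/* send the COMMIT, don't wait for the reply */
		}
		return FLUSH_SYNC;	/* integrity flush: commit and wait */
	}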
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 4264377552e2..18b3e8975fe0 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -28,6 +28,18 @@ config NFSD
28 28
29 If unsure, say N. 29 If unsure, say N.
30 30
31config NFSD_DEPRECATED
32 bool "Include support for deprecated syscall interface to NFSD"
33 depends on NFSD
34 default y
35 help
36 The syscall interface to nfsd was obsoleted in 2.6.0 by a new
 37	  filesystem-based interface. The old interface is due for removal
 38	  in 2.6.40. If you wish to remove the interface before then,
 39	  say N.
40
 41	  If unsure, say Y.
42
31config NFSD_V2_ACL 43config NFSD_V2_ACL
32 bool 44 bool
33 depends on NFSD 45 depends on NFSD
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c2a4f71d87dd..c0fcb7ab7f6d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -28,9 +28,6 @@
28typedef struct auth_domain svc_client; 28typedef struct auth_domain svc_client;
29typedef struct svc_export svc_export; 29typedef struct svc_export svc_export;
30 30
31static void exp_do_unexport(svc_export *unexp);
32static int exp_verify_string(char *cp, int max);
33
34/* 31/*
35 * We have two caches. 32 * We have two caches.
36 * One maps client+vfsmnt+dentry to export options - the export map 33 * One maps client+vfsmnt+dentry to export options - the export map
@@ -802,6 +799,7 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
802 return ek; 799 return ek;
803} 800}
804 801
802#ifdef CONFIG_NFSD_DEPRECATED
805static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, 803static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv,
806 struct svc_export *exp) 804 struct svc_export *exp)
807{ 805{
@@ -852,6 +850,7 @@ exp_get_fsid_key(svc_client *clp, int fsid)
852 850
853 return exp_find_key(clp, FSID_NUM, fsidv, NULL); 851 return exp_find_key(clp, FSID_NUM, fsidv, NULL);
854} 852}
853#endif
855 854
856static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, 855static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
857 struct cache_req *reqp) 856 struct cache_req *reqp)
@@ -893,6 +892,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path)
893 return exp; 892 return exp;
894} 893}
895 894
895#ifdef CONFIG_NFSD_DEPRECATED
896/* 896/*
897 * Hashtable locking. Write locks are placed only by user processes 897 * Hashtable locking. Write locks are placed only by user processes
898 * wanting to modify export information. 898 * wanting to modify export information.
@@ -925,6 +925,19 @@ exp_writeunlock(void)
925{ 925{
926 up_write(&hash_sem); 926 up_write(&hash_sem);
927} 927}
928#else
929
930/* hash_sem not needed once deprecated interface is removed */
931void exp_readlock(void) {}
932static inline void exp_writelock(void) {}
933void exp_readunlock(void) {}
934static inline void exp_writeunlock(void) {}
935
936#endif
937
938#ifdef CONFIG_NFSD_DEPRECATED
939static void exp_do_unexport(svc_export *unexp);
940static int exp_verify_string(char *cp, int max);
928 941
929static void exp_fsid_unhash(struct svc_export *exp) 942static void exp_fsid_unhash(struct svc_export *exp)
930{ 943{
@@ -935,10 +948,9 @@ static void exp_fsid_unhash(struct svc_export *exp)
935 948
936 ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); 949 ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
937 if (!IS_ERR(ek)) { 950 if (!IS_ERR(ek)) {
938 ek->h.expiry_time = get_seconds()-1; 951 sunrpc_invalidate(&ek->h, &svc_expkey_cache);
939 cache_put(&ek->h, &svc_expkey_cache); 952 cache_put(&ek->h, &svc_expkey_cache);
940 } 953 }
941 svc_expkey_cache.nextcheck = get_seconds();
942} 954}
943 955
944static int exp_fsid_hash(svc_client *clp, struct svc_export *exp) 956static int exp_fsid_hash(svc_client *clp, struct svc_export *exp)
@@ -973,10 +985,9 @@ static void exp_unhash(struct svc_export *exp)
973 985
974 ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); 986 ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
975 if (!IS_ERR(ek)) { 987 if (!IS_ERR(ek)) {
976 ek->h.expiry_time = get_seconds()-1; 988 sunrpc_invalidate(&ek->h, &svc_expkey_cache);
977 cache_put(&ek->h, &svc_expkey_cache); 989 cache_put(&ek->h, &svc_expkey_cache);
978 } 990 }
979 svc_expkey_cache.nextcheck = get_seconds();
980} 991}
981 992
982/* 993/*
@@ -1097,8 +1108,7 @@ out:
1097static void 1108static void
1098exp_do_unexport(svc_export *unexp) 1109exp_do_unexport(svc_export *unexp)
1099{ 1110{
1100 unexp->h.expiry_time = get_seconds()-1; 1111 sunrpc_invalidate(&unexp->h, &svc_export_cache);
1101 svc_export_cache.nextcheck = get_seconds();
1102 exp_unhash(unexp); 1112 exp_unhash(unexp);
1103 exp_fsid_unhash(unexp); 1113 exp_fsid_unhash(unexp);
1104} 1114}
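All three invalidation sites in this file now call sunrpc_invalidate(). Judging only from the open-coded sequence being deleted, the helper plausibly reduces to the following; this is a sketch inferred from this diff, not the actual net/sunrpc implementation:

	/* Hypothetical reconstruction of sunrpc_invalidate(), based on the
	 * pattern it replaces above; the real helper lives in net/sunrpc
	 * and may differ. */
	void sunrpc_invalidate(struct cache_head *h, struct cache_detail *detail)
	{
		h->expiry_time = get_seconds() - 1;	/* mark entry expired */
		detail->nextcheck = get_seconds();	/* force an early flush pass */
	}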
@@ -1150,6 +1160,7 @@ out_unlock:
1150 exp_writeunlock(); 1160 exp_writeunlock();
1151 return err; 1161 return err;
1152} 1162}
1163#endif /* CONFIG_NFSD_DEPRECATED */
1153 1164
1154/* 1165/*
1155 * Obtain the root fh on behalf of a client. 1166 * Obtain the root fh on behalf of a client.
@@ -1459,25 +1470,43 @@ static void show_secinfo_flags(struct seq_file *m, int flags)
1459 show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); 1470 show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
1460} 1471}
1461 1472
1473static bool secinfo_flags_equal(int f, int g)
1474{
1475 f &= NFSEXP_SECINFO_FLAGS;
1476 g &= NFSEXP_SECINFO_FLAGS;
1477 return f == g;
1478}
1479
1480static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end)
1481{
1482 int flags;
1483
1484 flags = (*fp)->flags;
1485 seq_printf(m, ",sec=%d", (*fp)->pseudoflavor);
1486 (*fp)++;
1487 while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) {
1488 seq_printf(m, ":%d", (*fp)->pseudoflavor);
1489 (*fp)++;
1490 }
1491 return flags;
1492}
1493
1462static void show_secinfo(struct seq_file *m, struct svc_export *exp) 1494static void show_secinfo(struct seq_file *m, struct svc_export *exp)
1463{ 1495{
1464 struct exp_flavor_info *f; 1496 struct exp_flavor_info *f;
1465 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; 1497 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
1466 int lastflags = 0, first = 0; 1498 int flags;
1467 1499
1468 if (exp->ex_nflavors == 0) 1500 if (exp->ex_nflavors == 0)
1469 return; 1501 return;
1470 for (f = exp->ex_flavors; f < end; f++) { 1502 f = exp->ex_flavors;
1471 if (first || f->flags != lastflags) { 1503 flags = show_secinfo_run(m, &f, end);
1472 if (!first) 1504 if (!secinfo_flags_equal(flags, exp->ex_flags))
1473 show_secinfo_flags(m, lastflags); 1505 show_secinfo_flags(m, flags);
1474 seq_printf(m, ",sec=%d", f->pseudoflavor); 1506 while (f != end) {
1475 lastflags = f->flags; 1507 flags = show_secinfo_run(m, &f, end);
1476 } else { 1508 show_secinfo_flags(m, flags);
1477 seq_printf(m, ":%d", f->pseudoflavor);
1478 }
1479 } 1509 }
1480 show_secinfo_flags(m, lastflags);
1481} 1510}
1482 1511
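The rewrite above factors the flavor listing into "runs": consecutive flavors whose SECINFO flags match are printed as one sec=a:b:c group, and a group's flags are shown only when they differ from the export's defaults. The grouping itself is ordinary run-length batching; a standalone userspace sketch with illustrative names (390003/390004 are the krb5/krb5i pseudoflavors):

	#include <stdio.h>

	/* Emit ",sec=v1:v2:..." groups, one per run of equal keys, mirroring
	 * the show_secinfo_run() loop above. */
	static void print_runs(const int *key, const int *val, int n)
	{
		int i = 0;

		while (i < n) {
			int run_key = key[i];

			printf(",sec=%d", val[i]);
			for (i++; i < n && key[i] == run_key; i++)
				printf(":%d", val[i]);
			/* per-run flag suffix would be printed here */
		}
	}

	int main(void)
	{
		int key[] = { 1, 1, 2 }, val[] = { 390003, 390004, 1 };

		print_runs(key, val, 3);	/* ,sec=390003:390004,sec=1 */
		printf("\n");
		return 0;
	}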
1483static void exp_flags(struct seq_file *m, int flag, int fsid, 1512static void exp_flags(struct seq_file *m, int flag, int fsid,
@@ -1532,6 +1561,7 @@ const struct seq_operations nfs_exports_op = {
1532 .show = e_show, 1561 .show = e_show,
1533}; 1562};
1534 1563
1564#ifdef CONFIG_NFSD_DEPRECATED
1535/* 1565/*
1536 * Add or modify a client. 1566 * Add or modify a client.
1537 * Change requests may involve the list of host addresses. The list of 1567 * Change requests may involve the list of host addresses. The list of
@@ -1563,7 +1593,7 @@ exp_addclient(struct nfsctl_client *ncp)
1563 /* Insert client into hashtable. */ 1593 /* Insert client into hashtable. */
1564 for (i = 0; i < ncp->cl_naddr; i++) { 1594 for (i = 0; i < ncp->cl_naddr; i++) {
1565 ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6); 1595 ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6);
1566 auth_unix_add_addr(&addr6, dom); 1596 auth_unix_add_addr(&init_net, &addr6, dom);
1567 } 1597 }
1568 auth_unix_forget_old(dom); 1598 auth_unix_forget_old(dom);
1569 auth_domain_put(dom); 1599 auth_domain_put(dom);
@@ -1621,6 +1651,7 @@ exp_verify_string(char *cp, int max)
1621 printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp); 1651 printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp);
1622 return 0; 1652 return 0;
1623} 1653}
1654#endif /* CONFIG_NFSD_DEPRECATED */
1624 1655
1625/* 1656/*
1626 * Initialize the exports module. 1657 * Initialize the exports module.
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 988cbb3a19b6..143da2eecd7b 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -41,7 +41,6 @@
41 41
42#define NFSPROC4_CB_NULL 0 42#define NFSPROC4_CB_NULL 0
43#define NFSPROC4_CB_COMPOUND 1 43#define NFSPROC4_CB_COMPOUND 1
44#define NFS4_STATEID_SIZE 16
45 44
46/* Index of predefined Linux callback client operations */ 45/* Index of predefined Linux callback client operations */
47 46
@@ -248,10 +247,11 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
248} 247}
249 248
250static void 249static void
251encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args, 250encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
252 struct nfs4_cb_compound_hdr *hdr) 251 struct nfs4_cb_compound_hdr *hdr)
253{ 252{
254 __be32 *p; 253 __be32 *p;
254 struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
255 255
256 if (hdr->minorversion == 0) 256 if (hdr->minorversion == 0)
257 return; 257 return;
@@ -259,8 +259,8 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
259 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); 259 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
260 260
261 WRITE32(OP_CB_SEQUENCE); 261 WRITE32(OP_CB_SEQUENCE);
262 WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN); 262 WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN);
263 WRITE32(args->cbs_clp->cl_cb_seq_nr); 263 WRITE32(ses->se_cb_seq_nr);
264 WRITE32(0); /* slotid, always 0 */ 264 WRITE32(0); /* slotid, always 0 */
265 WRITE32(0); /* highest slotid always 0 */ 265 WRITE32(0); /* highest slotid always 0 */
266 WRITE32(0); /* cachethis always 0 */ 266 WRITE32(0); /* cachethis always 0 */
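Taken together, the WRITE* calls above emit a fixed-size CB_SEQUENCE body. A field-by-field view, as a descriptive sketch only (XDR big-endian on the wire; NFS4_MAX_SESSIONID_LEN assumed to be 16, and this struct is not a kernel type):

	#include <stdint.h>

	/* What encode_cb_sequence() writes, in order. */
	struct cb_sequence_wire {
		uint32_t opcode;		/* OP_CB_SEQUENCE */
		uint8_t  sessionid[16];		/* ses->se_sessionid.data */
		uint32_t seqid;			/* ses->se_cb_seq_nr */
		uint32_t slotid;		/* always 0: single-slot table */
		uint32_t highest_slotid;	/* always 0 */
		uint32_t cachethis;		/* always 0 */
	};

The always-zero slot fields match the decoder's single-slot assumption noted in its comment below.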
@@ -280,18 +280,18 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
280 280
281static int 281static int
282nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, 282nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
283 struct nfs4_rpc_args *rpc_args) 283 struct nfsd4_callback *cb)
284{ 284{
285 struct xdr_stream xdr; 285 struct xdr_stream xdr;
286 struct nfs4_delegation *args = rpc_args->args_op; 286 struct nfs4_delegation *args = cb->cb_op;
287 struct nfs4_cb_compound_hdr hdr = { 287 struct nfs4_cb_compound_hdr hdr = {
288 .ident = args->dl_ident, 288 .ident = cb->cb_clp->cl_cb_ident,
289 .minorversion = rpc_args->args_seq.cbs_minorversion, 289 .minorversion = cb->cb_minorversion,
290 }; 290 };
291 291
292 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 292 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
293 encode_cb_compound_hdr(&xdr, &hdr); 293 encode_cb_compound_hdr(&xdr, &hdr);
294 encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); 294 encode_cb_sequence(&xdr, cb, &hdr);
295 encode_cb_recall(&xdr, args, &hdr); 295 encode_cb_recall(&xdr, args, &hdr);
296 encode_cb_nops(&hdr); 296 encode_cb_nops(&hdr);
297 return 0; 297 return 0;
@@ -339,15 +339,16 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
339 * with a single slot. 339 * with a single slot.
340 */ 340 */
341static int 341static int
342decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res, 342decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
343 struct rpc_rqst *rqstp) 343 struct rpc_rqst *rqstp)
344{ 344{
345 struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
345 struct nfs4_sessionid id; 346 struct nfs4_sessionid id;
346 int status; 347 int status;
347 u32 dummy; 348 u32 dummy;
348 __be32 *p; 349 __be32 *p;
349 350
350 if (res->cbs_minorversion == 0) 351 if (cb->cb_minorversion == 0)
351 return 0; 352 return 0;
352 353
353 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE); 354 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
@@ -363,13 +364,12 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
363 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); 364 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
364 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); 365 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
365 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); 366 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
366 if (memcmp(id.data, res->cbs_clp->cl_sessionid.data, 367 if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
367 NFS4_MAX_SESSIONID_LEN)) {
368 dprintk("%s Invalid session id\n", __func__); 368 dprintk("%s Invalid session id\n", __func__);
369 goto out; 369 goto out;
370 } 370 }
371 READ32(dummy); 371 READ32(dummy);
372 if (dummy != res->cbs_clp->cl_cb_seq_nr) { 372 if (dummy != ses->se_cb_seq_nr) {
373 dprintk("%s Invalid sequence number\n", __func__); 373 dprintk("%s Invalid sequence number\n", __func__);
374 goto out; 374 goto out;
375 } 375 }
@@ -393,7 +393,7 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
393 393
394static int 394static int
395nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, 395nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
396 struct nfsd4_cb_sequence *seq) 396 struct nfsd4_callback *cb)
397{ 397{
398 struct xdr_stream xdr; 398 struct xdr_stream xdr;
399 struct nfs4_cb_compound_hdr hdr; 399 struct nfs4_cb_compound_hdr hdr;
@@ -403,8 +403,8 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
403 status = decode_cb_compound_hdr(&xdr, &hdr); 403 status = decode_cb_compound_hdr(&xdr, &hdr);
404 if (status) 404 if (status)
405 goto out; 405 goto out;
406 if (seq) { 406 if (cb) {
407 status = decode_cb_sequence(&xdr, seq, rqstp); 407 status = decode_cb_sequence(&xdr, cb, rqstp);
408 if (status) 408 if (status)
409 goto out; 409 goto out;
410 } 410 }
@@ -473,30 +473,34 @@ static int max_cb_time(void)
473/* Reference counting, callback cleanup, etc., all look racy as heck. 473/* Reference counting, callback cleanup, etc., all look racy as heck.
474 * And why is cl_cb_set an atomic? */ 474 * And why is cl_cb_set an atomic? */
475 475
476int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb) 476int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
477{ 477{
478 struct rpc_timeout timeparms = { 478 struct rpc_timeout timeparms = {
479 .to_initval = max_cb_time(), 479 .to_initval = max_cb_time(),
480 .to_retries = 0, 480 .to_retries = 0,
481 }; 481 };
482 struct rpc_create_args args = { 482 struct rpc_create_args args = {
483 .protocol = XPRT_TRANSPORT_TCP, 483 .net = &init_net,
484 .address = (struct sockaddr *) &cb->cb_addr, 484 .address = (struct sockaddr *) &conn->cb_addr,
485 .addrsize = cb->cb_addrlen, 485 .addrsize = conn->cb_addrlen,
486 .timeout = &timeparms, 486 .timeout = &timeparms,
487 .program = &cb_program, 487 .program = &cb_program,
488 .prognumber = cb->cb_prog,
489 .version = 0, 488 .version = 0,
490 .authflavor = clp->cl_flavor, 489 .authflavor = clp->cl_flavor,
491 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 490 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
492 .client_name = clp->cl_principal,
493 }; 491 };
494 struct rpc_clnt *client; 492 struct rpc_clnt *client;
495 493
496 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 494 if (clp->cl_minorversion == 0) {
497 return -EINVAL; 495 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
498 if (cb->cb_minorversion) { 496 return -EINVAL;
499 args.bc_xprt = cb->cb_xprt; 497 args.client_name = clp->cl_principal;
 498		args.prognumber = conn->cb_prog;
499 args.protocol = XPRT_TRANSPORT_TCP;
500 clp->cl_cb_ident = conn->cb_ident;
501 } else {
502 args.bc_xprt = conn->cb_xprt;
503 args.prognumber = clp->cl_cb_session->se_cb_prog;
500 args.protocol = XPRT_TRANSPORT_BC_TCP; 504 args.protocol = XPRT_TRANSPORT_BC_TCP;
501 } 505 }
502 /* Create RPC client */ 506 /* Create RPC client */
@@ -506,7 +510,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
506 PTR_ERR(client)); 510 PTR_ERR(client));
507 return PTR_ERR(client); 511 return PTR_ERR(client);
508 } 512 }
509 nfsd4_set_callback_client(clp, client); 513 clp->cl_cb_client = client;
510 return 0; 514 return 0;
511 515
512} 516}
@@ -519,7 +523,7 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
519 523
520static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) 524static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
521{ 525{
522 struct nfs4_client *clp = calldata; 526 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
523 527
524 if (task->tk_status) 528 if (task->tk_status)
525 warn_no_callback_path(clp, task->tk_status); 529 warn_no_callback_path(clp, task->tk_status);
@@ -528,6 +532,8 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
528} 532}
529 533
530static const struct rpc_call_ops nfsd4_cb_probe_ops = { 534static const struct rpc_call_ops nfsd4_cb_probe_ops = {
535 /* XXX: release method to ensure we set the cb channel down if
536 * necessary on early failure? */
531 .rpc_call_done = nfsd4_cb_probe_done, 537 .rpc_call_done = nfsd4_cb_probe_done,
532}; 538};
533 539
@@ -543,38 +549,42 @@ int set_callback_cred(void)
543 return 0; 549 return 0;
544} 550}
545 551
552static struct workqueue_struct *callback_wq;
546 553
547void do_probe_callback(struct nfs4_client *clp) 554static void do_probe_callback(struct nfs4_client *clp)
548{ 555{
549 struct rpc_message msg = { 556 struct nfsd4_callback *cb = &clp->cl_cb_null;
550 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
551 .rpc_argp = clp,
552 .rpc_cred = callback_cred
553 };
554 int status;
555 557
556 status = rpc_call_async(clp->cl_cb_client, &msg, 558 cb->cb_op = NULL;
557 RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 559 cb->cb_clp = clp;
558 &nfsd4_cb_probe_ops, (void *)clp); 560
559 if (status) 561 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
560 warn_no_callback_path(clp, status); 562 cb->cb_msg.rpc_argp = NULL;
563 cb->cb_msg.rpc_resp = NULL;
564 cb->cb_msg.rpc_cred = callback_cred;
565
566 cb->cb_ops = &nfsd4_cb_probe_ops;
567
568 queue_work(callback_wq, &cb->cb_work);
561} 569}
562 570
563/* 571/*
564 * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... 572 * Poke the callback thread to process any updates to the callback
573 * parameters, and send a null probe.
565 */ 574 */
566void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb) 575void nfsd4_probe_callback(struct nfs4_client *clp)
567{ 576{
568 int status; 577 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
578 do_probe_callback(clp);
579}
569 580
581void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
582{
570 BUG_ON(atomic_read(&clp->cl_cb_set)); 583 BUG_ON(atomic_read(&clp->cl_cb_set));
571 584
572 status = setup_callback_client(clp, cb); 585 spin_lock(&clp->cl_lock);
573 if (status) { 586 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
574 warn_no_callback_path(clp, status); 587 spin_unlock(&clp->cl_lock);
575 return;
576 }
577 do_probe_callback(clp);
578} 588}
579 589
580/* 590/*
@@ -585,8 +595,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
585static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, 595static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
586 struct rpc_task *task) 596 struct rpc_task *task)
587{ 597{
588 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; 598 u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
589 u32 *ptr = (u32 *)clp->cl_sessionid.data;
590 int status = 0; 599 int status = 0;
591 600
592 dprintk("%s: %u:%u:%u:%u\n", __func__, 601 dprintk("%s: %u:%u:%u:%u\n", __func__,
@@ -598,14 +607,6 @@ static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
598 status = -EAGAIN; 607 status = -EAGAIN;
599 goto out; 608 goto out;
600 } 609 }
601
602 /*
603 * We'll need the clp during XDR encoding and decoding,
604 * and the sequence during decoding to verify the reply
605 */
606 args->args_seq.cbs_clp = clp;
607 task->tk_msg.rpc_resp = &args->args_seq;
608
609out: 610out:
610 dprintk("%s status=%d\n", __func__, status); 611 dprintk("%s status=%d\n", __func__, status);
611 return status; 612 return status;
@@ -617,13 +618,13 @@ out:
617 */ 618 */
618static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) 619static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
619{ 620{
620 struct nfs4_delegation *dp = calldata; 621 struct nfsd4_callback *cb = calldata;
622 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
621 struct nfs4_client *clp = dp->dl_client; 623 struct nfs4_client *clp = dp->dl_client;
622 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; 624 u32 minorversion = clp->cl_minorversion;
623 u32 minorversion = clp->cl_cb_conn.cb_minorversion;
624 int status = 0; 625 int status = 0;
625 626
626 args->args_seq.cbs_minorversion = minorversion; 627 cb->cb_minorversion = minorversion;
627 if (minorversion) { 628 if (minorversion) {
628 status = nfsd41_cb_setup_sequence(clp, task); 629 status = nfsd41_cb_setup_sequence(clp, task);
629 if (status) { 630 if (status) {
@@ -640,19 +641,20 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
640 641
641static void nfsd4_cb_done(struct rpc_task *task, void *calldata) 642static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
642{ 643{
643 struct nfs4_delegation *dp = calldata; 644 struct nfsd4_callback *cb = calldata;
645 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
644 struct nfs4_client *clp = dp->dl_client; 646 struct nfs4_client *clp = dp->dl_client;
645 647
646 dprintk("%s: minorversion=%d\n", __func__, 648 dprintk("%s: minorversion=%d\n", __func__,
647 clp->cl_cb_conn.cb_minorversion); 649 clp->cl_minorversion);
648 650
649 if (clp->cl_cb_conn.cb_minorversion) { 651 if (clp->cl_minorversion) {
650 /* No need for lock, access serialized in nfsd4_cb_prepare */ 652 /* No need for lock, access serialized in nfsd4_cb_prepare */
651 ++clp->cl_cb_seq_nr; 653 ++clp->cl_cb_session->se_cb_seq_nr;
652 clear_bit(0, &clp->cl_cb_slot_busy); 654 clear_bit(0, &clp->cl_cb_slot_busy);
653 rpc_wake_up_next(&clp->cl_cb_waitq); 655 rpc_wake_up_next(&clp->cl_cb_waitq);
654 dprintk("%s: freed slot, new seqid=%d\n", __func__, 656 dprintk("%s: freed slot, new seqid=%d\n", __func__,
655 clp->cl_cb_seq_nr); 657 clp->cl_cb_session->se_cb_seq_nr);
656 658
657 /* We're done looking into the sequence information */ 659 /* We're done looking into the sequence information */
658 task->tk_msg.rpc_resp = NULL; 660 task->tk_msg.rpc_resp = NULL;
@@ -662,7 +664,8 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
662 664
663static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 665static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
664{ 666{
665 struct nfs4_delegation *dp = calldata; 667 struct nfsd4_callback *cb = calldata;
668 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
666 struct nfs4_client *clp = dp->dl_client; 669 struct nfs4_client *clp = dp->dl_client;
667 struct rpc_clnt *current_rpc_client = clp->cl_cb_client; 670 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
668 671
@@ -707,7 +710,8 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
707 710
708static void nfsd4_cb_recall_release(void *calldata) 711static void nfsd4_cb_recall_release(void *calldata)
709{ 712{
710 struct nfs4_delegation *dp = calldata; 713 struct nfsd4_callback *cb = calldata;
714 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
711 715
712 nfs4_put_delegation(dp); 716 nfs4_put_delegation(dp);
713} 717}
@@ -718,8 +722,6 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
718 .rpc_release = nfsd4_cb_recall_release, 722 .rpc_release = nfsd4_cb_recall_release,
719}; 723};
720 724
721static struct workqueue_struct *callback_wq;
722
723int nfsd4_create_callback_queue(void) 725int nfsd4_create_callback_queue(void)
724{ 726{
725 callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); 727 callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
@@ -734,57 +736,88 @@ void nfsd4_destroy_callback_queue(void)
734} 736}
735 737
736/* must be called under the state lock */ 738/* must be called under the state lock */
737void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new) 739void nfsd4_shutdown_callback(struct nfs4_client *clp)
738{ 740{
739 struct rpc_clnt *old = clp->cl_cb_client; 741 set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags);
740
741 clp->cl_cb_client = new;
742 /* 742 /*
743 * After this, any work that saw the old value of cl_cb_client will 743 * Note this won't actually result in a null callback;
744 * be gone: 744 * instead, nfsd4_do_callback_rpc() will detect the killed
745 * client, destroy the rpc client, and stop:
745 */ 746 */
747 do_probe_callback(clp);
746 flush_workqueue(callback_wq); 748 flush_workqueue(callback_wq);
747 /* So we can safely shut it down: */
748 if (old)
749 rpc_shutdown_client(old);
750} 749}
751 750
752/* 751void nfsd4_release_cb(struct nfsd4_callback *cb)
753 * called with dp->dl_count inc'ed.
754 */
755static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
756{ 752{
757 struct nfs4_client *clp = dp->dl_client; 753 if (cb->cb_ops->rpc_release)
758 struct rpc_clnt *clnt = clp->cl_cb_client; 754 cb->cb_ops->rpc_release(cb);
759 struct nfs4_rpc_args *args = &dp->dl_recall.cb_args; 755}
760 struct rpc_message msg = {
761 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
762 .rpc_cred = callback_cred
763 };
764 756
765 if (clnt == NULL) { 757void nfsd4_process_cb_update(struct nfsd4_callback *cb)
766 nfs4_put_delegation(dp); 758{
767 return; /* Client is shutting down; give up. */ 759 struct nfs4_cb_conn conn;
760 struct nfs4_client *clp = cb->cb_clp;
761 int err;
762
763 /*
764 * This is either an update, or the client dying; in either case,
765 * kill the old client:
766 */
767 if (clp->cl_cb_client) {
768 rpc_shutdown_client(clp->cl_cb_client);
769 clp->cl_cb_client = NULL;
768 } 770 }
771 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
772 return;
773 spin_lock(&clp->cl_lock);
774 /*
775 * Only serialized callback code is allowed to clear these
776 * flags; main nfsd code can only set them:
777 */
778 BUG_ON(!clp->cl_cb_flags);
779 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
780 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
781 spin_unlock(&clp->cl_lock);
769 782
770 args->args_op = dp; 783 err = setup_callback_client(clp, &conn);
771 msg.rpc_argp = args; 784 if (err)
772 dp->dl_retries = 1; 785 warn_no_callback_path(clp, err);
773 rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp);
774} 786}
775 787
776void nfsd4_do_callback_rpc(struct work_struct *w) 788void nfsd4_do_callback_rpc(struct work_struct *w)
777{ 789{
778 /* XXX: for now, just send off delegation recall. */ 790 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
779 /* In future, generalize to handle any sort of callback. */ 791 struct nfs4_client *clp = cb->cb_clp;
780 struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work); 792 struct rpc_clnt *clnt;
781 struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
782 793
783 _nfsd4_cb_recall(dp); 794 if (clp->cl_cb_flags)
784} 795 nfsd4_process_cb_update(cb);
785 796
797 clnt = clp->cl_cb_client;
798 if (!clnt) {
799 /* Callback channel broken, or client killed; give up: */
800 nfsd4_release_cb(cb);
801 return;
802 }
803 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
804 cb->cb_ops, cb);
805}
786 806
787void nfsd4_cb_recall(struct nfs4_delegation *dp) 807void nfsd4_cb_recall(struct nfs4_delegation *dp)
788{ 808{
809 struct nfsd4_callback *cb = &dp->dl_recall;
810
811 dp->dl_retries = 1;
812 cb->cb_op = dp;
813 cb->cb_clp = dp->dl_client;
814 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
815 cb->cb_msg.rpc_argp = cb;
816 cb->cb_msg.rpc_resp = cb;
817 cb->cb_msg.rpc_cred = callback_cred;
818
819 cb->cb_ops = &nfsd4_cb_recall_ops;
821
789 queue_work(callback_wq, &dp->dl_recall.cb_work); 822 queue_work(callback_wq, &dp->dl_recall.cb_work);
790} 823}
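The conversions in this file all hang off one generalized callback container. Its definition lives in fs/nfsd/state.h rather than in this hunk, but from the usage above it must carry at least the following; this is an inferred sketch, and the real struct may have more fields:

	/* Inferred shape of struct nfsd4_callback as used above. */
	struct nfsd4_callback_sketch {
		void *cb_op;			/* op payload, e.g. a delegation */
		struct nfs4_client *cb_clp;	/* owning client */
		u32 cb_minorversion;		/* set in nfsd4_cb_prepare() */
		struct rpc_message cb_msg;	/* proc/argp/resp/cred for the RPC */
		const struct rpc_call_ops *cb_ops;
		struct work_struct cb_work;	/* queued on callback_wq */
	};

Queuing is then uniform: fill cb_msg and cb_ops, queue_work(callback_wq, &cb->cb_work), and let nfsd4_do_callback_rpc() either fire or abandon the RPC.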
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index c78dbf493424..f0695e815f0e 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -482,109 +482,26 @@ nfsd_idmap_shutdown(void)
482 cache_unregister(&nametoid_cache); 482 cache_unregister(&nametoid_cache);
483} 483}
484 484
485/*
486 * Deferred request handling
487 */
488
489struct idmap_defer_req {
490 struct cache_req req;
491 struct cache_deferred_req deferred_req;
492 wait_queue_head_t waitq;
493 atomic_t count;
494};
495
496static inline void
497put_mdr(struct idmap_defer_req *mdr)
498{
499 if (atomic_dec_and_test(&mdr->count))
500 kfree(mdr);
501}
502
503static inline void
504get_mdr(struct idmap_defer_req *mdr)
505{
506 atomic_inc(&mdr->count);
507}
508
509static void
510idmap_revisit(struct cache_deferred_req *dreq, int toomany)
511{
512 struct idmap_defer_req *mdr =
513 container_of(dreq, struct idmap_defer_req, deferred_req);
514
515 wake_up(&mdr->waitq);
516 put_mdr(mdr);
517}
518
519static struct cache_deferred_req *
520idmap_defer(struct cache_req *req)
521{
522 struct idmap_defer_req *mdr =
523 container_of(req, struct idmap_defer_req, req);
524
525 mdr->deferred_req.revisit = idmap_revisit;
526 get_mdr(mdr);
527 return (&mdr->deferred_req);
528}
529
530static inline int
531do_idmap_lookup(struct ent *(*lookup_fn)(struct ent *), struct ent *key,
532 struct cache_detail *detail, struct ent **item,
533 struct idmap_defer_req *mdr)
534{
535 *item = lookup_fn(key);
536 if (!*item)
537 return -ENOMEM;
538 return cache_check(detail, &(*item)->h, &mdr->req);
539}
540
541static inline int
542do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *),
543 struct ent *key, struct cache_detail *detail,
544 struct ent **item)
545{
546 int ret = -ENOMEM;
547
548 *item = lookup_fn(key);
549 if (!*item)
550 goto out_err;
551 ret = -ETIMEDOUT;
552 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
553 || (*item)->h.expiry_time < get_seconds()
554 || detail->flush_time > (*item)->h.last_refresh)
555 goto out_put;
556 ret = -ENOENT;
557 if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
558 goto out_put;
559 return 0;
560out_put:
561 cache_put(&(*item)->h, detail);
562out_err:
563 *item = NULL;
564 return ret;
565}
566
567static int 485static int
568idmap_lookup(struct svc_rqst *rqstp, 486idmap_lookup(struct svc_rqst *rqstp,
569 struct ent *(*lookup_fn)(struct ent *), struct ent *key, 487 struct ent *(*lookup_fn)(struct ent *), struct ent *key,
570 struct cache_detail *detail, struct ent **item) 488 struct cache_detail *detail, struct ent **item)
571{ 489{
572 struct idmap_defer_req *mdr;
573 int ret; 490 int ret;
574 491
575 mdr = kzalloc(sizeof(*mdr), GFP_KERNEL); 492 *item = lookup_fn(key);
576 if (!mdr) 493 if (!*item)
577 return -ENOMEM; 494 return -ENOMEM;
578 atomic_set(&mdr->count, 1); 495 retry:
579 init_waitqueue_head(&mdr->waitq); 496 ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle);
580 mdr->req.defer = idmap_defer; 497
581 ret = do_idmap_lookup(lookup_fn, key, detail, item, mdr); 498 if (ret == -ETIMEDOUT) {
582 if (ret == -EAGAIN) { 499 struct ent *prev_item = *item;
583 wait_event_interruptible_timeout(mdr->waitq, 500 *item = lookup_fn(key);
584 test_bit(CACHE_VALID, &(*item)->h.flags), 1 * HZ); 501 if (*item != prev_item)
585 ret = do_idmap_lookup_nowait(lookup_fn, key, detail, item); 502 goto retry;
503 cache_put(&(*item)->h, detail);
586 } 504 }
587 put_mdr(mdr);
588 return ret; 505 return ret;
589} 506}
590 507
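The simplification above drops the bespoke idmap deferral machinery in favor of the generic rq_chandle plus a retry loop. Spelled out, with the assumed cache_check() return-value contract in comments (this is a restatement of the code above, not new behavior):

	/* cache_check() is assumed to return 0 when the entry is valid,
	 * -EAGAIN when an upcall was deferred via the channel, and
	 * -ETIMEDOUT when the entry went stale.  Retrying only when
	 * lookup_fn() hands back a *different* entry guarantees the loop
	 * terminates. */
	static int lookup_with_retry(struct ent *(*lookup_fn)(struct ent *),
				     struct ent *key, struct cache_detail *detail,
				     struct cache_req *chandle, struct ent **item)
	{
		int ret;

		*item = lookup_fn(key);
		if (!*item)
			return -ENOMEM;
	retry:
		ret = cache_check(detail, &(*item)->h, chandle);
		if (ret == -ETIMEDOUT) {
			struct ent *prev = *item;

			*item = lookup_fn(key);
			if (*item != prev)
				goto retry;	/* fresher entry: try again */
			cache_put(&(*item)->h, detail);
		}
		return ret;
	}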
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 59ec449b0c7f..0cdfd022bb7b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1031,8 +1031,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1031 resp->cstate.session = NULL; 1031 resp->cstate.session = NULL;
1032 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); 1032 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
1033 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); 1033 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
1034 /* Use the deferral mechanism only for NFSv4.0 compounds */ 1034 /*
1035 rqstp->rq_usedeferral = (args->minorversion == 0); 1035 * Don't use the deferral mechanism for NFSv4; compounds make it
1036 * too hard to avoid non-idempotency problems.
1037 */
1038 rqstp->rq_usedeferral = 0;
1036 1039
1037 /* 1040 /*
1038 * According to RFC3010, this takes precedence over all other errors. 1041 * According to RFC3010, this takes precedence over all other errors.
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cf0d2ffb3c84..56347e0ac88d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -33,7 +33,7 @@
33*/ 33*/
34 34
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/smp_lock.h> 36#include <linux/fs.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/swap.h> 39#include <linux/swap.h>
@@ -207,7 +207,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
207{ 207{
208 struct nfs4_delegation *dp; 208 struct nfs4_delegation *dp;
209 struct nfs4_file *fp = stp->st_file; 209 struct nfs4_file *fp = stp->st_file;
210 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
211 210
212 dprintk("NFSD alloc_init_deleg\n"); 211 dprintk("NFSD alloc_init_deleg\n");
213 /* 212 /*
@@ -234,7 +233,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
234 nfs4_file_get_access(fp, O_RDONLY); 233 nfs4_file_get_access(fp, O_RDONLY);
235 dp->dl_flock = NULL; 234 dp->dl_flock = NULL;
236 dp->dl_type = type; 235 dp->dl_type = type;
237 dp->dl_ident = cb->cb_ident;
238 dp->dl_stateid.si_boot = boot_time; 236 dp->dl_stateid.si_boot = boot_time;
239 dp->dl_stateid.si_stateownerid = current_delegid++; 237 dp->dl_stateid.si_stateownerid = current_delegid++;
240 dp->dl_stateid.si_fileid = 0; 238 dp->dl_stateid.si_fileid = 0;
@@ -535,171 +533,258 @@ gen_sessionid(struct nfsd4_session *ses)
535 */ 533 */
536#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) 534#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44)
537 535
536static void
537free_session_slots(struct nfsd4_session *ses)
538{
539 int i;
540
541 for (i = 0; i < ses->se_fchannel.maxreqs; i++)
542 kfree(ses->se_slots[i]);
543}
544
538/* 545/*
539 * Give the client the number of ca_maxresponsesize_cached slots it 546 * We don't actually need to cache the rpc and session headers, so we
540 * requests, of size bounded by NFSD_SLOT_CACHE_SIZE, 547 * can allocate a little less for each slot:
541 * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more 548 */
542 * than NFSD_MAX_SLOTS_PER_SESSION. 549static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
543 * 550{
544 * If we run out of reserved DRC memory we should (up to a point) 551 return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
552}
553
554static int nfsd4_sanitize_slot_size(u32 size)
555{
556 size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */
557 size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE);
558
559 return size;
560}
561
562/*
563 * XXX: If we run out of reserved DRC memory we could (up to a point)
545 * re-negotiate active sessions and reduce their slot usage to make 564 * re-negotiate active sessions and reduce their slot usage to make
 546 * room for new connections. For now we just fail the create session. 565 * room for new connections. For now we just fail the create session.
547 */ 566 */
548static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan) 567static int nfsd4_get_drc_mem(int slotsize, u32 num)
549{ 568{
550 int mem, size = fchan->maxresp_cached; 569 int avail;
551 570
552 if (fchan->maxreqs < 1) 571 num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
553 return nfserr_inval;
554 572
555 if (size < NFSD_MIN_HDR_SEQ_SZ) 573 spin_lock(&nfsd_drc_lock);
556 size = NFSD_MIN_HDR_SEQ_SZ; 574 avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
557 size -= NFSD_MIN_HDR_SEQ_SZ; 575 nfsd_drc_max_mem - nfsd_drc_mem_used);
558 if (size > NFSD_SLOT_CACHE_SIZE) 576 num = min_t(int, num, avail / slotsize);
559 size = NFSD_SLOT_CACHE_SIZE; 577 nfsd_drc_mem_used += num * slotsize;
560 578 spin_unlock(&nfsd_drc_lock);
561 /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */
562 mem = fchan->maxreqs * size;
563 if (mem > NFSD_MAX_MEM_PER_SESSION) {
564 fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
565 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
566 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
567 mem = fchan->maxreqs * size;
568 }
569 579
580 return num;
581}
582
583static void nfsd4_put_drc_mem(int slotsize, int num)
584{
570 spin_lock(&nfsd_drc_lock); 585 spin_lock(&nfsd_drc_lock);
 571	/* bound the total session drc memory usage */			 586	nfsd_drc_mem_used -= slotsize * num;
572 if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
573 fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
574 mem = fchan->maxreqs * size;
575 }
576 nfsd_drc_mem_used += mem;
577 spin_unlock(&nfsd_drc_lock); 587 spin_unlock(&nfsd_drc_lock);
588}
578 589
579 if (fchan->maxreqs == 0) 590static struct nfsd4_session *alloc_session(int slotsize, int numslots)
580 return nfserr_jukebox; 591{
592 struct nfsd4_session *new;
593 int mem, i;
581 594
582 fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ; 595 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
583 return 0; 596 + sizeof(struct nfsd4_session) > PAGE_SIZE);
597 mem = numslots * sizeof(struct nfsd4_slot *);
598
599 new = kzalloc(sizeof(*new) + mem, GFP_KERNEL);
600 if (!new)
601 return NULL;
602 /* allocate each struct nfsd4_slot and data cache in one piece */
603 for (i = 0; i < numslots; i++) {
604 mem = sizeof(struct nfsd4_slot) + slotsize;
605 new->se_slots[i] = kzalloc(mem, GFP_KERNEL);
606 if (!new->se_slots[i])
607 goto out_free;
608 }
609 return new;
610out_free:
611 while (i--)
612 kfree(new->se_slots[i]);
613 kfree(new);
614 return NULL;
584} 615}
585 616
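A worked example of the sizing logic above, with illustrative constants: the real NFSD_* limits live in fs/nfsd/state.h, and only NFSD_MIN_HDR_SEQ_SZ = (24 + 12 + 44) is visible in this hunk, so treat the other values as stand-ins:

	#include <stdio.h>

	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	/* Mirrors nfsd4_sanitize_slot_size() + nfsd4_get_drc_mem(). */
	int main(void)
	{
		const long hdr = 24 + 12 + 44;		/* NFSD_MIN_HDR_SEQ_SZ (above) */
		const long slot_cache_max = 1024;	/* NFSD_SLOT_CACHE_SIZE (assumed) */
		const long max_slots = 160;		/* NFSD_MAX_SLOTS_PER_SESSION (assumed) */
		const long max_mem = 160 * 1024;	/* NFSD_MAX_MEM_PER_SESSION (assumed) */
		long drc_max = 1024 * 1024, drc_used = 1000 * 1024;

		long size = 4096, num = 64;		/* client's request */

		size = MIN(size - hdr, slot_cache_max);		/* sanitize: 1024 */
		num = MIN(num, max_slots);
		long avail = MIN(max_mem, drc_max - drc_used);	/* 24576 left */
		num = MIN(num, avail / size);			/* clamped to 24 */

		printf("slotsize=%ld numslots=%ld\n", size, num);
		return 0;
	}

With only 24 KB of DRC memory left, the client's request for 64 slots is cut to 24, preserving the full slot size, which is exactly the preference the comment below explains.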
586/* 617static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
587 * fchan holds the client values on input, and the server values on output
588 * sv_max_mesg is the maximum payload plus one page for overhead.
589 */
590static int init_forechannel_attrs(struct svc_rqst *rqstp,
591 struct nfsd4_channel_attrs *session_fchan,
592 struct nfsd4_channel_attrs *fchan)
593{ 618{
594 int status = 0; 619 u32 maxrpc = nfsd_serv->sv_max_mesg;
595 __u32 maxcount = nfsd_serv->sv_max_mesg;
596 620
597 /* headerpadsz set to zero in encode routine */ 621 new->maxreqs = numslots;
622 new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ;
623 new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
624 new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
625 new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
626}
598 627
599 /* Use the client's max request and max response size if possible */ 628static void free_conn(struct nfsd4_conn *c)
600 if (fchan->maxreq_sz > maxcount) 629{
601 fchan->maxreq_sz = maxcount; 630 svc_xprt_put(c->cn_xprt);
602 session_fchan->maxreq_sz = fchan->maxreq_sz; 631 kfree(c);
632}
603 633
604 if (fchan->maxresp_sz > maxcount) 634static void nfsd4_conn_lost(struct svc_xpt_user *u)
605 fchan->maxresp_sz = maxcount; 635{
606 session_fchan->maxresp_sz = fchan->maxresp_sz; 636 struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user);
637 struct nfs4_client *clp = c->cn_session->se_client;
607 638
608 /* Use the client's maxops if possible */ 639 spin_lock(&clp->cl_lock);
609 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) 640 if (!list_empty(&c->cn_persession)) {
610 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; 641 list_del(&c->cn_persession);
611 session_fchan->maxops = fchan->maxops; 642 free_conn(c);
643 }
644 spin_unlock(&clp->cl_lock);
645}
612 646
613 /* FIXME: Error means no more DRC pages so the server should 647static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
614 * recover pages from existing sessions. For now fail session 648{
615 * creation. 649 struct nfsd4_conn *conn;
616 */
617 status = set_forechannel_drc_size(fchan);
618 650
619 session_fchan->maxresp_cached = fchan->maxresp_cached; 651 conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL);
620 session_fchan->maxreqs = fchan->maxreqs; 652 if (!conn)
653 return NULL;
654 svc_xprt_get(rqstp->rq_xprt);
655 conn->cn_xprt = rqstp->rq_xprt;
656 conn->cn_flags = flags;
657 INIT_LIST_HEAD(&conn->cn_xpt_user.list);
658 return conn;
659}
621 660
622 dprintk("%s status %d\n", __func__, status); 661static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
623 return status; 662{
663 conn->cn_session = ses;
664 list_add(&conn->cn_persession, &ses->se_conns);
624} 665}
625 666
626static void 667static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
627free_session_slots(struct nfsd4_session *ses)
628{ 668{
629 int i; 669 struct nfs4_client *clp = ses->se_client;
630 670
631 for (i = 0; i < ses->se_fchannel.maxreqs; i++) 671 spin_lock(&clp->cl_lock);
632 kfree(ses->se_slots[i]); 672 __nfsd4_hash_conn(conn, ses);
673 spin_unlock(&clp->cl_lock);
633} 674}
634 675
635/* 676static void nfsd4_register_conn(struct nfsd4_conn *conn)
636 * We don't actually need to cache the rpc and session headers, so we
637 * can allocate a little less for each slot:
638 */
639static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
640{ 677{
641 return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; 678 conn->cn_xpt_user.callback = nfsd4_conn_lost;
679 register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
642} 680}
643 681
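alloc_conn() and nfsd4_register_conn() above establish a small lifetime protocol around the transport: pin it with svc_xprt_get(), then register to be called back when it dies. A condensed sketch of that protocol (our struct name is illustrative, and the locking against concurrent teardown done by nfsd4_conn_lost()/nfsd4_del_conns() is elided):

	/* Track a transport: take a reference, register a "gone" callback. */
	struct tracked_conn {
		struct svc_xprt *xprt;
		struct svc_xpt_user user;
	};

	static void conn_lost(struct svc_xpt_user *u)
	{
		struct tracked_conn *c = container_of(u, struct tracked_conn, user);

		/* unlink from any per-session list, then drop the pin */
		svc_xprt_put(c->xprt);
		kfree(c);
	}

	static void track(struct svc_rqst *rqstp, struct tracked_conn *c)
	{
		svc_xprt_get(rqstp->rq_xprt);	/* pinned until conn_lost() runs */
		c->xprt = rqstp->rq_xprt;
		c->user.callback = conn_lost;
		register_xpt_user(c->xprt, &c->user);
	}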
644static int 682static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
645alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
646 struct nfsd4_create_session *cses)
647{ 683{
648 struct nfsd4_session *new, tmp; 684 struct nfsd4_conn *conn;
649 struct nfsd4_slot *sp; 685 u32 flags = NFS4_CDFC4_FORE;
650 int idx, slotsize, cachesize, i;
651 int status;
652 686
653 memset(&tmp, 0, sizeof(tmp)); 687 if (ses->se_flags & SESSION4_BACK_CHAN)
688 flags |= NFS4_CDFC4_BACK;
689 conn = alloc_conn(rqstp, flags);
690 if (!conn)
691 return nfserr_jukebox;
692 nfsd4_hash_conn(conn, ses);
693 nfsd4_register_conn(conn);
694 return nfs_ok;
695}
654 696
655 /* FIXME: For now, we just accept the client back channel attributes. */ 697static void nfsd4_del_conns(struct nfsd4_session *s)
656 tmp.se_bchannel = cses->back_channel; 698{
657 status = init_forechannel_attrs(rqstp, &tmp.se_fchannel, 699 struct nfs4_client *clp = s->se_client;
658 &cses->fore_channel); 700 struct nfsd4_conn *c;
659 if (status)
660 goto out;
661 701
662 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot) 702 spin_lock(&clp->cl_lock);
663 + sizeof(struct nfsd4_session) > PAGE_SIZE); 703 while (!list_empty(&s->se_conns)) {
704 c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession);
705 list_del_init(&c->cn_persession);
706 spin_unlock(&clp->cl_lock);
664 707
665 status = nfserr_jukebox; 708 unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user);
666 /* allocate struct nfsd4_session and slot table pointers in one piece */ 709 free_conn(c);
667 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
668 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
669 if (!new)
670 goto out;
671 710
672 memcpy(new, &tmp, sizeof(*new)); 711 spin_lock(&clp->cl_lock);
712 }
713 spin_unlock(&clp->cl_lock);
714}
673 715
674 /* allocate each struct nfsd4_slot and data cache in one piece */ 716void free_session(struct kref *kref)
675 cachesize = slot_bytes(&new->se_fchannel); 717{
676 for (i = 0; i < new->se_fchannel.maxreqs; i++) { 718 struct nfsd4_session *ses;
677 sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL); 719 int mem;
678 if (!sp) 720
679 goto out_free; 721 ses = container_of(kref, struct nfsd4_session, se_ref);
680 new->se_slots[i] = sp; 722 nfsd4_del_conns(ses);
723 spin_lock(&nfsd_drc_lock);
724 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
725 nfsd_drc_mem_used -= mem;
726 spin_unlock(&nfsd_drc_lock);
727 free_session_slots(ses);
728 kfree(ses);
729}
730
731static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
732{
733 struct nfsd4_session *new;
734 struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
735 int numslots, slotsize;
736 int status;
737 int idx;
738
739 /*
 740	 * Note that decreasing the slot size below the client's request may
 741	 * make it difficult for the client to function correctly, whereas
 742	 * decreasing the number of slots will (just?) affect
 743	 * performance.  When short on memory we therefore prefer to
 744	 * decrease the number of slots rather than their size.
745 */
746 slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
747 numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
748
749 new = alloc_session(slotsize, numslots);
750 if (!new) {
751 nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
752 return NULL;
681 } 753 }
754 init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
682 755
683 new->se_client = clp; 756 new->se_client = clp;
684 gen_sessionid(new); 757 gen_sessionid(new);
685 idx = hash_sessionid(&new->se_sessionid);
686 memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
687 NFS4_MAX_SESSIONID_LEN);
688 758
759 INIT_LIST_HEAD(&new->se_conns);
760
761 new->se_cb_seq_nr = 1;
689 new->se_flags = cses->flags; 762 new->se_flags = cses->flags;
763 new->se_cb_prog = cses->callback_prog;
690 kref_init(&new->se_ref); 764 kref_init(&new->se_ref);
765 idx = hash_sessionid(&new->se_sessionid);
691 spin_lock(&client_lock); 766 spin_lock(&client_lock);
692 list_add(&new->se_hash, &sessionid_hashtbl[idx]); 767 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
693 list_add(&new->se_perclnt, &clp->cl_sessions); 768 list_add(&new->se_perclnt, &clp->cl_sessions);
694 spin_unlock(&client_lock); 769 spin_unlock(&client_lock);
695 770
696 status = nfs_ok; 771 status = nfsd4_new_conn(rqstp, new);
697out: 772 /* whoops: benny points out, status is ignored! (err, or bogus) */
698 return status; 773 if (status) {
699out_free: 774 free_session(&new->se_ref);
700 free_session_slots(new); 775 return NULL;
701 kfree(new); 776 }
702 goto out; 777 if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) {
778 struct sockaddr *sa = svc_addr(rqstp);
779
780 clp->cl_cb_session = new;
781 clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
782 svc_xprt_get(rqstp->rq_xprt);
783 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
784 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
785 nfsd4_probe_callback(clp);
786 }
787 return new;
703} 788}
704 789
705/* caller must hold client_lock */ 790/* caller must hold client_lock */
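The rewritten alloc_init_session() above delegates slot sizing to nfsd4_sanitize_slot_size() and nfsd4_get_drc_mem(), whose bodies are not part of this hunk. A minimal sketch of how such a split can work, assuming a global DRC pool guarded by nfsd_drc_lock (the cap macro and both bodies below are illustrative assumptions, not the kernel's code):

/* Illustrative only: clamp the per-slot cache, then take as many slots
 * as the global reply-cache budget allows. Shrinking the slot count
 * only costs performance; shrinking slot size can break the client. */
static int sanitize_slot_size(u32 size)
{
	return min_t(u32, size, NFSD_SLOT_CACHE_SIZE);	/* assumed cap */
}

static int get_drc_mem(int slotsize, u32 num)
{
	num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);

	spin_lock(&nfsd_drc_lock);
	if (slotsize * num > nfsd_drc_max_mem - nfsd_drc_mem_used)
		num = (nfsd_drc_max_mem - nfsd_drc_mem_used) / slotsize;
	nfsd_drc_mem_used += num * slotsize;
	spin_unlock(&nfsd_drc_lock);

	return num;
}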
@@ -731,21 +816,6 @@ unhash_session(struct nfsd4_session *ses)
731 list_del(&ses->se_perclnt); 816 list_del(&ses->se_perclnt);
732} 817}
733 818
734void
735free_session(struct kref *kref)
736{
737 struct nfsd4_session *ses;
738 int mem;
739
740 ses = container_of(kref, struct nfsd4_session, se_ref);
741 spin_lock(&nfsd_drc_lock);
742 mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
743 nfsd_drc_mem_used -= mem;
744 spin_unlock(&nfsd_drc_lock);
745 free_session_slots(ses);
746 kfree(ses);
747}
748
749/* must be called under the client_lock */ 819/* must be called under the client_lock */
750static inline void 820static inline void
751renew_client_locked(struct nfs4_client *clp) 821renew_client_locked(struct nfs4_client *clp)
@@ -812,6 +882,13 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
812static inline void 882static inline void
813free_client(struct nfs4_client *clp) 883free_client(struct nfs4_client *clp)
814{ 884{
885 while (!list_empty(&clp->cl_sessions)) {
886 struct nfsd4_session *ses;
887 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
888 se_perclnt);
889 list_del(&ses->se_perclnt);
890 nfsd4_put_session(ses);
891 }
815 if (clp->cl_cred.cr_group_info) 892 if (clp->cl_cred.cr_group_info)
816 put_group_info(clp->cl_cred.cr_group_info); 893 put_group_info(clp->cl_cred.cr_group_info);
817 kfree(clp->cl_principal); 894 kfree(clp->cl_principal);
@@ -838,15 +915,12 @@ release_session_client(struct nfsd4_session *session)
838static inline void 915static inline void
839unhash_client_locked(struct nfs4_client *clp) 916unhash_client_locked(struct nfs4_client *clp)
840{ 917{
918 struct nfsd4_session *ses;
919
841 mark_client_expired(clp); 920 mark_client_expired(clp);
842 list_del(&clp->cl_lru); 921 list_del(&clp->cl_lru);
843 while (!list_empty(&clp->cl_sessions)) {
844 struct nfsd4_session *ses;
845 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
846 se_perclnt);
847 unhash_session(ses);
848 nfsd4_put_session(ses);
849 }
 922 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
 923 list_del_init(&ses->se_hash);
850} 924}
851 925
852static void 926static void
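The two hunks above split session teardown in two: unhash_client_locked() now only removes each session from the hash while client_lock is held, and free_client() later walks cl_sessions and drops the references, so free_session() (which can sleep in nfsd4_del_conns()) never runs under the spinlock. Schematically, with illustrative function names:

/* Step 1 runs under client_lock and must not free anything. */
static void client_unhash_sessions(struct nfs4_client *clp)
{
	struct nfsd4_session *ses;

	list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
		list_del_init(&ses->se_hash);	/* no more lookups */
}

/* Step 2 runs after the lock is dropped and may sleep. */
static void client_put_sessions(struct nfs4_client *clp)
{
	while (!list_empty(&clp->cl_sessions)) {
		struct nfsd4_session *ses =
			list_entry(clp->cl_sessions.next,
				   struct nfsd4_session, se_perclnt);
		list_del(&ses->se_perclnt);
		nfsd4_put_session(ses);		/* may call free_session() */
	}
}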
@@ -875,7 +949,7 @@ expire_client(struct nfs4_client *clp)
875 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 949 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
876 release_openowner(sop); 950 release_openowner(sop);
877 } 951 }
878 nfsd4_set_callback_client(clp, NULL);
 952 nfsd4_shutdown_callback(clp);
879 if (clp->cl_cb_conn.cb_xprt) 953 if (clp->cl_cb_conn.cb_xprt)
880 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 954 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
881 list_del(&clp->cl_idhash); 955 list_del(&clp->cl_idhash);
@@ -960,6 +1034,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
960 if (clp == NULL) 1034 if (clp == NULL)
961 return NULL; 1035 return NULL;
962 1036
1037 INIT_LIST_HEAD(&clp->cl_sessions);
1038
963 princ = svc_gss_principal(rqstp); 1039 princ = svc_gss_principal(rqstp);
964 if (princ) { 1040 if (princ) {
965 clp->cl_principal = kstrdup(princ, GFP_KERNEL); 1041 clp->cl_principal = kstrdup(princ, GFP_KERNEL);
@@ -976,8 +1052,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
976 INIT_LIST_HEAD(&clp->cl_strhash); 1052 INIT_LIST_HEAD(&clp->cl_strhash);
977 INIT_LIST_HEAD(&clp->cl_openowners); 1053 INIT_LIST_HEAD(&clp->cl_openowners);
978 INIT_LIST_HEAD(&clp->cl_delegations); 1054 INIT_LIST_HEAD(&clp->cl_delegations);
979 INIT_LIST_HEAD(&clp->cl_sessions);
980 INIT_LIST_HEAD(&clp->cl_lru); 1055 INIT_LIST_HEAD(&clp->cl_lru);
1056 spin_lock_init(&clp->cl_lock);
1057 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
981 clp->cl_time = get_seconds(); 1058 clp->cl_time = get_seconds();
982 clear_bit(0, &clp->cl_cb_slot_busy); 1059 clear_bit(0, &clp->cl_cb_slot_busy);
983 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1060 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -986,7 +1063,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
986 clp->cl_flavor = rqstp->rq_flavor; 1063 clp->cl_flavor = rqstp->rq_flavor;
987 copy_cred(&clp->cl_cred, &rqstp->rq_cred); 1064 copy_cred(&clp->cl_cred, &rqstp->rq_cred);
988 gen_confirm(clp); 1065 gen_confirm(clp);
989
 1066 clp->cl_cb_session = NULL;
990 return clp; 1067 return clp;
991} 1068}
992 1069
@@ -1098,7 +1175,7 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
1098static void 1175static void
1099gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) 1176gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
1100{ 1177{
1101 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 1178 struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
1102 unsigned short expected_family; 1179 unsigned short expected_family;
1103 1180
1104 /* Currently, we only support tcp and tcp6 for the callback channel */ 1181 /* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1111,24 +1188,23 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
1111 else 1188 else
1112 goto out_err; 1189 goto out_err;
1113 1190
1114 cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
1115 se->se_callback_addr_len,
1116 (struct sockaddr *) &cb->cb_addr,
1117 sizeof(cb->cb_addr));
 1191 conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
 1192 se->se_callback_addr_len,
 1193 (struct sockaddr *)&conn->cb_addr,
 1194 sizeof(conn->cb_addr));
1118 1195
1119 if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family)
 1196 if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family)
1120 goto out_err; 1197 goto out_err;
1121 1198
1122 if (cb->cb_addr.ss_family == AF_INET6)
1123 ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid;
 1199 if (conn->cb_addr.ss_family == AF_INET6)
 1200 ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid;
1124 1201
1125 cb->cb_minorversion = 0;
1126 cb->cb_prog = se->se_callback_prog;
1127 cb->cb_ident = se->se_callback_ident;
 1202 conn->cb_prog = se->se_callback_prog;
 1203 conn->cb_ident = se->se_callback_ident;
1128 return; 1204 return;
1129out_err: 1205out_err:
1130 cb->cb_addr.ss_family = AF_UNSPEC;
1131 cb->cb_addrlen = 0;
 1206 conn->cb_addr.ss_family = AF_UNSPEC;
 1207 conn->cb_addrlen = 0;
1132 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " 1208 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
1133 "will not receive delegations\n", 1209 "will not receive delegations\n",
1134 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); 1210 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
@@ -1415,7 +1491,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1415{ 1491{
1416 struct sockaddr *sa = svc_addr(rqstp); 1492 struct sockaddr *sa = svc_addr(rqstp);
1417 struct nfs4_client *conf, *unconf; 1493 struct nfs4_client *conf, *unconf;
1494 struct nfsd4_session *new;
1418 struct nfsd4_clid_slot *cs_slot = NULL; 1495 struct nfsd4_clid_slot *cs_slot = NULL;
1496 bool confirm_me = false;
1419 int status = 0; 1497 int status = 0;
1420 1498
1421 nfs4_lock_state(); 1499 nfs4_lock_state();
@@ -1438,7 +1516,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1438 cs_slot->sl_seqid, cr_ses->seqid); 1516 cs_slot->sl_seqid, cr_ses->seqid);
1439 goto out; 1517 goto out;
1440 } 1518 }
1441 cs_slot->sl_seqid++;
1442 } else if (unconf) { 1519 } else if (unconf) {
1443 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || 1520 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1444 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { 1521 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1451,25 +1528,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1451 if (status) { 1528 if (status) {
1452 /* an unconfirmed replay returns misordered */ 1529 /* an unconfirmed replay returns misordered */
1453 status = nfserr_seq_misordered; 1530 status = nfserr_seq_misordered;
1454 goto out_cache;
 1531 goto out;
1455 } 1532 }
1456 1533
1457 cs_slot->sl_seqid++; /* from 0 to 1 */
 1534 confirm_me = true;
1458 move_to_confirmed(unconf);
1459
1460 if (cr_ses->flags & SESSION4_BACK_CHAN) {
1461 unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
1462 svc_xprt_get(rqstp->rq_xprt);
1463 rpc_copy_addr(
1464 (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
1465 sa);
1466 unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
1467 unconf->cl_cb_conn.cb_minorversion =
1468 cstate->minorversion;
1469 unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
1470 unconf->cl_cb_seq_nr = 1;
1471 nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
1472 }
1473 conf = unconf; 1535 conf = unconf;
1474 } else { 1536 } else {
1475 status = nfserr_stale_clientid; 1537 status = nfserr_stale_clientid;
@@ -1477,22 +1539,30 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1477 } 1539 }
1478 1540
1479 /* 1541 /*
1542 * XXX: we should probably set this at creation time, and check
1543 * for consistent minorversion use throughout:
1544 */
1545 conf->cl_minorversion = 1;
1546 /*
1480 * We do not support RDMA or persistent sessions 1547 * We do not support RDMA or persistent sessions
1481 */ 1548 */
1482 cr_ses->flags &= ~SESSION4_PERSIST; 1549 cr_ses->flags &= ~SESSION4_PERSIST;
1483 cr_ses->flags &= ~SESSION4_RDMA; 1550 cr_ses->flags &= ~SESSION4_RDMA;
1484 1551
1485 status = alloc_init_session(rqstp, conf, cr_ses);
1486 if (status)
1487 goto out;
1488
1489 memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
1490 NFS4_MAX_SESSIONID_LEN);
 1552 status = nfserr_jukebox;
 1553 new = alloc_init_session(rqstp, conf, cr_ses);
 1554 if (!new)
 1555 goto out;
 1556 status = nfs_ok;
 1557 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
 1558 NFS4_MAX_SESSIONID_LEN);
 1559 cs_slot->sl_seqid++;
1491 cr_ses->seqid = cs_slot->sl_seqid; 1560 cr_ses->seqid = cs_slot->sl_seqid;
1492 1561
1493out_cache:
1494 /* cache solo and embedded create sessions under the state lock */ 1562 /* cache solo and embedded create sessions under the state lock */
1495 nfsd4_cache_create_session(cr_ses, cs_slot, status); 1563 nfsd4_cache_create_session(cr_ses, cs_slot, status);
1564 if (confirm_me)
1565 move_to_confirmed(conf);
1496out: 1566out:
1497 nfs4_unlock_state(); 1567 nfs4_unlock_state();
1498 dprintk("%s returns %d\n", __func__, ntohl(status)); 1568 dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1546,8 +1616,11 @@ nfsd4_destroy_session(struct svc_rqst *r,
1546 1616
1547 nfs4_lock_state(); 1617 nfs4_lock_state();
1548 /* wait for callbacks */ 1618 /* wait for callbacks */
1549 nfsd4_set_callback_client(ses->se_client, NULL);
 1619 nfsd4_shutdown_callback(ses->se_client);
1550 nfs4_unlock_state(); 1620 nfs4_unlock_state();
1621
1622 nfsd4_del_conns(ses);
1623
1551 nfsd4_put_session(ses); 1624 nfsd4_put_session(ses);
1552 status = nfs_ok; 1625 status = nfs_ok;
1553out: 1626out:
@@ -1555,6 +1628,36 @@ out:
1555 return status; 1628 return status;
1556} 1629}
1557 1630
1631static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
1632{
1633 struct nfsd4_conn *c;
1634
1635 list_for_each_entry(c, &s->se_conns, cn_persession) {
1636 if (c->cn_xprt == xpt) {
1637 return c;
1638 }
1639 }
1640 return NULL;
1641}
1642
1643static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
1644{
1645 struct nfs4_client *clp = ses->se_client;
1646 struct nfsd4_conn *c;
1647
1648 spin_lock(&clp->cl_lock);
1649 c = __nfsd4_find_conn(new->cn_xprt, ses);
1650 if (c) {
1651 spin_unlock(&clp->cl_lock);
1652 free_conn(new);
1653 return;
1654 }
1655 __nfsd4_hash_conn(new, ses);
1656 spin_unlock(&clp->cl_lock);
1657 nfsd4_register_conn(new);
1658 return;
1659}
1660
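nfsd4_sequence_check_conn() above depends on the caller pre-allocating the connection before any spinlock is taken; when the transport turns out to be hashed already, the new object is simply discarded. The same idiom in a generic form (all names below are illustrative):

/* Allocate outside the lock (allocation may sleep), then either install
 * the object or throw it away if another thread raced us. */
static struct item *insert_or_reuse(struct table *t, unsigned long key)
{
	struct item *new = kzalloc(sizeof(*new), GFP_KERNEL);
	struct item *old;

	if (!new)
		return NULL;
	new->key = key;

	spin_lock(&t->lock);
	old = table_lookup_locked(t, key);	/* assumed helper */
	if (!old)
		table_insert_locked(t, new);	/* assumed helper */
	spin_unlock(&t->lock);

	if (old) {
		kfree(new);			/* lost the race */
		return old;
	}
	return new;
}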
1558__be32 1661__be32
1559nfsd4_sequence(struct svc_rqst *rqstp, 1662nfsd4_sequence(struct svc_rqst *rqstp,
1560 struct nfsd4_compound_state *cstate, 1663 struct nfsd4_compound_state *cstate,
@@ -1563,11 +1666,20 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1563 struct nfsd4_compoundres *resp = rqstp->rq_resp; 1666 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1564 struct nfsd4_session *session; 1667 struct nfsd4_session *session;
1565 struct nfsd4_slot *slot; 1668 struct nfsd4_slot *slot;
1669 struct nfsd4_conn *conn;
1566 int status; 1670 int status;
1567 1671
1568 if (resp->opcnt != 1) 1672 if (resp->opcnt != 1)
1569 return nfserr_sequence_pos; 1673 return nfserr_sequence_pos;
1570 1674
1675 /*
1676 * Will be either used or freed by nfsd4_sequence_check_conn
1677 * below.
1678 */
1679 conn = alloc_conn(rqstp, NFS4_CDFC4_FORE);
1680 if (!conn)
1681 return nfserr_jukebox;
1682
1571 spin_lock(&client_lock); 1683 spin_lock(&client_lock);
1572 status = nfserr_badsession; 1684 status = nfserr_badsession;
1573 session = find_in_sessionid_hashtbl(&seq->sessionid); 1685 session = find_in_sessionid_hashtbl(&seq->sessionid);
@@ -1599,6 +1711,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1599 if (status) 1711 if (status)
1600 goto out; 1712 goto out;
1601 1713
1714 nfsd4_sequence_check_conn(conn, session);
1715 conn = NULL;
1716
1602 /* Success! bump slot seqid */ 1717 /* Success! bump slot seqid */
1603 slot->sl_inuse = true; 1718 slot->sl_inuse = true;
1604 slot->sl_seqid = seq->seqid; 1719 slot->sl_seqid = seq->seqid;
@@ -1613,6 +1728,7 @@ out:
1613 nfsd4_get_session(cstate->session); 1728 nfsd4_get_session(cstate->session);
1614 atomic_inc(&session->se_client->cl_refcount); 1729 atomic_inc(&session->se_client->cl_refcount);
1615 } 1730 }
1731 kfree(conn);
1616 spin_unlock(&client_lock); 1732 spin_unlock(&client_lock);
1617 dprintk("%s: return %d\n", __func__, ntohl(status)); 1733 dprintk("%s: return %d\n", __func__, ntohl(status));
1618 return status; 1734 return status;
@@ -1747,6 +1863,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1747 goto out; 1863 goto out;
1748 gen_clid(new); 1864 gen_clid(new);
1749 } 1865 }
1866 /*
1867 * XXX: we should probably set this at creation time, and check
1868 * for consistent minorversion use throughout:
1869 */
1870 new->cl_minorversion = 0;
1750 gen_callback(new, setclid, rpc_get_scope_id(sa)); 1871 gen_callback(new, setclid, rpc_get_scope_id(sa));
1751 add_to_unconfirmed(new, strhashval); 1872 add_to_unconfirmed(new, strhashval);
1752 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 1873 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
@@ -1807,7 +1928,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1807 status = nfserr_clid_inuse; 1928 status = nfserr_clid_inuse;
1808 else { 1929 else {
1809 atomic_set(&conf->cl_cb_set, 0); 1930 atomic_set(&conf->cl_cb_set, 0);
1810 nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
 1931 nfsd4_change_callback(conf, &unconf->cl_cb_conn);
1932 nfsd4_probe_callback(conf);
1811 expire_client(unconf); 1933 expire_client(unconf);
1812 status = nfs_ok; 1934 status = nfs_ok;
1813 1935
@@ -1841,7 +1963,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1841 } 1963 }
1842 move_to_confirmed(unconf); 1964 move_to_confirmed(unconf);
1843 conf = unconf; 1965 conf = unconf;
1844 nfsd4_probe_callback(conf, &conf->cl_cb_conn);
 1966 nfsd4_probe_callback(conf);
1845 status = nfs_ok; 1967 status = nfs_ok;
1846 } 1968 }
1847 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) 1969 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -2492,7 +2614,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2492 struct nfs4_delegation *dp; 2614 struct nfs4_delegation *dp;
2493 struct nfs4_stateowner *sop = stp->st_stateowner; 2615 struct nfs4_stateowner *sop = stp->st_stateowner;
2494 int cb_up = atomic_read(&sop->so_client->cl_cb_set); 2616 int cb_up = atomic_read(&sop->so_client->cl_cb_set);
2495 struct file_lock fl, *flp = &fl;
 2617 struct file_lock *fl;
2496 int status, flag = 0; 2618 int status, flag = 0;
2497 2619
2498 flag = NFS4_OPEN_DELEGATE_NONE; 2620 flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2526,20 +2648,24 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2526 flag = NFS4_OPEN_DELEGATE_NONE; 2648 flag = NFS4_OPEN_DELEGATE_NONE;
2527 goto out; 2649 goto out;
2528 } 2650 }
2529 locks_init_lock(&fl);
2530 fl.fl_lmops = &nfsd_lease_mng_ops;
2531 fl.fl_flags = FL_LEASE;
2532 fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2533 fl.fl_end = OFFSET_MAX;
2534 fl.fl_owner = (fl_owner_t)dp;
2535 fl.fl_file = find_readable_file(stp->st_file);
2536 BUG_ON(!fl.fl_file);
2537 fl.fl_pid = current->tgid;
 2651 status = -ENOMEM;
 2652 fl = locks_alloc_lock();
 2653 if (!fl)
 2654 goto out;
 2655 locks_init_lock(fl);
 2656 fl->fl_lmops = &nfsd_lease_mng_ops;
 2657 fl->fl_flags = FL_LEASE;
 2658 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
 2659 fl->fl_end = OFFSET_MAX;
 2660 fl->fl_owner = (fl_owner_t)dp;
 2661 fl->fl_file = find_readable_file(stp->st_file);
 2662 BUG_ON(!fl->fl_file);
 2663 fl->fl_pid = current->tgid;
2538 2664
2539 /* vfs_setlease checks to see if delegation should be handed out. 2665 /* vfs_setlease checks to see if delegation should be handed out.
2540 * the lock_manager callbacks fl_mylease and fl_change are used 2666 * the lock_manager callbacks fl_mylease and fl_change are used
2541 */ 2667 */
2542 if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) {
 2668 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2543 dprintk("NFSD: setlease failed [%d], no delegation\n", status); 2669 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
2544 unhash_delegation(dp); 2670 unhash_delegation(dp);
2545 flag = NFS4_OPEN_DELEGATE_NONE; 2671 flag = NFS4_OPEN_DELEGATE_NONE;
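The switch from an on-stack struct file_lock to locks_alloc_lock() above is an ownership fix: once vfs_setlease() accepts a lease it may keep the passed lock on the inode's lease list long after this stack frame is gone, so the lock has to live on the heap. A sketch of the caller-side contract (error handling abbreviated; whether the caller or the VFS frees the lock on failure depends on this kernel's vfs_setlease() contract, so the free below is an assumption):

static int hand_out_lease(struct file *filp, int type, fl_owner_t owner)
{
	struct file_lock *fl = locks_alloc_lock();
	int err;

	if (!fl)
		return -ENOMEM;
	locks_init_lock(fl);
	fl->fl_flags = FL_LEASE;
	fl->fl_type  = type;
	fl->fl_end   = OFFSET_MAX;
	fl->fl_owner = owner;
	fl->fl_file  = filp;

	err = vfs_setlease(filp, type, &fl);	/* may take ownership of fl */
	if (err)
		locks_free_lock(fl);		/* assumed: still ours */
	return err;
}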
@@ -2944,7 +3070,11 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2944 if (STALE_STATEID(stateid)) 3070 if (STALE_STATEID(stateid))
2945 goto out; 3071 goto out;
2946 3072
2947 status = nfserr_bad_stateid;
 3073 /*
3074 * We assume that any stateid that has the current boot time,
3075 * but that we can't find, is expired:
3076 */
3077 status = nfserr_expired;
2948 if (is_delegation_stateid(stateid)) { 3078 if (is_delegation_stateid(stateid)) {
2949 dp = find_delegation_stateid(ino, stateid); 3079 dp = find_delegation_stateid(ino, stateid);
2950 if (!dp) 3080 if (!dp)
@@ -2964,6 +3094,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2964 stp = find_stateid(stateid, flags); 3094 stp = find_stateid(stateid, flags);
2965 if (!stp) 3095 if (!stp)
2966 goto out; 3096 goto out;
3097 status = nfserr_bad_stateid;
2967 if (nfs4_check_fh(current_fh, stp)) 3098 if (nfs4_check_fh(current_fh, stp))
2968 goto out; 3099 goto out;
2969 if (!stp->st_stateowner->so_confirmed) 3100 if (!stp->st_stateowner->so_confirmed)
@@ -3038,8 +3169,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3038 * a replayed close: 3169 * a replayed close:
3039 */ 3170 */
3040 sop = search_close_lru(stateid->si_stateownerid, flags); 3171 sop = search_close_lru(stateid->si_stateownerid, flags);
3172 /* It's not stale; let's assume it's expired: */
3041 if (sop == NULL) 3173 if (sop == NULL)
3042 return nfserr_bad_stateid;
 3174 return nfserr_expired;
3043 *sopp = sop; 3175 *sopp = sop;
3044 goto check_replay; 3176 goto check_replay;
3045 } 3177 }
@@ -3304,6 +3436,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3304 status = nfserr_bad_stateid; 3436 status = nfserr_bad_stateid;
3305 if (!is_delegation_stateid(stateid)) 3437 if (!is_delegation_stateid(stateid))
3306 goto out; 3438 goto out;
3439 status = nfserr_expired;
3307 dp = find_delegation_stateid(inode, stateid); 3440 dp = find_delegation_stateid(inode, stateid);
3308 if (!dp) 3441 if (!dp)
3309 goto out; 3442 goto out;
@@ -3895,7 +4028,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
3895 struct inode *inode = filp->fi_inode; 4028 struct inode *inode = filp->fi_inode;
3896 int status = 0; 4029 int status = 0;
3897 4030
3898 lock_kernel();
 4031 lock_flocks();
3899 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 4032 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
3900 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 4033 if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
3901 status = 1; 4034 status = 1;
@@ -3903,7 +4036,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
3903 } 4036 }
3904 } 4037 }
3905out: 4038out:
3906 unlock_kernel();
 4039 unlock_flocks();
3907 return status; 4040 return status;
3908} 4041}
3909 4042
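lock_kernel()/unlock_kernel() around the i_flock walk is replaced here by lock_flocks()/unlock_flocks(), the transitional interface introduced while the Big Kernel Lock was removed from file locking (initially a BKL wrapper, later a spinlock). The replacement is drop-in for any i_flock traversal; a self-contained sketch of the pattern:

/* Sketch: check whether the given owner still holds any lock on inode. */
static int owner_holds_lock(struct inode *inode, fl_owner_t owner)
{
	struct file_lock *fl;
	int held = 0;

	lock_flocks();				/* global file-lock lock */
	for (fl = inode->i_flock; fl; fl = fl->fl_next)
		if (fl->fl_owner == owner)
			held = 1;
	unlock_flocks();
	return held;
}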
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1a468bbd330f..f35a94a04026 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1805,19 +1805,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1805 goto out_nfserr; 1805 goto out_nfserr;
1806 } 1806 }
1807 } 1807 }
1808 if ((buflen -= 16) < 0)
1809 goto out_resource;
1810 1808
1811 if (unlikely(bmval2)) {
 1809 if (bmval2) {
1810 if ((buflen -= 16) < 0)
1811 goto out_resource;
1812 WRITE32(3); 1812 WRITE32(3);
1813 WRITE32(bmval0); 1813 WRITE32(bmval0);
1814 WRITE32(bmval1); 1814 WRITE32(bmval1);
1815 WRITE32(bmval2); 1815 WRITE32(bmval2);
1816 } else if (likely(bmval1)) {
 1816 } else if (bmval1) {
1817 if ((buflen -= 12) < 0)
1818 goto out_resource;
1817 WRITE32(2); 1819 WRITE32(2);
1818 WRITE32(bmval0); 1820 WRITE32(bmval0);
1819 WRITE32(bmval1); 1821 WRITE32(bmval1);
1820 } else { 1822 } else {
1823 if ((buflen -= 8) < 0)
1824 goto out_resource;
1821 WRITE32(1); 1825 WRITE32(1);
1822 WRITE32(bmval0); 1826 WRITE32(bmval0);
1823 } 1827 }
@@ -1828,15 +1832,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1828 u32 word1 = nfsd_suppattrs1(minorversion); 1832 u32 word1 = nfsd_suppattrs1(minorversion);
1829 u32 word2 = nfsd_suppattrs2(minorversion); 1833 u32 word2 = nfsd_suppattrs2(minorversion);
1830 1834
1831 if ((buflen -= 12) < 0)
1832 goto out_resource;
1833 if (!aclsupport) 1835 if (!aclsupport)
1834 word0 &= ~FATTR4_WORD0_ACL; 1836 word0 &= ~FATTR4_WORD0_ACL;
1835 if (!word2) { 1837 if (!word2) {
1838 if ((buflen -= 12) < 0)
1839 goto out_resource;
1836 WRITE32(2); 1840 WRITE32(2);
1837 WRITE32(word0); 1841 WRITE32(word0);
1838 WRITE32(word1); 1842 WRITE32(word1);
1839 } else { 1843 } else {
1844 if ((buflen -= 16) < 0)
1845 goto out_resource;
1840 WRITE32(3); 1846 WRITE32(3);
1841 WRITE32(word0); 1847 WRITE32(word0);
1842 WRITE32(word1); 1848 WRITE32(word1);
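Both nfs4xdr.c hunks move the buflen check into each branch so the encoder reserves exactly what it writes: the old code charged one fixed amount regardless of which form was emitted, over-reserving the short forms and, in the supported-attributes case, under-reserving the four-word form. The shape of the fixed pattern, as a user-space model (htonl stands in for the WRITE32 macro; this is illustrative, not the nfsd encoder):

#include <stdint.h>
#include <arpa/inet.h>

/* Returns bytes written, or -1 when the reply buffer would overflow. */
static int encode_bitmap(uint32_t *p, int *buflen,
			 uint32_t w0, uint32_t w1, uint32_t w2)
{
	int words = w2 ? 4 : w1 ? 3 : 2;	/* length word + bitmap words */

	if ((*buflen -= 4 * words) < 0)
		return -1;			/* out of reply space */
	*p++ = htonl(words - 1);		/* bitmap length */
	*p++ = htonl(w0);
	if (words > 2)
		*p++ = htonl(w1);
	if (words > 3)
		*p++ = htonl(w2);
	return 4 * words;
}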
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b53b1d042f1f..d6dc3f61f8ba 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -22,6 +22,7 @@
22 */ 22 */
23enum { 23enum {
24 NFSD_Root = 1, 24 NFSD_Root = 1,
25#ifdef CONFIG_NFSD_DEPRECATED
25 NFSD_Svc, 26 NFSD_Svc,
26 NFSD_Add, 27 NFSD_Add,
27 NFSD_Del, 28 NFSD_Del,
@@ -29,6 +30,7 @@ enum {
29 NFSD_Unexport, 30 NFSD_Unexport,
30 NFSD_Getfd, 31 NFSD_Getfd,
31 NFSD_Getfs, 32 NFSD_Getfs,
33#endif
32 NFSD_List, 34 NFSD_List,
33 NFSD_Export_features, 35 NFSD_Export_features,
34 NFSD_Fh, 36 NFSD_Fh,
@@ -54,6 +56,7 @@ enum {
54/* 56/*
55 * write() for these nodes. 57 * write() for these nodes.
56 */ 58 */
59#ifdef CONFIG_NFSD_DEPRECATED
57static ssize_t write_svc(struct file *file, char *buf, size_t size); 60static ssize_t write_svc(struct file *file, char *buf, size_t size);
58static ssize_t write_add(struct file *file, char *buf, size_t size); 61static ssize_t write_add(struct file *file, char *buf, size_t size);
59static ssize_t write_del(struct file *file, char *buf, size_t size); 62static ssize_t write_del(struct file *file, char *buf, size_t size);
@@ -61,6 +64,7 @@ static ssize_t write_export(struct file *file, char *buf, size_t size);
61static ssize_t write_unexport(struct file *file, char *buf, size_t size); 64static ssize_t write_unexport(struct file *file, char *buf, size_t size);
62static ssize_t write_getfd(struct file *file, char *buf, size_t size); 65static ssize_t write_getfd(struct file *file, char *buf, size_t size);
63static ssize_t write_getfs(struct file *file, char *buf, size_t size); 66static ssize_t write_getfs(struct file *file, char *buf, size_t size);
67#endif
64static ssize_t write_filehandle(struct file *file, char *buf, size_t size); 68static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
65static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); 69static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
66static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); 70static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
@@ -76,6 +80,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
76#endif 80#endif
77 81
78static ssize_t (*write_op[])(struct file *, char *, size_t) = { 82static ssize_t (*write_op[])(struct file *, char *, size_t) = {
83#ifdef CONFIG_NFSD_DEPRECATED
79 [NFSD_Svc] = write_svc, 84 [NFSD_Svc] = write_svc,
80 [NFSD_Add] = write_add, 85 [NFSD_Add] = write_add,
81 [NFSD_Del] = write_del, 86 [NFSD_Del] = write_del,
@@ -83,6 +88,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
83 [NFSD_Unexport] = write_unexport, 88 [NFSD_Unexport] = write_unexport,
84 [NFSD_Getfd] = write_getfd, 89 [NFSD_Getfd] = write_getfd,
85 [NFSD_Getfs] = write_getfs, 90 [NFSD_Getfs] = write_getfs,
91#endif
86 [NFSD_Fh] = write_filehandle, 92 [NFSD_Fh] = write_filehandle,
87 [NFSD_FO_UnlockIP] = write_unlock_ip, 93 [NFSD_FO_UnlockIP] = write_unlock_ip,
88 [NFSD_FO_UnlockFS] = write_unlock_fs, 94 [NFSD_FO_UnlockFS] = write_unlock_fs,
@@ -121,6 +127,14 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
121 127
122static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) 128static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
123{ 129{
130 static int warned;
131 if (file->f_dentry->d_name.name[0] == '.' && !warned) {
132 printk(KERN_INFO
133 "Warning: \"%s\" uses deprecated NFSD interface: %s."
134 " This will be removed in 2.6.40\n",
135 current->comm, file->f_dentry->d_name.name);
136 warned = 1;
137 }
124 if (! file->private_data) { 138 if (! file->private_data) {
125 /* An attempt to read a transaction file without writing 139 /* An attempt to read a transaction file without writing
126 * causes a 0-byte write so that the file can return 140 * causes a 0-byte write so that the file can return
@@ -137,6 +151,7 @@ static const struct file_operations transaction_ops = {
137 .write = nfsctl_transaction_write, 151 .write = nfsctl_transaction_write,
138 .read = nfsctl_transaction_read, 152 .read = nfsctl_transaction_read,
139 .release = simple_transaction_release, 153 .release = simple_transaction_release,
154 .llseek = default_llseek,
140}; 155};
141 156
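The read path above open-codes a log-once guard with a static flag; the check-then-set is racy, but the worst case is the message printing twice, which is harmless for a deprecation notice. Kernels of this era could also express the same thing with printk_once(); an equivalent sketch of the open-coded form:

/* One-shot warning idiom (sketch): */
static void warn_deprecated(const char *name, const char *comm)
{
	static int warned;

	if (!warned) {
		warned = 1;	/* benign race: at worst logs twice */
		printk(KERN_INFO "Warning: \"%s\" uses deprecated NFSD "
		       "interface: %s.\n", comm, name);
	}
}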
142static int exports_open(struct inode *inode, struct file *file) 157static int exports_open(struct inode *inode, struct file *file)
@@ -186,6 +201,7 @@ static const struct file_operations pool_stats_operations = {
186 * payload - write methods 201 * payload - write methods
187 */ 202 */
188 203
204#ifdef CONFIG_NFSD_DEPRECATED
189/** 205/**
190 * write_svc - Start kernel's NFSD server 206 * write_svc - Start kernel's NFSD server
191 * 207 *
@@ -401,7 +417,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
401 417
402 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); 418 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
403 419
404 clp = auth_unix_lookup(&in6);
 420 clp = auth_unix_lookup(&init_net, &in6);
405 if (!clp) 421 if (!clp)
406 err = -EPERM; 422 err = -EPERM;
407 else { 423 else {
@@ -464,7 +480,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
464 480
465 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); 481 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
466 482
467 clp = auth_unix_lookup(&in6);
 483 clp = auth_unix_lookup(&init_net, &in6);
468 if (!clp) 484 if (!clp)
469 err = -EPERM; 485 err = -EPERM;
470 else { 486 else {
@@ -481,6 +497,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
481 out: 497 out:
482 return err; 498 return err;
483} 499}
500#endif /* CONFIG_NFSD_DEPRECATED */
484 501
485/** 502/**
486 * write_unlock_ip - Release all locks used by a client 503 * write_unlock_ip - Release all locks used by a client
@@ -999,12 +1016,12 @@ static ssize_t __write_ports_addxprt(char *buf)
999 if (err != 0) 1016 if (err != 0)
1000 return err; 1017 return err;
1001 1018
1002 err = svc_create_xprt(nfsd_serv, transport,
 1019 err = svc_create_xprt(nfsd_serv, transport, &init_net,
1003 PF_INET, port, SVC_SOCK_ANONYMOUS); 1020 PF_INET, port, SVC_SOCK_ANONYMOUS);
1004 if (err < 0) 1021 if (err < 0)
1005 goto out_err; 1022 goto out_err;
1006 1023
1007 err = svc_create_xprt(nfsd_serv, transport,
 1024 err = svc_create_xprt(nfsd_serv, transport, &init_net,
1008 PF_INET6, port, SVC_SOCK_ANONYMOUS); 1025 PF_INET6, port, SVC_SOCK_ANONYMOUS);
1009 if (err < 0 && err != -EAFNOSUPPORT) 1026 if (err < 0 && err != -EAFNOSUPPORT)
1010 goto out_close; 1027 goto out_close;
@@ -1355,6 +1372,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
1355static int nfsd_fill_super(struct super_block * sb, void * data, int silent) 1372static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1356{ 1373{
1357 static struct tree_descr nfsd_files[] = { 1374 static struct tree_descr nfsd_files[] = {
1375#ifdef CONFIG_NFSD_DEPRECATED
1358 [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR}, 1376 [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR},
1359 [NFSD_Add] = {".add", &transaction_ops, S_IWUSR}, 1377 [NFSD_Add] = {".add", &transaction_ops, S_IWUSR},
1360 [NFSD_Del] = {".del", &transaction_ops, S_IWUSR}, 1378 [NFSD_Del] = {".del", &transaction_ops, S_IWUSR},
@@ -1362,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1362 [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR}, 1380 [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR},
1363 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, 1381 [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
1364 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, 1382 [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
1383#endif
1365 [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, 1384 [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
1366 [NFSD_Export_features] = {"export_features", 1385 [NFSD_Export_features] = {"export_features",
1367 &export_features_operations, S_IRUGO}, 1386 &export_features_operations, S_IRUGO},
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index b76ac3a82e39..6b641cf2c19a 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -249,7 +249,7 @@ extern time_t nfsd4_grace;
249#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ 249#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
250#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ 250#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
251 251
252#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */
 252#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
253 253
254/* 254/*
255 * The following attributes are currently not supported by the NFSv4 server: 255 * The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index cdfb8c6a4206..c16f8d8331b5 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp)
196static inline void 196static inline void
197fh_unlock(struct svc_fh *fhp) 197fh_unlock(struct svc_fh *fhp)
198{ 198{
199 BUG_ON(!fhp->fh_dentry);
200
201 if (fhp->fh_locked) { 199 if (fhp->fh_locked) {
202 fill_post_wcc(fhp); 200 fill_post_wcc(fhp);
203 mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); 201 mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index e2c43464f237..2bae1d86f5f2 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -16,6 +16,7 @@
16#include <linux/lockd/bind.h> 16#include <linux/lockd/bind.h>
17#include <linux/nfsacl.h> 17#include <linux/nfsacl.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <net/net_namespace.h>
19#include "nfsd.h" 20#include "nfsd.h"
20#include "cache.h" 21#include "cache.h"
21#include "vfs.h" 22#include "vfs.h"
@@ -186,12 +187,12 @@ static int nfsd_init_socks(int port)
186 if (!list_empty(&nfsd_serv->sv_permsocks)) 187 if (!list_empty(&nfsd_serv->sv_permsocks))
187 return 0; 188 return 0;
188 189
189 error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
 190 error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port,
190 SVC_SOCK_DEFAULTS); 191 SVC_SOCK_DEFAULTS);
191 if (error < 0) 192 if (error < 0)
192 return error; 193 return error;
193 194
194 error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
 195 error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port,
195 SVC_SOCK_DEFAULTS); 196 SVC_SOCK_DEFAULTS);
196 if (error < 0) 197 if (error < 0)
197 return error; 198 return error;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 322518c88e4b..39adc27b0685 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
35#ifndef _NFSD4_STATE_H 35#ifndef _NFSD4_STATE_H
36#define _NFSD4_STATE_H 36#define _NFSD4_STATE_H
37 37
38#include <linux/sunrpc/svc_xprt.h>
38#include <linux/nfsd/nfsfh.h> 39#include <linux/nfsd/nfsfh.h>
39#include "nfsfh.h" 40#include "nfsfh.h"
40 41
@@ -64,19 +65,12 @@ typedef struct {
64 (s)->si_fileid, \ 65 (s)->si_fileid, \
65 (s)->si_generation 66 (s)->si_generation
66 67
67struct nfsd4_cb_sequence {
68 /* args/res */
69 u32 cbs_minorversion;
70 struct nfs4_client *cbs_clp;
71};
72
73struct nfs4_rpc_args {
74 void *args_op;
75 struct nfsd4_cb_sequence args_seq;
76};
77
78struct nfsd4_callback { 68struct nfsd4_callback {
79 struct nfs4_rpc_args cb_args;
 69 void *cb_op;
70 struct nfs4_client *cb_clp;
71 u32 cb_minorversion;
72 struct rpc_message cb_msg;
73 const struct rpc_call_ops *cb_ops;
80 struct work_struct cb_work; 74 struct work_struct cb_work;
81}; 75};
82 76
@@ -91,7 +85,6 @@ struct nfs4_delegation {
91 u32 dl_type; 85 u32 dl_type;
92 time_t dl_time; 86 time_t dl_time;
93/* For recall: */ 87/* For recall: */
94 u32 dl_ident;
95 stateid_t dl_stateid; 88 stateid_t dl_stateid;
96 struct knfsd_fh dl_fh; 89 struct knfsd_fh dl_fh;
97 int dl_retries; 90 int dl_retries;
@@ -103,8 +96,8 @@ struct nfs4_cb_conn {
103 /* SETCLIENTID info */ 96 /* SETCLIENTID info */
104 struct sockaddr_storage cb_addr; 97 struct sockaddr_storage cb_addr;
105 size_t cb_addrlen; 98 size_t cb_addrlen;
106 u32 cb_prog;
107 u32 cb_minorversion;
 99 u32 cb_prog; /* used only in 4.0 case;
 100 per-session otherwise */
108 u32 cb_ident; /* minorversion 0 only */ 101 u32 cb_ident; /* minorversion 0 only */
109 struct svc_xprt *cb_xprt; /* minorversion 1 only */ 102 struct svc_xprt *cb_xprt; /* minorversion 1 only */
110}; 103};
@@ -160,6 +153,15 @@ struct nfsd4_clid_slot {
160 struct nfsd4_create_session sl_cr_ses; 153 struct nfsd4_create_session sl_cr_ses;
161}; 154};
162 155
156struct nfsd4_conn {
157 struct list_head cn_persession;
158 struct svc_xprt *cn_xprt;
159 struct svc_xpt_user cn_xpt_user;
160 struct nfsd4_session *cn_session;
161/* CDFC4_FORE, CDFC4_BACK: */
162 unsigned char cn_flags;
163};
164
163struct nfsd4_session { 165struct nfsd4_session {
164 struct kref se_ref; 166 struct kref se_ref;
165 struct list_head se_hash; /* hash by sessionid */ 167 struct list_head se_hash; /* hash by sessionid */
@@ -169,6 +171,9 @@ struct nfsd4_session {
169 struct nfs4_sessionid se_sessionid; 171 struct nfs4_sessionid se_sessionid;
170 struct nfsd4_channel_attrs se_fchannel; 172 struct nfsd4_channel_attrs se_fchannel;
171 struct nfsd4_channel_attrs se_bchannel; 173 struct nfsd4_channel_attrs se_bchannel;
174 struct list_head se_conns;
175 u32 se_cb_prog;
176 u32 se_cb_seq_nr;
172 struct nfsd4_slot *se_slots[]; /* forward channel slots */ 177 struct nfsd4_slot *se_slots[]; /* forward channel slots */
173}; 178};
174 179
@@ -221,24 +226,32 @@ struct nfs4_client {
221 clientid_t cl_clientid; /* generated by server */ 226 clientid_t cl_clientid; /* generated by server */
222 nfs4_verifier cl_confirm; /* generated by server */ 227 nfs4_verifier cl_confirm; /* generated by server */
223 u32 cl_firststate; /* recovery dir creation */ 228 u32 cl_firststate; /* recovery dir creation */
229 u32 cl_minorversion;
224 230
225 /* for v4.0 and v4.1 callbacks: */ 231 /* for v4.0 and v4.1 callbacks: */
226 struct nfs4_cb_conn cl_cb_conn; 232 struct nfs4_cb_conn cl_cb_conn;
233#define NFSD4_CLIENT_CB_UPDATE 1
234#define NFSD4_CLIENT_KILL 2
235 unsigned long cl_cb_flags;
227 struct rpc_clnt *cl_cb_client; 236 struct rpc_clnt *cl_cb_client;
237 u32 cl_cb_ident;
228 atomic_t cl_cb_set; 238 atomic_t cl_cb_set;
239 struct nfsd4_callback cl_cb_null;
240 struct nfsd4_session *cl_cb_session;
241
242 /* for all client information that callback code might need: */
243 spinlock_t cl_lock;
229 244
230 /* for nfs41 */ 245 /* for nfs41 */
231 struct list_head cl_sessions; 246 struct list_head cl_sessions;
232 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ 247 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */
233 u32 cl_exchange_flags; 248 u32 cl_exchange_flags;
234 struct nfs4_sessionid cl_sessionid;
235 /* number of rpc's in progress over an associated session: */ 249 /* number of rpc's in progress over an associated session: */
236 atomic_t cl_refcount; 250 atomic_t cl_refcount;
237 251
238 /* for nfs41 callbacks */ 252 /* for nfs41 callbacks */
239 /* We currently support a single back channel with a single slot */ 253 /* We currently support a single back channel with a single slot */
240 unsigned long cl_cb_slot_busy; 254 unsigned long cl_cb_slot_busy;
241 u32 cl_cb_seq_nr;
242 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ 255 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
243 /* wait here for slots */ 256 /* wait here for slots */
244}; 257};
@@ -440,12 +453,13 @@ extern int nfs4_in_grace(void);
440extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 453extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
441extern void nfs4_free_stateowner(struct kref *kref); 454extern void nfs4_free_stateowner(struct kref *kref);
442extern int set_callback_cred(void); 455extern int set_callback_cred(void);
443extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
 456extern void nfsd4_probe_callback(struct nfs4_client *clp);
457extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
444extern void nfsd4_do_callback_rpc(struct work_struct *); 458extern void nfsd4_do_callback_rpc(struct work_struct *);
445extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 459extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
446extern int nfsd4_create_callback_queue(void); 460extern int nfsd4_create_callback_queue(void);
447extern void nfsd4_destroy_callback_queue(void); 461extern void nfsd4_destroy_callback_queue(void);
448extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
 462extern void nfsd4_shutdown_callback(struct nfs4_client *);
449extern void nfs4_put_delegation(struct nfs4_delegation *dp); 463extern void nfs4_put_delegation(struct nfs4_delegation *dp);
450extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); 464extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
451extern void nfsd4_init_recdir(char *recdir_name); 465extern void nfsd4_init_recdir(char *recdir_name);
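The reshaped struct nfsd4_callback in the state.h hunk carries everything nfsd4_do_callback_rpc() needs (the rpc_message, the rpc_call_ops and the client), so one workqueue function can fire any callback type. A sketch of how a callback might be armed and queued under that layout (the helper name and the rpc_argp wiring are illustrative, not taken from the patch):

static void queue_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
		     void *op, const struct rpc_call_ops *ops)
{
	cb->cb_op = op;				/* type-specific payload */
	cb->cb_clp = clp;
	cb->cb_minorversion = clp->cl_minorversion;
	cb->cb_msg.rpc_argp = cb;		/* XDR code recovers cb_op */
	cb->cb_ops = ops;
	INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc);
	schedule_work(&cb->cb_work);		/* runs outside state lock */
}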
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 661a6cf8e826..184938fcff04 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -281,23 +281,13 @@ commit_metadata(struct svc_fh *fhp)
281{ 281{
282 struct inode *inode = fhp->fh_dentry->d_inode; 282 struct inode *inode = fhp->fh_dentry->d_inode;
283 const struct export_operations *export_ops = inode->i_sb->s_export_op; 283 const struct export_operations *export_ops = inode->i_sb->s_export_op;
284 int error = 0;
285 284
286 if (!EX_ISSYNC(fhp->fh_export)) 285 if (!EX_ISSYNC(fhp->fh_export))
287 return 0; 286 return 0;
288 287
289 if (export_ops->commit_metadata) {
290 error = export_ops->commit_metadata(inode);
291 } else {
292 struct writeback_control wbc = {
293 .sync_mode = WB_SYNC_ALL,
294 .nr_to_write = 0, /* metadata only */
295 };
296
297 error = sync_inode(inode, &wbc);
298 }
299
300 return error;
 288 if (export_ops->commit_metadata)
 289 return export_ops->commit_metadata(inode);
 290 return sync_inode_metadata(inode, 1);
301} 291}
302 292
303/* 293/*
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
index df3e62c1ddc5..85c98737a146 100644
--- a/fs/nilfs2/Makefile
+++ b/fs/nilfs2/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o
2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ 2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \ 3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \
4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ 4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
5 ifile.o alloc.o gcinode.o ioctl.o gcdat.o 5 ifile.o alloc.o gcinode.o ioctl.o
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 3dbdc1d356bf..8b782b062baa 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -533,18 +533,20 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
533 nilfs_btree_init_gc(bmap); 533 nilfs_btree_init_gc(bmap);
534} 534}
535 535
536void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
537{
538 memcpy(gcbmap, bmap, sizeof(*bmap));
539 init_rwsem(&gcbmap->b_sem);
540 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
541 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
542}
543
544void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
545{
546 memcpy(bmap, gcbmap, sizeof(*bmap));
547 init_rwsem(&bmap->b_sem);
548 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
549 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
550}
 536void nilfs_bmap_save(const struct nilfs_bmap *bmap,
 537 struct nilfs_bmap_store *store)
 538{
 539 memcpy(store->data, bmap->b_u.u_data, sizeof(store->data));
 540 store->last_allocated_key = bmap->b_last_allocated_key;
 541 store->last_allocated_ptr = bmap->b_last_allocated_ptr;
 542 store->state = bmap->b_state;
 543}
 544
 545void nilfs_bmap_restore(struct nilfs_bmap *bmap,
 546 const struct nilfs_bmap_store *store)
 547{
 548 memcpy(bmap->b_u.u_data, store->data, sizeof(store->data));
 549 bmap->b_last_allocated_key = store->last_allocated_key;
 550 bmap->b_last_allocated_ptr = store->last_allocated_ptr;
 551 bmap->b_state = store->state;
 552}
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index a20569b19929..bde1c0aa2e15 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -135,6 +135,12 @@ struct nilfs_bmap {
135/* state */ 135/* state */
136#define NILFS_BMAP_DIRTY 0x00000001 136#define NILFS_BMAP_DIRTY 0x00000001
137 137
138struct nilfs_bmap_store {
139 __le64 data[NILFS_BMAP_SIZE / sizeof(__le64)];
140 __u64 last_allocated_key;
141 __u64 last_allocated_ptr;
142 int state;
143};
138 144
139int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); 145int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
140int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); 146int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
@@ -153,9 +159,9 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
153int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int); 159int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
154 160
155void nilfs_bmap_init_gc(struct nilfs_bmap *); 161void nilfs_bmap_init_gc(struct nilfs_bmap *);
156void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
157void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
158 162
163void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *);
164void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *);
159 165
160static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, 166static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
161 __u64 *ptr) 167 __u64 *ptr)
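nilfs_bmap_save()/nilfs_bmap_restore() replace gcdat's approach of cloning a whole shadow DAT inode: instead of a second inode, only the small mutable part of the bmap (root data, last-allocated hints, state) is snapshotted into a fixed-size nilfs_bmap_store. Expected usage around GC, as a sketch (do_gc_pass() is a stand-in for the real caller):

int gc_with_rollback(struct nilfs_bmap *bmap)
{
	struct nilfs_bmap_store saved;
	int err;

	nilfs_bmap_save(bmap, &saved);		/* cheap, fixed-size copy */
	err = do_gc_pass(bmap);			/* may dirty the bmap */
	if (err)
		nilfs_bmap_restore(bmap, &saved); /* roll back on failure */
	return err;
}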
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index f78ab1044d1d..5115814cb745 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -37,15 +37,7 @@
37 37
38void nilfs_btnode_cache_init_once(struct address_space *btnc) 38void nilfs_btnode_cache_init_once(struct address_space *btnc)
39{ 39{
40 memset(btnc, 0, sizeof(*btnc));
41 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
42 spin_lock_init(&btnc->tree_lock);
43 INIT_LIST_HEAD(&btnc->private_list);
44 spin_lock_init(&btnc->private_lock);
45
46 spin_lock_init(&btnc->i_mmap_lock);
47 INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
48 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
49}
 40 nilfs_mapping_init_once(btnc);
 41}
50 42
51static const struct address_space_operations def_btnode_aops = { 43static const struct address_space_operations def_btnode_aops = {
@@ -55,12 +47,7 @@ static const struct address_space_operations def_btnode_aops = {
55void nilfs_btnode_cache_init(struct address_space *btnc, 47void nilfs_btnode_cache_init(struct address_space *btnc,
56 struct backing_dev_info *bdi) 48 struct backing_dev_info *bdi)
57{ 49{
58 btnc->host = NULL; /* can safely set to host inode ? */
59 btnc->flags = 0;
60 mapping_set_gfp_mask(btnc, GFP_NOFS);
61 btnc->assoc_mapping = NULL;
62 btnc->backing_dev_info = bdi;
63 btnc->a_ops = &def_btnode_aops;
 50 nilfs_mapping_init(btnc, bdi, &def_btnode_aops);
64} 51}
65 52
66void nilfs_btnode_cache_clear(struct address_space *btnc) 53void nilfs_btnode_cache_clear(struct address_space *btnc)
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 18737818db63..5ff15a8a1024 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -863,26 +863,19 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
863 */ 863 */
864int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) 864int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
865{ 865{
866 struct the_nilfs *nilfs;
867 int ret; 866 int ret;
868 867
869 nilfs = NILFS_MDT(cpfile)->mi_nilfs;
870
871 switch (mode) { 868 switch (mode) {
872 case NILFS_CHECKPOINT: 869 case NILFS_CHECKPOINT:
873 /*
874 * Check for protecting existing snapshot mounts:
875 * ns_mount_mutex is used to make this operation atomic and
876 * exclusive with a new mount job. Though it doesn't cover
877 * umount, it's enough for the purpose.
878 */
879 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
880 /* Current implementation does not have to protect
881 plain read-only mounts since they are exclusive
882 with a read/write mount and are protected from the
883 cleaner. */
884 ret = -EBUSY;
885 } else
 870 if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno))
 871 /*
 872 * Current implementation does not have to protect
 873 * plain read-only mounts since they are exclusive
 874 * with a read/write mount and are protected from the
 875 * cleaner.
 876 */
 877 ret = -EBUSY;
 878 else
886 ret = nilfs_cpfile_clear_snapshot(cpfile, cno); 879 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
887 return ret; 880 return ret;
888 case NILFS_SNAPSHOT: 881 case NILFS_SNAPSHOT:
@@ -933,27 +926,40 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
933} 926}
934 927
935/** 928/**
936 * nilfs_cpfile_read - read cpfile inode
937 * @cpfile: cpfile inode
938 * @raw_inode: on-disk cpfile inode
939 */
940int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
941{
942 return nilfs_read_inode_common(cpfile, raw_inode);
943}
944
945/**
946 * nilfs_cpfile_new - create cpfile
947 * @nilfs: nilfs object
948 * @cpsize: size of a checkpoint entry
949 */
950struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize)
951{
952 struct inode *cpfile;
953
954 cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0);
955 if (cpfile)
956 nilfs_mdt_set_entry_size(cpfile, cpsize,
957 sizeof(struct nilfs_cpfile_header));
958 return cpfile;
 929 * nilfs_cpfile_read - read or get cpfile inode
 930 * @sb: super block instance
 931 * @cpsize: size of a checkpoint entry
 932 * @raw_inode: on-disk cpfile inode
 933 * @inodep: buffer to store the inode
 934 */
 935int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
 936 struct nilfs_inode *raw_inode, struct inode **inodep)
 937{
 938 struct inode *cpfile;
 939 int err;
 940
 941 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
 942 if (unlikely(!cpfile))
 943 return -ENOMEM;
 944 if (!(cpfile->i_state & I_NEW))
 945 goto out;
 946
 947 err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0);
 948 if (err)
 949 goto failed;
 950
 951 nilfs_mdt_set_entry_size(cpfile, cpsize,
 952 sizeof(struct nilfs_cpfile_header));
 953
 954 err = nilfs_read_inode_common(cpfile, raw_inode);
 955 if (err)
 956 goto failed;
 957
 958 unlock_new_inode(cpfile);
 959 out:
 960 *inodep = cpfile;
 961 return 0;
 962 failed:
 963 iget_failed(cpfile);
 964 return err;
959} 965}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index bc0809e0ab43..a242b9a314f9 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,7 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
40ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, 40ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
41 size_t); 41 size_t);
42 42
43int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode);
44struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize);
 43int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
 44 struct nilfs_inode *raw_inode, struct inode **inodep);
45 45
46#endif /* _NILFS_CPFILE_H */ 46#endif /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 013146755683..49c844dab33a 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -36,6 +36,7 @@
36struct nilfs_dat_info { 36struct nilfs_dat_info {
37 struct nilfs_mdt_info mi; 37 struct nilfs_mdt_info mi;
38 struct nilfs_palloc_cache palloc_cache; 38 struct nilfs_palloc_cache palloc_cache;
39 struct nilfs_shadow_map shadow;
39}; 40};
40 41
41static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) 42static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
@@ -102,7 +103,8 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
102 nilfs_palloc_abort_alloc_entry(dat, req); 103 nilfs_palloc_abort_alloc_entry(dat, req);
103} 104}
104 105
105void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
 106static void nilfs_dat_commit_free(struct inode *dat,
107 struct nilfs_palloc_req *req)
106{ 108{
107 struct nilfs_dat_entry *entry; 109 struct nilfs_dat_entry *entry;
108 void *kaddr; 110 void *kaddr;
@@ -327,6 +329,23 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
327 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); 329 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
328 if (ret < 0) 330 if (ret < 0)
329 return ret; 331 return ret;
332
333 /*
334 * The given disk block number (blocknr) is not yet written to
335 * the device at this point.
336 *
337 * To prevent nilfs_dat_translate() from returning the
 338 * uncommitted block number, this makes a copy of the entry
339 * buffer and redirects nilfs_dat_translate() to the copy.
340 */
341 if (!buffer_nilfs_redirected(entry_bh)) {
342 ret = nilfs_mdt_freeze_buffer(dat, entry_bh);
343 if (ret) {
344 brelse(entry_bh);
345 return ret;
346 }
347 }
348
330 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); 349 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
331 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); 350 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
332 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { 351 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
@@ -371,7 +390,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
371 */ 390 */
372int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) 391int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
373{ 392{
374 struct buffer_head *entry_bh;
 393 struct buffer_head *entry_bh, *bh;
375 struct nilfs_dat_entry *entry; 394 struct nilfs_dat_entry *entry;
376 sector_t blocknr; 395 sector_t blocknr;
377 void *kaddr; 396 void *kaddr;
@@ -381,6 +400,15 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
381 if (ret < 0) 400 if (ret < 0)
382 return ret; 401 return ret;
383 402
403 if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) {
404 bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh);
405 if (bh) {
406 WARN_ON(!buffer_uptodate(bh));
407 brelse(entry_bh);
408 entry_bh = bh;
409 }
410 }
411
384 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); 412 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
385 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); 413 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
386 blocknr = le64_to_cpu(entry->de_blocknr); 414 blocknr = le64_to_cpu(entry->de_blocknr);
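The two dat.c hunks above implement a copy-on-write of the DAT entry block: nilfs_dat_move() freezes the entry buffer before storing the not-yet-committed block number, and nilfs_dat_translate() serves non-GC readers from the frozen copy. The reader side reduces to a helper like this (the helper name is illustrative; the logic mirrors the hunk):

/* Pick the buffer a reader should see: GC wants the live entry, every
 * other caller must get the last committed state. */
static struct buffer_head *dat_entry_buffer(struct inode *dat,
					    struct buffer_head *bh)
{
	struct buffer_head *frozen;

	if (nilfs_doing_gc() || !buffer_nilfs_redirected(bh))
		return bh;
	frozen = nilfs_mdt_get_frozen_buffer(dat, bh);
	if (!frozen)
		return bh;			/* no copy yet: fall back */
	WARN_ON(!buffer_uptodate(frozen));
	brelse(bh);
	return frozen;
}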
@@ -436,38 +464,48 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
436} 464}
437 465
438/** 466/**
439 * nilfs_dat_read - read dat inode
440 * @dat: dat inode
441 * @raw_inode: on-disk dat inode
442 */
443int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
444{
445 return nilfs_read_inode_common(dat, raw_inode);
446}
447
448/**
449 * nilfs_dat_new - create dat file
450 * @nilfs: nilfs object
451 * @entry_size: size of a dat entry
452 */
453struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size)
454{
455 static struct lock_class_key dat_lock_key;
456 struct inode *dat;
457 struct nilfs_dat_info *di;
458 int err;
459
460 dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di));
461 if (dat) {
462 err = nilfs_palloc_init_blockgroup(dat, entry_size);
463 if (unlikely(err)) {
464 nilfs_mdt_destroy(dat);
465 return NULL;
466 }
467
468 di = NILFS_DAT_I(dat);
469 lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
470 nilfs_palloc_setup_cache(dat, &di->palloc_cache);
471 }
472 return dat;
 467 * nilfs_dat_read - read or get dat inode
 468 * @sb: super block instance
 469 * @entry_size: size of a dat entry
 470 * @raw_inode: on-disk dat inode
 471 * @inodep: buffer to store the inode
 472 */
 473int nilfs_dat_read(struct super_block *sb, size_t entry_size,
 474 struct nilfs_inode *raw_inode, struct inode **inodep)
 475{
 476 static struct lock_class_key dat_lock_key;
 477 struct inode *dat;
 478 struct nilfs_dat_info *di;
 479 int err;
 480
 481 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
 482 if (unlikely(!dat))
 483 return -ENOMEM;
 484 if (!(dat->i_state & I_NEW))
 485 goto out;
 486
 487 err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di));
 488 if (err)
 489 goto failed;
 490
 491 err = nilfs_palloc_init_blockgroup(dat, entry_size);
 492 if (err)
 493 goto failed;
 494
 495 di = NILFS_DAT_I(dat);
 496 lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
 497 nilfs_palloc_setup_cache(dat, &di->palloc_cache);
 498 nilfs_mdt_setup_shadow_map(dat, &di->shadow);
 499
 500 err = nilfs_read_inode_common(dat, raw_inode);
 501 if (err)
 502 goto failed;
 503
 504 unlock_new_inode(dat);
 505 out:
 506 *inodep = dat;
 507 return 0;
 508 failed:
 509 iget_failed(dat);
 510 return err;
473} 511}
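
With this change the DAT is obtained through the inode cache rather than allocated anew: nilfs_dat_read() either finds the cached inode or initializes a fresh one and reads the raw inode into it. A minimal caller sketch; the helper name is hypothetical and "dat_entry_size" would come from the on-disk super block, which is an assumption here:

	/* sketch, not part of the patch */
	static int example_load_dat(struct super_block *sb, size_t dat_entry_size,
				    struct nilfs_inode *raw_inode, struct inode **datp)
	{
		int err = nilfs_dat_read(sb, dat_entry_size, raw_inode, datp);

		if (err)
			return err;	/* iget_failed() was already called inside */
		/*
		 * A repeated call with the same super block returns the cached
		 * inode (I_NEW clear) without re-running the initialization.
		 * The reference is dropped later with iput(), not with
		 * nilfs_mdt_destroy(), which this series removes.
		 */
		return 0;
	}
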
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d31c3aab0efe..cbd8e9732503 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,7 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t);
53int nilfs_dat_move(struct inode *, __u64, sector_t); 53int nilfs_dat_move(struct inode *, __u64, sector_t);
54ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); 54ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
55 55
56int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode); 56int nilfs_dat_read(struct super_block *sb, size_t entry_size,
57struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size); 57 struct nilfs_inode *raw_inode, struct inode **inodep);
58 58
59#endif /* _NILFS_DAT_H */ 59#endif /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h
new file mode 100644
index 000000000000..a71cc412b651
--- /dev/null
+++ b/fs/nilfs2/export.h
@@ -0,0 +1,17 @@
1#ifndef NILFS_EXPORT_H
2#define NILFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations nilfs_export_ops;
7
8struct nilfs_fid {
9 u64 cno;
10 u64 ino;
11 u32 gen;
12
13 u32 parent_gen;
14 u64 parent_ino;
15} __attribute__ ((packed));
16
17#endif
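
The exportfs layer measures file handle lengths in 32-bit words, which is why namei.c (further below) derives the handle sizes from this packed layout with a division by 4. A quick consistency sketch, using C11 _Static_assert purely for illustration:

	#include <stddef.h>

	/* cno(8) + ino(8) + gen(4) = 20 bytes -> 5 words without the parent;
	 * + parent_gen(4) + parent_ino(8) = 32 bytes -> 8 words with it. */
	_Static_assert(offsetof(struct nilfs_fid, parent_gen) / 4 == 5,
		       "non-connectable fid is 5 words");
	_Static_assert(sizeof(struct nilfs_fid) / 4 == 8,
		       "connectable fid is 8 words");
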
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
deleted file mode 100644
index 84a45d1d5464..000000000000
--- a/fs/nilfs2/gcdat.c
+++ /dev/null
@@ -1,87 +0,0 @@
1/*
2 * gcdat.c - NILFS shadow DAT inode for GC
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/buffer_head.h>
26#include "nilfs.h"
27#include "page.h"
28#include "mdt.h"
29
30int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
31{
32 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
33 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
34 int err;
35
36 gcdat->i_state = 0;
37 gcdat->i_blocks = dat->i_blocks;
38 gii->i_flags = dii->i_flags;
39 gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
40 gii->i_cno = 0;
41 nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
42 err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
43 if (unlikely(err))
44 return err;
45
46 return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
47 &dii->i_btnode_cache);
48}
49
50void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
51{
52 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
53 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
54 struct address_space *mapping = dat->i_mapping;
55 struct address_space *gmapping = gcdat->i_mapping;
56
57 down_write(&NILFS_MDT(dat)->mi_sem);
58 dat->i_blocks = gcdat->i_blocks;
59 dii->i_flags = gii->i_flags;
60 dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
61
62 nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
63
64 nilfs_palloc_clear_cache(dat);
65 nilfs_palloc_clear_cache(gcdat);
66 nilfs_clear_dirty_pages(mapping);
67 nilfs_copy_back_pages(mapping, gmapping);
68 /* note: mdt dirty flags should be cleared by segctor. */
69
70 nilfs_clear_dirty_pages(&dii->i_btnode_cache);
71 nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
72
73 up_write(&NILFS_MDT(dat)->mi_sem);
74}
75
76void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
77{
78 struct inode *gcdat = nilfs->ns_gc_dat;
79 struct nilfs_inode_info *gii = NILFS_I(gcdat);
80
81 gcdat->i_state = I_FREEING | I_CLEAR;
82 gii->i_flags = 0;
83
84 nilfs_palloc_clear_cache(gcdat);
85 truncate_inode_pages(gcdat->i_mapping, 0);
86 truncate_inode_pages(&gii->i_btnode_cache, 0);
87}
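
The whole-file gcdat copy deleted above is superseded by the per-metadata-file shadow map introduced in mdt.c further below: instead of keeping a second DAT inode alive across mounts, the DAT's dirty pages, btnode cache, and bmap state are snapshotted into struct nilfs_shadow_map and rolled back on demand. A rough sketch of the replacement flow; the actual call sites are outside these hunks, so the placement and the rollback condition here are assumptions:

	nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);	/* roughly nilfs_init_gcdat_inode() */
	/* ... garbage collection / segment construction ... */
	if (gc_failed)					/* assumed trigger */
		nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat);
	nilfs_mdt_clear_shadow_map(nilfs->ns_dat);	/* roughly nilfs_clear_gcdat_inode() */
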
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index bed3a783129b..33ad25ddd5c4 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,13 +28,6 @@
 28 * gcinodes), and this file provides the lookup function for the 28 * gcinodes), and this file provides the lookup function for the
 29 * dummy inodes and their buffer read function. 29 * dummy inodes and their buffer read function.
30 * 30 *
31 * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
32 * has to treat blocks that belong to a same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) is designed to be specified with a
36 * checkpoint number argument as well as an inode number.
37 *
38 * Buffers and pages held by the dummy inodes will be released each 31 * Buffers and pages held by the dummy inodes will be released each
39 * time after they are copied to a new log. Dirty blocks made on the 32 * time after they are copied to a new log. Dirty blocks made on the
40 * current generation and the blocks to be moved by GC never overlap 33 * current generation and the blocks to be moved by GC never overlap
@@ -175,125 +168,46 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
175 } 168 }
176 nilfs_btnode_mark_dirty(bh); 169 nilfs_btnode_mark_dirty(bh);
177 } else { 170 } else {
178 nilfs_mdt_mark_buffer_dirty(bh); 171 nilfs_mark_buffer_dirty(bh);
179 } 172 }
180 return 0; 173 return 0;
181} 174}
182 175
183/* 176int nilfs_init_gcinode(struct inode *inode)
184 * nilfs_init_gccache() - allocate and initialize gc_inode hash table
185 * @nilfs - the_nilfs
186 *
187 * Return Value: On success, 0.
188 * On error, a negative error code is returned.
189 */
190int nilfs_init_gccache(struct the_nilfs *nilfs)
191{ 177{
192 int loop; 178 struct nilfs_inode_info *ii = NILFS_I(inode);
193 179 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
194 BUG_ON(nilfs->ns_gc_inodes_h);
195
196 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
197
198 nilfs->ns_gc_inodes_h =
199 kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
200 GFP_NOFS);
201 if (nilfs->ns_gc_inodes_h == NULL)
202 return -ENOMEM;
203
204 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
205 INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
206 return 0;
207}
208
209/*
210 * nilfs_destroy_gccache() - free gc_inode hash table
211 * @nilfs - the nilfs
212 */
213void nilfs_destroy_gccache(struct the_nilfs *nilfs)
214{
215 if (nilfs->ns_gc_inodes_h) {
216 nilfs_remove_all_gcinode(nilfs);
217 kfree(nilfs->ns_gc_inodes_h);
218 nilfs->ns_gc_inodes_h = NULL;
219 }
220}
221
222static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
223 __u64 cno)
224{
225 struct inode *inode;
226 struct nilfs_inode_info *ii;
227
228 inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
229 if (!inode)
230 return NULL;
231 180
232 inode->i_op = NULL; 181 inode->i_mode = S_IFREG;
233 inode->i_fop = NULL; 182 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
234 inode->i_mapping->a_ops = &def_gcinode_aops; 183 inode->i_mapping->a_ops = &def_gcinode_aops;
184 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
235 185
236 ii = NILFS_I(inode);
237 ii->i_cno = cno;
238 ii->i_flags = 0; 186 ii->i_flags = 0;
239 ii->i_state = 1 << NILFS_I_GCINODE;
240 ii->i_bh = NULL;
241 nilfs_bmap_init_gc(ii->i_bmap); 187 nilfs_bmap_init_gc(ii->i_bmap);
242 188
243 return inode; 189 /*
 244} 190 * Add the inode to the GC inode list. Garbage Collection
245 191 * is serialized and no two processes manipulate the
246static unsigned long ihash(ino_t ino, __u64 cno) 192 * list simultaneously.
247{ 193 */
248 return hash_long((unsigned long)((ino << 2) + cno), 194 igrab(inode);
249 NILFS_GCINODE_HASH_BITS); 195 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
250}
251
252/*
253 * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
254 */
255struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
256{
257 struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
258 struct hlist_node *node;
259 struct inode *inode;
260
261 hlist_for_each_entry(inode, node, head, i_hash) {
262 if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
263 return inode;
264 }
265 196
266 inode = alloc_gcinode(nilfs, ino, cno); 197 return 0;
267 if (likely(inode)) {
268 hlist_add_head(&inode->i_hash, head);
269 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
270 }
271 return inode;
272}
273
274/*
275 * nilfs_clear_gcinode() - clear and free a gc inode
276 */
277void nilfs_clear_gcinode(struct inode *inode)
278{
279 nilfs_mdt_destroy(inode);
280} 198}
281 199
282/* 200/**
283 * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs 201 * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes
284 */ 202 */
285void nilfs_remove_all_gcinode(struct the_nilfs *nilfs) 203void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
286{ 204{
287 struct hlist_head *head = nilfs->ns_gc_inodes_h; 205 struct list_head *head = &nilfs->ns_gc_inodes;
288 struct hlist_node *node, *n; 206 struct nilfs_inode_info *ii;
289 struct inode *inode;
290 int loop;
291 207
292 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) { 208 while (!list_empty(head)) {
293 hlist_for_each_entry_safe(inode, node, n, head, i_hash) { 209 ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
294 hlist_del_init(&inode->i_hash); 210 list_del_init(&ii->i_dirty);
295 list_del_init(&NILFS_I(inode)->i_dirty); 211 iput(&ii->vfs_inode);
296 nilfs_clear_gcinode(inode); /* might sleep */
297 }
298 } 212 }
299} 213}
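
GC inodes now live in the regular inode cache instead of a private hash table; nilfs_init_gcinode() takes a second reference via igrab() for the ns_gc_inodes list, so the lookup reference can be dropped independently. Lifecycle sketch using the names from the hunks above (the wrapper function is hypothetical):

	static int example_gc_pass(struct super_block *sb, struct the_nilfs *nilfs,
				   ino_t ino, __u64 cno)
	{
		struct inode *inode;

		inode = nilfs_iget_for_gc(sb, ino, cno); /* iget5_locked + nilfs_init_gcinode */
		if (IS_ERR(inode))
			return PTR_ERR(inode);

		/* ... stage the blocks to be moved ... */

		iput(inode);	/* drops the lookup ref; the igrab() ref keeps
				   the inode alive on ns_gc_inodes */

		nilfs_remove_all_gcinodes(nilfs); /* later: iput()s the list refs */
		return 0;
	}
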
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 922d9dd42c8f..9f8a2da67f90 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -161,25 +161,46 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
161} 161}
162 162
163/** 163/**
164 * nilfs_ifile_new - create inode file 164 * nilfs_ifile_read - read or get ifile inode
165 * @sbi: nilfs_sb_info struct 165 * @sb: super block instance
166 * @root: root object
166 * @inode_size: size of an inode 167 * @inode_size: size of an inode
168 * @raw_inode: on-disk ifile inode
169 * @inodep: buffer to store the inode
167 */ 170 */
168struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size) 171int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
172 size_t inode_size, struct nilfs_inode *raw_inode,
173 struct inode **inodep)
169{ 174{
170 struct inode *ifile; 175 struct inode *ifile;
171 int err; 176 int err;
172 177
173 ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO, 178 ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO);
174 sizeof(struct nilfs_ifile_info)); 179 if (unlikely(!ifile))
175 if (ifile) { 180 return -ENOMEM;
176 err = nilfs_palloc_init_blockgroup(ifile, inode_size); 181 if (!(ifile->i_state & I_NEW))
177 if (unlikely(err)) { 182 goto out;
178 nilfs_mdt_destroy(ifile); 183
179 return NULL; 184 err = nilfs_mdt_init(ifile, NILFS_MDT_GFP,
180 } 185 sizeof(struct nilfs_ifile_info));
181 nilfs_palloc_setup_cache(ifile, 186 if (err)
182 &NILFS_IFILE_I(ifile)->palloc_cache); 187 goto failed;
183 } 188
184 return ifile; 189 err = nilfs_palloc_init_blockgroup(ifile, inode_size);
190 if (err)
191 goto failed;
192
193 nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
194
195 err = nilfs_read_inode_common(ifile, raw_inode);
196 if (err)
197 goto failed;
198
199 unlock_new_inode(ifile);
200 out:
201 *inodep = ifile;
202 return 0;
203 failed:
204 iget_failed(ifile);
205 return err;
185} 206}
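
As with the DAT, the ifile is now read through the inode cache, and each checkpoint root carries its own instance. Caller sketch; where raw_inode is mapped from is outside these hunks, so that part is an assumption:

	struct inode *ifile;
	int err;

	err = nilfs_ifile_read(sb, root, inode_size, raw_inode, &ifile);
	if (err)
		return err;	/* the half-built inode was disposed of via iget_failed() */
	root->ifile = ifile;	/* per-root ifile, replacing the old sbi->s_ifile */
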
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index cbca32e498f2..59b6f2b51df6 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
49int nilfs_ifile_delete_inode(struct inode *, ino_t); 49int nilfs_ifile_delete_inode(struct inode *, ino_t);
50int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); 50int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
51 51
52struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size); 52int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
53 size_t inode_size, struct nilfs_inode *raw_inode,
54 struct inode **inodep);
53 55
54#endif /* _NILFS_IFILE_H */ 56#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index eccb2f2e2315..71d4bc8464e0 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -34,6 +34,12 @@
34#include "cpfile.h" 34#include "cpfile.h"
35#include "ifile.h" 35#include "ifile.h"
36 36
37struct nilfs_iget_args {
38 u64 ino;
39 __u64 cno;
40 struct nilfs_root *root;
41 int for_gc;
42};
37 43
38/** 44/**
39 * nilfs_get_block() - get a file block on the filesystem (callback function) 45 * nilfs_get_block() - get a file block on the filesystem (callback function)
@@ -279,6 +285,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
279 struct nilfs_sb_info *sbi = NILFS_SB(sb); 285 struct nilfs_sb_info *sbi = NILFS_SB(sb);
280 struct inode *inode; 286 struct inode *inode;
281 struct nilfs_inode_info *ii; 287 struct nilfs_inode_info *ii;
288 struct nilfs_root *root;
282 int err = -ENOMEM; 289 int err = -ENOMEM;
283 ino_t ino; 290 ino_t ino;
284 291
@@ -289,15 +296,17 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
289 mapping_set_gfp_mask(inode->i_mapping, 296 mapping_set_gfp_mask(inode->i_mapping,
290 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 297 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
291 298
299 root = NILFS_I(dir)->i_root;
292 ii = NILFS_I(inode); 300 ii = NILFS_I(inode);
293 ii->i_state = 1 << NILFS_I_NEW; 301 ii->i_state = 1 << NILFS_I_NEW;
302 ii->i_root = root;
294 303
295 err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh); 304 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
296 if (unlikely(err)) 305 if (unlikely(err))
297 goto failed_ifile_create_inode; 306 goto failed_ifile_create_inode;
298 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 307 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
299 308
300 atomic_inc(&sbi->s_inodes_count); 309 atomic_inc(&root->inodes_count);
301 inode_init_owner(inode, dir, mode); 310 inode_init_owner(inode, dir, mode);
302 inode->i_ino = ino; 311 inode->i_ino = ino;
303 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 312 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -320,7 +329,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
320 /* ii->i_file_acl = 0; */ 329 /* ii->i_file_acl = 0; */
321 /* ii->i_dir_acl = 0; */ 330 /* ii->i_dir_acl = 0; */
322 ii->i_dir_start_lookup = 0; 331 ii->i_dir_start_lookup = 0;
323 ii->i_cno = 0;
324 nilfs_set_inode_flags(inode); 332 nilfs_set_inode_flags(inode);
325 spin_lock(&sbi->s_next_gen_lock); 333 spin_lock(&sbi->s_next_gen_lock);
326 inode->i_generation = sbi->s_next_generation++; 334 inode->i_generation = sbi->s_next_generation++;
@@ -350,16 +358,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
350 return ERR_PTR(err); 358 return ERR_PTR(err);
351} 359}
352 360
353void nilfs_free_inode(struct inode *inode)
354{
355 struct super_block *sb = inode->i_sb;
356 struct nilfs_sb_info *sbi = NILFS_SB(sb);
357
358 /* XXX: check error code? Is there any thing I can do? */
359 (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
360 atomic_dec(&sbi->s_inodes_count);
361}
362
363void nilfs_set_inode_flags(struct inode *inode) 361void nilfs_set_inode_flags(struct inode *inode)
364{ 362{
365 unsigned int flags = NILFS_I(inode)->i_flags; 363 unsigned int flags = NILFS_I(inode)->i_flags;
@@ -410,7 +408,6 @@ int nilfs_read_inode_common(struct inode *inode,
410 0 : le32_to_cpu(raw_inode->i_dir_acl); 408 0 : le32_to_cpu(raw_inode->i_dir_acl);
411#endif 409#endif
412 ii->i_dir_start_lookup = 0; 410 ii->i_dir_start_lookup = 0;
413 ii->i_cno = 0;
414 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 411 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
415 412
416 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 413 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -424,7 +421,8 @@ int nilfs_read_inode_common(struct inode *inode,
424 return 0; 421 return 0;
425} 422}
426 423
427static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, 424static int __nilfs_read_inode(struct super_block *sb,
425 struct nilfs_root *root, unsigned long ino,
428 struct inode *inode) 426 struct inode *inode)
429{ 427{
430 struct nilfs_sb_info *sbi = NILFS_SB(sb); 428 struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -434,11 +432,11 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
434 int err; 432 int err;
435 433
436 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 434 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
437 err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh); 435 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
438 if (unlikely(err)) 436 if (unlikely(err))
439 goto bad_inode; 437 goto bad_inode;
440 438
441 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); 439 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
442 440
443 err = nilfs_read_inode_common(inode, raw_inode); 441 err = nilfs_read_inode_common(inode, raw_inode);
444 if (err) 442 if (err)
@@ -461,14 +459,14 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
461 inode, inode->i_mode, 459 inode, inode->i_mode,
462 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); 460 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
463 } 461 }
464 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 462 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
465 brelse(bh); 463 brelse(bh);
466 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ 464 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
467 nilfs_set_inode_flags(inode); 465 nilfs_set_inode_flags(inode);
468 return 0; 466 return 0;
469 467
470 failed_unmap: 468 failed_unmap:
471 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); 469 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
472 brelse(bh); 470 brelse(bh);
473 471
474 bad_inode: 472 bad_inode:
@@ -476,18 +474,95 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
476 return err; 474 return err;
477} 475}
478 476
479struct inode *nilfs_iget(struct super_block *sb, unsigned long ino) 477static int nilfs_iget_test(struct inode *inode, void *opaque)
478{
479 struct nilfs_iget_args *args = opaque;
480 struct nilfs_inode_info *ii;
481
482 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
483 return 0;
484
485 ii = NILFS_I(inode);
486 if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
487 return !args->for_gc;
488
489 return args->for_gc && args->cno == ii->i_cno;
490}
491
492static int nilfs_iget_set(struct inode *inode, void *opaque)
493{
494 struct nilfs_iget_args *args = opaque;
495
496 inode->i_ino = args->ino;
497 if (args->for_gc) {
498 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
499 NILFS_I(inode)->i_cno = args->cno;
500 NILFS_I(inode)->i_root = NULL;
501 } else {
502 if (args->root && args->ino == NILFS_ROOT_INO)
503 nilfs_get_root(args->root);
504 NILFS_I(inode)->i_root = args->root;
505 }
506 return 0;
507}
508
509struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
510 unsigned long ino)
511{
512 struct nilfs_iget_args args = {
513 .ino = ino, .root = root, .cno = 0, .for_gc = 0
514 };
515
516 return ilookup5(sb, ino, nilfs_iget_test, &args);
517}
518
519struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
520 unsigned long ino)
521{
522 struct nilfs_iget_args args = {
523 .ino = ino, .root = root, .cno = 0, .for_gc = 0
524 };
525
526 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
527}
528
529struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
530 unsigned long ino)
480{ 531{
481 struct inode *inode; 532 struct inode *inode;
482 int err; 533 int err;
483 534
484 inode = iget_locked(sb, ino); 535 inode = nilfs_iget_locked(sb, root, ino);
485 if (unlikely(!inode)) 536 if (unlikely(!inode))
486 return ERR_PTR(-ENOMEM); 537 return ERR_PTR(-ENOMEM);
487 if (!(inode->i_state & I_NEW)) 538 if (!(inode->i_state & I_NEW))
488 return inode; 539 return inode;
489 540
490 err = __nilfs_read_inode(sb, ino, inode); 541 err = __nilfs_read_inode(sb, root, ino, inode);
542 if (unlikely(err)) {
543 iget_failed(inode);
544 return ERR_PTR(err);
545 }
546 unlock_new_inode(inode);
547 return inode;
548}
549
550struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
551 __u64 cno)
552{
553 struct nilfs_iget_args args = {
554 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
555 };
556 struct inode *inode;
557 int err;
558
559 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
560 if (unlikely(!inode))
561 return ERR_PTR(-ENOMEM);
562 if (!(inode->i_state & I_NEW))
563 return inode;
564
565 err = nilfs_init_gcinode(inode);
491 if (unlikely(err)) { 566 if (unlikely(err)) {
492 iget_failed(inode); 567 iget_failed(inode);
493 return ERR_PTR(err); 568 return ERR_PTR(err);
@@ -528,21 +603,20 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
528{ 603{
529 ino_t ino = inode->i_ino; 604 ino_t ino = inode->i_ino;
530 struct nilfs_inode_info *ii = NILFS_I(inode); 605 struct nilfs_inode_info *ii = NILFS_I(inode);
531 struct super_block *sb = inode->i_sb; 606 struct inode *ifile = ii->i_root->ifile;
532 struct nilfs_sb_info *sbi = NILFS_SB(sb);
533 struct nilfs_inode *raw_inode; 607 struct nilfs_inode *raw_inode;
534 608
535 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh); 609 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
536 610
537 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) 611 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
538 memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size); 612 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
539 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); 613 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
540 614
541 nilfs_write_inode_common(inode, raw_inode, 0); 615 nilfs_write_inode_common(inode, raw_inode, 0);
 542 /* XXX: calling with has_bmap = 0 is a workaround to avoid 616 /* XXX: calling with has_bmap = 0 is a workaround to avoid
 543 a bmap deadlock. This delays the update of i_bmap until 617 a bmap deadlock. This delays the update of i_bmap until
 544 just before writing */ 618 just before writing */
545 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh); 619 nilfs_ifile_unmap_inode(ifile, ino, ibh);
546} 620}
547 621
548#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ 622#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
@@ -617,6 +691,7 @@ void nilfs_truncate(struct inode *inode)
617static void nilfs_clear_inode(struct inode *inode) 691static void nilfs_clear_inode(struct inode *inode)
618{ 692{
619 struct nilfs_inode_info *ii = NILFS_I(inode); 693 struct nilfs_inode_info *ii = NILFS_I(inode);
694 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
620 695
621 /* 696 /*
 622 * Free resources allocated in nilfs_read_inode() here. 697 * Free resources allocated in nilfs_read_inode() here.
@@ -625,10 +700,16 @@ static void nilfs_clear_inode(struct inode *inode)
625 brelse(ii->i_bh); 700 brelse(ii->i_bh);
626 ii->i_bh = NULL; 701 ii->i_bh = NULL;
627 702
703 if (mdi && mdi->mi_palloc_cache)
704 nilfs_palloc_destroy_cache(inode);
705
628 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 706 if (test_bit(NILFS_I_BMAP, &ii->i_state))
629 nilfs_bmap_clear(ii->i_bmap); 707 nilfs_bmap_clear(ii->i_bmap);
630 708
631 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 709 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
710
711 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
712 nilfs_put_root(ii->i_root);
632} 713}
633 714
634void nilfs_evict_inode(struct inode *inode) 715void nilfs_evict_inode(struct inode *inode)
@@ -637,7 +718,7 @@ void nilfs_evict_inode(struct inode *inode)
637 struct super_block *sb = inode->i_sb; 718 struct super_block *sb = inode->i_sb;
638 struct nilfs_inode_info *ii = NILFS_I(inode); 719 struct nilfs_inode_info *ii = NILFS_I(inode);
639 720
640 if (inode->i_nlink || unlikely(is_bad_inode(inode))) { 721 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
641 if (inode->i_data.nrpages) 722 if (inode->i_data.nrpages)
642 truncate_inode_pages(&inode->i_data, 0); 723 truncate_inode_pages(&inode->i_data, 0);
643 end_writeback(inode); 724 end_writeback(inode);
@@ -649,12 +730,16 @@ void nilfs_evict_inode(struct inode *inode)
649 if (inode->i_data.nrpages) 730 if (inode->i_data.nrpages)
650 truncate_inode_pages(&inode->i_data, 0); 731 truncate_inode_pages(&inode->i_data, 0);
651 732
733 /* TODO: some of the following operations may fail. */
652 nilfs_truncate_bmap(ii, 0); 734 nilfs_truncate_bmap(ii, 0);
653 nilfs_mark_inode_dirty(inode); 735 nilfs_mark_inode_dirty(inode);
654 end_writeback(inode); 736 end_writeback(inode);
737
738 nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
739 atomic_dec(&ii->i_root->inodes_count);
740
655 nilfs_clear_inode(inode); 741 nilfs_clear_inode(inode);
656 nilfs_free_inode(inode); 742
657 /* nilfs_free_inode() marks inode buffer dirty */
658 if (IS_SYNC(inode)) 743 if (IS_SYNC(inode))
659 nilfs_set_transaction_flag(NILFS_TI_SYNC); 744 nilfs_set_transaction_flag(NILFS_TI_SYNC);
660 nilfs_transaction_commit(sb); 745 nilfs_transaction_commit(sb);
@@ -700,6 +785,17 @@ out_err:
700 return err; 785 return err;
701} 786}
702 787
788int nilfs_permission(struct inode *inode, int mask)
789{
790 struct nilfs_root *root = NILFS_I(inode)->i_root;
791
792 if ((mask & MAY_WRITE) && root &&
793 root->cno != NILFS_CPTREE_CURRENT_CNO)
794 return -EROFS; /* snapshot is not writable */
795
796 return generic_permission(inode, mask, NULL);
797}
798
703int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, 799int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
704 struct buffer_head **pbh) 800 struct buffer_head **pbh)
705{ 801{
@@ -709,8 +805,8 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
709 spin_lock(&sbi->s_inode_lock); 805 spin_lock(&sbi->s_inode_lock);
710 if (ii->i_bh == NULL) { 806 if (ii->i_bh == NULL) {
711 spin_unlock(&sbi->s_inode_lock); 807 spin_unlock(&sbi->s_inode_lock);
712 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, 808 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
713 pbh); 809 inode->i_ino, pbh);
714 if (unlikely(err)) 810 if (unlikely(err))
715 return err; 811 return err;
716 spin_lock(&sbi->s_inode_lock); 812 spin_lock(&sbi->s_inode_lock);
@@ -790,7 +886,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
790 } 886 }
791 nilfs_update_inode(inode, ibh); 887 nilfs_update_inode(inode, ibh);
792 nilfs_mdt_mark_buffer_dirty(ibh); 888 nilfs_mdt_mark_buffer_dirty(ibh);
793 nilfs_mdt_mark_dirty(sbi->s_ifile); 889 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
794 brelse(ibh); 890 brelse(ibh);
795 return 0; 891 return 0;
796} 892}
@@ -808,6 +904,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
808void nilfs_dirty_inode(struct inode *inode) 904void nilfs_dirty_inode(struct inode *inode)
809{ 905{
810 struct nilfs_transaction_info ti; 906 struct nilfs_transaction_info ti;
907 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
811 908
812 if (is_bad_inode(inode)) { 909 if (is_bad_inode(inode)) {
813 nilfs_warning(inode->i_sb, __func__, 910 nilfs_warning(inode->i_sb, __func__,
@@ -815,6 +912,10 @@ void nilfs_dirty_inode(struct inode *inode)
815 dump_stack(); 912 dump_stack();
816 return; 913 return;
817 } 914 }
915 if (mdi) {
916 nilfs_mdt_mark_dirty(inode);
917 return;
918 }
818 nilfs_transaction_begin(inode->i_sb, &ti, 0); 919 nilfs_transaction_begin(inode->i_sb, &ti, 0);
819 nilfs_mark_inode_dirty(inode); 920 nilfs_mark_inode_dirty(inode);
820 nilfs_transaction_commit(inode->i_sb); /* never fails */ 921 nilfs_transaction_commit(inode->i_sb); /* never fails */
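
The nilfs_iget_test()/nilfs_iget_set() pair lets one inode number be cached more than once without collision: a regular inode is keyed by its root, a GC dummy inode by its checkpoint number. Illustration only; the wrapper is hypothetical:

	static void example_dual_lookup(struct super_block *sb,
					struct nilfs_root *root,
					unsigned long ino, __u64 cno)
	{
		struct inode *a = nilfs_iget(sb, root, ino);       /* for_gc == 0 */
		struct inode *b = nilfs_iget_for_gc(sb, ino, cno); /* for_gc == 1 */

		/* a != b: nilfs_iget_test() rejects cross-matches because the
		 * NILFS_I_GCINODE state bit and i_cno differ between the two. */
		if (!IS_ERR(b))
			iput(b);
		if (!IS_ERR(a))
			iput(a);
	}
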
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f90a33d9a5b0..3e90f86d5bfe 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -22,7 +22,6 @@
22 22
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/wait.h> 24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/slab.h> 25#include <linux/slab.h>
27#include <linux/capability.h> /* capable() */ 26#include <linux/capability.h> /* capable() */
28#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
@@ -118,7 +117,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
118 if (copy_from_user(&cpmode, argp, sizeof(cpmode))) 117 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
119 goto out; 118 goto out;
120 119
121 mutex_lock(&nilfs->ns_mount_mutex); 120 down_read(&inode->i_sb->s_umount);
122 121
123 nilfs_transaction_begin(inode->i_sb, &ti, 0); 122 nilfs_transaction_begin(inode->i_sb, &ti, 0);
124 ret = nilfs_cpfile_change_cpmode( 123 ret = nilfs_cpfile_change_cpmode(
@@ -128,7 +127,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
128 else 127 else
129 nilfs_transaction_commit(inode->i_sb); /* never fails */ 128 nilfs_transaction_commit(inode->i_sb); /* never fails */
130 129
131 mutex_unlock(&nilfs->ns_mount_mutex); 130 up_read(&inode->i_sb->s_umount);
132out: 131out:
133 mnt_drop_write(filp->f_path.mnt); 132 mnt_drop_write(filp->f_path.mnt);
134 return ret; 133 return ret;
@@ -334,7 +333,7 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
334 return 0; 333 return 0;
335} 334}
336 335
337static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, 336static int nilfs_ioctl_move_blocks(struct super_block *sb,
338 struct nilfs_argv *argv, void *buf) 337 struct nilfs_argv *argv, void *buf)
339{ 338{
340 size_t nmembs = argv->v_nmembs; 339 size_t nmembs = argv->v_nmembs;
@@ -349,7 +348,7 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
349 for (i = 0, vdesc = buf; i < nmembs; ) { 348 for (i = 0, vdesc = buf; i < nmembs; ) {
350 ino = vdesc->vd_ino; 349 ino = vdesc->vd_ino;
351 cno = vdesc->vd_cno; 350 cno = vdesc->vd_cno;
352 inode = nilfs_gc_iget(nilfs, ino, cno); 351 inode = nilfs_iget_for_gc(sb, ino, cno);
353 if (unlikely(inode == NULL)) { 352 if (unlikely(inode == NULL)) {
354 ret = -ENOMEM; 353 ret = -ENOMEM;
355 goto failed; 354 goto failed;
@@ -357,11 +356,15 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
357 do { 356 do {
358 ret = nilfs_ioctl_move_inode_block(inode, vdesc, 357 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
359 &buffers); 358 &buffers);
360 if (unlikely(ret < 0)) 359 if (unlikely(ret < 0)) {
360 iput(inode);
361 goto failed; 361 goto failed;
362 }
362 vdesc++; 363 vdesc++;
363 } while (++i < nmembs && 364 } while (++i < nmembs &&
364 vdesc->vd_ino == ino && vdesc->vd_cno == cno); 365 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
366
 367 iput(inode); /* The inode remains on the GC inode list */
365 } 368 }
366 369
367 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { 370 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
@@ -567,7 +570,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
567 } 570 }
568 571
569 /* 572 /*
570 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(), 573 * nilfs_ioctl_move_blocks() will call nilfs_iget_for_gc(),
 571 * which will operate on an inode list without blocking. 574 * which will operate on an inode list without blocking.
 572 * To protect the list from concurrent operations, 575 * To protect the list from concurrent operations,
 573 * nilfs_ioctl_move_blocks should be an atomic operation. 576 * nilfs_ioctl_move_blocks should be an atomic operation.
@@ -577,15 +580,16 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
577 goto out_free; 580 goto out_free;
578 } 581 }
579 582
580 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); 583 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
584
585 ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
581 if (ret < 0) 586 if (ret < 0)
582 printk(KERN_ERR "NILFS: GC failed during preparation: " 587 printk(KERN_ERR "NILFS: GC failed during preparation: "
583 "cannot read source blocks: err=%d\n", ret); 588 "cannot read source blocks: err=%d\n", ret);
584 else 589 else
585 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); 590 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
586 591
587 if (ret < 0) 592 nilfs_remove_all_gcinodes(nilfs);
588 nilfs_remove_all_gcinode(nilfs);
589 clear_nilfs_gc_running(nilfs); 593 clear_nilfs_gc_running(nilfs);
590 594
591out_free: 595out_free:
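
The added iput() calls balance only the lookup reference taken by nilfs_iget_for_gc(); the reference that keeps each GC inode alive across the whole cleaning pass is the igrab() taken in nilfs_init_gcinode(). Reference-count sketch for one inode in the loop above:

	/* nilfs_iget_for_gc()          -> +1  (lookup reference)
	 *   nilfs_init_gcinode()      -> +1  (igrab, held by ns_gc_inodes)
	 * iput() in the loop          -> -1  (lookup reference dropped)
	 * nilfs_remove_all_gcinodes() -> -1  (list reference dropped at the
	 *                                     end, now on success and failure
	 *                                     alike) */
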
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index d01aff4957d9..39a5b84e2c9f 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -36,7 +36,6 @@
36 36
37#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) 37#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
38 38
39#define INIT_UNUSED_INODE_FIELDS
40 39
41static int 40static int
42nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, 41nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
@@ -78,25 +77,11 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
78 struct buffer_head *, 77 struct buffer_head *,
79 void *)) 78 void *))
80{ 79{
81 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
82 struct super_block *sb = inode->i_sb; 80 struct super_block *sb = inode->i_sb;
83 struct nilfs_transaction_info ti; 81 struct nilfs_transaction_info ti;
84 struct buffer_head *bh; 82 struct buffer_head *bh;
85 int err; 83 int err;
86 84
87 if (!sb) {
88 /*
89 * Make sure this function is not called from any
90 * read-only context.
91 */
92 if (!nilfs->ns_writer) {
93 WARN_ON(1);
94 err = -EROFS;
95 goto out;
96 }
97 sb = nilfs->ns_writer->s_super;
98 }
99
100 nilfs_transaction_begin(sb, &ti, 0); 85 nilfs_transaction_begin(sb, &ti, 0);
101 86
102 err = -ENOMEM; 87 err = -ENOMEM;
@@ -112,7 +97,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
112 if (buffer_uptodate(bh)) 97 if (buffer_uptodate(bh))
113 goto failed_bh; 98 goto failed_bh;
114 99
115 bh->b_bdev = nilfs->ns_bdev; 100 bh->b_bdev = sb->s_bdev;
116 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); 101 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
117 if (likely(!err)) { 102 if (likely(!err)) {
118 get_bh(bh); 103 get_bh(bh);
@@ -129,7 +114,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
129 err = nilfs_transaction_commit(sb); 114 err = nilfs_transaction_commit(sb);
130 else 115 else
131 nilfs_transaction_abort(sb); 116 nilfs_transaction_abort(sb);
132 out: 117
133 return err; 118 return err;
134} 119}
135 120
@@ -167,9 +152,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
167 unlock_buffer(bh); 152 unlock_buffer(bh);
168 goto failed_bh; 153 goto failed_bh;
169 } 154 }
170 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; 155 map_bh(bh, inode->i_sb, (sector_t)blknum);
171 bh->b_blocknr = (sector_t)blknum;
172 set_buffer_mapped(bh);
173 156
174 bh->b_end_io = end_buffer_read_sync; 157 bh->b_end_io = end_buffer_read_sync;
175 get_bh(bh); 158 get_bh(bh);
@@ -398,35 +381,24 @@ int nilfs_mdt_fetch_dirty(struct inode *inode)
398static int 381static int
399nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) 382nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
400{ 383{
401 struct inode *inode = container_of(page->mapping, 384 struct inode *inode;
402 struct inode, i_data); 385 struct super_block *sb;
403 struct super_block *sb = inode->i_sb;
404 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
405 struct nilfs_sb_info *writer = NULL;
406 int err = 0; 386 int err = 0;
407 387
408 redirty_page_for_writepage(wbc, page); 388 redirty_page_for_writepage(wbc, page);
409 unlock_page(page); 389 unlock_page(page);
410 390
411 if (page->mapping->assoc_mapping) 391 inode = page->mapping->host;
412 return 0; /* Do not request flush for shadow page cache */ 392 if (!inode)
413 if (!sb) { 393 return 0;
414 down_read(&nilfs->ns_writer_sem); 394
415 writer = nilfs->ns_writer; 395 sb = inode->i_sb;
416 if (!writer) {
417 up_read(&nilfs->ns_writer_sem);
418 return -EROFS;
419 }
420 sb = writer->s_super;
421 }
422 396
423 if (wbc->sync_mode == WB_SYNC_ALL) 397 if (wbc->sync_mode == WB_SYNC_ALL)
424 err = nilfs_construct_segment(sb); 398 err = nilfs_construct_segment(sb);
425 else if (wbc->for_reclaim) 399 else if (wbc->for_reclaim)
426 nilfs_flush_segment(sb, inode->i_ino); 400 nilfs_flush_segment(sb, inode->i_ino);
427 401
428 if (writer)
429 up_read(&nilfs->ns_writer_sem);
430 return err; 402 return err;
431} 403}
432 404
@@ -439,105 +411,27 @@ static const struct address_space_operations def_mdt_aops = {
439static const struct inode_operations def_mdt_iops; 411static const struct inode_operations def_mdt_iops;
440static const struct file_operations def_mdt_fops; 412static const struct file_operations def_mdt_fops;
441 413
442/* 414
443 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, 415int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
444 * ifile, or gcinodes. This allows the B-tree code and segment constructor
445 * to treat them like regular files, and this helps to simplify the
446 * implementation.
447 * On the other hand, some of the pseudo inodes have an irregular point:
448 * They don't have valid inode->i_sb pointer because their lifetimes are
449 * longer than those of the super block structs; they may continue for
450 * several consecutive mounts/umounts. This would need discussions.
451 */
452/**
453 * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
454 * @nilfs: nilfs object
455 * @sb: super block instance the metadata file belongs to
456 * @ino: inode number
457 * @gfp_mask: gfp mask for data pages
458 * @objsz: size of the private object attached to inode->i_private
459 */
460struct inode *
461nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
462 ino_t ino, gfp_t gfp_mask, size_t objsz)
463{ 416{
464 struct inode *inode = nilfs_alloc_inode_common(nilfs); 417 struct nilfs_mdt_info *mi;
465 418
466 if (!inode) 419 mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
467 return NULL; 420 if (!mi)
468 else { 421 return -ENOMEM;
469 struct address_space * const mapping = &inode->i_data;
470 struct nilfs_mdt_info *mi;
471
472 mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
473 if (!mi) {
474 nilfs_destroy_inode(inode);
475 return NULL;
476 }
477 mi->mi_nilfs = nilfs;
478 init_rwsem(&mi->mi_sem);
479
480 inode->i_sb = sb; /* sb may be NULL for some meta data files */
481 inode->i_blkbits = nilfs->ns_blocksize_bits;
482 inode->i_flags = 0;
483 atomic_set(&inode->i_count, 1);
484 inode->i_nlink = 1;
485 inode->i_ino = ino;
486 inode->i_mode = S_IFREG;
487 inode->i_private = mi;
488
489#ifdef INIT_UNUSED_INODE_FIELDS
490 atomic_set(&inode->i_writecount, 0);
491 inode->i_size = 0;
492 inode->i_blocks = 0;
493 inode->i_bytes = 0;
494 inode->i_generation = 0;
495#ifdef CONFIG_QUOTA
496 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
497#endif
498 inode->i_pipe = NULL;
499 inode->i_bdev = NULL;
500 inode->i_cdev = NULL;
501 inode->i_rdev = 0;
502#ifdef CONFIG_SECURITY
503 inode->i_security = NULL;
504#endif
505 inode->dirtied_when = 0;
506
507 INIT_LIST_HEAD(&inode->i_list);
508 INIT_LIST_HEAD(&inode->i_sb_list);
509 inode->i_state = 0;
510#endif
511
512 spin_lock_init(&inode->i_lock);
513 mutex_init(&inode->i_mutex);
514 init_rwsem(&inode->i_alloc_sem);
515
516 mapping->host = NULL; /* instead of inode */
517 mapping->flags = 0;
518 mapping_set_gfp_mask(mapping, gfp_mask);
519 mapping->assoc_mapping = NULL;
520 mapping->backing_dev_info = nilfs->ns_bdi;
521
522 inode->i_mapping = mapping;
523 }
524 422
525 return inode; 423 init_rwsem(&mi->mi_sem);
526} 424 inode->i_private = mi;
527 425
528struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, 426 inode->i_mode = S_IFREG;
529 ino_t ino, size_t objsz) 427 mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
530{ 428 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
531 struct inode *inode;
532
533 inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz);
534 if (!inode)
535 return NULL;
536 429
537 inode->i_op = &def_mdt_iops; 430 inode->i_op = &def_mdt_iops;
538 inode->i_fop = &def_mdt_fops; 431 inode->i_fop = &def_mdt_fops;
539 inode->i_mapping->a_ops = &def_mdt_aops; 432 inode->i_mapping->a_ops = &def_mdt_aops;
540 return inode; 433
434 return 0;
541} 435}
542 436
543void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, 437void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
@@ -550,34 +444,159 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
550 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); 444 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
551} 445}
552 446
553void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow) 447static const struct address_space_operations shadow_map_aops = {
448 .sync_page = block_sync_page,
449};
450
451/**
 452 * nilfs_mdt_setup_shadow_map - set up shadow map and bind it to the metadata file
453 * @inode: inode of the metadata file
454 * @shadow: shadow mapping
455 */
456int nilfs_mdt_setup_shadow_map(struct inode *inode,
457 struct nilfs_shadow_map *shadow)
554{ 458{
555 shadow->i_mapping->assoc_mapping = orig->i_mapping; 459 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
556 NILFS_I(shadow)->i_btnode_cache.assoc_mapping = 460 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
557 &NILFS_I(orig)->i_btnode_cache; 461
462 INIT_LIST_HEAD(&shadow->frozen_buffers);
463 nilfs_mapping_init_once(&shadow->frozen_data);
464 nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
465 nilfs_mapping_init_once(&shadow->frozen_btnodes);
466 nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
467 mi->mi_shadow = shadow;
468 return 0;
558} 469}
559 470
560static void nilfs_mdt_clear(struct inode *inode) 471/**
472 * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map
473 * @inode: inode of the metadata file
474 */
475int nilfs_mdt_save_to_shadow_map(struct inode *inode)
561{ 476{
477 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
562 struct nilfs_inode_info *ii = NILFS_I(inode); 478 struct nilfs_inode_info *ii = NILFS_I(inode);
479 struct nilfs_shadow_map *shadow = mi->mi_shadow;
480 int ret;
563 481
564 invalidate_mapping_pages(inode->i_mapping, 0, -1); 482 ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
565 truncate_inode_pages(inode->i_mapping, 0); 483 if (ret)
484 goto out;
485
486 ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
487 &ii->i_btnode_cache);
488 if (ret)
489 goto out;
566 490
567 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 491 nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store);
568 nilfs_bmap_clear(ii->i_bmap); 492 out:
569 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 493 return ret;
570} 494}
571 495
572void nilfs_mdt_destroy(struct inode *inode) 496int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
573{ 497{
574 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 498 struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
499 struct buffer_head *bh_frozen;
500 struct page *page;
501 int blkbits = inode->i_blkbits;
502 int ret = -ENOMEM;
503
504 page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
505 if (!page)
506 return ret;
507
508 if (!page_has_buffers(page))
509 create_empty_buffers(page, 1 << blkbits, 0);
510
511 bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
512 if (bh_frozen) {
513 if (!buffer_uptodate(bh_frozen))
514 nilfs_copy_buffer(bh_frozen, bh);
515 if (list_empty(&bh_frozen->b_assoc_buffers)) {
516 list_add_tail(&bh_frozen->b_assoc_buffers,
517 &shadow->frozen_buffers);
518 set_buffer_nilfs_redirected(bh);
519 } else {
520 brelse(bh_frozen); /* already frozen */
521 }
522 ret = 0;
523 }
524 unlock_page(page);
525 page_cache_release(page);
526 return ret;
527}
528
529struct buffer_head *
530nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
531{
532 struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
533 struct buffer_head *bh_frozen = NULL;
534 struct page *page;
535 int n;
536
537 page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
538 if (page) {
539 if (page_has_buffers(page)) {
540 n = bh_offset(bh) >> inode->i_blkbits;
541 bh_frozen = nilfs_page_get_nth_block(page, n);
542 }
543 unlock_page(page);
544 page_cache_release(page);
545 }
546 return bh_frozen;
547}
548
549static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow)
550{
551 struct list_head *head = &shadow->frozen_buffers;
552 struct buffer_head *bh;
553
554 while (!list_empty(head)) {
555 bh = list_first_entry(head, struct buffer_head,
556 b_assoc_buffers);
557 list_del_init(&bh->b_assoc_buffers);
558 brelse(bh); /* drop ref-count to make it releasable */
559 }
560}
561
562/**
563 * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state
564 * @inode: inode of the metadata file
565 */
566void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
567{
568 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
569 struct nilfs_inode_info *ii = NILFS_I(inode);
570 struct nilfs_shadow_map *shadow = mi->mi_shadow;
571
572 down_write(&mi->mi_sem);
575 573
576 if (mdi->mi_palloc_cache) 574 if (mi->mi_palloc_cache)
577 nilfs_palloc_destroy_cache(inode); 575 nilfs_palloc_clear_cache(inode);
578 nilfs_mdt_clear(inode); 576
577 nilfs_clear_dirty_pages(inode->i_mapping);
578 nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
579
580 nilfs_clear_dirty_pages(&ii->i_btnode_cache);
581 nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
582
583 nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
584
585 up_write(&mi->mi_sem);
586}
587
588/**
589 * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches
590 * @inode: inode of the metadata file
591 */
592void nilfs_mdt_clear_shadow_map(struct inode *inode)
593{
594 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
595 struct nilfs_shadow_map *shadow = mi->mi_shadow;
579 596
580 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ 597 down_write(&mi->mi_sem);
581 kfree(mdi); 598 nilfs_release_frozen_buffers(shadow);
582 nilfs_destroy_inode(inode); 599 truncate_inode_pages(&shadow->frozen_data, 0);
600 truncate_inode_pages(&shadow->frozen_btnodes, 0);
601 up_write(&mi->mi_sem);
583} 602}
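
The freeze/thaw helpers close the loop with the dat.c hunks at the top of this patch: nilfs_dat_move() freezes an entry buffer into the shadow map before overwriting it, and nilfs_dat_translate() transparently substitutes the frozen copy when GC is not the caller. Combined sketch, restating the two sides:

	/* writer side (nilfs_dat_move) */
	if (!buffer_nilfs_redirected(entry_bh)) {
		ret = nilfs_mdt_freeze_buffer(dat, entry_bh);
		if (ret)
			return ret;
	}

	/* reader side (nilfs_dat_translate) */
	if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) {
		bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh);
		if (bh) {
			brelse(entry_bh);
			entry_bh = bh;	/* translate against the pre-GC contents */
		}
	}
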
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 6c4bbb0470fc..b13734bf3521 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -28,26 +28,33 @@
28#include "nilfs.h" 28#include "nilfs.h"
29#include "page.h" 29#include "page.h"
30 30
31struct nilfs_shadow_map {
32 struct nilfs_bmap_store bmap_store;
33 struct address_space frozen_data;
34 struct address_space frozen_btnodes;
35 struct list_head frozen_buffers;
36};
37
31/** 38/**
32 * struct nilfs_mdt_info - on-memory private data of meta data files 39 * struct nilfs_mdt_info - on-memory private data of meta data files
33 * @mi_nilfs: back pointer to the_nilfs struct
34 * @mi_sem: reader/writer semaphore for meta data operations 40 * @mi_sem: reader/writer semaphore for meta data operations
35 * @mi_bgl: per-blockgroup locking 41 * @mi_bgl: per-blockgroup locking
36 * @mi_entry_size: size of an entry 42 * @mi_entry_size: size of an entry
37 * @mi_first_entry_offset: offset to the first entry 43 * @mi_first_entry_offset: offset to the first entry
38 * @mi_entries_per_block: number of entries in a block 44 * @mi_entries_per_block: number of entries in a block
39 * @mi_palloc_cache: persistent object allocator cache 45 * @mi_palloc_cache: persistent object allocator cache
46 * @mi_shadow: shadow of bmap and page caches
40 * @mi_blocks_per_group: number of blocks in a group 47 * @mi_blocks_per_group: number of blocks in a group
41 * @mi_blocks_per_desc_block: number of blocks per descriptor block 48 * @mi_blocks_per_desc_block: number of blocks per descriptor block
42 */ 49 */
43struct nilfs_mdt_info { 50struct nilfs_mdt_info {
44 struct the_nilfs *mi_nilfs;
45 struct rw_semaphore mi_sem; 51 struct rw_semaphore mi_sem;
46 struct blockgroup_lock *mi_bgl; 52 struct blockgroup_lock *mi_bgl;
47 unsigned mi_entry_size; 53 unsigned mi_entry_size;
48 unsigned mi_first_entry_offset; 54 unsigned mi_first_entry_offset;
49 unsigned long mi_entries_per_block; 55 unsigned long mi_entries_per_block;
50 struct nilfs_palloc_cache *mi_palloc_cache; 56 struct nilfs_palloc_cache *mi_palloc_cache;
57 struct nilfs_shadow_map *mi_shadow;
51 unsigned long mi_blocks_per_group; 58 unsigned long mi_blocks_per_group;
52 unsigned long mi_blocks_per_desc_block; 59 unsigned long mi_blocks_per_desc_block;
53}; 60};
@@ -59,9 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
59 66
60static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) 67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
61{ 68{
62 struct super_block *sb = inode->i_sb; 69 return NILFS_SB(inode->i_sb)->s_nilfs;
63
64 return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
65} 70}
66 71
67/* Default GFP flags using highmem */ 72/* Default GFP flags using highmem */
@@ -76,14 +81,17 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
76int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); 81int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
77int nilfs_mdt_fetch_dirty(struct inode *); 82int nilfs_mdt_fetch_dirty(struct inode *);
78 83
79struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, 84int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
80 size_t);
81struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
82 ino_t, gfp_t, size_t);
83void nilfs_mdt_destroy(struct inode *);
84void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); 85void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
85void nilfs_mdt_set_shadow(struct inode *, struct inode *);
86 86
87int nilfs_mdt_setup_shadow_map(struct inode *inode,
88 struct nilfs_shadow_map *shadow);
89int nilfs_mdt_save_to_shadow_map(struct inode *inode);
90void nilfs_mdt_restore_from_shadow_map(struct inode *inode);
91void nilfs_mdt_clear_shadow_map(struct inode *inode);
92int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh);
93struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode,
94 struct buffer_head *bh);
87 95
88#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh) 96#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
89 97
@@ -100,7 +108,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode)
100 108
101static inline __u64 nilfs_mdt_cno(struct inode *inode) 109static inline __u64 nilfs_mdt_cno(struct inode *inode)
102{ 110{
103 return NILFS_MDT(inode)->mi_nilfs->ns_cno; 111 return NILFS_I_NILFS(inode)->ns_cno;
104} 112}
105 113
106#define nilfs_mdt_bgl_lock(inode, bg) \ 114#define nilfs_mdt_bgl_lock(inode, bg) \
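
Owners embed struct nilfs_shadow_map in their private info object and hand it to nilfs_mdt_setup_shadow_map(), as dat.c does with &di->shadow above. A sketch of such an info struct; the field order is illustrative, not the actual definition:

	struct example_dat_info {			/* sketch only */
		struct nilfs_mdt_info mi;		/* must come first: nilfs_mdt_init()
							   allocates max(sizeof(mi), objsz),
							   and NILFS_MDT() returns i_private */
		struct nilfs_palloc_cache palloc_cache;
		struct nilfs_shadow_map shadow;		/* frozen page caches + bmap store */
	};
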
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ad6ed2cf19b4..6e9557ecf161 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -40,7 +40,11 @@
40 40
41#include <linux/pagemap.h> 41#include <linux/pagemap.h>
42#include "nilfs.h" 42#include "nilfs.h"
43#include "export.h"
43 44
45#define NILFS_FID_SIZE_NON_CONNECTABLE \
46 (offsetof(struct nilfs_fid, parent_gen) / 4)
47#define NILFS_FID_SIZE_CONNECTABLE (sizeof(struct nilfs_fid) / 4)
44 48
45static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) 49static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
46{ 50{
@@ -70,29 +74,13 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
70 ino = nilfs_inode_by_name(dir, &dentry->d_name); 74 ino = nilfs_inode_by_name(dir, &dentry->d_name);
71 inode = NULL; 75 inode = NULL;
72 if (ino) { 76 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino); 77 inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
74 if (IS_ERR(inode)) 78 if (IS_ERR(inode))
75 return ERR_CAST(inode); 79 return ERR_CAST(inode);
76 } 80 }
77 return d_splice_alias(inode, dentry); 81 return d_splice_alias(inode, dentry);
78} 82}
79 83
80struct dentry *nilfs_get_parent(struct dentry *child)
81{
82 unsigned long ino;
83 struct inode *inode;
84 struct qstr dotdot = {.name = "..", .len = 2};
85
86 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
87 if (!ino)
88 return ERR_PTR(-ENOENT);
89
90 inode = nilfs_iget(child->d_inode->i_sb, ino);
91 if (IS_ERR(inode))
92 return ERR_CAST(inode);
93 return d_obtain_alias(inode);
94}
95
96/* 84/*
97 * By the time this is called, we already have created 85 * By the time this is called, we already have created
98 * the directory cache entry for the new file, but it 86 * the directory cache entry for the new file, but it
@@ -219,7 +207,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
219 207
220 inode->i_ctime = CURRENT_TIME; 208 inode->i_ctime = CURRENT_TIME;
221 inode_inc_link_count(inode); 209 inode_inc_link_count(inode);
222 atomic_inc(&inode->i_count); 210 ihold(inode);
223 211
224 err = nilfs_add_nondir(dentry, inode); 212 err = nilfs_add_nondir(dentry, inode);
225 if (!err) 213 if (!err)
@@ -468,6 +456,115 @@ out:
 	return err;
 }
 
+/*
+ * Export operations
+ */
+static struct dentry *nilfs_get_parent(struct dentry *child)
+{
+	unsigned long ino;
+	struct inode *inode;
+	struct qstr dotdot = {.name = "..", .len = 2};
+	struct nilfs_root *root;
+
+	ino = nilfs_inode_by_name(child->d_inode, &dotdot);
+	if (!ino)
+		return ERR_PTR(-ENOENT);
+
+	root = NILFS_I(child->d_inode)->i_root;
+
+	inode = nilfs_iget(child->d_inode->i_sb, root, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
+				       u64 ino, u32 gen)
+{
+	struct nilfs_root *root;
+	struct inode *inode;
+
+	if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
+		return ERR_PTR(-ESTALE);
+
+	root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
+	if (!root)
+		return ERR_PTR(-ESTALE);
+
+	inode = nilfs_iget(sb, root, ino);
+	nilfs_put_root(root);
+
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (gen && inode->i_generation != gen) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return d_obtain_alias(inode);
+}
+
+static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+
+	if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
+	     fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
+	    (fh_type != FILEID_NILFS_WITH_PARENT &&
+	     fh_type != FILEID_NILFS_WITHOUT_PARENT))
+		return NULL;
+
+	return nilfs_get_dentry(sb, fid->cno, fid->ino, fid->gen);
+}
+
+static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+
+	if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
+	    fh_type != FILEID_NILFS_WITH_PARENT)
+		return NULL;
+
+	return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
+}
+
+static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
+			   int connectable)
+{
+	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+	struct inode *inode = dentry->d_inode;
+	struct nilfs_root *root = NILFS_I(inode)->i_root;
+	int type;
+
+	if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE ||
+	    (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE))
+		return 255;
+
+	fid->cno = root->cno;
+	fid->ino = inode->i_ino;
+	fid->gen = inode->i_generation;
+
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		struct inode *parent;
+
+		spin_lock(&dentry->d_lock);
+		parent = dentry->d_parent->d_inode;
+		fid->parent_ino = parent->i_ino;
+		fid->parent_gen = parent->i_generation;
+		spin_unlock(&dentry->d_lock);
+
+		type = FILEID_NILFS_WITH_PARENT;
+		*lenp = NILFS_FID_SIZE_CONNECTABLE;
+	} else {
+		type = FILEID_NILFS_WITHOUT_PARENT;
+		*lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
+	}
+
+	return type;
+}
+
 const struct inode_operations nilfs_dir_inode_operations = {
 	.create		= nilfs_create,
 	.lookup		= nilfs_lookup,
@@ -491,4 +588,12 @@ const struct inode_operations nilfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.permission	= nilfs_permission,
+};
+
+const struct export_operations nilfs_export_ops = {
+	.encode_fh = nilfs_encode_fh,
+	.fh_to_dentry = nilfs_fh_to_dentry,
+	.fh_to_parent = nilfs_fh_to_parent,
+	.get_parent = nilfs_get_parent,
 };
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index d3d54046e5f8..f7560da5a567 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -59,6 +59,7 @@ struct nilfs_inode_info {
 #endif
 	struct buffer_head *i_bh;	/* i_bh contains a new or dirty
 					   disk inode */
+	struct nilfs_root *i_root;
 	struct inode vfs_inode;
 };
 
@@ -100,7 +101,6 @@ enum {
 	NILFS_I_INODE_DIRTY,		/* write_inode is requested */
 	NILFS_I_BMAP,			/* has bmap and btnode_cache */
 	NILFS_I_GCINODE,		/* inode for GC, on memory only */
-	NILFS_I_GCDAT,			/* shadow DAT, on memory only */
 };
 
 /*
@@ -192,7 +192,7 @@ static inline int nilfs_doing_construction(void)
 
 static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
 {
-	return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
+	return nilfs->ns_dat;
 }
 
 /*
@@ -200,12 +200,9 @@ static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
  */
 #ifdef CONFIG_NILFS_POSIX_ACL
 #error "NILFS: not yet supported POSIX ACL"
-extern int nilfs_permission(struct inode *, int, struct nameidata *);
 extern int nilfs_acl_chmod(struct inode *);
 extern int nilfs_init_acl(struct inode *, struct inode *);
 #else
-#define nilfs_permission	NULL
-
 static inline int nilfs_acl_chmod(struct inode *inode)
 {
 	return 0;
@@ -247,11 +244,19 @@ extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern void nilfs_set_inode_flags(struct inode *);
 extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
 extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
-extern struct inode *nilfs_iget(struct super_block *, unsigned long);
+struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
+			    unsigned long ino);
+struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
+				unsigned long ino);
+struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
+			 unsigned long ino);
+extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
+				       unsigned long ino, __u64 cno);
 extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
+int nilfs_permission(struct inode *inode, int mask);
 extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
 				  struct buffer_head **);
 extern int nilfs_inode_dirty(struct inode *);
@@ -260,11 +265,7 @@ extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
 extern int nilfs_mark_inode_dirty(struct inode *);
 extern void nilfs_dirty_inode(struct inode *);
 
-/* namei.c */
-extern struct dentry *nilfs_get_parent(struct dentry *);
-
 /* super.c */
-extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
 extern struct inode *nilfs_alloc_inode(struct super_block *);
 extern void nilfs_destroy_inode(struct inode *);
 extern void nilfs_error(struct super_block *, const char *, const char *, ...)
@@ -283,8 +284,9 @@ extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
 					      int flip);
 extern int nilfs_commit_super(struct nilfs_sb_info *, int);
 extern int nilfs_cleanup_super(struct nilfs_sb_info *);
-extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
-extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
+int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
+			    struct nilfs_root **root);
+int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
 
 /* gcinode.c */
 int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
@@ -292,16 +294,8 @@ int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
 int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
 				   struct buffer_head **);
 int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
-int nilfs_init_gccache(struct the_nilfs *);
-void nilfs_destroy_gccache(struct the_nilfs *);
-void nilfs_clear_gcinode(struct inode *);
-struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
-void nilfs_remove_all_gcinode(struct the_nilfs *);
-
-/* gcdat.c */
-int nilfs_init_gcdat_inode(struct the_nilfs *);
-void nilfs_commit_gcdat_inode(struct the_nilfs *);
-void nilfs_clear_gcdat_inode(struct the_nilfs *);
+int nilfs_init_gcinode(struct inode *inode);
+void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
 
 /*
  * Inodes and files operations
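
With the extra nilfs_root argument, the inode cache is effectively keyed on the (root, ino) pair: the same inode number can be instantiated once per mounted checkpoint. A toy sketch of the composite-key match; in the kernel this would be the test callback handed to iget5_locked(), and all names here are illustrative:

#include <stdbool.h>

struct nilfs_root;	/* one object per mounted checkpoint */

struct cached_inode {
	struct nilfs_root *root;
	unsigned long ino;
};

/* Two cached inodes are "the same" only if both the inode number
 * and the checkpoint root match. */
static bool inode_matches(const struct cached_inode *ci,
			  const struct nilfs_root *root, unsigned long ino)
{
	return ci->ino == ino && ci->root == root;
}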
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index aab11db2cb08..a6c3c2e817f8 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -79,8 +79,8 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 {
 	int blkbits = inode->i_blkbits;
 	pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
-	struct page *page, *opage;
-	struct buffer_head *bh, *obh;
+	struct page *page;
+	struct buffer_head *bh;
 
 	page = grab_cache_page(mapping, index);
 	if (unlikely(!page))
@@ -92,30 +92,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 		page_cache_release(page);
 		return NULL;
 	}
-	if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
-		/*
-		 * Shadow page cache uses assoc_mapping to point its original
-		 * page cache.  The following code tries the original cache
-		 * if the given cache is a shadow and it didn't hit.
-		 */
-		opage = find_lock_page(mapping->assoc_mapping, index);
-		if (!opage)
-			return bh;
-
-		obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
-					     b_state);
-		if (buffer_uptodate(obh)) {
-			nilfs_copy_buffer(bh, obh);
-			if (buffer_dirty(obh)) {
-				nilfs_mark_buffer_dirty(bh);
-				if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
-					nilfs_mdt_mark_dirty(inode);
-			}
-		}
-		brelse(obh);
-		unlock_page(opage);
-		page_cache_release(opage);
-	}
 	return bh;
 }
 
@@ -131,6 +107,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
 	lock_buffer(bh);
 	clear_buffer_nilfs_volatile(bh);
 	clear_buffer_nilfs_checked(bh);
+	clear_buffer_nilfs_redirected(bh);
 	clear_buffer_dirty(bh);
 	if (nilfs_page_buffers_clean(page))
 		__nilfs_clear_page_dirty(page);
@@ -483,6 +460,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
 			clear_buffer_nilfs_checked(bh);
+			clear_buffer_nilfs_redirected(bh);
 			clear_buffer_uptodate(bh);
 			clear_buffer_mapped(bh);
 			unlock_buffer(bh);
@@ -513,6 +491,31 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
 	}
 	return nc;
 }
+
+void nilfs_mapping_init_once(struct address_space *mapping)
+{
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+}
+
+void nilfs_mapping_init(struct address_space *mapping,
+			struct backing_dev_info *bdi,
+			const struct address_space_operations *aops)
+{
+	mapping->host = NULL;
+	mapping->flags = 0;
+	mapping_set_gfp_mask(mapping, GFP_NOFS);
+	mapping->assoc_mapping = NULL;
+	mapping->backing_dev_info = bdi;
+	mapping->a_ops = aops;
+}
 
 /*
  * NILFS2 needs clear_page_dirty() in the following two cases:
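
The nilfs_mapping_init_once()/nilfs_mapping_init() pair added above follows the usual constructor split: locks, list heads, and tree roots are set up once per object lifetime, while the cheap per-use fields are reset each time the mapping is pressed into service. A standalone sketch of that pattern, with an invented object:

#include <pthread.h>
#include <string.h>

struct mapping_sketch {
	pthread_mutex_t lock;	/* invariant: initialized once */
	void *host;		/* per-use */
	unsigned flags;		/* per-use */
};

/* Runs once per object lifetime (the slab-constructor slot). */
static void mapping_init_once(struct mapping_sketch *m)
{
	memset(m, 0, sizeof(*m));
	pthread_mutex_init(&m->lock, NULL);
}

/* Runs every time the object is handed out for use. */
static void mapping_init(struct mapping_sketch *m, void *host)
{
	m->host = host;
	m->flags = 0;
}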
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index f53d8da41ed7..fb9e8a8a2038 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -35,12 +35,14 @@ enum {
 	BH_NILFS_Node,
 	BH_NILFS_Volatile,
 	BH_NILFS_Checked,
+	BH_NILFS_Redirected,
 };
 
 BUFFER_FNS(NILFS_Allocated, nilfs_allocated)	/* nilfs private buffers */
 BUFFER_FNS(NILFS_Node, nilfs_node)		/* nilfs node buffers */
 BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
 BUFFER_FNS(NILFS_Checked, nilfs_checked)	/* buffer is verified */
+BUFFER_FNS(NILFS_Redirected, nilfs_redirected)	/* redirected to a copy */
 
 
 void nilfs_mark_buffer_dirty(struct buffer_head *bh);
@@ -59,6 +61,10 @@ void nilfs_free_private_page(struct page *);
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
+void nilfs_mapping_init_once(struct address_space *mapping);
+void nilfs_mapping_init(struct address_space *mapping,
+			struct backing_dev_info *bdi,
+			const struct address_space_operations *aops);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
 
 #define NILFS_PAGE_BUG(page, m, a...)		\
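
Each BUFFER_FNS(Flag, name) line generates set_buffer_name(), clear_buffer_name() and buffer_name() as atomic bit operations on bh->b_state; that is where the clear_buffer_nilfs_redirected() calls used in page.c come from. A simplified userspace rendering of what the new line expands to, in spirit (the bit index and types are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

struct buffer_head_sketch { atomic_ulong b_state; };

enum { BH_NILFS_Redirected_sketch = 24 };	/* illustrative bit index */

static void set_buffer_nilfs_redirected(struct buffer_head_sketch *bh)
{
	atomic_fetch_or(&bh->b_state, 1UL << BH_NILFS_Redirected_sketch);
}

static void clear_buffer_nilfs_redirected(struct buffer_head_sketch *bh)
{
	atomic_fetch_and(&bh->b_state, ~(1UL << BH_NILFS_Redirected_sketch));
}

static bool buffer_nilfs_redirected(struct buffer_head_sketch *bh)
{
	return atomic_load(&bh->b_state) & (1UL << BH_NILFS_Redirected_sketch);
}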
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d0c35ef39f6a..5d2711c28da7 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -440,7 +440,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 	segnum[2] = ri->ri_segnum;
 	segnum[3] = ri->ri_nextnum;
 
-	nilfs_attach_writer(nilfs, sbi);
 	/*
 	 * Releasing the next segment of the latest super root.
 	 * The next segment is invalidated by this recovery.
@@ -480,7 +479,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 
  failed:
 	/* No need to recover sufile because it will be destroyed on error */
-	nilfs_detach_writer(nilfs, sbi);
 	return err;
 }
 
@@ -504,6 +502,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 
 static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
 				      struct nilfs_sb_info *sbi,
+				      struct nilfs_root *root,
 				      struct list_head *head,
 				      unsigned long *nr_salvaged_blocks)
 {
@@ -515,7 +514,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
 	int err = 0, err2 = 0;
 
 	list_for_each_entry_safe(rb, n, head, list) {
-		inode = nilfs_iget(sbi->s_super, rb->ino);
+		inode = nilfs_iget(sbi->s_super, root, rb->ino);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			inode = NULL;
@@ -578,6 +577,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
  */
 static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 				 struct nilfs_sb_info *sbi,
+				 struct nilfs_root *root,
 				 struct nilfs_recovery_info *ri)
 {
 	struct buffer_head *bh_sum = NULL;
@@ -597,7 +597,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 	};
 	int state = RF_INIT_ST;
 
-	nilfs_attach_writer(nilfs, sbi);
 	pseg_start = ri->ri_lsegs_start;
 	seg_seq = ri->ri_lsegs_start_seq;
 	segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
@@ -649,7 +648,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 				goto failed;
 			if (flags & NILFS_SS_LOGEND) {
 				err = nilfs_recover_dsync_blocks(
-					nilfs, sbi, &dsync_blocks,
+					nilfs, sbi, root, &dsync_blocks,
 					&nsalvaged_blocks);
 				if (unlikely(err))
 					goto failed;
@@ -688,7 +687,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
  out:
 	brelse(bh_sum);
 	dispose_recovery_list(&dsync_blocks);
-	nilfs_detach_writer(nilfs, sbi);
 	return err;
 
  confused:
@@ -746,19 +744,20 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
 			      struct nilfs_sb_info *sbi,
 			      struct nilfs_recovery_info *ri)
 {
+	struct nilfs_root *root;
 	int err;
 
 	if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
 		return 0;
 
-	err = nilfs_attach_checkpoint(sbi, ri->ri_cno);
+	err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root);
 	if (unlikely(err)) {
 		printk(KERN_ERR
 		       "NILFS: error loading the latest checkpoint.\n");
 		return err;
 	}
 
-	err = nilfs_do_roll_forward(nilfs, sbi, ri);
+	err = nilfs_do_roll_forward(nilfs, sbi, root, ri);
 	if (unlikely(err))
 		goto failed;
 
@@ -770,7 +769,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
 		goto failed;
 	}
 
-	err = nilfs_attach_segment_constructor(sbi);
+	err = nilfs_attach_segment_constructor(sbi, root);
 	if (unlikely(err))
 		goto failed;
 
@@ -788,7 +787,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
 	}
 
  failed:
-	nilfs_detach_checkpoint(sbi);
+	nilfs_put_root(root);
 	return err;
 }
 
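
nilfs_salvage_orphan_logs() now obtains a referenced nilfs_root from nilfs_attach_checkpoint() and releases it with nilfs_put_root() at the common exit, so the checkpoint's ifile stays pinned exactly as long as roll-forward needs it. A standalone sketch of that acquire/release discipline; the counter placement and types are illustrative:

#include <stdatomic.h>
#include <stdlib.h>

struct root_sketch { atomic_int count; /* ...cno, ifile, ... */ };

static struct root_sketch *root_get(struct root_sketch *r)
{
	atomic_fetch_add(&r->count, 1);
	return r;
}

static void root_put(struct root_sketch *r)
{
	if (atomic_fetch_sub(&r->count, 1) == 1)
		free(r);	/* last reference gone */
}

Usage mirrors the recovery path: the attach step takes a reference, and every return path pairs it with exactly one put.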
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 0776ccc2504a..35a07157b980 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -42,11 +42,6 @@ struct nilfs_sc_info;
  * NILFS super-block data in memory
  */
 struct nilfs_sb_info {
-	/* Snapshot status */
-	__u64 s_snapshot_cno;		/* Checkpoint number */
-	atomic_t s_inodes_count;
-	atomic_t s_blocks_count;	/* Reserved (might be deleted) */
-
 	/* Mount options */
 	unsigned long s_mount_opt;
 	uid_t s_resuid;
@@ -59,8 +54,6 @@ struct nilfs_sb_info {
 	/* Fundamental members */
 	struct super_block *s_super;	/* reverse pointer to super_block */
 	struct the_nilfs *s_nilfs;
-	struct list_head s_list;	/* list head for nilfs->ns_supers */
-	atomic_t s_count;		/* reference count */
 
 	/* Segment constructor */
 	struct list_head s_dirty_files;	/* dirty files list */
@@ -68,9 +61,6 @@ struct nilfs_sb_info {
 	spinlock_t s_inode_lock;	/* Lock for the nilfs inode.
 					   It covers s_dirty_files list */
 
-	/* Metadata files */
-	struct inode *s_ifile;		/* index file inode */
-
 	/* Inode allocator */
 	spinlock_t s_next_gen_lock;
 	u32 s_next_generation;
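
The snapshot state dropped from nilfs_sb_info here (s_snapshot_cno, s_inodes_count, s_blocks_count, s_ifile) migrates to the per-checkpoint struct nilfs_root used throughout this series. A sketch of what that object must minimally carry, assembled only from the members the diff actually touches; the real definition certainly has more fields (checkpoint-tree linkage, hash state, and so on):

#include <stdatomic.h>
#include <stdint.h>

struct inode;
struct the_nilfs;

/* Reconstructed from usage in this series; illustrative only. */
struct nilfs_root_sketch {
	uint64_t cno;		  /* checkpoint number being served */
	struct inode *ifile;	  /* per-checkpoint inode file */
	atomic_long inodes_count;
	atomic_long blocks_count;
	struct the_nilfs *nilfs;  /* back pointer to the device object */
	atomic_int count;	  /* nilfs_get_root()/nilfs_put_root() */
};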
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 4588fb9e93df..0f83e93935b2 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -371,7 +371,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
 	struct bio *bio = wi->bio;
 	int err;
 
-	if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) {
+	if (segbuf->sb_nbio > 0 &&
+	    bdi_write_congested(segbuf->sb_super->s_bdi)) {
 		wait_for_completion(&segbuf->sb_bio_event);
 		segbuf->sb_nbio--;
 		if (unlikely(atomic_read(&segbuf->sb_err))) {
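
The rewritten check keeps the same throttling idea while taking the backing device from the superblock instead of the nilfs object: once at least one bio is in flight and the device reports write congestion, the submitter absorbs a completion before queuing more. The shape of that self-throttle in a standalone sketch (the congestion predicate is a stub):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
static int in_flight;

static bool device_congested(void) { return in_flight >= 8; /* stub */ }

static void submit_one(void)
{
	pthread_mutex_lock(&lock);
	/* With work already queued and the device congested, absorb
	 * one completion before adding more, as the segbuf code does. */
	while (in_flight > 0 && device_congested())
		pthread_cond_wait(&done, &lock);
	in_flight++;
	pthread_mutex_unlock(&lock);
	/* ... hand the request to the device here ... */
}

static void complete_one(void)
{
	pthread_mutex_lock(&lock);
	in_flight--;
	pthread_cond_signal(&done);
	pthread_mutex_unlock(&lock);
}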
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9fd051a33c4f..687d090cea34 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -191,6 +191,8 @@ int nilfs_transaction_begin(struct super_block *sb,
 	if (ret > 0)
 		return 0;
 
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+
 	sbi = NILFS_SB(sb);
 	nilfs = sbi->s_nilfs;
 	down_read(&nilfs->ns_segctor_sem);
@@ -366,8 +368,7 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
 
 	if (nilfs_doing_gc())
 		flags = NILFS_SS_GC;
-	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
-				 sci->sc_sbi->s_nilfs->ns_cno);
+	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno);
 	if (unlikely(err))
 		return err;
 
@@ -440,17 +441,26 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
 	struct nilfs_finfo *finfo;
 	struct nilfs_inode_info *ii;
 	struct nilfs_segment_buffer *segbuf;
+	__u64 cno;
 
 	if (sci->sc_blk_cnt == 0)
 		return;
 
 	ii = NILFS_I(inode);
+
+	if (test_bit(NILFS_I_GCINODE, &ii->i_state))
+		cno = ii->i_cno;
+	else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
+		cno = 0;
+	else
+		cno = sci->sc_cno;
+
 	finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
 						 sizeof(*finfo));
 	finfo->fi_ino = cpu_to_le64(inode->i_ino);
 	finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
 	finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
-	finfo->fi_cno = cpu_to_le64(ii->i_cno);
+	finfo->fi_cno = cpu_to_le64(cno);
 
 	segbuf = sci->sc_curseg;
 	segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
@@ -755,12 +765,12 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
 	}
 }
 
-static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi)
+static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
+				     struct nilfs_root *root)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int ret = 0;
 
-	if (nilfs_mdt_fetch_dirty(sbi->s_ifile))
+	if (nilfs_mdt_fetch_dirty(root->ifile))
 		ret++;
 	if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
 		ret++;
@@ -785,7 +795,7 @@ static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	int ret = 0;
 
-	if (nilfs_test_metadata_dirty(sbi))
+	if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root))
 		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
 
 	spin_lock(&sbi->s_inode_lock);
@@ -801,7 +811,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
-	nilfs_mdt_clear_dirty(sbi->s_ifile);
+	nilfs_mdt_clear_dirty(sci->sc_root->ifile);
 	nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
 	nilfs_mdt_clear_dirty(nilfs->ns_sufile);
 	nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
@@ -848,9 +858,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 	raw_cp->cp_snapshot_list.ssl_next = 0;
 	raw_cp->cp_snapshot_list.ssl_prev = 0;
 	raw_cp->cp_inodes_count =
-		cpu_to_le64(atomic_read(&sbi->s_inodes_count));
+		cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
 	raw_cp->cp_blocks_count =
-		cpu_to_le64(atomic_read(&sbi->s_blocks_count));
+		cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
 	raw_cp->cp_nblk_inc =
 		cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
 	raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
@@ -861,7 +871,8 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 	else
 		nilfs_checkpoint_set_minor(raw_cp);
 
-	nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1);
+	nilfs_write_inode_common(sci->sc_root->ifile,
+				 &raw_cp->cp_ifile_inode, 1);
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
 	return 0;
 
@@ -886,13 +897,12 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile,
 	}
 }
 
-static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
-					    struct inode *ifile)
+static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
 {
 	struct nilfs_inode_info *ii;
 
 	list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
-		nilfs_fill_in_file_bmap(ifile, ii);
+		nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii);
 		set_bit(NILFS_I_COLLECTED, &ii->i_state);
 	}
 }
@@ -1135,7 +1145,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 		sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
 		/* Fall through */
 	case NILFS_ST_IFILE:
-		err = nilfs_segctor_scan_file(sci, sbi->s_ifile,
+		err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile,
 					      &nilfs_sc_file_ops);
 		if (unlikely(err))
 			break;
@@ -1599,7 +1609,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
 	kunmap_atomic(kaddr, KM_USER0);
 
 	if (!TestSetPageWriteback(clone_page))
-		inc_zone_page_state(clone_page, NR_WRITEBACK);
+		account_page_writeback(clone_page);
 	unlock_page(clone_page);
 
 	return 0;
@@ -1900,6 +1910,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 		set_buffer_uptodate(bh);
 		clear_buffer_dirty(bh);
 		clear_buffer_nilfs_volatile(bh);
+		clear_buffer_nilfs_redirected(bh);
 		if (bh == segbuf->sb_super_root) {
 			if (bh->b_page != bd_page) {
 				end_page_writeback(bd_page);
@@ -1936,11 +1947,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 
 	nilfs_drop_collected_inodes(&sci->sc_dirty_files);
 
-	if (nilfs_doing_gc()) {
+	if (nilfs_doing_gc())
 		nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
-		if (update_sr)
-			nilfs_commit_gcdat_inode(nilfs);
-	} else
+	else
 		nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
 
 	sci->sc_nblk_inc += sci->sc_nblk_this_inc;
@@ -1976,7 +1985,7 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 				   struct nilfs_sb_info *sbi)
 {
 	struct nilfs_inode_info *ii, *n;
-	__u64 cno = sbi->s_nilfs->ns_cno;
+	struct inode *ifile = sci->sc_root->ifile;
 
 	spin_lock(&sbi->s_inode_lock);
  retry:
@@ -1987,14 +1996,14 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 
 			spin_unlock(&sbi->s_inode_lock);
 			err = nilfs_ifile_get_inode_block(
-				sbi->s_ifile, ii->vfs_inode.i_ino, &ibh);
+				ifile, ii->vfs_inode.i_ino, &ibh);
 			if (unlikely(err)) {
 				nilfs_warning(sbi->s_super, __func__,
 					      "failed to get inode block.\n");
 				return err;
 			}
 			nilfs_mdt_mark_buffer_dirty(ibh);
-			nilfs_mdt_mark_dirty(sbi->s_ifile);
+			nilfs_mdt_mark_dirty(ifile);
 			spin_lock(&sbi->s_inode_lock);
 			if (likely(!ii->i_bh))
 				ii->i_bh = ibh;
@@ -2002,7 +2011,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 				brelse(ibh);
 			goto retry;
 		}
-		ii->i_cno = cno;
 
 		clear_bit(NILFS_I_QUEUED, &ii->i_state);
 		set_bit(NILFS_I_BUSY, &ii->i_state);
@@ -2011,8 +2019,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 	}
 	spin_unlock(&sbi->s_inode_lock);
 
-	NILFS_I(sbi->s_ifile)->i_cno = cno;
-
 	return 0;
 }
 
@@ -2021,19 +2027,13 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
 {
 	struct nilfs_transaction_info *ti = current->journal_info;
 	struct nilfs_inode_info *ii, *n;
-	__u64 cno = sbi->s_nilfs->ns_cno;
 
 	spin_lock(&sbi->s_inode_lock);
 	list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
 		if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
-		    test_bit(NILFS_I_DIRTY, &ii->i_state)) {
-			/* The current checkpoint number (=nilfs->ns_cno) is
-			   changed between check-in and check-out only if the
-			   super root is written out.  So, we can update i_cno
-			   for the inodes that remain in the dirty list. */
-			ii->i_cno = cno;
+		    test_bit(NILFS_I_DIRTY, &ii->i_state))
 			continue;
-		}
+
 		clear_bit(NILFS_I_BUSY, &ii->i_state);
 		brelse(ii->i_bh);
 		ii->i_bh = NULL;
@@ -2054,12 +2054,13 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	int err;
 
 	sci->sc_stage.scnt = NILFS_ST_INIT;
+	sci->sc_cno = nilfs->ns_cno;
 
 	err = nilfs_segctor_check_in_files(sci, sbi);
 	if (unlikely(err))
 		goto out;
 
-	if (nilfs_test_metadata_dirty(sbi))
+	if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
 		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
 
 	if (nilfs_segctor_clean(sci))
@@ -2091,7 +2092,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		goto failed;
 
 	if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
-		nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
+		nilfs_segctor_fill_in_file_bmap(sci);
 
 	if (mode == SC_LSEG_SR &&
 	    sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
@@ -2452,9 +2453,8 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
 	list_for_each_entry_safe(ii, n, head, i_dirty) {
 		if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
 			continue;
-		hlist_del_init(&ii->vfs_inode.i_hash);
 		list_del_init(&ii->i_dirty);
-		nilfs_clear_gcinode(&ii->vfs_inode);
+		iput(&ii->vfs_inode);
 	}
 }
 
@@ -2472,13 +2472,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 
 	nilfs_transaction_lock(sbi, &ti, 1);
 
-	err = nilfs_init_gcdat_inode(nilfs);
+	err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
 	if (unlikely(err))
 		goto out_unlock;
 
 	err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
-	if (unlikely(err))
+	if (unlikely(err)) {
+		nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat);
 		goto out_unlock;
+	}
 
 	sci->sc_freesegs = kbufs[4];
 	sci->sc_nfreesegs = argv[4].v_nmembs;
@@ -2510,7 +2512,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
  out_unlock:
 	sci->sc_freesegs = NULL;
 	sci->sc_nfreesegs = 0;
-	nilfs_clear_gcdat_inode(nilfs);
+	nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
 	nilfs_transaction_unlock(sbi);
 	return err;
 }
@@ -2672,6 +2674,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
 }
 
 static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
+	__acquires(&sci->sc_state_lock)
+	__releases(&sci->sc_state_lock)
 {
 	sci->sc_state |= NILFS_SEGCTOR_QUIT;
 
@@ -2686,7 +2690,8 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
 /*
  * Setup & clean-up functions
  */
-static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
+static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
+					       struct nilfs_root *root)
 {
 	struct nilfs_sc_info *sci;
 
@@ -2697,6 +2702,9 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
 	sci->sc_sbi = sbi;
 	sci->sc_super = sbi->s_super;
 
+	nilfs_get_root(root);
+	sci->sc_root = root;
+
 	init_waitqueue_head(&sci->sc_wait_request);
 	init_waitqueue_head(&sci->sc_wait_daemon);
 	init_waitqueue_head(&sci->sc_wait_task);
@@ -2771,6 +2779,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 	WARN_ON(!list_empty(&sci->sc_segbufs));
 	WARN_ON(!list_empty(&sci->sc_write_logs));
 
+	nilfs_put_root(sci->sc_root);
+
 	down_write(&sbi->s_nilfs->ns_segctor_sem);
 
 	del_timer_sync(&sci->sc_timer);
@@ -2780,6 +2790,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 /**
  * nilfs_attach_segment_constructor - attach a segment constructor
  * @sbi: nilfs_sb_info
+ * @root: root object of the current filesystem tree
  *
  * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
 * initializes it, and starts the segment constructor.
@@ -2789,9 +2800,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
  *
 * %-ENOMEM - Insufficient memory available.
 */
-int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
+int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
+				     struct nilfs_root *root)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err;
 
 	if (NILFS_SC(sbi)) {
@@ -2803,14 +2814,12 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
 		nilfs_detach_segment_constructor(sbi);
 	}
 
-	sbi->s_sc_info = nilfs_segctor_new(sbi);
+	sbi->s_sc_info = nilfs_segctor_new(sbi, root);
 	if (!sbi->s_sc_info)
 		return -ENOMEM;
 
-	nilfs_attach_writer(nilfs, sbi);
 	err = nilfs_segctor_start_thread(NILFS_SC(sbi));
 	if (err) {
-		nilfs_detach_writer(nilfs, sbi);
 		kfree(sbi->s_sc_info);
 		sbi->s_sc_info = NULL;
 	}
@@ -2847,5 +2856,4 @@ void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
 	up_write(&nilfs->ns_segctor_sem);
 
 	nilfs_dispose_list(sbi, &garbage_list, 1);
-	nilfs_detach_writer(nilfs, sbi);
 }
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 17c487bd8152..cd8056e7cbed 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -29,6 +29,8 @@
 #include <linux/nilfs2_fs.h>
 #include "sb.h"
 
+struct nilfs_root;
+
 /**
  * struct nilfs_recovery_info - Recovery information
  * @ri_need_recovery: Recovery status
@@ -87,6 +89,7 @@ struct nilfs_segsum_pointer {
  * struct nilfs_sc_info - Segment constructor information
  * @sc_super: Back pointer to super_block struct
  * @sc_sbi: Back pointer to nilfs_sb_info struct
+ * @sc_root: root object of the current filesystem tree
  * @sc_nblk_inc: Block count of current generation
  * @sc_dirty_files: List of files to be written
  * @sc_gc_inodes: List of GC inodes having blocks to be written
@@ -107,6 +110,7 @@ struct nilfs_segsum_pointer {
  * @sc_datablk_cnt: Data block count of a file
  * @sc_nblk_this_inc: Number of blocks included in the current logical segment
  * @sc_seg_ctime: Creation time
+ * @sc_cno: checkpoint number of current log
  * @sc_flags: Internal flags
  * @sc_state_lock: spinlock for sc_state and so on
  * @sc_state: Segctord state flags
@@ -128,6 +132,7 @@ struct nilfs_segsum_pointer {
 struct nilfs_sc_info {
 	struct super_block *sc_super;
 	struct nilfs_sb_info *sc_sbi;
+	struct nilfs_root *sc_root;
 
 	unsigned long sc_nblk_inc;
 
@@ -156,7 +161,7 @@ struct nilfs_sc_info {
 	unsigned long sc_datablk_cnt;
 	unsigned long sc_nblk_this_inc;
 	time_t sc_seg_ctime;
-
+	__u64 sc_cno;
 	unsigned long sc_flags;
 
 	spinlock_t sc_state_lock;
@@ -230,7 +235,8 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
 extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
 				void **);
 
-extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
+int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
+				     struct nilfs_root *root);
 extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
 
 /* recovery.c */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3c6cc6005c2e..1d6f488ccae8 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -505,7 +505,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
 {
 	struct buffer_head *header_bh;
 	struct nilfs_sufile_header *header;
-	struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
+	struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
 	void *kaddr;
 	int ret;
 
@@ -583,7 +583,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 	struct nilfs_segment_usage *su;
 	struct nilfs_suinfo *si = buf;
 	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
-	struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
+	struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
 	void *kaddr;
 	unsigned long nsegs, segusages_per_block;
 	ssize_t n;
@@ -635,46 +635,55 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 }
 
 /**
- * nilfs_sufile_read - read sufile inode
- * @sufile: sufile inode
+ * nilfs_sufile_read - read or get sufile inode
+ * @sb: super block instance
+ * @susize: size of a segment usage entry
  * @raw_inode: on-disk sufile inode
+ * @inodep: buffer to store the inode
  */
-int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode)
+int nilfs_sufile_read(struct super_block *sb, size_t susize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep)
 {
-	struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+	struct inode *sufile;
+	struct nilfs_sufile_info *sui;
 	struct buffer_head *header_bh;
 	struct nilfs_sufile_header *header;
 	void *kaddr;
-	int ret;
+	int err;
 
-	ret = nilfs_read_inode_common(sufile, raw_inode);
-	if (ret < 0)
-		return ret;
+	sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
+	if (unlikely(!sufile))
+		return -ENOMEM;
+	if (!(sufile->i_state & I_NEW))
+		goto out;
 
-	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
-	if (!ret) {
-		kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-		header = kaddr + bh_offset(header_bh);
-		sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
-		kunmap_atomic(kaddr, KM_USER0);
-		brelse(header_bh);
-	}
-	return ret;
-}
+	err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui));
+	if (err)
+		goto failed;
 
-/**
- * nilfs_sufile_new - create sufile
- * @nilfs: nilfs object
- * @susize: size of a segment usage entry
- */
-struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize)
-{
-	struct inode *sufile;
+	nilfs_mdt_set_entry_size(sufile, susize,
+				 sizeof(struct nilfs_sufile_header));
+
+	err = nilfs_read_inode_common(sufile, raw_inode);
+	if (err)
+		goto failed;
+
+	err = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (err)
+		goto failed;
 
-	sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO,
-			       sizeof(struct nilfs_sufile_info));
-	if (sufile)
-		nilfs_mdt_set_entry_size(sufile, susize,
-					 sizeof(struct nilfs_sufile_header));
-	return sufile;
+	sui = NILFS_SUI(sufile);
+	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+	header = kaddr + bh_offset(header_bh);
+	sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
+	kunmap_atomic(kaddr, KM_USER0);
+	brelse(header_bh);
+
+	unlock_new_inode(sufile);
+ out:
+	*inodep = sufile;
+	return 0;
+ failed:
+	iget_failed(sufile);
+	return err;
 }
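
The rewritten nilfs_sufile_read() is the standard iget_locked() idiom: look the inode up in the cache, run the one-time initialization only when it comes back I_NEW, then publish it with unlock_new_inode() or back out with iget_failed(). A compact userspace rendering of that lookup-or-initialize shape (the cache and the new-flag are stand-ins):

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>

struct obj { bool is_new; int data; };

static struct obj *cache_get_locked(unsigned long key)
{
	(void)key;
	struct obj *o = calloc(1, sizeof(*o));	/* always "new" in this toy */
	if (o)
		o->is_new = true;
	return o;
}

static void unlock_new(struct obj *o) { o->is_new = false; }

static int read_or_get(unsigned long key, struct obj **out)
{
	struct obj *o = cache_get_locked(key);

	if (!o)
		return -ENOMEM;
	if (!o->is_new)
		goto out;	/* someone else already initialized it */

	o->data = 42;		/* one-time init; may fail in real code */
	unlock_new(o);		/* publish: wakes concurrent lookups */
 out:
	*out = o;
	return 0;
}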
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 15163b8aff7d..a943fbacb45b 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -31,7 +31,7 @@
 
 static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
 {
-	return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
+	return NILFS_I_NILFS(sufile)->ns_nsegments;
 }
 
 unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
@@ -61,8 +61,8 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
 void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
 			       struct buffer_head *);
 
-int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode);
-struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize);
+int nilfs_sufile_read(struct super_block *sb, size_t susize,
+		      struct nilfs_inode *raw_inode, struct inode **inodep);
 
 /**
  * nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 922263393c76..35ae03c0db86 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -45,14 +45,13 @@
 #include <linux/parser.h>
 #include <linux/random.h>
 #include <linux/crc32.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
 #include <linux/kobject.h>
-#include <linux/exportfs.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include "nilfs.h"
+#include "export.h"
 #include "mdt.h"
 #include "alloc.h"
 #include "btree.h"
@@ -69,11 +68,12 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
 		   "(NILFS)");
 MODULE_LICENSE("GPL");
 
-struct kmem_cache *nilfs_inode_cachep;
+static struct kmem_cache *nilfs_inode_cachep;
 struct kmem_cache *nilfs_transaction_cachep;
 struct kmem_cache *nilfs_segbuf_cachep;
 struct kmem_cache *nilfs_btree_path_cache;
 
+static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount);
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 
 static void nilfs_set_error(struct nilfs_sb_info *sbi)
@@ -147,7 +147,7 @@ void nilfs_warning(struct super_block *sb, const char *function,
 }
 
 
-struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
+struct inode *nilfs_alloc_inode(struct super_block *sb)
 {
 	struct nilfs_inode_info *ii;
 
@@ -156,18 +156,20 @@ struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
 		return NULL;
 	ii->i_bh = NULL;
 	ii->i_state = 0;
+	ii->i_cno = 0;
 	ii->vfs_inode.i_version = 1;
-	nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi);
+	nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi);
 	return &ii->vfs_inode;
 }
 
-struct inode *nilfs_alloc_inode(struct super_block *sb)
-{
-	return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs);
-}
-
 void nilfs_destroy_inode(struct inode *inode)
 {
+	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+
+	if (mdi) {
+		kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
+		kfree(mdi);
+	}
 	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
 
@@ -178,17 +180,9 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 
  retry:
 	set_buffer_dirty(nilfs->ns_sbh[0]);
-
 	if (nilfs_test_opt(sbi, BARRIER)) {
 		err = __sync_dirty_buffer(nilfs->ns_sbh[0],
-					  WRITE_SYNC | WRITE_BARRIER);
-		if (err == -EOPNOTSUPP) {
-			nilfs_warning(sbi->s_super, __func__,
-				      "barrier-based sync failed. "
-				      "disabling barriers\n");
-			nilfs_clear_opt(sbi, BARRIER);
-			goto retry;
-		}
+					  WRITE_SYNC | WRITE_FLUSH_FUA);
 	} else {
 		err = sync_dirty_buffer(nilfs->ns_sbh[0]);
 	}
@@ -342,8 +336,6 @@ static void nilfs_put_super(struct super_block *sb)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
-	lock_kernel();
-
 	nilfs_detach_segment_constructor(sbi);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -351,18 +343,15 @@ static void nilfs_put_super(struct super_block *sb)
 		nilfs_cleanup_super(sbi);
 		up_write(&nilfs->ns_sem);
 	}
-	down_write(&nilfs->ns_super_sem);
-	if (nilfs->ns_current == sbi)
-		nilfs->ns_current = NULL;
-	up_write(&nilfs->ns_super_sem);
 
-	nilfs_detach_checkpoint(sbi);
-	put_nilfs(sbi->s_nilfs);
+	iput(nilfs->ns_sufile);
+	iput(nilfs->ns_cpfile);
+	iput(nilfs->ns_dat);
+
+	destroy_nilfs(nilfs);
 	sbi->s_super = NULL;
 	sb->s_fs_info = NULL;
-	nilfs_put_sbinfo(sbi);
-
-	unlock_kernel();
+	kfree(sbi);
 }
 
 static int nilfs_sync_fs(struct super_block *sb, int wait)
@@ -389,21 +378,22 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 	return err;
 }
 
-int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
+int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
+			    struct nilfs_root **rootp)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_root *root;
 	struct nilfs_checkpoint *raw_cp;
 	struct buffer_head *bh_cp;
-	int err;
+	int err = -ENOMEM;
 
-	down_write(&nilfs->ns_super_sem);
-	list_add(&sbi->s_list, &nilfs->ns_supers);
-	up_write(&nilfs->ns_super_sem);
+	root = nilfs_find_or_create_root(
+		nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno);
+	if (!root)
+		return err;
 
-	err = -ENOMEM;
-	sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
-	if (!sbi->s_ifile)
-		goto delist;
+	if (root->ifile)
+		goto reuse; /* already attached checkpoint */
 
 	down_read(&nilfs->ns_segctor_sem);
 	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -419,45 +409,64 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
419 } 409 }
420 goto failed; 410 goto failed;
421 } 411 }
422 err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode); 412
423 if (unlikely(err)) 413 err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size,
414 &raw_cp->cp_ifile_inode, &root->ifile);
415 if (err)
424 goto failed_bh; 416 goto failed_bh;
425 atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); 417
426 atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); 418 atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
419 atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
427 420
428 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); 421 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
422
423 reuse:
424 *rootp = root;
429 return 0; 425 return 0;
430 426
431 failed_bh: 427 failed_bh:
432 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); 428 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
433 failed: 429 failed:
434 nilfs_mdt_destroy(sbi->s_ifile); 430 nilfs_put_root(root);
435 sbi->s_ifile = NULL;
436 431
437 delist: 432 return err;
438 down_write(&nilfs->ns_super_sem); 433}
439 list_del_init(&sbi->s_list);
440 up_write(&nilfs->ns_super_sem);
441 434
435static int nilfs_freeze(struct super_block *sb)
436{
437 struct nilfs_sb_info *sbi = NILFS_SB(sb);
438 struct the_nilfs *nilfs = sbi->s_nilfs;
439 int err;
440
441 if (sb->s_flags & MS_RDONLY)
442 return 0;
443
444 /* Mark super block clean */
445 down_write(&nilfs->ns_sem);
446 err = nilfs_cleanup_super(sbi);
447 up_write(&nilfs->ns_sem);
442 return err; 448 return err;
443} 449}
444 450
445void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) 451static int nilfs_unfreeze(struct super_block *sb)
446{ 452{
453 struct nilfs_sb_info *sbi = NILFS_SB(sb);
447 struct the_nilfs *nilfs = sbi->s_nilfs; 454 struct the_nilfs *nilfs = sbi->s_nilfs;
448 455
449 nilfs_mdt_destroy(sbi->s_ifile); 456 if (sb->s_flags & MS_RDONLY)
450 sbi->s_ifile = NULL; 457 return 0;
451 down_write(&nilfs->ns_super_sem); 458
452 list_del_init(&sbi->s_list); 459 down_write(&nilfs->ns_sem);
453 up_write(&nilfs->ns_super_sem); 460 nilfs_setup_super(sbi, false);
461 up_write(&nilfs->ns_sem);
462 return 0;
454} 463}
455 464
456static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) 465static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
457{ 466{
458 struct super_block *sb = dentry->d_sb; 467 struct super_block *sb = dentry->d_sb;
459 struct nilfs_sb_info *sbi = NILFS_SB(sb); 468 struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root;
460 struct the_nilfs *nilfs = sbi->s_nilfs; 469 struct the_nilfs *nilfs = root->nilfs;
461 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 470 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
462 unsigned long long blocks; 471 unsigned long long blocks;
463 unsigned long overhead; 472 unsigned long overhead;
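The nilfs_freeze()/nilfs_unfreeze() pair added above takes ns_sem for writing, marks the on-disk super block clean while the filesystem is frozen, and rearms the mount state on thaw. A hedged pthread sketch of the same toggle-state-under-a-writer-lock shape (user-space stand-ins, not the kernel locking API):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t ns_sem = PTHREAD_RWLOCK_INITIALIZER;
static int sb_clean;    /* analogue of the NILFS_VALID_FS state bit */

static int fs_freeze(void)
{
        pthread_rwlock_wrlock(&ns_sem);
        sb_clean = 1;           /* what nilfs_cleanup_super() records */
        pthread_rwlock_unlock(&ns_sem);
        return 0;
}

static int fs_unfreeze(void)
{
        pthread_rwlock_wrlock(&ns_sem);
        sb_clean = 0;           /* what nilfs_setup_super(sbi, false) undoes */
        pthread_rwlock_unlock(&ns_sem);
        return 0;
}

int main(void)
{
        fs_freeze();
        printf("frozen, clean=%d\n", sb_clean);
        fs_unfreeze();
        printf("thawed, clean=%d\n", sb_clean);
        return 0;
}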
@@ -493,7 +502,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
493 buf->f_bfree = nfreeblocks; 502 buf->f_bfree = nfreeblocks;
494 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? 503 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
495 (buf->f_bfree - nrsvblocks) : 0; 504 (buf->f_bfree - nrsvblocks) : 0;
496 buf->f_files = atomic_read(&sbi->s_inodes_count); 505 buf->f_files = atomic_read(&root->inodes_count);
497 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ 506 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
498 buf->f_namelen = NILFS_NAME_LEN; 507 buf->f_namelen = NILFS_NAME_LEN;
499 buf->f_fsid.val[0] = (u32)id; 508 buf->f_fsid.val[0] = (u32)id;
@@ -506,12 +515,12 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
506{ 515{
507 struct super_block *sb = vfs->mnt_sb; 516 struct super_block *sb = vfs->mnt_sb;
508 struct nilfs_sb_info *sbi = NILFS_SB(sb); 517 struct nilfs_sb_info *sbi = NILFS_SB(sb);
518 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
509 519
510 if (!nilfs_test_opt(sbi, BARRIER)) 520 if (!nilfs_test_opt(sbi, BARRIER))
511 seq_puts(seq, ",nobarrier"); 521 seq_puts(seq, ",nobarrier");
512 if (nilfs_test_opt(sbi, SNAPSHOT)) 522 if (root->cno != NILFS_CPTREE_CURRENT_CNO)
513 seq_printf(seq, ",cp=%llu", 523 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
514 (unsigned long long int)sbi->s_snapshot_cno);
515 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 524 if (nilfs_test_opt(sbi, ERRORS_PANIC))
516 seq_puts(seq, ",errors=panic"); 525 seq_puts(seq, ",errors=panic");
517 if (nilfs_test_opt(sbi, ERRORS_CONT)) 526 if (nilfs_test_opt(sbi, ERRORS_CONT))
@@ -537,6 +546,8 @@ static const struct super_operations nilfs_sops = {
537 .put_super = nilfs_put_super, 546 .put_super = nilfs_put_super,
538 /* .write_super = nilfs_write_super, */ 547 /* .write_super = nilfs_write_super, */
539 .sync_fs = nilfs_sync_fs, 548 .sync_fs = nilfs_sync_fs,
549 .freeze_fs = nilfs_freeze,
550 .unfreeze_fs = nilfs_unfreeze,
540 /* .write_super_lockfs */ 551 /* .write_super_lockfs */
541 /* .unlockfs */ 552 /* .unlockfs */
542 .statfs = nilfs_statfs, 553 .statfs = nilfs_statfs,
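The super_operations hunk above wires the new handlers into the method table; the VFS only calls such hooks when they are non-NULL. A self-contained sketch of an ops table with optional callbacks (names are illustrative):

#include <stdio.h>

struct fs_ops {
        int (*sync_fs)(int wait);
        int (*freeze_fs)(void);         /* optional, may be NULL */
        int (*unfreeze_fs)(void);       /* optional, may be NULL */
};

static int demo_sync(int wait) { printf("sync, wait=%d\n", wait); return 0; }
static int demo_freeze(void) { printf("freeze\n"); return 0; }
static int demo_unfreeze(void) { printf("unfreeze\n"); return 0; }

static const struct fs_ops demo_ops = {
        .sync_fs        = demo_sync,
        .freeze_fs      = demo_freeze,
        .unfreeze_fs    = demo_unfreeze,
};

/* Dispatch an optional method only if the filesystem provides it. */
static int do_freeze(const struct fs_ops *ops)
{
        return ops->freeze_fs ? ops->freeze_fs() : 0;
}

int main(void)
{
        demo_ops.sync_fs(1);
        return do_freeze(&demo_ops);
}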
@@ -545,48 +556,6 @@ static const struct super_operations nilfs_sops = {
545 .show_options = nilfs_show_options 556 .show_options = nilfs_show_options
546}; 557};
547 558
548static struct inode *
549nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
550{
551 struct inode *inode;
552
553 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
554 ino != NILFS_SKETCH_INO)
555 return ERR_PTR(-ESTALE);
556
557 inode = nilfs_iget(sb, ino);
558 if (IS_ERR(inode))
559 return ERR_CAST(inode);
560 if (generation && inode->i_generation != generation) {
561 iput(inode);
562 return ERR_PTR(-ESTALE);
563 }
564
565 return inode;
566}
567
568static struct dentry *
569nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
570 int fh_type)
571{
572 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
573 nilfs_nfs_get_inode);
574}
575
576static struct dentry *
577nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
578 int fh_type)
579{
580 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
581 nilfs_nfs_get_inode);
582}
583
584static const struct export_operations nilfs_export_ops = {
585 .fh_to_dentry = nilfs_fh_to_dentry,
586 .fh_to_parent = nilfs_fh_to_parent,
587 .get_parent = nilfs_get_parent,
588};
589
590enum { 559enum {
591 Opt_err_cont, Opt_err_panic, Opt_err_ro, 560 Opt_err_cont, Opt_err_panic, Opt_err_ro,
592 Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, 561 Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
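The export-operations block removed above validated NFS file handles by comparing the handle's generation with the inode's i_generation, so a handle that survives a delete-and-reuse of the inode number is reported stale. The check in isolation (table and error value are illustrative):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define ESTALE 116

struct itable_entry { uint64_t ino; uint32_t generation; };

static const struct itable_entry itable[] = {
        { 2, 1 },       /* root-like inode */
        { 12, 3 },      /* an inode number that has been reused */
};

/* 0 if the handle still names the same object, -ESTALE otherwise. */
static int check_handle(uint64_t ino, uint32_t generation)
{
        for (size_t i = 0; i < sizeof(itable) / sizeof(itable[0]); i++) {
                if (itable[i].ino != ino)
                        continue;
                if (generation && itable[i].generation != generation)
                        return -ESTALE;
                return 0;
        }
        return -ESTALE;
}

int main(void)
{
        printf("%d\n", check_handle(12, 3));    /* 0: valid */
        printf("%d\n", check_handle(12, 2));    /* -116: stale generation */
        return 0;
}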
@@ -612,7 +581,6 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
612 struct nilfs_sb_info *sbi = NILFS_SB(sb); 581 struct nilfs_sb_info *sbi = NILFS_SB(sb);
613 char *p; 582 char *p;
614 substring_t args[MAX_OPT_ARGS]; 583 substring_t args[MAX_OPT_ARGS];
615 int option;
616 584
617 if (!options) 585 if (!options)
618 return 1; 586 return 1;
@@ -650,30 +618,12 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
650 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); 618 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
651 break; 619 break;
652 case Opt_snapshot: 620 case Opt_snapshot:
653 if (match_int(&args[0], &option) || option <= 0)
654 return 0;
655 if (is_remount) { 621 if (is_remount) {
656 if (!nilfs_test_opt(sbi, SNAPSHOT)) { 622 printk(KERN_ERR
657 printk(KERN_ERR 623 "NILFS: \"%s\" option is invalid "
658 "NILFS: cannot change regular " 624 "for remount.\n", p);
659 "mount to snapshot.\n");
660 return 0;
661 } else if (option != sbi->s_snapshot_cno) {
662 printk(KERN_ERR
663 "NILFS: cannot remount to a "
664 "different snapshot.\n");
665 return 0;
666 }
667 break;
668 }
669 if (!(sb->s_flags & MS_RDONLY)) {
670 printk(KERN_ERR "NILFS: cannot mount snapshot "
671 "read/write. A read-only option is "
672 "required.\n");
673 return 0; 625 return 0;
674 } 626 }
675 sbi->s_snapshot_cno = option;
676 nilfs_set_opt(sbi, SNAPSHOT);
677 break; 627 break;
678 case Opt_norecovery: 628 case Opt_norecovery:
679 nilfs_set_opt(sbi, NORECOVERY); 629 nilfs_set_opt(sbi, NORECOVERY);
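After the hunk above, parse_options() flatly refuses the snapshot option on remount instead of cross-checking it against the mounted snapshot number. A user-space sketch of an option loop that rejects selected keys when is_remount is set (tokenizing details are illustrative):

#include <stdio.h>
#include <string.h>

/* Returns 1 on success, 0 on a rejected option, like parse_options(). */
static int parse_options(char *options, int is_remount)
{
        char *save = NULL;

        for (char *p = strtok_r(options, ",", &save); p;
             p = strtok_r(NULL, ",", &save)) {
                if (!strncmp(p, "cp=", 3) && is_remount) {
                        fprintf(stderr,
                                "\"%s\" option is invalid for remount\n", p);
                        return 0;
                }
                /* handling of the remaining options elided */
        }
        return 1;
}

int main(void)
{
        char mnt[] = "nobarrier,cp=7", rem[] = "nobarrier,cp=7";

        printf("mount: %d\n", parse_options(mnt, 0));   /* 1 */
        printf("remount: %d\n", parse_options(rem, 1)); /* 0 */
        return 0;
}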
@@ -701,7 +651,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
701 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; 651 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
702} 652}
703 653
704static int nilfs_setup_super(struct nilfs_sb_info *sbi) 654static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount)
705{ 655{
706 struct the_nilfs *nilfs = sbi->s_nilfs; 656 struct the_nilfs *nilfs = sbi->s_nilfs;
707 struct nilfs_super_block **sbp; 657 struct nilfs_super_block **sbp;
@@ -713,6 +663,9 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
713 if (!sbp) 663 if (!sbp)
714 return -EIO; 664 return -EIO;
715 665
666 if (!is_mount)
667 goto skip_mount_setup;
668
716 max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); 669 max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
717 mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); 670 mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
718 671
@@ -729,9 +682,11 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
729 sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); 682 sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
730 683
731 sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); 684 sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
685 sbp[0]->s_mtime = cpu_to_le64(get_seconds());
686
687skip_mount_setup:
732 sbp[0]->s_state = 688 sbp[0]->s_state =
733 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); 689 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
734 sbp[0]->s_mtime = cpu_to_le64(get_seconds());
735 /* synchronize sbp[1] with sbp[0] */ 690 /* synchronize sbp[1] with sbp[0] */
736 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 691 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
737 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 692 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
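nilfs_setup_super() now bumps the mount count and mtime only for a real mount, always clears the valid-FS bit, and mirrors the primary super block into the backup before committing both. A sketch of that primary/backup update (struct layout is illustrative; on-disk byte-order conversion is omitted):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define VALID_FS 0x0001

struct disk_super {
        uint16_t s_state;
        uint16_t s_mnt_count;
        uint64_t s_mtime;
};

static struct disk_super sbp[2];        /* primary and backup copies */

static void setup_super(int is_mount, uint64_t now)
{
        if (is_mount) {
                sbp[0].s_mnt_count++;
                sbp[0].s_mtime = now;
        }
        sbp[0].s_state &= ~VALID_FS;    /* dirty until a clean unmount */
        memcpy(&sbp[1], &sbp[0], sizeof(sbp[0]));       /* sync the backup */
}

int main(void)
{
        sbp[0].s_state = VALID_FS;
        setup_super(1, 1287000000);
        printf("mnt_count=%u state=%#x backup_in_sync=%d\n",
               (unsigned)sbp[0].s_mnt_count, (unsigned)sbp[0].s_state,
               !memcmp(&sbp[0], &sbp[1], sizeof(sbp[0])));
        return 0;
}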
@@ -798,22 +753,156 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
798 return 0; 753 return 0;
799} 754}
800 755
756static int nilfs_get_root_dentry(struct super_block *sb,
757 struct nilfs_root *root,
758 struct dentry **root_dentry)
759{
760 struct inode *inode;
761 struct dentry *dentry;
762 int ret = 0;
763
764 inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
765 if (IS_ERR(inode)) {
766 printk(KERN_ERR "NILFS: get root inode failed\n");
767 ret = PTR_ERR(inode);
768 goto out;
769 }
770 if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
771 iput(inode);
772 printk(KERN_ERR "NILFS: corrupt root inode.\n");
773 ret = -EINVAL;
774 goto out;
775 }
776
777 if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
778 dentry = d_find_alias(inode);
779 if (!dentry) {
780 dentry = d_alloc_root(inode);
781 if (!dentry) {
782 iput(inode);
783 ret = -ENOMEM;
784 goto failed_dentry;
785 }
786 } else {
787 iput(inode);
788 }
789 } else {
790 dentry = d_obtain_alias(inode);
791 if (IS_ERR(dentry)) {
792 ret = PTR_ERR(dentry);
793 goto failed_dentry;
794 }
795 }
796 *root_dentry = dentry;
797 out:
798 return ret;
799
800 failed_dentry:
801 printk(KERN_ERR "NILFS: get root dentry failed\n");
802 goto out;
803}
804
805static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
806 struct dentry **root_dentry)
807{
808 struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs;
809 struct nilfs_root *root;
810 int ret;
811
812 down_read(&nilfs->ns_segctor_sem);
813 ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno);
814 up_read(&nilfs->ns_segctor_sem);
815 if (ret < 0) {
816 ret = (ret == -ENOENT) ? -EINVAL : ret;
817 goto out;
818 } else if (!ret) {
819 printk(KERN_ERR "NILFS: The specified checkpoint is "
820 "not a snapshot (checkpoint number=%llu).\n",
821 (unsigned long long)cno);
822 ret = -EINVAL;
823 goto out;
824 }
825
826 ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root);
827 if (ret) {
828 printk(KERN_ERR "NILFS: error loading snapshot "
829 "(checkpoint number=%llu).\n",
830 (unsigned long long)cno);
831 goto out;
832 }
833 ret = nilfs_get_root_dentry(s, root, root_dentry);
834 nilfs_put_root(root);
835 out:
836 return ret;
837}
838
839static int nilfs_tree_was_touched(struct dentry *root_dentry)
840{
841 return atomic_read(&root_dentry->d_count) > 1;
842}
843
844/**
845 * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint
846 * @root_dentry: root dentry of the tree to be shrunk
847 *
848 * This function returns true if the tree was in-use.
849 */
850static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
851{
852 if (have_submounts(root_dentry))
853 return true;
854 shrink_dcache_parent(root_dentry);
855 return nilfs_tree_was_touched(root_dentry);
856}
857
858int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
859{
860 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
861 struct nilfs_root *root;
862 struct inode *inode;
863 struct dentry *dentry;
864 int ret;
865
866 if (cno < 0 || cno > nilfs->ns_cno)
867 return false;
868
869 if (cno >= nilfs_last_cno(nilfs))
870 return true; /* protect recent checkpoints */
871
872 ret = false;
873 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
874 if (root) {
875 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
876 if (inode) {
877 dentry = d_find_alias(inode);
878 if (dentry) {
879 if (nilfs_tree_was_touched(dentry))
880 ret = nilfs_try_to_shrink_tree(dentry);
881 dput(dentry);
882 }
883 iput(inode);
884 }
885 nilfs_put_root(root);
886 }
887 return ret;
888}
889
801/** 890/**
802 * nilfs_fill_super() - initialize a super block instance 891 * nilfs_fill_super() - initialize a super block instance
803 * @sb: super_block 892 * @sb: super_block
804 * @data: mount options 893 * @data: mount options
805 * @silent: silent mode flag 894 * @silent: silent mode flag
806 * @nilfs: the_nilfs struct
807 * 895 *
808 * This function is called exclusively by nilfs->ns_mount_mutex. 896 * This function is called exclusively by nilfs->ns_mount_mutex.
809 * So, the recovery process is protected from other simultaneous mounts. 897 * So, the recovery process is protected from other simultaneous mounts.
810 */ 898 */
811static int 899static int
812nilfs_fill_super(struct super_block *sb, void *data, int silent, 900nilfs_fill_super(struct super_block *sb, void *data, int silent)
813 struct the_nilfs *nilfs)
814{ 901{
902 struct the_nilfs *nilfs;
815 struct nilfs_sb_info *sbi; 903 struct nilfs_sb_info *sbi;
816 struct inode *root; 904 struct nilfs_root *fsroot;
905 struct backing_dev_info *bdi;
817 __u64 cno; 906 __u64 cno;
818 int err; 907 int err;
819 908
@@ -822,19 +911,21 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
822 return -ENOMEM; 911 return -ENOMEM;
823 912
824 sb->s_fs_info = sbi; 913 sb->s_fs_info = sbi;
914 sbi->s_super = sb;
825 915
826 get_nilfs(nilfs); 916 nilfs = alloc_nilfs(sb->s_bdev);
917 if (!nilfs) {
918 err = -ENOMEM;
919 goto failed_sbi;
920 }
827 sbi->s_nilfs = nilfs; 921 sbi->s_nilfs = nilfs;
828 sbi->s_super = sb;
829 atomic_set(&sbi->s_count, 1);
830 922
831 err = init_nilfs(nilfs, sbi, (char *)data); 923 err = init_nilfs(nilfs, sbi, (char *)data);
832 if (err) 924 if (err)
833 goto failed_sbi; 925 goto failed_nilfs;
834 926
835 spin_lock_init(&sbi->s_inode_lock); 927 spin_lock_init(&sbi->s_inode_lock);
836 INIT_LIST_HEAD(&sbi->s_dirty_files); 928 INIT_LIST_HEAD(&sbi->s_dirty_files);
837 INIT_LIST_HEAD(&sbi->s_list);
838 929
839 /* 930 /*
840 * Following initialization is overlapped because 931 * Following initialization is overlapped because
@@ -850,94 +941,59 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
850 sb->s_export_op = &nilfs_export_ops; 941 sb->s_export_op = &nilfs_export_ops;
851 sb->s_root = NULL; 942 sb->s_root = NULL;
852 sb->s_time_gran = 1; 943 sb->s_time_gran = 1;
853 sb->s_bdi = nilfs->ns_bdi; 944
945 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
946 sb->s_bdi = bdi ? : &default_backing_dev_info;
854 947
855 err = load_nilfs(nilfs, sbi); 948 err = load_nilfs(nilfs, sbi);
856 if (err) 949 if (err)
857 goto failed_sbi; 950 goto failed_nilfs;
858 951
859 cno = nilfs_last_cno(nilfs); 952 cno = nilfs_last_cno(nilfs);
860 953 err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot);
861 if (sb->s_flags & MS_RDONLY) {
862 if (nilfs_test_opt(sbi, SNAPSHOT)) {
863 down_read(&nilfs->ns_segctor_sem);
864 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
865 sbi->s_snapshot_cno);
866 up_read(&nilfs->ns_segctor_sem);
867 if (err < 0) {
868 if (err == -ENOENT)
869 err = -EINVAL;
870 goto failed_sbi;
871 }
872 if (!err) {
873 printk(KERN_ERR
874 "NILFS: The specified checkpoint is "
875 "not a snapshot "
876 "(checkpoint number=%llu).\n",
877 (unsigned long long)sbi->s_snapshot_cno);
878 err = -EINVAL;
879 goto failed_sbi;
880 }
881 cno = sbi->s_snapshot_cno;
882 }
883 }
884
885 err = nilfs_attach_checkpoint(sbi, cno);
886 if (err) { 954 if (err) {
887 printk(KERN_ERR "NILFS: error loading a checkpoint" 955 printk(KERN_ERR "NILFS: error loading last checkpoint "
888 " (checkpoint number=%llu).\n", (unsigned long long)cno); 956 "(checkpoint number=%llu).\n", (unsigned long long)cno);
889 goto failed_sbi; 957 goto failed_unload;
890 } 958 }
891 959
892 if (!(sb->s_flags & MS_RDONLY)) { 960 if (!(sb->s_flags & MS_RDONLY)) {
893 err = nilfs_attach_segment_constructor(sbi); 961 err = nilfs_attach_segment_constructor(sbi, fsroot);
894 if (err) 962 if (err)
895 goto failed_checkpoint; 963 goto failed_checkpoint;
896 } 964 }
897 965
898 root = nilfs_iget(sb, NILFS_ROOT_INO); 966 err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root);
899 if (IS_ERR(root)) { 967 if (err)
900 printk(KERN_ERR "NILFS: get root inode failed\n");
901 err = PTR_ERR(root);
902 goto failed_segctor;
903 }
904 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
905 iput(root);
906 printk(KERN_ERR "NILFS: corrupt root inode.\n");
907 err = -EINVAL;
908 goto failed_segctor;
909 }
910 sb->s_root = d_alloc_root(root);
911 if (!sb->s_root) {
912 iput(root);
913 printk(KERN_ERR "NILFS: get root dentry failed\n");
914 err = -ENOMEM;
915 goto failed_segctor; 968 goto failed_segctor;
916 } 969
970 nilfs_put_root(fsroot);
917 971
918 if (!(sb->s_flags & MS_RDONLY)) { 972 if (!(sb->s_flags & MS_RDONLY)) {
919 down_write(&nilfs->ns_sem); 973 down_write(&nilfs->ns_sem);
920 nilfs_setup_super(sbi); 974 nilfs_setup_super(sbi, true);
921 up_write(&nilfs->ns_sem); 975 up_write(&nilfs->ns_sem);
922 } 976 }
923 977
924 down_write(&nilfs->ns_super_sem);
925 if (!nilfs_test_opt(sbi, SNAPSHOT))
926 nilfs->ns_current = sbi;
927 up_write(&nilfs->ns_super_sem);
928
929 return 0; 978 return 0;
930 979
931 failed_segctor: 980 failed_segctor:
932 nilfs_detach_segment_constructor(sbi); 981 nilfs_detach_segment_constructor(sbi);
933 982
934 failed_checkpoint: 983 failed_checkpoint:
935 nilfs_detach_checkpoint(sbi); 984 nilfs_put_root(fsroot);
985
986 failed_unload:
987 iput(nilfs->ns_sufile);
988 iput(nilfs->ns_cpfile);
989 iput(nilfs->ns_dat);
990
991 failed_nilfs:
992 destroy_nilfs(nilfs);
936 993
937 failed_sbi: 994 failed_sbi:
938 put_nilfs(nilfs);
939 sb->s_fs_info = NULL; 995 sb->s_fs_info = NULL;
940 nilfs_put_sbinfo(sbi); 996 kfree(sbi);
941 return err; 997 return err;
942} 998}
943 999
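The reworked nilfs_fill_super() unwinds through a label ladder: each failure jumps to the label that undoes everything acquired so far, in reverse order (failed_segctor, failed_checkpoint, failed_unload, failed_nilfs, failed_sbi). The idiom in isolation, with malloc standing in for the acquired objects:

#include <stdlib.h>

static int fill(void)
{
        char *sbi, *nilfs, *root;
        int err = -1;

        sbi = malloc(32);
        if (!sbi)
                goto out;
        nilfs = malloc(32);
        if (!nilfs)
                goto failed_sbi;
        root = malloc(32);
        if (!root)
                goto failed_nilfs;

        err = 0;        /* success; a real fill_super keeps the objects */
        free(root);     /* demo only: fall through and unwind everything */
failed_nilfs:
        free(nilfs);
failed_sbi:
        free(sbi);
out:
        return err;
}

int main(void)
{
        return fill() ? 1 : 0;
}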
@@ -947,15 +1003,10 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
947 struct the_nilfs *nilfs = sbi->s_nilfs; 1003 struct the_nilfs *nilfs = sbi->s_nilfs;
948 unsigned long old_sb_flags; 1004 unsigned long old_sb_flags;
949 struct nilfs_mount_options old_opts; 1005 struct nilfs_mount_options old_opts;
950 int was_snapshot, err; 1006 int err;
951
952 lock_kernel();
953 1007
954 down_write(&nilfs->ns_super_sem);
955 old_sb_flags = sb->s_flags; 1008 old_sb_flags = sb->s_flags;
956 old_opts.mount_opt = sbi->s_mount_opt; 1009 old_opts.mount_opt = sbi->s_mount_opt;
957 old_opts.snapshot_cno = sbi->s_snapshot_cno;
958 was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
959 1010
960 if (!parse_options(data, sb, 1)) { 1011 if (!parse_options(data, sb, 1)) {
961 err = -EINVAL; 1012 err = -EINVAL;
@@ -964,11 +1015,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
964 sb->s_flags = (sb->s_flags & ~MS_POSIXACL); 1015 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
965 1016
966 err = -EINVAL; 1017 err = -EINVAL;
967 if (was_snapshot && !(*flags & MS_RDONLY)) {
968 printk(KERN_ERR "NILFS (device %s): cannot remount snapshot "
969 "read/write.\n", sb->s_id);
970 goto restore_opts;
971 }
972 1018
973 if (!nilfs_valid_fs(nilfs)) { 1019 if (!nilfs_valid_fs(nilfs)) {
974 printk(KERN_WARNING "NILFS (device %s): couldn't " 1020 printk(KERN_WARNING "NILFS (device %s): couldn't "
@@ -993,6 +1039,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
993 up_write(&nilfs->ns_sem); 1039 up_write(&nilfs->ns_sem);
994 } else { 1040 } else {
995 __u64 features; 1041 __u64 features;
1042 struct nilfs_root *root;
996 1043
997 /* 1044 /*
998 * Mounting a RDONLY partition read-write, so reread and 1045 * Mounting a RDONLY partition read-write, so reread and
@@ -1014,25 +1061,21 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1014 1061
1015 sb->s_flags &= ~MS_RDONLY; 1062 sb->s_flags &= ~MS_RDONLY;
1016 1063
1017 err = nilfs_attach_segment_constructor(sbi); 1064 root = NILFS_I(sb->s_root->d_inode)->i_root;
1065 err = nilfs_attach_segment_constructor(sbi, root);
1018 if (err) 1066 if (err)
1019 goto restore_opts; 1067 goto restore_opts;
1020 1068
1021 down_write(&nilfs->ns_sem); 1069 down_write(&nilfs->ns_sem);
1022 nilfs_setup_super(sbi); 1070 nilfs_setup_super(sbi, true);
1023 up_write(&nilfs->ns_sem); 1071 up_write(&nilfs->ns_sem);
1024 } 1072 }
1025 out: 1073 out:
1026 up_write(&nilfs->ns_super_sem);
1027 unlock_kernel();
1028 return 0; 1074 return 0;
1029 1075
1030 restore_opts: 1076 restore_opts:
1031 sb->s_flags = old_sb_flags; 1077 sb->s_flags = old_sb_flags;
1032 sbi->s_mount_opt = old_opts.mount_opt; 1078 sbi->s_mount_opt = old_opts.mount_opt;
1033 sbi->s_snapshot_cno = old_opts.snapshot_cno;
1034 up_write(&nilfs->ns_super_sem);
1035 unlock_kernel();
1036 return err; 1079 return err;
1037} 1080}
1038 1081
@@ -1052,7 +1095,7 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1052{ 1095{
1053 char *p, *options = data; 1096 char *p, *options = data;
1054 substring_t args[MAX_OPT_ARGS]; 1097 substring_t args[MAX_OPT_ARGS];
1055 int option, token; 1098 int token;
1056 int ret = 0; 1099 int ret = 0;
1057 1100
1058 do { 1101 do {
@@ -1060,16 +1103,18 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1060 if (p != NULL && *p) { 1103 if (p != NULL && *p) {
1061 token = match_token(p, tokens, args); 1104 token = match_token(p, tokens, args);
1062 if (token == Opt_snapshot) { 1105 if (token == Opt_snapshot) {
1063 if (!(sd->flags & MS_RDONLY)) 1106 if (!(sd->flags & MS_RDONLY)) {
1064 ret++; 1107 ret++;
1065 else { 1108 } else {
1066 ret = match_int(&args[0], &option); 1109 sd->cno = simple_strtoull(args[0].from,
1067 if (!ret) { 1110 NULL, 0);
1068 if (option > 0) 1111 /*
1069 sd->cno = option; 1112 * No need to see the end pointer;
1070 else 1113 * match_token() has done syntax
1071 ret++; 1114 * checking.
1072 } 1115 */
1116 if (sd->cno == 0)
1117 ret++;
1073 } 1118 }
1074 } 1119 }
1075 if (ret) 1120 if (ret)
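nilfs_identify() now pulls the checkpoint number out of "cp=" with simple_strtoull(), trusting match_token() for syntax and treating 0 as invalid. The user-space equivalent (option framing illustrative):

#include <stdio.h>
#include <stdlib.h>

/* Parse the numeric part of "cp=<n>"; the caller rejects 0. */
static unsigned long long parse_cno(const char *arg)
{
        /*
         * The end pointer is ignored, as in the hunk above, because the
         * surrounding tokenizer has already checked the syntax.
         */
        return strtoull(arg, NULL, 0);
}

int main(void)
{
        printf("%llu\n", parse_cno("7"));       /* 7 */
        printf("%llu\n", parse_cno("0"));       /* 0: invalid checkpoint */
        printf("%llu\n", parse_cno("0x10"));    /* base 0 accepts hex: 16 */
        return 0;
}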
@@ -1086,18 +1131,14 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1086 1131
1087static int nilfs_set_bdev_super(struct super_block *s, void *data) 1132static int nilfs_set_bdev_super(struct super_block *s, void *data)
1088{ 1133{
1089 struct nilfs_super_data *sd = data; 1134 s->s_bdev = data;
1090
1091 s->s_bdev = sd->bdev;
1092 s->s_dev = s->s_bdev->bd_dev; 1135 s->s_dev = s->s_bdev->bd_dev;
1093 return 0; 1136 return 0;
1094} 1137}
1095 1138
1096static int nilfs_test_bdev_super(struct super_block *s, void *data) 1139static int nilfs_test_bdev_super(struct super_block *s, void *data)
1097{ 1140{
1098 struct nilfs_super_data *sd = data; 1141 return (void *)s->s_bdev == data;
1099
1100 return sd->sbi && s->s_fs_info == (void *)sd->sbi;
1101} 1142}
1102 1143
1103static int 1144static int
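With the two callbacks above, sget() identifies a super block by its block device alone: the test callback compares s_bdev against the key and the set callback records it on a fresh instance. A toy, single-threaded rendition of the test/set find-or-create protocol (array-backed, illustrative names):

#include <stdio.h>
#include <stddef.h>

struct sb { void *s_bdev; int used; };

static struct sb table[4];

static int test_bdev_super(struct sb *s, void *data) { return s->s_bdev == data; }
static int set_bdev_super(struct sb *s, void *data) { s->s_bdev = data; return 0; }

static struct sb *sget_like(int (*test)(struct sb *, void *),
                            int (*set)(struct sb *, void *), void *data)
{
        for (size_t i = 0; i < 4; i++)
                if (table[i].used && test(&table[i], data))
                        return &table[i];       /* existing instance */
        for (size_t i = 0; i < 4; i++)
                if (!table[i].used) {
                        table[i].used = 1;
                        set(&table[i], data);   /* new instance */
                        return &table[i];
                }
        return NULL;
}

int main(void)
{
        int bdev;       /* stands in for a struct block_device */
        struct sb *a = sget_like(test_bdev_super, set_bdev_super, &bdev);
        struct sb *b = sget_like(test_bdev_super, set_bdev_super, &bdev);

        printf("same instance: %d\n", a == b);  /* 1 */
        return 0;
}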
@@ -1107,8 +1148,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1107 struct nilfs_super_data sd; 1148 struct nilfs_super_data sd;
1108 struct super_block *s; 1149 struct super_block *s;
1109 fmode_t mode = FMODE_READ; 1150 fmode_t mode = FMODE_READ;
1110 struct the_nilfs *nilfs; 1151 struct dentry *root_dentry;
1111 int err, need_to_close = 1; 1152 int err, s_new = false;
1112 1153
1113 if (!(flags & MS_RDONLY)) 1154 if (!(flags & MS_RDONLY))
1114 mode |= FMODE_WRITE; 1155 mode |= FMODE_WRITE;
@@ -1117,12 +1158,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1117 if (IS_ERR(sd.bdev)) 1158 if (IS_ERR(sd.bdev))
1118 return PTR_ERR(sd.bdev); 1159 return PTR_ERR(sd.bdev);
1119 1160
1120 /*
1121 * To get mount instance using sget() vfs-routine, NILFS needs
1122 * much more information than normal filesystems to identify mount
1123 * instance. For snapshot mounts, not only a mount type (ro-mount
1124 * or rw-mount) but also a checkpoint number is required.
1125 */
1126 sd.cno = 0; 1161 sd.cno = 0;
1127 sd.flags = flags; 1162 sd.flags = flags;
1128 if (nilfs_identify((char *)data, &sd)) { 1163 if (nilfs_identify((char *)data, &sd)) {
@@ -1130,94 +1165,86 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1130 goto failed; 1165 goto failed;
1131 } 1166 }
1132 1167
1133 nilfs = find_or_create_nilfs(sd.bdev);
1134 if (!nilfs) {
1135 err = -ENOMEM;
1136 goto failed;
1137 }
1138
1139 mutex_lock(&nilfs->ns_mount_mutex);
1140
1141 if (!sd.cno) {
1142 /*
1143 * Check if an exclusive mount exists or not.
1144 * Snapshot mounts coexist with a current mount
1145 * (i.e. rw-mount or ro-mount), whereas rw-mount and
1146 * ro-mount are mutually exclusive.
1147 */
1148 down_read(&nilfs->ns_super_sem);
1149 if (nilfs->ns_current &&
1150 ((nilfs->ns_current->s_super->s_flags ^ flags)
1151 & MS_RDONLY)) {
1152 up_read(&nilfs->ns_super_sem);
1153 err = -EBUSY;
1154 goto failed_unlock;
1155 }
1156 up_read(&nilfs->ns_super_sem);
1157 }
1158
1159 /*
1160 * Find existing nilfs_sb_info struct
1161 */
1162 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
1163
1164 /* 1168 /*
1165 * Get super block instance holding the nilfs_sb_info struct. 1169 * once the super is inserted into the list by sget, s_umount
1166 * A new instance is allocated if no existing mount is present or 1170 * will protect the lockfs code from trying to start a snapshot
1167 * existing instance has been unmounted. 1171 * while we are mounting
1168 */ 1172 */
1169 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); 1173 mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
1170 if (sd.sbi) 1174 if (sd.bdev->bd_fsfreeze_count > 0) {
1171 nilfs_put_sbinfo(sd.sbi); 1175 mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
1172 1176 err = -EBUSY;
1177 goto failed;
1178 }
1179 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev);
1180 mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
1173 if (IS_ERR(s)) { 1181 if (IS_ERR(s)) {
1174 err = PTR_ERR(s); 1182 err = PTR_ERR(s);
1175 goto failed_unlock; 1183 goto failed;
1176 } 1184 }
1177 1185
1178 if (!s->s_root) { 1186 if (!s->s_root) {
1179 char b[BDEVNAME_SIZE]; 1187 char b[BDEVNAME_SIZE];
1180 1188
1189 s_new = true;
1190
1181 /* New superblock instance created */ 1191 /* New superblock instance created */
1182 s->s_flags = flags; 1192 s->s_flags = flags;
1183 s->s_mode = mode; 1193 s->s_mode = mode;
1184 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1194 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1185 sb_set_blocksize(s, block_size(sd.bdev)); 1195 sb_set_blocksize(s, block_size(sd.bdev));
1186 1196
1187 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0, 1197 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1188 nilfs);
1189 if (err) 1198 if (err)
1190 goto cancel_new; 1199 goto failed_super;
1191 1200
1192 s->s_flags |= MS_ACTIVE; 1201 s->s_flags |= MS_ACTIVE;
1193 need_to_close = 0; 1202 } else if (!sd.cno) {
1203 int busy = false;
1204
1205 if (nilfs_tree_was_touched(s->s_root)) {
1206 busy = nilfs_try_to_shrink_tree(s->s_root);
1207 if (busy && (flags ^ s->s_flags) & MS_RDONLY) {
1208 printk(KERN_ERR "NILFS: the device already "
1209 "has a %s mount.\n",
1210 (s->s_flags & MS_RDONLY) ?
1211 "read-only" : "read/write");
1212 err = -EBUSY;
1213 goto failed_super;
1214 }
1215 }
1216 if (!busy) {
1217 /*
1218 * Try remount to setup mount states if the current
1219 * tree is not mounted and only snapshots use this sb.
1220 */
1221 err = nilfs_remount(s, &flags, data);
1222 if (err)
1223 goto failed_super;
1224 }
1194 } 1225 }
1195 1226
1196 mutex_unlock(&nilfs->ns_mount_mutex); 1227 if (sd.cno) {
1197 put_nilfs(nilfs); 1228 err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
1198 if (need_to_close) 1229 if (err)
1199 close_bdev_exclusive(sd.bdev, mode); 1230 goto failed_super;
1200 simple_set_mnt(mnt, s); 1231 } else {
1201 return 0; 1232 root_dentry = dget(s->s_root);
1233 }
1202 1234
1203 failed_unlock: 1235 if (!s_new)
1204 mutex_unlock(&nilfs->ns_mount_mutex); 1236 close_bdev_exclusive(sd.bdev, mode);
1205 put_nilfs(nilfs);
1206 failed:
1207 close_bdev_exclusive(sd.bdev, mode);
1208 1237
1209 return err; 1238 mnt->mnt_sb = s;
1239 mnt->mnt_root = root_dentry;
1240 return 0;
1210 1241
1211 cancel_new: 1242 failed_super:
1212 /* Abandoning the newly allocated superblock */
1213 mutex_unlock(&nilfs->ns_mount_mutex);
1214 put_nilfs(nilfs);
1215 deactivate_locked_super(s); 1243 deactivate_locked_super(s);
1216 /* 1244
1217 * deactivate_locked_super() invokes close_bdev_exclusive(). 1245 failed:
1218 * We must finish all post-cleaning before this call; 1246 if (!s_new)
1219 * put_nilfs() needs the block device. 1247 close_bdev_exclusive(sd.bdev, mode);
1220 */
1221 return err; 1248 return err;
1222} 1249}
1223 1250
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ba7c10c917fc..0254be2d73c6 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -35,9 +35,6 @@
35#include "segbuf.h" 35#include "segbuf.h"
36 36
37 37
38static LIST_HEAD(nilfs_objects);
39static DEFINE_SPINLOCK(nilfs_lock);
40
41static int nilfs_valid_sb(struct nilfs_super_block *sbp); 38static int nilfs_valid_sb(struct nilfs_super_block *sbp);
42 39
43void nilfs_set_last_segment(struct the_nilfs *nilfs, 40void nilfs_set_last_segment(struct the_nilfs *nilfs,
@@ -61,16 +58,13 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
61} 58}
62 59
63/** 60/**
64 * alloc_nilfs - allocate the_nilfs structure 61 * alloc_nilfs - allocate a nilfs object
65 * @bdev: block device to which the_nilfs is related 62 * @bdev: block device to which the_nilfs is related
66 * 63 *
67 * alloc_nilfs() allocates memory for the_nilfs and
68 * initializes its reference count and locks.
69 *
70 * Return Value: On success, pointer to the_nilfs is returned. 64 * Return Value: On success, pointer to the_nilfs is returned.
71 * On error, NULL is returned. 65 * On error, NULL is returned.
72 */ 66 */
73static struct the_nilfs *alloc_nilfs(struct block_device *bdev) 67struct the_nilfs *alloc_nilfs(struct block_device *bdev)
74{ 68{
75 struct the_nilfs *nilfs; 69 struct the_nilfs *nilfs;
76 70
@@ -79,103 +73,38 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
79 return NULL; 73 return NULL;
80 74
81 nilfs->ns_bdev = bdev; 75 nilfs->ns_bdev = bdev;
82 atomic_set(&nilfs->ns_count, 1);
83 atomic_set(&nilfs->ns_ndirtyblks, 0); 76 atomic_set(&nilfs->ns_ndirtyblks, 0);
84 init_rwsem(&nilfs->ns_sem); 77 init_rwsem(&nilfs->ns_sem);
85 init_rwsem(&nilfs->ns_super_sem); 78 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
86 mutex_init(&nilfs->ns_mount_mutex);
87 init_rwsem(&nilfs->ns_writer_sem);
88 INIT_LIST_HEAD(&nilfs->ns_list);
89 INIT_LIST_HEAD(&nilfs->ns_supers);
90 spin_lock_init(&nilfs->ns_last_segment_lock); 79 spin_lock_init(&nilfs->ns_last_segment_lock);
91 nilfs->ns_gc_inodes_h = NULL; 80 nilfs->ns_cptree = RB_ROOT;
81 spin_lock_init(&nilfs->ns_cptree_lock);
92 init_rwsem(&nilfs->ns_segctor_sem); 82 init_rwsem(&nilfs->ns_segctor_sem);
93 83
94 return nilfs; 84 return nilfs;
95} 85}
96 86
97/** 87/**
98 * find_or_create_nilfs - find or create nilfs object 88 * destroy_nilfs - destroy nilfs object
99 * @bdev: block device to which the_nilfs is related 89 * @nilfs: nilfs object to be released
100 *
101 * find_nilfs() looks up an existent nilfs object created on the
102 * device and gets the reference count of the object. If no nilfs object
103 * is found on the device, a new nilfs object is allocated.
104 *
105 * Return Value: On success, pointer to the nilfs object is returned.
106 * On error, NULL is returned.
107 */
108struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
109{
110 struct the_nilfs *nilfs, *new = NULL;
111
112 retry:
113 spin_lock(&nilfs_lock);
114 list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
115 if (nilfs->ns_bdev == bdev) {
116 get_nilfs(nilfs);
117 spin_unlock(&nilfs_lock);
118 if (new)
119 put_nilfs(new);
120 return nilfs; /* existing object */
121 }
122 }
123 if (new) {
124 list_add_tail(&new->ns_list, &nilfs_objects);
125 spin_unlock(&nilfs_lock);
126 return new; /* new object */
127 }
128 spin_unlock(&nilfs_lock);
129
130 new = alloc_nilfs(bdev);
131 if (new)
132 goto retry;
133 return NULL; /* insufficient memory */
134}
135
136/**
137 * put_nilfs - release a reference to the_nilfs
138 * @nilfs: the_nilfs structure to be released
139 *
140 * put_nilfs() decrements a reference counter of the_nilfs.
141 * If the reference count reaches zero, the_nilfs is freed.
142 */ 90 */
143void put_nilfs(struct the_nilfs *nilfs) 91void destroy_nilfs(struct the_nilfs *nilfs)
144{ 92{
145 spin_lock(&nilfs_lock);
146 if (!atomic_dec_and_test(&nilfs->ns_count)) {
147 spin_unlock(&nilfs_lock);
148 return;
149 }
150 list_del_init(&nilfs->ns_list);
151 spin_unlock(&nilfs_lock);
152
153 /*
154 * Increment of ns_count never occurs below because the caller
155 * of get_nilfs() holds at least one reference to the_nilfs.
156 * Thus its exclusion control is not required here.
157 */
158
159 might_sleep(); 93 might_sleep();
160 if (nilfs_loaded(nilfs)) {
161 nilfs_mdt_destroy(nilfs->ns_sufile);
162 nilfs_mdt_destroy(nilfs->ns_cpfile);
163 nilfs_mdt_destroy(nilfs->ns_dat);
164 nilfs_mdt_destroy(nilfs->ns_gc_dat);
165 }
166 if (nilfs_init(nilfs)) { 94 if (nilfs_init(nilfs)) {
167 nilfs_destroy_gccache(nilfs);
168 brelse(nilfs->ns_sbh[0]); 95 brelse(nilfs->ns_sbh[0]);
169 brelse(nilfs->ns_sbh[1]); 96 brelse(nilfs->ns_sbh[1]);
170 } 97 }
171 kfree(nilfs); 98 kfree(nilfs);
172} 99}
173 100
174static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block) 101static int nilfs_load_super_root(struct the_nilfs *nilfs,
102 struct super_block *sb, sector_t sr_block)
175{ 103{
176 struct buffer_head *bh_sr; 104 struct buffer_head *bh_sr;
177 struct nilfs_super_root *raw_sr; 105 struct nilfs_super_root *raw_sr;
178 struct nilfs_super_block **sbp = nilfs->ns_sbp; 106 struct nilfs_super_block **sbp = nilfs->ns_sbp;
107 struct nilfs_inode *rawi;
179 unsigned dat_entry_size, segment_usage_size, checkpoint_size; 108 unsigned dat_entry_size, segment_usage_size, checkpoint_size;
180 unsigned inode_size; 109 unsigned inode_size;
181 int err; 110 int err;
@@ -192,40 +121,22 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
192 121
193 inode_size = nilfs->ns_inode_size; 122 inode_size = nilfs->ns_inode_size;
194 123
195 err = -ENOMEM; 124 rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size);
196 nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size); 125 err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat);
197 if (unlikely(!nilfs->ns_dat)) 126 if (err)
198 goto failed; 127 goto failed;
199 128
200 nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size); 129 rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size);
201 if (unlikely(!nilfs->ns_gc_dat)) 130 err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile);
131 if (err)
202 goto failed_dat; 132 goto failed_dat;
203 133
204 nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size); 134 rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size);
205 if (unlikely(!nilfs->ns_cpfile)) 135 err = nilfs_sufile_read(sb, segment_usage_size, rawi,
206 goto failed_gc_dat; 136 &nilfs->ns_sufile);
207 137 if (err)
208 nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
209 if (unlikely(!nilfs->ns_sufile))
210 goto failed_cpfile; 138 goto failed_cpfile;
211 139
212 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
213
214 err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
215 NILFS_SR_DAT_OFFSET(inode_size));
216 if (unlikely(err))
217 goto failed_sufile;
218
219 err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
220 NILFS_SR_CPFILE_OFFSET(inode_size));
221 if (unlikely(err))
222 goto failed_sufile;
223
224 err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
225 NILFS_SR_SUFILE_OFFSET(inode_size));
226 if (unlikely(err))
227 goto failed_sufile;
228
229 raw_sr = (struct nilfs_super_root *)bh_sr->b_data; 140 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
230 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); 141 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
231 142
@@ -233,17 +144,11 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
233 brelse(bh_sr); 144 brelse(bh_sr);
234 return err; 145 return err;
235 146
236 failed_sufile:
237 nilfs_mdt_destroy(nilfs->ns_sufile);
238
239 failed_cpfile: 147 failed_cpfile:
240 nilfs_mdt_destroy(nilfs->ns_cpfile); 148 iput(nilfs->ns_cpfile);
241
242 failed_gc_dat:
243 nilfs_mdt_destroy(nilfs->ns_gc_dat);
244 149
245 failed_dat: 150 failed_dat:
246 nilfs_mdt_destroy(nilfs->ns_dat); 151 iput(nilfs->ns_dat);
247 goto failed; 152 goto failed;
248} 153}
249 154
@@ -306,15 +211,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
306 int valid_fs = nilfs_valid_fs(nilfs); 211 int valid_fs = nilfs_valid_fs(nilfs);
307 int err; 212 int err;
308 213
309 if (nilfs_loaded(nilfs)) {
310 if (valid_fs ||
311 ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
312 return 0;
313 printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
314 "recovery state.\n");
315 return -EINVAL;
316 }
317
318 if (!valid_fs) { 214 if (!valid_fs) {
319 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); 215 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
320 if (s_flags & MS_RDONLY) { 216 if (s_flags & MS_RDONLY) {
@@ -375,7 +271,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
375 goto scan_error; 271 goto scan_error;
376 } 272 }
377 273
378 err = nilfs_load_super_root(nilfs, ri.ri_super_root); 274 err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root);
379 if (unlikely(err)) { 275 if (unlikely(err)) {
380 printk(KERN_ERR "NILFS: error loading super root.\n"); 276 printk(KERN_ERR "NILFS: error loading super root.\n");
381 goto failed; 277 goto failed;
@@ -443,10 +339,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
443 goto failed; 339 goto failed;
444 340
445 failed_unload: 341 failed_unload:
446 nilfs_mdt_destroy(nilfs->ns_cpfile); 342 iput(nilfs->ns_cpfile);
447 nilfs_mdt_destroy(nilfs->ns_sufile); 343 iput(nilfs->ns_sufile);
448 nilfs_mdt_destroy(nilfs->ns_dat); 344 iput(nilfs->ns_dat);
449 nilfs_mdt_destroy(nilfs->ns_gc_dat);
450 345
451 failed: 346 failed:
452 nilfs_clear_recovery_info(&ri); 347 nilfs_clear_recovery_info(&ri);
@@ -468,8 +363,8 @@ static unsigned long long nilfs_max_size(unsigned int blkbits)
468static int nilfs_store_disk_layout(struct the_nilfs *nilfs, 363static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
469 struct nilfs_super_block *sbp) 364 struct nilfs_super_block *sbp)
470{ 365{
471 if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) { 366 if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
472 printk(KERN_ERR "NILFS: revision mismatch " 367 printk(KERN_ERR "NILFS: unsupported revision "
473 "(superblock rev.=%d.%d, current rev.=%d.%d). " 368 "(superblock rev.=%d.%d, current rev.=%d.%d). "
474 "Please check the version of mkfs.nilfs.\n", 369 "Please check the version of mkfs.nilfs.\n",
475 le32_to_cpu(sbp->s_rev_level), 370 le32_to_cpu(sbp->s_rev_level),
@@ -631,12 +526,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
631 * 526 *
632 * init_nilfs() performs common initialization per block device (e.g. 527 * init_nilfs() performs common initialization per block device (e.g.
633 * reading the super block, getting disk layout information, initializing 528 * reading the super block, getting disk layout information, initializing
634 * shared fields in the_nilfs). It takes on some portion of the jobs 529 * shared fields in the_nilfs).
635 * typically done by a fill_super() routine. This division arises from
636 * the nature that multiple NILFS instances may be simultaneously
637 * mounted on a device.
638 * For multiple mounts on the same device, only the first mount
639 * invokes these tasks.
640 * 530 *
641 * Return Value: On success, 0 is returned. On error, a negative error 531 * Return Value: On success, 0 is returned. On error, a negative error
642 * code is returned. 532 * code is returned.
@@ -645,32 +535,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
645{ 535{
646 struct super_block *sb = sbi->s_super; 536 struct super_block *sb = sbi->s_super;
647 struct nilfs_super_block *sbp; 537 struct nilfs_super_block *sbp;
648 struct backing_dev_info *bdi;
649 int blocksize; 538 int blocksize;
650 int err; 539 int err;
651 540
652 down_write(&nilfs->ns_sem); 541 down_write(&nilfs->ns_sem);
653 if (nilfs_init(nilfs)) {
654 /* Load values from existing the_nilfs */
655 sbp = nilfs->ns_sbp[0];
656 err = nilfs_store_magic_and_option(sb, sbp, data);
657 if (err)
658 goto out;
659
660 err = nilfs_check_feature_compatibility(sb, sbp);
661 if (err)
662 goto out;
663
664 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
665 if (sb->s_blocksize != blocksize &&
666 !sb_set_blocksize(sb, blocksize)) {
667 printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
668 blocksize);
669 err = -EINVAL;
670 }
671 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
672 goto out;
673 }
674 542
675 blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); 543 blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
676 if (!blocksize) { 544 if (!blocksize) {
@@ -729,18 +597,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
729 597
730 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); 598 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
731 599
732 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
733 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
734
735 err = nilfs_store_log_cursor(nilfs, sbp); 600 err = nilfs_store_log_cursor(nilfs, sbp);
736 if (err) 601 if (err)
737 goto failed_sbh; 602 goto failed_sbh;
738 603
739 /* Initialize gcinode cache */
740 err = nilfs_init_gccache(nilfs);
741 if (err)
742 goto failed_sbh;
743
744 set_nilfs_init(nilfs); 604 set_nilfs_init(nilfs);
745 err = 0; 605 err = 0;
746 out: 606 out:
@@ -775,9 +635,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
775 ret = blkdev_issue_discard(nilfs->ns_bdev, 635 ret = blkdev_issue_discard(nilfs->ns_bdev,
776 start * sects_per_block, 636 start * sects_per_block,
777 nblocks * sects_per_block, 637 nblocks * sects_per_block,
778 GFP_NOFS, 638 GFP_NOFS, 0);
779 BLKDEV_IFL_WAIT |
780 BLKDEV_IFL_BARRIER);
781 if (ret < 0) 639 if (ret < 0)
782 return ret; 640 return ret;
783 nblocks = 0; 641 nblocks = 0;
@@ -787,8 +645,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
787 ret = blkdev_issue_discard(nilfs->ns_bdev, 645 ret = blkdev_issue_discard(nilfs->ns_bdev,
788 start * sects_per_block, 646 start * sects_per_block,
789 nblocks * sects_per_block, 647 nblocks * sects_per_block,
790 GFP_NOFS, 648 GFP_NOFS, 0);
791 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
792 return ret; 649 return ret;
793} 650}
794 651
@@ -815,79 +672,92 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
815 return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs; 672 return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
816} 673}
817 674
818/** 675struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno)
819 * nilfs_find_sbinfo - find existing nilfs_sb_info structure
820 * @nilfs: nilfs object
821 * @rw_mount: mount type (non-zero value for read/write mount)
822 * @cno: checkpoint number (zero for read-only mount)
823 *
824 * nilfs_find_sbinfo() returns the nilfs_sb_info structure which
825 * @rw_mount and @cno (in case of snapshots) matched. If no instance
826 * was found, NULL is returned. Although the super block instance can
827 * be unmounted after this function returns, the nilfs_sb_info struct
828 * is kept on memory until nilfs_put_sbinfo() is called.
829 */
830struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
831 int rw_mount, __u64 cno)
832{ 676{
833 struct nilfs_sb_info *sbi; 677 struct rb_node *n;
834 678 struct nilfs_root *root;
835 down_read(&nilfs->ns_super_sem); 679
836 /* 680 spin_lock(&nilfs->ns_cptree_lock);
837 * The SNAPSHOT flag and sb->s_flags are supposed to be 681 n = nilfs->ns_cptree.rb_node;
838 * protected with nilfs->ns_super_sem. 682 while (n) {
839 */ 683 root = rb_entry(n, struct nilfs_root, rb_node);
840 sbi = nilfs->ns_current; 684
841 if (rw_mount) { 685 if (cno < root->cno) {
842 if (sbi && !(sbi->s_super->s_flags & MS_RDONLY)) 686 n = n->rb_left;
843 goto found; /* read/write mount */ 687 } else if (cno > root->cno) {
844 else 688 n = n->rb_right;
845 goto out; 689 } else {
846 } else if (cno == 0) { 690 atomic_inc(&root->count);
847 if (sbi && (sbi->s_super->s_flags & MS_RDONLY)) 691 spin_unlock(&nilfs->ns_cptree_lock);
848 goto found; /* read-only mount */ 692 return root;
849 else 693 }
850 goto out;
851 } 694 }
695 spin_unlock(&nilfs->ns_cptree_lock);
852 696
853 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
854 if (nilfs_test_opt(sbi, SNAPSHOT) &&
855 sbi->s_snapshot_cno == cno)
856 goto found; /* snapshot mount */
857 }
858 out:
859 up_read(&nilfs->ns_super_sem);
860 return NULL; 697 return NULL;
861
862 found:
863 atomic_inc(&sbi->s_count);
864 up_read(&nilfs->ns_super_sem);
865 return sbi;
866} 698}
867 699
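nilfs_lookup_root() above is a plain binary-search descent over the checkpoint rb-tree, taking a reference before dropping the lock. The descent itself, over an ordinary binary search tree (the red-black balancing and locking are elided):

#include <stdio.h>
#include <stdint.h>

struct node { uint64_t cno; struct node *left, *right; };

static struct node *lookup(struct node *n, uint64_t cno)
{
        while (n) {
                if (cno < n->cno)
                        n = n->left;
                else if (cno > n->cno)
                        n = n->right;
                else
                        return n;       /* found: caller takes a reference */
        }
        return NULL;
}

int main(void)
{
        struct node l = { 3, NULL, NULL }, r = { 9, NULL, NULL };
        struct node root = { 7, &l, &r };

        printf("%d %d\n", lookup(&root, 9) != NULL, lookup(&root, 4) != NULL);
        return 0;
}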
868int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, 700struct nilfs_root *
869 int snapshot_mount) 701nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
870{ 702{
871 struct nilfs_sb_info *sbi; 703 struct rb_node **p, *parent;
872 int ret = 0; 704 struct nilfs_root *root, *new;
873 705
874 down_read(&nilfs->ns_super_sem); 706 root = nilfs_lookup_root(nilfs, cno);
875 if (cno == 0 || cno > nilfs->ns_cno) 707 if (root)
876 goto out_unlock; 708 return root;
877 709
878 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { 710 new = kmalloc(sizeof(*root), GFP_KERNEL);
879 if (sbi->s_snapshot_cno == cno && 711 if (!new)
880 (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) { 712 return NULL;
881 /* exclude read-only mounts */ 713
882 ret++; 714 spin_lock(&nilfs->ns_cptree_lock);
883 break; 715
716 p = &nilfs->ns_cptree.rb_node;
717 parent = NULL;
718
719 while (*p) {
720 parent = *p;
721 root = rb_entry(parent, struct nilfs_root, rb_node);
722
723 if (cno < root->cno) {
724 p = &(*p)->rb_left;
725 } else if (cno > root->cno) {
726 p = &(*p)->rb_right;
727 } else {
728 atomic_inc(&root->count);
729 spin_unlock(&nilfs->ns_cptree_lock);
730 kfree(new);
731 return root;
884 } 732 }
885 } 733 }
886 /* for protecting recent checkpoints */
887 if (cno >= nilfs_last_cno(nilfs))
888 ret++;
889 734
890 out_unlock: 735 new->cno = cno;
891 up_read(&nilfs->ns_super_sem); 736 new->ifile = NULL;
892 return ret; 737 new->nilfs = nilfs;
738 atomic_set(&new->count, 1);
739 atomic_set(&new->inodes_count, 0);
740 atomic_set(&new->blocks_count, 0);
741
742 rb_link_node(&new->rb_node, parent, p);
743 rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
744
745 spin_unlock(&nilfs->ns_cptree_lock);
746
747 return new;
748}
749
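nilfs_find_or_create_root() allocates its candidate node before taking ns_cptree_lock, re-walks the tree under the lock, and frees the spare if another thread inserted the same cno first, so no allocation happens while the spinlock is held. The same pattern over a linked list with pthreads (stand-ins for the rb-tree and spinlock; compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct root { uint64_t cno; struct root *next; };

static struct root *cptree;
static pthread_mutex_t cptree_lock = PTHREAD_MUTEX_INITIALIZER;

static struct root *find_or_create_root(uint64_t cno)
{
        /* Allocate optimistically, outside the lock. */
        struct root *new = malloc(sizeof(*new));

        if (!new)
                return NULL;

        pthread_mutex_lock(&cptree_lock);
        for (struct root *r = cptree; r; r = r->next) {
                if (r->cno == cno) {    /* raced: someone beat us to it */
                        pthread_mutex_unlock(&cptree_lock);
                        free(new);      /* discard the spare node */
                        return r;
                }
        }
        new->cno = cno;
        new->next = cptree;     /* unsorted list for brevity */
        cptree = new;
        pthread_mutex_unlock(&cptree_lock);
        return new;
}

int main(void)
{
        struct root *a = find_or_create_root(7);
        struct root *b = find_or_create_root(7);

        printf("deduplicated: %d\n", a && a == b);      /* 1 */
        return 0;
}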
750void nilfs_put_root(struct nilfs_root *root)
751{
752 if (atomic_dec_and_test(&root->count)) {
753 struct the_nilfs *nilfs = root->nilfs;
754
755 spin_lock(&nilfs->ns_cptree_lock);
756 rb_erase(&root->rb_node, &nilfs->ns_cptree);
757 spin_unlock(&nilfs->ns_cptree_lock);
758 if (root->ifile)
759 iput(root->ifile);
760
761 kfree(root);
762 }
893} 763}
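nilfs_put_root() is the matching release: atomic_dec_and_test() spots the final reference, and only then is the node unlinked under the tree lock and freed. With C11 atomics and a pthread mutex the shape is (a single slot stands in for the rb-tree; compile with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct root { atomic_int count; struct root **slot; };

static struct root *tree_slot;
static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static void get_root(struct root *r) { atomic_fetch_add(&r->count, 1); }

static void put_root(struct root *r)
{
        /* fetch_sub returning 1 means we just dropped the last reference. */
        if (atomic_fetch_sub(&r->count, 1) == 1) {
                pthread_mutex_lock(&tree_lock);
                *r->slot = NULL;        /* unlink, as rb_erase() does */
                pthread_mutex_unlock(&tree_lock);
                free(r);
        }
}

int main(void)
{
        struct root *r = malloc(sizeof(*r));

        atomic_init(&r->count, 1);
        r->slot = &tree_slot;
        tree_slot = r;

        get_root(r);
        put_root(r);    /* still referenced elsewhere */
        put_root(r);    /* final put: unlinked and freed */
        printf("tree empty: %d\n", tree_slot == NULL);  /* 1 */
        return 0;
}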
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index f785a7b0ab99..69226e14b745 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -26,6 +26,7 @@
26 26
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/rbtree.h>
29#include <linux/fs.h> 30#include <linux/fs.h>
30#include <linux/blkdev.h> 31#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
@@ -45,22 +46,13 @@ enum {
45/** 46/**
46 * struct the_nilfs - struct to supervise multiple nilfs mount points 47 * struct the_nilfs - struct to supervise multiple nilfs mount points
47 * @ns_flags: flags 48 * @ns_flags: flags
48 * @ns_count: reference count
49 * @ns_list: list head for nilfs_list
50 * @ns_bdev: block device 49 * @ns_bdev: block device
51 * @ns_bdi: backing dev info
52 * @ns_writer: back pointer to writable nilfs_sb_info
53 * @ns_sem: semaphore for shared states 50 * @ns_sem: semaphore for shared states
54 * @ns_super_sem: semaphore for global operations across super block instances
55 * @ns_mount_mutex: mutex protecting mount process of nilfs
56 * @ns_writer_sem: semaphore protecting ns_writer attach/detach
57 * @ns_current: back pointer to current mount
58 * @ns_sbh: buffer heads of on-disk super blocks 51 * @ns_sbh: buffer heads of on-disk super blocks
59 * @ns_sbp: pointers to super block data 52 * @ns_sbp: pointers to super block data
60 * @ns_sbwtime: previous write time of super block 53 * @ns_sbwtime: previous write time of super block
61 * @ns_sbwcount: write count of super block 54 * @ns_sbwcount: write count of super block
62 * @ns_sbsize: size of valid data in super block 55 * @ns_sbsize: size of valid data in super block
63 * @ns_supers: list of nilfs super block structs
64 * @ns_seg_seq: segment sequence counter 56 * @ns_seg_seq: segment sequence counter
65 * @ns_segnum: index number of the latest full segment. 57 * @ns_segnum: index number of the latest full segment.
66 * @ns_nextnum: index number of the full segment index to be used next 58 * @ns_nextnum: index number of the full segment index to be used next
@@ -79,9 +71,9 @@ enum {
79 * @ns_dat: DAT file inode 71 * @ns_dat: DAT file inode
80 * @ns_cpfile: checkpoint file inode 72 * @ns_cpfile: checkpoint file inode
81 * @ns_sufile: segusage file inode 73 * @ns_sufile: segusage file inode
82 * @ns_gc_dat: shadow inode of the DAT file inode for GC 74 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
75 * @ns_cptree_lock: lock protecting @ns_cptree
83 * @ns_gc_inodes: dummy inodes to keep live blocks 76 * @ns_gc_inodes: dummy inodes to keep live blocks
84 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
85 * @ns_blocksize_bits: bit length of block size 77 * @ns_blocksize_bits: bit length of block size
86 * @ns_blocksize: block size 78 * @ns_blocksize: block size
87 * @ns_nsegments: number of segments in filesystem 79 * @ns_nsegments: number of segments in filesystem
@@ -95,22 +87,9 @@ enum {
95 */ 87 */
96struct the_nilfs { 88struct the_nilfs {
97 unsigned long ns_flags; 89 unsigned long ns_flags;
98 atomic_t ns_count;
99 struct list_head ns_list;
100 90
101 struct block_device *ns_bdev; 91 struct block_device *ns_bdev;
102 struct backing_dev_info *ns_bdi;
103 struct nilfs_sb_info *ns_writer;
104 struct rw_semaphore ns_sem; 92 struct rw_semaphore ns_sem;
105 struct rw_semaphore ns_super_sem;
106 struct mutex ns_mount_mutex;
107 struct rw_semaphore ns_writer_sem;
108
109 /*
110 * components protected by ns_super_sem
111 */
112 struct nilfs_sb_info *ns_current;
113 struct list_head ns_supers;
114 93
115 /* 94 /*
116 * used for 95 * used for
@@ -163,11 +142,13 @@ struct the_nilfs {
163 struct inode *ns_dat; 142 struct inode *ns_dat;
164 struct inode *ns_cpfile; 143 struct inode *ns_cpfile;
165 struct inode *ns_sufile; 144 struct inode *ns_sufile;
166 struct inode *ns_gc_dat;
167 145
168 /* GC inode list and hash table head */ 146 /* Checkpoint tree */
147 struct rb_root ns_cptree;
148 spinlock_t ns_cptree_lock;
149
150 /* GC inode list */
169 struct list_head ns_gc_inodes; 151 struct list_head ns_gc_inodes;
170 struct hlist_head *ns_gc_inodes_h;
171 152
172 /* Disk layout information (static) */ 153 /* Disk layout information (static) */
173 unsigned int ns_blocksize_bits; 154 unsigned int ns_blocksize_bits;
@@ -182,9 +163,6 @@ struct the_nilfs {
182 u32 ns_crc_seed; 163 u32 ns_crc_seed;
183}; 164};
184 165
185#define NILFS_GCINODE_HASH_BITS 8
186#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS)
187
188#define THE_NILFS_FNS(bit, name) \ 166#define THE_NILFS_FNS(bit, name) \
189static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ 167static inline void set_nilfs_##name(struct the_nilfs *nilfs) \
190{ \ 168{ \
@@ -205,6 +183,32 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
205THE_NILFS_FNS(GC_RUNNING, gc_running) 183THE_NILFS_FNS(GC_RUNNING, gc_running)
206THE_NILFS_FNS(SB_DIRTY, sb_dirty) 184THE_NILFS_FNS(SB_DIRTY, sb_dirty)
207 185
186/**
187 * struct nilfs_root - nilfs root object
188 * @cno: checkpoint number
189 * @rb_node: red-black tree node
190 * @count: refcount of this structure
191 * @nilfs: nilfs object
192 * @ifile: inode file
193 * @root: root inode
194 * @inodes_count: number of inodes
195 * @blocks_count: number of blocks (Reserved)
196 */
197struct nilfs_root {
198 __u64 cno;
199 struct rb_node rb_node;
200
201 atomic_t count;
202 struct the_nilfs *nilfs;
203 struct inode *ifile;
204
205 atomic_t inodes_count;
206 atomic_t blocks_count;
207};
208
209/* Special checkpoint number */
210#define NILFS_CPTREE_CURRENT_CNO 0
211
208/* Minimum interval of periodical update of superblocks (in seconds) */ 212/* Minimum interval of periodical update of superblocks (in seconds) */
209#define NILFS_SB_FREQ 10 213#define NILFS_SB_FREQ 10
210 214
@@ -221,46 +225,25 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
221} 225}
222 226
223void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 227void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
224struct the_nilfs *find_or_create_nilfs(struct block_device *); 228struct the_nilfs *alloc_nilfs(struct block_device *bdev);
225void put_nilfs(struct the_nilfs *); 229void destroy_nilfs(struct the_nilfs *nilfs);
226int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 230int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
227int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 231int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
228int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); 232int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
229int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 233int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
234struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
235struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
236 __u64 cno);
237void nilfs_put_root(struct nilfs_root *root);
230struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); 238struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
231int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
232int nilfs_near_disk_full(struct the_nilfs *); 239int nilfs_near_disk_full(struct the_nilfs *);
233void nilfs_fall_back_super_block(struct the_nilfs *); 240void nilfs_fall_back_super_block(struct the_nilfs *);
234void nilfs_swap_super_block(struct the_nilfs *); 241void nilfs_swap_super_block(struct the_nilfs *);
235 242
236 243
237static inline void get_nilfs(struct the_nilfs *nilfs) 244static inline void nilfs_get_root(struct nilfs_root *root)
238{
239 /* Caller must have at least one reference of the_nilfs. */
240 atomic_inc(&nilfs->ns_count);
241}
242
243static inline void
244nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
245{
246 down_write(&nilfs->ns_writer_sem);
247 nilfs->ns_writer = sbi;
248 up_write(&nilfs->ns_writer_sem);
249}
250
251static inline void
252nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
253{
254 down_write(&nilfs->ns_writer_sem);
255 if (sbi == nilfs->ns_writer)
256 nilfs->ns_writer = NULL;
257 up_write(&nilfs->ns_writer_sem);
258}
259
260static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
261{ 245{
262 if (atomic_dec_and_test(&sbi->s_count)) 246 atomic_inc(&root->count);
263 kfree(sbi);
264} 247}
265 248
266static inline int nilfs_valid_fs(struct the_nilfs *nilfs) 249static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
diff --git a/fs/no-block.c b/fs/no-block.c
index d269a93d3467..6e40e42a43de 100644
--- a/fs/no-block.c
+++ b/fs/no-block.c
@@ -19,4 +19,5 @@ static int no_blkdev_open(struct inode * inode, struct file * filp)
19 19
20const struct file_operations def_blk_fops = { 20const struct file_operations def_blk_fops = {
21 .open = no_blkdev_open, 21 .open = no_blkdev_open,
22 .llseek = noop_llseek,
22}; 23};
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 22c629eedd82..b388443c3a09 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,4 +3,4 @@ config FSNOTIFY
3 3
4source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
6source "fs/notify/fanotify/Kconfig" 6#source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 5ed8e58d7bfc..bbcb98e7fcc6 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -433,6 +433,7 @@ static const struct file_operations fanotify_fops = {
433 .release = fanotify_release, 433 .release = fanotify_release,
434 .unlocked_ioctl = fanotify_ioctl, 434 .unlocked_ioctl = fanotify_ioctl,
435 .compat_ioctl = fanotify_ioctl, 435 .compat_ioctl = fanotify_ioctl,
436 .llseek = noop_llseek,
436}; 437};
437 438
438static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) 439static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 36802420d69a..4498a208df94 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -88,8 +88,6 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
88{ 88{
89 struct dentry *parent; 89 struct dentry *parent;
90 struct inode *p_inode; 90 struct inode *p_inode;
91 bool send = false;
92 bool should_update_children = false;
93 91
94 if (!dentry) 92 if (!dentry)
95 dentry = path->dentry; 93 dentry = path->dentry;
@@ -97,29 +95,12 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
97 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) 95 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
98 return; 96 return;
99 97
100 spin_lock(&dentry->d_lock); 98 parent = dget_parent(dentry);
101 parent = dentry->d_parent;
102 p_inode = parent->d_inode; 99 p_inode = parent->d_inode;
103 100
104 if (fsnotify_inode_watches_children(p_inode)) { 101 if (unlikely(!fsnotify_inode_watches_children(p_inode)))
105 if (p_inode->i_fsnotify_mask & mask) { 102 __fsnotify_update_child_dentry_flags(p_inode);
106 dget(parent); 103 else if (p_inode->i_fsnotify_mask & mask) {
107 send = true;
108 }
109 } else {
110 /*
111 * The parent doesn't care about events on its children but
112 * at least one child thought it did. We need to run all the
113 * children and update their d_flags to let them know p_inode
114 * doesn't care about them any more.
115 */
116 dget(parent);
117 should_update_children = true;
118 }
119
120 spin_unlock(&dentry->d_lock);
121
122 if (send) {
123 /* we are notifying a parent so come up with the new mask which 104 /* we are notifying a parent so come up with the new mask which
124 * specifies these are events which came from a child. */ 105 * specifies these are events which came from a child. */
125 mask |= FS_EVENT_ON_CHILD; 106 mask |= FS_EVENT_ON_CHILD;
@@ -130,13 +111,9 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
130 else 111 else
131 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, 112 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
132 dentry->d_name.name, 0); 113 dentry->d_name.name, 0);
133 dput(parent);
134 } 114 }
135 115
136 if (unlikely(should_update_children)) { 116 dput(parent);
137 __fsnotify_update_child_dentry_flags(p_inode);
138 dput(parent);
139 }
140} 117}
141EXPORT_SYMBOL_GPL(__fsnotify_parent); 118EXPORT_SYMBOL_GPL(__fsnotify_parent);
142 119
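The rewrite above drops the hand-rolled d_lock/dget dance in favour of dget_parent(), which pins the parent with the required locking done internally; both the notify branch and the child-flags update then share one dput() on the way out. A condensed sketch of the resulting flow (not the verbatim kernel code, and omitting the path-vs-inode event split):

struct dentry *parent = dget_parent(dentry);	/* safely pins the parent */
struct inode *p_inode = parent->d_inode;

if (unlikely(!fsnotify_inode_watches_children(p_inode)))
	__fsnotify_update_child_dentry_flags(p_inode);	/* children had stale flags */
else if (p_inode->i_fsnotify_mask & mask)
	fsnotify(p_inode, mask | FS_EVENT_ON_CHILD, dentry->d_inode,
		 FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);

dput(parent);	/* single unwind for both branches */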
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 33297c005060..21ed10660b80 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -240,6 +240,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
240{ 240{
241 struct inode *inode, *next_i, *need_iput = NULL; 241 struct inode *inode, *next_i, *need_iput = NULL;
242 242
243 spin_lock(&inode_lock);
243 list_for_each_entry_safe(inode, next_i, list, i_sb_list) { 244 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
244 struct inode *need_iput_tmp; 245 struct inode *need_iput_tmp;
245 246
@@ -297,4 +298,5 @@ void fsnotify_unmount_inodes(struct list_head *list)
297 298
298 spin_lock(&inode_lock); 299 spin_lock(&inode_lock);
299 } 300 }
301 spin_unlock(&inode_lock);
300} 302}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index bf7f6d776c31..24edc1185d53 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -344,6 +344,7 @@ static const struct file_operations inotify_fops = {
344 .release = inotify_release, 344 .release = inotify_release,
345 .unlocked_ioctl = inotify_ioctl, 345 .unlocked_ioctl = inotify_ioctl,
346 .compat_ioctl = inotify_ioctl, 346 .compat_ioctl = inotify_ioctl,
347 .llseek = noop_llseek,
347}; 348};
348 349
349 350
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 512806171bfa..d3fbe5730bfc 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -30,7 +30,6 @@
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h>
34#include <linux/bitmap.h> 33#include <linux/bitmap.h>
35 34
36#include "sysctl.h" 35#include "sysctl.h"
@@ -445,7 +444,6 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
445 444
446 ntfs_debug("Entering with remount options string: %s", opt); 445 ntfs_debug("Entering with remount options string: %s", opt);
447 446
448 lock_kernel();
449#ifndef NTFS_RW 447#ifndef NTFS_RW
450 /* For read-only compiled driver, enforce read-only flag. */ 448 /* For read-only compiled driver, enforce read-only flag. */
451 *flags |= MS_RDONLY; 449 *flags |= MS_RDONLY;
@@ -469,18 +467,15 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
469 if (NVolErrors(vol)) { 467 if (NVolErrors(vol)) {
470 ntfs_error(sb, "Volume has errors and is read-only%s", 468 ntfs_error(sb, "Volume has errors and is read-only%s",
471 es); 469 es);
472 unlock_kernel();
473 return -EROFS; 470 return -EROFS;
474 } 471 }
475 if (vol->vol_flags & VOLUME_IS_DIRTY) { 472 if (vol->vol_flags & VOLUME_IS_DIRTY) {
476 ntfs_error(sb, "Volume is dirty and read-only%s", es); 473 ntfs_error(sb, "Volume is dirty and read-only%s", es);
477 unlock_kernel();
478 return -EROFS; 474 return -EROFS;
479 } 475 }
480 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { 476 if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
481 ntfs_error(sb, "Volume has been modified by chkdsk " 477 ntfs_error(sb, "Volume has been modified by chkdsk "
482 "and is read-only%s", es); 478 "and is read-only%s", es);
483 unlock_kernel();
484 return -EROFS; 479 return -EROFS;
485 } 480 }
486 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { 481 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -488,13 +483,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
488 "(0x%x) and is read-only%s", 483 "(0x%x) and is read-only%s",
489 (unsigned)le16_to_cpu(vol->vol_flags), 484 (unsigned)le16_to_cpu(vol->vol_flags),
490 es); 485 es);
491 unlock_kernel();
492 return -EROFS; 486 return -EROFS;
493 } 487 }
494 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { 488 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
495 ntfs_error(sb, "Failed to set dirty bit in volume " 489 ntfs_error(sb, "Failed to set dirty bit in volume "
496 "information flags%s", es); 490 "information flags%s", es);
497 unlock_kernel();
498 return -EROFS; 491 return -EROFS;
499 } 492 }
500#if 0 493#if 0
@@ -514,21 +507,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
514 ntfs_error(sb, "Failed to empty journal $LogFile%s", 507 ntfs_error(sb, "Failed to empty journal $LogFile%s",
515 es); 508 es);
516 NVolSetErrors(vol); 509 NVolSetErrors(vol);
517 unlock_kernel();
518 return -EROFS; 510 return -EROFS;
519 } 511 }
520 if (!ntfs_mark_quotas_out_of_date(vol)) { 512 if (!ntfs_mark_quotas_out_of_date(vol)) {
521 ntfs_error(sb, "Failed to mark quotas out of date%s", 513 ntfs_error(sb, "Failed to mark quotas out of date%s",
522 es); 514 es);
523 NVolSetErrors(vol); 515 NVolSetErrors(vol);
524 unlock_kernel();
525 return -EROFS; 516 return -EROFS;
526 } 517 }
527 if (!ntfs_stamp_usnjrnl(vol)) { 518 if (!ntfs_stamp_usnjrnl(vol)) {
528 ntfs_error(sb, "Failed to stamp transaction log " 519 ntfs_error(sb, "Failed to stamp transaction log "
529 "($UsnJrnl)%s", es); 520 "($UsnJrnl)%s", es);
530 NVolSetErrors(vol); 521 NVolSetErrors(vol);
531 unlock_kernel();
532 return -EROFS; 522 return -EROFS;
533 } 523 }
534 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 524 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -544,11 +534,9 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
544 534
545 // TODO: Deal with *flags. 535 // TODO: Deal with *flags.
546 536
547 if (!parse_options(vol, opt)) { 537 if (!parse_options(vol, opt))
548 unlock_kernel();
549 return -EINVAL; 538 return -EINVAL;
550 } 539
551 unlock_kernel();
552 ntfs_debug("Done."); 540 ntfs_debug("Done.");
553 return 0; 541 return 0;
554} 542}
@@ -2261,8 +2249,6 @@ static void ntfs_put_super(struct super_block *sb)
2261 2249
2262 ntfs_debug("Entering."); 2250 ntfs_debug("Entering.");
2263 2251
2264 lock_kernel();
2265
2266#ifdef NTFS_RW 2252#ifdef NTFS_RW
2267 /* 2253 /*
2268 * Commit all inodes while they are still open in case some of them 2254 * Commit all inodes while they are still open in case some of them
@@ -2433,8 +2419,6 @@ static void ntfs_put_super(struct super_block *sb)
2433 2419
2434 sb->s_fs_info = NULL; 2420 sb->s_fs_info = NULL;
2435 kfree(vol); 2421 kfree(vol);
2436
2437 unlock_kernel();
2438} 2422}
2439 2423
2440/** 2424/**
@@ -2772,8 +2756,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2772 init_rwsem(&vol->mftbmp_lock); 2756 init_rwsem(&vol->mftbmp_lock);
2773 init_rwsem(&vol->lcnbmp_lock); 2757 init_rwsem(&vol->lcnbmp_lock);
2774 2758
2775 unlock_kernel();
2776
2777 /* By default, enable sparse support. */ 2759 /* By default, enable sparse support. */
2778 NVolSetSparseEnabled(vol); 2760 NVolSetSparseEnabled(vol);
2779 2761
@@ -2929,8 +2911,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2929 goto unl_upcase_iput_tmp_ino_err_out_now; 2911 goto unl_upcase_iput_tmp_ino_err_out_now;
2930 } 2912 }
2931 if ((sb->s_root = d_alloc_root(vol->root_ino))) { 2913 if ((sb->s_root = d_alloc_root(vol->root_ino))) {
2932 /* We increment i_count simulating an ntfs_iget(). */ 2914 /* We grab a reference, simulating an ntfs_iget(). */
2933 atomic_inc(&vol->root_ino->i_count); 2915 ihold(vol->root_ino);
2934 ntfs_debug("Exiting, status successful."); 2916 ntfs_debug("Exiting, status successful.");
2935 /* Release the default upcase if it has no users. */ 2917 /* Release the default upcase if it has no users. */
2936 mutex_lock(&ntfs_lock); 2918 mutex_lock(&ntfs_lock);
@@ -2940,7 +2922,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2940 } 2922 }
2941 mutex_unlock(&ntfs_lock); 2923 mutex_unlock(&ntfs_lock);
2942 sb->s_export_op = &ntfs_export_ops; 2924 sb->s_export_op = &ntfs_export_ops;
2943 lock_kernel();
2944 lockdep_on(); 2925 lockdep_on();
2945 return 0; 2926 return 0;
2946 } 2927 }
@@ -3040,24 +3021,8 @@ iput_tmp_ino_err_out_now:
3040 if (vol->mft_ino && vol->mft_ino != tmp_ino) 3021 if (vol->mft_ino && vol->mft_ino != tmp_ino)
3041 iput(vol->mft_ino); 3022 iput(vol->mft_ino);
3042 vol->mft_ino = NULL; 3023 vol->mft_ino = NULL;
3043 /*
3044 * This is needed to get ntfs_clear_extent_inode() called for each
3045 * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
3046 * leak resources and B) a subsequent mount fails automatically due to
3047 * ntfs_iget() never calling down into our ntfs_read_locked_inode()
3048 * method again... FIXME: Do we need to do this twice now because of
3049 * attribute inodes? I think not, so leave as is for now... (AIA)
3050 */
3051 if (invalidate_inodes(sb)) {
3052 ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
3053 "driver bug.");
3054 /* Copied from fs/super.c. I just love this message. (-; */
3055 printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
3056 "seconds. Have a nice day...\n");
3057 }
3058 /* Errors at this stage are irrelevant. */ 3024 /* Errors at this stage are irrelevant. */
3059err_out_now: 3025err_out_now:
3060 lock_kernel();
3061 sb->s_fs_info = NULL; 3026 sb->s_fs_info = NULL;
3062 kfree(vol); 3027 kfree(vol);
3063 ntfs_debug("Failed, returning -EINVAL."); 3028 ntfs_debug("Failed, returning -EINVAL.");
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index a76e0aa5cd3f..391915093fe1 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -209,7 +209,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
209 } 209 }
210 210
211 inode->i_mode = new_mode; 211 inode->i_mode = new_mode;
212 inode->i_ctime = CURRENT_TIME;
212 di->i_mode = cpu_to_le16(inode->i_mode); 213 di->i_mode = cpu_to_le16(inode->i_mode);
214 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
215 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
213 216
214 ocfs2_journal_dirty(handle, di_bh); 217 ocfs2_journal_dirty(handle, di_bh);
215 218
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0de69c9a08be..f1e962cb3b73 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
165 * ocfs2 never allocates in this function - the only time we 165 * ocfs2 never allocates in this function - the only time we
166 * need to use BH_New is when we're extending i_size on a file 166 * need to use BH_New is when we're extending i_size on a file
167 * system which doesn't support holes, in which case BH_New 167 * system which doesn't support holes, in which case BH_New
168 * allows block_prepare_write() to zero. 168 * allows __block_write_begin() to zero.
169 * 169 *
170 * If we see this on a sparse file system, then a truncate has 170 * If we see this on a sparse file system, then a truncate has
171 * raced us and removed the cluster. In this case, we clear 171 * raced us and removed the cluster. In this case, we clear
@@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
407 return ret; 407 return ret;
408} 408}
409 409
410/*
411 * This is called from ocfs2_write_zero_page() which has handled it's
412 * own cluster locking and has ensured allocation exists for those
413 * blocks to be written.
414 */
415int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
416 unsigned from, unsigned to)
417{
418 int ret;
419
420 ret = block_prepare_write(page, from, to, ocfs2_get_block);
421
422 return ret;
423}
424
425/* Taken from ext3. We don't necessarily need the full blown 410/* Taken from ext3. We don't necessarily need the full blown
426 * functionality yet, but IMHO it's better to cut and paste the whole 411 * functionality yet, but IMHO it's better to cut and paste the whole
427 * thing so we can avoid introducing our own bugs (and easily pick up 412 * thing so we can avoid introducing our own bugs (and easily pick up
@@ -732,7 +717,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
732} 717}
733 718
734/* 719/*
735 * Some of this taken from block_prepare_write(). We already have our 720 * Some of this taken from __block_write_begin(). We already have our
736 * mapping by now though, and the entire write will be allocating or 721 * mapping by now though, and the entire write will be allocating or
737 * it won't, so not much need to use BH_New. 722 * it won't, so not much need to use BH_New.
738 * 723 *
@@ -883,8 +868,8 @@ struct ocfs2_write_ctxt {
883 * out in so that future reads from that region will get 868 * out in so that future reads from that region will get
884 * zero's. 869 * zero's.
885 */ 870 */
886 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
887 unsigned int w_num_pages; 871 unsigned int w_num_pages;
872 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
888 struct page *w_target_page; 873 struct page *w_target_page;
889 874
890 /* 875 /*
@@ -1642,7 +1627,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1642 return ret; 1627 return ret;
1643} 1628}
1644 1629
1645int ocfs2_write_begin_nolock(struct address_space *mapping, 1630int ocfs2_write_begin_nolock(struct file *filp,
1631 struct address_space *mapping,
1646 loff_t pos, unsigned len, unsigned flags, 1632 loff_t pos, unsigned len, unsigned flags,
1647 struct page **pagep, void **fsdata, 1633 struct page **pagep, void **fsdata,
1648 struct buffer_head *di_bh, struct page *mmap_page) 1634 struct buffer_head *di_bh, struct page *mmap_page)
@@ -1692,7 +1678,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1692 mlog_errno(ret); 1678 mlog_errno(ret);
1693 goto out; 1679 goto out;
1694 } else if (ret == 1) { 1680 } else if (ret == 1) {
1695 ret = ocfs2_refcount_cow(inode, di_bh, 1681 ret = ocfs2_refcount_cow(inode, filp, di_bh,
1696 wc->w_cpos, wc->w_clen, UINT_MAX); 1682 wc->w_cpos, wc->w_clen, UINT_MAX);
1697 if (ret) { 1683 if (ret) {
1698 mlog_errno(ret); 1684 mlog_errno(ret);
@@ -1854,7 +1840,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1854 */ 1840 */
1855 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1841 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1856 1842
1857 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, 1843 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
1858 fsdata, di_bh, NULL); 1844 fsdata, di_bh, NULL);
1859 if (ret) { 1845 if (ret) {
1860 mlog_errno(ret); 1846 mlog_errno(ret);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index c48e93ffc513..76bfdfda691a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,9 +22,6 @@
22#ifndef OCFS2_AOPS_H 22#ifndef OCFS2_AOPS_H
23#define OCFS2_AOPS_H 23#define OCFS2_AOPS_H
24 24
25int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
26 unsigned from, unsigned to);
27
28handle_t *ocfs2_start_walk_page_trans(struct inode *inode, 25handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
29 struct page *page, 26 struct page *page,
30 unsigned from, 27 unsigned from,
@@ -48,7 +45,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
48 loff_t pos, unsigned len, unsigned copied, 45 loff_t pos, unsigned len, unsigned copied,
49 struct page *page, void *fsdata); 46 struct page *page, void *fsdata);
50 47
51int ocfs2_write_begin_nolock(struct address_space *mapping, 48int ocfs2_write_begin_nolock(struct file *filp,
49 struct address_space *mapping,
52 loff_t pos, unsigned len, unsigned flags, 50 loff_t pos, unsigned len, unsigned flags,
53 struct page **pagep, void **fsdata, 51 struct page **pagep, void **fsdata,
54 struct buffer_head *di_bh, struct page *mmap_page); 52 struct buffer_head *di_bh, struct page *mmap_page);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 41d5f1f92d56..52c7557f3e25 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -62,10 +62,51 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
62static LIST_HEAD(o2hb_node_events); 62static LIST_HEAD(o2hb_node_events);
63static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); 63static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
64 64
65/*
66 * In global heartbeat, we maintain a series of region bitmaps.
67 * - o2hb_region_bitmap allows us to limit the region number to max region.
68 * - o2hb_live_region_bitmap tracks live regions (seen steady iterations).
69 * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
70 * heartbeat on it.
71 * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
72 */
73static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
74static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
75static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
76static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
77
78#define O2HB_DB_TYPE_LIVENODES 0
79#define O2HB_DB_TYPE_LIVEREGIONS 1
80#define O2HB_DB_TYPE_QUORUMREGIONS 2
81#define O2HB_DB_TYPE_FAILEDREGIONS 3
82#define O2HB_DB_TYPE_REGION_LIVENODES 4
83#define O2HB_DB_TYPE_REGION_NUMBER 5
84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
85struct o2hb_debug_buf {
86 int db_type;
87 int db_size;
88 int db_len;
89 void *db_data;
90};
91
92static struct o2hb_debug_buf *o2hb_db_livenodes;
93static struct o2hb_debug_buf *o2hb_db_liveregions;
94static struct o2hb_debug_buf *o2hb_db_quorumregions;
95static struct o2hb_debug_buf *o2hb_db_failedregions;
96
65#define O2HB_DEBUG_DIR "o2hb" 97#define O2HB_DEBUG_DIR "o2hb"
66#define O2HB_DEBUG_LIVENODES "livenodes" 98#define O2HB_DEBUG_LIVENODES "livenodes"
99#define O2HB_DEBUG_LIVEREGIONS "live_regions"
100#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions"
101#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
102#define O2HB_DEBUG_REGION_NUMBER "num"
103#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
104
67static struct dentry *o2hb_debug_dir; 105static struct dentry *o2hb_debug_dir;
68static struct dentry *o2hb_debug_livenodes; 106static struct dentry *o2hb_debug_livenodes;
107static struct dentry *o2hb_debug_liveregions;
108static struct dentry *o2hb_debug_quorumregions;
109static struct dentry *o2hb_debug_failedregions;
69 110
70static LIST_HEAD(o2hb_all_regions); 111static LIST_HEAD(o2hb_all_regions);
71 112
@@ -77,7 +118,19 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
77 118
78#define O2HB_DEFAULT_BLOCK_BITS 9 119#define O2HB_DEFAULT_BLOCK_BITS 9
79 120
121enum o2hb_heartbeat_modes {
122 O2HB_HEARTBEAT_LOCAL = 0,
123 O2HB_HEARTBEAT_GLOBAL,
124 O2HB_HEARTBEAT_NUM_MODES,
125};
126
127char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
128 "local", /* O2HB_HEARTBEAT_LOCAL */
129 "global", /* O2HB_HEARTBEAT_GLOBAL */
130};
131
80unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 132unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
133unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
81 134
82/* Only sets a new threshold if there are no active regions. 135/* Only sets a new threshold if there are no active regions.
83 * 136 *
@@ -94,6 +147,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
94 } 147 }
95} 148}
96 149
150static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
151{
152 int ret = -1;
153
154 if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
155 spin_lock(&o2hb_live_lock);
156 if (list_empty(&o2hb_all_regions)) {
157 o2hb_heartbeat_mode = hb_mode;
158 ret = 0;
159 }
160 spin_unlock(&o2hb_live_lock);
161 }
162
163 return ret;
164}
165
97struct o2hb_node_event { 166struct o2hb_node_event {
98 struct list_head hn_item; 167 struct list_head hn_item;
99 enum o2hb_callback_type hn_event_type; 168 enum o2hb_callback_type hn_event_type;
@@ -135,6 +204,18 @@ struct o2hb_region {
135 struct block_device *hr_bdev; 204 struct block_device *hr_bdev;
136 struct o2hb_disk_slot *hr_slots; 205 struct o2hb_disk_slot *hr_slots;
137 206
207 /* live node map of this region */
208 unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
209 unsigned int hr_region_num;
210
211 struct dentry *hr_debug_dir;
212 struct dentry *hr_debug_livenodes;
213 struct dentry *hr_debug_regnum;
214 struct dentry *hr_debug_elapsed_time;
215 struct o2hb_debug_buf *hr_db_livenodes;
216 struct o2hb_debug_buf *hr_db_regnum;
217 struct o2hb_debug_buf *hr_db_elapsed_time;
218
138 /* let the person setting up hb wait for it to return until it 219 /* let the person setting up hb wait for it to return until it
139 * has reached a 'steady' state. This will be fixed when we have 220 * has reached a 'steady' state. This will be fixed when we have
140 * a more complete api that doesn't lead to this sort of fragility. */ 221 * a more complete api that doesn't lead to this sort of fragility. */
@@ -163,8 +244,19 @@ struct o2hb_bio_wait_ctxt {
163 int wc_error; 244 int wc_error;
164}; 245};
165 246
247static int o2hb_pop_count(void *map, int count)
248{
249 int i = -1, pop = 0;
250
251 while ((i = find_next_bit(map, count, i + 1)) < count)
252 pop++;
253 return pop;
254}
255
166static void o2hb_write_timeout(struct work_struct *work) 256static void o2hb_write_timeout(struct work_struct *work)
167{ 257{
258 int failed, quorum;
259 unsigned long flags;
168 struct o2hb_region *reg = 260 struct o2hb_region *reg =
169 container_of(work, struct o2hb_region, 261 container_of(work, struct o2hb_region,
170 hr_write_timeout_work.work); 262 hr_write_timeout_work.work);
@@ -172,6 +264,28 @@ static void o2hb_write_timeout(struct work_struct *work)
172 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " 264 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
173 "milliseconds\n", reg->hr_dev_name, 265 "milliseconds\n", reg->hr_dev_name,
174 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 266 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
267
268 if (o2hb_global_heartbeat_active()) {
269 spin_lock_irqsave(&o2hb_live_lock, flags);
270 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
271 set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
272 failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
273 O2NM_MAX_REGIONS);
274 quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
275 O2NM_MAX_REGIONS);
276 spin_unlock_irqrestore(&o2hb_live_lock, flags);
277
278 mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
279 quorum, failed);
280
281 /*
282 * Fence if the number of failed regions >= half the number
283 * of quorum regions
284 */
285 if ((failed << 1) < quorum)
286 return;
287 }
288
175 o2quo_disk_timeout(); 289 o2quo_disk_timeout();
176} 290}
177 291
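o2hb_pop_count() is a plain population count over a bitmap (the kernel's bitmap_weight() computes the same thing), and the early return encodes "keep going while 2*failed < quorum", so the node self-fences once failed regions reach half the quorum regions. A standalone model of both pieces (portable C, names mine):

#include <stdio.h>

static int pop_count(const unsigned long *map, int nbits)
{
	int i, pop = 0;

	for (i = 0; i < nbits; i++)
		if (map[i / (8 * sizeof(long))] >> (i % (8 * sizeof(long))) & 1)
			pop++;
	return pop;
}

static int should_fence(int failed, int quorum)
{
	return (failed << 1) >= quorum;	/* inverse of the early return above */
}

int main(void)
{
	unsigned long failed_map[1] = { 0xb };	/* regions 0, 1 and 3 failed */

	printf("failed=%d\n", pop_count(failed_map, 32));	/* 3 */
	/* with 4 quorum regions: one failure is tolerated, two fence */
	printf("%d %d\n", should_fence(1, 4), should_fence(2, 4));	/* 0 1 */
	return 0;
}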
@@ -180,6 +294,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
180 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", 294 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
181 O2HB_MAX_WRITE_TIMEOUT_MS); 295 O2HB_MAX_WRITE_TIMEOUT_MS);
182 296
297 if (o2hb_global_heartbeat_active()) {
298 spin_lock(&o2hb_live_lock);
299 clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
300 spin_unlock(&o2hb_live_lock);
301 }
183 cancel_delayed_work(&reg->hr_write_timeout_work); 302 cancel_delayed_work(&reg->hr_write_timeout_work);
184 reg->hr_last_timeout_start = jiffies; 303 reg->hr_last_timeout_start = jiffies;
185 schedule_delayed_work(&reg->hr_write_timeout_work, 304 schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -513,6 +632,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
513{ 632{
514 assert_spin_locked(&o2hb_live_lock); 633 assert_spin_locked(&o2hb_live_lock);
515 634
635 BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
636
516 event->hn_event_type = type; 637 event->hn_event_type = type;
517 event->hn_node = node; 638 event->hn_node = node;
518 event->hn_node_num = node_num; 639 event->hn_node_num = node_num;
@@ -554,6 +675,35 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
554 o2nm_node_put(node); 675 o2nm_node_put(node);
555} 676}
556 677
678static void o2hb_set_quorum_device(struct o2hb_region *reg,
679 struct o2hb_disk_slot *slot)
680{
681 assert_spin_locked(&o2hb_live_lock);
682
683 if (!o2hb_global_heartbeat_active())
684 return;
685
686 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
687 return;
688
689 /*
690 * A region can be added to the quorum only when it sees all
691 * live nodes heartbeat on it. In other words, the region has been
692 * added to all nodes.
693 */
694 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
695 sizeof(o2hb_live_node_bitmap)))
696 return;
697
698 if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
699 return;
700
701 printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
702 config_item_name(&reg->hr_item));
703
704 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
705}
706
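o2hb_set_quorum_device() promotes a region only once the region's private live map is byte-for-byte identical to the global live map, meaning every live node heartbeats there. Reduced to portable C, the gate is just a bitmap comparison:

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned long global_live[1] = { 0x5 };	/* nodes 0 and 2 are live */
	unsigned long region_live[1] = { 0x5 };	/* both heartbeat on this region */

	/* same test as the memcmp() above */
	if (!memcmp(region_live, global_live, sizeof(global_live)))
		printf("region is quorum-eligible\n");
	else
		printf("region not yet seen by all live nodes\n");
	return 0;
}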
557static int o2hb_check_slot(struct o2hb_region *reg, 707static int o2hb_check_slot(struct o2hb_region *reg,
558 struct o2hb_disk_slot *slot) 708 struct o2hb_disk_slot *slot)
559{ 709{
@@ -565,14 +715,22 @@ static int o2hb_check_slot(struct o2hb_region *reg,
565 u64 cputime; 715 u64 cputime;
566 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; 716 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
567 unsigned int slot_dead_ms; 717 unsigned int slot_dead_ms;
718 int tmp;
568 719
569 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); 720 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
570 721
571 /* Is this correct? Do we assume that the node doesn't exist 722 /*
572 * if we're not configured for him? */ 723 * If a node is no longer configured but is still in the livemap, we
724 * may need to clear that bit from the livemap.
725 */
573 node = o2nm_get_node_by_num(slot->ds_node_num); 726 node = o2nm_get_node_by_num(slot->ds_node_num);
574 if (!node) 727 if (!node) {
575 return 0; 728 spin_lock(&o2hb_live_lock);
729 tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
730 spin_unlock(&o2hb_live_lock);
731 if (!tmp)
732 return 0;
733 }
576 734
577 if (!o2hb_verify_crc(reg, hb_block)) { 735 if (!o2hb_verify_crc(reg, hb_block)) {
578 /* all paths from here will drop o2hb_live_lock for 736 /* all paths from here will drop o2hb_live_lock for
@@ -639,8 +797,12 @@ fire_callbacks:
639 mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", 797 mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
640 slot->ds_node_num, (long long)slot->ds_last_generation); 798 slot->ds_node_num, (long long)slot->ds_last_generation);
641 799
800 set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
801
642 /* first on the list generates a callback */ 802 /* first on the list generates a callback */
643 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { 803 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
804 mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
805 "bitmap\n", slot->ds_node_num);
644 set_bit(slot->ds_node_num, o2hb_live_node_bitmap); 806 set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
645 807
646 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, 808 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
@@ -684,13 +846,18 @@ fire_callbacks:
684 mlog(ML_HEARTBEAT, "Node %d left my region\n", 846 mlog(ML_HEARTBEAT, "Node %d left my region\n",
685 slot->ds_node_num); 847 slot->ds_node_num);
686 848
849 clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
850
687 /* last off the live_slot generates a callback */ 851 /* last off the live_slot generates a callback */
688 list_del_init(&slot->ds_live_item); 852 list_del_init(&slot->ds_live_item);
689 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { 853 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
854 mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
855 "nodes bitmap\n", slot->ds_node_num);
690 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); 856 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
691 857
692 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, 858 /* node can be null */
693 slot->ds_node_num); 859 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
860 node, slot->ds_node_num);
694 861
695 changed = 1; 862 changed = 1;
696 } 863 }
@@ -706,11 +873,14 @@ fire_callbacks:
706 slot->ds_equal_samples = 0; 873 slot->ds_equal_samples = 0;
707 } 874 }
708out: 875out:
876 o2hb_set_quorum_device(reg, slot);
877
709 spin_unlock(&o2hb_live_lock); 878 spin_unlock(&o2hb_live_lock);
710 879
711 o2hb_run_event_list(&event); 880 o2hb_run_event_list(&event);
712 881
713 o2nm_node_put(node); 882 if (node)
883 o2nm_node_put(node);
714 return changed; 884 return changed;
715} 885}
716 886
@@ -737,6 +907,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
737{ 907{
738 int i, ret, highest_node, change = 0; 908 int i, ret, highest_node, change = 0;
739 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 909 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
910 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
740 struct o2hb_bio_wait_ctxt write_wc; 911 struct o2hb_bio_wait_ctxt write_wc;
741 912
742 ret = o2nm_configured_node_map(configured_nodes, 913 ret = o2nm_configured_node_map(configured_nodes,
@@ -746,6 +917,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
746 return ret; 917 return ret;
747 } 918 }
748 919
920 /*
921 * If a node is not configured but is in the livemap, we still need
922 * to read the slot so as to be able to remove it from the livemap.
923 */
924 o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
925 i = -1;
926 while ((i = find_next_bit(live_node_bitmap,
927 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
928 set_bit(i, configured_nodes);
929 }
930
749 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); 931 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
750 if (highest_node >= O2NM_MAX_NODES) { 932 if (highest_node >= O2NM_MAX_NODES) {
751 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); 933 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
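The new preamble ORs the live map into the configured map before scanning: a node that was removed from the configuration while still marked live must keep having its slot read until it goes quiet, or its down event would never fire (the node == NULL handling in o2hb_check_slot() above is the other half of this). The merge itself, in portable form:

#include <stdio.h>

#define WORDS 1

int main(void)
{
	unsigned long configured[WORDS] = { 0x3 };	/* nodes 0 and 1 configured */
	unsigned long live[WORDS]       = { 0x6 };	/* nodes 1 and 2 live */
	int i;

	/* same effect as the find_next_bit()/set_bit() loop above:
	 * node 2 is live but unconfigured and must still be scanned */
	for (i = 0; i < WORDS; i++)
		configured[i] |= live[i];

	printf("scan map: 0x%lx\n", configured[0]);	/* 0x7 */
	return 0;
}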
@@ -917,21 +1099,59 @@ static int o2hb_thread(void *data)
917#ifdef CONFIG_DEBUG_FS 1099#ifdef CONFIG_DEBUG_FS
918static int o2hb_debug_open(struct inode *inode, struct file *file) 1100static int o2hb_debug_open(struct inode *inode, struct file *file)
919{ 1101{
1102 struct o2hb_debug_buf *db = inode->i_private;
1103 struct o2hb_region *reg;
920 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 1104 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
921 char *buf = NULL; 1105 char *buf = NULL;
922 int i = -1; 1106 int i = -1;
923 int out = 0; 1107 int out = 0;
924 1108
1109 /* max_nodes should be the largest bitmap we pass here */
1110 BUG_ON(sizeof(map) < db->db_size);
1111
925 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 1112 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
926 if (!buf) 1113 if (!buf)
927 goto bail; 1114 goto bail;
928 1115
929 o2hb_fill_node_map(map, sizeof(map)); 1116 switch (db->db_type) {
1117 case O2HB_DB_TYPE_LIVENODES:
1118 case O2HB_DB_TYPE_LIVEREGIONS:
1119 case O2HB_DB_TYPE_QUORUMREGIONS:
1120 case O2HB_DB_TYPE_FAILEDREGIONS:
1121 spin_lock(&o2hb_live_lock);
1122 memcpy(map, db->db_data, db->db_size);
1123 spin_unlock(&o2hb_live_lock);
1124 break;
1125
1126 case O2HB_DB_TYPE_REGION_LIVENODES:
1127 spin_lock(&o2hb_live_lock);
1128 reg = (struct o2hb_region *)db->db_data;
1129 memcpy(map, reg->hr_live_node_bitmap, db->db_size);
1130 spin_unlock(&o2hb_live_lock);
1131 break;
1132
1133 case O2HB_DB_TYPE_REGION_NUMBER:
1134 reg = (struct o2hb_region *)db->db_data;
1135 out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
1136 reg->hr_region_num);
1137 goto done;
1138
1139 case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
1140 reg = (struct o2hb_region *)db->db_data;
1141 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
1142 jiffies_to_msecs(jiffies -
1143 reg->hr_last_timeout_start));
1144 goto done;
1145
1146 default:
1147 goto done;
1148 }
930 1149
931 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) 1150 while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
932 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); 1151 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
933 out += snprintf(buf + out, PAGE_SIZE - out, "\n"); 1152 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
934 1153
1154done:
935 i_size_write(inode, out); 1155 i_size_write(inode, out);
936 1156
937 file->private_data = buf; 1157 file->private_data = buf;
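All of these debugfs files share one open handler, parameterized by the o2hb_debug_buf descriptor stashed in i_private: the whole report is rendered into a PAGE_SIZE buffer at open time and reads are then served from that snapshot via file->private_data. A toy model of the format-once-at-open idiom (user-space C, names mine):

#include <stdio.h>
#include <stdlib.h>

struct dbg_file {
	char *buf;	/* plays file->private_data */
	size_t len;
};

static struct dbg_file *dbg_open(const unsigned long *map, int nbits)
{
	struct dbg_file *f = malloc(sizeof(*f));
	size_t out = 0;
	int i;

	f->buf = malloc(4096);	/* plays the PAGE_SIZE buffer */
	for (i = 0; i < nbits; i++)
		if (map[i / (8 * sizeof(long))] >> (i % (8 * sizeof(long))) & 1)
			out += snprintf(f->buf + out, 4096 - out, "%d ", i);
	out += snprintf(f->buf + out, 4096 - out, "\n");
	f->len = out;
	return f;
}

int main(void)
{
	unsigned long live[1] = { 0x16 };	/* nodes 1, 2 and 4 */
	struct dbg_file *f = dbg_open(live, 32);

	fwrite(f->buf, 1, f->len, stdout);	/* "1 2 4 " */
	free(f->buf);
	free(f);
	return 0;
}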
@@ -978,10 +1198,104 @@ static const struct file_operations o2hb_debug_fops = {
978 1198
979void o2hb_exit(void) 1199void o2hb_exit(void)
980{ 1200{
981 if (o2hb_debug_livenodes) 1201 kfree(o2hb_db_livenodes);
982 debugfs_remove(o2hb_debug_livenodes); 1202 kfree(o2hb_db_liveregions);
983 if (o2hb_debug_dir) 1203 kfree(o2hb_db_quorumregions);
984 debugfs_remove(o2hb_debug_dir); 1204 kfree(o2hb_db_failedregions);
1205 debugfs_remove(o2hb_debug_failedregions);
1206 debugfs_remove(o2hb_debug_quorumregions);
1207 debugfs_remove(o2hb_debug_liveregions);
1208 debugfs_remove(o2hb_debug_livenodes);
1209 debugfs_remove(o2hb_debug_dir);
1210}
1211
1212static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
1213 struct o2hb_debug_buf **db, int db_len,
1214 int type, int size, int len, void *data)
1215{
1216 *db = kmalloc(db_len, GFP_KERNEL);
1217 if (!*db)
1218 return NULL;
1219
1220 (*db)->db_type = type;
1221 (*db)->db_size = size;
1222 (*db)->db_len = len;
1223 (*db)->db_data = data;
1224
1225 return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
1226 &o2hb_debug_fops);
1227}
1228
1229static int o2hb_debug_init(void)
1230{
1231 int ret = -ENOMEM;
1232
1233 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
1234 if (!o2hb_debug_dir) {
1235 mlog_errno(ret);
1236 goto bail;
1237 }
1238
1239 o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
1240 o2hb_debug_dir,
1241 &o2hb_db_livenodes,
1242 sizeof(*o2hb_db_livenodes),
1243 O2HB_DB_TYPE_LIVENODES,
1244 sizeof(o2hb_live_node_bitmap),
1245 O2NM_MAX_NODES,
1246 o2hb_live_node_bitmap);
1247 if (!o2hb_debug_livenodes) {
1248 mlog_errno(ret);
1249 goto bail;
1250 }
1251
1252 o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
1253 o2hb_debug_dir,
1254 &o2hb_db_liveregions,
1255 sizeof(*o2hb_db_liveregions),
1256 O2HB_DB_TYPE_LIVEREGIONS,
1257 sizeof(o2hb_live_region_bitmap),
1258 O2NM_MAX_REGIONS,
1259 o2hb_live_region_bitmap);
1260 if (!o2hb_debug_liveregions) {
1261 mlog_errno(ret);
1262 goto bail;
1263 }
1264
1265 o2hb_debug_quorumregions =
1266 o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
1267 o2hb_debug_dir,
1268 &o2hb_db_quorumregions,
1269 sizeof(*o2hb_db_quorumregions),
1270 O2HB_DB_TYPE_QUORUMREGIONS,
1271 sizeof(o2hb_quorum_region_bitmap),
1272 O2NM_MAX_REGIONS,
1273 o2hb_quorum_region_bitmap);
1274 if (!o2hb_debug_quorumregions) {
1275 mlog_errno(ret);
1276 goto bail;
1277 }
1278
1279 o2hb_debug_failedregions =
1280 o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
1281 o2hb_debug_dir,
1282 &o2hb_db_failedregions,
1283 sizeof(*o2hb_db_failedregions),
1284 O2HB_DB_TYPE_FAILEDREGIONS,
1285 sizeof(o2hb_failed_region_bitmap),
1286 O2NM_MAX_REGIONS,
1287 o2hb_failed_region_bitmap);
1288 if (!o2hb_debug_failedregions) {
1289 mlog_errno(ret);
1290 goto bail;
1291 }
1292
1293 ret = 0;
1294bail:
1295 if (ret)
1296 o2hb_exit();
1297
1298 return ret;
985} 1299}
986 1300
987int o2hb_init(void) 1301int o2hb_init(void)
@@ -997,24 +1311,12 @@ int o2hb_init(void)
997 INIT_LIST_HEAD(&o2hb_node_events); 1311 INIT_LIST_HEAD(&o2hb_node_events);
998 1312
999 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); 1313 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
1314 memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
1315 memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
1316 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
1317 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
1000 1318
1001 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); 1319 return o2hb_debug_init();
1002 if (!o2hb_debug_dir) {
1003 mlog_errno(-ENOMEM);
1004 return -ENOMEM;
1005 }
1006
1007 o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
1008 S_IFREG|S_IRUSR,
1009 o2hb_debug_dir, NULL,
1010 &o2hb_debug_fops);
1011 if (!o2hb_debug_livenodes) {
1012 mlog_errno(-ENOMEM);
1013 debugfs_remove(o2hb_debug_dir);
1014 return -ENOMEM;
1015 }
1016
1017 return 0;
1018} 1320}
1019 1321
1020/* if we're already in a callback then we're already serialized by the sem */ 1322/* if we're already in a callback then we're already serialized by the sem */
@@ -1078,6 +1380,13 @@ static void o2hb_region_release(struct config_item *item)
1078 if (reg->hr_slots) 1380 if (reg->hr_slots)
1079 kfree(reg->hr_slots); 1381 kfree(reg->hr_slots);
1080 1382
1383 kfree(reg->hr_db_regnum);
1384 kfree(reg->hr_db_livenodes);
1385 debugfs_remove(reg->hr_debug_livenodes);
1386 debugfs_remove(reg->hr_debug_regnum);
1387 debugfs_remove(reg->hr_debug_elapsed_time);
1388 debugfs_remove(reg->hr_debug_dir);
1389
1081 spin_lock(&o2hb_live_lock); 1390 spin_lock(&o2hb_live_lock);
1082 list_del(&reg->hr_all_item); 1391 list_del(&reg->hr_all_item);
1083 spin_unlock(&o2hb_live_lock); 1392 spin_unlock(&o2hb_live_lock);
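The release path above unwinds the per-region debugfs pieces. Two details worth flagging, from reading the hunk: the buffers are kfree()d before the debugfs entries that reference them are removed, and hr_db_elapsed_time gets no kfree() at all. A symmetric teardown sketch under the assumption that every descriptor allocated in o2hb_debug_region_init() needs freeing (the elapsed-time kfree and the reordering are mine, not part of the patch):

/* remove the files first, so no new open() can see the descriptors */
debugfs_remove(reg->hr_debug_elapsed_time);
debugfs_remove(reg->hr_debug_regnum);
debugfs_remove(reg->hr_debug_livenodes);
debugfs_remove(reg->hr_debug_dir);
kfree(reg->hr_db_elapsed_time);
kfree(reg->hr_db_regnum);
kfree(reg->hr_db_livenodes);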
@@ -1441,6 +1750,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1441 /* Ok, we were woken. Make sure it wasn't by drop_item() */ 1750 /* Ok, we were woken. Make sure it wasn't by drop_item() */
1442 spin_lock(&o2hb_live_lock); 1751 spin_lock(&o2hb_live_lock);
1443 hb_task = reg->hr_task; 1752 hb_task = reg->hr_task;
1753 if (o2hb_global_heartbeat_active())
1754 set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
1444 spin_unlock(&o2hb_live_lock); 1755 spin_unlock(&o2hb_live_lock);
1445 1756
1446 if (hb_task) 1757 if (hb_task)
@@ -1448,6 +1759,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1448 else 1759 else
1449 ret = -EIO; 1760 ret = -EIO;
1450 1761
1762 if (hb_task && o2hb_global_heartbeat_active())
1763 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
1764 config_item_name(&reg->hr_item));
1765
1451out: 1766out:
1452 if (filp) 1767 if (filp)
1453 fput(filp); 1768 fput(filp);
@@ -1586,21 +1901,94 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
1586 : NULL; 1901 : NULL;
1587} 1902}
1588 1903
1904static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
1905{
1906 int ret = -ENOMEM;
1907
1908 reg->hr_debug_dir =
1909 debugfs_create_dir(config_item_name(&reg->hr_item), dir);
1910 if (!reg->hr_debug_dir) {
1911 mlog_errno(ret);
1912 goto bail;
1913 }
1914
1915 reg->hr_debug_livenodes =
1916 o2hb_debug_create(O2HB_DEBUG_LIVENODES,
1917 reg->hr_debug_dir,
1918 &(reg->hr_db_livenodes),
1919 sizeof(*(reg->hr_db_livenodes)),
1920 O2HB_DB_TYPE_REGION_LIVENODES,
1921 sizeof(reg->hr_live_node_bitmap),
1922 O2NM_MAX_NODES, reg);
1923 if (!reg->hr_debug_livenodes) {
1924 mlog_errno(ret);
1925 goto bail;
1926 }
1927
1928 reg->hr_debug_regnum =
1929 o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
1930 reg->hr_debug_dir,
1931 &(reg->hr_db_regnum),
1932 sizeof(*(reg->hr_db_regnum)),
1933 O2HB_DB_TYPE_REGION_NUMBER,
1934 0, O2NM_MAX_NODES, reg);
1935 if (!reg->hr_debug_regnum) {
1936 mlog_errno(ret);
1937 goto bail;
1938 }
1939
1940 reg->hr_debug_elapsed_time =
1941 o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
1942 reg->hr_debug_dir,
1943 &(reg->hr_db_elapsed_time),
1944 sizeof(*(reg->hr_db_elapsed_time)),
1945 O2HB_DB_TYPE_REGION_ELAPSED_TIME,
1946 0, 0, reg);
1947 if (!reg->hr_debug_elapsed_time) {
1948 mlog_errno(ret);
1949 goto bail;
1950 }
1951
1952 ret = 0;
1953bail:
1954 return ret;
1955}
1956
1589static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, 1957static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
1590 const char *name) 1958 const char *name)
1591{ 1959{
1592 struct o2hb_region *reg = NULL; 1960 struct o2hb_region *reg = NULL;
1961 int ret;
1593 1962
1594 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); 1963 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
1595 if (reg == NULL) 1964 if (reg == NULL)
1596 return ERR_PTR(-ENOMEM); 1965 return ERR_PTR(-ENOMEM);
1597 1966
1598 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type); 1967 if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
1968 return ERR_PTR(-ENAMETOOLONG);
1599 1969
1600 spin_lock(&o2hb_live_lock); 1970 spin_lock(&o2hb_live_lock);
1971 reg->hr_region_num = 0;
1972 if (o2hb_global_heartbeat_active()) {
1973 reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
1974 O2NM_MAX_REGIONS);
1975 if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
1976 spin_unlock(&o2hb_live_lock);
1977 return ERR_PTR(-EFBIG);
1978 }
1979 set_bit(reg->hr_region_num, o2hb_region_bitmap);
1980 }
1601 list_add_tail(&reg->hr_all_item, &o2hb_all_regions); 1981 list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
1602 spin_unlock(&o2hb_live_lock); 1982 spin_unlock(&o2hb_live_lock);
1603 1983
1984 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
1985
1986 ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
1987 if (ret) {
1988 config_item_put(&reg->hr_item);
1989 return ERR_PTR(ret);
1990 }
1991
1604 return &reg->hr_item; 1992 return &reg->hr_item;
1605} 1993}
1606 1994
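One ordering nit is visible in the hunk above: reg is kzalloc()ed before the name-length check, so the -ENAMETOOLONG return leaks it. A leak-free ordering is to validate before allocating (sketch, same identifiers as the hunk):

if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
	return ERR_PTR(-ENAMETOOLONG);	/* nothing allocated yet */

reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
if (reg == NULL)
	return ERR_PTR(-ENOMEM);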
@@ -1612,6 +2000,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1612 2000
1613 /* stop the thread when the user removes the region dir */ 2001 /* stop the thread when the user removes the region dir */
1614 spin_lock(&o2hb_live_lock); 2002 spin_lock(&o2hb_live_lock);
2003 if (o2hb_global_heartbeat_active()) {
2004 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2005 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2006 }
1615 hb_task = reg->hr_task; 2007 hb_task = reg->hr_task;
1616 reg->hr_task = NULL; 2008 reg->hr_task = NULL;
1617 spin_unlock(&o2hb_live_lock); 2009 spin_unlock(&o2hb_live_lock);
@@ -1628,6 +2020,9 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1628 wake_up(&o2hb_steady_queue); 2020 wake_up(&o2hb_steady_queue);
1629 } 2021 }
1630 2022
2023 if (o2hb_global_heartbeat_active())
2024 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2025 config_item_name(&reg->hr_item));
1631 config_item_put(item); 2026 config_item_put(item);
1632} 2027}
1633 2028
@@ -1688,6 +2083,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
1688 return count; 2083 return count;
1689} 2084}
1690 2085
2086static
2087ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
2088 char *page)
2089{
2090 return sprintf(page, "%s\n",
2091 o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
2092}
2093
2094static
2095ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
2096 const char *page, size_t count)
2097{
2098 unsigned int i;
2099 int ret;
2100 size_t len;
2101
2102 len = (page[count - 1] == '\n') ? count - 1 : count;
2103 if (!len)
2104 return -EINVAL;
2105
2106 for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
2107 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
2108 continue;
2109
 2110 ret = o2hb_global_heartbeat_mode_set(i);
2111 if (!ret)
2112 printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
2113 o2hb_heartbeat_mode_desc[i]);
2114 return count;
2115 }
2116
2117 return -EINVAL;
2118
2119}
2120
1691static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { 2121static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
1692 .attr = { .ca_owner = THIS_MODULE, 2122 .attr = { .ca_owner = THIS_MODULE,
1693 .ca_name = "dead_threshold", 2123 .ca_name = "dead_threshold",
@@ -1696,8 +2126,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
1696 .store = o2hb_heartbeat_group_threshold_store, 2126 .store = o2hb_heartbeat_group_threshold_store,
1697}; 2127};
1698 2128
2129static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
2130 .attr = { .ca_owner = THIS_MODULE,
2131 .ca_name = "mode",
2132 .ca_mode = S_IRUGO | S_IWUSR },
2133 .show = o2hb_heartbeat_group_mode_show,
2134 .store = o2hb_heartbeat_group_mode_store,
2135};
2136
1699static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { 2137static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
1700 &o2hb_heartbeat_group_attr_threshold.attr, 2138 &o2hb_heartbeat_group_attr_threshold.attr,
2139 &o2hb_heartbeat_group_attr_mode.attr,
1701 NULL, 2140 NULL,
1702}; 2141};
1703 2142
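The store handler trims one trailing newline, then matches with strnicmp() (the kernel's spelling of strncasecmp() at the time) over only the input's length, so any case-insensitive prefix such as "glob" selects "global"; note too that it returns count, i.e. success, even when the mode cannot change because regions are already active. The parsing in standalone form:

#include <stdio.h>
#include <string.h>
#include <strings.h>

static const char *modes[] = { "local", "global" };

/* mirrors the store handler: trim one trailing newline, then accept a
 * case-insensitive prefix of a known mode name */
static int parse_mode(const char *page, size_t count)
{
	size_t len = (count && page[count - 1] == '\n') ? count - 1 : count;
	unsigned int i;

	if (!len)
		return -1;
	for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++)
		if (!strncasecmp(page, modes[i], len))
			return i;
	return -1;
}

int main(void)
{
	printf("%d %d %d\n", parse_mode("global\n", 7),
	       parse_mode("LOC", 3), parse_mode("bogus", 5));	/* 1 0 -1 */
	return 0;
}

Assuming the usual o2cb configfs layout (inferred from the sibling dead_threshold attribute), this would be driven by writing "global" to /sys/kernel/config/cluster/<cluster>/heartbeat/mode.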
@@ -1963,3 +2402,34 @@ void o2hb_stop_all_regions(void)
1963 spin_unlock(&o2hb_live_lock); 2402 spin_unlock(&o2hb_live_lock);
1964} 2403}
1965EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); 2404EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
2405
2406int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
2407{
2408 struct o2hb_region *reg;
2409 int numregs = 0;
2410 char *p;
2411
2412 spin_lock(&o2hb_live_lock);
2413
2414 p = region_uuids;
2415 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2416 mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
2417 if (numregs < max_regions) {
2418 memcpy(p, config_item_name(&reg->hr_item),
2419 O2HB_MAX_REGION_NAME_LEN);
2420 p += O2HB_MAX_REGION_NAME_LEN;
2421 }
2422 numregs++;
2423 }
2424
2425 spin_unlock(&o2hb_live_lock);
2426
2427 return numregs;
2428}
2429EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
2430
2431int o2hb_global_heartbeat_active(void)
2432{
2433 return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
2434}
2435EXPORT_SYMBOL(o2hb_global_heartbeat_active);
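o2hb_get_all_regions() packs up to max_regions names at a fixed O2HB_MAX_REGION_NAME_LEN stride but returns the total region count, so a caller detects truncation by comparing the return value against the capacity it offered; the dlm side sizes its qr_regions field as O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS for exactly that reason. A toy model (user-space C, region names hypothetical):

#include <stdio.h>
#include <string.h>

#define NAME_LEN 32	/* stands in for O2HB_MAX_REGION_NAME_LEN */

static int get_all(const char names[][NAME_LEN], int total,
		   char *out, int max)
{
	int i;

	for (i = 0; i < total && i < max; i++)	/* copy what fits */
		memcpy(out + i * NAME_LEN, names[i], NAME_LEN);
	return total;			/* report everything that exists */
}

int main(void)
{
	const char regions[2][NAME_LEN] = { "REGION-A", "REGION-B" };
	char buf[1 * NAME_LEN];
	int n = get_all(regions, 2, buf, 1);

	if (n > 1)	/* offered capacity was 1 */
		printf("truncated: %d regions exist, got \"%s\"\n", n, buf);
	return 0;
}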
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 2f1649253b49..00ad8e8fea51 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,6 +31,8 @@
31 31
32#define O2HB_REGION_TIMEOUT_MS 2000 32#define O2HB_REGION_TIMEOUT_MS 2000
33 33
34#define O2HB_MAX_REGION_NAME_LEN 32
35
34/* number of changes to be seen as live */ 36/* number of changes to be seen as live */
35#define O2HB_LIVE_THRESHOLD 2 37#define O2HB_LIVE_THRESHOLD 2
36/* number of equal samples to be seen as dead */ 38/* number of equal samples to be seen as dead */
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num);
81int o2hb_check_node_heartbeating_from_callback(u8 node_num); 83int o2hb_check_node_heartbeating_from_callback(u8 node_num);
82int o2hb_check_local_node_heartbeating(void); 84int o2hb_check_local_node_heartbeating(void);
83void o2hb_stop_all_regions(void); 85void o2hb_stop_all_regions(void);
86int o2hb_get_all_regions(char *region_uuids, u8 numregions);
87int o2hb_global_heartbeat_active(void);
84 88
85#endif /* O2CLUSTER_HEARTBEAT_H */ 89#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index fd96e2a2fa56..ea2ed9f56c94 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,7 +119,8 @@
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ 122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
123#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */
123 124
124#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 125#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
125#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 126#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ed0c9f367fed..bb240647ca5f 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
711 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); 711 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
712 spin_lock_init(&node->nd_lock); 712 spin_lock_init(&node->nd_lock);
713 713
714 mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
715
714 return &node->nd_item; 716 return &node->nd_item;
715} 717}
716 718
@@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
744 } 746 }
745 write_unlock(&cluster->cl_nodes_lock); 747 write_unlock(&cluster->cl_nodes_lock);
746 748
749 mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
750 config_item_name(&node->nd_item));
751
747 config_item_put(item); 752 config_item_put(item);
748} 753}
749 754
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854bad571..49b594325bec 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
36/* host name, group name, cluster name all 64 bytes */ 36/* host name, group name, cluster name all 64 bytes */
37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN 37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
38 38
39/*
40 * Maximum number of global heartbeat regions allowed.
41 * **CAUTION** Changing this number will break dlm compatibility.
42 */
43#define O2NM_MAX_REGIONS 32
44
39#endif /* _OCFS2_NODEMANAGER_H */ 45#endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1361997cf205..9aa426e42123 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -977,7 +977,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
977int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, 977int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
978 size_t caller_veclen, u8 target_node, int *status) 978 size_t caller_veclen, u8 target_node, int *status)
979{ 979{
980 int ret; 980 int ret = 0;
981 struct o2net_msg *msg = NULL; 981 struct o2net_msg *msg = NULL;
982 size_t veclen, caller_bytes = 0; 982 size_t veclen, caller_bytes = 0;
983 struct kvec *vec = NULL; 983 struct kvec *vec = NULL;
@@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
1696{ 1696{
1697 o2quo_hb_down(node_num); 1697 o2quo_hb_down(node_num);
1698 1698
1699 if (!node)
1700 return;
1701
1699 if (node_num != o2nm_this_node()) 1702 if (node_num != o2nm_this_node())
1700 o2net_disconnect_node(node); 1703 o2net_disconnect_node(node);
1701 1704
@@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1709 1712
1710 o2quo_hb_up(node_num); 1713 o2quo_hb_up(node_num);
1711 1714
1715 BUG_ON(!node);
1716
1712 /* ensure an immediate connect attempt */ 1717 /* ensure an immediate connect attempt */
1713 nn->nn_last_connect_attempt = jiffies - 1718 nn->nn_last_connect_attempt = jiffies -
1714 (msecs_to_jiffies(o2net_reconnect_delay()) + 1); 1719 (msecs_to_jiffies(o2net_reconnect_delay()) + 1);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 96fa7ebc530c..15fdbdf9eb4b 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -129,7 +129,7 @@ struct o2net_node {
129 129
130struct o2net_sock_container { 130struct o2net_sock_container {
131 struct kref sc_kref; 131 struct kref sc_kref;
132 /* the next two are vaild for the life time of the sc */ 132 /* the next two are valid for the life time of the sc */
133 struct socket *sc_sock; 133 struct socket *sc_sock;
134 struct o2nm_node *sc_node; 134 struct o2nm_node *sc_node;
135 135
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b4957c7d9fe2..edaded48e7e9 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -40,6 +40,14 @@
40#include "inode.h" 40#include "inode.h"
41#include "super.h" 41#include "super.h"
42 42
43void ocfs2_dentry_attach_gen(struct dentry *dentry)
44{
45 unsigned long gen =
46 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
47 BUG_ON(dentry->d_inode);
48 dentry->d_fsdata = (void *)gen;
49}
50
43 51
44static int ocfs2_dentry_revalidate(struct dentry *dentry, 52static int ocfs2_dentry_revalidate(struct dentry *dentry,
45 struct nameidata *nd) 53 struct nameidata *nd)
@@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
51 mlog_entry("(0x%p, '%.*s')\n", dentry, 59 mlog_entry("(0x%p, '%.*s')\n", dentry,
52 dentry->d_name.len, dentry->d_name.name); 60 dentry->d_name.len, dentry->d_name.name);
53 61
54 /* Never trust a negative dentry - force a new lookup. */ 62 /* For a negative dentry -
63 * check the generation number of the parent and compare with the
64 * one stored in the dentry.
65 */
55 if (inode == NULL) { 66 if (inode == NULL) {
56 mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, 67 unsigned long gen = (unsigned long) dentry->d_fsdata;
57 dentry->d_name.name); 68 unsigned long pgen =
58 goto bail; 69 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
70 mlog(0, "negative dentry: %.*s parent gen: %lu "
71 "dentry gen: %lu\n",
72 dentry->d_name.len, dentry->d_name.name, pgen, gen);
73 if (gen != pgen)
74 goto bail;
75 goto valid;
59 } 76 }
60 77
61 BUG_ON(!osb); 78 BUG_ON(!osb);
@@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
96 goto bail; 113 goto bail;
97 } 114 }
98 115
116valid:
99 ret = 1; 117 ret = 1;
100 118
101bail: 119bail:
@@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
227 if (!inode) 245 if (!inode)
228 return 0; 246 return 0;
229 247
248 if (!dentry->d_inode && dentry->d_fsdata) {
249 /* Converting a negative dentry to positive
250 Clear dentry->d_fsdata */
251 dentry->d_fsdata = dl = NULL;
252 }
253
230 if (dl) { 254 if (dl) {
231 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, 255 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
232 " \"%.*s\": old parent: %llu, new: %llu\n", 256 " \"%.*s\": old parent: %llu, new: %llu\n",
@@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
452 476
453out: 477out:
454 iput(inode); 478 iput(inode);
479 ocfs2_dentry_attach_gen(dentry);
455} 480}
456 481
457/* 482/*
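
Together with the dlmglue.c hunk later in this series (which bumps ip_dir_lock_gen whenever a directory's cluster lock is downconverted), these dcache.c changes let negative dentries be cached safely. A condensed, non-kernel sketch of the scheme, using simplified stand-in types rather than the ocfs2 API:

    /* Illustrative only: toy types standing in for inode/dentry. */
    struct dir_sketch    { unsigned long lock_gen; };
    struct dentry_sketch {
            struct dir_sketch *parent;
            void *inode;               /* NULL => negative dentry */
            unsigned long gen;         /* stands in for d_fsdata  */
    };

    static void attach_gen(struct dentry_sketch *d)
    {
            /* done at lookup time and again in d_iput */
            d->gen = d->parent->lock_gen;
    }

    static void dir_changed_remotely(struct dir_sketch *dir)
    {
            dir->lock_gen++;           /* downconvert bumps the gen */
    }

    static int revalidate(struct dentry_sketch *d)
    {
            if (!d->inode)             /* negative dentry */
                    return d->gen == d->parent->lock_gen;
            return 1;  /* positive dentries take the lock-based path */
    }

Previously every negative dentry forced a fresh lookup (the removed "Never trust a negative dentry" comment); now a lookup is forced only once the parent directory has actually changed on some node.
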
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index f5dd1789acf1..b79eff709958 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
64 struct inode *old_dir, struct inode *new_dir); 64 struct inode *old_dir, struct inode *new_dir);
65 65
66extern spinlock_t dentry_attach_lock; 66extern spinlock_t dentry_attach_lock;
67void ocfs2_dentry_attach_gen(struct dentry *dentry);
67 68
68#endif /* OCFS2_DCACHE_H */ 69#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f04ebcfffc4a..c49f6de0e7ab 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3931,6 +3931,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3931 goto out_commit; 3931 goto out_commit;
3932 } 3932 }
3933 3933
3934 cpos = split_hash;
3935 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3936 data_ac, meta_ac, new_dx_leaves,
3937 num_dx_leaves);
3938 if (ret) {
3939 mlog_errno(ret);
3940 goto out_commit;
3941 }
3942
3934 for (i = 0; i < num_dx_leaves; i++) { 3943 for (i = 0; i < num_dx_leaves; i++) {
3935 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), 3944 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3936 orig_dx_leaves[i], 3945 orig_dx_leaves[i],
@@ -3939,15 +3948,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3939 mlog_errno(ret); 3948 mlog_errno(ret);
3940 goto out_commit; 3949 goto out_commit;
3941 } 3950 }
3942 }
3943 3951
3944 cpos = split_hash; 3952 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3945 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, 3953 new_dx_leaves[i],
3946 data_ac, meta_ac, new_dx_leaves, 3954 OCFS2_JOURNAL_ACCESS_WRITE);
3947 num_dx_leaves); 3955 if (ret) {
3948 if (ret) { 3956 mlog_errno(ret);
3949 mlog_errno(ret); 3957 goto out_commit;
3950 goto out_commit; 3958 }
3951 } 3959 }
3952 3960
3953 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, 3961 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
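
The rebalance hunk reorders the work: the new cluster for the split is now allocated before any leaf buffer is journaled, and the single loop then takes journal write access on the original and the new leaf at the same index, so both sets are journaled before entries move. A sketch of the resulting ordering, with hypothetical stand-in helpers rather than the real ocfs2 calls:

    /* Stand-ins for illustration only. */
    static int alloc_split_cluster(void) { return 0; }
    static int journal_leaf_pair(int i)  { (void)i; return 0; }
    static void transfer_entries(void)   { }

    static int rebalance_sketch(int num_leaves)
    {
            int i, ret;

            ret = alloc_split_cluster(); /* fail here: nothing dirtied */
            if (ret)
                    return ret;

            for (i = 0; i < num_leaves; i++) {
                    ret = journal_leaf_pair(i); /* orig[i] and new[i] */
                    if (ret)
                            return ret;
            }

            transfer_entries();
            return 0;
    }
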
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4b6ae2c13b47..b36d0bf77a5a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -445,7 +445,9 @@ enum {
445 DLM_LOCK_REQUEST_MSG, /* 515 */ 445 DLM_LOCK_REQUEST_MSG, /* 515 */
446 DLM_RECO_DATA_DONE_MSG, /* 516 */ 446 DLM_RECO_DATA_DONE_MSG, /* 516 */
447 DLM_BEGIN_RECO_MSG, /* 517 */ 447 DLM_BEGIN_RECO_MSG, /* 517 */
448 DLM_FINALIZE_RECO_MSG /* 518 */ 448 DLM_FINALIZE_RECO_MSG, /* 518 */
449 DLM_QUERY_REGION, /* 519 */
450 DLM_QUERY_NODEINFO, /* 520 */
449}; 451};
450 452
451struct dlm_reco_node_data 453struct dlm_reco_node_data
@@ -727,6 +729,31 @@ struct dlm_cancel_join
727 u8 domain[O2NM_MAX_NAME_LEN]; 729 u8 domain[O2NM_MAX_NAME_LEN];
728}; 730};
729 731
732struct dlm_query_region {
733 u8 qr_node;
734 u8 qr_numregions;
735 u8 qr_namelen;
736 u8 pad1;
737 u8 qr_domain[O2NM_MAX_NAME_LEN];
738 u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
739};
740
741struct dlm_node_info {
742 u8 ni_nodenum;
743 u8 pad1;
744 u16 ni_ipv4_port;
745 u32 ni_ipv4_address;
746};
747
748struct dlm_query_nodeinfo {
749 u8 qn_nodenum;
750 u8 qn_numnodes;
751 u8 qn_namelen;
752 u8 pad1;
753 u8 qn_domain[O2NM_MAX_NAME_LEN];
754 struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
755};
756
730struct dlm_exit_domain 757struct dlm_exit_domain
731{ 758{
732 u8 node_idx; 759 u8 node_idx;
@@ -1030,6 +1057,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
1030 struct dlm_lock_resource *res); 1057 struct dlm_lock_resource *res);
1031void dlm_clean_master_list(struct dlm_ctxt *dlm, 1058void dlm_clean_master_list(struct dlm_ctxt *dlm,
1032 u8 dead_node); 1059 u8 dead_node);
1060void dlm_force_free_mles(struct dlm_ctxt *dlm);
1033int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); 1061int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
1034int __dlm_lockres_has_locks(struct dlm_lock_resource *res); 1062int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
1035int __dlm_lockres_unused(struct dlm_lock_resource *res); 1063int __dlm_lockres_unused(struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 5efdd37dfe48..272ec8631a51 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
493 struct hlist_head *bucket; 493 struct hlist_head *bucket;
494 struct hlist_node *list; 494 struct hlist_node *list;
495 int i, out = 0; 495 int i, out = 0;
496 unsigned long total = 0, longest = 0, bktcnt; 496 unsigned long total = 0, longest = 0, bucket_count = 0;
497 497
498 out += snprintf(db->buf + out, db->len - out, 498 out += snprintf(db->buf + out, db->len - out,
499 "Dumping MLEs for Domain: %s\n", dlm->name); 499 "Dumping MLEs for Domain: %s\n", dlm->name);
@@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
505 mle = hlist_entry(list, struct dlm_master_list_entry, 505 mle = hlist_entry(list, struct dlm_master_list_entry,
506 master_hash_node); 506 master_hash_node);
507 ++total; 507 ++total;
508 ++bktcnt; 508 ++bucket_count;
509 if (db->len - out < 200) 509 if (db->len - out < 200)
510 continue; 510 continue;
511 out += dump_mle(mle, db->buf + out, db->len - out); 511 out += dump_mle(mle, db->buf + out, db->len - out);
512 } 512 }
513 longest = max(longest, bktcnt); 513 longest = max(longest, bucket_count);
514 bktcnt = 0; 514 bucket_count = 0;
515 } 515 }
516 spin_unlock(&dlm->master_lock); 516 spin_unlock(&dlm->master_lock);
517 517
@@ -636,8 +636,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
636 spin_lock(&dlm->track_lock); 636 spin_lock(&dlm->track_lock);
637 if (oldres) 637 if (oldres)
638 track_list = &oldres->tracking; 638 track_list = &oldres->tracking;
639 else 639 else {
640 track_list = &dlm->tracking_list; 640 track_list = &dlm->tracking_list;
641 if (list_empty(track_list)) {
642 dl = NULL;
643 spin_unlock(&dlm->track_lock);
644 goto bail;
645 }
646 }
641 647
642 list_for_each_entry(res, track_list, tracking) { 648 list_for_each_entry(res, track_list, tracking) {
643 if (&res->tracking == &dlm->tracking_list) 649 if (&res->tracking == &dlm->tracking_list)
@@ -660,6 +666,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
660 } else 666 } else
661 dl = NULL; 667 dl = NULL;
662 668
669bail:
663 /* passed to seq_show */ 670 /* passed to seq_show */
664 return dl; 671 return dl;
665} 672}
@@ -775,7 +782,9 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
775 782
776 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ 783 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
777 out += snprintf(db->buf + out, db->len - out, 784 out += snprintf(db->buf + out, db->len - out,
778 "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key); 785 "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
786 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
787 dlm->dlm_locking_proto.pv_minor);
779 788
780 /* Thread Pid: xxx Node: xxx State: xxxxx */ 789 /* Thread Pid: xxx Node: xxx State: xxxxx */
781 out += snprintf(db->buf + out, db->len - out, 790 out += snprintf(db->buf + out, db->len - out,
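
The else-branch added to lockres_seq_start() above handles the first read of the debugfs file on a domain whose tracking list is empty: it returns NULL (after dropping track_lock) so the seq_file iteration ends cleanly instead of starting a walk on an empty list. A minimal userspace rendering of the guard, with a toy list type:

    #include <stddef.h>

    struct list { struct list *next, *prev; }; /* toy circular list */

    static int list_is_empty(const struct list *head)
    {
            return head->next == head;
    }

    static void *seq_start_sketch(struct list *track_list)
    {
            /* Empty tracking list: report end-of-sequence up front. */
            if (list_is_empty(track_list))
                    return NULL;
            return track_list->next;   /* first real entry */
    }
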
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 153abb5abef0..58a93b953735 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -128,10 +128,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
128 * will have a negotiated version with the same major number and a minor 128 * will have a negotiated version with the same major number and a minor
129 * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should 129 * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should
130 * be used to determine what a running domain is actually using. 130 * be used to determine what a running domain is actually using.
131 *
132 * New in version 1.1:
133 * - Message DLM_QUERY_REGION added to support global heartbeat
134 * - Message DLM_QUERY_NODEINFO added to allow online node removes
131 */ 135 */
132static const struct dlm_protocol_version dlm_protocol = { 136static const struct dlm_protocol_version dlm_protocol = {
133 .pv_major = 1, 137 .pv_major = 1,
134 .pv_minor = 0, 138 .pv_minor = 1,
135}; 139};
136 140
137#define DLM_DOMAIN_BACKOFF_MS 200 141#define DLM_DOMAIN_BACKOFF_MS 200
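
The bump from 1.0 to 1.1 is what makes the two new messages safe to deploy on mixed clusters: a 1.1 node only sends DLM_QUERY_NODEINFO and DLM_QUERY_REGION once the negotiated protocol allows it, and the handlers below reject queries that arrive on a domain still running 1.0. The gate, as a self-contained sketch:

    /* Illustrative version gate mirroring the handler checks. */
    static int proto_supports_query(int pv_major, int pv_minor)
    {
            /* DLM_QUERY_REGION / DLM_QUERY_NODEINFO exist since 1.1 */
            return pv_major > 1 || (pv_major == 1 && pv_minor >= 1);
    }
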
@@ -142,6 +146,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
142 void **ret_data); 146 void **ret_data);
143static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 147static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
144 void **ret_data); 148 void **ret_data);
149static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
150 void *data, void **ret_data);
145static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, 151static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
146 void **ret_data); 152 void **ret_data);
147static int dlm_protocol_compare(struct dlm_protocol_version *existing, 153static int dlm_protocol_compare(struct dlm_protocol_version *existing,
@@ -693,6 +699,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
693 699
694 dlm_mark_domain_leaving(dlm); 700 dlm_mark_domain_leaving(dlm);
695 dlm_leave_domain(dlm); 701 dlm_leave_domain(dlm);
702 dlm_force_free_mles(dlm);
696 dlm_complete_dlm_shutdown(dlm); 703 dlm_complete_dlm_shutdown(dlm);
697 } 704 }
698 dlm_put(dlm); 705 dlm_put(dlm);
@@ -920,6 +927,370 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
920 return 0; 927 return 0;
921} 928}
922 929
930static int dlm_match_regions(struct dlm_ctxt *dlm,
931 struct dlm_query_region *qr)
932{
933 char *local = NULL, *remote = qr->qr_regions;
934 char *l, *r;
935 int localnr, i, j, foundit;
936 int status = 0;
937
938 if (!o2hb_global_heartbeat_active()) {
939 if (qr->qr_numregions) {
940 mlog(ML_ERROR, "Domain %s: Joining node %d has global "
941 "heartbeat enabled but local node %d does not\n",
942 qr->qr_domain, qr->qr_node, dlm->node_num);
943 status = -EINVAL;
944 }
945 goto bail;
946 }
947
948 if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
949 mlog(ML_ERROR, "Domain %s: Local node %d has global "
950 "heartbeat enabled but joining node %d does not\n",
951 qr->qr_domain, dlm->node_num, qr->qr_node);
952 status = -EINVAL;
953 goto bail;
954 }
955
956 r = remote;
957 for (i = 0; i < qr->qr_numregions; ++i) {
958 mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
959 r += O2HB_MAX_REGION_NAME_LEN;
960 }
961
962 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
963 if (!local) {
964 status = -ENOMEM;
965 goto bail;
966 }
967
968 localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
969
970 /* compare local regions with remote */
971 l = local;
972 for (i = 0; i < localnr; ++i) {
973 foundit = 0;
974 r = remote;
 975 for (j = 0; j < qr->qr_numregions; ++j) {
976 if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
977 foundit = 1;
978 break;
979 }
980 r += O2HB_MAX_REGION_NAME_LEN;
981 }
982 if (!foundit) {
983 status = -EINVAL;
984 mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
985 "in local node %d but not in joining node %d\n",
986 qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
987 dlm->node_num, qr->qr_node);
988 goto bail;
989 }
990 l += O2HB_MAX_REGION_NAME_LEN;
991 }
992
993 /* compare remote with local regions */
994 r = remote;
995 for (i = 0; i < qr->qr_numregions; ++i) {
996 foundit = 0;
997 l = local;
998 for (j = 0; j < localnr; ++j) {
999 if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
1000 foundit = 1;
1001 break;
1002 }
1003 l += O2HB_MAX_REGION_NAME_LEN;
1004 }
1005 if (!foundit) {
1006 status = -EINVAL;
1007 mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
1008 "in joining node %d but not in local node %d\n",
1009 qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
1010 qr->qr_node, dlm->node_num);
1011 goto bail;
1012 }
1013 r += O2HB_MAX_REGION_NAME_LEN;
1014 }
1015
1016bail:
1017 kfree(local);
1018
1019 return status;
1020}
1021
1022static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
1023{
1024 struct dlm_query_region *qr = NULL;
1025 int status, ret = 0, i;
1026 char *p;
1027
1028 if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
1029 goto bail;
1030
1031 qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
1032 if (!qr) {
1033 ret = -ENOMEM;
1034 mlog_errno(ret);
1035 goto bail;
1036 }
1037
1038 qr->qr_node = dlm->node_num;
1039 qr->qr_namelen = strlen(dlm->name);
1040 memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
1041 /* if local hb, the numregions will be zero */
1042 if (o2hb_global_heartbeat_active())
1043 qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
1044 O2NM_MAX_REGIONS);
1045
1046 p = qr->qr_regions;
1047 for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN)
1048 mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p);
1049
1050 i = -1;
1051 while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
1052 i + 1)) < O2NM_MAX_NODES) {
1053 if (i == dlm->node_num)
1054 continue;
1055
1056 mlog(0, "Sending regions to node %d\n", i);
1057
1058 ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
1059 sizeof(struct dlm_query_region),
1060 i, &status);
1061 if (ret >= 0)
1062 ret = status;
1063 if (ret) {
1064 mlog(ML_ERROR, "Region mismatch %d, node %d\n",
1065 ret, i);
1066 break;
1067 }
1068 }
1069
1070bail:
1071 kfree(qr);
1072 return ret;
1073}
1074
1075static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1076 void *data, void **ret_data)
1077{
1078 struct dlm_query_region *qr;
1079 struct dlm_ctxt *dlm = NULL;
1080 int status = 0;
1081 int locked = 0;
1082
1083 qr = (struct dlm_query_region *) msg->buf;
1084
1085 mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
1086 qr->qr_domain);
1087
1088 status = -EINVAL;
1089
1090 spin_lock(&dlm_domain_lock);
1091 dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
1092 if (!dlm) {
1093 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1094 "before join domain\n", qr->qr_node, qr->qr_domain);
1095 goto bail;
1096 }
1097
1098 spin_lock(&dlm->spinlock);
1099 locked = 1;
1100 if (dlm->joining_node != qr->qr_node) {
1101 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1102 "but joining node is %d\n", qr->qr_node, qr->qr_domain,
1103 dlm->joining_node);
1104 goto bail;
1105 }
1106
1107 /* Support for global heartbeat was added in 1.1 */
1108 if (dlm->dlm_locking_proto.pv_major == 1 &&
1109 dlm->dlm_locking_proto.pv_minor == 0) {
1110 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1111 "but active dlm protocol is %d.%d\n", qr->qr_node,
1112 qr->qr_domain, dlm->dlm_locking_proto.pv_major,
1113 dlm->dlm_locking_proto.pv_minor);
1114 goto bail;
1115 }
1116
1117 status = dlm_match_regions(dlm, qr);
1118
1119bail:
1120 if (locked)
1121 spin_unlock(&dlm->spinlock);
1122 spin_unlock(&dlm_domain_lock);
1123
1124 return status;
1125}
1126
1127static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
1128{
1129 struct o2nm_node *local;
1130 struct dlm_node_info *remote;
1131 int i, j;
1132 int status = 0;
1133
1134 for (j = 0; j < qn->qn_numnodes; ++j)
1135 mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum,
1136 &(qn->qn_nodes[j].ni_ipv4_address),
1137 ntohs(qn->qn_nodes[j].ni_ipv4_port));
1138
1139 for (i = 0; i < O2NM_MAX_NODES && !status; ++i) {
1140 local = o2nm_get_node_by_num(i);
1141 remote = NULL;
1142 for (j = 0; j < qn->qn_numnodes; ++j) {
1143 if (qn->qn_nodes[j].ni_nodenum == i) {
1144 remote = &(qn->qn_nodes[j]);
1145 break;
1146 }
1147 }
1148
1149 if (!local && !remote)
1150 continue;
1151
1152 if ((local && !remote) || (!local && remote))
1153 status = -EINVAL;
1154
1155 if (!status &&
1156 ((remote->ni_nodenum != local->nd_num) ||
1157 (remote->ni_ipv4_port != local->nd_ipv4_port) ||
1158 (remote->ni_ipv4_address != local->nd_ipv4_address)))
1159 status = -EINVAL;
1160
1161 if (status) {
1162 if (remote && !local)
1163 mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
1164 "registered in joining node %d but not in "
1165 "local node %d\n", qn->qn_domain,
1166 remote->ni_nodenum,
1167 &(remote->ni_ipv4_address),
1168 ntohs(remote->ni_ipv4_port),
1169 qn->qn_nodenum, dlm->node_num);
1170 if (local && !remote)
1171 mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
1172 "registered in local node %d but not in "
1173 "joining node %d\n", qn->qn_domain,
1174 local->nd_num, &(local->nd_ipv4_address),
1175 ntohs(local->nd_ipv4_port),
1176 dlm->node_num, qn->qn_nodenum);
1177 BUG_ON((!local && !remote));
1178 }
1179
1180 if (local)
1181 o2nm_node_put(local);
1182 }
1183
1184 return status;
1185}
1186
1187static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
1188{
1189 struct dlm_query_nodeinfo *qn = NULL;
1190 struct o2nm_node *node;
1191 int ret = 0, status, count, i;
1192
1193 if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
1194 goto bail;
1195
1196 qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
1197 if (!qn) {
1198 ret = -ENOMEM;
1199 mlog_errno(ret);
1200 goto bail;
1201 }
1202
1203 for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) {
1204 node = o2nm_get_node_by_num(i);
1205 if (!node)
1206 continue;
1207 qn->qn_nodes[count].ni_nodenum = node->nd_num;
1208 qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port;
1209 qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address;
1210 mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
1211 &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
1212 ++count;
1213 o2nm_node_put(node);
1214 }
1215
1216 qn->qn_nodenum = dlm->node_num;
1217 qn->qn_numnodes = count;
1218 qn->qn_namelen = strlen(dlm->name);
1219 memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
1220
1221 i = -1;
1222 while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
1223 i + 1)) < O2NM_MAX_NODES) {
1224 if (i == dlm->node_num)
1225 continue;
1226
1227 mlog(0, "Sending nodeinfo to node %d\n", i);
1228
1229 ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
1230 qn, sizeof(struct dlm_query_nodeinfo),
1231 i, &status);
1232 if (ret >= 0)
1233 ret = status;
1234 if (ret) {
1235 mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i);
1236 break;
1237 }
1238 }
1239
1240bail:
1241 kfree(qn);
1242 return ret;
1243}
1244
1245static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
1246 void *data, void **ret_data)
1247{
1248 struct dlm_query_nodeinfo *qn;
1249 struct dlm_ctxt *dlm = NULL;
1250 int locked = 0, status = -EINVAL;
1251
1252 qn = (struct dlm_query_nodeinfo *) msg->buf;
1253
1254 mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
1255 qn->qn_domain);
1256
1257 spin_lock(&dlm_domain_lock);
1258 dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
1259 if (!dlm) {
1260 mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
1261 "join domain\n", qn->qn_nodenum, qn->qn_domain);
1262 goto bail;
1263 }
1264
1265 spin_lock(&dlm->spinlock);
1266 locked = 1;
1267 if (dlm->joining_node != qn->qn_nodenum) {
1268 mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
1269 "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
1270 dlm->joining_node);
1271 goto bail;
1272 }
1273
1274 /* Support for node query was added in 1.1 */
1275 if (dlm->dlm_locking_proto.pv_major == 1 &&
1276 dlm->dlm_locking_proto.pv_minor == 0) {
1277 mlog(ML_ERROR, "Node %d queried nodes on domain %s "
1278 "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
1279 qn->qn_domain, dlm->dlm_locking_proto.pv_major,
1280 dlm->dlm_locking_proto.pv_minor);
1281 goto bail;
1282 }
1283
1284 status = dlm_match_nodes(dlm, qn);
1285
1286bail:
1287 if (locked)
1288 spin_unlock(&dlm->spinlock);
1289 spin_unlock(&dlm_domain_lock);
1290
1291 return status;
1292}
1293
923static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 1294static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
924 void **ret_data) 1295 void **ret_data)
925{ 1296{
@@ -1240,6 +1611,20 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1240 set_bit(dlm->node_num, dlm->domain_map); 1611 set_bit(dlm->node_num, dlm->domain_map);
1241 spin_unlock(&dlm->spinlock); 1612 spin_unlock(&dlm->spinlock);
1242 1613
1614 /* Support for global heartbeat and node info was added in 1.1 */
1615 if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) {
1616 status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
1617 if (status) {
1618 mlog_errno(status);
1619 goto bail;
1620 }
1621 status = dlm_send_regions(dlm, ctxt->yes_resp_map);
1622 if (status) {
1623 mlog_errno(status);
1624 goto bail;
1625 }
1626 }
1627
1243 dlm_send_join_asserts(dlm, ctxt->yes_resp_map); 1628 dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
1244 1629
1245 /* Joined state *must* be set before the joining node 1630 /* Joined state *must* be set before the joining node
@@ -1806,7 +2191,21 @@ static int dlm_register_net_handlers(void)
1806 sizeof(struct dlm_cancel_join), 2191 sizeof(struct dlm_cancel_join),
1807 dlm_cancel_join_handler, 2192 dlm_cancel_join_handler,
1808 NULL, NULL, &dlm_join_handlers); 2193 NULL, NULL, &dlm_join_handlers);
2194 if (status)
2195 goto bail;
2196
2197 status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
2198 sizeof(struct dlm_query_region),
2199 dlm_query_region_handler,
2200 NULL, NULL, &dlm_join_handlers);
1809 2201
2202 if (status)
2203 goto bail;
2204
2205 status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
2206 sizeof(struct dlm_query_nodeinfo),
2207 dlm_query_nodeinfo_handler,
2208 NULL, NULL, &dlm_join_handlers);
1810bail: 2209bail:
1811 if (status < 0) 2210 if (status < 0)
1812 dlm_unregister_net_handlers(); 2211 dlm_unregister_net_handlers();
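
Taken together, the dlmdomain.c additions give the join path a two-step consistency handshake: once the yes votes are in, the joining node pushes its node table and then its heartbeat-region table to every voter, and any mismatch aborts the join before dlm_send_join_asserts() runs. The control flow, condensed with stand-in helpers (not the real signatures):

    static int push_nodeinfo(void) { return 0; } /* DLM_QUERY_NODEINFO */
    static int push_regions(void)  { return 0; } /* DLM_QUERY_REGION   */

    static int join_handshake_sketch(void)
    {
            int status;

            status = push_nodeinfo(); /* node nums, IPs, ports match? */
            if (status)
                    return status;    /* abort: cluster config skew   */

            status = push_regions();  /* same heartbeat regions?      */
            if (status)
                    return status;    /* abort: heartbeat mismatch    */

            return 0;                 /* safe to assert the join      */
    }
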
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ffb4c68dafa4..f564b0e5f80d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -3433,3 +3433,43 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
3433 wake_up(&res->wq); 3433 wake_up(&res->wq);
3434 wake_up(&dlm->migration_wq); 3434 wake_up(&dlm->migration_wq);
3435} 3435}
3436
3437void dlm_force_free_mles(struct dlm_ctxt *dlm)
3438{
3439 int i;
3440 struct hlist_head *bucket;
3441 struct dlm_master_list_entry *mle;
3442 struct hlist_node *tmp, *list;
3443
3444 /*
3445 * We notified all other nodes that we are exiting the domain and
3446 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
3447 * around we force free them and wake any processes that are waiting
3448 * on the mles
3449 */
3450 spin_lock(&dlm->spinlock);
3451 spin_lock(&dlm->master_lock);
3452
3453 BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
3454 BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
3455
3456 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3457 bucket = dlm_master_hash(dlm, i);
3458 hlist_for_each_safe(list, tmp, bucket) {
3459 mle = hlist_entry(list, struct dlm_master_list_entry,
3460 master_hash_node);
3461 if (mle->type != DLM_MLE_BLOCK) {
3462 mlog(ML_ERROR, "bad mle: %p\n", mle);
3463 dlm_print_one_mle(mle);
3464 }
3465 atomic_set(&mle->woken, 1);
3466 wake_up(&mle->wq);
3467
3468 __dlm_unlink_mle(dlm, mle);
3469 __dlm_mle_detach_hb_events(dlm, mle);
3470 __dlm_put_mle(mle);
3471 }
3472 }
3473 spin_unlock(&dlm->master_lock);
3474 spin_unlock(&dlm->spinlock);
3475}
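
dlm_force_free_mles() must use hlist_for_each_safe() because the loop body unlinks and drops the entry it is standing on; the _safe variant caches the next pointer before the body runs. The same pattern in plain C, on a toy singly-linked list:

    #include <stdlib.h>

    struct node { struct node *next; };

    static void free_all(struct node **head)
    {
            struct node *n = *head, *next;

            while (n) {
                    next = n->next; /* cache before freeing, as _safe does */
                    free(n);
                    n = next;
            }
            *head = NULL;
    }
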
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index c2903b84bb7a..75e115f1bd73 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -400,6 +400,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
400 if (inode) { 400 if (inode) {
401 ip = DLMFS_I(inode); 401 ip = DLMFS_I(inode);
402 402
403 inode->i_ino = get_next_ino();
403 inode->i_mode = mode; 404 inode->i_mode = mode;
404 inode->i_uid = current_fsuid(); 405 inode->i_uid = current_fsuid();
405 inode->i_gid = current_fsgid(); 406 inode->i_gid = current_fsgid();
@@ -425,6 +426,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
425 if (!inode) 426 if (!inode)
426 return NULL; 427 return NULL;
427 428
429 inode->i_ino = get_next_ino();
428 inode->i_mode = mode; 430 inode->i_mode = mode;
429 inode->i_uid = current_fsuid(); 431 inode->i_uid = current_fsuid();
430 inode->i_gid = current_fsgid(); 432 inode->i_gid = current_fsgid();
@@ -612,6 +614,7 @@ static const struct file_operations dlmfs_file_operations = {
612 .poll = dlmfs_file_poll, 614 .poll = dlmfs_file_poll,
613 .read = dlmfs_file_read, 615 .read = dlmfs_file_read,
614 .write = dlmfs_file_write, 616 .write = dlmfs_file_write,
617 .llseek = default_llseek,
615}; 618};
616 619
617static const struct inode_operations dlmfs_dir_inode_operations = { 620static const struct inode_operations dlmfs_dir_inode_operations = {
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 5e02a893f46e..e8d94d722ecb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
3635{ 3635{
3636 struct inode *inode; 3636 struct inode *inode;
3637 struct address_space *mapping; 3637 struct address_space *mapping;
3638 struct ocfs2_inode_info *oi;
3638 3639
3639 inode = ocfs2_lock_res_inode(lockres); 3640 inode = ocfs2_lock_res_inode(lockres);
3640 mapping = inode->i_mapping; 3641 mapping = inode->i_mapping;
3641 3642
3643 if (S_ISDIR(inode->i_mode)) {
3644 oi = OCFS2_I(inode);
3645 oi->ip_dir_lock_gen++;
3646 mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
3647 goto out;
3648 }
3649
3642 if (!S_ISREG(inode->i_mode)) 3650 if (!S_ISREG(inode->i_mode))
3643 goto out; 3651 goto out;
3644 3652
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d1ce48e1b3d6..1d596d8c4a4a 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -84,6 +84,7 @@ enum {
84 OI_LS_PARENT, 84 OI_LS_PARENT,
85 OI_LS_RENAME1, 85 OI_LS_RENAME1,
86 OI_LS_RENAME2, 86 OI_LS_RENAME2,
87 OI_LS_REFLINK_TARGET,
87}; 88};
88 89
89int ocfs2_dlm_init(struct ocfs2_super *osb); 90int ocfs2_dlm_init(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9a03c151b5ce..77b4c04a2809 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -64,12 +64,6 @@
64 64
65#include "buffer_head_io.h" 65#include "buffer_head_io.h"
66 66
67static int ocfs2_sync_inode(struct inode *inode)
68{
69 filemap_fdatawrite(inode->i_mapping);
70 return sync_mapping_buffers(inode->i_mapping);
71}
72
73static int ocfs2_init_file_private(struct inode *inode, struct file *file) 67static int ocfs2_init_file_private(struct inode *inode, struct file *file)
74{ 68{
75 struct ocfs2_file_private *fp; 69 struct ocfs2_file_private *fp;
@@ -180,16 +174,12 @@ static int ocfs2_sync_file(struct file *file, int datasync)
180{ 174{
181 int err = 0; 175 int err = 0;
182 journal_t *journal; 176 journal_t *journal;
183 struct dentry *dentry = file->f_path.dentry;
184 struct inode *inode = file->f_mapping->host; 177 struct inode *inode = file->f_mapping->host;
185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
186 179
187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 180 mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
188 dentry->d_name.len, dentry->d_name.name); 181 file->f_path.dentry, file->f_path.dentry->d_name.len,
189 182 file->f_path.dentry->d_name.name);
190 err = ocfs2_sync_inode(dentry->d_inode);
191 if (err)
192 goto bail;
193 183
194 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 184 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
195 /* 185 /*
@@ -197,8 +187,7 @@ static int ocfs2_sync_file(struct file *file, int datasync)
197 * platter 187 * platter
198 */ 188 */
199 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 189 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
200 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 190 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
201 NULL, BLKDEV_IFL_WAIT);
202 goto bail; 191 goto bail;
203 } 192 }
204 193
@@ -370,7 +359,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
370 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 359 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
371 goto out; 360 goto out;
372 361
373 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); 362 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
374 363
375out: 364out:
376 return status; 365 return status;
@@ -807,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
807 block_end = block_start + (1 << inode->i_blkbits); 796 block_end = block_start + (1 << inode->i_blkbits);
808 797
809 /* 798 /*
810 * block_start is block-aligned. Bump it by one to 799 * block_start is block-aligned. Bump it by one to force
811 * force ocfs2_{prepare,commit}_write() to zero the 800 * __block_write_begin and block_commit_write to zero the
812 * whole block. 801 * whole block.
813 */ 802 */
814 ret = ocfs2_prepare_write_nolock(inode, page, 803 ret = __block_write_begin(page, block_start + 1, 0,
815 block_start + 1, 804 ocfs2_get_block);
816 block_start + 1);
817 if (ret < 0) { 805 if (ret < 0) {
818 mlog_errno(ret); 806 mlog_errno(ret);
819 goto out_unlock; 807 goto out_unlock;
@@ -913,8 +901,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
913 zero_clusters = last_cpos - zero_cpos; 901 zero_clusters = last_cpos - zero_cpos;
914 902
915 if (needs_cow) { 903 if (needs_cow) {
916 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, 904 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
917 UINT_MAX); 905 zero_clusters, UINT_MAX);
918 if (rc) { 906 if (rc) {
919 mlog_errno(rc); 907 mlog_errno(rc);
920 goto out; 908 goto out;
@@ -2062,6 +2050,7 @@ out:
2062} 2050}
2063 2051
2064static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2052static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2053 struct file *file,
2065 loff_t pos, size_t count, 2054 loff_t pos, size_t count,
2066 int *meta_level) 2055 int *meta_level)
2067{ 2056{
@@ -2079,7 +2068,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2079 2068
2080 *meta_level = 1; 2069 *meta_level = 1;
2081 2070
2082 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); 2071 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
2083 if (ret) 2072 if (ret)
2084 mlog_errno(ret); 2073 mlog_errno(ret);
2085out: 2074out:
@@ -2087,7 +2076,7 @@ out:
2087 return ret; 2076 return ret;
2088} 2077}
2089 2078
2090static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 2079static int ocfs2_prepare_inode_for_write(struct file *file,
2091 loff_t *ppos, 2080 loff_t *ppos,
2092 size_t count, 2081 size_t count,
2093 int appending, 2082 int appending,
@@ -2095,6 +2084,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
2095 int *has_refcount) 2084 int *has_refcount)
2096{ 2085{
2097 int ret = 0, meta_level = 0; 2086 int ret = 0, meta_level = 0;
2087 struct dentry *dentry = file->f_path.dentry;
2098 struct inode *inode = dentry->d_inode; 2088 struct inode *inode = dentry->d_inode;
2099 loff_t saved_pos, end; 2089 loff_t saved_pos, end;
2100 2090
@@ -2150,6 +2140,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
2150 meta_level = -1; 2140 meta_level = -1;
2151 2141
2152 ret = ocfs2_prepare_inode_for_refcount(inode, 2142 ret = ocfs2_prepare_inode_for_refcount(inode,
2143 file,
2153 saved_pos, 2144 saved_pos,
2154 count, 2145 count,
2155 &meta_level); 2146 &meta_level);
@@ -2232,6 +2223,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2232 struct file *file = iocb->ki_filp; 2223 struct file *file = iocb->ki_filp;
2233 struct inode *inode = file->f_path.dentry->d_inode; 2224 struct inode *inode = file->f_path.dentry->d_inode;
2234 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2225 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2226 int full_coherency = !(osb->s_mount_opt &
2227 OCFS2_MOUNT_COHERENCY_BUFFERED);
2235 2228
2236 mlog_entry("(0x%p, %u, '%.*s')\n", file, 2229 mlog_entry("(0x%p, %u, '%.*s')\n", file,
2237 (unsigned int)nr_segs, 2230 (unsigned int)nr_segs,
@@ -2255,16 +2248,39 @@ relock:
2255 have_alloc_sem = 1; 2248 have_alloc_sem = 1;
2256 } 2249 }
2257 2250
2258 /* concurrent O_DIRECT writes are allowed */ 2251 /*
2259 rw_level = !direct_io; 2252 * Concurrent O_DIRECT writes are allowed with
 2253 * mount option "coherency=buffered".
2254 */
2255 rw_level = (!direct_io || full_coherency);
2256
2260 ret = ocfs2_rw_lock(inode, rw_level); 2257 ret = ocfs2_rw_lock(inode, rw_level);
2261 if (ret < 0) { 2258 if (ret < 0) {
2262 mlog_errno(ret); 2259 mlog_errno(ret);
2263 goto out_sems; 2260 goto out_sems;
2264 } 2261 }
2265 2262
2263 /*
2264 * O_DIRECT writes with "coherency=full" need to take EX cluster
2265 * inode_lock to guarantee coherency.
2266 */
2267 if (direct_io && full_coherency) {
2268 /*
2269 * We need to take and drop the inode lock to force
2270 * other nodes to drop their caches. Buffered I/O
2271 * already does this in write_begin().
2272 */
2273 ret = ocfs2_inode_lock(inode, NULL, 1);
2274 if (ret < 0) {
2275 mlog_errno(ret);
2276 goto out_sems;
2277 }
2278
2279 ocfs2_inode_unlock(inode, 1);
2280 }
2281
2266 can_do_direct = direct_io; 2282 can_do_direct = direct_io;
2267 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 2283 ret = ocfs2_prepare_inode_for_write(file, ppos,
2268 iocb->ki_left, appending, 2284 iocb->ki_left, appending,
2269 &can_do_direct, &has_refcount); 2285 &can_do_direct, &has_refcount);
2270 if (ret < 0) { 2286 if (ret < 0) {
@@ -2312,17 +2328,6 @@ relock:
2312 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2328 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
2313 ppos, count, ocount); 2329 ppos, count, ocount);
2314 if (written < 0) { 2330 if (written < 0) {
2315 /*
2316 * direct write may have instantiated a few
2317 * blocks outside i_size. Trim these off again.
2318 * Don't need i_size_read because we hold i_mutex.
2319 *
2320 * XXX(truncate): this looks buggy because ocfs2 did not
2321 * actually implement ->truncate. Take a look at
2322 * the new truncate sequence and update this accordingly
2323 */
2324 if (*ppos + count > inode->i_size)
2325 truncate_setsize(inode, inode->i_size);
2326 ret = written; 2331 ret = written;
2327 goto out_dio; 2332 goto out_dio;
2328 } 2333 }
@@ -2394,7 +2399,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2394{ 2399{
2395 int ret; 2400 int ret;
2396 2401
2397 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 2402 ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2398 sd->total_len, 0, NULL, NULL); 2403 sd->total_len, 0, NULL, NULL);
2399 if (ret < 0) { 2404 if (ret < 0) {
2400 mlog_errno(ret); 2405 mlog_errno(ret);
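
The net effect of the aio_write changes above is a three-way lock policy keyed off the new coherency mount option: buffered writes keep taking the rw lock EX as before; O_DIRECT under coherency=full also takes it EX, plus one take-and-drop of the EX inode lock to force remote caches out; only O_DIRECT under coherency=buffered stays at PR, allowing concurrent O_DIRECT writers across the cluster. The level choice itself, restated as a sketch:

    /* Sketch of the rw_level computation in ocfs2_file_aio_write(). */
    static int choose_rw_level(int direct_io, int full_coherency)
    {
            return !direct_io || full_coherency; /* 1 = EX, 0 = PR */
    }
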
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index eece3e05d9d0..f935fd6600dd 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
335 else 335 else
336 inode->i_fop = &ocfs2_dops_no_plocks; 336 inode->i_fop = &ocfs2_dops_no_plocks;
337 i_size_write(inode, le64_to_cpu(fe->i_size)); 337 i_size_write(inode, le64_to_cpu(fe->i_size));
338 OCFS2_I(inode)->ip_dir_lock_gen = 1;
338 break; 339 break;
339 case S_IFLNK: 340 case S_IFLNK:
340 if (ocfs2_inode_is_fast_symlink(inode)) 341 if (ocfs2_inode_is_fast_symlink(inode))
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 6de5a869db30..1c508b149b3a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,30 +46,28 @@ struct ocfs2_inode_info
46 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
47 spinlock_t ip_lock; 47 spinlock_t ip_lock;
48 u32 ip_open_count; 48 u32 ip_open_count;
49 u32 ip_clusters;
50 struct list_head ip_io_markers; 49 struct list_head ip_io_markers;
50 u32 ip_clusters;
51 51
52 u16 ip_dyn_features;
52 struct mutex ip_io_mutex; 53 struct mutex ip_io_mutex;
53
54 u32 ip_flags; /* see below */ 54 u32 ip_flags; /* see below */
55 u32 ip_attr; /* inode attributes */ 55 u32 ip_attr; /* inode attributes */
56 u16 ip_dyn_features;
57 56
58 /* protected by recovery_lock. */ 57 /* protected by recovery_lock. */
59 struct inode *ip_next_orphan; 58 struct inode *ip_next_orphan;
60 59
61 u32 ip_dir_start_lookup;
62
63 struct ocfs2_caching_info ip_metadata_cache; 60 struct ocfs2_caching_info ip_metadata_cache;
64
65 struct ocfs2_extent_map ip_extent_map; 61 struct ocfs2_extent_map ip_extent_map;
66
67 struct inode vfs_inode; 62 struct inode vfs_inode;
68 struct jbd2_inode ip_jinode; 63 struct jbd2_inode ip_jinode;
69 64
65 u32 ip_dir_start_lookup;
66
70 /* Only valid if the inode is the dir. */ 67 /* Only valid if the inode is the dir. */
71 u32 ip_last_used_slot; 68 u32 ip_last_used_slot;
72 u64 ip_last_used_group; 69 u64 ip_last_used_group;
70 u32 ip_dir_lock_gen;
73 71
74 struct ocfs2_alloc_reservation ip_la_data_resv; 72 struct ocfs2_alloc_reservation ip_la_data_resv;
75}; 73};
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7d9d9c132cef..7a4868196152 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -26,6 +26,26 @@
26 26
27#include <linux/ext2_fs.h> 27#include <linux/ext2_fs.h>
28 28
29#define o2info_from_user(a, b) \
30 copy_from_user(&(a), (b), sizeof(a))
31#define o2info_to_user(a, b) \
32 copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
33
34/*
35 * This call is void because we are already reporting an error that may
36 * be -EFAULT. The error will be returned from the ioctl(2) call. It's
37 * just a best-effort to tell userspace that this request caused the error.
38 */
39static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
40 struct ocfs2_info_request __user *req)
41{
42 kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
43 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
44}
45
46#define o2info_set_request_error(a, b) \
47 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
48
29static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 49static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
30{ 50{
31 int status; 51 int status;
@@ -109,6 +129,328 @@ bail:
109 return status; 129 return status;
110} 130}
111 131
132int ocfs2_info_handle_blocksize(struct inode *inode,
133 struct ocfs2_info_request __user *req)
134{
135 int status = -EFAULT;
136 struct ocfs2_info_blocksize oib;
137
138 if (o2info_from_user(oib, req))
139 goto bail;
140
141 oib.ib_blocksize = inode->i_sb->s_blocksize;
142 oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
143
144 if (o2info_to_user(oib, req))
145 goto bail;
146
147 status = 0;
148bail:
149 if (status)
150 o2info_set_request_error(oib, req);
151
152 return status;
153}
154
155int ocfs2_info_handle_clustersize(struct inode *inode,
156 struct ocfs2_info_request __user *req)
157{
158 int status = -EFAULT;
159 struct ocfs2_info_clustersize oic;
160 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
161
162 if (o2info_from_user(oic, req))
163 goto bail;
164
165 oic.ic_clustersize = osb->s_clustersize;
166 oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
167
168 if (o2info_to_user(oic, req))
169 goto bail;
170
171 status = 0;
172bail:
173 if (status)
174 o2info_set_request_error(oic, req);
175
176 return status;
177}
178
179int ocfs2_info_handle_maxslots(struct inode *inode,
180 struct ocfs2_info_request __user *req)
181{
182 int status = -EFAULT;
183 struct ocfs2_info_maxslots oim;
184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
185
186 if (o2info_from_user(oim, req))
187 goto bail;
188
189 oim.im_max_slots = osb->max_slots;
190 oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
191
192 if (o2info_to_user(oim, req))
193 goto bail;
194
195 status = 0;
196bail:
197 if (status)
198 o2info_set_request_error(oim, req);
199
200 return status;
201}
202
203int ocfs2_info_handle_label(struct inode *inode,
204 struct ocfs2_info_request __user *req)
205{
206 int status = -EFAULT;
207 struct ocfs2_info_label oil;
208 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
209
210 if (o2info_from_user(oil, req))
211 goto bail;
212
213 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
214 oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
215
216 if (o2info_to_user(oil, req))
217 goto bail;
218
219 status = 0;
220bail:
221 if (status)
222 o2info_set_request_error(oil, req);
223
224 return status;
225}
226
227int ocfs2_info_handle_uuid(struct inode *inode,
228 struct ocfs2_info_request __user *req)
229{
230 int status = -EFAULT;
231 struct ocfs2_info_uuid oiu;
232 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
233
234 if (o2info_from_user(oiu, req))
235 goto bail;
236
237 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
238 oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
239
240 if (o2info_to_user(oiu, req))
241 goto bail;
242
243 status = 0;
244bail:
245 if (status)
246 o2info_set_request_error(oiu, req);
247
248 return status;
249}
250
251int ocfs2_info_handle_fs_features(struct inode *inode,
252 struct ocfs2_info_request __user *req)
253{
254 int status = -EFAULT;
255 struct ocfs2_info_fs_features oif;
256 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
257
258 if (o2info_from_user(oif, req))
259 goto bail;
260
261 oif.if_compat_features = osb->s_feature_compat;
262 oif.if_incompat_features = osb->s_feature_incompat;
263 oif.if_ro_compat_features = osb->s_feature_ro_compat;
264 oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
265
266 if (o2info_to_user(oif, req))
267 goto bail;
268
269 status = 0;
270bail:
271 if (status)
272 o2info_set_request_error(oif, req);
273
274 return status;
275}
276
277int ocfs2_info_handle_journal_size(struct inode *inode,
278 struct ocfs2_info_request __user *req)
279{
280 int status = -EFAULT;
281 struct ocfs2_info_journal_size oij;
282 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
283
284 if (o2info_from_user(oij, req))
285 goto bail;
286
287 oij.ij_journal_size = osb->journal->j_inode->i_size;
288
289 oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
290
291 if (o2info_to_user(oij, req))
292 goto bail;
293
294 status = 0;
295bail:
296 if (status)
297 o2info_set_request_error(oij, req);
298
299 return status;
300}
301
302int ocfs2_info_handle_unknown(struct inode *inode,
303 struct ocfs2_info_request __user *req)
304{
305 int status = -EFAULT;
306 struct ocfs2_info_request oir;
307
308 if (o2info_from_user(oir, req))
309 goto bail;
310
311 oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
312
313 if (o2info_to_user(oir, req))
314 goto bail;
315
316 status = 0;
317bail:
318 if (status)
319 o2info_set_request_error(oir, req);
320
321 return status;
322}
323
324/*
325 * Validate and distinguish OCFS2_IOC_INFO requests.
326 *
327 * - validate the magic number.
328 * - distinguish different requests.
329 * - validate size of different requests.
330 */
331int ocfs2_info_handle_request(struct inode *inode,
332 struct ocfs2_info_request __user *req)
333{
334 int status = -EFAULT;
335 struct ocfs2_info_request oir;
336
337 if (o2info_from_user(oir, req))
338 goto bail;
339
340 status = -EINVAL;
341 if (oir.ir_magic != OCFS2_INFO_MAGIC)
342 goto bail;
343
344 switch (oir.ir_code) {
345 case OCFS2_INFO_BLOCKSIZE:
346 if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
347 status = ocfs2_info_handle_blocksize(inode, req);
348 break;
349 case OCFS2_INFO_CLUSTERSIZE:
350 if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
351 status = ocfs2_info_handle_clustersize(inode, req);
352 break;
353 case OCFS2_INFO_MAXSLOTS:
354 if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
355 status = ocfs2_info_handle_maxslots(inode, req);
356 break;
357 case OCFS2_INFO_LABEL:
358 if (oir.ir_size == sizeof(struct ocfs2_info_label))
359 status = ocfs2_info_handle_label(inode, req);
360 break;
361 case OCFS2_INFO_UUID:
362 if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
363 status = ocfs2_info_handle_uuid(inode, req);
364 break;
365 case OCFS2_INFO_FS_FEATURES:
366 if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
367 status = ocfs2_info_handle_fs_features(inode, req);
368 break;
369 case OCFS2_INFO_JOURNAL_SIZE:
370 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
371 status = ocfs2_info_handle_journal_size(inode, req);
372 break;
373 default:
374 status = ocfs2_info_handle_unknown(inode, req);
375 break;
376 }
377
378bail:
379 return status;
380}
381
382int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
383 u64 *req_addr, int compat_flag)
384{
385 int status = -EFAULT;
386 u64 __user *bp = NULL;
387
388 if (compat_flag) {
389#ifdef CONFIG_COMPAT
390 /*
 391 * bp stores the base address of an array of pointers, each
 392 * of which holds the address of a separate request.
393 */
394 bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
395#else
396 BUG();
397#endif
398 } else
399 bp = (u64 __user *)(unsigned long)(info->oi_requests);
400
401 if (o2info_from_user(*req_addr, bp + idx))
402 goto bail;
403
404 status = 0;
405bail:
406 return status;
407}
408
409/*
410 * OCFS2_IOC_INFO handles an array of requests passed from userspace.
411 *
 412 * ocfs2_info_handle() receives a large info aggregation, grabs and
 413 * validates the request count from the header, then breaks it into
 414 * small pieces so the specific handlers can process them one by one.
 415 *
 416 * The idea is to keep each separate request small enough to ensure
 417 * better backward and forward compatibility, since a small request
 418 * is less likely to break if the disk layout changes.
419 */
420int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
421 int compat_flag)
422{
423 int i, status = 0;
424 u64 req_addr;
425 struct ocfs2_info_request __user *reqp;
426
427 if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
428 (!info->oi_requests)) {
429 status = -EINVAL;
430 goto bail;
431 }
432
433 for (i = 0; i < info->oi_count; i++) {
434
435 status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
436 if (status)
437 break;
438
 439 reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr;
440 if (!reqp) {
441 status = -EINVAL;
442 goto bail;
443 }
444
445 status = ocfs2_info_handle_request(inode, reqp);
446 if (status)
447 break;
448 }
449
450bail:
451 return status;
452}
453
112long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 454long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
113{ 455{
114 struct inode *inode = filp->f_path.dentry->d_inode; 456 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
120 struct reflink_arguments args; 462 struct reflink_arguments args;
121 const char *old_path, *new_path; 463 const char *old_path, *new_path;
122 bool preserve; 464 bool preserve;
465 struct ocfs2_info info;
123 466
124 switch (cmd) { 467 switch (cmd) {
125 case OCFS2_IOC_GETFLAGS: 468 case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
174 preserve = (args.preserve != 0); 517 preserve = (args.preserve != 0);
175 518
176 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); 519 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
520 case OCFS2_IOC_INFO:
521 if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
522 sizeof(struct ocfs2_info)))
523 return -EFAULT;
524
525 return ocfs2_info_handle(inode, &info, 0);
177 default: 526 default:
178 return -ENOTTY; 527 return -ENOTTY;
179 } 528 }
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
185 bool preserve; 534 bool preserve;
186 struct reflink_arguments args; 535 struct reflink_arguments args;
187 struct inode *inode = file->f_path.dentry->d_inode; 536 struct inode *inode = file->f_path.dentry->d_inode;
537 struct ocfs2_info info;
188 538
189 switch (cmd) { 539 switch (cmd) {
190 case OCFS2_IOC32_GETFLAGS: 540 case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
209 559
210 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), 560 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
211 compat_ptr(args.new_path), preserve); 561 compat_ptr(args.new_path), preserve);
562 case OCFS2_IOC_INFO:
563 if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
564 sizeof(struct ocfs2_info)))
565 return -EFAULT;
566
567 return ocfs2_info_handle(inode, &info, 1);
212 default: 568 default:
213 return -ENOIOCTLCMD; 569 return -ENOIOCTLCMD;
214 } 570 }
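
For reference, this is roughly how userspace would drive the new ioctl for a single blocksize query. The struct layouts and the header location are assumptions taken from ocfs2_fs.h rather than from this diff; only field names that appear in the handlers above are relied on:

    /* Hypothetical userspace sketch; assumes the ocfs2_fs.h ABI header. */
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <ocfs2/ocfs2_fs.h>   /* assumed install path */

    int main(void)
    {
            struct ocfs2_info_blocksize oib = {{0}};
            struct ocfs2_info info = {0};
            __u64 reqs[1];
            int fd = open("/mnt/ocfs2", O_RDONLY);

            if (fd < 0)
                    return 1;

            oib.ib_req.ir_magic = OCFS2_INFO_MAGIC;
            oib.ib_req.ir_code  = OCFS2_INFO_BLOCKSIZE;
            oib.ib_req.ir_size  = sizeof(oib);  /* size is validated */

            reqs[0] = (__u64)(unsigned long)&oib; /* request pointers */
            info.oi_requests = (__u64)(unsigned long)reqs;
            info.oi_count    = 1;

            if (ioctl(fd, OCFS2_IOC_INFO, &info) < 0)
                    return 1;
            if (oib.ib_req.ir_flags & OCFS2_INFO_FL_FILLED)
                    printf("blocksize: %u\n", (unsigned)oib.ib_blocksize);
            close(fd);
            return 0;
    }
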
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9b57c0350ff9..faa2303dbf0a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
301{ 301{
302 int status = 0; 302 int status = 0;
303 unsigned int flushed; 303 unsigned int flushed;
304 unsigned long old_id;
305 struct ocfs2_journal *journal = NULL; 304 struct ocfs2_journal *journal = NULL;
306 305
307 mlog_entry_void(); 306 mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
326 goto finally; 325 goto finally;
327 } 326 }
328 327
329 old_id = ocfs2_inc_trans_id(journal); 328 ocfs2_inc_trans_id(journal);
330 329
331 flushed = atomic_read(&journal->j_num_trans); 330 flushed = atomic_read(&journal->j_num_trans);
332 atomic_set(&journal->j_num_trans, 0); 331 atomic_set(&journal->j_num_trans, 0);
@@ -342,9 +341,6 @@ finally:
342 return status; 341 return status;
343} 342}
344 343
345/* pass it NULL and it will allocate a new handle object for you. If
346 * you pass it a handle however, it may still return error, in which
347 * case it has free'd the passed handle for you. */
348handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) 344handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
349{ 345{
350 journal_t *journal = osb->journal->j_journal; 346 journal_t *journal = osb->journal->j_journal;
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1888 1884
1889 os = &osb->osb_orphan_scan; 1885 os = &osb->osb_orphan_scan;
1890 1886
1887 mlog(0, "Begin orphan scan\n");
1888
1891 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) 1889 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
1892 goto out; 1890 goto out;
1893 1891
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1920unlock: 1918unlock:
1921 ocfs2_orphan_scan_unlock(osb, seqno); 1919 ocfs2_orphan_scan_unlock(osb, seqno);
1922out: 1920out:
1921 mlog(0, "Orphan scan completed\n");
1923 return; 1922 return;
1924} 1923}
1925 1924
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b5baaa8e710f..43e56b97f9c0 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
67 struct buffer_head *j_bh; /* Journal disk inode block */ 67 struct buffer_head *j_bh; /* Journal disk inode block */
68 atomic_t j_num_trans; /* Number of transactions 68 atomic_t j_num_trans; /* Number of transactions
69 * currently in the system. */ 69 * currently in the system. */
70 spinlock_t j_lock;
70 unsigned long j_trans_id; 71 unsigned long j_trans_id;
71 struct rw_semaphore j_trans_barrier; 72 struct rw_semaphore j_trans_barrier;
72 wait_queue_head_t j_checkpointed; 73 wait_queue_head_t j_checkpointed;
73 74
 74 spinlock_t j_lock; 75 /* both fields protected by j_lock */
75 struct list_head j_la_cleanups; 76 struct list_head j_la_cleanups;
76 struct work_struct j_recovery_work; 77 struct work_struct j_recovery_work;
77}; 78};
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 4c18f4ad93b4..7e32db9c2c99 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
59 return ret; 59 return ret;
60} 60}
61 61
62static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, 62static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
63 struct page *page) 63 struct page *page)
64{ 64{
65 int ret; 65 int ret;
66 struct inode *inode = file->f_path.dentry->d_inode;
66 struct address_space *mapping = inode->i_mapping; 67 struct address_space *mapping = inode->i_mapping;
67 loff_t pos = page_offset(page); 68 loff_t pos = page_offset(page);
68 unsigned int len = PAGE_CACHE_SIZE; 69 unsigned int len = PAGE_CACHE_SIZE;
@@ -111,7 +112,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
111 if (page->index == last_index) 112 if (page->index == last_index)
112 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; 113 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
113 114
114 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, 115 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
115 &fsdata, di_bh, page); 116 &fsdata, di_bh, page);
116 if (ret) { 117 if (ret) {
117 if (ret != -ENOSPC) 118 if (ret != -ENOSPC)
@@ -159,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
159 */ 160 */
160 down_write(&OCFS2_I(inode)->ip_alloc_sem); 161 down_write(&OCFS2_I(inode)->ip_alloc_sem);
161 162
162 ret = __ocfs2_page_mkwrite(inode, di_bh, page); 163 ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
163 164
164 up_write(&OCFS2_I(inode)->ip_alloc_sem); 165 up_write(&OCFS2_I(inode)->ip_alloc_sem);
165 166
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index a00dda2e4f16..ff5744e1e36f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -171,7 +171,8 @@ bail_add:
171 ret = ERR_PTR(status); 171 ret = ERR_PTR(status);
172 goto bail_unlock; 172 goto bail_unlock;
173 } 173 }
174 } 174 } else
175 ocfs2_dentry_attach_gen(dentry);
175 176
176bail_unlock: 177bail_unlock:
177 /* Don't drop the cluster lock until *after* the d_add -- 178 /* Don't drop the cluster lock until *after* the d_add --
@@ -741,7 +742,7 @@ static int ocfs2_link(struct dentry *old_dentry,
741 goto out_commit; 742 goto out_commit;
742 } 743 }
743 744
744 atomic_inc(&inode->i_count); 745 ihold(inode);
745 dentry->d_op = &ocfs2_dentry_ops; 746 dentry->d_op = &ocfs2_dentry_ops;
746 d_instantiate(dentry, inode); 747 d_instantiate(dentry, inode);
747 748
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b6b5a2..d8408217e3bd 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
150struct ocfs2_lock_res { 150struct ocfs2_lock_res {
151 void *l_priv; 151 void *l_priv;
152 struct ocfs2_lock_res_ops *l_ops; 152 struct ocfs2_lock_res_ops *l_ops;
153 spinlock_t l_lock; 153
154 154
155 struct list_head l_blocked_list; 155 struct list_head l_blocked_list;
156 struct list_head l_mask_waiters; 156 struct list_head l_mask_waiters;
157 157
158 enum ocfs2_lock_type l_type;
159 unsigned long l_flags; 158 unsigned long l_flags;
160 char l_name[OCFS2_LOCK_ID_MAX_LEN]; 159 char l_name[OCFS2_LOCK_ID_MAX_LEN];
161 int l_level;
162 unsigned int l_ro_holders; 160 unsigned int l_ro_holders;
163 unsigned int l_ex_holders; 161 unsigned int l_ex_holders;
164 struct ocfs2_dlm_lksb l_lksb; 162 unsigned char l_level;
163
164 /* Data packed - type enum ocfs2_lock_type */
165 unsigned char l_type;
165 166
166 /* used from AST/BAST funcs. */ 167 /* used from AST/BAST funcs. */
167 enum ocfs2_ast_action l_action; 168 /* Data packed - enum type ocfs2_ast_action */
168 enum ocfs2_unlock_action l_unlock_action; 169 unsigned char l_action;
169 int l_requested; 170 /* Data packed - enum type ocfs2_unlock_action */
170 int l_blocking; 171 unsigned char l_unlock_action;
172 unsigned char l_requested;
173 unsigned char l_blocking;
171 unsigned int l_pending_gen; 174 unsigned int l_pending_gen;
172 175
176 spinlock_t l_lock;
177
178 struct ocfs2_dlm_lksb l_lksb;
179
173 wait_queue_head_t l_event; 180 wait_queue_head_t l_event;
174 181
175 struct list_head l_debug_list; 182 struct list_head l_debug_list;
@@ -243,7 +250,7 @@ enum ocfs2_local_alloc_state
243 250
244enum ocfs2_mount_options 251enum ocfs2_mount_options
245{ 252{
246 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ 253 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
247 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ 254 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
248 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ 255 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
249 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 256 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
@@ -256,6 +263,10 @@ enum ocfs2_mount_options
256 control lists */ 263 control lists */
257 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ 264 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
258 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ 265 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
266 OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
267 writes */
268 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
269 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
259}; 270};
260 271
261#define OCFS2_OSB_SOFT_RO 0x0001 272#define OCFS2_OSB_SOFT_RO 0x0001
@@ -277,7 +288,8 @@ struct ocfs2_super
277 struct super_block *sb; 288 struct super_block *sb;
278 struct inode *root_inode; 289 struct inode *root_inode;
279 struct inode *sys_root_inode; 290 struct inode *sys_root_inode;
280 struct inode *system_inodes[NUM_SYSTEM_INODES]; 291 struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
292 struct inode **local_system_inodes;
281 293
282 struct ocfs2_slot_info *slot_info; 294 struct ocfs2_slot_info *slot_info;
283 295
@@ -368,6 +380,8 @@ struct ocfs2_super
368 struct ocfs2_alloc_stats alloc_stats; 380 struct ocfs2_alloc_stats alloc_stats;
369 char dev_str[20]; /* "major,minor" of the device */ 381 char dev_str[20]; /* "major,minor" of the device */
370 382
383 u8 osb_stackflags;
384
371 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 385 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
372 struct ocfs2_cluster_connection *cconn; 386 struct ocfs2_cluster_connection *cconn;
373 struct ocfs2_lock_res osb_super_lockres; 387 struct ocfs2_lock_res osb_super_lockres;
@@ -601,10 +615,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
601 return ret; 615 return ret;
602} 616}
603 617
604static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) 618static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
605{ 619{
606 return (osb->s_feature_incompat & 620 return (osb->s_feature_incompat &
607 OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK); 621 (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
622 OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
623}
624
625static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
626{
627 if (ocfs2_clusterinfo_valid(osb) &&
628 memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
629 OCFS2_STACK_LABEL_LEN))
630 return 1;
631 return 0;
632}
633
634static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
635{
636 if (ocfs2_clusterinfo_valid(osb) &&
637 !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
638 OCFS2_STACK_LABEL_LEN))
639 return 1;
640 return 0;
641}
642
643static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
644{
645 return ocfs2_o2cb_stack(osb) &&
646 (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
608} 647}
609 648
610static inline int ocfs2_mount_local(struct ocfs2_super *osb) 649static inline int ocfs2_mount_local(struct ocfs2_super *osb)
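
The helpers above key everything off the 4-byte stack label once either incompat bit says s_cluster_info is valid. A standalone sketch of the classification logic; the USERSPACE_STACK bit value is assumed from the feature table in ocfs2_fs.h, and the struct is a reduced stand-in for ocfs2_super:

#include <stdio.h>
#include <string.h>

#define INCOMPAT_USERSPACE_STACK 0x0080	/* assumed value, see ocfs2_fs.h */
#define INCOMPAT_CLUSTERINFO     0x4000	/* added by this patch */
#define CLASSIC_STACK            "o2cb"
#define STACK_LABEL_LEN          4

struct sb_info {
	unsigned int feature_incompat;
	char cluster_stack[STACK_LABEL_LEN + 1];
};

static int clusterinfo_valid(const struct sb_info *s)
{
	return s->feature_incompat &
	       (INCOMPAT_USERSPACE_STACK | INCOMPAT_CLUSTERINFO);
}

static int is_o2cb_stack(const struct sb_info *s)
{
	return clusterinfo_valid(s) &&
	       !memcmp(s->cluster_stack, CLASSIC_STACK, STACK_LABEL_LEN);
}

int main(void)
{
	struct sb_info s = { INCOMPAT_CLUSTERINFO, "o2cb" };

	printf("o2cb stack: %d\n", is_o2cb_stack(&s));	/* prints 1 */
	return 0;
}
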
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 33f1c9a8258d..c2e4f8222e2f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -101,7 +101,8 @@
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) 104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -170,6 +171,13 @@
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 171#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171 172
172/* 173/*
 174 * Incompat bit to indicate usable clusterinfo with stackflags for all
 175 * cluster stacks (userspace and o2cb). If this bit is set,
176 * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
177 */
178#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
179
180/*
173 * backup superblock flag is used to indicate that this volume 181 * backup superblock flag is used to indicate that this volume
174 * has backup superblocks. 182 * has backup superblocks.
175 */ 183 */
@@ -235,18 +243,31 @@
235#define OCFS2_HAS_REFCOUNT_FL (0x0010) 243#define OCFS2_HAS_REFCOUNT_FL (0x0010)
236 244
237/* Inode attributes, keep in sync with EXT2 */ 245/* Inode attributes, keep in sync with EXT2 */
238#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ 246#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */
239#define OCFS2_UNRM_FL (0x00000002) /* Undelete */ 247#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */
240#define OCFS2_COMPR_FL (0x00000004) /* Compress file */ 248#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */
241#define OCFS2_SYNC_FL (0x00000008) /* Synchronous updates */ 249#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */
242#define OCFS2_IMMUTABLE_FL (0x00000010) /* Immutable file */ 250#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */
243#define OCFS2_APPEND_FL (0x00000020) /* writes to file may only append */ 251#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */
244#define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */ 252#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */
245#define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */ 253#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */
246#define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */ 254/* Reserved for compression usage... */
247 255#define OCFS2_DIRTY_FL FS_DIRTY_FL
248#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */ 256#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */
249#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ 257#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */
258#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */
259/* End compression flags --- maybe not all used */
260#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */
261#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */
262#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */
263#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */
264#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
265#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
266#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
267#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
268
269#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
270#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */
250 271
251/* 272/*
252 * Extent record flags (e_node.leaf.flags) 273 * Extent record flags (e_node.leaf.flags)
@@ -279,10 +300,13 @@
279#define OCFS2_VOL_UUID_LEN 16 300#define OCFS2_VOL_UUID_LEN 16
280#define OCFS2_MAX_VOL_LABEL_LEN 64 301#define OCFS2_MAX_VOL_LABEL_LEN 64
281 302
282/* The alternate, userspace stack fields */ 303/* The cluster stack fields */
283#define OCFS2_STACK_LABEL_LEN 4 304#define OCFS2_STACK_LABEL_LEN 4
284#define OCFS2_CLUSTER_NAME_LEN 16 305#define OCFS2_CLUSTER_NAME_LEN 16
285 306
307/* Classic (historically speaking) cluster stack */
308#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb"
309
286/* Journal limits (in bytes) */ 310/* Journal limits (in bytes) */
287#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 311#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
288 312
@@ -292,6 +316,11 @@
292 */ 316 */
293#define OCFS2_MIN_XATTR_INLINE_SIZE 256 317#define OCFS2_MIN_XATTR_INLINE_SIZE 256
294 318
319/*
320 * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
321 */
322#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01)
323
295struct ocfs2_system_inode_info { 324struct ocfs2_system_inode_info {
296 char *si_name; 325 char *si_name;
297 int si_iflags; 326 int si_iflags;
@@ -309,6 +338,7 @@ enum {
309 USER_QUOTA_SYSTEM_INODE, 338 USER_QUOTA_SYSTEM_INODE,
310 GROUP_QUOTA_SYSTEM_INODE, 339 GROUP_QUOTA_SYSTEM_INODE,
311#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE 340#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
341#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
312 ORPHAN_DIR_SYSTEM_INODE, 342 ORPHAN_DIR_SYSTEM_INODE,
313 EXTENT_ALLOC_SYSTEM_INODE, 343 EXTENT_ALLOC_SYSTEM_INODE,
314 INODE_ALLOC_SYSTEM_INODE, 344 INODE_ALLOC_SYSTEM_INODE,
@@ -317,8 +347,12 @@ enum {
317 TRUNCATE_LOG_SYSTEM_INODE, 347 TRUNCATE_LOG_SYSTEM_INODE,
318 LOCAL_USER_QUOTA_SYSTEM_INODE, 348 LOCAL_USER_QUOTA_SYSTEM_INODE,
319 LOCAL_GROUP_QUOTA_SYSTEM_INODE, 349 LOCAL_GROUP_QUOTA_SYSTEM_INODE,
350#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
320 NUM_SYSTEM_INODES 351 NUM_SYSTEM_INODES
321}; 352};
353#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
354#define NUM_LOCAL_SYSTEM_INODES \
355 (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
322 356
323static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { 357static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
324 /* Global system inodes (single copy) */ 358 /* Global system inodes (single copy) */
@@ -347,6 +381,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
347/* Parameter passed from mount.ocfs2 to module */ 381/* Parameter passed from mount.ocfs2 to module */
348#define OCFS2_HB_NONE "heartbeat=none" 382#define OCFS2_HB_NONE "heartbeat=none"
349#define OCFS2_HB_LOCAL "heartbeat=local" 383#define OCFS2_HB_LOCAL "heartbeat=local"
384#define OCFS2_HB_GLOBAL "heartbeat=global"
350 385
351/* 386/*
352 * OCFS2 directory file types. Only the low 3 bits are used. The 387 * OCFS2 directory file types. Only the low 3 bits are used. The
@@ -553,9 +588,21 @@ struct ocfs2_slot_map_extended {
553 */ 588 */
554}; 589};
555 590
591/*
592 * ci_stackflags is only valid if the incompat bit
593 * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
594 */
556struct ocfs2_cluster_info { 595struct ocfs2_cluster_info {
557/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; 596/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
558 __le32 ci_reserved; 597 union {
598 __le32 ci_reserved;
599 struct {
600 __u8 ci_stackflags;
601 __u8 ci_reserved1;
602 __u8 ci_reserved2;
603 __u8 ci_reserved3;
604 };
605 };
559/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; 606/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
560/*18*/ 607/*18*/
561}; 608};
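
The anonymous union above keeps the on-disk format unchanged: the new ci_stackflags byte simply aliases the first byte of the old __le32 ci_reserved. A C11 sketch (anonymous members need C11) that checks the layout stays at 0x18 bytes, matching the offset comments in the header:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define STACK_LABEL_LEN  4
#define CLUSTER_NAME_LEN 16

struct cluster_info {
	uint8_t  ci_stack[STACK_LABEL_LEN];
	union {
		uint32_t ci_reserved;
		struct {
			uint8_t ci_stackflags;
			uint8_t ci_reserved1;
			uint8_t ci_reserved2;
			uint8_t ci_reserved3;
		};
	};
	uint8_t  ci_cluster[CLUSTER_NAME_LEN];
};

int main(void)
{
	/* 4 + 4 + 16 bytes: same size before and after the union */
	static_assert(sizeof(struct cluster_info) == 0x18, "layout changed");
	printf("sizeof = %zu\n", sizeof(struct cluster_info));
	return 0;
}
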
@@ -592,9 +639,9 @@ struct ocfs2_super_block {
592 * group header */ 639 * group header */
593/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 640/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
594/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ 641/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
595/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace 642/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
596 stack. Only valid 643 userspace or clusterinfo
597 with INCOMPAT flag. */ 644 INCOMPAT flag set. */
598/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 645/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
599 for this fs*/ 646 for this fs*/
600 __le16 s_reserved0; 647 __le16 s_reserved0;
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 2d3420af1a83..b46f39bf7438 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -23,10 +23,10 @@
23/* 23/*
24 * ioctl commands 24 * ioctl commands
25 */ 25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) 26#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) 27#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int) 28#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) 29#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
30 30
31/* 31/*
32 * Space reservation / allocation / free ioctls and argument structure 32 * Space reservation / allocation / free ioctls and argument structure
@@ -76,4 +76,99 @@ struct reflink_arguments {
76}; 76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) 77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78 78
79/* Following definitions dedicated for ocfs2_info_request ioctls. */
80#define OCFS2_INFO_MAX_REQUEST (50)
81#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
82
83/* Magic number of all requests */
84#define OCFS2_INFO_MAGIC (0x4F32494E)
85
86/*
 87 * Always try to separate an info request into small pieces to
 88 * guarantee backward and forward compatibility.
89 */
90struct ocfs2_info {
91 __u64 oi_requests; /* Array of __u64 pointers to requests */
92 __u32 oi_count; /* Number of requests in info_requests */
93 __u32 oi_pad;
94};
95
96struct ocfs2_info_request {
97/*00*/ __u32 ir_magic; /* Magic number */
98 __u32 ir_code; /* Info request code */
99 __u32 ir_size; /* Size of request */
100 __u32 ir_flags; /* Request flags */
101/*10*/ /* Request specific fields */
102};
103
104struct ocfs2_info_clustersize {
105 struct ocfs2_info_request ic_req;
106 __u32 ic_clustersize;
107 __u32 ic_pad;
108};
109
110struct ocfs2_info_blocksize {
111 struct ocfs2_info_request ib_req;
112 __u32 ib_blocksize;
113 __u32 ib_pad;
114};
115
116struct ocfs2_info_maxslots {
117 struct ocfs2_info_request im_req;
118 __u32 im_max_slots;
119 __u32 im_pad;
120};
121
122struct ocfs2_info_label {
123 struct ocfs2_info_request il_req;
124 __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
125} __attribute__ ((packed));
126
127struct ocfs2_info_uuid {
128 struct ocfs2_info_request iu_req;
129 __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
130} __attribute__ ((packed));
131
132struct ocfs2_info_fs_features {
133 struct ocfs2_info_request if_req;
134 __u32 if_compat_features;
135 __u32 if_incompat_features;
136 __u32 if_ro_compat_features;
137 __u32 if_pad;
138};
139
140struct ocfs2_info_journal_size {
141 struct ocfs2_info_request ij_req;
142 __u64 ij_journal_size;
143};
144
145/* Codes for ocfs2_info_request */
146enum ocfs2_info_type {
147 OCFS2_INFO_CLUSTERSIZE = 1,
148 OCFS2_INFO_BLOCKSIZE,
149 OCFS2_INFO_MAXSLOTS,
150 OCFS2_INFO_LABEL,
151 OCFS2_INFO_UUID,
152 OCFS2_INFO_FS_FEATURES,
153 OCFS2_INFO_JOURNAL_SIZE,
154 OCFS2_INFO_NUM_TYPES
155};
156
157/* Flags for struct ocfs2_info_request */
158/* Filled by the caller */
159#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
160 required. This is a hint.
161 It is up to ocfs2 whether
162 the request can be fulfilled
163 without locking. */
164/* Filled by ocfs2 */
165#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
166 this request and
167 filled in the answer */
168
169#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
170 request handling. */
171
172#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
173
79#endif /* OCFS2_IOCTL_H */ 174#endif /* OCFS2_IOCTL_H */
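
For context, a hypothetical userspace caller of the new OCFS2_IOC_INFO batching interface. Structures and constants are restated from the header above, the mount path is made up, and error handling is pared down to the minimum:

#include <stdint.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define OCFS2_INFO_MAGIC     0x4F32494E
#define OCFS2_INFO_BLOCKSIZE 2
#define OCFS2_INFO_FL_FILLED 0x40000000

struct ocfs2_info_request {
	uint32_t ir_magic, ir_code, ir_size, ir_flags;
};
struct ocfs2_info_blocksize {
	struct ocfs2_info_request ib_req;
	uint32_t ib_blocksize, ib_pad;
};
struct ocfs2_info {
	uint64_t oi_requests;	/* pointer to an array of request pointers */
	uint32_t oi_count, oi_pad;
};
#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)

int main(void)
{
	struct ocfs2_info_blocksize bs = {
		{ OCFS2_INFO_MAGIC, OCFS2_INFO_BLOCKSIZE, sizeof(bs), 0 }, 0, 0
	};
	uint64_t reqs[1] = { (uint64_t)(uintptr_t)&bs };
	struct ocfs2_info info = { (uint64_t)(uintptr_t)reqs, 1, 0 };
	int fd = open("/mnt/ocfs2", O_RDONLY);	/* any file on the volume */

	if (fd < 0 || ioctl(fd, OCFS2_IOC_INFO, &info) < 0)
		return 1;
	if (bs.ib_req.ir_flags & OCFS2_INFO_FL_FILLED)
		printf("blocksize: %u\n", bs.ib_blocksize);
	close(fd);
	return 0;
}

The per-request magic, code, and size fields are what make the interface extensible: a kernel that does not understand a code simply never sets OCFS2_INFO_FL_FILLED on it.
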
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 0afeda83120f..b5f9160e93e9 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,6 +49,7 @@
49 49
50struct ocfs2_cow_context { 50struct ocfs2_cow_context {
51 struct inode *inode; 51 struct inode *inode;
52 struct file *file;
52 u32 cow_start; 53 u32 cow_start;
53 u32 cow_len; 54 u32 cow_len;
54 struct ocfs2_extent_tree data_et; 55 struct ocfs2_extent_tree data_et;
@@ -2932,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2932 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2933 struct page *page; 2934 struct page *page;
2934 pgoff_t page_index; 2935 pgoff_t page_index;
2935 unsigned int from, to; 2936 unsigned int from, to, readahead_pages;
2936 loff_t offset, end, map_end; 2937 loff_t offset, end, map_end;
2937 struct address_space *mapping = context->inode->i_mapping; 2938 struct address_space *mapping = context->inode->i_mapping;
2938 2939
2939 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, 2940 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
2940 new_cluster, new_len, cpos); 2941 new_cluster, new_len, cpos);
2941 2942
2943 readahead_pages =
2944 (ocfs2_cow_contig_clusters(sb) <<
2945 OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
2942 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2946 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2943 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2947 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2944 /* 2948 /*
@@ -2969,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2969 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2970 BUG_ON(PageDirty(page)); 2974 BUG_ON(PageDirty(page));
2971 2975
2976 if (PageReadahead(page) && context->file) {
2977 page_cache_async_readahead(mapping,
2978 &context->file->f_ra,
2979 context->file,
2980 page, page_index,
2981 readahead_pages);
2982 }
2983
2972 if (!PageUptodate(page)) { 2984 if (!PageUptodate(page)) {
2973 ret = block_read_full_page(page, ocfs2_get_block); 2985 ret = block_read_full_page(page, ocfs2_get_block);
2974 if (ret) { 2986 if (ret) {
@@ -3409,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3409 return ret; 3421 return ret;
3410} 3422}
3411 3423
3424static void ocfs2_readahead_for_cow(struct inode *inode,
3425 struct file *file,
3426 u32 start, u32 len)
3427{
3428 struct address_space *mapping;
3429 pgoff_t index;
3430 unsigned long num_pages;
3431 int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
3432
3433 if (!file)
3434 return;
3435
3436 mapping = file->f_mapping;
3437 num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
3438 if (!num_pages)
3439 num_pages = 1;
3440
3441 index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
3442 page_cache_sync_readahead(mapping, &file->f_ra, file,
3443 index, num_pages);
3444}
3445
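
ocfs2_readahead_for_cow() above reduces to two shifts that convert a cluster run into page-cache units. A toy version of the arithmetic with example geometry (1 MB clusters, 4 KB pages; all values illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int cs_bits = 20;		/* 1 MB clusters */
	unsigned int page_shift = 12;		/* 4 KB pages (PAGE_CACHE_SHIFT) */
	unsigned int start = 3, len = 2;	/* cluster run to CoW */

	unsigned long index = ((unsigned long long)start << cs_bits) >> page_shift;
	unsigned long num_pages = ((unsigned long long)len << cs_bits) >> page_shift;
	if (!num_pages)
		num_pages = 1;			/* clusters smaller than a page */

	printf("readahead from page %lu for %lu pages\n", index, num_pages);
	return 0;
}
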
3412/* 3446/*
3413 * Starting at cpos, try to CoW write_len clusters. Don't CoW 3447 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3414 * past max_cpos. This will stop when it runs into a hole or an 3448 * past max_cpos. This will stop when it runs into a hole or an
3415 * unrefcounted extent. 3449 * unrefcounted extent.
3416 */ 3450 */
3417static int ocfs2_refcount_cow_hunk(struct inode *inode, 3451static int ocfs2_refcount_cow_hunk(struct inode *inode,
3452 struct file *file,
3418 struct buffer_head *di_bh, 3453 struct buffer_head *di_bh,
3419 u32 cpos, u32 write_len, u32 max_cpos) 3454 u32 cpos, u32 write_len, u32 max_cpos)
3420{ 3455{
@@ -3443,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3443 3478
3444 BUG_ON(cow_len == 0); 3479 BUG_ON(cow_len == 0);
3445 3480
3481 ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
3482
3446 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); 3483 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3447 if (!context) { 3484 if (!context) {
3448 ret = -ENOMEM; 3485 ret = -ENOMEM;
@@ -3464,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3464 context->ref_root_bh = ref_root_bh; 3501 context->ref_root_bh = ref_root_bh;
3465 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; 3502 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3466 context->get_clusters = ocfs2_di_get_clusters; 3503 context->get_clusters = ocfs2_di_get_clusters;
3504 context->file = file;
3467 3505
3468 ocfs2_init_dinode_extent_tree(&context->data_et, 3506 ocfs2_init_dinode_extent_tree(&context->data_et,
3469 INODE_CACHE(inode), di_bh); 3507 INODE_CACHE(inode), di_bh);
@@ -3492,6 +3530,7 @@ out:
3492 * clusters between cpos and cpos+write_len are safe to modify. 3530 * clusters between cpos and cpos+write_len are safe to modify.
3493 */ 3531 */
3494int ocfs2_refcount_cow(struct inode *inode, 3532int ocfs2_refcount_cow(struct inode *inode,
3533 struct file *file,
3495 struct buffer_head *di_bh, 3534 struct buffer_head *di_bh,
3496 u32 cpos, u32 write_len, u32 max_cpos) 3535 u32 cpos, u32 write_len, u32 max_cpos)
3497{ 3536{
@@ -3511,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode,
3511 num_clusters = write_len; 3550 num_clusters = write_len;
3512 3551
3513 if (ext_flags & OCFS2_EXT_REFCOUNTED) { 3552 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3514 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, 3553 ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
3515 num_clusters, max_cpos); 3554 num_clusters, max_cpos);
3516 if (ret) { 3555 if (ret) {
3517 mlog_errno(ret); 3556 mlog_errno(ret);
@@ -4201,8 +4240,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
4201 goto out; 4240 goto out;
4202 } 4241 }
4203 4242
4204 mutex_lock(&new_inode->i_mutex); 4243 mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
4205 ret = ocfs2_inode_lock(new_inode, &new_bh, 1); 4244 ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
4245 OI_LS_REFLINK_TARGET);
4206 if (ret) { 4246 if (ret) {
4207 mlog_errno(ret); 4247 mlog_errno(ret);
4208 goto out_unlock; 4248 goto out_unlock;
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9983ba1570e2..c8ce46f7d8e3 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
21 struct rb_node rf_node; 21 struct rb_node rf_node;
22 u64 rf_blkno; 22 u64 rf_blkno;
23 u32 rf_generation; 23 u32 rf_generation;
24 struct kref rf_getcnt;
24 struct rw_semaphore rf_sem; 25 struct rw_semaphore rf_sem;
25 struct ocfs2_lock_res rf_lockres; 26 struct ocfs2_lock_res rf_lockres;
26 struct kref rf_getcnt;
27 int rf_removed; 27 int rf_removed;
28 28
29 /* the following 4 fields are used by caching_info. */ 29 /* the following 4 fields are used by caching_info. */
30 struct ocfs2_caching_info rf_ci;
31 spinlock_t rf_lock; 30 spinlock_t rf_lock;
31 struct ocfs2_caching_info rf_ci;
32 struct mutex rf_io_mutex; 32 struct mutex rf_io_mutex;
33 struct super_block *rf_sb; 33 struct super_block *rf_sb;
34}; 34};
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 int *ref_blocks); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode,
56 struct file *filep, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 57 u32 cpos, u32 write_len, u32 max_cpos);
57 58
58typedef int (ocfs2_post_refcount_func)(struct inode *inode, 59typedef int (ocfs2_post_refcount_func)(struct inode *inode,
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index d8b6e4259b80..3e78db361bc7 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -732,25 +732,23 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
732 struct ocfs2_alloc_reservation *resv, 732 struct ocfs2_alloc_reservation *resv,
733 int *cstart, int *clen) 733 int *cstart, int *clen)
734{ 734{
735 unsigned int wanted = *clen;
736
737 if (resv == NULL || ocfs2_resmap_disabled(resmap)) 735 if (resv == NULL || ocfs2_resmap_disabled(resmap))
738 return -ENOSPC; 736 return -ENOSPC;
739 737
740 spin_lock(&resv_lock); 738 spin_lock(&resv_lock);
741 739
742 /*
743 * We don't want to over-allocate for temporary
744 * windows. Otherwise, we run the risk of fragmenting the
745 * allocation space.
746 */
747 wanted = ocfs2_resv_window_bits(resmap, resv);
748 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
749 wanted = *clen;
750
751 if (ocfs2_resv_empty(resv)) { 740 if (ocfs2_resv_empty(resv)) {
752 mlog(0, "empty reservation, find new window\n"); 741 /*
742 * We don't want to over-allocate for temporary
743 * windows. Otherwise, we run the risk of fragmenting the
744 * allocation space.
745 */
746 unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
753 747
748 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
749 wanted = *clen;
750
751 mlog(0, "empty reservation, find new window\n");
754 /* 752 /*
755 * Try to get a window here. If it works, we must fall 753 * Try to get a window here. If it works, we must fall
756 * through and test the bitmap . This avoids some 754 * through and test the bitmap . This avoids some
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bfbd7e9e949f..ab4e0172cc1d 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
357{ 357{
358 int status = 0; 358 int status = 0;
359 u64 blkno; 359 u64 blkno;
360 unsigned long long blocks, bytes; 360 unsigned long long blocks, bytes = 0;
361 unsigned int i; 361 unsigned int i;
362 struct buffer_head *bh; 362 struct buffer_head *bh;
363 363
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 0d3049f696c5..19965b00c43c 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
283 /* for now we only have one cluster/node, make sure we see it 283 /* for now we only have one cluster/node, make sure we see it
284 * in the heartbeat universe */ 284 * in the heartbeat universe */
285 if (!o2hb_check_local_node_heartbeating()) { 285 if (!o2hb_check_local_node_heartbeating()) {
286 if (o2hb_global_heartbeat_active())
287 mlog(ML_ERROR, "Global heartbeat not started\n");
286 rc = -EINVAL; 288 rc = -EINVAL;
287 goto out; 289 goto out;
288 } 290 }
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2dc57bca0688..252e7c82f929 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -22,7 +22,6 @@
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/smp_lock.h>
26#include <linux/reboot.h> 25#include <linux/reboot.h>
27#include <asm/uaccess.h> 26#include <asm/uaccess.h>
28 27
@@ -612,12 +611,10 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
612 return -ENOMEM; 611 return -ENOMEM;
613 p->op_this_node = -1; 612 p->op_this_node = -1;
614 613
615 lock_kernel();
616 mutex_lock(&ocfs2_control_lock); 614 mutex_lock(&ocfs2_control_lock);
617 file->private_data = p; 615 file->private_data = p;
618 list_add(&p->op_list, &ocfs2_control_private_list); 616 list_add(&p->op_list, &ocfs2_control_private_list);
619 mutex_unlock(&ocfs2_control_lock); 617 mutex_unlock(&ocfs2_control_lock);
620 unlock_kernel();
621 618
622 return 0; 619 return 0;
623} 620}
@@ -628,6 +625,7 @@ static const struct file_operations ocfs2_control_fops = {
628 .read = ocfs2_control_read, 625 .read = ocfs2_control_read,
629 .write = ocfs2_control_write, 626 .write = ocfs2_control_write,
630 .owner = THIS_MODULE, 627 .owner = THIS_MODULE,
628 .llseek = default_llseek,
631}; 629};
632 630
633static struct miscdevice ocfs2_control_device = { 631static struct miscdevice ocfs2_control_device = {
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8a286f54dca1..5fed60de7630 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -357,7 +357,7 @@ out:
357static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, 357static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
358 struct ocfs2_group_desc *bg, 358 struct ocfs2_group_desc *bg,
359 struct ocfs2_chain_list *cl, 359 struct ocfs2_chain_list *cl,
360 u64 p_blkno, u32 clusters) 360 u64 p_blkno, unsigned int clusters)
361{ 361{
362 struct ocfs2_extent_list *el = &bg->bg_list; 362 struct ocfs2_extent_list *el = &bg->bg_list;
363 struct ocfs2_extent_rec *rec; 363 struct ocfs2_extent_rec *rec;
@@ -369,7 +369,7 @@ static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
369 rec->e_blkno = cpu_to_le64(p_blkno); 369 rec->e_blkno = cpu_to_le64(p_blkno);
370 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / 370 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
371 le16_to_cpu(cl->cl_bpc)); 371 le16_to_cpu(cl->cl_bpc));
372 rec->e_leaf_clusters = cpu_to_le32(clusters); 372 rec->e_leaf_clusters = cpu_to_le16(clusters);
373 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); 373 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
374 le16_add_cpu(&bg->bg_free_bits_count, 374 le16_add_cpu(&bg->bg_free_bits_count,
375 clusters * le16_to_cpu(cl->cl_bpc)); 375 clusters * le16_to_cpu(cl->cl_bpc));
@@ -1380,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1380 } 1380 }
1381 1381
1382 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1382 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1383 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1384 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
1385 " count %u but claims %u are freed. num_bits %d",
1386 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1387 le16_to_cpu(bg->bg_bits),
1388 le16_to_cpu(bg->bg_free_bits_count), num_bits);
1389 return -EROFS;
1390 }
1383 while(num_bits--) 1391 while(num_bits--)
1384 ocfs2_set_bit(bit_off++, bitmap); 1392 ocfs2_set_bit(bit_off++, bitmap);
1385 1393
@@ -2419,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2419 (unsigned long *) undo_bg->bg_bitmap); 2427 (unsigned long *) undo_bg->bg_bitmap);
2420 } 2428 }
2421 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2429 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2430 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2431 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
2432 " count %u but claims %u are freed. num_bits %d",
2433 (unsigned long long)le64_to_cpu(bg->bg_blkno),
2434 le16_to_cpu(bg->bg_bits),
2435 le16_to_cpu(bg->bg_free_bits_count), num_bits);
2436 return -EROFS;
2437 }
2422 2438
2423 if (undo_fn) 2439 if (undo_fn)
2424 jbd_unlock_bh_state(group_bh); 2440 jbd_unlock_bh_state(group_bh);
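
Both hunks above add the same invariant after adjusting bg_free_bits_count: the free count can never exceed the total bit count, so a violation means on-disk corruption. In sketch form (the kernel returns -EROFS here, which forces the volume read-only):

#include <stdio.h>

static int check_group_bits(unsigned int total_bits, unsigned int free_bits)
{
	if (free_bits > total_bits) {
		fprintf(stderr, "group claims %u free of %u bits: corrupt\n",
			free_bits, total_bits);
		return -1;	/* stands in for -EROFS */
	}
	return 0;
}

int main(void)
{
	printf("%d\n", check_group_bits(1024, 100));	/* 0: sane */
	printf("%d\n", check_group_bits(1024, 2048));	/* -1: corrupt */
	return 0;
}
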
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index fa1be1b304d1..56f0cb395820 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -162,6 +162,7 @@ enum {
162 Opt_nointr, 162 Opt_nointr,
163 Opt_hb_none, 163 Opt_hb_none,
164 Opt_hb_local, 164 Opt_hb_local,
165 Opt_hb_global,
165 Opt_data_ordered, 166 Opt_data_ordered,
166 Opt_data_writeback, 167 Opt_data_writeback,
167 Opt_atime_quantum, 168 Opt_atime_quantum,
@@ -177,6 +178,8 @@ enum {
177 Opt_noacl, 178 Opt_noacl,
178 Opt_usrquota, 179 Opt_usrquota,
179 Opt_grpquota, 180 Opt_grpquota,
181 Opt_coherency_buffered,
182 Opt_coherency_full,
180 Opt_resv_level, 183 Opt_resv_level,
181 Opt_dir_resv_level, 184 Opt_dir_resv_level,
182 Opt_err, 185 Opt_err,
@@ -190,6 +193,7 @@ static const match_table_t tokens = {
190 {Opt_nointr, "nointr"}, 193 {Opt_nointr, "nointr"},
191 {Opt_hb_none, OCFS2_HB_NONE}, 194 {Opt_hb_none, OCFS2_HB_NONE},
192 {Opt_hb_local, OCFS2_HB_LOCAL}, 195 {Opt_hb_local, OCFS2_HB_LOCAL},
196 {Opt_hb_global, OCFS2_HB_GLOBAL},
193 {Opt_data_ordered, "data=ordered"}, 197 {Opt_data_ordered, "data=ordered"},
194 {Opt_data_writeback, "data=writeback"}, 198 {Opt_data_writeback, "data=writeback"},
195 {Opt_atime_quantum, "atime_quantum=%u"}, 199 {Opt_atime_quantum, "atime_quantum=%u"},
@@ -205,6 +209,8 @@ static const match_table_t tokens = {
205 {Opt_noacl, "noacl"}, 209 {Opt_noacl, "noacl"},
206 {Opt_usrquota, "usrquota"}, 210 {Opt_usrquota, "usrquota"},
207 {Opt_grpquota, "grpquota"}, 211 {Opt_grpquota, "grpquota"},
212 {Opt_coherency_buffered, "coherency=buffered"},
213 {Opt_coherency_full, "coherency=full"},
208 {Opt_resv_level, "resv_level=%u"}, 214 {Opt_resv_level, "resv_level=%u"},
209 {Opt_dir_resv_level, "dir_resv_level=%u"}, 215 {Opt_dir_resv_level, "dir_resv_level=%u"},
210 {Opt_err, NULL} 216 {Opt_err, NULL}
@@ -514,11 +520,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
514 520
515 mlog_entry_void(); 521 mlog_entry_void();
516 522
517 for (i = 0; i < NUM_SYSTEM_INODES; i++) { 523 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
518 inode = osb->system_inodes[i]; 524 inode = osb->global_system_inodes[i];
519 if (inode) { 525 if (inode) {
520 iput(inode); 526 iput(inode);
521 osb->system_inodes[i] = NULL; 527 osb->global_system_inodes[i] = NULL;
522 } 528 }
523 } 529 }
524 530
@@ -534,6 +540,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
534 osb->root_inode = NULL; 540 osb->root_inode = NULL;
535 } 541 }
536 542
543 if (!osb->local_system_inodes)
544 goto out;
545
546 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
547 if (osb->local_system_inodes[i]) {
548 iput(osb->local_system_inodes[i]);
549 osb->local_system_inodes[i] = NULL;
550 }
551 }
552
553 kfree(osb->local_system_inodes);
554 osb->local_system_inodes = NULL;
555
556out:
537 mlog_exit(0); 557 mlog_exit(0);
538} 558}
539 559
@@ -608,8 +628,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
608 int ret = 0; 628 int ret = 0;
609 struct mount_options parsed_options; 629 struct mount_options parsed_options;
610 struct ocfs2_super *osb = OCFS2_SB(sb); 630 struct ocfs2_super *osb = OCFS2_SB(sb);
611 631 u32 tmp;
612 lock_kernel();
613 632
614 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 633 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
615 !ocfs2_check_set_options(sb, &parsed_options)) { 634 !ocfs2_check_set_options(sb, &parsed_options)) {
@@ -617,8 +636,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
617 goto out; 636 goto out;
618 } 637 }
619 638
620 if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != 639 tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
621 (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 640 OCFS2_MOUNT_HB_NONE;
641 if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
622 ret = -EINVAL; 642 ret = -EINVAL;
623 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); 643 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
624 goto out; 644 goto out;
@@ -717,7 +737,6 @@ unlock_osb:
717 MS_POSIXACL : 0); 737 MS_POSIXACL : 0);
718 } 738 }
719out: 739out:
720 unlock_kernel();
721 return ret; 740 return ret;
722} 741}
723 742
@@ -809,23 +828,29 @@ bail:
809 828
810static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) 829static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
811{ 830{
812 if (ocfs2_mount_local(osb)) { 831 u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
813 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { 832
833 if (osb->s_mount_opt & hb_enabled) {
834 if (ocfs2_mount_local(osb)) {
814 mlog(ML_ERROR, "Cannot heartbeat on a locally " 835 mlog(ML_ERROR, "Cannot heartbeat on a locally "
815 "mounted device.\n"); 836 "mounted device.\n");
816 return -EINVAL; 837 return -EINVAL;
817 } 838 }
818 } 839 if (ocfs2_userspace_stack(osb)) {
819
820 if (ocfs2_userspace_stack(osb)) {
821 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
822 mlog(ML_ERROR, "Userspace stack expected, but " 840 mlog(ML_ERROR, "Userspace stack expected, but "
823 "o2cb heartbeat arguments passed to mount\n"); 841 "o2cb heartbeat arguments passed to mount\n");
824 return -EINVAL; 842 return -EINVAL;
825 } 843 }
844 if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
845 !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
846 ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
847 ocfs2_cluster_o2cb_global_heartbeat(osb))) {
848 mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
849 return -EINVAL;
850 }
826 } 851 }
827 852
828 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 853 if (!(osb->s_mount_opt & hb_enabled)) {
829 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && 854 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
830 !ocfs2_userspace_stack(osb)) { 855 !ocfs2_userspace_stack(osb)) {
831 mlog(ML_ERROR, "Heartbeat has to be started to mount " 856 mlog(ML_ERROR, "Heartbeat has to be started to mount "
@@ -1291,6 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1291{ 1316{
1292 int status; 1317 int status;
1293 char *p; 1318 char *p;
1319 u32 tmp;
1294 1320
1295 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 1321 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
1296 options ? options : "(none)"); 1322 options ? options : "(none)");
@@ -1322,7 +1348,10 @@ static int ocfs2_parse_options(struct super_block *sb,
1322 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; 1348 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
1323 break; 1349 break;
1324 case Opt_hb_none: 1350 case Opt_hb_none:
1325 mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; 1351 mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
1352 break;
1353 case Opt_hb_global:
1354 mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
1326 break; 1355 break;
1327 case Opt_barrier: 1356 case Opt_barrier:
1328 if (match_int(&args[0], &option)) { 1357 if (match_int(&args[0], &option)) {
@@ -1438,6 +1467,12 @@ static int ocfs2_parse_options(struct super_block *sb,
1438 case Opt_grpquota: 1467 case Opt_grpquota:
1439 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1468 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1440 break; 1469 break;
1470 case Opt_coherency_buffered:
1471 mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
1472 break;
1473 case Opt_coherency_full:
1474 mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
1475 break;
1441 case Opt_acl: 1476 case Opt_acl:
1442 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1477 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1443 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; 1478 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1477,6 +1512,15 @@ static int ocfs2_parse_options(struct super_block *sb,
1477 } 1512 }
1478 } 1513 }
1479 1514
1515 /* Ensure only one heartbeat mode */
1516 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
1517 OCFS2_MOUNT_HB_NONE);
1518 if (hweight32(tmp) != 1) {
1519 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1520 status = 0;
1521 goto bail;
1522 }
1523
1480 status = 1; 1524 status = 1;
1481 1525
1482bail: 1526bail:
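
The hweight32() test above is a compact way to demand exactly one of the three heartbeat flags. An equivalent userspace check using the GCC/Clang popcount builtin; the flag values are the ones this patch defines:

#include <stdio.h>

#define HB_LOCAL  (1u << 0)
#define HB_NONE   (1u << 13)
#define HB_GLOBAL (1u << 14)

static int one_hb_mode(unsigned int mount_opt)
{
	unsigned int hb = mount_opt & (HB_LOCAL | HB_NONE | HB_GLOBAL);

	return __builtin_popcount(hb) == 1;	/* kernel: hweight32(tmp) == 1 */
}

int main(void)
{
	printf("%d %d\n", one_hb_mode(HB_LOCAL),
	       one_hb_mode(HB_LOCAL | HB_GLOBAL));	/* prints: 1 0 */
	return 0;
}
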
@@ -1490,10 +1534,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1490 unsigned long opts = osb->s_mount_opt; 1534 unsigned long opts = osb->s_mount_opt;
1491 unsigned int local_alloc_megs; 1535 unsigned int local_alloc_megs;
1492 1536
1493 if (opts & OCFS2_MOUNT_HB_LOCAL) 1537 if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
1494 seq_printf(s, ",_netdev,heartbeat=local"); 1538 seq_printf(s, ",_netdev");
1495 else 1539 if (opts & OCFS2_MOUNT_HB_LOCAL)
1496 seq_printf(s, ",heartbeat=none"); 1540 seq_printf(s, ",%s", OCFS2_HB_LOCAL);
1541 else
1542 seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
1543 } else
1544 seq_printf(s, ",%s", OCFS2_HB_NONE);
1497 1545
1498 if (opts & OCFS2_MOUNT_NOINTR) 1546 if (opts & OCFS2_MOUNT_NOINTR)
1499 seq_printf(s, ",nointr"); 1547 seq_printf(s, ",nointr");
@@ -1536,6 +1584,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1536 if (opts & OCFS2_MOUNT_GRPQUOTA) 1584 if (opts & OCFS2_MOUNT_GRPQUOTA)
1537 seq_printf(s, ",grpquota"); 1585 seq_printf(s, ",grpquota");
1538 1586
1587 if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
1588 seq_printf(s, ",coherency=buffered");
1589 else
1590 seq_printf(s, ",coherency=full");
1591
1539 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1592 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1540 seq_printf(s, ",nouser_xattr"); 1593 seq_printf(s, ",nouser_xattr");
1541 else 1594 else
@@ -1640,13 +1693,9 @@ static void ocfs2_put_super(struct super_block *sb)
1640{ 1693{
1641 mlog_entry("(0x%p)\n", sb); 1694 mlog_entry("(0x%p)\n", sb);
1642 1695
1643 lock_kernel();
1644
1645 ocfs2_sync_blockdev(sb); 1696 ocfs2_sync_blockdev(sb);
1646 ocfs2_dismount_volume(sb, 0); 1697 ocfs2_dismount_volume(sb, 0);
1647 1698
1648 unlock_kernel();
1649
1650 mlog_exit_void(); 1699 mlog_exit_void();
1651} 1700}
1652 1701
@@ -1990,6 +2039,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
1990 return 0; 2039 return 0;
1991} 2040}
1992 2041
2042/* Make sure entire volume is addressable by our journal. Requires
2043 osb_clusters_at_boot to be valid and for the journal to have been
2044 initialized by ocfs2_journal_init(). */
2045static int ocfs2_journal_addressable(struct ocfs2_super *osb)
2046{
2047 int status = 0;
2048 u64 max_block =
2049 ocfs2_clusters_to_blocks(osb->sb,
2050 osb->osb_clusters_at_boot) - 1;
2051
2052 /* 32-bit block number is always OK. */
2053 if (max_block <= (u32)~0ULL)
2054 goto out;
2055
2056 /* Volume is "huge", so see if our journal is new enough to
2057 support it. */
2058 if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
2059 OCFS2_FEATURE_COMPAT_JBD2_SB) &&
2060 jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
2061 JBD2_FEATURE_INCOMPAT_64BIT))) {
2062 mlog(ML_ERROR, "The journal cannot address the entire volume. "
2063 "Enable the 'block64' journal option with tunefs.ocfs2");
2064 status = -EFBIG;
2065 goto out;
2066 }
2067
2068 out:
2069 return status;
2070}
2071
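
The (u32)~0ULL comparison in ocfs2_journal_addressable() is simply a fits-in-32-bits test on the last block number; only volumes beyond that need JBD2_FEATURE_INCOMPAT_64BIT. A standalone sketch with made-up geometry:

#include <stdint.h>
#include <stdio.h>

static int needs_64bit_journal(uint64_t clusters, unsigned int c2b_shift)
{
	uint64_t max_block = (clusters << c2b_shift) - 1;

	return max_block > 0xFFFFFFFFull;	/* mirrors max_block <= (u32)~0ULL */
}

int main(void)
{
	/* 1 MB clusters, 4 KB blocks: 256 blocks per cluster, shift 8 */
	printf("%d\n", needs_64bit_journal(1ull << 20, 8));	/* 0: fits */
	printf("%d\n", needs_64bit_journal(1ull << 30, 8));	/* 1: "huge" */
	return 0;
}
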
1993static int ocfs2_initialize_super(struct super_block *sb, 2072static int ocfs2_initialize_super(struct super_block *sb,
1994 struct buffer_head *bh, 2073 struct buffer_head *bh,
1995 int sector_size, 2074 int sector_size,
@@ -2002,6 +2081,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2002 struct ocfs2_journal *journal; 2081 struct ocfs2_journal *journal;
2003 __le32 uuid_net_key; 2082 __le32 uuid_net_key;
2004 struct ocfs2_super *osb; 2083 struct ocfs2_super *osb;
2084 u64 total_blocks;
2005 2085
2006 mlog_entry_void(); 2086 mlog_entry_void();
2007 2087
@@ -2060,6 +2140,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
2060 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 2140 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
2061 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 2141 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
2062 2142
2143 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2144 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2145 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2146 osb->max_slots);
2147 status = -EINVAL;
2148 goto bail;
2149 }
2150 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2151
2063 ocfs2_orphan_scan_init(osb); 2152 ocfs2_orphan_scan_init(osb);
2064 2153
2065 status = ocfs2_recovery_init(osb); 2154 status = ocfs2_recovery_init(osb);
@@ -2098,15 +2187,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2098 goto bail; 2187 goto bail;
2099 } 2188 }
2100 2189
2101 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2102 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2103 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2104 osb->max_slots);
2105 status = -EINVAL;
2106 goto bail;
2107 }
2108 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2109
2110 osb->slot_recovery_generations = 2190 osb->slot_recovery_generations =
2111 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), 2191 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
2112 GFP_KERNEL); 2192 GFP_KERNEL);
@@ -2149,7 +2229,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2149 goto bail; 2229 goto bail;
2150 } 2230 }
2151 2231
2152 if (ocfs2_userspace_stack(osb)) { 2232 if (ocfs2_clusterinfo_valid(osb)) {
2233 osb->osb_stackflags =
2234 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2153 memcpy(osb->osb_cluster_stack, 2235 memcpy(osb->osb_cluster_stack,
2154 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2236 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2155 OCFS2_STACK_LABEL_LEN); 2237 OCFS2_STACK_LABEL_LEN);
@@ -2214,11 +2296,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
2214 goto bail; 2296 goto bail;
2215 } 2297 }
2216 2298
2217 if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) 2299 total_blocks = ocfs2_clusters_to_blocks(osb->sb,
2218 > (u32)~0UL) { 2300 le32_to_cpu(di->i_clusters));
2219 mlog(ML_ERROR, "Volume might try to write to blocks beyond " 2301
2220 "what jbd can address in 32 bits.\n"); 2302 status = generic_check_addressable(osb->sb->s_blocksize_bits,
2221 status = -EINVAL; 2303 total_blocks);
2304 if (status) {
2305 mlog(ML_ERROR, "Volume too large "
2306 "to mount safely on this system");
2307 status = -EFBIG;
2222 goto bail; 2308 goto bail;
2223 } 2309 }
2224 2310
@@ -2380,6 +2466,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2380 goto finally; 2466 goto finally;
2381 } 2467 }
2382 2468
2469 /* Now that journal has been initialized, check to make sure
2470 entire volume is addressable. */
2471 status = ocfs2_journal_addressable(osb);
2472 if (status)
2473 goto finally;
2474
2383 /* If the journal was unmounted cleanly then we don't want to 2475 /* If the journal was unmounted cleanly then we don't want to
2384 * recover anything. Otherwise, journal_load will do that 2476 * recover anything. Otherwise, journal_load will do that
2385 * dirty work for us :) */ 2477 * dirty work for us :) */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 32499d213fc4..9975457c981f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
128 } 128 }
129 129
130 /* Fast symlinks can't be large */ 130 /* Fast symlinks can't be large */
131 len = strlen(target); 131 len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
132 link = kzalloc(len + 1, GFP_NOFS); 132 link = kzalloc(len + 1, GFP_NOFS);
133 if (!link) { 133 if (!link) {
134 status = -ENOMEM; 134 status = -ENOMEM;
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index bfe7190cdbf1..902efb23b6a6 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
44 int type, 44 int type,
45 u32 slot); 45 u32 slot);
46 46
47static inline int is_global_system_inode(int type);
48static inline int is_in_system_inode_array(struct ocfs2_super *osb,
49 int type,
50 u32 slot);
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC 47#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; 48static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
54#endif 49#endif
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type)
59 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; 54 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
60} 55}
61 56
62static inline int is_in_system_inode_array(struct ocfs2_super *osb, 57static struct inode **get_local_system_inode(struct ocfs2_super *osb,
63 int type, 58 int type,
64 u32 slot) 59 u32 slot)
65{ 60{
66 return slot == osb->slot_num || is_global_system_inode(type); 61 int index;
62 struct inode **local_system_inodes, **free = NULL;
63
64 BUG_ON(slot == OCFS2_INVALID_SLOT);
65 BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
66 type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
67
68 spin_lock(&osb->osb_lock);
69 local_system_inodes = osb->local_system_inodes;
70 spin_unlock(&osb->osb_lock);
71
72 if (unlikely(!local_system_inodes)) {
73 local_system_inodes = kzalloc(sizeof(struct inode *) *
74 NUM_LOCAL_SYSTEM_INODES *
75 osb->max_slots,
76 GFP_NOFS);
77 if (!local_system_inodes) {
78 mlog_errno(-ENOMEM);
79 /*
 80 * return NULL here so that ocfs2_get_system_file_inode
81 * will try to create an inode and use it. We will try
82 * to initialize local_system_inodes next time.
83 */
84 return NULL;
85 }
86
87 spin_lock(&osb->osb_lock);
88 if (osb->local_system_inodes) {
89 /* Someone has initialized it for us. */
90 free = local_system_inodes;
91 local_system_inodes = osb->local_system_inodes;
92 } else
93 osb->local_system_inodes = local_system_inodes;
94 spin_unlock(&osb->osb_lock);
95 if (unlikely(free))
96 kfree(free);
97 }
98
99 index = (slot * NUM_LOCAL_SYSTEM_INODES) +
100 (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
101
102 return &local_system_inodes[index];
67} 103}
68 104
69struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, 105struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
74 struct inode **arr = NULL; 110 struct inode **arr = NULL;
75 111
76 /* avoid the lookup if cached in local system file array */ 112 /* avoid the lookup if cached in local system file array */
77 if (is_in_system_inode_array(osb, type, slot)) 113 if (is_global_system_inode(type)) {
78 arr = &(osb->system_inodes[type]); 114 arr = &(osb->global_system_inodes[type]);
115 } else
116 arr = get_local_system_inode(osb, type, slot);
79 117
80 if (arr && ((inode = *arr) != NULL)) { 118 if (arr && ((inode = *arr) != NULL)) {
81 /* get a ref in addition to the array ref */ 119 /* get a ref in addition to the array ref */
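
get_local_system_inode() above caches per-slot inodes in one flat, slot-major array. A sketch of the indexing; the enum positions are assumed from the system-inode list in ocfs2_fs.h:

#include <stdio.h>

enum { FIRST_LOCAL_TYPE = 7, NUM_SYSTEM_INODES = 15 };	/* assumed positions */
#define NUM_LOCAL (NUM_SYSTEM_INODES - FIRST_LOCAL_TYPE)

static int local_index(int slot, int type)
{
	/* all of slot 0's local inodes, then slot 1's, and so on */
	return slot * NUM_LOCAL + (type - FIRST_LOCAL_TYPE);
}

int main(void)
{
	printf("slot 2, first local type: index %d\n",
	       local_index(2, FIRST_LOCAL_TYPE));	/* 16 */
	return 0;
}

Allocating NUM_LOCAL * max_slots entries lazily, with racing allocators resolved under osb_lock as in the hunk, keeps mounts with many slots from paying the cost up front.
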
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d03469f61801..67cd43914641 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1286,13 +1286,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
1286 xis.inode_bh = xbs.inode_bh = di_bh; 1286 xis.inode_bh = xbs.inode_bh = di_bh;
1287 di = (struct ocfs2_dinode *)di_bh->b_data; 1287 di = (struct ocfs2_dinode *)di_bh->b_data;
1288 1288
1289 down_read(&oi->ip_xattr_sem);
1290 ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, 1289 ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
1291 buffer_size, &xis); 1290 buffer_size, &xis);
1292 if (ret == -ENODATA && di->i_xattr_loc) 1291 if (ret == -ENODATA && di->i_xattr_loc)
1293 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, 1292 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
1294 buffer_size, &xbs); 1293 buffer_size, &xbs);
1295 up_read(&oi->ip_xattr_sem);
1296 1294
1297 return ret; 1295 return ret;
1298} 1296}
@@ -1316,8 +1314,10 @@ static int ocfs2_xattr_get(struct inode *inode,
1316 mlog_errno(ret); 1314 mlog_errno(ret);
1317 return ret; 1315 return ret;
1318 } 1316 }
1317 down_read(&OCFS2_I(inode)->ip_xattr_sem);
1319 ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, 1318 ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
1320 name, buffer, buffer_size); 1319 name, buffer, buffer_size);
1320 up_read(&OCFS2_I(inode)->ip_xattr_sem);
1321 1321
1322 ocfs2_inode_unlock(inode, 0); 1322 ocfs2_inode_unlock(inode, 0);
1323 1323
@@ -7081,7 +7081,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
7081 goto out; 7081 goto out;
7082 } 7082 }
7083 7083
7084 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) 7084 if (!indexed)
7085 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); 7085 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
7086 else 7086 else
7087 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); 7087 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 79fbf3f390f0..0a8b0ad0c7e2 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -352,6 +352,7 @@ static void part_release(struct device *dev)
352{ 352{
353 struct hd_struct *p = dev_to_part(dev); 353 struct hd_struct *p = dev_to_part(dev);
354 free_part_stats(p); 354 free_part_stats(p);
355 free_part_info(p);
355 kfree(p); 356 kfree(p);
356} 357}
357 358
@@ -401,7 +402,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
401 whole_disk_show, NULL); 402 whole_disk_show, NULL);
402 403
403struct hd_struct *add_partition(struct gendisk *disk, int partno, 404struct hd_struct *add_partition(struct gendisk *disk, int partno,
404 sector_t start, sector_t len, int flags) 405 sector_t start, sector_t len, int flags,
406 struct partition_meta_info *info)
405{ 407{
406 struct hd_struct *p; 408 struct hd_struct *p;
407 dev_t devt = MKDEV(0, 0); 409 dev_t devt = MKDEV(0, 0);
@@ -438,6 +440,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
438 p->partno = partno; 440 p->partno = partno;
439 p->policy = get_disk_ro(disk); 441 p->policy = get_disk_ro(disk);
440 442
443 if (info) {
444 struct partition_meta_info *pinfo = alloc_part_info(disk);
445 if (!pinfo)
446 goto out_free_stats;
447 memcpy(pinfo, info, sizeof(*info));
448 p->info = pinfo;
449 }
450
441 dname = dev_name(ddev); 451 dname = dev_name(ddev);
442 if (isdigit(dname[strlen(dname) - 1])) 452 if (isdigit(dname[strlen(dname) - 1]))
443 dev_set_name(pdev, "%sp%d", dname, partno); 453 dev_set_name(pdev, "%sp%d", dname, partno);
@@ -451,7 +461,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
451 461
452 err = blk_alloc_devt(p, &devt); 462 err = blk_alloc_devt(p, &devt);
453 if (err) 463 if (err)
454 goto out_free_stats; 464 goto out_free_info;
455 pdev->devt = devt; 465 pdev->devt = devt;
456 466
457 /* delay uevent until 'holders' subdir is created */ 467 /* delay uevent until 'holders' subdir is created */
@@ -481,6 +491,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
481 491
482 return p; 492 return p;
483 493
494out_free_info:
495 free_part_info(p);
484out_free_stats: 496out_free_stats:
485 free_part_stats(p); 497 free_part_stats(p);
486out_free: 498out_free:
@@ -513,14 +525,14 @@ void register_disk(struct gendisk *disk)
513 525
514 if (device_add(ddev)) 526 if (device_add(ddev))
515 return; 527 return;
516#ifndef CONFIG_SYSFS_DEPRECATED 528 if (!sysfs_deprecated) {
517 err = sysfs_create_link(block_depr, &ddev->kobj, 529 err = sysfs_create_link(block_depr, &ddev->kobj,
518 kobject_name(&ddev->kobj)); 530 kobject_name(&ddev->kobj));
519 if (err) { 531 if (err) {
520 device_del(ddev); 532 device_del(ddev);
521 return; 533 return;
534 }
522 } 535 }
523#endif
524 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); 536 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
525 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); 537 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
526 538
@@ -642,6 +654,7 @@ rescan:
642 /* add partitions */ 654 /* add partitions */
643 for (p = 1; p < state->limit; p++) { 655 for (p = 1; p < state->limit; p++) {
644 sector_t size, from; 656 sector_t size, from;
657 struct partition_meta_info *info = NULL;
645 658
646 size = state->parts[p].size; 659 size = state->parts[p].size;
647 if (!size) 660 if (!size)
@@ -675,8 +688,12 @@ rescan:
675 size = get_capacity(disk) - from; 688 size = get_capacity(disk) - from;
676 } 689 }
677 } 690 }
691
692 if (state->parts[p].has_info)
693 info = &state->parts[p].info;
678 part = add_partition(disk, p, from, size, 694 part = add_partition(disk, p, from, size,
679 state->parts[p].flags); 695 state->parts[p].flags,
 696 info);
680 if (IS_ERR(part)) { 697 if (IS_ERR(part)) {
681 printk(KERN_ERR " %s: p%d could not be added: %ld\n", 698 printk(KERN_ERR " %s: p%d could not be added: %ld\n",
682 disk->disk_name, p, -PTR_ERR(part)); 699 disk->disk_name, p, -PTR_ERR(part));
@@ -737,8 +754,7 @@ void del_gendisk(struct gendisk *disk)
737 kobject_put(disk->part0.holder_dir); 754 kobject_put(disk->part0.holder_dir);
738 kobject_put(disk->slave_dir); 755 kobject_put(disk->slave_dir);
739 disk->driverfs_dev = NULL; 756 disk->driverfs_dev = NULL;
740#ifndef CONFIG_SYSFS_DEPRECATED 757 if (!sysfs_deprecated)
741 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); 758 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
742#endif
743 device_del(disk_to_dev(disk)); 759 device_del(disk_to_dev(disk));
744} 760}
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 8e4e103ba216..d68bf4dc3bc2 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -1,5 +1,6 @@
1#include <linux/pagemap.h> 1#include <linux/pagemap.h>
2#include <linux/blkdev.h> 2#include <linux/blkdev.h>
3#include <linux/genhd.h>
3 4
4/* 5/*
5 * add_gd_partition adds a partitions details to the devices partition 6 * add_gd_partition adds a partitions details to the devices partition
@@ -12,6 +13,8 @@ struct parsed_partitions {
12 sector_t from; 13 sector_t from;
13 sector_t size; 14 sector_t size;
14 int flags; 15 int flags;
16 bool has_info;
17 struct partition_meta_info info;
15 } parts[DISK_MAX_PARTS]; 18 } parts[DISK_MAX_PARTS];
16 int next; 19 int next;
17 int limit; 20 int limit;
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index dbb44d4bb8a7..ac0ccb5026a2 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -94,6 +94,7 @@
94 * 94 *
95 ************************************************************/ 95 ************************************************************/
96#include <linux/crc32.h> 96#include <linux/crc32.h>
97#include <linux/ctype.h>
97#include <linux/math64.h> 98#include <linux/math64.h>
98#include <linux/slab.h> 99#include <linux/slab.h>
99#include "check.h" 100#include "check.h"
@@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state)
604 gpt_entry *ptes = NULL; 605 gpt_entry *ptes = NULL;
605 u32 i; 606 u32 i;
606 unsigned ssz = bdev_logical_block_size(state->bdev) / 512; 607 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
608 u8 unparsed_guid[37];
607 609
608 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { 610 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
609 kfree(gpt); 611 kfree(gpt);
@@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state)
614 pr_debug("GUID Partition Table is valid! Yea!\n"); 616 pr_debug("GUID Partition Table is valid! Yea!\n");
615 617
616 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { 618 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
619 struct partition_meta_info *info;
620 unsigned label_count = 0;
621 unsigned label_max;
617 u64 start = le64_to_cpu(ptes[i].starting_lba); 622 u64 start = le64_to_cpu(ptes[i].starting_lba);
618 u64 size = le64_to_cpu(ptes[i].ending_lba) - 623 u64 size = le64_to_cpu(ptes[i].ending_lba) -
619 le64_to_cpu(ptes[i].starting_lba) + 1ULL; 624 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
@@ -627,6 +632,26 @@ int efi_partition(struct parsed_partitions *state)
627 if (!efi_guidcmp(ptes[i].partition_type_guid, 632 if (!efi_guidcmp(ptes[i].partition_type_guid,
628 PARTITION_LINUX_RAID_GUID)) 633 PARTITION_LINUX_RAID_GUID))
629 state->parts[i + 1].flags = ADDPART_FLAG_RAID; 634 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
635
636 info = &state->parts[i + 1].info;
637 /* Instead of doing a manual swap to big endian, reuse the
638 * common ASCII hex format as the interim.
639 */
640 efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
641 part_pack_uuid(unparsed_guid, info->uuid);
642
643 /* Naively convert UTF16-LE to 7 bits. */
644 label_max = min(sizeof(info->volname) - 1,
645 sizeof(ptes[i].partition_name));
646 info->volname[label_max] = 0;
647 while (label_count < label_max) {
648 u8 c = ptes[i].partition_name[label_count] & 0xff;
649 if (c && !isprint(c))
650 c = '!';
651 info->volname[label_count] = c;
652 label_count++;
653 }
654 state->parts[i + 1].has_info = true;
630 } 655 }
631 kfree(ptes); 656 kfree(ptes);
632 kfree(gpt); 657 kfree(gpt);
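
The efi.c hunk carries the real payload of this series: each GPT entry's unique GUID and name are copied into the new partition_meta_info. The label conversion is deliberately naive, keeping only the low byte of each UTF-16LE code unit and masking unprintable bytes to '!'. A standalone sketch of that loop, with hypothetical types standing in for gpt_entry and partition_meta_info:

#include <ctype.h>
#include <stdint.h>
#include <stdio.h>

#define VOLNAME_LEN 64  /* assumed size of info->volname */

/* Naive UTF-16LE -> 7-bit conversion as in the hunk above: take the low
 * byte of each code unit, mask unprintable bytes to '!'.  Multibyte
 * characters are mangled by design; the label is best-effort only. */
static void gpt_label_to_ascii(const uint16_t *name16, size_t nunits,
                               char *volname, size_t volsize)
{
    size_t max = volsize - 1 < nunits ? volsize - 1 : nunits;
    size_t i;

    volname[max] = '\0';
    for (i = 0; i < max; i++) {
        uint8_t c = name16[i] & 0xff;   /* low byte of UTF-16LE unit */
        if (c && !isprint(c))
            c = '!';
        volname[i] = (char)c;
    }
}

int main(void)
{
    /* "Boot" plus U+00E9: the 0xe9 byte is not printable ASCII,
     * so it comes out as '!'. */
    uint16_t label[] = { 'B', 'o', 'o', 't', 0x00e9 };
    char out[VOLNAME_LEN];

    gpt_label_to_ascii(label, sizeof(label) / sizeof(label[0]),
                       out, sizeof(out));
    printf("%s\n", out);    /* prints "Boot!" */
    return 0;
}
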
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 5bf8a04b5d9b..789c625c7aa5 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -5,7 +5,7 @@
5 * Copyright (c) 2001-2007 Anton Altaparmakov 5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> 6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 * 7 *
8 * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/ 8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify it under 10 * This program is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU General Public License as published by the Free Software 11 * the terms of the GNU General Public License as published by the Free Software
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index d1fb50b28d86..374242c0971a 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -5,7 +5,7 @@
5 * Copyright (c) 2001-2007 Anton Altaparmakov 5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> 6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 * 7 *
8 * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/ 8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify it 10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by the Free 11 * under the terms of the GNU General Public License as published by the Free
diff --git a/fs/pipe.c b/fs/pipe.c
index 279eef96c51c..d2d7566ce68e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -382,7 +382,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
382 error = ops->confirm(pipe, buf); 382 error = ops->confirm(pipe, buf);
383 if (error) { 383 if (error) {
384 if (!ret) 384 if (!ret)
385 error = ret; 385 ret = error;
386 break; 386 break;
387 } 387 }
388 388
@@ -954,6 +954,8 @@ static struct inode * get_pipe_inode(void)
954 if (!inode) 954 if (!inode)
955 goto fail_inode; 955 goto fail_inode;
956 956
957 inode->i_ino = get_next_ino();
958
957 pipe = alloc_pipe_info(inode); 959 pipe = alloc_pipe_info(inode);
958 if (!pipe) 960 if (!pipe)
959 goto fail_iput; 961 goto fail_iput;
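
The first pipe.c hunk fixes an inverted assignment: the partial-read idiom returns the bytes already copied if there are any, and the error code only when nothing has been transferred yet. Before the fix, "error = ret" was a dead store that clobbered the error. A reduced sketch of the idiom, with the loop body faked:

#include <errno.h>
#include <stdio.h>

/* Partial-read idiom from pipe_read(): once some bytes have been
 * copied, a later failure must not eat them -- return the byte count
 * and let the caller see the error on its next call. */
static long read_chunks(int nchunks, int failing_chunk)
{
    long ret = 0;
    int i;

    for (i = 0; i < nchunks; i++) {
        int error = (i == failing_chunk) ? -EFAULT : 0;

        if (error) {
            if (!ret)
                ret = error;    /* the fixed line: was "error = ret" */
            break;
        }
        ret += 64;              /* pretend we copied 64 bytes */
    }
    return ret;
}

int main(void)
{
    printf("fail first: %ld\n", read_chunks(4, 0));   /* -14 (EFAULT) */
    printf("fail third: %ld\n", read_chunks(4, 2));   /* 128 bytes */
    return 0;
}
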
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 50f8f0600f06..6a0068841d96 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -33,8 +33,8 @@ config PROC_KCORE
33 depends on PROC_FS && MMU 33 depends on PROC_FS && MMU
34 34
35config PROC_VMCORE 35config PROC_VMCORE
36 bool "/proc/vmcore support (EXPERIMENTAL)" 36 bool "/proc/vmcore support"
37 depends on PROC_FS && CRASH_DUMP 37 depends on PROC_FS && CRASH_DUMP
38 default y 38 default y
39 help 39 help
40 Exports the dump image of crashed kernel in ELF format. 40 Exports the dump image of crashed kernel in ELF format.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a1c43e7c8a7b..f3d02ca461ec 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -226,7 +226,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
226{ 226{
227 struct mm_struct *mm; 227 struct mm_struct *mm;
228 228
229 if (mutex_lock_killable(&task->cred_guard_mutex)) 229 if (mutex_lock_killable(&task->signal->cred_guard_mutex))
230 return NULL; 230 return NULL;
231 231
232 mm = get_task_mm(task); 232 mm = get_task_mm(task);
@@ -235,7 +235,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
235 mmput(mm); 235 mmput(mm);
236 mm = NULL; 236 mm = NULL;
237 } 237 }
238 mutex_unlock(&task->cred_guard_mutex); 238 mutex_unlock(&task->signal->cred_guard_mutex);
239 239
240 return mm; 240 return mm;
241} 241}
@@ -771,6 +771,8 @@ static const struct file_operations proc_single_file_operations = {
771static int mem_open(struct inode* inode, struct file* file) 771static int mem_open(struct inode* inode, struct file* file)
772{ 772{
773 file->private_data = (void*)((long)current->self_exec_id); 773 file->private_data = (void*)((long)current->self_exec_id);
774 /* OK to pass negative loff_t, we can catch out-of-range */
775 file->f_mode |= FMODE_UNSIGNED_OFFSET;
774 return 0; 776 return 0;
775} 777}
776 778
@@ -1023,28 +1025,47 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1023 memset(buffer, 0, sizeof(buffer)); 1025 memset(buffer, 0, sizeof(buffer));
1024 if (count > sizeof(buffer) - 1) 1026 if (count > sizeof(buffer) - 1)
1025 count = sizeof(buffer) - 1; 1027 count = sizeof(buffer) - 1;
1026 if (copy_from_user(buffer, buf, count)) 1028 if (copy_from_user(buffer, buf, count)) {
1027 return -EFAULT; 1029 err = -EFAULT;
1030 goto out;
1031 }
1028 1032
1029 err = strict_strtol(strstrip(buffer), 0, &oom_adjust); 1033 err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
1030 if (err) 1034 if (err)
1031 return -EINVAL; 1035 goto out;
1032 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && 1036 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
1033 oom_adjust != OOM_DISABLE) 1037 oom_adjust != OOM_DISABLE) {
1034 return -EINVAL; 1038 err = -EINVAL;
1039 goto out;
1040 }
1035 1041
1036 task = get_proc_task(file->f_path.dentry->d_inode); 1042 task = get_proc_task(file->f_path.dentry->d_inode);
1037 if (!task) 1043 if (!task) {
1038 return -ESRCH; 1044 err = -ESRCH;
1045 goto out;
1046 }
1047
1048 task_lock(task);
1049 if (!task->mm) {
1050 err = -EINVAL;
1051 goto err_task_lock;
1052 }
1053
1039 if (!lock_task_sighand(task, &flags)) { 1054 if (!lock_task_sighand(task, &flags)) {
1040 put_task_struct(task); 1055 err = -ESRCH;
1041 return -ESRCH; 1056 goto err_task_lock;
1042 } 1057 }
1043 1058
1044 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { 1059 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
1045 unlock_task_sighand(task, &flags); 1060 err = -EACCES;
1046 put_task_struct(task); 1061 goto err_sighand;
1047 return -EACCES; 1062 }
1063
1064 if (oom_adjust != task->signal->oom_adj) {
1065 if (oom_adjust == OOM_DISABLE)
1066 atomic_inc(&task->mm->oom_disable_count);
1067 if (task->signal->oom_adj == OOM_DISABLE)
1068 atomic_dec(&task->mm->oom_disable_count);
1048 } 1069 }
1049 1070
1050 /* 1071 /*
@@ -1065,10 +1086,13 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1065 else 1086 else
1066 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / 1087 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
1067 -OOM_DISABLE; 1088 -OOM_DISABLE;
1089err_sighand:
1068 unlock_task_sighand(task, &flags); 1090 unlock_task_sighand(task, &flags);
1091err_task_lock:
1092 task_unlock(task);
1069 put_task_struct(task); 1093 put_task_struct(task);
1070 1094out:
1071 return count; 1095 return err < 0 ? err : count;
1072} 1096}
1073 1097
1074static const struct file_operations proc_oom_adjust_operations = { 1098static const struct file_operations proc_oom_adjust_operations = {
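
The oom_adjust_write() rework above (and its twin for oom_score_adj below) replaces scattered early returns with a single exit path: every failure sets err and jumps to the label that releases exactly the locks taken so far, and the final return folds success and failure into one expression. A compressed userspace sketch of the unwinding idiom, with pthread mutexes standing in for task_lock and the sighand lock:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t task_lock_m = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t sighand_m = PTHREAD_MUTEX_INITIALIZER;

/* Single-exit error handling: each goto label undoes only what was
 * acquired before the failure point; the success path falls through
 * the same labels. */
static long do_write(int have_mm, int allowed, long count)
{
    long err = 0;

    pthread_mutex_lock(&task_lock_m);
    if (!have_mm) {
        err = -EINVAL;
        goto err_task_lock;
    }

    pthread_mutex_lock(&sighand_m);
    if (!allowed) {
        err = -EACCES;
        goto err_sighand;
    }

    /* ... the actual update would happen here ... */

err_sighand:
    pthread_mutex_unlock(&sighand_m);
err_task_lock:
    pthread_mutex_unlock(&task_lock_m);
    return err < 0 ? err : count;   /* success returns bytes consumed */
}

int main(void)
{
    printf("%ld %ld %ld\n", do_write(1, 1, 8), do_write(0, 1, 8),
           do_write(1, 0, 8));      /* 8 -22 -13 */
    return 0;
}
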
@@ -1109,30 +1133,49 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1109 memset(buffer, 0, sizeof(buffer)); 1133 memset(buffer, 0, sizeof(buffer));
1110 if (count > sizeof(buffer) - 1) 1134 if (count > sizeof(buffer) - 1)
1111 count = sizeof(buffer) - 1; 1135 count = sizeof(buffer) - 1;
1112 if (copy_from_user(buffer, buf, count)) 1136 if (copy_from_user(buffer, buf, count)) {
1113 return -EFAULT; 1137 err = -EFAULT;
1138 goto out;
1139 }
1114 1140
1115 err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); 1141 err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
1116 if (err) 1142 if (err)
1117 return -EINVAL; 1143 goto out;
1118 if (oom_score_adj < OOM_SCORE_ADJ_MIN || 1144 if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
1119 oom_score_adj > OOM_SCORE_ADJ_MAX) 1145 oom_score_adj > OOM_SCORE_ADJ_MAX) {
1120 return -EINVAL; 1146 err = -EINVAL;
1147 goto out;
1148 }
1121 1149
1122 task = get_proc_task(file->f_path.dentry->d_inode); 1150 task = get_proc_task(file->f_path.dentry->d_inode);
1123 if (!task) 1151 if (!task) {
1124 return -ESRCH; 1152 err = -ESRCH;
1153 goto out;
1154 }
1155
1156 task_lock(task);
1157 if (!task->mm) {
1158 err = -EINVAL;
1159 goto err_task_lock;
1160 }
1161
1125 if (!lock_task_sighand(task, &flags)) { 1162 if (!lock_task_sighand(task, &flags)) {
1126 put_task_struct(task); 1163 err = -ESRCH;
1127 return -ESRCH; 1164 goto err_task_lock;
1128 } 1165 }
1166
1129 if (oom_score_adj < task->signal->oom_score_adj && 1167 if (oom_score_adj < task->signal->oom_score_adj &&
1130 !capable(CAP_SYS_RESOURCE)) { 1168 !capable(CAP_SYS_RESOURCE)) {
1131 unlock_task_sighand(task, &flags); 1169 err = -EACCES;
1132 put_task_struct(task); 1170 goto err_sighand;
1133 return -EACCES;
1134 } 1171 }
1135 1172
1173 if (oom_score_adj != task->signal->oom_score_adj) {
1174 if (oom_score_adj == OOM_SCORE_ADJ_MIN)
1175 atomic_inc(&task->mm->oom_disable_count);
1176 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1177 atomic_dec(&task->mm->oom_disable_count);
1178 }
1136 task->signal->oom_score_adj = oom_score_adj; 1179 task->signal->oom_score_adj = oom_score_adj;
1137 /* 1180 /*
1138 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is 1181 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
@@ -1143,14 +1186,19 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1143 else 1186 else
1144 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / 1187 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
1145 OOM_SCORE_ADJ_MAX; 1188 OOM_SCORE_ADJ_MAX;
1189err_sighand:
1146 unlock_task_sighand(task, &flags); 1190 unlock_task_sighand(task, &flags);
1191err_task_lock:
1192 task_unlock(task);
1147 put_task_struct(task); 1193 put_task_struct(task);
1148 return count; 1194out:
1195 return err < 0 ? err : count;
1149} 1196}
1150 1197
1151static const struct file_operations proc_oom_score_adj_operations = { 1198static const struct file_operations proc_oom_score_adj_operations = {
1152 .read = oom_score_adj_read, 1199 .read = oom_score_adj_read,
1153 .write = oom_score_adj_write, 1200 .write = oom_score_adj_write,
1201 .llseek = default_llseek,
1154}; 1202};
1155 1203
1156#ifdef CONFIG_AUDITSYSCALL 1204#ifdef CONFIG_AUDITSYSCALL
@@ -1600,6 +1648,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1600 1648
1601 /* Common stuff */ 1649 /* Common stuff */
1602 ei = PROC_I(inode); 1650 ei = PROC_I(inode);
1651 inode->i_ino = get_next_ino();
1603 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1652 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1604 inode->i_op = &proc_def_inode_operations; 1653 inode->i_op = &proc_def_inode_operations;
1605 1654
@@ -2039,11 +2088,13 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
2039static const struct file_operations proc_fdinfo_file_operations = { 2088static const struct file_operations proc_fdinfo_file_operations = {
2040 .open = nonseekable_open, 2089 .open = nonseekable_open,
2041 .read = proc_fdinfo_read, 2090 .read = proc_fdinfo_read,
2091 .llseek = no_llseek,
2042}; 2092};
2043 2093
2044static const struct file_operations proc_fd_operations = { 2094static const struct file_operations proc_fd_operations = {
2045 .read = generic_read_dir, 2095 .read = generic_read_dir,
2046 .readdir = proc_readfd, 2096 .readdir = proc_readfd,
2097 .llseek = default_llseek,
2047}; 2098};
2048 2099
2049/* 2100/*
@@ -2112,6 +2163,7 @@ static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
2112static const struct file_operations proc_fdinfo_operations = { 2163static const struct file_operations proc_fdinfo_operations = {
2113 .read = generic_read_dir, 2164 .read = generic_read_dir,
2114 .readdir = proc_readfdinfo, 2165 .readdir = proc_readfdinfo,
2166 .llseek = default_llseek,
2115}; 2167};
2116 2168
2117/* 2169/*
@@ -2302,14 +2354,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2302 goto out_free; 2354 goto out_free;
2303 2355
2304 /* Guard against adverse ptrace interaction */ 2356 /* Guard against adverse ptrace interaction */
2305 length = mutex_lock_interruptible(&task->cred_guard_mutex); 2357 length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
2306 if (length < 0) 2358 if (length < 0)
2307 goto out_free; 2359 goto out_free;
2308 2360
2309 length = security_setprocattr(task, 2361 length = security_setprocattr(task,
2310 (char*)file->f_path.dentry->d_name.name, 2362 (char*)file->f_path.dentry->d_name.name,
2311 (void*)page, count); 2363 (void*)page, count);
2312 mutex_unlock(&task->cred_guard_mutex); 2364 mutex_unlock(&task->signal->cred_guard_mutex);
2313out_free: 2365out_free:
2314 free_page((unsigned long) page); 2366 free_page((unsigned long) page);
2315out: 2367out:
@@ -2343,6 +2395,7 @@ static int proc_attr_dir_readdir(struct file * filp,
2343static const struct file_operations proc_attr_dir_operations = { 2395static const struct file_operations proc_attr_dir_operations = {
2344 .read = generic_read_dir, 2396 .read = generic_read_dir,
2345 .readdir = proc_attr_dir_readdir, 2397 .readdir = proc_attr_dir_readdir,
2398 .llseek = default_llseek,
2346}; 2399};
2347 2400
2348static struct dentry *proc_attr_dir_lookup(struct inode *dir, 2401static struct dentry *proc_attr_dir_lookup(struct inode *dir,
@@ -2542,6 +2595,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2542 2595
2543 /* Initialize the inode */ 2596 /* Initialize the inode */
2544 ei = PROC_I(inode); 2597 ei = PROC_I(inode);
2598 inode->i_ino = get_next_ino();
2545 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 2599 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2546 2600
2547 /* 2601 /*
@@ -2675,7 +2729,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2675 INF("auxv", S_IRUSR, proc_pid_auxv), 2729 INF("auxv", S_IRUSR, proc_pid_auxv),
2676 ONE("status", S_IRUGO, proc_pid_status), 2730 ONE("status", S_IRUGO, proc_pid_status),
2677 ONE("personality", S_IRUSR, proc_pid_personality), 2731 ONE("personality", S_IRUSR, proc_pid_personality),
2678 INF("limits", S_IRUSR, proc_pid_limits), 2732 INF("limits", S_IRUGO, proc_pid_limits),
2679#ifdef CONFIG_SCHED_DEBUG 2733#ifdef CONFIG_SCHED_DEBUG
2680 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2734 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2681#endif 2735#endif
@@ -2751,6 +2805,7 @@ static int proc_tgid_base_readdir(struct file * filp,
2751static const struct file_operations proc_tgid_base_operations = { 2805static const struct file_operations proc_tgid_base_operations = {
2752 .read = generic_read_dir, 2806 .read = generic_read_dir,
2753 .readdir = proc_tgid_base_readdir, 2807 .readdir = proc_tgid_base_readdir,
2808 .llseek = default_llseek,
2754}; 2809};
2755 2810
2756static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ 2811static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -3011,7 +3066,7 @@ static const struct pid_entry tid_base_stuff[] = {
3011 INF("auxv", S_IRUSR, proc_pid_auxv), 3066 INF("auxv", S_IRUSR, proc_pid_auxv),
3012 ONE("status", S_IRUGO, proc_pid_status), 3067 ONE("status", S_IRUGO, proc_pid_status),
3013 ONE("personality", S_IRUSR, proc_pid_personality), 3068 ONE("personality", S_IRUSR, proc_pid_personality),
3014 INF("limits", S_IRUSR, proc_pid_limits), 3069 INF("limits", S_IRUGO, proc_pid_limits),
3015#ifdef CONFIG_SCHED_DEBUG 3070#ifdef CONFIG_SCHED_DEBUG
3016 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 3071 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
3017#endif 3072#endif
@@ -3088,6 +3143,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
3088static const struct file_operations proc_tid_base_operations = { 3143static const struct file_operations proc_tid_base_operations = {
3089 .read = generic_read_dir, 3144 .read = generic_read_dir,
3090 .readdir = proc_tid_base_readdir, 3145 .readdir = proc_tid_base_readdir,
3146 .llseek = default_llseek,
3091}; 3147};
3092 3148
3093static const struct inode_operations proc_tid_base_inode_operations = { 3149static const struct inode_operations proc_tid_base_inode_operations = {
@@ -3324,4 +3380,5 @@ static const struct inode_operations proc_task_inode_operations = {
3324static const struct file_operations proc_task_operations = { 3380static const struct file_operations proc_task_operations = {
3325 .read = generic_read_dir, 3381 .read = generic_read_dir,
3326 .readdir = proc_task_readdir, 3382 .readdir = proc_task_readdir,
3383 .llseek = default_llseek,
3327}; 3384};
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5be436ea088e..b652cb00906b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -23,6 +23,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
23 if (!inode) 23 if (!inode)
24 goto out; 24 goto out;
25 25
26 inode->i_ino = get_next_ino();
27
26 sysctl_head_get(head); 28 sysctl_head_get(head);
27 ei = PROC_I(inode); 29 ei = PROC_I(inode);
28 ei->sysctl = head; 30 ei->sysctl = head;
@@ -364,6 +366,7 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
364static const struct file_operations proc_sys_file_operations = { 366static const struct file_operations proc_sys_file_operations = {
365 .read = proc_sys_read, 367 .read = proc_sys_read,
366 .write = proc_sys_write, 368 .write = proc_sys_write,
369 .llseek = default_llseek,
367}; 370};
368 371
369static const struct file_operations proc_sys_dir_file_operations = { 372static const struct file_operations proc_sys_dir_file_operations = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4258384ed22d..93d99b316325 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -179,6 +179,7 @@ static int proc_root_readdir(struct file * filp,
179static const struct file_operations proc_root_operations = { 179static const struct file_operations proc_root_operations = {
180 .read = generic_read_dir, 180 .read = generic_read_dir,
181 .readdir = proc_root_readdir, 181 .readdir = proc_root_readdir,
182 .llseek = default_llseek,
182}; 183};
183 184
184/* 185/*
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 1807c2419f17..37994737c983 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,13 +10,13 @@ static int show_softirqs(struct seq_file *p, void *v)
10{ 10{
11 int i, j; 11 int i, j;
12 12
13 seq_printf(p, " "); 13 seq_printf(p, " ");
14 for_each_possible_cpu(i) 14 for_each_possible_cpu(i)
15 seq_printf(p, "CPU%-8d", i); 15 seq_printf(p, "CPU%-8d", i);
16 seq_printf(p, "\n"); 16 seq_printf(p, "\n");
17 17
18 for (i = 0; i < NR_SOFTIRQS; i++) { 18 for (i = 0; i < NR_SOFTIRQS; i++) {
19 seq_printf(p, "%8s:", softirq_to_name[i]); 19 seq_printf(p, "%12s:", softirq_to_name[i]);
20 for_each_possible_cpu(j) 20 for_each_possible_cpu(j)
21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); 21 seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
22 seq_printf(p, "\n"); 22 seq_printf(p, "\n");
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bf31b03fc275..e15a19c93bae 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -31,7 +31,6 @@ static int show_stat(struct seq_file *p, void *v)
31 u64 sum_softirq = 0; 31 u64 sum_softirq = 0;
32 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; 32 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
33 struct timespec boottime; 33 struct timespec boottime;
34 unsigned int per_irq_sum;
35 34
36 user = nice = system = idle = iowait = 35 user = nice = system = idle = iowait =
37 irq = softirq = steal = cputime64_zero; 36 irq = softirq = steal = cputime64_zero;
@@ -52,9 +51,7 @@ static int show_stat(struct seq_file *p, void *v)
52 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); 51 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
53 guest_nice = cputime64_add(guest_nice, 52 guest_nice = cputime64_add(guest_nice,
54 kstat_cpu(i).cpustat.guest_nice); 53 kstat_cpu(i).cpustat.guest_nice);
55 for_each_irq_nr(j) { 54 sum += kstat_cpu_irqs_sum(i);
56 sum += kstat_irqs_cpu(j, i);
57 }
58 sum += arch_irq_stat_cpu(i); 55 sum += arch_irq_stat_cpu(i);
59 56
60 for (j = 0; j < NR_SOFTIRQS; j++) { 57 for (j = 0; j < NR_SOFTIRQS; j++) {
@@ -110,13 +107,8 @@ static int show_stat(struct seq_file *p, void *v)
110 seq_printf(p, "intr %llu", (unsigned long long)sum); 107 seq_printf(p, "intr %llu", (unsigned long long)sum);
111 108
112 /* sum again ? it could be updated? */ 109 /* sum again ? it could be updated? */
113 for_each_irq_nr(j) { 110 for_each_irq_nr(j)
114 per_irq_sum = 0; 111 seq_printf(p, " %u", kstat_irqs(j));
115 for_each_possible_cpu(i)
116 per_irq_sum += kstat_irqs_cpu(j, i);
117
118 seq_printf(p, " %u", per_irq_sum);
119 }
120 112
121 seq_printf(p, 113 seq_printf(p,
122 "\nctxt %llu\n" 114 "\nctxt %llu\n"
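
Both stat.c hunks replace open-coded nr_irqs-by-nr_cpus loops with helpers the irq layer exports: kstat_cpu_irqs_sum() totals one CPU's interrupts, kstat_irqs() totals one interrupt across CPUs. A toy model of the two reductions over the same per-cpu table:

#include <stdio.h>

#define NR_CPUS 2
#define NR_IRQS 3

/* counts[cpu][irq], standing in for the kernel's per-cpu kstat data */
static unsigned int counts[NR_CPUS][NR_IRQS] = {
    { 10, 0, 5 },
    {  7, 1, 2 },
};

/* analogue of kstat_cpu_irqs_sum(cpu): row sum */
static unsigned long long cpu_irqs_sum(int cpu)
{
    unsigned long long sum = 0;
    for (int irq = 0; irq < NR_IRQS; irq++)
        sum += counts[cpu][irq];
    return sum;
}

/* analogue of kstat_irqs(irq): column sum */
static unsigned int irq_sum(int irq)
{
    unsigned int sum = 0;
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        sum += counts[cpu][irq];
    return sum;
}

int main(void)
{
    unsigned long long total = 0;

    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        total += cpu_irqs_sum(cpu);         /* the "intr <total>" figure */
    printf("intr %llu", total);
    for (int irq = 0; irq < NR_IRQS; irq++) /* per-irq columns */
        printf(" %u", irq_sum(irq));
    printf("\n");                           /* prints "intr 25 17 1 7" */
    return 0;
}
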
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 271afc48b9a5..da6b01d70f01 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -327,6 +327,7 @@ struct mem_size_stats {
327 unsigned long private_clean; 327 unsigned long private_clean;
328 unsigned long private_dirty; 328 unsigned long private_dirty;
329 unsigned long referenced; 329 unsigned long referenced;
330 unsigned long anonymous;
330 unsigned long swap; 331 unsigned long swap;
331 u64 pss; 332 u64 pss;
332}; 333};
@@ -357,19 +358,22 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
357 if (!page) 358 if (!page)
358 continue; 359 continue;
359 360
361 if (PageAnon(page))
362 mss->anonymous += PAGE_SIZE;
363
360 mss->resident += PAGE_SIZE; 364 mss->resident += PAGE_SIZE;
361 /* Accumulate the size in pages that have been accessed. */ 365 /* Accumulate the size in pages that have been accessed. */
362 if (pte_young(ptent) || PageReferenced(page)) 366 if (pte_young(ptent) || PageReferenced(page))
363 mss->referenced += PAGE_SIZE; 367 mss->referenced += PAGE_SIZE;
364 mapcount = page_mapcount(page); 368 mapcount = page_mapcount(page);
365 if (mapcount >= 2) { 369 if (mapcount >= 2) {
366 if (pte_dirty(ptent)) 370 if (pte_dirty(ptent) || PageDirty(page))
367 mss->shared_dirty += PAGE_SIZE; 371 mss->shared_dirty += PAGE_SIZE;
368 else 372 else
369 mss->shared_clean += PAGE_SIZE; 373 mss->shared_clean += PAGE_SIZE;
370 mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; 374 mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
371 } else { 375 } else {
372 if (pte_dirty(ptent)) 376 if (pte_dirty(ptent) || PageDirty(page))
373 mss->private_dirty += PAGE_SIZE; 377 mss->private_dirty += PAGE_SIZE;
374 else 378 else
375 mss->private_clean += PAGE_SIZE; 379 mss->private_clean += PAGE_SIZE;
@@ -410,6 +414,7 @@ static int show_smap(struct seq_file *m, void *v)
410 "Private_Clean: %8lu kB\n" 414 "Private_Clean: %8lu kB\n"
411 "Private_Dirty: %8lu kB\n" 415 "Private_Dirty: %8lu kB\n"
412 "Referenced: %8lu kB\n" 416 "Referenced: %8lu kB\n"
417 "Anonymous: %8lu kB\n"
413 "Swap: %8lu kB\n" 418 "Swap: %8lu kB\n"
414 "KernelPageSize: %8lu kB\n" 419 "KernelPageSize: %8lu kB\n"
415 "MMUPageSize: %8lu kB\n", 420 "MMUPageSize: %8lu kB\n",
@@ -421,6 +426,7 @@ static int show_smap(struct seq_file *m, void *v)
421 mss.private_clean >> 10, 426 mss.private_clean >> 10,
422 mss.private_dirty >> 10, 427 mss.private_dirty >> 10,
423 mss.referenced >> 10, 428 mss.referenced >> 10,
429 mss.anonymous >> 10,
424 mss.swap >> 10, 430 mss.swap >> 10,
425 vma_kernel_pagesize(vma) >> 10, 431 vma_kernel_pagesize(vma) >> 10,
426 vma_mmu_pagesize(vma) >> 10); 432 vma_mmu_pagesize(vma) >> 10);
@@ -539,6 +545,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
539 545
540const struct file_operations proc_clear_refs_operations = { 546const struct file_operations proc_clear_refs_operations = {
541 .write = clear_refs_write, 547 .write = clear_refs_write,
548 .llseek = noop_llseek,
542}; 549};
543 550
544struct pagemapread { 551struct pagemapread {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 91c817ff02c3..2367fb3f70bc 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -163,7 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
163 163
164static const struct file_operations proc_vmcore_operations = { 164static const struct file_operations proc_vmcore_operations = {
165 .read = read_vmcore, 165 .read = read_vmcore,
166 .llseek = generic_file_llseek, 166 .llseek = default_llseek,
167}; 167};
168 168
169static struct vmcore* __init get_new_element(void) 169static struct vmcore* __init get_new_element(void)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6e8fc62b40a8..7b0329468a5d 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,7 +11,6 @@
11 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. 11 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
12 */ 12 */
13 13
14#include <linux/smp_lock.h>
15#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
16#include "qnx4.h" 15#include "qnx4.h"
17 16
@@ -29,8 +28,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
29 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); 28 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
30 QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos)); 29 QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos));
31 30
32 lock_kernel();
33
34 while (filp->f_pos < inode->i_size) { 31 while (filp->f_pos < inode->i_size) {
35 blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS ); 32 blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS );
36 bh = sb_bread(inode->i_sb, blknum); 33 bh = sb_bread(inode->i_sb, blknum);
@@ -71,7 +68,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
71 brelse(bh); 68 brelse(bh);
72 } 69 }
73out: 70out:
74 unlock_kernel();
75 return 0; 71 return 0;
76} 72}
77 73
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 16829722be93..01bad30026fc 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -16,7 +16,6 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/highuid.h> 18#include <linux/highuid.h>
19#include <linux/smp_lock.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
@@ -157,8 +156,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
157 struct super_block *sb = dentry->d_sb; 156 struct super_block *sb = dentry->d_sb;
158 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 157 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
159 158
160 lock_kernel();
161
162 buf->f_type = sb->s_magic; 159 buf->f_type = sb->s_magic;
163 buf->f_bsize = sb->s_blocksize; 160 buf->f_bsize = sb->s_blocksize;
164 buf->f_blocks = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8; 161 buf->f_blocks = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8;
@@ -168,8 +165,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
168 buf->f_fsid.val[0] = (u32)id; 165 buf->f_fsid.val[0] = (u32)id;
169 buf->f_fsid.val[1] = (u32)(id >> 32); 166 buf->f_fsid.val[1] = (u32)(id >> 32);
170 167
171 unlock_kernel();
172
173 return 0; 168 return 0;
174} 169}
175 170
@@ -283,7 +278,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
283 goto outi; 278 goto outi;
284 279
285 brelse(bh); 280 brelse(bh);
286
287 return 0; 281 return 0;
288 282
289 outi: 283 outi:
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 58703ebba879..275327b5615e 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,7 +12,6 @@
12 * 04-07-1998 by Frank Denis : first step for rmdir/unlink. 12 * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
13 */ 13 */
14 14
15#include <linux/smp_lock.h>
16#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
17#include "qnx4.h" 16#include "qnx4.h"
18 17
@@ -109,7 +108,6 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
109 int len = dentry->d_name.len; 108 int len = dentry->d_name.len;
110 struct inode *foundinode = NULL; 109 struct inode *foundinode = NULL;
111 110
112 lock_kernel();
113 if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino))) 111 if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino)))
114 goto out; 112 goto out;
115 /* The entry is linked, let's get the real info */ 113 /* The entry is linked, let's get the real info */
@@ -123,13 +121,11 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
123 121
124 foundinode = qnx4_iget(dir->i_sb, ino); 122 foundinode = qnx4_iget(dir->i_sb, ino);
125 if (IS_ERR(foundinode)) { 123 if (IS_ERR(foundinode)) {
126 unlock_kernel();
127 QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n", 124 QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
128 PTR_ERR(foundinode))); 125 PTR_ERR(foundinode)));
129 return ERR_CAST(foundinode); 126 return ERR_CAST(foundinode);
130 } 127 }
131out: 128out:
132 unlock_kernel();
133 d_add(dentry, foundinode); 129 d_add(dentry, foundinode);
134 130
135 return NULL; 131 return NULL;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a5ebae70dc6d..67fadb1ad2c1 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,6 +58,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
58 struct inode * inode = new_inode(sb); 58 struct inode * inode = new_inode(sb);
59 59
60 if (inode) { 60 if (inode) {
61 inode->i_ino = get_next_ino();
61 inode_init_owner(inode, dir, mode); 62 inode_init_owner(inode, dir, mode);
62 inode->i_mapping->a_ops = &ramfs_aops; 63 inode->i_mapping->a_ops = &ramfs_aops;
63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 64 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
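
A pattern repeated across this series (pipefs, procfs, sysctl, ramfs): new_inode() no longer hands out an inode number, so pseudo filesystems that want one call get_next_ino() themselves -- a cheap counter rather than anything persistent. A sketch of such a counter; the kernel helper shards its counter per CPU to avoid cacheline bouncing, but a single atomic keeps this illustration short:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint last_ino;

/* Sketch of get_next_ino(): unique, non-persistent inode numbers */
static unsigned int get_next_ino_sketch(void)
{
    return atomic_fetch_add(&last_ino, 1) + 1;
}

struct inode_sketch { unsigned int i_ino; };

static void fs_get_inode(struct inode_sketch *inode)
{
    /* what ramfs_get_inode()/get_pipe_inode() now do explicitly */
    inode->i_ino = get_next_ino_sketch();
}

int main(void)
{
    struct inode_sketch a, b;

    fs_get_inode(&a);
    fs_get_inode(&b);
    printf("%u %u\n", a.i_ino, b.i_ino);    /* 1 2 */
    return 0;
}
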
diff --git a/fs/read_write.c b/fs/read_write.c
index 74e36586e4d3..9cd9d148105d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,6 +31,20 @@ const struct file_operations generic_ro_fops = {
31 31
32EXPORT_SYMBOL(generic_ro_fops); 32EXPORT_SYMBOL(generic_ro_fops);
33 33
34static int
35__negative_fpos_check(struct file *file, loff_t pos, size_t count)
36{
37 /*
38 * pos or pos+count is negative here, check overflow.
39 * too big "count" will be caught in rw_verify_area().
40 */
41 if ((pos < 0) && (pos + count < pos))
42 return -EOVERFLOW;
43 if (file->f_mode & FMODE_UNSIGNED_OFFSET)
44 return 0;
45 return -EINVAL;
46}
47
34/** 48/**
35 * generic_file_llseek_unlocked - lockless generic llseek implementation 49 * generic_file_llseek_unlocked - lockless generic llseek implementation
36 * @file: file structure to seek on 50 * @file: file structure to seek on
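
__negative_fpos_check() is the gatekeeper for the new FMODE_UNSIGNED_OFFSET mode: a negative pos is normally -EINVAL, but files that opt in (such as /proc/pid/mem above) may use the full unsigned offset range, as long as pos + count does not wrap. A userspace rendering of the same predicate (the flag's bit value here is illustrative, not the kernel's):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define FMODE_UNSIGNED_OFFSET 0x2000    /* illustrative bit value */

/* Mirror of __negative_fpos_check(): pos (or pos + count) is already
 * known to be negative when this runs.  Wraparound past the top of the
 * offset space is always fatal; otherwise a negative offset is allowed
 * only for files opened with FMODE_UNSIGNED_OFFSET. */
static int negative_fpos_check(unsigned int f_mode, int64_t pos,
                               uint64_t count)
{
    if (pos < 0 && (uint64_t)pos + count < (uint64_t)pos)
        return -EOVERFLOW;
    if (f_mode & FMODE_UNSIGNED_OFFSET)
        return 0;
    return -EINVAL;
}

int main(void)
{
    int64_t pos = -16;

    /* ordinary file: negative offsets rejected */
    printf("%d\n", negative_fpos_check(0, pos, 8));
    /* /proc/pid/mem-style file: negative offset tolerated */
    printf("%d\n", negative_fpos_check(FMODE_UNSIGNED_OFFSET, pos, 8));
    /* wrap: -16 + 32 crosses back past zero -> -EOVERFLOW */
    printf("%d\n", negative_fpos_check(FMODE_UNSIGNED_OFFSET, pos, 32));
    return 0;
}
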
@@ -62,7 +76,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
62 break; 76 break;
63 } 77 }
64 78
65 if (offset < 0 || offset > inode->i_sb->s_maxbytes) 79 if (offset < 0 && __negative_fpos_check(file, offset, 0))
80 return -EINVAL;
81 if (offset > inode->i_sb->s_maxbytes)
66 return -EINVAL; 82 return -EINVAL;
67 83
68 /* Special lock needed here? */ 84 /* Special lock needed here? */
@@ -124,7 +140,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
124{ 140{
125 loff_t retval; 141 loff_t retval;
126 142
127 lock_kernel(); 143 mutex_lock(&file->f_dentry->d_inode->i_mutex);
128 switch (origin) { 144 switch (origin) {
129 case SEEK_END: 145 case SEEK_END:
130 offset += i_size_read(file->f_path.dentry->d_inode); 146 offset += i_size_read(file->f_path.dentry->d_inode);
@@ -137,7 +153,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
137 offset += file->f_pos; 153 offset += file->f_pos;
138 } 154 }
139 retval = -EINVAL; 155 retval = -EINVAL;
140 if (offset >= 0) { 156 if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) {
141 if (offset != file->f_pos) { 157 if (offset != file->f_pos) {
142 file->f_pos = offset; 158 file->f_pos = offset;
143 file->f_version = 0; 159 file->f_version = 0;
@@ -145,7 +161,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
145 retval = offset; 161 retval = offset;
146 } 162 }
147out: 163out:
148 unlock_kernel(); 164 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
149 return retval; 165 return retval;
150} 166}
151EXPORT_SYMBOL(default_llseek); 167EXPORT_SYMBOL(default_llseek);
@@ -156,7 +172,6 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
156 172
157 fn = no_llseek; 173 fn = no_llseek;
158 if (file->f_mode & FMODE_LSEEK) { 174 if (file->f_mode & FMODE_LSEEK) {
159 fn = default_llseek;
160 if (file->f_op && file->f_op->llseek) 175 if (file->f_op && file->f_op->llseek)
161 fn = file->f_op->llseek; 176 fn = file->f_op->llseek;
162 } 177 }
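
This vfs_llseek() hunk explains the many one-line ".llseek = default_llseek" additions scattered through this diff: with the implicit fallback deleted, a file whose file_operations leaves .llseek unset is not seekable at all, so every fops that relied on the old default must now say so explicitly (part of the BKL removal -- default_llseek itself now takes i_mutex instead of lock_kernel(), as shown above). A sketch of the dispatch after the change, with stub types and an illustrative FMODE_LSEEK value:

#include <errno.h>
#include <stdio.h>

#define FMODE_LSEEK 0x4         /* illustrative flag value */

typedef long long loff_t;
struct file;
struct file_operations {
    loff_t (*llseek)(struct file *, loff_t, int);
};
struct file {
    unsigned int f_mode;
    const struct file_operations *f_op;
};

static loff_t no_llseek(struct file *f, loff_t off, int origin)
{ (void)f; (void)off; (void)origin; return -ESPIPE; }

static loff_t my_llseek(struct file *f, loff_t off, int origin)
{ (void)f; (void)origin; return off; }

/* vfs_llseek() after the hunk above: no silent default any more */
static loff_t vfs_llseek_sketch(struct file *file, loff_t off, int origin)
{
    loff_t (*fn)(struct file *, loff_t, int) = no_llseek;

    if (file->f_mode & FMODE_LSEEK) {
        if (file->f_op && file->f_op->llseek)
            fn = file->f_op->llseek;
    }
    return fn(file, off, origin);
}

int main(void)
{
    const struct file_operations with = { .llseek = my_llseek };
    const struct file_operations without = { 0 };
    struct file a = { FMODE_LSEEK, &with };
    struct file b = { FMODE_LSEEK, &without };

    printf("%lld %lld\n", vfs_llseek_sketch(&a, 42, 0),
           vfs_llseek_sketch(&b, 42, 0));   /* 42 -29 (ESPIPE) */
    return 0;
}
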
@@ -222,6 +237,7 @@ bad:
222} 237}
223#endif 238#endif
224 239
240
225/* 241/*
226 * rw_verify_area doesn't like huge counts. We limit 242 * rw_verify_area doesn't like huge counts. We limit
227 * them to something that fits in "int" so that others 243 * them to something that fits in "int" so that others
@@ -239,8 +255,11 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
239 if (unlikely((ssize_t) count < 0)) 255 if (unlikely((ssize_t) count < 0))
240 return retval; 256 return retval;
241 pos = *ppos; 257 pos = *ppos;
242 if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) 258 if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
243 return retval; 259 retval = __negative_fpos_check(file, pos, count);
260 if (retval)
261 return retval;
262 }
244 263
245 if (unlikely(inode->i_flock && mandatory_lock(inode))) { 264 if (unlikely(inode->i_flock && mandatory_lock(inode))) {
246 retval = locks_mandatory_area( 265 retval = locks_mandatory_area(
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 513f431038f9..7cd46666ba2c 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -10,7 +10,8 @@ config REISERFS_FS
10 10
11 In general, ReiserFS is as fast as ext2, but is very efficient with 11 In general, ReiserFS is as fast as ext2, but is very efficient with
12 large directories and small files. Additional patches are needed 12 large directories and small files. Additional patches are needed
13 for NFS and quotas, please see <http://www.namesys.com/> for links. 13 for NFS and quotas, please see
14 <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
14 15
15 It is more easily extended to have features currently found in 16 It is more easily extended to have features currently found in
16 database and keyword search systems than block allocation based file 17 database and keyword search systems than block allocation based file
@@ -18,7 +19,8 @@ config REISERFS_FS
18 plugins consistent with our motto ``It takes more than a license to 19 plugins consistent with our motto ``It takes more than a license to
19 make source code open.'' 20 make source code open.''
20 21
21 Read <http://www.namesys.com/> to learn more about reiserfs. 22 Read <https://reiser4.wiki.kernel.org/index.php/Main_Page>
23 to learn more about reiserfs.
22 24
23 Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com. 25 Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
24 26
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
index 14e8c9d460e5..e2f7a264e3ff 100644
--- a/fs/reiserfs/README
+++ b/fs/reiserfs/README
@@ -43,7 +43,7 @@ to address the fair crediting issue in the next GPL version.)
43[END LICENSING] 43[END LICENSING]
44 44
45Reiserfs is a file system based on balanced tree algorithms, which is 45Reiserfs is a file system based on balanced tree algorithms, which is
46described at http://devlinux.com/namesys. 46described at https://reiser4.wiki.kernel.org/index.php/Main_Page
47 47
48Stop reading here. Go there, then return. 48Stop reading here. Go there, then return.
49 49
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6846371498b6..91f080cc76c8 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -152,8 +152,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
152 barrier_done = reiserfs_commit_for_inode(inode); 152 barrier_done = reiserfs_commit_for_inode(inode);
153 reiserfs_write_unlock(inode->i_sb); 153 reiserfs_write_unlock(inode->i_sb);
154 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) 154 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
155 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 155 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
156 BLKDEV_IFL_WAIT);
157 if (barrier_done < 0) 156 if (barrier_done < 0)
158 return barrier_done; 157 return barrier_done;
159 return (err < 0) ? -EIO : 0; 158 return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index caa758377d66..41656d40dc5c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -22,8 +22,6 @@
22 22
23int reiserfs_commit_write(struct file *f, struct page *page, 23int reiserfs_commit_write(struct file *f, struct page *page,
24 unsigned from, unsigned to); 24 unsigned from, unsigned to);
25int reiserfs_prepare_write(struct file *f, struct page *page,
26 unsigned from, unsigned to);
27 25
28void reiserfs_evict_inode(struct inode *inode) 26void reiserfs_evict_inode(struct inode *inode)
29{ 27{
@@ -165,7 +163,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
165** but tail is still sitting in a direct item, and we can't write to 163** but tail is still sitting in a direct item, and we can't write to
166** it. So, look through this page, and check all the mapped buffers 164** it. So, look through this page, and check all the mapped buffers
167** to make sure they have valid block numbers. Any that don't need 165** to make sure they have valid block numbers. Any that don't need
168** to be unmapped, so that block_prepare_write will correctly call 166** to be unmapped, so that __block_write_begin will correctly call
169** reiserfs_get_block to convert the tail into an unformatted node 167** reiserfs_get_block to convert the tail into an unformatted node
170*/ 168*/
171static inline void fix_tail_page_for_writing(struct page *page) 169static inline void fix_tail_page_for_writing(struct page *page)
@@ -439,13 +437,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
439} 437}
440 438
441/* special version of get_block that is only used by grab_tail_page right 439/* special version of get_block that is only used by grab_tail_page right
442** now. It is sent to block_prepare_write, and when you try to get a 440** now. It is sent to __block_write_begin, and when you try to get a
443** block past the end of the file (or a block from a hole) it returns 441** block past the end of the file (or a block from a hole) it returns
444** -ENOENT instead of a valid buffer. block_prepare_write expects to 442** -ENOENT instead of a valid buffer. __block_write_begin expects to
445** be able to do i/o on the buffers returned, unless an error value 443** be able to do i/o on the buffers returned, unless an error value
446** is also returned. 444** is also returned.
447** 445**
448** So, this allows block_prepare_write to be used for reading a single block 446** So, this allows __block_write_begin to be used for reading a single block
449** in a page. Where it does not produce a valid page for holes, or past the 447** in a page. Where it does not produce a valid page for holes, or past the
450** end of the file. This turns out to be exactly what we need for reading 448** end of the file. This turns out to be exactly what we need for reading
451** tails for conversion. 449** tails for conversion.
@@ -558,11 +556,12 @@ static int convert_tail_for_hole(struct inode *inode,
558 ** 556 **
559 ** We must fix the tail page for writing because it might have buffers 557 ** We must fix the tail page for writing because it might have buffers
560 ** that are mapped, but have a block number of 0. This indicates tail 558 ** that are mapped, but have a block number of 0. This indicates tail
561 ** data that has been read directly into the page, and block_prepare_write 559 ** data that has been read directly into the page, and
562 ** won't trigger a get_block in this case. 560 ** __block_write_begin won't trigger a get_block in this case.
563 */ 561 */
564 fix_tail_page_for_writing(tail_page); 562 fix_tail_page_for_writing(tail_page);
565 retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end); 563 retval = __reiserfs_write_begin(tail_page, tail_start,
564 tail_end - tail_start);
566 if (retval) 565 if (retval)
567 goto unlock; 566 goto unlock;
568 567
@@ -2033,7 +2032,7 @@ static int grab_tail_page(struct inode *inode,
2033 /* start within the page of the last block in the file */ 2032 /* start within the page of the last block in the file */
2034 start = (offset / blocksize) * blocksize; 2033 start = (offset / blocksize) * blocksize;
2035 2034
2036 error = block_prepare_write(page, start, offset, 2035 error = __block_write_begin(page, start, offset - start,
2037 reiserfs_get_block_create_0); 2036 reiserfs_get_block_create_0);
2038 if (error) 2037 if (error)
2039 goto unlock; 2038 goto unlock;
@@ -2438,7 +2437,7 @@ static int reiserfs_write_full_page(struct page *page,
2438 /* from this point on, we know the buffer is mapped to a 2437 /* from this point on, we know the buffer is mapped to a
2439 * real block and not a direct item 2438 * real block and not a direct item
2440 */ 2439 */
2441 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 2440 if (wbc->sync_mode != WB_SYNC_NONE) {
2442 lock_buffer(bh); 2441 lock_buffer(bh);
2443 } else { 2442 } else {
2444 if (!trylock_buffer(bh)) { 2443 if (!trylock_buffer(bh)) {
@@ -2628,8 +2627,7 @@ static int reiserfs_write_begin(struct file *file,
2628 return ret; 2627 return ret;
2629} 2628}
2630 2629
2631int reiserfs_prepare_write(struct file *f, struct page *page, 2630int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
2632 unsigned from, unsigned to)
2633{ 2631{
2634 struct inode *inode = page->mapping->host; 2632 struct inode *inode = page->mapping->host;
2635 int ret; 2633 int ret;
@@ -2650,7 +2648,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
2650 th->t_refcount++; 2648 th->t_refcount++;
2651 } 2649 }
2652 2650
2653 ret = block_prepare_write(page, from, to, reiserfs_get_block); 2651 ret = __block_write_begin(page, from, len, reiserfs_get_block);
2654 if (ret && reiserfs_transaction_running(inode->i_sb)) { 2652 if (ret && reiserfs_transaction_running(inode->i_sb)) {
2655 struct reiserfs_transaction_handle *th = current->journal_info; 2653 struct reiserfs_transaction_handle *th = current->journal_info;
2656 /* this gets a little ugly. If reiserfs_get_block returned an 2654 /* this gets a little ugly. If reiserfs_get_block returned an
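
Across inode.c, ioctl.c and xattr.c, reiserfs_prepare_write(file, page, from, to) becomes __reiserfs_write_begin(page, from, len), tracking the VFS-wide switch from block_prepare_write() to __block_write_begin(): the always-NULL file argument is dropped and the (from, to) pair collapses into (from, len). The conversion is mechanical but easy to get off by one; a tiny check of the shapes used by the callers above:

#include <assert.h>

/* (from, to) -> (from, len) conversion used when moving callers from
 * block_prepare_write()-style arguments to __block_write_begin()-style */
static unsigned range_len(unsigned from, unsigned to)
{
    assert(to >= from);
    return to - from;
}

int main(void)
{
    /* grab_tail_page(): was (page, start, offset),
     * now (page, start, offset - start) */
    assert(range_len(512, 520) == 8);
    /* reiserfs_unpack(): was (page, write_from, write_from) -- an
     * empty range, so the new length argument is simply 0 */
    assert(range_len(100, 100) == 0);
    return 0;
}
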
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index f53505de0712..adf22b485cea 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -160,8 +160,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
160 160
161int reiserfs_commit_write(struct file *f, struct page *page, 161int reiserfs_commit_write(struct file *f, struct page *page,
162 unsigned from, unsigned to); 162 unsigned from, unsigned to);
163int reiserfs_prepare_write(struct file *f, struct page *page,
164 unsigned from, unsigned to);
165/* 163/*
166** reiserfs_unpack 164** reiserfs_unpack
167** Function try to convert tail from direct item into indirect. 165** Function try to convert tail from direct item into indirect.
@@ -170,6 +168,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
170int reiserfs_unpack(struct inode *inode, struct file *filp) 168int reiserfs_unpack(struct inode *inode, struct file *filp)
171{ 169{
172 int retval = 0; 170 int retval = 0;
171 int depth;
173 int index; 172 int index;
174 struct page *page; 173 struct page *page;
175 struct address_space *mapping; 174 struct address_space *mapping;
@@ -188,8 +187,8 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
188 /* we need to make sure nobody is changing the file size beneath 187 /* we need to make sure nobody is changing the file size beneath
189 ** us 188 ** us
190 */ 189 */
191 mutex_lock(&inode->i_mutex); 190 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
192 reiserfs_write_lock(inode->i_sb); 191 depth = reiserfs_write_lock_once(inode->i_sb);
193 192
194 write_from = inode->i_size & (blocksize - 1); 193 write_from = inode->i_size & (blocksize - 1);
195 /* if we are on a block boundary, we are already unpacked. */ 194 /* if we are on a block boundary, we are already unpacked. */
@@ -199,7 +198,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
199 } 198 }
200 199
201 /* we unpack by finding the page with the tail, and calling 200 /* we unpack by finding the page with the tail, and calling
202 ** reiserfs_prepare_write on that page. This will force a 201 ** __reiserfs_write_begin on that page. This will force a
203 ** reiserfs_get_block to unpack the tail for us. 202 ** reiserfs_get_block to unpack the tail for us.
204 */ 203 */
205 index = inode->i_size >> PAGE_CACHE_SHIFT; 204 index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -209,7 +208,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
209 if (!page) { 208 if (!page) {
210 goto out; 209 goto out;
211 } 210 }
212 retval = reiserfs_prepare_write(NULL, page, write_from, write_from); 211 retval = __reiserfs_write_begin(page, write_from, 0);
213 if (retval) 212 if (retval)
214 goto out_unlock; 213 goto out_unlock;
215 214
@@ -224,6 +223,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
224 223
225 out: 224 out:
226 mutex_unlock(&inode->i_mutex); 225 mutex_unlock(&inode->i_mutex);
227 reiserfs_write_unlock(inode->i_sb); 226 reiserfs_write_unlock_once(inode->i_sb, depth);
228 return retval; 227 return retval;
229} 228}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 812e2c05aa29..076c8b194682 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -138,13 +138,6 @@ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
138 return 0; 138 return 0;
139} 139}
140 140
141static void disable_barrier(struct super_block *s)
142{
143 REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
144 printk("reiserfs: disabling flush barriers on %s\n",
145 reiserfs_bdevname(s));
146}
147
148static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block 141static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
149 *sb) 142 *sb)
150{ 143{
@@ -677,30 +670,6 @@ static void submit_ordered_buffer(struct buffer_head *bh)
677 submit_bh(WRITE, bh); 670 submit_bh(WRITE, bh);
678} 671}
679 672
680static int submit_barrier_buffer(struct buffer_head *bh)
681{
682 get_bh(bh);
683 bh->b_end_io = reiserfs_end_ordered_io;
684 clear_buffer_dirty(bh);
685 if (!buffer_uptodate(bh))
686 BUG();
687 return submit_bh(WRITE_BARRIER, bh);
688}
689
690static void check_barrier_completion(struct super_block *s,
691 struct buffer_head *bh)
692{
693 if (buffer_eopnotsupp(bh)) {
694 clear_buffer_eopnotsupp(bh);
695 disable_barrier(s);
696 set_buffer_uptodate(bh);
697 set_buffer_dirty(bh);
698 reiserfs_write_unlock(s);
699 sync_dirty_buffer(bh);
700 reiserfs_write_lock(s);
701 }
702}
703
704#define CHUNK_SIZE 32 673#define CHUNK_SIZE 32
705struct buffer_chunk { 674struct buffer_chunk {
706 struct buffer_head *bh[CHUNK_SIZE]; 675 struct buffer_head *bh[CHUNK_SIZE];
@@ -1009,7 +978,6 @@ static int flush_commit_list(struct super_block *s,
1009 struct buffer_head *tbh = NULL; 978 struct buffer_head *tbh = NULL;
1010 unsigned int trans_id = jl->j_trans_id; 979 unsigned int trans_id = jl->j_trans_id;
1011 struct reiserfs_journal *journal = SB_JOURNAL(s); 980 struct reiserfs_journal *journal = SB_JOURNAL(s);
1012 int barrier = 0;
1013 int retval = 0; 981 int retval = 0;
1014 int write_len; 982 int write_len;
1015 983
@@ -1094,24 +1062,6 @@ static int flush_commit_list(struct super_block *s,
1094 } 1062 }
1095 atomic_dec(&journal->j_async_throttle); 1063 atomic_dec(&journal->j_async_throttle);
1096 1064
1097 /* We're skipping the commit if there's an error */
1098 if (retval || reiserfs_is_journal_aborted(journal))
1099 barrier = 0;
1100
1101 /* wait on everything written so far before writing the commit
1102 * if we are in barrier mode, send the commit down now
1103 */
1104 barrier = reiserfs_barrier_flush(s);
1105 if (barrier) {
1106 int ret;
1107 lock_buffer(jl->j_commit_bh);
1108 ret = submit_barrier_buffer(jl->j_commit_bh);
1109 if (ret == -EOPNOTSUPP) {
1110 set_buffer_uptodate(jl->j_commit_bh);
1111 disable_barrier(s);
1112 barrier = 0;
1113 }
1114 }
1115 for (i = 0; i < (jl->j_len + 1); i++) { 1065 for (i = 0; i < (jl->j_len + 1); i++) {
1116 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + 1066 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
1117 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); 1067 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
@@ -1143,27 +1093,22 @@ static int flush_commit_list(struct super_block *s,
1143 1093
1144 BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); 1094 BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
1145 1095
1146 if (!barrier) { 1096 /* If there was a write error in the journal - we can't commit
1147 /* If there was a write error in the journal - we can't commit 1097 * this transaction - it will be invalid and, if successful,
1148 * this transaction - it will be invalid and, if successful, 1098 * will just end up propagating the write error out to
1149 * will just end up propagating the write error out to 1099 * the file system. */
1150 * the file system. */ 1100 if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
1151 if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { 1101 if (buffer_dirty(jl->j_commit_bh))
1152 if (buffer_dirty(jl->j_commit_bh)) 1102 BUG();
1153 BUG(); 1103 mark_buffer_dirty(jl->j_commit_bh) ;
1154 mark_buffer_dirty(jl->j_commit_bh) ;
1155 reiserfs_write_unlock(s);
1156 sync_dirty_buffer(jl->j_commit_bh) ;
1157 reiserfs_write_lock(s);
1158 }
1159 } else {
1160 reiserfs_write_unlock(s); 1104 reiserfs_write_unlock(s);
1161 wait_on_buffer(jl->j_commit_bh); 1105 if (reiserfs_barrier_flush(s))
1106 __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
1107 else
1108 sync_dirty_buffer(jl->j_commit_bh);
1162 reiserfs_write_lock(s); 1109 reiserfs_write_lock(s);
1163 } 1110 }
1164 1111
1165 check_barrier_completion(s, jl->j_commit_bh);
1166
1167 /* If there was a write error in the journal - we can't commit this 1112 /* If there was a write error in the journal - we can't commit this
1168 * transaction - it will be invalid and, if successful, will just end 1113 * transaction - it will be invalid and, if successful, will just end
1169 * up propagating the write error out to the filesystem. */ 1114 * up propagating the write error out to the filesystem. */
@@ -1319,26 +1264,15 @@ static int _update_journal_header_block(struct super_block *sb,
1319 jh->j_first_unflushed_offset = cpu_to_le32(offset); 1264 jh->j_first_unflushed_offset = cpu_to_le32(offset);
1320 jh->j_mount_id = cpu_to_le32(journal->j_mount_id); 1265 jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
1321 1266
1322 if (reiserfs_barrier_flush(sb)) { 1267 set_buffer_dirty(journal->j_header_bh);
1323 int ret; 1268 reiserfs_write_unlock(sb);
1324 lock_buffer(journal->j_header_bh); 1269
1325 ret = submit_barrier_buffer(journal->j_header_bh); 1270 if (reiserfs_barrier_flush(sb))
1326 if (ret == -EOPNOTSUPP) { 1271 __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
1327 set_buffer_uptodate(journal->j_header_bh); 1272 else
1328 disable_barrier(sb);
1329 goto sync;
1330 }
1331 reiserfs_write_unlock(sb);
1332 wait_on_buffer(journal->j_header_bh);
1333 reiserfs_write_lock(sb);
1334 check_barrier_completion(sb, journal->j_header_bh);
1335 } else {
1336 sync:
1337 set_buffer_dirty(journal->j_header_bh);
1338 reiserfs_write_unlock(sb);
1339 sync_dirty_buffer(journal->j_header_bh); 1273 sync_dirty_buffer(journal->j_header_bh);
1340 reiserfs_write_lock(sb); 1274
1341 } 1275 reiserfs_write_lock(sb);
1342 if (!buffer_uptodate(journal->j_header_bh)) { 1276 if (!buffer_uptodate(journal->j_header_bh)) {
1343 reiserfs_warning(sb, "journal-837", 1277 reiserfs_warning(sb, "journal-837",
1344 "IO error during journal replay"); 1278 "IO error during journal replay");
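
The journal.c rework deletes the whole -EOPNOTSUPP/disable_barrier() dance: instead of issuing a special barrier write and falling back when the device rejects it, the commit block and journal header go through __sync_dirty_buffer() with WRITE_FLUSH_FUA when barriers are requested, and plain sync_dirty_buffer() otherwise -- the block layer now emulates flush/FUA for devices that lack them, so no fallback path is needed. A reduced stand-in for the new call shape, with stub buffer types and illustrative flag values:

#include <stdbool.h>
#include <stdio.h>

#define WRITE_SYNC       0x1    /* stand-ins for the kernel's rw flags */
#define WRITE_FLUSH_FUA  0x6

struct buffer_head_sketch { const char *name; };

/* stand-in for __sync_dirty_buffer(bh, rw): write and wait, with the
 * caller choosing the cache-control flags */
static void sync_dirty_buffer_rw(struct buffer_head_sketch *bh, int rw)
{
    printf("write %s with%s flush+FUA\n", bh->name,
           (rw & WRITE_FLUSH_FUA) == WRITE_FLUSH_FUA ? "" : "out");
}

static void commit_journal_block(struct buffer_head_sketch *bh,
                                 bool barrier_flush)
{
    /* the post-patch shape: one dirty-buffer write either way; the
     * barrier option only changes the flags, with no -EOPNOTSUPP
     * handling and no disable_barrier() fallback */
    if (barrier_flush)
        sync_dirty_buffer_rw(bh, WRITE_FLUSH_FUA);
    else
        sync_dirty_buffer_rw(bh, WRITE_SYNC);
}

int main(void)
{
    struct buffer_head_sketch commit_bh = { "j_commit_bh" };

    commit_journal_block(&commit_bh, true);
    commit_journal_block(&commit_bh, false);
    return 0;
}
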
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ee78d4a0086a..ba5f51ec3458 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1156,7 +1156,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1156 inode->i_ctime = CURRENT_TIME_SEC; 1156 inode->i_ctime = CURRENT_TIME_SEC;
1157 reiserfs_update_sd(&th, inode); 1157 reiserfs_update_sd(&th, inode);
1158 1158
1159 atomic_inc(&inode->i_count); 1159 ihold(inode);
1160 d_instantiate(dentry, inode); 1160 d_instantiate(dentry, inode);
1161 retval = journal_end(&th, dir->i_sb, jbegin_count); 1161 retval = journal_end(&th, dir->i_sb, jbegin_count);
1162 reiserfs_write_unlock(dir->i_sb); 1162 reiserfs_write_unlock(dir->i_sb);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8c4cf273c672..5d04a7828e7a 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -418,13 +418,11 @@ static inline __u32 xattr_hash(const char *msg, int len)
418 418
419int reiserfs_commit_write(struct file *f, struct page *page, 419int reiserfs_commit_write(struct file *f, struct page *page,
420 unsigned from, unsigned to); 420 unsigned from, unsigned to);
421int reiserfs_prepare_write(struct file *f, struct page *page,
422 unsigned from, unsigned to);
423 421
424static void update_ctime(struct inode *inode) 422static void update_ctime(struct inode *inode)
425{ 423{
426 struct timespec now = current_fs_time(inode->i_sb); 424 struct timespec now = current_fs_time(inode->i_sb);
427 if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink || 425 if (inode_unhashed(inode) || !inode->i_nlink ||
428 timespec_equal(&inode->i_ctime, &now)) 426 timespec_equal(&inode->i_ctime, &now))
429 return; 427 return;
430 428
@@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
532 rxh->h_hash = cpu_to_le32(xahash); 530 rxh->h_hash = cpu_to_le32(xahash);
533 } 531 }
534 532
535 err = reiserfs_prepare_write(NULL, page, page_offset, 533 err = __reiserfs_write_begin(page, page_offset, chunk + skip);
536 page_offset + chunk + skip);
537 if (!err) { 534 if (!err) {
538 if (buffer) 535 if (buffer)
539 memcpy(data + skip, buffer + buffer_pos, chunk); 536 memcpy(data + skip, buffer + buffer_pos, chunk);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 42d213546894..268580535c92 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -282,6 +282,7 @@ error:
282static const struct file_operations romfs_dir_operations = { 282static const struct file_operations romfs_dir_operations = {
283 .read = generic_read_dir, 283 .read = generic_read_dir,
284 .readdir = romfs_readdir, 284 .readdir = romfs_readdir,
285 .llseek = default_llseek,
285}; 286};
286 287
287static const struct inode_operations romfs_dir_inode_operations = { 288static const struct inode_operations romfs_dir_inode_operations = {
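This is the llseek push-down: file_operations that relied on the implicit, BKL-protected default now name their llseek explicitly (default_llseek for seekable files, noop_llseek where seeking is meaningless, as in the signalfd and timerfd hunks below), so the fallback can eventually go away. A sketch of the pattern with a hypothetical directory:

#include <linux/fs.h>

static int example_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
        return 0;       /* stub: a real readdir emits entries via filldir() */
}

static const struct file_operations example_dir_ops = {
        .read    = generic_read_dir,
        .readdir = example_readdir,
        .llseek  = default_llseek,      /* explicit, no BKL-protected default */
};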
diff --git a/fs/select.c b/fs/select.c
index 500a669f7790..b7b10aa30861 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv)
67 return slack; 67 return slack;
68} 68}
69 69
70static long estimate_accuracy(struct timespec *tv) 70long select_estimate_accuracy(struct timespec *tv)
71{ 71{
72 unsigned long ret; 72 unsigned long ret;
73 struct timespec now; 73 struct timespec now;
@@ -417,7 +417,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
417 } 417 }
418 418
419 if (end_time && !timed_out) 419 if (end_time && !timed_out)
420 slack = estimate_accuracy(end_time); 420 slack = select_estimate_accuracy(end_time);
421 421
422 retval = 0; 422 retval = 0;
423 for (;;) { 423 for (;;) {
@@ -769,7 +769,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
769 } 769 }
770 770
771 if (end_time && !timed_out) 771 if (end_time && !timed_out)
772 slack = estimate_accuracy(end_time); 772 slack = select_estimate_accuracy(end_time);
773 773
774 for (;;) { 774 for (;;) {
775 struct poll_list *walk; 775 struct poll_list *walk;
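estimate_accuracy() loses its static and becomes select_estimate_accuracy(), so code outside fs/select.c can derive the same timer slack from a caller's deadline. A hedged sketch of a waiter using it, mirroring what poll_schedule_timeout() does; my_wait_until() is hypothetical and assumes the declaration lands in linux/poll.h:

#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>

/* sleep until *end (an absolute deadline), with slack scaled to how far
 * away the deadline is */
static int my_wait_until(struct timespec *end)
{
        unsigned long slack = select_estimate_accuracy(end);
        ktime_t expires = timespec_to_ktime(*end);

        set_current_state(TASK_INTERRUPTIBLE);
        return schedule_hrtimeout_range(&expires, slack, HRTIMER_MODE_ABS);
}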
diff --git a/fs/seq_file.c b/fs/seq_file.c
index e1f437be6c3c..05d6b0e78c95 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -131,7 +131,7 @@ Eoverflow:
131 */ 131 */
132ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) 132ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
133{ 133{
134 struct seq_file *m = (struct seq_file *)file->private_data; 134 struct seq_file *m = file->private_data;
135 size_t copied = 0; 135 size_t copied = 0;
136 loff_t pos; 136 loff_t pos;
137 size_t n; 137 size_t n;
@@ -280,7 +280,7 @@ EXPORT_SYMBOL(seq_read);
280 */ 280 */
281loff_t seq_lseek(struct file *file, loff_t offset, int origin) 281loff_t seq_lseek(struct file *file, loff_t offset, int origin)
282{ 282{
283 struct seq_file *m = (struct seq_file *)file->private_data; 283 struct seq_file *m = file->private_data;
284 loff_t retval = -EINVAL; 284 loff_t retval = -EINVAL;
285 285
286 mutex_lock(&m->lock); 286 mutex_lock(&m->lock);
@@ -324,7 +324,7 @@ EXPORT_SYMBOL(seq_lseek);
324 */ 324 */
325int seq_release(struct inode *inode, struct file *file) 325int seq_release(struct inode *inode, struct file *file)
326{ 326{
327 struct seq_file *m = (struct seq_file *)file->private_data; 327 struct seq_file *m = file->private_data;
328 kfree(m->buf); 328 kfree(m->buf);
329 kfree(m); 329 kfree(m);
330 return 0; 330 return 0;
@@ -462,9 +462,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
462 if (size) { 462 if (size) {
463 char *p; 463 char *p;
464 464
465 spin_lock(&dcache_lock);
466 p = __d_path(path, root, buf, size); 465 p = __d_path(path, root, buf, size);
467 spin_unlock(&dcache_lock);
468 res = PTR_ERR(p); 466 res = PTR_ERR(p);
469 if (!IS_ERR(p)) { 467 if (!IS_ERR(p)) {
470 char *end = mangle_path(buf, p, esc); 468 char *end = mangle_path(buf, p, esc);
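Two independent cleanups in seq_file.c: the casts on file->private_data go away because void * converts implicitly in C (the cast is pure noise), and seq_path_root() stops taking dcache_lock around __d_path(), which this series makes unnecessary. A trivial standalone demonstration of the first point, in userspace C for brevity:

#include <stdio.h>

struct seq_file { const char *buf; };

int main(void)
{
        struct seq_file sf = { "hello" };
        void *private_data = &sf;
        struct seq_file *m = private_data;      /* no cast needed in C */

        printf("%s\n", m->buf);
        return 0;
}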
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1c5a6add779d..492465b451dd 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -99,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
99#ifdef __ARCH_SI_TRAPNO 99#ifdef __ARCH_SI_TRAPNO
100 err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); 100 err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
101#endif 101#endif
102#ifdef BUS_MCEERR_AO
103 /*
104 * Other callers might not initialize the si_lsb field,
105 * so check explicitly for the right codes here.
106 */
107 if (kinfo->si_code == BUS_MCEERR_AR ||
108 kinfo->si_code == BUS_MCEERR_AO)
109 err |= __put_user((short) kinfo->si_addr_lsb,
110 &uinfo->ssi_addr_lsb);
111#endif
102 break; 112 break;
103 case __SI_CHLD: 113 case __SI_CHLD:
104 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); 114 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
@@ -206,6 +216,7 @@ static const struct file_operations signalfd_fops = {
206 .release = signalfd_release, 216 .release = signalfd_release,
207 .poll = signalfd_poll, 217 .poll = signalfd_poll,
208 .read = signalfd_read, 218 .read = signalfd_read,
219 .llseek = noop_llseek,
209}; 220};
210 221
211SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, 222SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
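The new __SI_FAULT branch forwards si_addr_lsb for hardware memory errors, where it encodes the least significant valid bit of the reported address (PAGE_SHIFT when a whole page is poisoned). A hedged userspace sketch of consuming it; whether your libc's struct signalfd_siginfo already names ssi_addr_lsb depends on header versions:

#define _GNU_SOURCE             /* for BUS_MCEERR_* on some libcs (assumption) */
#include <sys/signalfd.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        sigset_t mask;
        struct signalfd_siginfo ssi;
        int fd;

        sigemptyset(&mask);
        sigaddset(&mask, SIGBUS);
        sigprocmask(SIG_BLOCK, &mask, NULL);    /* signalfd needs the signal blocked */

        fd = signalfd(-1, &mask, 0);
        if (fd < 0 || read(fd, &ssi, sizeof(ssi)) != sizeof(ssi))
                return 1;

        if (ssi.ssi_signo == SIGBUS &&
            (ssi.ssi_code == BUS_MCEERR_AR || ssi.ssi_code == BUS_MCEERR_AO))
                printf("memory error at %#llx, lsb %u\n",
                       (unsigned long long)ssi.ssi_addr, (unsigned)ssi.ssi_addr_lsb);
        return 0;
}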
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
index e668127c8b2e..2bc24a8c4039 100644
--- a/fs/smbfs/Kconfig
+++ b/fs/smbfs/Kconfig
@@ -1,5 +1,6 @@
1config SMB_FS 1config SMB_FS
2 tristate "SMB file system support (OBSOLETE, please use CIFS)" 2 tristate "SMB file system support (OBSOLETE, please use CIFS)"
3 depends on BKL # probably unfixable
3 depends on INET 4 depends on INET
4 select NLS 5 select NLS
5 help 6 help
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 00a70cab1f36..f678d421e541 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -406,21 +406,15 @@ void
406smb_renew_times(struct dentry * dentry) 406smb_renew_times(struct dentry * dentry)
407{ 407{
408 dget(dentry); 408 dget(dentry);
409 spin_lock(&dentry->d_lock); 409 dentry->d_time = jiffies;
410 for (;;) {
411 struct dentry *parent;
412 410
413 dentry->d_time = jiffies; 411 while (!IS_ROOT(dentry)) {
414 if (IS_ROOT(dentry)) 412 struct dentry *parent = dget_parent(dentry);
415 break;
416 parent = dentry->d_parent;
417 dget(parent);
418 spin_unlock(&dentry->d_lock);
419 dput(dentry); 413 dput(dentry);
420 dentry = parent; 414 dentry = parent;
421 spin_lock(&dentry->d_lock); 415
416 dentry->d_time = jiffies;
422 } 417 }
423 spin_unlock(&dentry->d_lock);
424 dput(dentry); 418 dput(dentry);
425} 419}
426 420
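The rewrite trades open-coded parent chasing under d_lock for dget_parent(), which takes its own reference to a parent that is stable against concurrent rename; dereferencing dentry->d_parent directly is only safe under d_lock. The same transformation is applied to smb_build_path() in proc.c below. The resulting walk-to-root shape, as a sketch:

#include <linux/dcache.h>
#include <linux/jiffies.h>

/* visit a dentry and each of its ancestors, root last */
static void touch_ancestors(struct dentry *dentry)
{
        dget(dentry);
        dentry->d_time = jiffies;

        while (!IS_ROOT(dentry)) {
                struct dentry *parent = dget_parent(dentry); /* pins a stable parent */

                dput(dentry);
                dentry = parent;
                dentry->d_time = jiffies;
        }
        dput(dentry);
}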
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 450c91941988..f6e9ee59757e 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -229,7 +229,6 @@ smb_invalidate_inodes(struct smb_sb_info *server)
229{ 229{
230 VERBOSE("\n"); 230 VERBOSE("\n");
231 shrink_dcache_sb(SB_of(server)); 231 shrink_dcache_sb(SB_of(server));
232 invalidate_inodes(SB_of(server));
233} 232}
234 233
235/* 234/*
@@ -501,6 +500,8 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
501 void *mem; 500 void *mem;
502 static int warn_count; 501 static int warn_count;
503 502
503 lock_kernel();
504
504 if (warn_count < 5) { 505 if (warn_count < 5) {
505 warn_count++; 506 warn_count++;
506 printk(KERN_EMERG "smbfs is deprecated and will be removed" 507 printk(KERN_EMERG "smbfs is deprecated and will be removed"
@@ -621,6 +622,7 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
621 622
622 smb_new_dentry(sb->s_root); 623 smb_new_dentry(sb->s_root);
623 624
625 unlock_kernel();
624 return 0; 626 return 0;
625 627
626out_no_root: 628out_no_root:
@@ -643,9 +645,11 @@ out_wrong_data:
643out_no_data: 645out_no_data:
644 printk(KERN_ERR "smb_fill_super: missing data argument\n"); 646 printk(KERN_ERR "smb_fill_super: missing data argument\n");
645out_fail: 647out_fail:
648 unlock_kernel();
646 return -EINVAL; 649 return -EINVAL;
647out_no_server: 650out_no_server:
648 printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n"); 651 printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n");
652 unlock_kernel();
649 return -ENOMEM; 653 return -ENOMEM;
650} 654}
651 655
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index 71c29b6670b4..3dcf638d4d3a 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -332,16 +332,15 @@ static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
332 * and store it in reversed order [see reverse_string()] 332 * and store it in reversed order [see reverse_string()]
333 */ 333 */
334 dget(entry); 334 dget(entry);
335 spin_lock(&entry->d_lock);
336 while (!IS_ROOT(entry)) { 335 while (!IS_ROOT(entry)) {
337 struct dentry *parent; 336 struct dentry *parent;
338 337
339 if (maxlen < (3<<unicode)) { 338 if (maxlen < (3<<unicode)) {
340 spin_unlock(&entry->d_lock);
341 dput(entry); 339 dput(entry);
342 return -ENAMETOOLONG; 340 return -ENAMETOOLONG;
343 } 341 }
344 342
343 spin_lock(&entry->d_lock);
345 len = server->ops->convert(path, maxlen-2, 344 len = server->ops->convert(path, maxlen-2,
346 entry->d_name.name, entry->d_name.len, 345 entry->d_name.name, entry->d_name.len,
347 server->local_nls, server->remote_nls); 346 server->local_nls, server->remote_nls);
@@ -359,15 +358,12 @@ static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
359 } 358 }
360 *path++ = '\\'; 359 *path++ = '\\';
361 maxlen -= len+1; 360 maxlen -= len+1;
362
363 parent = entry->d_parent;
364 dget(parent);
365 spin_unlock(&entry->d_lock); 361 spin_unlock(&entry->d_lock);
362
363 parent = dget_parent(entry);
366 dput(entry); 364 dput(entry);
367 entry = parent; 365 entry = parent;
368 spin_lock(&entry->d_lock);
369 } 366 }
370 spin_unlock(&entry->d_lock);
371 dput(entry); 367 dput(entry);
372 reverse_string(buf, path-buf); 368 reverse_string(buf, path-buf);
373 369
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 12b933ac6585..0dc340aa2be9 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -230,5 +230,6 @@ failed_read:
230 230
231const struct file_operations squashfs_dir_ops = { 231const struct file_operations squashfs_dir_ops = {
232 .read = generic_read_dir, 232 .read = generic_read_dir,
233 .readdir = squashfs_readdir 233 .readdir = squashfs_readdir,
234 .llseek = default_llseek,
234}; 235};
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 88b4f8606652..07a4f1156048 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -30,7 +30,6 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/smp_lock.h>
34#include <linux/mutex.h> 33#include <linux/mutex.h>
35#include <linux/pagemap.h> 34#include <linux/pagemap.h>
36#include <linux/init.h> 35#include <linux/init.h>
@@ -354,8 +353,6 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
354 353
355static void squashfs_put_super(struct super_block *sb) 354static void squashfs_put_super(struct super_block *sb)
356{ 355{
357 lock_kernel();
358
359 if (sb->s_fs_info) { 356 if (sb->s_fs_info) {
360 struct squashfs_sb_info *sbi = sb->s_fs_info; 357 struct squashfs_sb_info *sbi = sb->s_fs_info;
361 squashfs_cache_delete(sbi->block_cache); 358 squashfs_cache_delete(sbi->block_cache);
@@ -370,8 +367,6 @@ static void squashfs_put_super(struct super_block *sb)
370 kfree(sb->s_fs_info); 367 kfree(sb->s_fs_info);
371 sb->s_fs_info = NULL; 368 sb->s_fs_info = NULL;
372 } 369 }
373
374 unlock_kernel();
375} 370}
376 371
377 372
diff --git a/fs/super.c b/fs/super.c
index 8819e3a7ff20..b9c9869165db 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -273,14 +273,14 @@ void generic_shutdown_super(struct super_block *sb)
273 get_fs_excl(); 273 get_fs_excl();
274 sb->s_flags &= ~MS_ACTIVE; 274 sb->s_flags &= ~MS_ACTIVE;
275 275
276 /* bad name - it should be evict_inodes() */ 276 fsnotify_unmount_inodes(&sb->s_inodes);
277 invalidate_inodes(sb); 277
278 evict_inodes(sb);
278 279
279 if (sop->put_super) 280 if (sop->put_super)
280 sop->put_super(sb); 281 sop->put_super(sb);
281 282
282 /* Forget any remaining inodes */ 283 if (!list_empty(&sb->s_inodes)) {
283 if (invalidate_inodes(sb)) {
284 printk("VFS: Busy inodes after unmount of %s. " 284 printk("VFS: Busy inodes after unmount of %s. "
285 "Self-destruct in 5 seconds. Have a nice day...\n", 285 "Self-destruct in 5 seconds. Have a nice day...\n",
286 sb->s_id); 286 sb->s_id);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 4e321f7353fa..a4759833d62d 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -179,30 +179,14 @@ static void bin_vma_open(struct vm_area_struct *vma)
179 struct bin_buffer *bb = file->private_data; 179 struct bin_buffer *bb = file->private_data;
180 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 180 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
181 181
182 if (!bb->vm_ops || !bb->vm_ops->open) 182 if (!bb->vm_ops)
183 return;
184
185 if (!sysfs_get_active(attr_sd))
186 return;
187
188 bb->vm_ops->open(vma);
189
190 sysfs_put_active(attr_sd);
191}
192
193static void bin_vma_close(struct vm_area_struct *vma)
194{
195 struct file *file = vma->vm_file;
196 struct bin_buffer *bb = file->private_data;
197 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
198
199 if (!bb->vm_ops || !bb->vm_ops->close)
200 return; 183 return;
201 184
202 if (!sysfs_get_active(attr_sd)) 185 if (!sysfs_get_active(attr_sd))
203 return; 186 return;
204 187
205 bb->vm_ops->close(vma); 188 if (bb->vm_ops->open)
189 bb->vm_ops->open(vma);
206 190
207 sysfs_put_active(attr_sd); 191 sysfs_put_active(attr_sd);
208} 192}
@@ -214,13 +198,15 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
214 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 198 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
215 int ret; 199 int ret;
216 200
217 if (!bb->vm_ops || !bb->vm_ops->fault) 201 if (!bb->vm_ops)
218 return VM_FAULT_SIGBUS; 202 return VM_FAULT_SIGBUS;
219 203
220 if (!sysfs_get_active(attr_sd)) 204 if (!sysfs_get_active(attr_sd))
221 return VM_FAULT_SIGBUS; 205 return VM_FAULT_SIGBUS;
222 206
223 ret = bb->vm_ops->fault(vma, vmf); 207 ret = VM_FAULT_SIGBUS;
208 if (bb->vm_ops->fault)
209 ret = bb->vm_ops->fault(vma, vmf);
224 210
225 sysfs_put_active(attr_sd); 211 sysfs_put_active(attr_sd);
226 return ret; 212 return ret;
@@ -236,13 +222,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
236 if (!bb->vm_ops) 222 if (!bb->vm_ops)
237 return VM_FAULT_SIGBUS; 223 return VM_FAULT_SIGBUS;
238 224
239 if (!bb->vm_ops->page_mkwrite)
240 return 0;
241
242 if (!sysfs_get_active(attr_sd)) 225 if (!sysfs_get_active(attr_sd))
243 return VM_FAULT_SIGBUS; 226 return VM_FAULT_SIGBUS;
244 227
245 ret = bb->vm_ops->page_mkwrite(vma, vmf); 228 ret = 0;
229 if (bb->vm_ops->page_mkwrite)
230 ret = bb->vm_ops->page_mkwrite(vma, vmf);
246 231
247 sysfs_put_active(attr_sd); 232 sysfs_put_active(attr_sd);
248 return ret; 233 return ret;
@@ -256,13 +241,15 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
256 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 241 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
257 int ret; 242 int ret;
258 243
259 if (!bb->vm_ops || !bb->vm_ops->access) 244 if (!bb->vm_ops)
260 return -EINVAL; 245 return -EINVAL;
261 246
262 if (!sysfs_get_active(attr_sd)) 247 if (!sysfs_get_active(attr_sd))
263 return -EINVAL; 248 return -EINVAL;
264 249
265 ret = bb->vm_ops->access(vma, addr, buf, len, write); 250 ret = -EINVAL;
251 if (bb->vm_ops->access)
252 ret = bb->vm_ops->access(vma, addr, buf, len, write);
266 253
267 sysfs_put_active(attr_sd); 254 sysfs_put_active(attr_sd);
268 return ret; 255 return ret;
@@ -276,13 +263,15 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
276 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 263 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
277 int ret; 264 int ret;
278 265
279 if (!bb->vm_ops || !bb->vm_ops->set_policy) 266 if (!bb->vm_ops)
280 return 0; 267 return 0;
281 268
282 if (!sysfs_get_active(attr_sd)) 269 if (!sysfs_get_active(attr_sd))
283 return -EINVAL; 270 return -EINVAL;
284 271
285 ret = bb->vm_ops->set_policy(vma, new); 272 ret = 0;
273 if (bb->vm_ops->set_policy)
274 ret = bb->vm_ops->set_policy(vma, new);
286 275
287 sysfs_put_active(attr_sd); 276 sysfs_put_active(attr_sd);
288 return ret; 277 return ret;
@@ -296,13 +285,15 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
296 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 285 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
297 struct mempolicy *pol; 286 struct mempolicy *pol;
298 287
299 if (!bb->vm_ops || !bb->vm_ops->get_policy) 288 if (!bb->vm_ops)
300 return vma->vm_policy; 289 return vma->vm_policy;
301 290
302 if (!sysfs_get_active(attr_sd)) 291 if (!sysfs_get_active(attr_sd))
303 return vma->vm_policy; 292 return vma->vm_policy;
304 293
305 pol = bb->vm_ops->get_policy(vma, addr); 294 pol = vma->vm_policy;
295 if (bb->vm_ops->get_policy)
296 pol = bb->vm_ops->get_policy(vma, addr);
306 297
307 sysfs_put_active(attr_sd); 298 sysfs_put_active(attr_sd);
308 return pol; 299 return pol;
@@ -316,13 +307,15 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
316 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 307 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
317 int ret; 308 int ret;
318 309
319 if (!bb->vm_ops || !bb->vm_ops->migrate) 310 if (!bb->vm_ops)
320 return 0; 311 return 0;
321 312
322 if (!sysfs_get_active(attr_sd)) 313 if (!sysfs_get_active(attr_sd))
323 return 0; 314 return 0;
324 315
325 ret = bb->vm_ops->migrate(vma, from, to, flags); 316 ret = 0;
317 if (bb->vm_ops->migrate)
318 ret = bb->vm_ops->migrate(vma, from, to, flags);
326 319
327 sysfs_put_active(attr_sd); 320 sysfs_put_active(attr_sd);
328 return ret; 321 return ret;
@@ -331,7 +324,6 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
331 324
332static const struct vm_operations_struct bin_vm_ops = { 325static const struct vm_operations_struct bin_vm_ops = {
333 .open = bin_vma_open, 326 .open = bin_vma_open,
334 .close = bin_vma_close,
335 .fault = bin_fault, 327 .fault = bin_fault,
336 .page_mkwrite = bin_page_mkwrite, 328 .page_mkwrite = bin_page_mkwrite,
337 .access = bin_access, 329 .access = bin_access,
@@ -377,6 +369,14 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
377 if (bb->mmapped && bb->vm_ops != vma->vm_ops) 369 if (bb->mmapped && bb->vm_ops != vma->vm_ops)
378 goto out_put; 370 goto out_put;
379 371
372 /*
373 * It is not possible to successfully wrap close.
374 * So error if someone is trying to use close.
375 */
376 rc = -EINVAL;
377 if (vma->vm_ops && vma->vm_ops->close)
378 goto out_put;
379
380 rc = 0; 380 rc = 0;
381 bb->mmapped = 1; 381 bb->mmapped = 1;
382 bb->vm_ops = vma->vm_ops; 382 bb->vm_ops = vma->vm_ops;
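All of the sysfs bin wrappers are reshaped into one pattern: fail fast only when there is no backing vm_ops at all, take the active reference once, and otherwise fall back to the operation's documented default when the underlying hook is absent. ->close() is the exception; as the new comment says, it cannot be wrapped correctly, so mmap() now rejects any vm_ops that has one. The delegate-or-default idiom, as a standalone sketch in userspace C for brevity:

#include <stdio.h>

struct example_vm_ops {
        int (*fault)(void);             /* optional hook */
};

/* delegate to the hook if present, else return this layer's default */
static int wrapped_fault(const struct example_vm_ops *ops)
{
        int ret = -1;                   /* stand-in for VM_FAULT_SIGBUS */

        if (!ops)
                return ret;
        if (ops->fault)
                ret = ops->fault();
        return ret;
}

static int real_fault(void) { return 0; }

int main(void)
{
        struct example_vm_ops with = { .fault = real_fault };
        struct example_vm_ops without = { 0 };

        printf("%d %d %d\n", wrapped_fault(&with),
               wrapped_fault(&without), wrapped_fault(NULL));
        return 0;
}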
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 23c1e598792a..442f34ff1af8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -148,6 +148,65 @@ void sysfs_remove_group(struct kobject * kobj,
148 sysfs_put(sd); 148 sysfs_put(sd);
149} 149}
150 150
151/**
152 * sysfs_merge_group - merge files into a pre-existing attribute group.
153 * @kobj: The kobject containing the group.
154 * @grp: The files to create and the attribute group they belong to.
155 *
156 * This function returns an error if the group doesn't exist or any of the
157 * files already exist in that group, in which case none of the new files
158 * are created.
159 */
160int sysfs_merge_group(struct kobject *kobj,
161 const struct attribute_group *grp)
162{
163 struct sysfs_dirent *dir_sd;
164 int error = 0;
165 struct attribute *const *attr;
166 int i;
167
168 if (grp)
169 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
170 else
171 dir_sd = sysfs_get(kobj->sd);
172 if (!dir_sd)
173 return -ENOENT;
174
175 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
176 error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
177 if (error) {
178 while (--i >= 0)
179 sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name);
180 }
181 sysfs_put(dir_sd);
182
183 return error;
184}
185EXPORT_SYMBOL_GPL(sysfs_merge_group);
186
187/**
188 * sysfs_unmerge_group - remove files from a pre-existing attribute group.
189 * @kobj: The kobject containing the group.
190 * @grp: The files to remove and the attribute group they belong to.
191 */
192void sysfs_unmerge_group(struct kobject *kobj,
193 const struct attribute_group *grp)
194{
195 struct sysfs_dirent *dir_sd;
196 struct attribute *const *attr;
197
198 if (grp)
199 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
200 else
201 dir_sd = sysfs_get(kobj->sd);
202 if (dir_sd) {
203 for (attr = grp->attrs; *attr; ++attr)
204 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
205 sysfs_put(dir_sd);
206 }
207}
208EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
209
151 210
152EXPORT_SYMBOL_GPL(sysfs_create_group); 211EXPORT_SYMBOL_GPL(sysfs_create_group);
153EXPORT_SYMBOL_GPL(sysfs_update_group); 212EXPORT_SYMBOL_GPL(sysfs_update_group);
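sysfs_merge_group() lets one subsystem drop extra files into a group directory another subsystem created; grp->name selects the subdirectory. Note that while the lookup tolerates a NULL grp, the loop dereferences grp->attrs unconditionally, so in practice grp must be non-NULL. A hedged usage sketch; the attribute, its show routine, and the pre-existing "power" group are assumptions:

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static ssize_t extra_show(struct kobject *kobj, struct kobj_attribute *attr,
                          char *buf)
{
        return sprintf(buf, "42\n");
}

static struct kobj_attribute extra_attr = __ATTR(extra, 0444, extra_show, NULL);

static struct attribute *extra_attrs[] = {
        &extra_attr.attr,
        NULL,
};

static const struct attribute_group extra_group = {
        .name  = "power",               /* must already exist on the kobject */
        .attrs = extra_attrs,
};

/* at init:     err = sysfs_merge_group(kobj, &extra_group);
 * at teardown: sysfs_unmerge_group(kobj, &extra_group);     */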
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 33e047b59b8d..11e7f7d11cd0 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -126,7 +126,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
126 126
127 inode->i_ctime = CURRENT_TIME_SEC; 127 inode->i_ctime = CURRENT_TIME_SEC;
128 inode_inc_link_count(inode); 128 inode_inc_link_count(inode);
129 atomic_inc(&inode->i_count); 129 ihold(inode);
130 130
131 return add_nondir(dentry, inode); 131 return add_nondir(dentry, inode);
132} 132}
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b86ab8eff79a..8c4fc1425b3e 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -144,6 +144,7 @@ static const struct file_operations timerfd_fops = {
144 .release = timerfd_release, 144 .release = timerfd_release,
145 .poll = timerfd_poll, 145 .poll = timerfd_poll,
146 .read = timerfd_read, 146 .read = timerfd_read,
147 .llseek = noop_llseek,
147}; 148};
148 149
149static struct file *timerfd_fget(int fd) 150static struct file *timerfd_fget(int fd)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 37fa7ed062d8..02429d81ca33 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -63,7 +63,9 @@ static int do_commit(struct ubifs_info *c)
63 struct ubifs_lp_stats lst; 63 struct ubifs_lp_stats lst;
64 64
65 dbg_cmt("start"); 65 dbg_cmt("start");
66 if (c->ro_media) { 66 ubifs_assert(!c->ro_media && !c->ro_mount);
67
68 if (c->ro_error) {
67 err = -EROFS; 69 err = -EROFS;
68 goto out_up; 70 goto out_up;
69 } 71 }
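This begins a UBIFS-wide split of the overloaded c->ro_media flag into three: ro_mount (the user mounted read-only), ro_media (the underlying UBI volume is read-only), and ro_error (UBIFS turned itself read-only after an error; see ubifs_ro_mode() in io.c below). Write paths now assert that the first two can never reach them and return -EROFS only for the runtime-error case. The convention, sketched with a hypothetical entry point:

static int ubifs_example_write(struct ubifs_info *c)
{
        /* callers must never attempt writes on R/O mounts or R/O media */
        ubifs_assert(!c->ro_media && !c->ro_mount);

        /* but an earlier I/O error may have flipped us read-only at runtime */
        if (c->ro_error)
                return -EROFS;

        return 0;                       /* actual write work elided */
}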
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index c2a68baa782f..0bee4dbffc31 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2239,6 +2239,162 @@ out_free:
2239 return err; 2239 return err;
2240} 2240}
2241 2241
2242/**
2243 * dbg_check_data_nodes_order - check that list of data nodes is sorted.
2244 * @c: UBIFS file-system description object
2245 * @head: the list of nodes ('struct ubifs_scan_node' objects)
2246 *
2247 * This function returns zero if the list of data nodes is sorted correctly,
2248 * and %-EINVAL if not.
2249 */
2250int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
2251{
2252 struct list_head *cur;
2253 struct ubifs_scan_node *sa, *sb;
2254
2255 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2256 return 0;
2257
2258 for (cur = head->next; cur->next != head; cur = cur->next) {
2259 ino_t inuma, inumb;
2260 uint32_t blka, blkb;
2261
2262 cond_resched();
2263 sa = container_of(cur, struct ubifs_scan_node, list);
2264 sb = container_of(cur->next, struct ubifs_scan_node, list);
2265
2266 if (sa->type != UBIFS_DATA_NODE) {
2267 ubifs_err("bad node type %d", sa->type);
2268 dbg_dump_node(c, sa->node);
2269 return -EINVAL;
2270 }
2271 if (sb->type != UBIFS_DATA_NODE) {
2272 ubifs_err("bad node type %d", sb->type);
2273 dbg_dump_node(c, sb->node);
2274 return -EINVAL;
2275 }
2276
2277 inuma = key_inum(c, &sa->key);
2278 inumb = key_inum(c, &sb->key);
2279
2280 if (inuma < inumb)
2281 continue;
2282 if (inuma > inumb) {
2283 ubifs_err("larger inum %lu goes before inum %lu",
2284 (unsigned long)inuma, (unsigned long)inumb);
2285 goto error_dump;
2286 }
2287
2288 blka = key_block(c, &sa->key);
2289 blkb = key_block(c, &sb->key);
2290
2291 if (blka > blkb) {
2292 ubifs_err("larger block %u goes before %u", blka, blkb);
2293 goto error_dump;
2294 }
2295 if (blka == blkb) {
2296 ubifs_err("two data nodes for the same block");
2297 goto error_dump;
2298 }
2299 }
2300
2301 return 0;
2302
2303error_dump:
2304 dbg_dump_node(c, sa->node);
2305 dbg_dump_node(c, sb->node);
2306 return -EINVAL;
2307}
2308
2309/**
2310 * dbg_check_nondata_nodes_order - check that list of data nodes is sorted.
2311 * @c: UBIFS file-system description object
2312 * @head: the list of nodes ('struct ubifs_scan_node' objects)
2313 *
2314 * This function returns zero if the list of non-data nodes is sorted correctly,
2315 * and %-EINVAL if not.
2316 */
2317int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
2318{
2319 struct list_head *cur;
2320 struct ubifs_scan_node *sa, *sb;
2321
2322 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
2323 return 0;
2324
2325 for (cur = head->next; cur->next != head; cur = cur->next) {
2326 ino_t inuma, inumb;
2327 uint32_t hasha, hashb;
2328
2329 cond_resched();
2330 sa = container_of(cur, struct ubifs_scan_node, list);
2331 sb = container_of(cur->next, struct ubifs_scan_node, list);
2332
2333 if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
2334 sa->type != UBIFS_XENT_NODE) {
2335 ubifs_err("bad node type %d", sa->type);
2336 dbg_dump_node(c, sa->node);
2337 return -EINVAL;
2338 }
2339 if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
2340 sa->type != UBIFS_XENT_NODE) {
2341 ubifs_err("bad node type %d", sb->type);
2342 dbg_dump_node(c, sb->node);
2343 return -EINVAL;
2344 }
2345
2346 if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
2347 ubifs_err("non-inode node goes before inode node");
2348 goto error_dump;
2349 }
2350
2351 if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE)
2352 continue;
2353
2354 if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
2355 /* Inode nodes are sorted in descending size order */
2356 if (sa->len < sb->len) {
2357 ubifs_err("smaller inode node goes first");
2358 goto error_dump;
2359 }
2360 continue;
2361 }
2362
2363 /*
2364 * This is either a dentry or xentry, which should be sorted in
2365 * ascending (parent ino, hash) order.
2366 */
2367 inuma = key_inum(c, &sa->key);
2368 inumb = key_inum(c, &sb->key);
2369
2370 if (inuma < inumb)
2371 continue;
2372 if (inuma > inumb) {
2373 ubifs_err("larger inum %lu goes before inum %lu",
2374 (unsigned long)inuma, (unsigned long)inumb);
2375 goto error_dump;
2376 }
2377
2378 hasha = key_block(c, &sa->key);
2379 hashb = key_block(c, &sb->key);
2380
2381 if (hasha > hashb) {
2382 ubifs_err("larger hash %u goes before %u", hasha, hashb);
2383 goto error_dump;
2384 }
2385 }
2386
2387 return 0;
2388
2389error_dump:
2390 ubifs_msg("dumping first node");
2391 dbg_dump_node(c, sa->node);
2392 ubifs_msg("dumping second node");
2393 dbg_dump_node(c, sb->node);
2394 return -EINVAL;
2395 return 0;
2396}
2397
2242static int invocation_cnt; 2398static int invocation_cnt;
2243 2399
2244int dbg_force_in_the_gaps(void) 2400int dbg_force_in_the_gaps(void)
@@ -2625,6 +2781,7 @@ static const struct file_operations dfs_fops = {
2625 .open = open_debugfs_file, 2781 .open = open_debugfs_file,
2626 .write = write_debugfs_file, 2782 .write = write_debugfs_file,
2627 .owner = THIS_MODULE, 2783 .owner = THIS_MODULE,
2784 .llseek = default_llseek,
2628}; 2785};
2629 2786
2630/** 2787/**
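These debugging helpers re-verify a just-sorted list by walking adjacent pairs and rechecking the comparator's invariant, which cheaply catches both comparator bugs and list corruption. (Two blemishes ride along in the hunk: the second type check in dbg_check_nondata_nodes_order() tests sa->type where sb->type is plainly intended, and the final return 0 after return -EINVAL is unreachable.) The adjacent-pair idiom as a standalone sketch, assuming strict (inum, block) ordering with no duplicates:

#include <stdio.h>
#include <stdint.h>

struct node { uint32_t inum, block; };

/* return 0 if n[] is strictly sorted by (inum, block), -1 otherwise */
static int check_order(const struct node *n, int cnt)
{
        int i;

        for (i = 0; i + 1 < cnt; i++) {
                const struct node *a = &n[i], *b = &n[i + 1];

                if (a->inum > b->inum ||
                    (a->inum == b->inum && a->block >= b->block)) {
                        fprintf(stderr, "bad order at %d\n", i);
                        return -1;
                }
        }
        return 0;
}

int main(void)
{
        struct node ok[]  = { {1, 0}, {1, 1}, {2, 0} };
        struct node dup[] = { {1, 1}, {1, 1} };

        printf("%d %d\n", check_order(ok, 3), check_order(dup, 2));
        return 0;
}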
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 29d960101ea6..69ebe4729151 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -324,6 +324,8 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
324 int row, int col); 324 int row, int col);
325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, 325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
326 loff_t size); 326 loff_t size);
327int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
328int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
327 329
328/* Force the use of in-the-gaps method for testing */ 330/* Force the use of in-the-gaps method for testing */
329 331
@@ -465,6 +467,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
465#define dbg_check_lprops(c) 0 467#define dbg_check_lprops(c) 0
466#define dbg_check_lpt_nodes(c, cnode, row, col) 0 468#define dbg_check_lpt_nodes(c, cnode, row, col) 0
467#define dbg_check_inode_size(c, inode, size) 0 469#define dbg_check_inode_size(c, inode, size) 0
470#define dbg_check_data_nodes_order(c, head) 0
471#define dbg_check_nondata_nodes_order(c, head) 0
468#define dbg_force_in_the_gaps_enabled 0 472#define dbg_force_in_the_gaps_enabled 0
469#define dbg_force_in_the_gaps() 0 473#define dbg_force_in_the_gaps() 0
470#define dbg_failure_mode 0 474#define dbg_failure_mode 0
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 87ebcce72213..14f64b689d7f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -550,7 +550,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
550 550
551 lock_2_inodes(dir, inode); 551 lock_2_inodes(dir, inode);
552 inc_nlink(inode); 552 inc_nlink(inode);
553 atomic_inc(&inode->i_count); 553 ihold(inode);
554 inode->i_ctime = ubifs_current_time(inode); 554 inode->i_ctime = ubifs_current_time(inode);
555 dir->i_size += sz_change; 555 dir->i_size += sz_change;
556 dir_ui->ui_size = dir->i_size; 556 dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 03ae894c45de..d77db7e36484 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -433,8 +433,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
433 struct page *page; 433 struct page *page;
434 434
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
436 ubifs_assert(!c->ro_media && !c->ro_mount);
436 437
437 if (unlikely(c->ro_media)) 438 if (unlikely(c->ro_error))
438 return -EROFS; 439 return -EROFS;
439 440
440 /* Try out the fast-path part first */ 441 /* Try out the fast-path part first */
@@ -1439,9 +1440,9 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vm
1439 1440
1440 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, 1441 dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
1441 i_size_read(inode)); 1442 i_size_read(inode));
1442 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); 1443 ubifs_assert(!c->ro_media && !c->ro_mount);
1443 1444
1444 if (unlikely(c->ro_media)) 1445 if (unlikely(c->ro_error))
1445 return VM_FAULT_SIGBUS; /* -EROFS */ 1446 return VM_FAULT_SIGBUS; /* -EROFS */
1446 1447
1447 /* 1448 /*
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 918d1582ca05..151f10882820 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -125,10 +125,16 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
125 struct ubifs_scan_node *sa, *sb; 125 struct ubifs_scan_node *sa, *sb;
126 126
127 cond_resched(); 127 cond_resched();
128 if (a == b)
129 return 0;
130
128 sa = list_entry(a, struct ubifs_scan_node, list); 131 sa = list_entry(a, struct ubifs_scan_node, list);
129 sb = list_entry(b, struct ubifs_scan_node, list); 132 sb = list_entry(b, struct ubifs_scan_node, list);
133
130 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); 134 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
131 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); 135 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
136 ubifs_assert(sa->type == UBIFS_DATA_NODE);
137 ubifs_assert(sb->type == UBIFS_DATA_NODE);
132 138
133 inuma = key_inum(c, &sa->key); 139 inuma = key_inum(c, &sa->key);
134 inumb = key_inum(c, &sb->key); 140 inumb = key_inum(c, &sb->key);
@@ -157,28 +163,40 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
157 */ 163 */
158int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 164int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
159{ 165{
160 int typea, typeb;
161 ino_t inuma, inumb; 166 ino_t inuma, inumb;
162 struct ubifs_info *c = priv; 167 struct ubifs_info *c = priv;
163 struct ubifs_scan_node *sa, *sb; 168 struct ubifs_scan_node *sa, *sb;
164 169
165 cond_resched(); 170 cond_resched();
171 if (a == b)
172 return 0;
173
166 sa = list_entry(a, struct ubifs_scan_node, list); 174 sa = list_entry(a, struct ubifs_scan_node, list);
167 sb = list_entry(b, struct ubifs_scan_node, list); 175 sb = list_entry(b, struct ubifs_scan_node, list);
168 typea = key_type(c, &sa->key); 176
169 typeb = key_type(c, &sb->key); 177 ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY &&
170 ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY); 178 key_type(c, &sb->key) != UBIFS_DATA_KEY);
179 ubifs_assert(sa->type != UBIFS_DATA_NODE &&
180 sb->type != UBIFS_DATA_NODE);
171 181
172 /* Inodes go before directory entries */ 182 /* Inodes go before directory entries */
173 if (typea == UBIFS_INO_KEY) { 183 if (sa->type == UBIFS_INO_NODE) {
174 if (typeb == UBIFS_INO_KEY) 184 if (sb->type == UBIFS_INO_NODE)
175 return sb->len - sa->len; 185 return sb->len - sa->len;
176 return -1; 186 return -1;
177 } 187 }
178 if (typeb == UBIFS_INO_KEY) 188 if (sb->type == UBIFS_INO_NODE)
179 return 1; 189 return 1;
180 190
181 ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY); 191 ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY ||
192 key_type(c, &sa->key) == UBIFS_XENT_KEY);
193 ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY ||
194 key_type(c, &sb->key) == UBIFS_XENT_KEY);
195 ubifs_assert(sa->type == UBIFS_DENT_NODE ||
196 sa->type == UBIFS_XENT_NODE);
197 ubifs_assert(sb->type == UBIFS_DENT_NODE ||
198 sb->type == UBIFS_XENT_NODE);
199
182 inuma = key_inum(c, &sa->key); 200 inuma = key_inum(c, &sa->key);
183 inumb = key_inum(c, &sb->key); 201 inumb = key_inum(c, &sb->key);
184 202
@@ -224,17 +242,33 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
224static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, 242static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
225 struct list_head *nondata, int *min) 243 struct list_head *nondata, int *min)
226{ 244{
245 int err;
227 struct ubifs_scan_node *snod, *tmp; 246 struct ubifs_scan_node *snod, *tmp;
228 247
229 *min = INT_MAX; 248 *min = INT_MAX;
230 249
231 /* Separate data nodes and non-data nodes */ 250 /* Separate data nodes and non-data nodes */
232 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 251 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
233 int err; 252 ubifs_assert(snod->type == UBIFS_INO_NODE ||
253 snod->type == UBIFS_DATA_NODE ||
254 snod->type == UBIFS_DENT_NODE ||
255 snod->type == UBIFS_XENT_NODE ||
256 snod->type == UBIFS_TRUN_NODE);
257
258 if (snod->type != UBIFS_INO_NODE &&
259 snod->type != UBIFS_DATA_NODE &&
260 snod->type != UBIFS_DENT_NODE &&
261 snod->type != UBIFS_XENT_NODE) {
262 /* Probably truncation node, zap it */
263 list_del(&snod->list);
264 kfree(snod);
265 continue;
266 }
234 267
235 ubifs_assert(snod->type != UBIFS_IDX_NODE); 268 ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY ||
236 ubifs_assert(snod->type != UBIFS_REF_NODE); 269 key_type(c, &snod->key) == UBIFS_INO_KEY ||
237 ubifs_assert(snod->type != UBIFS_CS_NODE); 270 key_type(c, &snod->key) == UBIFS_DENT_KEY ||
271 key_type(c, &snod->key) == UBIFS_XENT_KEY);
238 272
239 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 273 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
240 snod->offs, 0); 274 snod->offs, 0);
@@ -258,6 +292,13 @@ static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
258 /* Sort data and non-data nodes */ 292 /* Sort data and non-data nodes */
259 list_sort(c, &sleb->nodes, &data_nodes_cmp); 293 list_sort(c, &sleb->nodes, &data_nodes_cmp);
260 list_sort(c, nondata, &nondata_nodes_cmp); 294 list_sort(c, nondata, &nondata_nodes_cmp);
295
296 err = dbg_check_data_nodes_order(c, &sleb->nodes);
297 if (err)
298 return err;
299 err = dbg_check_nondata_nodes_order(c, nondata);
300 if (err)
301 return err;
261 return 0; 302 return 0;
262} 303}
263 304
@@ -575,13 +616,14 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
575 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; 616 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
576 617
577 ubifs_assert_cmt_locked(c); 618 ubifs_assert_cmt_locked(c);
619 ubifs_assert(!c->ro_media && !c->ro_mount);
578 620
579 if (ubifs_gc_should_commit(c)) 621 if (ubifs_gc_should_commit(c))
580 return -EAGAIN; 622 return -EAGAIN;
581 623
582 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 624 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
583 625
584 if (c->ro_media) { 626 if (c->ro_error) {
585 ret = -EROFS; 627 ret = -EROFS;
586 goto out_unlock; 628 goto out_unlock;
587 } 629 }
@@ -677,14 +719,12 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
677 719
678 ret = ubifs_garbage_collect_leb(c, &lp); 720 ret = ubifs_garbage_collect_leb(c, &lp);
679 if (ret < 0) { 721 if (ret < 0) {
680 if (ret == -EAGAIN || ret == -ENOSPC) { 722 if (ret == -EAGAIN) {
681 /* 723 /*
682 * These codes are not errors, so we have to 724 * This is not error, so we have to return the
683 * return the LEB to lprops. But if the 725 * LEB to lprops. But if 'ubifs_return_leb()'
684 * 'ubifs_return_leb()' function fails, its 726 * fails, its failure code is propagated to the
685 * failure code is propagated to the caller 727 * caller instead of the original '-EAGAIN'.
686 * instead of the original '-EAGAIN' or
687 * '-ENOSPC'.
688 */ 728 */
689 err = ubifs_return_leb(c, lp.lnum); 729 err = ubifs_return_leb(c, lp.lnum);
690 if (err) 730 if (err)
@@ -774,8 +814,8 @@ out_unlock:
774out: 814out:
775 ubifs_assert(ret < 0); 815 ubifs_assert(ret < 0);
776 ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); 816 ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
777 ubifs_ro_mode(c, ret);
778 ubifs_wbuf_sync_nolock(wbuf); 817 ubifs_wbuf_sync_nolock(wbuf);
818 ubifs_ro_mode(c, ret);
779 mutex_unlock(&wbuf->io_mutex); 819 mutex_unlock(&wbuf->io_mutex);
780 ubifs_return_leb(c, lp.lnum); 820 ubifs_return_leb(c, lp.lnum);
781 return ret; 821 return ret;
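Besides classifying by node type rather than key type, both GC comparators gain an a == b early return: list_sort() can hand the callback the same element on both sides, and a comparator must report equality there instead of falling into its consistency checks. The shape of a safe list_sort() comparator, as a sketch:

#include <linux/list.h>
#include <linux/list_sort.h>

struct item {
        struct list_head list;
        int key;
};

/* total order over items; must tolerate being handed the same node twice */
static int item_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct item *ia, *ib;

        if (a == b)
                return 0;

        ia = list_entry(a, struct item, list);
        ib = list_entry(b, struct item, list);
        return ia->key < ib->key ? -1 : ia->key > ib->key;
}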
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index bcf5a16f30bb..d82173182eeb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -61,8 +61,8 @@
61 */ 61 */
62void ubifs_ro_mode(struct ubifs_info *c, int err) 62void ubifs_ro_mode(struct ubifs_info *c, int err)
63{ 63{
64 if (!c->ro_media) { 64 if (!c->ro_error) {
65 c->ro_media = 1; 65 c->ro_error = 1;
66 c->no_chk_data_crc = 0; 66 c->no_chk_data_crc = 0;
67 c->vfs_sb->s_flags |= MS_RDONLY; 67 c->vfs_sb->s_flags |= MS_RDONLY;
68 ubifs_warn("switched to read-only mode, error %d", err); 68 ubifs_warn("switched to read-only mode, error %d", err);
@@ -356,11 +356,11 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
356 356
357 dbg_io("LEB %d:%d, %d bytes, jhead %s", 357 dbg_io("LEB %d:%d, %d bytes, jhead %s",
358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); 358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
359 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
360 ubifs_assert(!(wbuf->avail & 7)); 359 ubifs_assert(!(wbuf->avail & 7));
361 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); 360 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
361 ubifs_assert(!c->ro_media && !c->ro_mount);
362 362
363 if (c->ro_media) 363 if (c->ro_error)
364 return -EROFS; 364 return -EROFS;
365 365
366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); 366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
@@ -440,11 +440,12 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c)
440{ 440{
441 int err, i; 441 int err, i;
442 442
443 ubifs_assert(!c->ro_media && !c->ro_mount);
443 if (!c->need_wbuf_sync) 444 if (!c->need_wbuf_sync)
444 return 0; 445 return 0;
445 c->need_wbuf_sync = 0; 446 c->need_wbuf_sync = 0;
446 447
447 if (c->ro_media) { 448 if (c->ro_error) {
448 err = -EROFS; 449 err = -EROFS;
449 goto out_timers; 450 goto out_timers;
450 } 451 }
@@ -519,6 +520,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
519 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); 520 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
520 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); 521 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
521 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 522 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
523 ubifs_assert(!c->ro_media && !c->ro_mount);
522 524
523 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { 525 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
524 err = -ENOSPC; 526 err = -ENOSPC;
@@ -527,7 +529,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
527 529
528 cancel_wbuf_timer_nolock(wbuf); 530 cancel_wbuf_timer_nolock(wbuf);
529 531
530 if (c->ro_media) 532 if (c->ro_error)
531 return -EROFS; 533 return -EROFS;
532 534
533 if (aligned_len <= wbuf->avail) { 535 if (aligned_len <= wbuf->avail) {
@@ -663,8 +665,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
663 buf_len); 665 buf_len);
664 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); 666 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
665 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); 667 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
668 ubifs_assert(!c->ro_media && !c->ro_mount);
666 669
667 if (c->ro_media) 670 if (c->ro_error)
668 return -EROFS; 671 return -EROFS;
669 672
670 ubifs_prepare_node(c, buf, len, 1); 673 ubifs_prepare_node(c, buf, len, 1);
@@ -815,7 +818,8 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
815 return 0; 818 return 0;
816 819
817out: 820out:
818 ubifs_err("bad node at LEB %d:%d", lnum, offs); 821 ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs,
822 ubi_is_mapped(c->ubi, lnum));
819 dbg_dump_node(c, buf); 823 dbg_dump_node(c, buf);
820 dbg_dump_stack(); 824 dbg_dump_stack();
821 return -EINVAL; 825 return -EINVAL;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index d321baeca68d..914f1bd89e57 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -122,11 +122,12 @@ static int reserve_space(struct ubifs_info *c, int jhead, int len)
122 * better to try to allocate space at the ends of eraseblocks. This is 122 * better to try to allocate space at the ends of eraseblocks. This is
123 * what the squeeze parameter does. 123 * what the squeeze parameter does.
124 */ 124 */
125 ubifs_assert(!c->ro_media && !c->ro_mount);
125 squeeze = (jhead == BASEHD); 126 squeeze = (jhead == BASEHD);
126again: 127again:
127 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); 128 mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
128 129
129 if (c->ro_media) { 130 if (c->ro_error) {
130 err = -EROFS; 131 err = -EROFS;
131 goto out_unlock; 132 goto out_unlock;
132 } 133 }
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 0f530c684f0b..92a8491a8f8c 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -306,6 +306,20 @@ static inline void trun_key_init(const struct ubifs_info *c,
306} 306}
307 307
308/** 308/**
309 * invalid_key_init - initialize invalid node key.
310 * @c: UBIFS file-system description object
311 * @key: key to initialize
312 *
313 * This is a helper function which marks a @key object as invalid.
314 */
315static inline void invalid_key_init(const struct ubifs_info *c,
316 union ubifs_key *key)
317{
318 key->u32[0] = 0xDEADBEAF;
319 key->u32[1] = UBIFS_INVALID_KEY;
320}
321
322/**
309 * key_type - get key type. 323 * key_type - get key type.
310 * @c: UBIFS file-system description object 324 * @c: UBIFS file-system description object
311 * @key: key to get type of 325 * @key: key to get type of
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index c345e125f42c..4d0cb1241460 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -159,7 +159,7 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
159 jhead = &c->jheads[bud->jhead]; 159 jhead = &c->jheads[bud->jhead];
160 list_add_tail(&bud->list, &jhead->buds_list); 160 list_add_tail(&bud->list, &jhead->buds_list);
161 } else 161 } else
162 ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY)); 162 ubifs_assert(c->replaying && c->ro_mount);
163 163
164 /* 164 /*
165 * Note, although this is a new bud, we anyway account this space now, 165 * Note, although this is a new bud, we anyway account this space now,
@@ -223,8 +223,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
223 } 223 }
224 224
225 mutex_lock(&c->log_mutex); 225 mutex_lock(&c->log_mutex);
226 226 ubifs_assert(!c->ro_media && !c->ro_mount);
227 if (c->ro_media) { 227 if (c->ro_error) {
228 err = -EROFS; 228 err = -EROFS;
229 goto out_unlock; 229 goto out_unlock;
230 } 230 }
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 0084a33c4c69..72775d35b99e 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1363,6 +1363,7 @@ static int read_lsave(struct ubifs_info *c)
1363 goto out; 1363 goto out;
1364 for (i = 0; i < c->lsave_cnt; i++) { 1364 for (i = 0; i < c->lsave_cnt; i++) {
1365 int lnum = c->lsave[i]; 1365 int lnum = c->lsave[i];
1366 struct ubifs_lprops *lprops;
1366 1367
1367 /* 1368 /*
1368 * Due to automatic resizing, the values in the lsave table 1369 * Due to automatic resizing, the values in the lsave table
@@ -1370,7 +1371,11 @@ static int read_lsave(struct ubifs_info *c)
1370 */ 1371 */
1371 if (lnum >= c->leb_cnt) 1372 if (lnum >= c->leb_cnt)
1372 continue; 1373 continue;
1373 ubifs_lpt_lookup(c, lnum); 1374 lprops = ubifs_lpt_lookup(c, lnum);
1375 if (IS_ERR(lprops)) {
1376 err = PTR_ERR(lprops);
1377 goto out;
1378 }
1374 } 1379 }
1375out: 1380out:
1376 vfree(buf); 1381 vfree(buf);
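ubifs_lpt_lookup() returns an ERR_PTR on failure, and the old loop discarded the result, so lookup errors in read_lsave() were silently ignored; the pnode_lookup() fix in lpt_commit.c below is the same bug in the same shape. The canonical ERR_PTR idiom:

#include <linux/err.h>

/* propagate the errno encoded in an ERR_PTR instead of dropping it */
static int use_lookup_result(void *obj)
{
        if (IS_ERR(obj))
                return PTR_ERR(obj);    /* e.g. -ENOMEM or -EINVAL from the lookup */

        return 0;                       /* use obj here */
}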
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index d12535b7fc78..5c90dec5db0b 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -705,6 +705,9 @@ static int make_tree_dirty(struct ubifs_info *c)
705 struct ubifs_pnode *pnode; 705 struct ubifs_pnode *pnode;
706 706
707 pnode = pnode_lookup(c, 0); 707 pnode = pnode_lookup(c, 0);
708 if (IS_ERR(pnode))
709 return PTR_ERR(pnode);
710
708 while (pnode) { 711 while (pnode) {
709 do_make_pnode_dirty(c, pnode); 712 do_make_pnode_dirty(c, pnode);
710 pnode = next_pnode_to_dirty(c, pnode); 713 pnode = next_pnode_to_dirty(c, pnode);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 28beaeedadc0..21f47afdacff 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -361,7 +361,8 @@ int ubifs_write_master(struct ubifs_info *c)
361{ 361{
362 int err, lnum, offs, len; 362 int err, lnum, offs, len;
363 363
364 if (c->ro_media) 364 ubifs_assert(!c->ro_media && !c->ro_mount);
365 if (c->ro_error)
365 return -EROFS; 366 return -EROFS;
366 367
367 lnum = UBIFS_MST_LNUM; 368 lnum = UBIFS_MST_LNUM;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4fa81d867e41..c3de04dc952a 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -132,7 +132,8 @@ static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
132{ 132{
133 int err; 133 int err;
134 134
135 if (c->ro_media) 135 ubifs_assert(!c->ro_media && !c->ro_mount);
136 if (c->ro_error)
136 return -EROFS; 137 return -EROFS;
137 err = ubi_leb_unmap(c->ubi, lnum); 138 err = ubi_leb_unmap(c->ubi, lnum);
138 if (err) { 139 if (err) {
@@ -159,7 +160,8 @@ static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
159{ 160{
160 int err; 161 int err;
161 162
162 if (c->ro_media) 163 ubifs_assert(!c->ro_media && !c->ro_mount);
164 if (c->ro_error)
163 return -EROFS; 165 return -EROFS;
164 err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); 166 err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
165 if (err) { 167 if (err) {
@@ -186,7 +188,8 @@ static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
186{ 188{
187 int err; 189 int err;
188 190
189 if (c->ro_media) 191 ubifs_assert(!c->ro_media && !c->ro_mount);
192 if (c->ro_error)
190 return -EROFS; 193 return -EROFS;
191 err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); 194 err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
192 if (err) { 195 if (err) {
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index daae9e1f5382..77e9b874b6c2 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -292,7 +292,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
292 292
293 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); 293 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
294 294
295 if ((c->vfs_sb->s_flags & MS_RDONLY)) { 295 if (c->ro_mount) {
296 /* Read-only mode. Keep a copy for switching to rw mode */ 296 /* Read-only mode. Keep a copy for switching to rw mode */
297 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL); 297 c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
298 if (!c->rcvrd_mst_node) { 298 if (!c->rcvrd_mst_node) {
@@ -469,7 +469,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
469 endpt = snod->offs + snod->len; 469 endpt = snod->offs + snod->len;
470 } 470 }
471 471
472 if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) { 472 if (c->ro_mount && !c->remounting_rw) {
473 /* Add to recovery list */ 473 /* Add to recovery list */
474 struct ubifs_unclean_leb *ucleb; 474 struct ubifs_unclean_leb *ucleb;
475 475
@@ -772,7 +772,8 @@ out_free:
772 * @sbuf: LEB-sized buffer to use 772 * @sbuf: LEB-sized buffer to use
773 * 773 *
774 * This function does a scan of a LEB, but caters for errors that might have 774 * This function does a scan of a LEB, but caters for errors that might have
775 * been caused by the unclean unmount from which we are attempting to recover. 775 * been caused by unclean reboots from which we are attempting to recover
776 * (assume that only the last log LEB can be corrupted by an unclean reboot).
776 * 777 *
777 * This function returns %0 on success and a negative error code on failure. 778 * This function returns %0 on success and a negative error code on failure.
778 */ 779 */
@@ -883,7 +884,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
883{ 884{
884 int err; 885 int err;
885 886
886 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw); 887 ubifs_assert(!c->ro_mount || c->remounting_rw);
887 888
888 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); 889 dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
889 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); 890 err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
@@ -1461,7 +1462,7 @@ int ubifs_recover_size(struct ubifs_info *c)
1461 } 1462 }
1462 } 1463 }
1463 if (e->exists && e->i_size < e->d_size) { 1464 if (e->exists && e->i_size < e->d_size) {
1464 if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) { 1465 if (!e->inode && c->ro_mount) {
1465 /* Fix the inode size and pin it in memory */ 1466 /* Fix the inode size and pin it in memory */
1466 struct inode *inode; 1467 struct inode *inode;
1467 1468
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 5c2d6d759a3e..eed0fcff8d73 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -627,8 +627,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
627 ubifs_assert(sleb->endpt - offs >= used); 627 ubifs_assert(sleb->endpt - offs >= used);
628 ubifs_assert(sleb->endpt % c->min_io_size == 0); 628 ubifs_assert(sleb->endpt % c->min_io_size == 0);
629 629
630 if (sleb->endpt + c->min_io_size <= c->leb_size && 630 if (sleb->endpt + c->min_io_size <= c->leb_size && !c->ro_mount)
631 !(c->vfs_sb->s_flags & MS_RDONLY))
632 err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum, 631 err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
633 sleb->endpt, UBI_SHORTTERM); 632 sleb->endpt, UBI_SHORTTERM);
634 633
@@ -840,6 +839,11 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
840 if (IS_ERR(sleb)) { 839 if (IS_ERR(sleb)) {
841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) 840 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
842 return PTR_ERR(sleb); 841 return PTR_ERR(sleb);
842 /*
843 * Note, the below function will recover this log LEB only if
844 * it is the last, because unclean reboots can possibly corrupt
845 * only the tail of the log.
846 */
843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); 847 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
844 if (IS_ERR(sleb)) 848 if (IS_ERR(sleb))
845 return PTR_ERR(sleb); 849 return PTR_ERR(sleb);
@@ -851,7 +855,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
851 } 855 }
852 856
853 node = sleb->buf; 857 node = sleb->buf;
854
855 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); 858 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
856 if (c->cs_sqnum == 0) { 859 if (c->cs_sqnum == 0) {
857 /* 860 /*
@@ -898,7 +901,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
898 } 901 }
899 902
900 list_for_each_entry(snod, &sleb->nodes, list) { 903 list_for_each_entry(snod, &sleb->nodes, list) {
901
902 cond_resched(); 904 cond_resched();
903 905
904 if (snod->sqnum >= SQNUM_WATERMARK) { 906 if (snod->sqnum >= SQNUM_WATERMARK) {
@@ -1011,7 +1013,6 @@ out:
1011int ubifs_replay_journal(struct ubifs_info *c) 1013int ubifs_replay_journal(struct ubifs_info *c)
1012{ 1014{
1013 int err, i, lnum, offs, free; 1015 int err, i, lnum, offs, free;
1014 void *sbuf = NULL;
1015 1016
1016 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5); 1017 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
1017 1018
@@ -1026,14 +1027,8 @@ int ubifs_replay_journal(struct ubifs_info *c)
1026 return -EINVAL; 1027 return -EINVAL;
1027 } 1028 }
1028 1029
1029 sbuf = vmalloc(c->leb_size);
1030 if (!sbuf)
1031 return -ENOMEM;
1032
1033 dbg_mnt("start replaying the journal"); 1030 dbg_mnt("start replaying the journal");
1034
1035 c->replaying = 1; 1031 c->replaying = 1;
1036
1037 lnum = c->ltail_lnum = c->lhead_lnum; 1032 lnum = c->ltail_lnum = c->lhead_lnum;
1038 offs = c->lhead_offs; 1033 offs = c->lhead_offs;
1039 1034
@@ -1046,7 +1041,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
1046 lnum = UBIFS_LOG_LNUM; 1041 lnum = UBIFS_LOG_LNUM;
1047 offs = 0; 1042 offs = 0;
1048 } 1043 }
1049 err = replay_log_leb(c, lnum, offs, sbuf); 1044 err = replay_log_leb(c, lnum, offs, c->sbuf);
1050 if (err == 1) 1045 if (err == 1)
1051 /* We hit the end of the log */ 1046 /* We hit the end of the log */
1052 break; 1047 break;
@@ -1079,7 +1074,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
1079out: 1074out:
1080 destroy_replay_tree(c); 1075 destroy_replay_tree(c);
1081 destroy_bud_list(c); 1076 destroy_bud_list(c);
1082 vfree(sbuf);
1083 c->replaying = 0; 1077 c->replaying = 0;
1084 return err; 1078 return err;
1085} 1079}
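Beyond the ro_mount conversions, replay stops vmalloc()ing its own LEB-sized scratch buffer and borrows c->sbuf, which UBIFS already allocates at mount; LEBs can be large, so this removes a sizeable allocation (and a failure point) from every mount. The own-one-scratch-buffer pattern, sketched with illustrative names:

#include <linux/errno.h>
#include <linux/vmalloc.h>

struct example_fs {
        int leb_size;
        void *sbuf;     /* single LEB-sized scratch buffer, lives for the mount */
};

static int example_mount_init(struct example_fs *c)
{
        c->sbuf = vmalloc(c->leb_size);
        return c->sbuf ? 0 : -ENOMEM;
}

static void example_umount(struct example_fs *c)
{
        vfree(c->sbuf);
        c->sbuf = NULL;
}

/* scan-type operations all reuse c->sbuf; callers must serialize, which
 * journal replay does by running single-threaded during mount */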
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 96cb62c8a9dd..bf31b4729e51 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -542,11 +542,8 @@ int ubifs_read_superblock(struct ubifs_info *c)
542 * due to the unavailability of time-travelling equipment. 542 * due to the unavailability of time-travelling equipment.
543 */ 543 */
544 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 544 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
545 struct super_block *sb = c->vfs_sb; 545 ubifs_assert(!c->ro_media || c->ro_mount);
546 int mounting_ro = sb->s_flags & MS_RDONLY; 546 if (!c->ro_mount ||
547
548 ubifs_assert(!c->ro_media || mounting_ro);
549 if (!mounting_ro ||
550 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { 547 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
551 ubifs_err("on-flash format version is w%d/r%d, but " 548 ubifs_err("on-flash format version is w%d/r%d, but "
552 "software only supports up to version " 549 "software only supports up to version "
@@ -624,7 +621,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
624 c->old_leb_cnt = c->leb_cnt; 621 c->old_leb_cnt = c->leb_cnt;
625 if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { 622 if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
626 c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); 623 c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
627 if (c->vfs_sb->s_flags & MS_RDONLY) 624 if (c->ro_mount)
628 dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", 625 dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
629 c->old_leb_cnt, c->leb_cnt); 626 c->old_leb_cnt, c->leb_cnt);
630 else { 627 else {
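Note: sb.c now tests the cached c->ro_mount flag instead of re-deriving the mount state from the VFS super block on every check. The flag is computed once, early in mount_ubifs() (see the super.c hunk below):

    /* derived once at mount time, reused everywhere afterwards */
    c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);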
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 96c525384191..3e1ee57dbeaa 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -197,7 +197,7 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
197 struct ubifs_ino_node *ino = buf; 197 struct ubifs_ino_node *ino = buf;
198 struct ubifs_scan_node *snod; 198 struct ubifs_scan_node *snod;
199 199
200 snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); 200 snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
201 if (!snod) 201 if (!snod)
202 return -ENOMEM; 202 return -ENOMEM;
203 203
@@ -212,13 +212,15 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
212 case UBIFS_DENT_NODE: 212 case UBIFS_DENT_NODE:
213 case UBIFS_XENT_NODE: 213 case UBIFS_XENT_NODE:
214 case UBIFS_DATA_NODE: 214 case UBIFS_DATA_NODE:
215 case UBIFS_TRUN_NODE:
216 /* 215 /*
217 * The key is in the same place in all keyed 216 * The key is in the same place in all keyed
218 * nodes. 217 * nodes.
219 */ 218 */
220 key_read(c, &ino->key, &snod->key); 219 key_read(c, &ino->key, &snod->key);
221 break; 220 break;
221 default:
222 invalid_key_init(c, &snod->key);
223 break;
222 } 224 }
223 list_add_tail(&snod->list, &sleb->nodes); 225 list_add_tail(&snod->list, &sleb->nodes);
224 sleb->nodes_cnt += 1; 226 sleb->nodes_cnt += 1;
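Note: two changes interlock here: ubifs_add_snod() switches from kzalloc() to kmalloc(), and the new default: arm stamps an explicit invalid key. With kzalloc() the key bytes of non-keyed nodes were implicitly zeroed; with kmalloc() they would be uninitialized unless set, presumably with the %UBIFS_INVALID_KEY type introduced in ubifs.h below. A minimal illustration of the hazard:

    struct ubifs_scan_node *snod;

    snod = kzalloc(sizeof(*snod), GFP_NOFS);  /* key implicitly all-zero */
    snod = kmalloc(sizeof(*snod), GFP_NOFS);  /* key is garbage until
                                                 key_read() or
                                                 invalid_key_init() runs */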
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 0b201114a5ad..46961c003236 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -250,7 +250,7 @@ static int kick_a_thread(void)
250 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); 250 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
251 251
252 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || 252 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
253 c->ro_media) { 253 c->ro_mount || c->ro_error) {
254 mutex_unlock(&c->umount_mutex); 254 mutex_unlock(&c->umount_mutex);
255 continue; 255 continue;
256 } 256 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index cd5900b85d38..9a47c9f0ad07 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1137,11 +1137,11 @@ static int check_free_space(struct ubifs_info *c)
1137 */ 1137 */
1138static int mount_ubifs(struct ubifs_info *c) 1138static int mount_ubifs(struct ubifs_info *c)
1139{ 1139{
1140 struct super_block *sb = c->vfs_sb; 1140 int err;
1141 int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
1142 long long x; 1141 long long x;
1143 size_t sz; 1142 size_t sz;
1144 1143
1144 c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
1145 err = init_constants_early(c); 1145 err = init_constants_early(c);
1146 if (err) 1146 if (err)
1147 return err; 1147 return err;
@@ -1154,7 +1154,7 @@ static int mount_ubifs(struct ubifs_info *c)
1154 if (err) 1154 if (err)
1155 goto out_free; 1155 goto out_free;
1156 1156
1157 if (c->empty && (mounted_read_only || c->ro_media)) { 1157 if (c->empty && (c->ro_mount || c->ro_media)) {
1158 /* 1158 /*
1159 * This UBI volume is empty, and read-only, or the file system 1159 * This UBI volume is empty, and read-only, or the file system
1160 * is mounted read-only - we cannot format it. 1160 * is mounted read-only - we cannot format it.
@@ -1165,7 +1165,7 @@ static int mount_ubifs(struct ubifs_info *c)
1165 goto out_free; 1165 goto out_free;
1166 } 1166 }
1167 1167
1168 if (c->ro_media && !mounted_read_only) { 1168 if (c->ro_media && !c->ro_mount) {
1169 ubifs_err("cannot mount read-write - read-only media"); 1169 ubifs_err("cannot mount read-write - read-only media");
1170 err = -EROFS; 1170 err = -EROFS;
1171 goto out_free; 1171 goto out_free;
@@ -1185,7 +1185,7 @@ static int mount_ubifs(struct ubifs_info *c)
1185 if (!c->sbuf) 1185 if (!c->sbuf)
1186 goto out_free; 1186 goto out_free;
1187 1187
1188 if (!mounted_read_only) { 1188 if (!c->ro_mount) {
1189 c->ileb_buf = vmalloc(c->leb_size); 1189 c->ileb_buf = vmalloc(c->leb_size);
1190 if (!c->ileb_buf) 1190 if (!c->ileb_buf)
1191 goto out_free; 1191 goto out_free;
@@ -1228,7 +1228,7 @@ static int mount_ubifs(struct ubifs_info *c)
1228 } 1228 }
1229 1229
1230 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); 1230 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
1231 if (!mounted_read_only) { 1231 if (!c->ro_mount) {
1232 err = alloc_wbufs(c); 1232 err = alloc_wbufs(c);
1233 if (err) 1233 if (err)
1234 goto out_cbuf; 1234 goto out_cbuf;
@@ -1254,12 +1254,12 @@ static int mount_ubifs(struct ubifs_info *c)
1254 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { 1254 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
1255 ubifs_msg("recovery needed"); 1255 ubifs_msg("recovery needed");
1256 c->need_recovery = 1; 1256 c->need_recovery = 1;
1257 if (!mounted_read_only) { 1257 if (!c->ro_mount) {
1258 err = ubifs_recover_inl_heads(c, c->sbuf); 1258 err = ubifs_recover_inl_heads(c, c->sbuf);
1259 if (err) 1259 if (err)
1260 goto out_master; 1260 goto out_master;
1261 } 1261 }
1262 } else if (!mounted_read_only) { 1262 } else if (!c->ro_mount) {
1263 /* 1263 /*
1264 * Set the "dirty" flag so that if we reboot uncleanly we 1264 * Set the "dirty" flag so that if we reboot uncleanly we
1265 * will notice this immediately on the next mount. 1265 * will notice this immediately on the next mount.
@@ -1270,7 +1270,7 @@ static int mount_ubifs(struct ubifs_info *c)
1270 goto out_master; 1270 goto out_master;
1271 } 1271 }
1272 1272
1273 err = ubifs_lpt_init(c, 1, !mounted_read_only); 1273 err = ubifs_lpt_init(c, 1, !c->ro_mount);
1274 if (err) 1274 if (err)
1275 goto out_lpt; 1275 goto out_lpt;
1276 1276
@@ -1285,11 +1285,11 @@ static int mount_ubifs(struct ubifs_info *c)
1285 /* Calculate 'min_idx_lebs' after journal replay */ 1285 /* Calculate 'min_idx_lebs' after journal replay */
1286 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 1286 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
1287 1287
1288 err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); 1288 err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
1289 if (err) 1289 if (err)
1290 goto out_orphans; 1290 goto out_orphans;
1291 1291
1292 if (!mounted_read_only) { 1292 if (!c->ro_mount) {
1293 int lnum; 1293 int lnum;
1294 1294
1295 err = check_free_space(c); 1295 err = check_free_space(c);
@@ -1351,7 +1351,7 @@ static int mount_ubifs(struct ubifs_info *c)
1351 spin_unlock(&ubifs_infos_lock); 1351 spin_unlock(&ubifs_infos_lock);
1352 1352
1353 if (c->need_recovery) { 1353 if (c->need_recovery) {
1354 if (mounted_read_only) 1354 if (c->ro_mount)
1355 ubifs_msg("recovery deferred"); 1355 ubifs_msg("recovery deferred");
1356 else { 1356 else {
1357 c->need_recovery = 0; 1357 c->need_recovery = 0;
@@ -1378,7 +1378,7 @@ static int mount_ubifs(struct ubifs_info *c)
1378 1378
1379 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", 1379 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
1380 c->vi.ubi_num, c->vi.vol_id, c->vi.name); 1380 c->vi.ubi_num, c->vi.vol_id, c->vi.name);
1381 if (mounted_read_only) 1381 if (c->ro_mount)
1382 ubifs_msg("mounted read-only"); 1382 ubifs_msg("mounted read-only");
1383 x = (long long)c->main_lebs * c->leb_size; 1383 x = (long long)c->main_lebs * c->leb_size;
1384 ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " 1384 ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d "
@@ -1640,7 +1640,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1640 } 1640 }
1641 1641
1642 dbg_gen("re-mounted read-write"); 1642 dbg_gen("re-mounted read-write");
1643 c->vfs_sb->s_flags &= ~MS_RDONLY; 1643 c->ro_mount = 0;
1644 c->remounting_rw = 0; 1644 c->remounting_rw = 0;
1645 c->always_chk_crc = 0; 1645 c->always_chk_crc = 0;
1646 err = dbg_check_space_info(c); 1646 err = dbg_check_space_info(c);
@@ -1676,7 +1676,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1676 int i, err; 1676 int i, err;
1677 1677
1678 ubifs_assert(!c->need_recovery); 1678 ubifs_assert(!c->need_recovery);
1679 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); 1679 ubifs_assert(!c->ro_mount);
1680 1680
1681 mutex_lock(&c->umount_mutex); 1681 mutex_lock(&c->umount_mutex);
1682 if (c->bgt) { 1682 if (c->bgt) {
@@ -1686,10 +1686,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1686 1686
1687 dbg_save_space_info(c); 1687 dbg_save_space_info(c);
1688 1688
1689 for (i = 0; i < c->jhead_cnt; i++) { 1689 for (i = 0; i < c->jhead_cnt; i++)
1690 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1690 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1691 hrtimer_cancel(&c->jheads[i].wbuf.timer);
1692 }
1693 1691
1694 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1692 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1695 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1693 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
@@ -1704,6 +1702,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1704 vfree(c->ileb_buf); 1702 vfree(c->ileb_buf);
1705 c->ileb_buf = NULL; 1703 c->ileb_buf = NULL;
1706 ubifs_lpt_free(c, 1); 1704 ubifs_lpt_free(c, 1);
1705 c->ro_mount = 1;
1707 err = dbg_check_space_info(c); 1706 err = dbg_check_space_info(c);
1708 if (err) 1707 if (err)
1709 ubifs_ro_mode(c, err); 1708 ubifs_ro_mode(c, err);
@@ -1735,7 +1734,7 @@ static void ubifs_put_super(struct super_block *sb)
1735 * the mutex is locked. 1734 * the mutex is locked.
1736 */ 1735 */
1737 mutex_lock(&c->umount_mutex); 1736 mutex_lock(&c->umount_mutex);
1738 if (!(c->vfs_sb->s_flags & MS_RDONLY)) { 1737 if (!c->ro_mount) {
1739 /* 1738 /*
1740 * First of all kill the background thread to make sure it does 1739 * First of all kill the background thread to make sure it does
1741 * not interfere with un-mounting and freeing resources. 1740 * not interfere with un-mounting and freeing resources.
@@ -1745,23 +1744,22 @@ static void ubifs_put_super(struct super_block *sb)
1745 c->bgt = NULL; 1744 c->bgt = NULL;
1746 } 1745 }
1747 1746
1748 /* Synchronize write-buffers */
1749 if (c->jheads)
1750 for (i = 0; i < c->jhead_cnt; i++)
1751 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1752
1753 /* 1747 /*
1754 * On fatal errors c->ro_media is set to 1, in which case we do 1748 * On fatal errors c->ro_error is set to 1, in which case we do
1755 * not write the master node. 1749 * not write the master node.
1756 */ 1750 */
1757 if (!c->ro_media) { 1751 if (!c->ro_error) {
1752 int err;
1753
1754 /* Synchronize write-buffers */
1755 for (i = 0; i < c->jhead_cnt; i++)
1756 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1757
1758 /* 1758 /*
1759 * We are being cleanly unmounted which means the 1759 * We are being cleanly unmounted which means the
1760 * orphans were killed - indicate this in the master 1760 * orphans were killed - indicate this in the master
1761 * node. Also save the reserved GC LEB number. 1761 * node. Also save the reserved GC LEB number.
1762 */ 1762 */
1763 int err;
1764
1765 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1763 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1766 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1764 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1767 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); 1765 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
@@ -1774,6 +1772,10 @@ static void ubifs_put_super(struct super_block *sb)
1774 */ 1772 */
1775 ubifs_err("failed to write master node, " 1773 ubifs_err("failed to write master node, "
1776 "error %d", err); 1774 "error %d", err);
1775 } else {
1776 for (i = 0; i < c->jhead_cnt; i++)
1777 /* Make sure write-buffer timers are canceled */
1778 hrtimer_cancel(&c->jheads[i].wbuf.timer);
1777 } 1779 }
1778 } 1780 }
1779 1781
@@ -1797,17 +1799,21 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1797 return err; 1799 return err;
1798 } 1800 }
1799 1801
1800 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1802 if (c->ro_mount && !(*flags & MS_RDONLY)) {
1803 if (c->ro_error) {
1804 ubifs_msg("cannot re-mount R/W due to prior errors");
1805 return -EROFS;
1806 }
1801 if (c->ro_media) { 1807 if (c->ro_media) {
1802 ubifs_msg("cannot re-mount due to prior errors"); 1808 ubifs_msg("cannot re-mount R/W - UBI volume is R/O");
1803 return -EROFS; 1809 return -EROFS;
1804 } 1810 }
1805 err = ubifs_remount_rw(c); 1811 err = ubifs_remount_rw(c);
1806 if (err) 1812 if (err)
1807 return err; 1813 return err;
1808 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 1814 } else if (!c->ro_mount && (*flags & MS_RDONLY)) {
1809 if (c->ro_media) { 1815 if (c->ro_error) {
1810 ubifs_msg("cannot re-mount due to prior errors"); 1816 ubifs_msg("cannot re-mount R/O due to prior errors");
1811 return -EROFS; 1817 return -EROFS;
1812 } 1818 }
1813 ubifs_remount_ro(c); 1819 ubifs_remount_ro(c);
@@ -2049,8 +2055,8 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2049 */ 2055 */
2050 ubi = open_ubi(name, UBI_READONLY); 2056 ubi = open_ubi(name, UBI_READONLY);
2051 if (IS_ERR(ubi)) { 2057 if (IS_ERR(ubi)) {
2052 ubifs_err("cannot open \"%s\", error %d", 2058 dbg_err("cannot open \"%s\", error %d",
2053 name, (int)PTR_ERR(ubi)); 2059 name, (int)PTR_ERR(ubi));
2054 return PTR_ERR(ubi); 2060 return PTR_ERR(ubi);
2055 } 2061 }
2056 ubi_get_volume_info(ubi, &vi); 2062 ubi_get_volume_info(ubi, &vi);
@@ -2064,9 +2070,11 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
2064 } 2070 }
2065 2071
2066 if (sb->s_root) { 2072 if (sb->s_root) {
2073 struct ubifs_info *c1 = sb->s_fs_info;
2074
2067 /* A new mount point for already mounted UBIFS */ 2075 /* A new mount point for already mounted UBIFS */
2068 dbg_gen("this ubi volume is already mounted"); 2076 dbg_gen("this ubi volume is already mounted");
2069 if ((flags ^ sb->s_flags) & MS_RDONLY) { 2077 if (!!(flags & MS_RDONLY) != c1->ro_mount) {
2070 err = -EBUSY; 2078 err = -EBUSY;
2071 goto out_deact; 2079 goto out_deact;
2072 } 2080 }
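Note: the single overloaded ro_media flag is split three ways, which the remount hunks above put to use. A condensed view of the resulting policy, following the flag semantics documented in ubifs.h below:

    /*
     * ro_error - UBIFS hit a fatal error and forced itself R/O:
     *            re-mounting R/W is refused outright;
     * ro_media - the underlying UBI volume is R/O:
     *            R/W is physically impossible;
     * ro_mount - the user mounted with MS_RDONLY:
     *            re-mounting R/W is allowed.
     */
    if (c->ro_mount && !(*flags & MS_RDONLY)) {     /* R/O -> R/W */
            if (c->ro_error || c->ro_media)
                    return -EROFS;
            err = ubifs_remount_rw(c);
    }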
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 2194915220e5..ad9cf0133622 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1177,6 +1177,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1177 unsigned long time = get_seconds(); 1177 unsigned long time = get_seconds();
1178 1178
1179 dbg_tnc("search key %s", DBGKEY(key)); 1179 dbg_tnc("search key %s", DBGKEY(key));
1180 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
1180 1181
1181 znode = c->zroot.znode; 1182 znode = c->zroot.znode;
1182 if (unlikely(!znode)) { 1183 if (unlikely(!znode)) {
@@ -2966,7 +2967,7 @@ static struct ubifs_znode *right_znode(struct ubifs_info *c,
2966 * 2967 *
2967 * This function searches an indexing node by its first key @key and its 2968 * This function searches an indexing node by its first key @key and its
2968 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing 2969 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
2969 * nodes it traverses to TNC. This function is called fro indexing nodes which 2970 * nodes it traverses to TNC. This function is called for indexing nodes which
2970 * were found on the media by scanning, for example when garbage-collecting or 2971 * were found on the media by scanning, for example when garbage-collecting or
2971 * when doing in-the-gaps commit. This means that the indexing node which is 2972 * when doing in-the-gaps commit. This means that the indexing node which is
2972 * looked for does not have to have exactly the same leftmost key @key, because 2973 * looked for does not have to have exactly the same leftmost key @key, because
@@ -2988,6 +2989,8 @@ static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
2988 struct ubifs_znode *znode, *zn; 2989 struct ubifs_znode *znode, *zn;
2989 int n, nn; 2990 int n, nn;
2990 2991
2992 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
2993
2991 /* 2994 /*
2992 * The arguments have probably been read off flash, so don't assume 2995 * The arguments have probably been read off flash, so don't assume
2993 * they are valid. 2996 * they are valid.
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 0c9876b396dd..381d6b207a52 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -119,8 +119,12 @@
119 * in TNC. However, when replaying, it is handy to introduce fake "truncation" 119 * in TNC. However, when replaying, it is handy to introduce fake "truncation"
120 * keys for truncation nodes because the code becomes simpler. So we define 120 * keys for truncation nodes because the code becomes simpler. So we define
121 * %UBIFS_TRUN_KEY type. 121 * %UBIFS_TRUN_KEY type.
122 *
 123 * Outside of the journal replay scope, however, truncation keys are
 124 * invalid.
122 */ 125 */
123#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT 126#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT
127#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT
124 128
125/* 129/*
126 * How much a directory entry/extended attribute entry adds to the parent/host 130 * How much a directory entry/extended attribute entry adds to the parent/host
@@ -1028,6 +1032,8 @@ struct ubifs_debug_info;
1028 * @max_leb_cnt: maximum count of logical eraseblocks 1032 * @max_leb_cnt: maximum count of logical eraseblocks
1029 * @old_leb_cnt: count of logical eraseblocks before re-size 1033 * @old_leb_cnt: count of logical eraseblocks before re-size
1030 * @ro_media: the underlying UBI volume is read-only 1034 * @ro_media: the underlying UBI volume is read-only
1035 * @ro_mount: the file-system was mounted as read-only
1036 * @ro_error: UBIFS switched to R/O mode because an error happened
1031 * 1037 *
1032 * @dirty_pg_cnt: number of dirty pages (not used) 1038 * @dirty_pg_cnt: number of dirty pages (not used)
1033 * @dirty_zn_cnt: number of dirty znodes 1039 * @dirty_zn_cnt: number of dirty znodes
@@ -1168,11 +1174,14 @@ struct ubifs_debug_info;
1168 * @replay_sqnum: sequence number of node currently being replayed 1174 * @replay_sqnum: sequence number of node currently being replayed
1169 * @need_recovery: file-system needs recovery 1175 * @need_recovery: file-system needs recovery
1170 * @replaying: set to %1 during journal replay 1176 * @replaying: set to %1 during journal replay
1171 * @unclean_leb_list: LEBs to recover when mounting ro to rw 1177 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
1172 * @rcvrd_mst_node: recovered master node to write when mounting ro to rw 1178 * mode
1179 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
1180 * FS to R/W mode
1173 * @size_tree: inode size information for recovery 1181 * @size_tree: inode size information for recovery
1174 * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) 1182 * @remounting_rw: set while re-mounting from R/O mode to R/W mode
1175 * @always_chk_crc: always check CRCs (while mounting and remounting rw) 1183 * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
1184 * mode)
1176 * @mount_opts: UBIFS-specific mount options 1185 * @mount_opts: UBIFS-specific mount options
1177 * 1186 *
1178 * @dbg: debugging-related information 1187 * @dbg: debugging-related information
@@ -1268,7 +1277,9 @@ struct ubifs_info {
1268 int leb_cnt; 1277 int leb_cnt;
1269 int max_leb_cnt; 1278 int max_leb_cnt;
1270 int old_leb_cnt; 1279 int old_leb_cnt;
1271 int ro_media; 1280 unsigned int ro_media:1;
1281 unsigned int ro_mount:1;
1282 unsigned int ro_error:1;
1272 1283
1273 atomic_long_t dirty_pg_cnt; 1284 atomic_long_t dirty_pg_cnt;
1274 atomic_long_t dirty_zn_cnt; 1285 atomic_long_t dirty_zn_cnt;
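Note: UBIFS_TRUN_KEY and the new UBIFS_INVALID_KEY expand to the same out-of-range value, UBIFS_KEY_TYPES_CNT; the two names only document intent. A truncation node carries this key type transiently during journal replay, and the new assertions in tnc.c reject it everywhere else:

    #define UBIFS_TRUN_KEY    UBIFS_KEY_TYPES_CNT  /* replay-only */
    #define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT  /* never valid in TNC */

    ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);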
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bce..f8def3c8ea4c 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,5 +1,6 @@
1config UDF_FS 1config UDF_FS
2 tristate "UDF file system support" 2 tristate "UDF file system support"
3 depends on BKL # needs serious work to remove
3 select CRC_ITU_T 4 select CRC_ITU_T
4 help 5 help
5 This is the new file system used on some CD-ROMs and DVDs. Say Y if 6 This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index bf5fc674193c..6d8dc02baebb 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1101,7 +1101,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1101 inc_nlink(inode); 1101 inc_nlink(inode);
1102 inode->i_ctime = current_fs_time(inode->i_sb); 1102 inode->i_ctime = current_fs_time(inode->i_sb);
1103 mark_inode_dirty(inode); 1103 mark_inode_dirty(inode);
1104 atomic_inc(&inode->i_count); 1104 ihold(inode);
1105 d_instantiate(dentry, inode); 1105 d_instantiate(dentry, inode);
1106 unlock_kernel(); 1106 unlock_kernel();
1107 1107
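Note: the open-coded atomic_inc(&inode->i_count) is replaced by the ihold() helper, which also WARNs if a reference is taken on an inode that has none. A schematic link() tail using it (not the literal udf_link(), which additionally handles timestamps and the BKL):

    inc_nlink(inode);
    mark_inode_dirty(inode);
    ihold(inode);           /* reference consumed by d_instantiate() */
    d_instantiate(dentry, inode);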
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 65412d84a45d..76f3d6d97b40 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1880,6 +1880,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1880 struct kernel_lb_addr rootdir, fileset; 1880 struct kernel_lb_addr rootdir, fileset;
1881 struct udf_sb_info *sbi; 1881 struct udf_sb_info *sbi;
1882 1882
1883 lock_kernel();
1884
1883 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1885 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1884 uopt.uid = -1; 1886 uopt.uid = -1;
1885 uopt.gid = -1; 1887 uopt.gid = -1;
@@ -1888,8 +1890,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1888 uopt.dmode = UDF_INVALID_MODE; 1890 uopt.dmode = UDF_INVALID_MODE;
1889 1891
1890 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1892 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1891 if (!sbi) 1893 if (!sbi) {
1894 unlock_kernel();
1892 return -ENOMEM; 1895 return -ENOMEM;
1896 }
1893 1897
1894 sb->s_fs_info = sbi; 1898 sb->s_fs_info = sbi;
1895 1899
@@ -2035,6 +2039,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2035 goto error_out; 2039 goto error_out;
2036 } 2040 }
2037 sb->s_maxbytes = MAX_LFS_FILESIZE; 2041 sb->s_maxbytes = MAX_LFS_FILESIZE;
2042 unlock_kernel();
2038 return 0; 2043 return 0;
2039 2044
2040error_out: 2045error_out:
@@ -2055,6 +2060,7 @@ error_out:
2055 kfree(sbi); 2060 kfree(sbi);
2056 sb->s_fs_info = NULL; 2061 sb->s_fs_info = NULL;
2057 2062
2063 unlock_kernel();
2058 return -EINVAL; 2064 return -EINVAL;
2059} 2065}
2060 2066
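Note: this is the standard BKL-pushdown pattern: lock_kernel() moves from the VFS caller into the file system's own fill_super, and every exit path, including the early -ENOMEM return, must now drop it. A skeletal sketch of the shape (helper names are placeholders):

    static int example_fill_super(struct super_block *sb, void *data, int silent)
    {
            lock_kernel();          /* formerly taken by the VFS */

            if (do_fill(sb, data))  /* placeholder for the real work */
                    goto error_out;

            unlock_kernel();
            return 0;

    error_out:
            unlock_kernel();        /* every early return unlocks */
            return -EINVAL;
    }

The same transformation is applied to ufs_fill_super() below.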
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index e4f10a40768a..30c8f223253d 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,6 +1,7 @@
1config UFS_FS 1config UFS_FS
2 tristate "UFS file system support (read only)" 2 tristate "UFS file system support (read only)"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # probably fixable
4 help 5 help
 5 BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD, 6
6 OpenBSD and NeXTstep) use a file system called UFS. Some System V 7 OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index b056f02b1fb3..12f39b9e4437 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
180 180
181 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
182 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
183 atomic_inc(&inode->i_count); 183 ihold(inode);
184 184
185 error = ufs_add_nondir(dentry, inode); 185 error = ufs_add_nondir(dentry, inode);
186 unlock_kernel(); 186 unlock_kernel();
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index d510c1b91817..6b9be90dae7d 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -696,6 +696,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
696 unsigned maxsymlen; 696 unsigned maxsymlen;
697 int ret = -EINVAL; 697 int ret = -EINVAL;
698 698
699 lock_kernel();
700
699 uspi = NULL; 701 uspi = NULL;
700 ubh = NULL; 702 ubh = NULL;
701 flags = 0; 703 flags = 0;
@@ -1163,6 +1165,7 @@ magic_found:
1163 goto failed; 1165 goto failed;
1164 1166
1165 UFSD("EXIT\n"); 1167 UFSD("EXIT\n");
1168 unlock_kernel();
1166 return 0; 1169 return 0;
1167 1170
1168dalloc_failed: 1171dalloc_failed:
@@ -1174,10 +1177,12 @@ failed:
1174 kfree(sbi); 1177 kfree(sbi);
1175 sb->s_fs_info = NULL; 1178 sb->s_fs_info = NULL;
1176 UFSD("EXIT (FAILED)\n"); 1179 UFSD("EXIT (FAILED)\n");
1180 unlock_kernel();
1177 return ret; 1181 return ret;
1178 1182
1179failed_nomem: 1183failed_nomem:
1180 UFSD("EXIT (NOMEM)\n"); 1184 UFSD("EXIT (NOMEM)\n");
1185 unlock_kernel();
1181 return -ENOMEM; 1186 return -ENOMEM;
1182} 1187}
1183 1188
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index b552f816de15..c9af48fffcd7 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1139,8 +1139,7 @@ xfs_vm_writepage(
1139 type = IO_DELAY; 1139 type = IO_DELAY;
1140 flags = BMAPI_ALLOCATE; 1140 flags = BMAPI_ALLOCATE;
1141 1141
1142 if (wbc->sync_mode == WB_SYNC_NONE && 1142 if (wbc->sync_mode == WB_SYNC_NONE)
1143 wbc->nonblocking)
1144 flags |= BMAPI_TRYLOCK; 1143 flags |= BMAPI_TRYLOCK;
1145 } 1144 }
1146 1145
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 286e36e21dae..63fd2c07cb57 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -188,8 +188,8 @@ _xfs_buf_initialize(
188 atomic_set(&bp->b_hold, 1); 188 atomic_set(&bp->b_hold, 1);
189 init_completion(&bp->b_iowait); 189 init_completion(&bp->b_iowait);
190 INIT_LIST_HEAD(&bp->b_list); 190 INIT_LIST_HEAD(&bp->b_list);
191 INIT_LIST_HEAD(&bp->b_hash_list); 191 RB_CLEAR_NODE(&bp->b_rbnode);
192 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 192 sema_init(&bp->b_sema, 0); /* held, no waiters */
193 XB_SET_OWNER(bp); 193 XB_SET_OWNER(bp);
194 bp->b_target = target; 194 bp->b_target = target;
195 bp->b_file_offset = range_base; 195 bp->b_file_offset = range_base;
@@ -262,8 +262,6 @@ xfs_buf_free(
262{ 262{
263 trace_xfs_buf_free(bp, _RET_IP_); 263 trace_xfs_buf_free(bp, _RET_IP_);
264 264
265 ASSERT(list_empty(&bp->b_hash_list));
266
267 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 265 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
268 uint i; 266 uint i;
269 267
@@ -422,8 +420,10 @@ _xfs_buf_find(
422{ 420{
423 xfs_off_t range_base; 421 xfs_off_t range_base;
424 size_t range_length; 422 size_t range_length;
425 xfs_bufhash_t *hash; 423 struct xfs_perag *pag;
426 xfs_buf_t *bp, *n; 424 struct rb_node **rbp;
425 struct rb_node *parent;
426 xfs_buf_t *bp;
427 427
428 range_base = (ioff << BBSHIFT); 428 range_base = (ioff << BBSHIFT);
429 range_length = (isize << BBSHIFT); 429 range_length = (isize << BBSHIFT);
@@ -432,14 +432,37 @@ _xfs_buf_find(
432 ASSERT(!(range_length < (1 << btp->bt_sshift))); 432 ASSERT(!(range_length < (1 << btp->bt_sshift)));
433 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); 433 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
434 434
435 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; 435 /* get tree root */
436 436 pag = xfs_perag_get(btp->bt_mount,
437 spin_lock(&hash->bh_lock); 437 xfs_daddr_to_agno(btp->bt_mount, ioff));
438 438
439 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 439 /* walk tree */
440 ASSERT(btp == bp->b_target); 440 spin_lock(&pag->pag_buf_lock);
441 if (bp->b_file_offset == range_base && 441 rbp = &pag->pag_buf_tree.rb_node;
442 bp->b_buffer_length == range_length) { 442 parent = NULL;
443 bp = NULL;
444 while (*rbp) {
445 parent = *rbp;
446 bp = rb_entry(parent, struct xfs_buf, b_rbnode);
447
448 if (range_base < bp->b_file_offset)
449 rbp = &(*rbp)->rb_left;
450 else if (range_base > bp->b_file_offset)
451 rbp = &(*rbp)->rb_right;
452 else {
453 /*
454 * found a block offset match. If the range doesn't
455 * match, the only way this is allowed is if the buffer
456 * in the cache is stale and the transaction that made
 457 * it stale has not yet committed, i.e. we are
458 * reallocating a busy extent. Skip this buffer and
459 * continue searching to the right for an exact match.
460 */
461 if (bp->b_buffer_length != range_length) {
462 ASSERT(bp->b_flags & XBF_STALE);
463 rbp = &(*rbp)->rb_right;
464 continue;
465 }
443 atomic_inc(&bp->b_hold); 466 atomic_inc(&bp->b_hold);
444 goto found; 467 goto found;
445 } 468 }
@@ -449,17 +472,21 @@ _xfs_buf_find(
449 if (new_bp) { 472 if (new_bp) {
450 _xfs_buf_initialize(new_bp, btp, range_base, 473 _xfs_buf_initialize(new_bp, btp, range_base,
451 range_length, flags); 474 range_length, flags);
452 new_bp->b_hash = hash; 475 rb_link_node(&new_bp->b_rbnode, parent, rbp);
453 list_add(&new_bp->b_hash_list, &hash->bh_list); 476 rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
477 /* the buffer keeps the perag reference until it is freed */
478 new_bp->b_pag = pag;
479 spin_unlock(&pag->pag_buf_lock);
454 } else { 480 } else {
455 XFS_STATS_INC(xb_miss_locked); 481 XFS_STATS_INC(xb_miss_locked);
482 spin_unlock(&pag->pag_buf_lock);
483 xfs_perag_put(pag);
456 } 484 }
457
458 spin_unlock(&hash->bh_lock);
459 return new_bp; 485 return new_bp;
460 486
461found: 487found:
462 spin_unlock(&hash->bh_lock); 488 spin_unlock(&pag->pag_buf_lock);
489 xfs_perag_put(pag);
463 490
464 /* Attempt to get the semaphore without sleeping, 491 /* Attempt to get the semaphore without sleeping,
465 * if this does not work then we need to drop the 492 * if this does not work then we need to drop the
@@ -625,8 +652,7 @@ void
625xfs_buf_readahead( 652xfs_buf_readahead(
626 xfs_buftarg_t *target, 653 xfs_buftarg_t *target,
627 xfs_off_t ioff, 654 xfs_off_t ioff,
628 size_t isize, 655 size_t isize)
629 xfs_buf_flags_t flags)
630{ 656{
631 struct backing_dev_info *bdi; 657 struct backing_dev_info *bdi;
632 658
@@ -634,8 +660,42 @@ xfs_buf_readahead(
634 if (bdi_read_congested(bdi)) 660 if (bdi_read_congested(bdi))
635 return; 661 return;
636 662
637 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 663 xfs_buf_read(target, ioff, isize,
638 xfs_buf_read(target, ioff, isize, flags); 664 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
665}
666
667/*
668 * Read an uncached buffer from disk. Allocates and returns a locked
669 * buffer containing the disk contents or nothing.
670 */
671struct xfs_buf *
672xfs_buf_read_uncached(
673 struct xfs_mount *mp,
674 struct xfs_buftarg *target,
675 xfs_daddr_t daddr,
676 size_t length,
677 int flags)
678{
679 xfs_buf_t *bp;
680 int error;
681
682 bp = xfs_buf_get_uncached(target, length, flags);
683 if (!bp)
684 return NULL;
685
686 /* set up the buffer for a read IO */
687 xfs_buf_lock(bp);
688 XFS_BUF_SET_ADDR(bp, daddr);
689 XFS_BUF_READ(bp);
690 XFS_BUF_BUSY(bp);
691
692 xfsbdstrat(mp, bp);
693 error = xfs_buf_iowait(bp);
694 if (error || bp->b_error) {
695 xfs_buf_relse(bp);
696 return NULL;
697 }
698 return bp;
639} 699}
640 700
641xfs_buf_t * 701xfs_buf_t *
@@ -707,9 +767,10 @@ xfs_buf_associate_memory(
707} 767}
708 768
709xfs_buf_t * 769xfs_buf_t *
710xfs_buf_get_noaddr( 770xfs_buf_get_uncached(
771 struct xfs_buftarg *target,
711 size_t len, 772 size_t len,
712 xfs_buftarg_t *target) 773 int flags)
713{ 774{
714 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; 775 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
715 int error, i; 776 int error, i;
@@ -725,7 +786,7 @@ xfs_buf_get_noaddr(
725 goto fail_free_buf; 786 goto fail_free_buf;
726 787
727 for (i = 0; i < page_count; i++) { 788 for (i = 0; i < page_count; i++) {
728 bp->b_pages[i] = alloc_page(GFP_KERNEL); 789 bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
729 if (!bp->b_pages[i]) 790 if (!bp->b_pages[i])
730 goto fail_free_mem; 791 goto fail_free_mem;
731 } 792 }
@@ -740,7 +801,7 @@ xfs_buf_get_noaddr(
740 801
741 xfs_buf_unlock(bp); 802 xfs_buf_unlock(bp);
742 803
743 trace_xfs_buf_get_noaddr(bp, _RET_IP_); 804 trace_xfs_buf_get_uncached(bp, _RET_IP_);
744 return bp; 805 return bp;
745 806
746 fail_free_mem: 807 fail_free_mem:
@@ -774,29 +835,30 @@ void
774xfs_buf_rele( 835xfs_buf_rele(
775 xfs_buf_t *bp) 836 xfs_buf_t *bp)
776{ 837{
777 xfs_bufhash_t *hash = bp->b_hash; 838 struct xfs_perag *pag = bp->b_pag;
778 839
779 trace_xfs_buf_rele(bp, _RET_IP_); 840 trace_xfs_buf_rele(bp, _RET_IP_);
780 841
781 if (unlikely(!hash)) { 842 if (!pag) {
782 ASSERT(!bp->b_relse); 843 ASSERT(!bp->b_relse);
844 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
783 if (atomic_dec_and_test(&bp->b_hold)) 845 if (atomic_dec_and_test(&bp->b_hold))
784 xfs_buf_free(bp); 846 xfs_buf_free(bp);
785 return; 847 return;
786 } 848 }
787 849
850 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
788 ASSERT(atomic_read(&bp->b_hold) > 0); 851 ASSERT(atomic_read(&bp->b_hold) > 0);
789 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { 852 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
790 if (bp->b_relse) { 853 if (bp->b_relse) {
791 atomic_inc(&bp->b_hold); 854 atomic_inc(&bp->b_hold);
792 spin_unlock(&hash->bh_lock); 855 spin_unlock(&pag->pag_buf_lock);
793 (*(bp->b_relse)) (bp); 856 bp->b_relse(bp);
794 } else if (bp->b_flags & XBF_FS_MANAGED) {
795 spin_unlock(&hash->bh_lock);
796 } else { 857 } else {
797 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 858 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
798 list_del_init(&bp->b_hash_list); 859 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
799 spin_unlock(&hash->bh_lock); 860 spin_unlock(&pag->pag_buf_lock);
861 xfs_perag_put(pag);
800 xfs_buf_free(bp); 862 xfs_buf_free(bp);
801 } 863 }
802 } 864 }
@@ -859,7 +921,7 @@ xfs_buf_lock(
859 trace_xfs_buf_lock(bp, _RET_IP_); 921 trace_xfs_buf_lock(bp, _RET_IP_);
860 922
861 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 923 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
862 xfs_log_force(bp->b_mount, 0); 924 xfs_log_force(bp->b_target->bt_mount, 0);
863 if (atomic_read(&bp->b_io_remaining)) 925 if (atomic_read(&bp->b_io_remaining))
864 blk_run_address_space(bp->b_target->bt_mapping); 926 blk_run_address_space(bp->b_target->bt_mapping);
865 down(&bp->b_sema); 927 down(&bp->b_sema);
@@ -924,19 +986,7 @@ xfs_buf_iodone_work(
924 xfs_buf_t *bp = 986 xfs_buf_t *bp =
925 container_of(work, xfs_buf_t, b_iodone_work); 987 container_of(work, xfs_buf_t, b_iodone_work);
926 988
927 /* 989 if (bp->b_iodone)
928 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
929 * ordered flag and reissue them. Because we can't tell the higher
930 * layers directly that they should not issue ordered I/O anymore, they
931 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
932 */
933 if ((bp->b_error == EOPNOTSUPP) &&
934 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
935 trace_xfs_buf_ordered_retry(bp, _RET_IP_);
936 bp->b_flags &= ~XBF_ORDERED;
937 bp->b_flags |= _XFS_BARRIER_FAILED;
938 xfs_buf_iorequest(bp);
939 } else if (bp->b_iodone)
940 (*(bp->b_iodone))(bp); 990 (*(bp->b_iodone))(bp);
941 else if (bp->b_flags & XBF_ASYNC) 991 else if (bp->b_flags & XBF_ASYNC)
942 xfs_buf_relse(bp); 992 xfs_buf_relse(bp);
@@ -982,7 +1032,6 @@ xfs_bwrite(
982{ 1032{
983 int error; 1033 int error;
984 1034
985 bp->b_mount = mp;
986 bp->b_flags |= XBF_WRITE; 1035 bp->b_flags |= XBF_WRITE;
987 bp->b_flags &= ~(XBF_ASYNC | XBF_READ); 1036 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
988 1037
@@ -1003,8 +1052,6 @@ xfs_bdwrite(
1003{ 1052{
1004 trace_xfs_buf_bdwrite(bp, _RET_IP_); 1053 trace_xfs_buf_bdwrite(bp, _RET_IP_);
1005 1054
1006 bp->b_mount = mp;
1007
1008 bp->b_flags &= ~XBF_READ; 1055 bp->b_flags &= ~XBF_READ;
1009 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); 1056 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1010 1057
@@ -1013,7 +1060,7 @@ xfs_bdwrite(
1013 1060
1014/* 1061/*
1015 * Called when we want to stop a buffer from getting written or read. 1062 * Called when we want to stop a buffer from getting written or read.
1016 * We attach the EIO error, muck with its flags, and call biodone 1063 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
1017 * so that the proper iodone callbacks get called. 1064 * so that the proper iodone callbacks get called.
1018 */ 1065 */
1019STATIC int 1066STATIC int
@@ -1030,21 +1077,21 @@ xfs_bioerror(
1030 XFS_BUF_ERROR(bp, EIO); 1077 XFS_BUF_ERROR(bp, EIO);
1031 1078
1032 /* 1079 /*
1033 * We're calling biodone, so delete XBF_DONE flag. 1080 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1034 */ 1081 */
1035 XFS_BUF_UNREAD(bp); 1082 XFS_BUF_UNREAD(bp);
1036 XFS_BUF_UNDELAYWRITE(bp); 1083 XFS_BUF_UNDELAYWRITE(bp);
1037 XFS_BUF_UNDONE(bp); 1084 XFS_BUF_UNDONE(bp);
1038 XFS_BUF_STALE(bp); 1085 XFS_BUF_STALE(bp);
1039 1086
1040 xfs_biodone(bp); 1087 xfs_buf_ioend(bp, 0);
1041 1088
1042 return EIO; 1089 return EIO;
1043} 1090}
1044 1091
1045/* 1092/*
1046 * Same as xfs_bioerror, except that we are releasing the buffer 1093 * Same as xfs_bioerror, except that we are releasing the buffer
1047 * here ourselves, and avoiding the biodone call. 1094 * here ourselves, and avoiding the xfs_buf_ioend call.
1048 * This is meant for userdata errors; metadata bufs come with 1095 * This is meant for userdata errors; metadata bufs come with
1049 * iodone functions attached, so that we can track down errors. 1096 * iodone functions attached, so that we can track down errors.
1050 */ 1097 */
@@ -1093,7 +1140,7 @@ int
1093xfs_bdstrat_cb( 1140xfs_bdstrat_cb(
1094 struct xfs_buf *bp) 1141 struct xfs_buf *bp)
1095{ 1142{
1096 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { 1143 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1097 trace_xfs_bdstrat_shut(bp, _RET_IP_); 1144 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1098 /* 1145 /*
1099 * Metadata write that didn't get logged but 1146 * Metadata write that didn't get logged but
@@ -1195,7 +1242,7 @@ _xfs_buf_ioapply(
1195 1242
1196 if (bp->b_flags & XBF_ORDERED) { 1243 if (bp->b_flags & XBF_ORDERED) {
1197 ASSERT(!(bp->b_flags & XBF_READ)); 1244 ASSERT(!(bp->b_flags & XBF_READ));
1198 rw = WRITE_BARRIER; 1245 rw = WRITE_FLUSH_FUA;
1199 } else if (bp->b_flags & XBF_LOG_BUFFER) { 1246 } else if (bp->b_flags & XBF_LOG_BUFFER) {
1200 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1247 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1201 bp->b_flags &= ~_XBF_RUN_QUEUES; 1248 bp->b_flags &= ~_XBF_RUN_QUEUES;
@@ -1399,62 +1446,24 @@ xfs_buf_iomove(
1399 */ 1446 */
1400void 1447void
1401xfs_wait_buftarg( 1448xfs_wait_buftarg(
1402 xfs_buftarg_t *btp) 1449 struct xfs_buftarg *btp)
1403{
1404 xfs_buf_t *bp, *n;
1405 xfs_bufhash_t *hash;
1406 uint i;
1407
1408 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1409 hash = &btp->bt_hash[i];
1410again:
1411 spin_lock(&hash->bh_lock);
1412 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
1413 ASSERT(btp == bp->b_target);
1414 if (!(bp->b_flags & XBF_FS_MANAGED)) {
1415 spin_unlock(&hash->bh_lock);
1416 /*
1417 * Catch superblock reference count leaks
1418 * immediately
1419 */
1420 BUG_ON(bp->b_bn == 0);
1421 delay(100);
1422 goto again;
1423 }
1424 }
1425 spin_unlock(&hash->bh_lock);
1426 }
1427}
1428
1429/*
1430 * Allocate buffer hash table for a given target.
1431 * For devices containing metadata (i.e. not the log/realtime devices)
1432 * we need to allocate a much larger hash table.
1433 */
1434STATIC void
1435xfs_alloc_bufhash(
1436 xfs_buftarg_t *btp,
1437 int external)
1438{ 1450{
1439 unsigned int i; 1451 struct xfs_perag *pag;
1452 uint i;
1440 1453
1441 btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */ 1454 for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
1442 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * 1455 pag = xfs_perag_get(btp->bt_mount, i);
1443 sizeof(xfs_bufhash_t)); 1456 spin_lock(&pag->pag_buf_lock);
1444 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1457 while (rb_first(&pag->pag_buf_tree)) {
1445 spin_lock_init(&btp->bt_hash[i].bh_lock); 1458 spin_unlock(&pag->pag_buf_lock);
1446 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1459 delay(100);
1460 spin_lock(&pag->pag_buf_lock);
1461 }
1462 spin_unlock(&pag->pag_buf_lock);
1463 xfs_perag_put(pag);
1447 } 1464 }
1448} 1465}
1449 1466
1450STATIC void
1451xfs_free_bufhash(
1452 xfs_buftarg_t *btp)
1453{
1454 kmem_free_large(btp->bt_hash);
1455 btp->bt_hash = NULL;
1456}
1457
1458/* 1467/*
1459 * buftarg list for delwrite queue processing 1468 * buftarg list for delwrite queue processing
1460 */ 1469 */
@@ -1487,7 +1496,6 @@ xfs_free_buftarg(
1487 xfs_flush_buftarg(btp, 1); 1496 xfs_flush_buftarg(btp, 1);
1488 if (mp->m_flags & XFS_MOUNT_BARRIER) 1497 if (mp->m_flags & XFS_MOUNT_BARRIER)
1489 xfs_blkdev_issue_flush(btp); 1498 xfs_blkdev_issue_flush(btp);
1490 xfs_free_bufhash(btp);
1491 iput(btp->bt_mapping->host); 1499 iput(btp->bt_mapping->host);
1492 1500
1493 /* Unregister the buftarg first so that we don't get a 1501 /* Unregister the buftarg first so that we don't get a
@@ -1572,6 +1580,7 @@ xfs_mapping_buftarg(
1572 XFS_BUFTARG_NAME(btp)); 1580 XFS_BUFTARG_NAME(btp));
1573 return ENOMEM; 1581 return ENOMEM;
1574 } 1582 }
1583 inode->i_ino = get_next_ino();
1575 inode->i_mode = S_IFBLK; 1584 inode->i_mode = S_IFBLK;
1576 inode->i_bdev = bdev; 1585 inode->i_bdev = bdev;
1577 inode->i_rdev = bdev->bd_dev; 1586 inode->i_rdev = bdev->bd_dev;
@@ -1609,6 +1618,7 @@ out_error:
1609 1618
1610xfs_buftarg_t * 1619xfs_buftarg_t *
1611xfs_alloc_buftarg( 1620xfs_alloc_buftarg(
1621 struct xfs_mount *mp,
1612 struct block_device *bdev, 1622 struct block_device *bdev,
1613 int external, 1623 int external,
1614 const char *fsname) 1624 const char *fsname)
@@ -1617,6 +1627,7 @@ xfs_alloc_buftarg(
1617 1627
1618 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1628 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1619 1629
1630 btp->bt_mount = mp;
1620 btp->bt_dev = bdev->bd_dev; 1631 btp->bt_dev = bdev->bd_dev;
1621 btp->bt_bdev = bdev; 1632 btp->bt_bdev = bdev;
1622 if (xfs_setsize_buftarg_early(btp, bdev)) 1633 if (xfs_setsize_buftarg_early(btp, bdev))
@@ -1625,7 +1636,6 @@ xfs_alloc_buftarg(
1625 goto error; 1636 goto error;
1626 if (xfs_alloc_delwrite_queue(btp, fsname)) 1637 if (xfs_alloc_delwrite_queue(btp, fsname))
1627 goto error; 1638 goto error;
1628 xfs_alloc_bufhash(btp, external);
1629 return btp; 1639 return btp;
1630 1640
1631error: 1641error:
@@ -1916,7 +1926,7 @@ xfs_flush_buftarg(
1916 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 1926 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1917 1927
1918 list_del_init(&bp->b_list); 1928 list_del_init(&bp->b_list);
1919 xfs_iowait(bp); 1929 xfs_buf_iowait(bp);
1920 xfs_buf_relse(bp); 1930 xfs_buf_relse(bp);
1921 } 1931 }
1922 } 1932 }
@@ -1933,7 +1943,7 @@ xfs_buf_init(void)
1933 goto out; 1943 goto out;
1934 1944
1935 xfslogd_workqueue = alloc_workqueue("xfslogd", 1945 xfslogd_workqueue = alloc_workqueue("xfslogd",
1936 WQ_RESCUER | WQ_HIGHPRI, 1); 1946 WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
1937 if (!xfslogd_workqueue) 1947 if (!xfslogd_workqueue)
1938 goto out_free_buf_zone; 1948 goto out_free_buf_zone;
1939 1949
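Note: the per-target buffer hash table gives way to one rbtree per allocation group, keyed by block offset and rooted in struct xfs_perag. The walk in _xfs_buf_find() above is the canonical Linux rbtree insert-or-find idiom; stripped of the stale-buffer special case it reduces to:

    struct rb_node **link = &pag->pag_buf_tree.rb_node;
    struct rb_node *parent = NULL;
    struct xfs_buf *bp;

    while (*link) {
            parent = *link;
            bp = rb_entry(parent, struct xfs_buf, b_rbnode);
            if (range_base < bp->b_file_offset)
                    link = &(*link)->rb_left;
            else if (range_base > bp->b_file_offset)
                    link = &(*link)->rb_right;
            else
                    goto found;     /* take a hold and return bp */
    }
    rb_link_node(&new_bp->b_rbnode, parent, link);
    rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);

Buffers pin the perag reference until they are freed, which is why xfs_buf_rele() now calls xfs_perag_put() after rb_erase().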
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 2a05614f0b92..383a3f37cf98 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -51,7 +51,6 @@ typedef enum {
51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ 51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ 52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ 53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
54#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */
55#define XBF_ORDERED (1 << 11)/* use ordered writes */ 54#define XBF_ORDERED (1 << 11)/* use ordered writes */
56#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ 55#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */
57#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ 56#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */
@@ -86,14 +85,6 @@ typedef enum {
86 */ 85 */
87#define _XBF_PAGE_LOCKED (1 << 22) 86#define _XBF_PAGE_LOCKED (1 << 22)
88 87
89/*
90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information.
94 */
95#define _XFS_BARRIER_FAILED (1 << 23)
96
97typedef unsigned int xfs_buf_flags_t; 88typedef unsigned int xfs_buf_flags_t;
98 89
99#define XFS_BUF_FLAGS \ 90#define XFS_BUF_FLAGS \
@@ -104,7 +95,6 @@ typedef unsigned int xfs_buf_flags_t;
104 { XBF_DONE, "DONE" }, \ 95 { XBF_DONE, "DONE" }, \
105 { XBF_DELWRI, "DELWRI" }, \ 96 { XBF_DELWRI, "DELWRI" }, \
106 { XBF_STALE, "STALE" }, \ 97 { XBF_STALE, "STALE" }, \
107 { XBF_FS_MANAGED, "FS_MANAGED" }, \
108 { XBF_ORDERED, "ORDERED" }, \ 98 { XBF_ORDERED, "ORDERED" }, \
109 { XBF_READ_AHEAD, "READ_AHEAD" }, \ 99 { XBF_READ_AHEAD, "READ_AHEAD" }, \
110 { XBF_LOCK, "LOCK" }, /* should never be set */\ 100 { XBF_LOCK, "LOCK" }, /* should never be set */\
@@ -114,8 +104,7 @@ typedef unsigned int xfs_buf_flags_t;
114 { _XBF_PAGES, "PAGES" }, \ 104 { _XBF_PAGES, "PAGES" }, \
115 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 105 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
116 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 106 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
117 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \ 107 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }
118 { _XFS_BARRIER_FAILED, "BARRIER_FAILED" }
119 108
120 109
121typedef enum { 110typedef enum {
@@ -132,14 +121,11 @@ typedef struct xfs_buftarg {
132 dev_t bt_dev; 121 dev_t bt_dev;
133 struct block_device *bt_bdev; 122 struct block_device *bt_bdev;
134 struct address_space *bt_mapping; 123 struct address_space *bt_mapping;
124 struct xfs_mount *bt_mount;
135 unsigned int bt_bsize; 125 unsigned int bt_bsize;
136 unsigned int bt_sshift; 126 unsigned int bt_sshift;
137 size_t bt_smask; 127 size_t bt_smask;
138 128
139 /* per device buffer hash table */
140 uint bt_hashshift;
141 xfs_bufhash_t *bt_hash;
142
143 /* per device delwri queue */ 129 /* per device delwri queue */
144 struct task_struct *bt_task; 130 struct task_struct *bt_task;
145 struct list_head bt_list; 131 struct list_head bt_list;
@@ -167,34 +153,41 @@ typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
167#define XB_PAGES 2 153#define XB_PAGES 2
168 154
169typedef struct xfs_buf { 155typedef struct xfs_buf {
156 /*
157 * first cacheline holds all the fields needed for an uncontended cache
158 * hit to be fully processed. The semaphore straddles the cacheline
 159 * boundary, but the counter and lock sit on the first cacheline,
160 * which is the only bit that is touched if we hit the semaphore
161 * fast-path on locking.
162 */
163 struct rb_node b_rbnode; /* rbtree node */
164 xfs_off_t b_file_offset; /* offset in file */
165 size_t b_buffer_length;/* size of buffer in bytes */
166 atomic_t b_hold; /* reference count */
167 xfs_buf_flags_t b_flags; /* status flags */
170 struct semaphore b_sema; /* semaphore for lockables */ 168 struct semaphore b_sema; /* semaphore for lockables */
171 unsigned long b_queuetime; /* time buffer was queued */ 169
172 atomic_t b_pin_count; /* pin count */
173 wait_queue_head_t b_waiters; /* unpin waiters */ 170 wait_queue_head_t b_waiters; /* unpin waiters */
174 struct list_head b_list; 171 struct list_head b_list;
175 xfs_buf_flags_t b_flags; /* status flags */ 172 struct xfs_perag *b_pag; /* contains rbtree root */
176 struct list_head b_hash_list; /* hash table list */
177 xfs_bufhash_t *b_hash; /* hash table list start */
178 xfs_buftarg_t *b_target; /* buffer target (device) */ 173 xfs_buftarg_t *b_target; /* buffer target (device) */
179 atomic_t b_hold; /* reference count */
180 xfs_daddr_t b_bn; /* block number for I/O */ 174 xfs_daddr_t b_bn; /* block number for I/O */
181 xfs_off_t b_file_offset; /* offset in file */
182 size_t b_buffer_length;/* size of buffer in bytes */
183 size_t b_count_desired;/* desired transfer size */ 175 size_t b_count_desired;/* desired transfer size */
184 void *b_addr; /* virtual address of buffer */ 176 void *b_addr; /* virtual address of buffer */
185 struct work_struct b_iodone_work; 177 struct work_struct b_iodone_work;
186 atomic_t b_io_remaining; /* #outstanding I/O requests */
187 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 178 xfs_buf_iodone_t b_iodone; /* I/O completion function */
188 xfs_buf_relse_t b_relse; /* releasing function */ 179 xfs_buf_relse_t b_relse; /* releasing function */
189 struct completion b_iowait; /* queue for I/O waiters */ 180 struct completion b_iowait; /* queue for I/O waiters */
190 void *b_fspriv; 181 void *b_fspriv;
191 void *b_fspriv2; 182 void *b_fspriv2;
192 struct xfs_mount *b_mount;
193 unsigned short b_error; /* error code on I/O */
194 unsigned int b_page_count; /* size of page array */
195 unsigned int b_offset; /* page offset in first page */
196 struct page **b_pages; /* array of page pointers */ 183 struct page **b_pages; /* array of page pointers */
197 struct page *b_page_array[XB_PAGES]; /* inline pages */ 184 struct page *b_page_array[XB_PAGES]; /* inline pages */
185 unsigned long b_queuetime; /* time buffer was queued */
186 atomic_t b_pin_count; /* pin count */
187 atomic_t b_io_remaining; /* #outstanding I/O requests */
188 unsigned int b_page_count; /* size of page array */
189 unsigned int b_offset; /* page offset in first page */
190 unsigned short b_error; /* error code on I/O */
198#ifdef XFS_BUF_LOCK_TRACKING 191#ifdef XFS_BUF_LOCK_TRACKING
199 int b_last_holder; 192 int b_last_holder;
200#endif 193#endif
@@ -213,11 +206,13 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
213 xfs_buf_flags_t); 206 xfs_buf_flags_t);
214 207
215extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 208extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
216extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); 209extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
217extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 210extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
218extern void xfs_buf_hold(xfs_buf_t *); 211extern void xfs_buf_hold(xfs_buf_t *);
219extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, 212extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
220 xfs_buf_flags_t); 213struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
214 struct xfs_buftarg *target,
215 xfs_daddr_t daddr, size_t length, int flags);
221 216
222/* Releasing Buffers */ 217/* Releasing Buffers */
223extern void xfs_buf_free(xfs_buf_t *); 218extern void xfs_buf_free(xfs_buf_t *);
@@ -242,6 +237,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *);
242extern int xfs_buf_iowait(xfs_buf_t *); 237extern int xfs_buf_iowait(xfs_buf_t *);
243extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 238extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
244 xfs_buf_rw_t); 239 xfs_buf_rw_t);
240#define xfs_buf_zero(bp, off, len) \
241 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
245 242
246static inline int xfs_buf_geterror(xfs_buf_t *bp) 243static inline int xfs_buf_geterror(xfs_buf_t *bp)
247{ 244{
@@ -276,8 +273,6 @@ extern void xfs_buf_terminate(void);
276 XFS_BUF_DONE(bp); \ 273 XFS_BUF_DONE(bp); \
277 } while (0) 274 } while (0)
278 275
279#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
280
281#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 276#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
282#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) 277#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
283#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) 278#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
@@ -356,25 +351,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
356 xfs_buf_rele(bp); 351 xfs_buf_rele(bp);
357} 352}
358 353
359#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
360
361#define xfs_biomove(bp, off, len, data, rw) \
362 xfs_buf_iomove((bp), (off), (len), (data), \
363 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
364
365#define xfs_biozero(bp, off, len) \
366 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
367
368#define xfs_iowait(bp) xfs_buf_iowait(bp)
369
370#define xfs_baread(target, rablkno, ralen) \
371 xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
372
373
374/* 354/*
375 * Handling of buftargs. 355 * Handling of buftargs.
376 */ 356 */
377extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); 357extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
358 struct block_device *, int, const char *);
378extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 359extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
379extern void xfs_wait_buftarg(xfs_buftarg_t *); 360extern void xfs_wait_buftarg(xfs_buftarg_t *);
380extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 361extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
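Note: struct xfs_buf is reordered so that an uncontended cache hit touches only the first cacheline: the rbtree node, the lookup keys (b_file_offset, b_buffer_length), the hold count, the flags, and the start of the semaphore. A hypothetical compile-time guard for that layout contract (not part of the patch):

    BUILD_BUG_ON(offsetof(struct xfs_buf, b_sema) >= SMP_CACHE_BYTES);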
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
deleted file mode 100644
index 55bddf3b6091..000000000000
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ /dev/null
@@ -1,28 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_CRED_H__
19#define __XFS_CRED_H__
20
21#include <linux/capability.h>
22
23/*
24 * Credentials
25 */
26typedef const struct cred cred_t;
27
28#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1f279b012f94..ed88ed16811c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -32,10 +32,9 @@ xfs_tosspages(
32 xfs_off_t last, 32 xfs_off_t last,
33 int fiopt) 33 int fiopt)
34{ 34{
35 struct address_space *mapping = VFS_I(ip)->i_mapping; 35 /* can't toss partial tail pages, so mask them out */
36 36 last &= ~(PAGE_SIZE - 1);
37 if (mapping->nrpages) 37 truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
38 truncate_inode_pages(mapping, first);
39} 38}
40 39
41int 40int
@@ -50,12 +49,11 @@ xfs_flushinval_pages(
50 49
51 trace_xfs_pagecache_inval(ip, first, last); 50 trace_xfs_pagecache_inval(ip, first, last);
52 51
53 if (mapping->nrpages) { 52 xfs_iflags_clear(ip, XFS_ITRUNCATED);
54 xfs_iflags_clear(ip, XFS_ITRUNCATED); 53 ret = filemap_write_and_wait_range(mapping, first,
55 ret = filemap_write_and_wait(mapping); 54 last == -1 ? LLONG_MAX : last);
56 if (!ret) 55 if (!ret)
57 truncate_inode_pages(mapping, first); 56 truncate_inode_pages_range(mapping, first, last);
58 }
59 return -ret; 57 return -ret;
60} 58}
61 59
@@ -71,10 +69,9 @@ xfs_flush_pages(
71 int ret = 0; 69 int ret = 0;
72 int ret2; 70 int ret2;
73 71
74 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 72 xfs_iflags_clear(ip, XFS_ITRUNCATED);
75 xfs_iflags_clear(ip, XFS_ITRUNCATED); 73 ret = -filemap_fdatawrite_range(mapping, first,
76 ret = -filemap_fdatawrite(mapping); 74 last == -1 ? LLONG_MAX : last);
77 }
78 if (flags & XBF_ASYNC) 75 if (flags & XBF_ASYNC)
79 return ret; 76 return ret;
80 ret2 = xfs_wait_on_pages(ip, first, last); 77 ret2 = xfs_wait_on_pages(ip, first, last);
@@ -91,7 +88,9 @@ xfs_wait_on_pages(
91{ 88{
92 struct address_space *mapping = VFS_I(ip)->i_mapping; 89 struct address_space *mapping = VFS_I(ip)->i_mapping;
93 90
94 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) 91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
95 return -filemap_fdatawait(mapping); 92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? ip->i_size - 1 : last);
94 }
96 return 0; 95 return 0;
97} 96}
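Note: all three helpers move from whole-mapping operations to the range-based page cache APIs, with last == -1 standing for "to end of file". Condensed, the convention is:

    loff_t end = (last == -1) ? LLONG_MAX : last;   /* inclusive */

    error = filemap_write_and_wait_range(mapping, first, end);
    if (!error)
            truncate_inode_pages_range(mapping, first, end);

xfs_tosspages() additionally masks off any partial tail page, since truncate_inode_pages_range() cannot toss part of a page.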
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 2ae8b1ccb02e..76e81cff70b9 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -16,7 +16,6 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_cred.h"
20#include "xfs_sysctl.h" 19#include "xfs_sysctl.h"
21 20
22/* 21/*
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
deleted file mode 100644
index 69f71caf061c..000000000000
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ /dev/null
@@ -1,23 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_GLOBALS_H__
19#define __XFS_GLOBALS_H__
20
21extern uint64_t xfs_panic_mask; /* set to cause more panics */
22
23#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 3b9e626f7cd1..2ea238f6d38e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -790,7 +790,7 @@ xfs_ioc_fsgetxattr(
790 xfs_ilock(ip, XFS_ILOCK_SHARED); 790 xfs_ilock(ip, XFS_ILOCK_SHARED);
791 fa.fsx_xflags = xfs_ip2xflags(ip); 791 fa.fsx_xflags = xfs_ip2xflags(ip);
792 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; 792 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
793 fa.fsx_projid = ip->i_d.di_projid; 793 fa.fsx_projid = xfs_get_projid(ip);
794 794
795 if (attr) { 795 if (attr) {
796 if (ip->i_afp) { 796 if (ip->i_afp) {
@@ -909,10 +909,10 @@ xfs_ioctl_setattr(
909 return XFS_ERROR(EIO); 909 return XFS_ERROR(EIO);
910 910
911 /* 911 /*
912 * Disallow 32bit project ids because on-disk structure 912 * Disallow 32bit project ids when the projid32bit feature is not enabled.
913 * is 16bit only.
914 */ 913 */
915 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) 914 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
915 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
916 return XFS_ERROR(EINVAL); 916 return XFS_ERROR(EINVAL);
917 917
918 /* 918 /*
@@ -961,7 +961,7 @@ xfs_ioctl_setattr(
961 if (mask & FSX_PROJID) { 961 if (mask & FSX_PROJID) {
962 if (XFS_IS_QUOTA_RUNNING(mp) && 962 if (XFS_IS_QUOTA_RUNNING(mp) &&
963 XFS_IS_PQUOTA_ON(mp) && 963 XFS_IS_PQUOTA_ON(mp) &&
964 ip->i_d.di_projid != fa->fsx_projid) { 964 xfs_get_projid(ip) != fa->fsx_projid) {
965 ASSERT(tp); 965 ASSERT(tp);
966 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 966 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
967 capable(CAP_FOWNER) ? 967 capable(CAP_FOWNER) ?
@@ -1063,12 +1063,12 @@ xfs_ioctl_setattr(
1063 * Change the ownerships and register quota modifications 1063 * Change the ownerships and register quota modifications
1064 * in the transaction. 1064 * in the transaction.
1065 */ 1065 */
1066 if (ip->i_d.di_projid != fa->fsx_projid) { 1066 if (xfs_get_projid(ip) != fa->fsx_projid) {
1067 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { 1067 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1068 olddquot = xfs_qm_vop_chown(tp, ip, 1068 olddquot = xfs_qm_vop_chown(tp, ip,
1069 &ip->i_gdquot, gdqp); 1069 &ip->i_gdquot, gdqp);
1070 } 1070 }
1071 ip->i_d.di_projid = fa->fsx_projid; 1071 xfs_set_projid(ip, fa->fsx_projid);
1072 1072
1073 /* 1073 /*
1074 * We may have to rev the inode as well as 1074 * We may have to rev the inode as well as
@@ -1088,8 +1088,8 @@ xfs_ioctl_setattr(
1088 xfs_diflags_to_linux(ip); 1088 xfs_diflags_to_linux(ip);
1089 } 1089 }
1090 1090
1091 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1091 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1092 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1092 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
1093 1093
1094 XFS_STATS_INC(xs_ig_attrchg); 1094 XFS_STATS_INC(xs_ig_attrchg);
1095 1095
@@ -1301,7 +1301,8 @@ xfs_file_ioctl(
1301 case XFS_IOC_ALLOCSP64: 1301 case XFS_IOC_ALLOCSP64:
1302 case XFS_IOC_FREESP64: 1302 case XFS_IOC_FREESP64:
1303 case XFS_IOC_RESVSP64: 1303 case XFS_IOC_RESVSP64:
1304 case XFS_IOC_UNRESVSP64: { 1304 case XFS_IOC_UNRESVSP64:
1305 case XFS_IOC_ZERO_RANGE: {
1305 xfs_flock64_t bf; 1306 xfs_flock64_t bf;
1306 1307
1307 if (copy_from_user(&bf, arg, sizeof(bf))) 1308 if (copy_from_user(&bf, arg, sizeof(bf)))
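
The rewritten comment and check make the rule explicit: a project id that needs more than 16 bits is only valid when the filesystem advertises the projid32bit feature. The feature predicate itself is not part of this hunk; presumably it is a features2 bit test along these lines (a sketch under that assumption):

	/* sketch: assumed shape of the feature test added elsewhere in this
	 * series (xfs_sb.h) */
	static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
	{
		return xfs_sb_version_hasmorebits(sbp) &&
		       (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
	}
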
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 6c83f7f62dc9..b3486dfa5520 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -164,7 +164,8 @@ xfs_ioctl32_bstat_copyin(
164 get_user(bstat->bs_extsize, &bstat32->bs_extsize) || 164 get_user(bstat->bs_extsize, &bstat32->bs_extsize) ||
165 get_user(bstat->bs_extents, &bstat32->bs_extents) || 165 get_user(bstat->bs_extents, &bstat32->bs_extents) ||
166 get_user(bstat->bs_gen, &bstat32->bs_gen) || 166 get_user(bstat->bs_gen, &bstat32->bs_gen) ||
167 get_user(bstat->bs_projid, &bstat32->bs_projid) || 167 get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
168 get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
168 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || 169 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
169 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || 170 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
170 get_user(bstat->bs_aextents, &bstat32->bs_aextents)) 171 get_user(bstat->bs_aextents, &bstat32->bs_aextents))
@@ -218,6 +219,7 @@ xfs_bulkstat_one_fmt_compat(
218 put_user(buffer->bs_extents, &p32->bs_extents) || 219 put_user(buffer->bs_extents, &p32->bs_extents) ||
219 put_user(buffer->bs_gen, &p32->bs_gen) || 220 put_user(buffer->bs_gen, &p32->bs_gen) ||
220 put_user(buffer->bs_projid, &p32->bs_projid) || 221 put_user(buffer->bs_projid, &p32->bs_projid) ||
222 put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) ||
221 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || 223 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
222 put_user(buffer->bs_dmstate, &p32->bs_dmstate) || 224 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
223 put_user(buffer->bs_aextents, &p32->bs_aextents)) 225 put_user(buffer->bs_aextents, &p32->bs_aextents))
@@ -574,6 +576,7 @@ xfs_file_compat_ioctl(
574 case XFS_IOC_FSGEOMETRY_V1: 576 case XFS_IOC_FSGEOMETRY_V1:
575 case XFS_IOC_FSGROWFSDATA: 577 case XFS_IOC_FSGROWFSDATA:
576 case XFS_IOC_FSGROWFSRT: 578 case XFS_IOC_FSGROWFSRT:
579 case XFS_IOC_ZERO_RANGE:
577 return xfs_file_ioctl(filp, cmd, p); 580 return xfs_file_ioctl(filp, cmd, p);
578#else 581#else
579 case XFS_IOC_ALLOCSP_32: 582 case XFS_IOC_ALLOCSP_32:
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 1024c4f8ba0d..08b605792a99 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat {
65 __s32 bs_extsize; /* extent size */ 65 __s32 bs_extsize; /* extent size */
66 __s32 bs_extents; /* number of extents */ 66 __s32 bs_extents; /* number of extents */
67 __u32 bs_gen; /* generation count */ 67 __u32 bs_gen; /* generation count */
68 __u16 bs_projid; /* project id */ 68 __u16 bs_projid_lo; /* lower part of project id */
69 unsigned char bs_pad[14]; /* pad space, unused */ 69#define bs_projid bs_projid_lo /* (previously just bs_projid) */
70 __u16 bs_projid_hi; /* high part of project id */
71 unsigned char bs_pad[12]; /* pad space, unused */
70 __u32 bs_dmevmask; /* DMIG event mask */ 72 __u32 bs_dmevmask; /* DMIG event mask */
71 __u16 bs_dmstate; /* DMIG state info */ 73 __u16 bs_dmstate; /* DMIG state info */
72 __u16 bs_aextents; /* attribute number of extents */ 74 __u16 bs_aextents; /* attribute number of extents */
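
Note how the compat ABI stays fixed-size: bs_projid keeps its offset but now holds only the low 16 bits, bs_projid_hi takes two bytes from the pad (14 down to 12), and the #define keeps old callers compiling. Code that wants the full 32-bit id recombines the halves; a sketch of the accessor (the xfs_get_projid()/xfs_set_projid() helpers used in xfs_ioctl.c above presumably do the same on the on-disk fields):

	/* sketch: rebuild the 32-bit project id from its split halves */
	static inline __u32 compat_bstat_projid(const struct compat_xfs_bstat *bs)
	{
		return (__u32)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
	}
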
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index b1fc2a6bfe83..96107efc0c61 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -95,41 +95,6 @@ xfs_mark_inode_dirty(
95} 95}
96 96
97/* 97/*
98 * Change the requested timestamp in the given inode.
99 * We don't lock across timestamp updates, and we don't log them but
100 * we do record the fact that there is dirty information in core.
101 */
102void
103xfs_ichgtime(
104 xfs_inode_t *ip,
105 int flags)
106{
107 struct inode *inode = VFS_I(ip);
108 timespec_t tv;
109 int sync_it = 0;
110
111 tv = current_fs_time(inode->i_sb);
112
113 if ((flags & XFS_ICHGTIME_MOD) &&
114 !timespec_equal(&inode->i_mtime, &tv)) {
115 inode->i_mtime = tv;
116 sync_it = 1;
117 }
118 if ((flags & XFS_ICHGTIME_CHG) &&
119 !timespec_equal(&inode->i_ctime, &tv)) {
120 inode->i_ctime = tv;
121 sync_it = 1;
122 }
123
124 /*
125 * Update complete - now make sure everyone knows that the inode
126 * is dirty.
127 */
128 if (sync_it)
129 xfs_mark_inode_dirty_sync(ip);
130}
131
132/*
133 * Hook in SELinux. This is not quite correct yet, what we really need 98 * Hook in SELinux. This is not quite correct yet, what we really need
134 * here (as we do for default ACLs) is a mechanism by which creation of 99 * here (as we do for default ACLs) is a mechanism by which creation of
135 * these attrs can be journalled at inode creation time (along with the 100 * these attrs can be journalled at inode creation time (along with the
@@ -224,7 +189,7 @@ xfs_vn_mknod(
224 } 189 }
225 190
226 xfs_dentry_to_name(&name, dentry); 191 xfs_dentry_to_name(&name, dentry);
227 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); 192 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
228 if (unlikely(error)) 193 if (unlikely(error))
229 goto out_free_acl; 194 goto out_free_acl;
230 195
@@ -352,7 +317,7 @@ xfs_vn_link(
352 if (unlikely(error)) 317 if (unlikely(error))
353 return -error; 318 return -error;
354 319
355 atomic_inc(&inode->i_count); 320 ihold(inode);
356 d_instantiate(dentry, inode); 321 d_instantiate(dentry, inode);
357 return 0; 322 return 0;
358} 323}
@@ -397,7 +362,7 @@ xfs_vn_symlink(
397 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); 362 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
398 xfs_dentry_to_name(&name, dentry); 363 xfs_dentry_to_name(&name, dentry);
399 364
400 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); 365 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
401 if (unlikely(error)) 366 if (unlikely(error))
402 goto out; 367 goto out;
403 368
@@ -795,7 +760,9 @@ xfs_setup_inode(
795 760
796 inode->i_ino = ip->i_ino; 761 inode->i_ino = ip->i_ino;
797 inode->i_state = I_NEW; 762 inode->i_state = I_NEW;
798 inode_add_to_lists(ip->i_mount->m_super, inode); 763
764 inode_sb_list_add(inode);
765 insert_inode_hash(inode);
799 766
800 inode->i_mode = ip->i_d.di_mode; 767 inode->i_mode = ip->i_d.di_mode;
801 inode->i_nlink = ip->i_d.di_nlink; 768 inode->i_nlink = ip->i_d.di_nlink;
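
The xfs_vn_link() hunk replaces an open-coded atomic_inc(&inode->i_count) with ihold(), the VFS helper for taking an extra reference on an inode the caller already holds a reference to. As of this kernel it is roughly (quoted from memory, treat as a sketch):

	void ihold(struct inode *inode)
	{
		WARN_ON(atomic_inc_return(&inode->i_count) < 2);
	}

which also asserts that a prior reference really existed, something the raw atomic_inc() never checked.
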
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 2fa0bd9ebc7f..214ddd71ff79 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -71,6 +71,7 @@
71#include <linux/random.h> 71#include <linux/random.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/writeback.h> 73#include <linux/writeback.h>
74#include <linux/capability.h>
74 75
75#include <asm/page.h> 76#include <asm/page.h>
76#include <asm/div64.h> 77#include <asm/div64.h>
@@ -79,14 +80,12 @@
79#include <asm/byteorder.h> 80#include <asm/byteorder.h>
80#include <asm/unaligned.h> 81#include <asm/unaligned.h>
81 82
82#include <xfs_cred.h>
83#include <xfs_vnode.h> 83#include <xfs_vnode.h>
84#include <xfs_stats.h> 84#include <xfs_stats.h>
85#include <xfs_sysctl.h> 85#include <xfs_sysctl.h>
86#include <xfs_iops.h> 86#include <xfs_iops.h>
87#include <xfs_aops.h> 87#include <xfs_aops.h>
88#include <xfs_super.h> 88#include <xfs_super.h>
89#include <xfs_globals.h>
90#include <xfs_buf.h> 89#include <xfs_buf.h>
91 90
92/* 91/*
@@ -144,7 +143,7 @@
144#define SYNCHRONIZE() barrier() 143#define SYNCHRONIZE() barrier()
145#define __return_address __builtin_return_address(0) 144#define __return_address __builtin_return_address(0)
146 145
147#define dfltprid 0 146#define XFS_PROJID_DEFAULT 0
148#define MAXPATHLEN 1024 147#define MAXPATHLEN 1024
149 148
150#define MIN(a,b) (min(a,b)) 149#define MIN(a,b) (min(a,b))
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a4e07974955b..cf808782c065 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -44,7 +44,6 @@
44#include "xfs_buf_item.h" 44#include "xfs_buf_item.h"
45#include "xfs_utils.h" 45#include "xfs_utils.h"
46#include "xfs_vnodeops.h" 46#include "xfs_vnodeops.h"
47#include "xfs_version.h"
48#include "xfs_log_priv.h" 47#include "xfs_log_priv.h"
49#include "xfs_trans_priv.h" 48#include "xfs_trans_priv.h"
50#include "xfs_filestream.h" 49#include "xfs_filestream.h"
@@ -577,7 +576,7 @@ xfs_max_file_offset(
577 576
578 /* Figure out maximum filesize, on Linux this can depend on 577 /* Figure out maximum filesize, on Linux this can depend on
579 * the filesystem blocksize (on 32 bit platforms). 578 * the filesystem blocksize (on 32 bit platforms).
580 * __block_prepare_write does this in an [unsigned] long... 579 * __block_write_begin does this in an [unsigned] long...
581 * page->index << (PAGE_CACHE_SHIFT - bbits) 580 * page->index << (PAGE_CACHE_SHIFT - bbits)
582 * So, for page sized blocks (4K on 32 bit platforms), 581 * So, for page sized blocks (4K on 32 bit platforms),
583 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is 582 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
@@ -645,7 +644,7 @@ xfs_barrier_test(
645 XFS_BUF_ORDERED(sbp); 644 XFS_BUF_ORDERED(sbp);
646 645
647 xfsbdstrat(mp, sbp); 646 xfsbdstrat(mp, sbp);
648 error = xfs_iowait(sbp); 647 error = xfs_buf_iowait(sbp);
649 648
650 /* 649 /*
651 * Clear all the flags we set and possible error state in the 650 * Clear all the flags we set and possible error state in the
@@ -693,8 +692,7 @@ void
693xfs_blkdev_issue_flush( 692xfs_blkdev_issue_flush(
694 xfs_buftarg_t *buftarg) 693 xfs_buftarg_t *buftarg)
695{ 694{
696 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, 695 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
697 BLKDEV_IFL_WAIT);
698} 696}
699 697
700STATIC void 698STATIC void
@@ -758,18 +756,20 @@ xfs_open_devices(
758 * Setup xfs_mount buffer target pointers 756 * Setup xfs_mount buffer target pointers
759 */ 757 */
760 error = ENOMEM; 758 error = ENOMEM;
761 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); 759 mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
762 if (!mp->m_ddev_targp) 760 if (!mp->m_ddev_targp)
763 goto out_close_rtdev; 761 goto out_close_rtdev;
764 762
765 if (rtdev) { 763 if (rtdev) {
766 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); 764 mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
765 mp->m_fsname);
767 if (!mp->m_rtdev_targp) 766 if (!mp->m_rtdev_targp)
768 goto out_free_ddev_targ; 767 goto out_free_ddev_targ;
769 } 768 }
770 769
771 if (logdev && logdev != ddev) { 770 if (logdev && logdev != ddev) {
772 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); 771 mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
772 mp->m_fsname);
773 if (!mp->m_logdev_targp) 773 if (!mp->m_logdev_targp)
774 goto out_free_rtdev_targ; 774 goto out_free_rtdev_targ;
775 } else { 775 } else {
@@ -972,12 +972,7 @@ xfs_fs_inode_init_once(
972 972
973/* 973/*
974 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that 974 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
975 * we catch unlogged VFS level updates to the inode. Care must be taken 975 * we catch unlogged VFS level updates to the inode.
976 * here - the transaction code calls mark_inode_dirty_sync() to mark the
977 * VFS inode dirty in a transaction and clears the i_update_core field;
978 * it must clear the field after calling mark_inode_dirty_sync() to
979 * correctly indicate that the dirty state has been propagated into the
980 * inode log item.
981 * 976 *
982 * We need the barrier() to maintain correct ordering between unlogged 977 * We need the barrier() to maintain correct ordering between unlogged
983 * updates and the transaction commit code that clears the i_update_core 978 * updates and the transaction commit code that clears the i_update_core
@@ -1521,8 +1516,9 @@ xfs_fs_fill_super(
1521 if (error) 1516 if (error)
1522 goto out_free_fsname; 1517 goto out_free_fsname;
1523 1518
1524 if (xfs_icsb_init_counters(mp)) 1519 error = xfs_icsb_init_counters(mp);
1525 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1520 if (error)
1521 goto out_close_devices;
1526 1522
1527 error = xfs_readsb(mp, flags); 1523 error = xfs_readsb(mp, flags);
1528 if (error) 1524 if (error)
@@ -1583,6 +1579,7 @@ xfs_fs_fill_super(
1583 xfs_freesb(mp); 1579 xfs_freesb(mp);
1584 out_destroy_counters: 1580 out_destroy_counters:
1585 xfs_icsb_destroy_counters(mp); 1581 xfs_icsb_destroy_counters(mp);
1582 out_close_devices:
1586 xfs_close_devices(mp); 1583 xfs_close_devices(mp);
1587 out_free_fsname: 1584 out_free_fsname:
1588 xfs_free_fsname(mp); 1585 xfs_free_fsname(mp);
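
Turning xfs_icsb_init_counters() failure from a soft fallback into a mount failure needs a matching unwind point, hence the new out_close_devices label. The error path follows the usual stacked-goto convention, undoing in reverse order exactly what succeeded; condensed, the resulting flow is (a sketch, not a verbatim quote):

	error = xfs_icsb_init_counters(mp);
	if (error)
		goto out_close_devices;	/* devices open, counters not set up */
	...
 out_destroy_counters:
	xfs_icsb_destroy_counters(mp);
 out_close_devices:
	xfs_close_devices(mp);
 out_free_fsname:
	xfs_free_fsname(mp);
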
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 1ef4a4d2d997..50a3266c999e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -62,6 +62,7 @@ extern void xfs_qm_exit(void);
62# define XFS_DBG_STRING "no debug" 62# define XFS_DBG_STRING "no debug"
63#endif 63#endif
64 64
65#define XFS_VERSION_STRING "SGI XFS"
65#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ 66#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
66 XFS_SECURITY_STRING \ 67 XFS_SECURITY_STRING \
67 XFS_REALTIME_STRING \ 68 XFS_REALTIME_STRING \
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index d59c4a65d492..37d33254981d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -39,42 +39,39 @@
39#include <linux/kthread.h> 39#include <linux/kthread.h>
40#include <linux/freezer.h> 40#include <linux/freezer.h>
41 41
42/*
43 * The inode lookup is done in batches to keep the amount of lock traffic and
44 * radix tree lookups to a minimum. The batch size is a trade off between
45 * lookup reduction and stack usage. This is in the reclaim path, so we can't
46 * be too greedy.
47 */
48#define XFS_LOOKUP_BATCH 32
42 49
43STATIC xfs_inode_t * 50STATIC int
44xfs_inode_ag_lookup( 51xfs_inode_ag_walk_grab(
45 struct xfs_mount *mp, 52 struct xfs_inode *ip)
46 struct xfs_perag *pag,
47 uint32_t *first_index,
48 int tag)
49{ 53{
50 int nr_found; 54 struct inode *inode = VFS_I(ip);
51 struct xfs_inode *ip;
52 55
53 /* 56 /* nothing to sync during shutdown */
54 * use a gang lookup to find the next inode in the tree 57 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
55 * as the tree is sparse and a gang lookup walks to find 58 return EFSCORRUPTED;
56 * the number of objects requested. 59
57 */ 60 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
58 if (tag == XFS_ICI_NO_TAG) { 61 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
59 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 62 return ENOENT;
60 (void **)&ip, *first_index, 1); 63
61 } else { 64 /* If we can't grab the inode, it must be on its way to reclaim. */
62 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, 65 if (!igrab(inode))
63 (void **)&ip, *first_index, 1, tag); 66 return ENOENT;
67
68 if (is_bad_inode(inode)) {
69 IRELE(ip);
70 return ENOENT;
64 } 71 }
65 if (!nr_found)
66 return NULL;
67 72
68 /* 73 /* inode is valid */
69 * Update the index for the next lookup. Catch overflows 74 return 0;
70 * into the next AG range which can occur if we have inodes
71 * in the last block of the AG and we are currently
72 * pointing to the last inode.
73 */
74 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
75 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
76 return NULL;
77 return ip;
78} 75}
79 76
80STATIC int 77STATIC int
@@ -83,49 +80,75 @@ xfs_inode_ag_walk(
83 struct xfs_perag *pag, 80 struct xfs_perag *pag,
84 int (*execute)(struct xfs_inode *ip, 81 int (*execute)(struct xfs_inode *ip,
85 struct xfs_perag *pag, int flags), 82 struct xfs_perag *pag, int flags),
86 int flags, 83 int flags)
87 int tag,
88 int exclusive,
89 int *nr_to_scan)
90{ 84{
91 uint32_t first_index; 85 uint32_t first_index;
92 int last_error = 0; 86 int last_error = 0;
93 int skipped; 87 int skipped;
88 int done;
89 int nr_found;
94 90
95restart: 91restart:
92 done = 0;
96 skipped = 0; 93 skipped = 0;
97 first_index = 0; 94 first_index = 0;
95 nr_found = 0;
98 do { 96 do {
97 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
99 int error = 0; 98 int error = 0;
100 xfs_inode_t *ip; 99 int i;
101 100
102 if (exclusive) 101 read_lock(&pag->pag_ici_lock);
103 write_lock(&pag->pag_ici_lock); 102 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
104 else 103 (void **)batch, first_index,
105 read_lock(&pag->pag_ici_lock); 104 XFS_LOOKUP_BATCH);
106 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); 105 if (!nr_found) {
107 if (!ip) { 106 read_unlock(&pag->pag_ici_lock);
108 if (exclusive)
109 write_unlock(&pag->pag_ici_lock);
110 else
111 read_unlock(&pag->pag_ici_lock);
112 break; 107 break;
113 } 108 }
114 109
115 /* execute releases pag->pag_ici_lock */ 110 /*
116 error = execute(ip, pag, flags); 111 * Grab the inodes before we drop the lock. If we found
117 if (error == EAGAIN) { 112 * nothing, nr == 0 and the loop will be skipped.
118 skipped++; 113 */
119 continue; 114 for (i = 0; i < nr_found; i++) {
115 struct xfs_inode *ip = batch[i];
116
117 if (done || xfs_inode_ag_walk_grab(ip))
118 batch[i] = NULL;
119
120 /*
121 * Update the index for the next lookup. Catch overflows
122 * into the next AG range which can occur if we have inodes
123 * in the last block of the AG and we are currently
124 * pointing to the last inode.
125 */
126 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
127 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
128 done = 1;
129 }
130
131 /* unlock now we've grabbed the inodes. */
132 read_unlock(&pag->pag_ici_lock);
133
134 for (i = 0; i < nr_found; i++) {
135 if (!batch[i])
136 continue;
137 error = execute(batch[i], pag, flags);
138 IRELE(batch[i]);
139 if (error == EAGAIN) {
140 skipped++;
141 continue;
142 }
143 if (error && last_error != EFSCORRUPTED)
144 last_error = error;
120 } 145 }
121 if (error)
122 last_error = error;
123 146
124 /* bail out if the filesystem is corrupted. */ 147 /* bail out if the filesystem is corrupted. */
125 if (error == EFSCORRUPTED) 148 if (error == EFSCORRUPTED)
126 break; 149 break;
127 150
128 } while ((*nr_to_scan)--); 151 } while (nr_found && !done);
129 152
130 if (skipped) { 153 if (skipped) {
131 delay(1); 154 delay(1);
@@ -134,110 +157,32 @@ restart:
134 return last_error; 157 return last_error;
135} 158}
136 159
137/*
138 * Select the next per-ag structure to iterate during the walk. The reclaim
139 * walk is optimised only to walk AGs with reclaimable inodes in them.
140 */
141static struct xfs_perag *
142xfs_inode_ag_iter_next_pag(
143 struct xfs_mount *mp,
144 xfs_agnumber_t *first,
145 int tag)
146{
147 struct xfs_perag *pag = NULL;
148
149 if (tag == XFS_ICI_RECLAIM_TAG) {
150 int found;
151 int ref;
152
153 spin_lock(&mp->m_perag_lock);
154 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
155 (void **)&pag, *first, 1, tag);
156 if (found <= 0) {
157 spin_unlock(&mp->m_perag_lock);
158 return NULL;
159 }
160 *first = pag->pag_agno + 1;
161 /* open coded pag reference increment */
162 ref = atomic_inc_return(&pag->pag_ref);
163 spin_unlock(&mp->m_perag_lock);
164 trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
165 } else {
166 pag = xfs_perag_get(mp, *first);
167 (*first)++;
168 }
169 return pag;
170}
171
172int 160int
173xfs_inode_ag_iterator( 161xfs_inode_ag_iterator(
174 struct xfs_mount *mp, 162 struct xfs_mount *mp,
175 int (*execute)(struct xfs_inode *ip, 163 int (*execute)(struct xfs_inode *ip,
176 struct xfs_perag *pag, int flags), 164 struct xfs_perag *pag, int flags),
177 int flags, 165 int flags)
178 int tag,
179 int exclusive,
180 int *nr_to_scan)
181{ 166{
182 struct xfs_perag *pag; 167 struct xfs_perag *pag;
183 int error = 0; 168 int error = 0;
184 int last_error = 0; 169 int last_error = 0;
185 xfs_agnumber_t ag; 170 xfs_agnumber_t ag;
186 int nr;
187 171
188 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
189 ag = 0; 172 ag = 0;
190 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { 173 while ((pag = xfs_perag_get(mp, ag))) {
191 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 174 ag = pag->pag_agno + 1;
192 exclusive, &nr); 175 error = xfs_inode_ag_walk(mp, pag, execute, flags);
193 xfs_perag_put(pag); 176 xfs_perag_put(pag);
194 if (error) { 177 if (error) {
195 last_error = error; 178 last_error = error;
196 if (error == EFSCORRUPTED) 179 if (error == EFSCORRUPTED)
197 break; 180 break;
198 } 181 }
199 if (nr <= 0)
200 break;
201 } 182 }
202 if (nr_to_scan)
203 *nr_to_scan = nr;
204 return XFS_ERROR(last_error); 183 return XFS_ERROR(last_error);
205} 184}
206 185
207/* must be called with pag_ici_lock held and releases it */
208int
209xfs_sync_inode_valid(
210 struct xfs_inode *ip,
211 struct xfs_perag *pag)
212{
213 struct inode *inode = VFS_I(ip);
214 int error = EFSCORRUPTED;
215
216 /* nothing to sync during shutdown */
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 goto out_unlock;
219
220 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
221 error = ENOENT;
222 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
223 goto out_unlock;
224
225 /* If we can't grab the inode, it must be on its way to reclaim. */
226 if (!igrab(inode))
227 goto out_unlock;
228
229 if (is_bad_inode(inode)) {
230 IRELE(ip);
231 goto out_unlock;
232 }
233
234 /* inode is valid */
235 error = 0;
236out_unlock:
237 read_unlock(&pag->pag_ici_lock);
238 return error;
239}
240
241STATIC int 186STATIC int
242xfs_sync_inode_data( 187xfs_sync_inode_data(
243 struct xfs_inode *ip, 188 struct xfs_inode *ip,
@@ -248,10 +193,6 @@ xfs_sync_inode_data(
248 struct address_space *mapping = inode->i_mapping; 193 struct address_space *mapping = inode->i_mapping;
249 int error = 0; 194 int error = 0;
250 195
251 error = xfs_sync_inode_valid(ip, pag);
252 if (error)
253 return error;
254
255 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 196 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
256 goto out_wait; 197 goto out_wait;
257 198
@@ -268,7 +209,6 @@ xfs_sync_inode_data(
268 out_wait: 209 out_wait:
269 if (flags & SYNC_WAIT) 210 if (flags & SYNC_WAIT)
270 xfs_ioend_wait(ip); 211 xfs_ioend_wait(ip);
271 IRELE(ip);
272 return error; 212 return error;
273} 213}
274 214
@@ -280,10 +220,6 @@ xfs_sync_inode_attr(
280{ 220{
281 int error = 0; 221 int error = 0;
282 222
283 error = xfs_sync_inode_valid(ip, pag);
284 if (error)
285 return error;
286
287 xfs_ilock(ip, XFS_ILOCK_SHARED); 223 xfs_ilock(ip, XFS_ILOCK_SHARED);
288 if (xfs_inode_clean(ip)) 224 if (xfs_inode_clean(ip))
289 goto out_unlock; 225 goto out_unlock;
@@ -302,7 +238,6 @@ xfs_sync_inode_attr(
302 238
303 out_unlock: 239 out_unlock:
304 xfs_iunlock(ip, XFS_ILOCK_SHARED); 240 xfs_iunlock(ip, XFS_ILOCK_SHARED);
305 IRELE(ip);
306 return error; 241 return error;
307} 242}
308 243
@@ -318,8 +253,7 @@ xfs_sync_data(
318 253
319 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 254 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
320 255
321 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 256 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
322 XFS_ICI_NO_TAG, 0, NULL);
323 if (error) 257 if (error)
324 return XFS_ERROR(error); 258 return XFS_ERROR(error);
325 259
@@ -337,8 +271,7 @@ xfs_sync_attr(
337{ 271{
338 ASSERT((flags & ~SYNC_WAIT) == 0); 272 ASSERT((flags & ~SYNC_WAIT) == 0);
339 273
340 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 274 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
341 XFS_ICI_NO_TAG, 0, NULL);
342} 275}
343 276
344STATIC int 277STATIC int
@@ -668,14 +601,11 @@ xfs_inode_set_reclaim_tag(
668 xfs_perag_put(pag); 601 xfs_perag_put(pag);
669} 602}
670 603
671void 604STATIC void
672__xfs_inode_clear_reclaim_tag( 605__xfs_inode_clear_reclaim(
673 xfs_mount_t *mp,
674 xfs_perag_t *pag, 606 xfs_perag_t *pag,
675 xfs_inode_t *ip) 607 xfs_inode_t *ip)
676{ 608{
677 radix_tree_tag_clear(&pag->pag_ici_root,
678 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
679 pag->pag_ici_reclaimable--; 609 pag->pag_ici_reclaimable--;
680 if (!pag->pag_ici_reclaimable) { 610 if (!pag->pag_ici_reclaimable) {
681 /* clear the reclaim tag from the perag radix tree */ 611 /* clear the reclaim tag from the perag radix tree */
@@ -689,6 +619,54 @@ __xfs_inode_clear_reclaim_tag(
689 } 619 }
690} 620}
691 621
622void
623__xfs_inode_clear_reclaim_tag(
624 xfs_mount_t *mp,
625 xfs_perag_t *pag,
626 xfs_inode_t *ip)
627{
628 radix_tree_tag_clear(&pag->pag_ici_root,
629 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
630 __xfs_inode_clear_reclaim(pag, ip);
631}
632
633/*
634 * Grab the inode for reclaim exclusively.
635 * Return 0 if we grabbed it, non-zero otherwise.
636 */
637STATIC int
638xfs_reclaim_inode_grab(
639 struct xfs_inode *ip,
640 int flags)
641{
642
643 /*
644 * Do some unlocked checks first to avoid unnecessary lock traffic.
645 * The first is a flush lock check, the second is an already-in-reclaim
646 * check. Only do these checks if we are not going to block on locks.
647 */
648 if ((flags & SYNC_TRYLOCK) &&
649 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
650 return 1;
651 }
652
653 /*
654 * The radix tree lock here protects a thread in xfs_iget from racing
655 * with us starting reclaim on the inode. Once we have the
656 * XFS_IRECLAIM flag set it will not touch us.
657 */
658 spin_lock(&ip->i_flags_lock);
659 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
660 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
661 /* ignore as it is already under reclaim */
662 spin_unlock(&ip->i_flags_lock);
663 return 1;
664 }
665 __xfs_iflags_set(ip, XFS_IRECLAIM);
666 spin_unlock(&ip->i_flags_lock);
667 return 0;
668}
669
692/* 670/*
693 * Inodes in different states need to be treated differently, and the return 671 * Inodes in different states need to be treated differently, and the return
694 * value of xfs_iflush is not sufficient to get this right. The following table 672 * value of xfs_iflush is not sufficient to get this right. The following table
@@ -747,23 +725,6 @@ xfs_reclaim_inode(
747{ 725{
748 int error = 0; 726 int error = 0;
749 727
750 /*
751 * The radix tree lock here protects a thread in xfs_iget from racing
752 * with us starting reclaim on the inode. Once we have the
753 * XFS_IRECLAIM flag set it will not touch us.
754 */
755 spin_lock(&ip->i_flags_lock);
756 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
757 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
758 /* ignore as it is already under reclaim */
759 spin_unlock(&ip->i_flags_lock);
760 write_unlock(&pag->pag_ici_lock);
761 return 0;
762 }
763 __xfs_iflags_set(ip, XFS_IRECLAIM);
764 spin_unlock(&ip->i_flags_lock);
765 write_unlock(&pag->pag_ici_lock);
766
767 xfs_ilock(ip, XFS_ILOCK_EXCL); 728 xfs_ilock(ip, XFS_ILOCK_EXCL);
768 if (!xfs_iflock_nowait(ip)) { 729 if (!xfs_iflock_nowait(ip)) {
769 if (!(sync_mode & SYNC_WAIT)) 730 if (!(sync_mode & SYNC_WAIT))
@@ -838,6 +799,7 @@ reclaim:
838 if (!radix_tree_delete(&pag->pag_ici_root, 799 if (!radix_tree_delete(&pag->pag_ici_root,
839 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 800 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
840 ASSERT(0); 801 ASSERT(0);
802 __xfs_inode_clear_reclaim(pag, ip);
841 write_unlock(&pag->pag_ici_lock); 803 write_unlock(&pag->pag_ici_lock);
842 804
843 /* 805 /*
@@ -859,13 +821,126 @@ reclaim:
859 821
860} 822}
861 823
824/*
825 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
826 * corrupted, we still want to try to reclaim all the inodes. If we don't,
827 * then a shutdown during the filesystem unmount reclaim walk will leak
828 * all the unreclaimed inodes.
829 */
830int
831xfs_reclaim_inodes_ag(
832 struct xfs_mount *mp,
833 int flags,
834 int *nr_to_scan)
835{
836 struct xfs_perag *pag;
837 int error = 0;
838 int last_error = 0;
839 xfs_agnumber_t ag;
840 int trylock = flags & SYNC_TRYLOCK;
841 int skipped;
842
843restart:
844 ag = 0;
845 skipped = 0;
846 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
847 unsigned long first_index = 0;
848 int done = 0;
849 int nr_found = 0;
850
851 ag = pag->pag_agno + 1;
852
853 if (trylock) {
854 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
855 skipped++;
856 continue;
857 }
858 first_index = pag->pag_ici_reclaim_cursor;
859 } else
860 mutex_lock(&pag->pag_ici_reclaim_lock);
861
862 do {
863 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
864 int i;
865
866 write_lock(&pag->pag_ici_lock);
867 nr_found = radix_tree_gang_lookup_tag(
868 &pag->pag_ici_root,
869 (void **)batch, first_index,
870 XFS_LOOKUP_BATCH,
871 XFS_ICI_RECLAIM_TAG);
872 if (!nr_found) {
873 write_unlock(&pag->pag_ici_lock);
874 break;
875 }
876
877 /*
878 * Grab the inodes before we drop the lock. If we found
879 * nothing, nr == 0 and the loop will be skipped.
880 */
881 for (i = 0; i < nr_found; i++) {
882 struct xfs_inode *ip = batch[i];
883
884 if (done || xfs_reclaim_inode_grab(ip, flags))
885 batch[i] = NULL;
886
887 /*
888 * Update the index for the next lookup. Catch
889 * overflows into the next AG range which can
890 * occur if we have inodes in the last block of
891 * the AG and we are currently pointing to the
892 * last inode.
893 */
894 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
895 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
896 done = 1;
897 }
898
899 /* unlock now we've grabbed the inodes. */
900 write_unlock(&pag->pag_ici_lock);
901
902 for (i = 0; i < nr_found; i++) {
903 if (!batch[i])
904 continue;
905 error = xfs_reclaim_inode(batch[i], pag, flags);
906 if (error && last_error != EFSCORRUPTED)
907 last_error = error;
908 }
909
910 *nr_to_scan -= XFS_LOOKUP_BATCH;
911
912 } while (nr_found && !done && *nr_to_scan > 0);
913
914 if (trylock && !done)
915 pag->pag_ici_reclaim_cursor = first_index;
916 else
917 pag->pag_ici_reclaim_cursor = 0;
918 mutex_unlock(&pag->pag_ici_reclaim_lock);
919 xfs_perag_put(pag);
920 }
921
922 /*
923 * If we skipped any AG, and we still have scan count remaining, do
924 * another pass, this time using blocking reclaim semantics (i.e.
925 * waiting on the reclaim locks and ignoring the reclaim cursors). This
926 * ensures that when we get more reclaimers than AGs we block rather
927 * than spin trying to execute reclaim.
928 */
929 if (trylock && skipped && *nr_to_scan > 0) {
930 trylock = 0;
931 goto restart;
932 }
933 return XFS_ERROR(last_error);
934}
935
862int 936int
863xfs_reclaim_inodes( 937xfs_reclaim_inodes(
864 xfs_mount_t *mp, 938 xfs_mount_t *mp,
865 int mode) 939 int mode)
866{ 940{
867 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, 941 int nr_to_scan = INT_MAX;
868 XFS_ICI_RECLAIM_TAG, 1, NULL); 942
943 return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
869} 944}
870 945
871/* 946/*
@@ -887,17 +962,16 @@ xfs_reclaim_inode_shrink(
887 if (!(gfp_mask & __GFP_FS)) 962 if (!(gfp_mask & __GFP_FS))
888 return -1; 963 return -1;
889 964
890 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, 965 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
891 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); 966 /* terminate if we don't exhaust the scan */
892 /* if we don't exhaust the scan, don't bother coming back */
893 if (nr_to_scan > 0) 967 if (nr_to_scan > 0)
894 return -1; 968 return -1;
895 } 969 }
896 970
897 reclaimable = 0; 971 reclaimable = 0;
898 ag = 0; 972 ag = 0;
899 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, 973 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
900 XFS_ICI_RECLAIM_TAG))) { 974 ag = pag->pag_agno + 1;
901 reclaimable += pag->pag_ici_reclaimable; 975 reclaimable += pag->pag_ici_reclaimable;
902 xfs_perag_put(pag); 976 xfs_perag_put(pag);
903 } 977 }
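
Both walkers in xfs_sync.c now share the same two-phase batching shape: gang-look-up to XFS_LOOKUP_BATCH inodes from the per-AG radix tree while holding pag_ici_lock, grab a reference to each and advance the lookup index (watching for agino overflow into the next AG), then drop the lock before doing the per-inode work. Stripped to a skeleton (a sketch; shutdown checks, tagging and full error handling omitted):

	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int i;

		read_lock(&pag->pag_ici_lock);
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
				(void **)batch, first_index,
				XFS_LOOKUP_BATCH);
		if (!nr_found) {
			read_unlock(&pag->pag_ici_lock);
			break;
		}

		/* phase 1: grab references while the tree cannot change */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || xfs_inode_ag_walk_grab(ip))
				batch[i] = NULL;

			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = 1;	/* wrapped into the next AG */
		}
		read_unlock(&pag->pag_ici_lock);

		/* phase 2: the expensive work, with the lock dropped */
		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			error = execute(batch[i], pag, flags);
			IRELE(batch[i]);
		}
	} while (nr_found && !done);

The reclaim variant differs only in taking pag_ici_lock in write mode, filtering by XFS_ICI_RECLAIM_TAG, and saving first_index in pag->pag_ici_reclaim_cursor so concurrent non-blocking reclaimers resume where the previous one stopped.
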
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index fe78726196f8..32ba6628290c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -47,10 +47,10 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
47void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 47void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
48 struct xfs_inode *ip); 48 struct xfs_inode *ip);
49 49
50int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 50int xfs_sync_inode_grab(struct xfs_inode *ip);
51int xfs_inode_ag_iterator(struct xfs_mount *mp, 51int xfs_inode_ag_iterator(struct xfs_mount *mp,
52 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 52 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
53 int flags, int tag, int write_lock, int *nr_to_scan); 53 int flags);
54 54
55void xfs_inode_shrinker_register(struct xfs_mount *mp); 55void xfs_inode_shrinker_register(struct xfs_mount *mp);
56void xfs_inode_shrinker_unregister(struct xfs_mount *mp); 56void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index be5dffd282a1..acef2e98c594 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name, \
124 unsigned long caller_ip), \ 124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip)) 125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get); 126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); 127DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
128DEFINE_PERAG_REF_EVENT(xfs_perag_put); 128DEFINE_PERAG_REF_EVENT(xfs_perag_put);
129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); 129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); 130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
@@ -325,13 +325,12 @@ DEFINE_BUF_EVENT(xfs_buf_lock);
325DEFINE_BUF_EVENT(xfs_buf_lock_done); 325DEFINE_BUF_EVENT(xfs_buf_lock_done);
326DEFINE_BUF_EVENT(xfs_buf_cond_lock); 326DEFINE_BUF_EVENT(xfs_buf_cond_lock);
327DEFINE_BUF_EVENT(xfs_buf_unlock); 327DEFINE_BUF_EVENT(xfs_buf_unlock);
328DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
329DEFINE_BUF_EVENT(xfs_buf_iowait); 328DEFINE_BUF_EVENT(xfs_buf_iowait);
330DEFINE_BUF_EVENT(xfs_buf_iowait_done); 329DEFINE_BUF_EVENT(xfs_buf_iowait_done);
331DEFINE_BUF_EVENT(xfs_buf_delwri_queue); 330DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
332DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); 331DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
333DEFINE_BUF_EVENT(xfs_buf_delwri_split); 332DEFINE_BUF_EVENT(xfs_buf_delwri_split);
334DEFINE_BUF_EVENT(xfs_buf_get_noaddr); 333DEFINE_BUF_EVENT(xfs_buf_get_uncached);
335DEFINE_BUF_EVENT(xfs_bdstrat_shut); 334DEFINE_BUF_EVENT(xfs_bdstrat_shut);
336DEFINE_BUF_EVENT(xfs_buf_item_relse); 335DEFINE_BUF_EVENT(xfs_buf_item_relse);
337DEFINE_BUF_EVENT(xfs_buf_item_iodone); 336DEFINE_BUF_EVENT(xfs_buf_item_iodone);
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h
deleted file mode 100644
index f8d279d7563a..000000000000
--- a/fs/xfs/linux-2.6/xfs_version.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VERSION_H__
19#define __XFS_VERSION_H__
20
21/*
22 * Dummy file that can contain a timestamp to put into the
23 * XFS init string, to help users keep track of what they're
24 * running
25 */
26
27#define XFS_VERSION_STRING "SGI XFS"
28
29#endif /* __XFS_VERSION_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index e1a2f6800e01..faf8e1a83a12 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -463,87 +463,68 @@ xfs_qm_dqtobp(
463 uint flags) 463 uint flags)
464{ 464{
465 xfs_bmbt_irec_t map; 465 xfs_bmbt_irec_t map;
466 int nmaps, error; 466 int nmaps = 1, error;
467 xfs_buf_t *bp; 467 xfs_buf_t *bp;
468 xfs_inode_t *quotip; 468 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
469 xfs_mount_t *mp; 469 xfs_mount_t *mp = dqp->q_mount;
470 xfs_disk_dquot_t *ddq; 470 xfs_disk_dquot_t *ddq;
471 xfs_dqid_t id; 471 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
472 boolean_t newdquot;
473 xfs_trans_t *tp = (tpp ? *tpp : NULL); 472 xfs_trans_t *tp = (tpp ? *tpp : NULL);
474 473
475 mp = dqp->q_mount; 474 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
476 id = be32_to_cpu(dqp->q_core.d_id);
477 nmaps = 1;
478 newdquot = B_FALSE;
479 475
480 /* 476 xfs_ilock(quotip, XFS_ILOCK_SHARED);
481 * If we don't know where the dquot lives, find out. 477 if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
482 */
483 if (dqp->q_blkno == (xfs_daddr_t) 0) {
484 /* We use the id as an index */
485 dqp->q_fileoffset = (xfs_fileoff_t)id /
486 mp->m_quotainfo->qi_dqperchunk;
487 nmaps = 1;
488 quotip = XFS_DQ_TO_QIP(dqp);
489 xfs_ilock(quotip, XFS_ILOCK_SHARED);
490 /* 478 /*
491 * Return if this type of quotas is turned off while we didn't 479 * Return if this quota type was turned off while we
492 * have an inode lock 480 * didn't hold the quota inode lock.
493 */ 481 */
494 if (XFS_IS_THIS_QUOTA_OFF(dqp)) { 482 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
495 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 483 return ESRCH;
496 return (ESRCH); 484 }
497 } 485
486 /*
487 * Find the block map; no allocations yet
488 */
489 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
490 XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
491 NULL, 0, &map, &nmaps, NULL);
492
493 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
494 if (error)
495 return error;
496
497 ASSERT(nmaps == 1);
498 ASSERT(map.br_blockcount == 1);
499
500 /*
501 * Offset of dquot in the (fixed sized) dquot chunk.
502 */
503 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
504 sizeof(xfs_dqblk_t);
505
506 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
507 if (map.br_startblock == HOLESTARTBLOCK) {
498 /* 508 /*
499 * Find the block map; no allocations yet 509 * We don't allocate unless we're asked to
500 */ 510 */
501 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, 511 if (!(flags & XFS_QMOPT_DQALLOC))
502 XFS_DQUOT_CLUSTER_SIZE_FSB, 512 return ENOENT;
503 XFS_BMAPI_METADATA,
504 NULL, 0, &map, &nmaps, NULL);
505 513
506 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 514 ASSERT(tp);
515 error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
516 dqp->q_fileoffset, &bp);
507 if (error) 517 if (error)
508 return (error); 518 return error;
509 ASSERT(nmaps == 1); 519 tp = *tpp;
510 ASSERT(map.br_blockcount == 1); 520 } else {
521 trace_xfs_dqtobp_read(dqp);
511 522
512 /* 523 /*
513 * offset of dquot in the (fixed sized) dquot chunk. 524 * store the blkno etc so that we don't have to do the
525 * mapping all the time
514 */ 526 */
515 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * 527 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
516 sizeof(xfs_dqblk_t);
517 if (map.br_startblock == HOLESTARTBLOCK) {
518 /*
519 * We don't allocate unless we're asked to
520 */
521 if (!(flags & XFS_QMOPT_DQALLOC))
522 return (ENOENT);
523
524 ASSERT(tp);
525 if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
526 dqp->q_fileoffset, &bp)))
527 return (error);
528 tp = *tpp;
529 newdquot = B_TRUE;
530 } else {
531 /*
532 * store the blkno etc so that we don't have to do the
533 * mapping all the time
534 */
535 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
536 }
537 }
538 ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
539 ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
540
541 /*
542 * Read in the buffer, unless we've just done the allocation
543 * (in which case we already have the buf).
544 */
545 if (!newdquot) {
546 trace_xfs_dqtobp_read(dqp);
547 528
548 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 529 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
549 dqp->q_blkno, 530 dqp->q_blkno,
@@ -552,13 +533,14 @@ xfs_qm_dqtobp(
552 if (error || !bp) 533 if (error || !bp)
553 return XFS_ERROR(error); 534 return XFS_ERROR(error);
554 } 535 }
536
555 ASSERT(XFS_BUF_ISBUSY(bp)); 537 ASSERT(XFS_BUF_ISBUSY(bp));
556 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 538 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
557 539
558 /* 540 /*
559 * calculate the location of the dquot inside the buffer. 541 * calculate the location of the dquot inside the buffer.
560 */ 542 */
561 ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset); 543 ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
562 544
563 /* 545 /*
564 * A simple sanity check in case we got a corrupted dquot... 546 * A simple sanity check in case we got a corrupted dquot...
@@ -1176,18 +1158,18 @@ xfs_qm_dqflush(
1176 xfs_dquot_t *dqp, 1158 xfs_dquot_t *dqp,
1177 uint flags) 1159 uint flags)
1178{ 1160{
1179 xfs_mount_t *mp; 1161 struct xfs_mount *mp = dqp->q_mount;
1180 xfs_buf_t *bp; 1162 struct xfs_buf *bp;
1181 xfs_disk_dquot_t *ddqp; 1163 struct xfs_disk_dquot *ddqp;
1182 int error; 1164 int error;
1183 1165
1184 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1166 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1185 ASSERT(!completion_done(&dqp->q_flush)); 1167 ASSERT(!completion_done(&dqp->q_flush));
1168
1186 trace_xfs_dqflush(dqp); 1169 trace_xfs_dqflush(dqp);
1187 1170
1188 /* 1171 /*
1189 * If not dirty, or it's pinned and we are not supposed to 1172 * If not dirty, or it's pinned and we are not supposed to block, nada.
1190 * block, nada.
1191 */ 1173 */
1192 if (!XFS_DQ_IS_DIRTY(dqp) || 1174 if (!XFS_DQ_IS_DIRTY(dqp) ||
1193 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { 1175 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
@@ -1201,40 +1183,46 @@ xfs_qm_dqflush(
1201 * down forcibly. If that's the case we must not write this dquot 1183 * down forcibly. If that's the case we must not write this dquot
1202 * to disk, because the log record didn't make it to disk! 1184 * to disk, because the log record didn't make it to disk!
1203 */ 1185 */
1204 if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) { 1186 if (XFS_FORCED_SHUTDOWN(mp)) {
1205 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1187 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1206 xfs_dqfunlock(dqp); 1188 xfs_dqfunlock(dqp);
1207 return XFS_ERROR(EIO); 1189 return XFS_ERROR(EIO);
1208 } 1190 }
1209 1191
1210 /* 1192 /*
1211 * Get the buffer containing the on-disk dquot 1193 * Get the buffer containing the on-disk dquot
1212 * We don't need a transaction envelope because we know that
1213 * the ondisk dquot has already been allocated for.
1214 */ 1194 */
1215 if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { 1195 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
1196 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
1197 if (error) {
1216 ASSERT(error != ENOENT); 1198 ASSERT(error != ENOENT);
1217 /*
1218 * Quotas could have gotten turned off (ESRCH)
1219 */
1220 xfs_dqfunlock(dqp); 1199 xfs_dqfunlock(dqp);
1221 return (error); 1200 return error;
1222 } 1201 }
1223 1202
1224 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 1203 /*
1225 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { 1204 * Calculate the location of the dquot inside the buffer.
1226 xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE); 1205 */
1206 ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
1207
1208 /*
1209 * A simple sanity check in case we got a corrupted dquot..
1210 */
1211 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1212 XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
1213 xfs_buf_relse(bp);
1214 xfs_dqfunlock(dqp);
1215 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1227 return XFS_ERROR(EIO); 1216 return XFS_ERROR(EIO);
1228 } 1217 }
1229 1218
1230 /* This is the only portion of data that needs to persist */ 1219 /* This is the only portion of data that needs to persist */
1231 memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t)); 1220 memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
1232 1221
1233 /* 1222 /*
1234 * Clear the dirty field and remember the flush lsn for later use. 1223 * Clear the dirty field and remember the flush lsn for later use.
1235 */ 1224 */
1236 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1225 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1237 mp = dqp->q_mount;
1238 1226
1239 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, 1227 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
1240 &dqp->q_logitem.qli_item.li_lsn); 1228 &dqp->q_logitem.qli_item.li_lsn);
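
The rewritten xfs_qm_dqtobp() computes the dquot's location eagerly instead of only when q_blkno is unset: the dquot id indexes the quota inode's data fork, so both offsets fall out of simple division (a restatement of the arithmetic the function now performs up front):

	/* qi_dqperchunk: dquots per XFS_DQUOT_CLUSTER_SIZE_FSB chunk,
	 * computed when the quota manager is set up */
	dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
	dqp->q_bufoffset  = (id % mp->m_quotainfo->qi_dqperchunk) *
				sizeof(xfs_dqblk_t);

With q_blkno and q_bufoffset always valid, xfs_qm_dqflush() can read the buffer directly via xfs_trans_read_buf() at dqp->q_blkno instead of re-running the whole dqtobp() mapping, which is why the dqflush hunk above drops the xfs_qm_dqtobp() call.
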
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 9a92407109a1..f8e854b4fde8 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,8 +55,6 @@ uint ndquot;
55kmem_zone_t *qm_dqzone; 55kmem_zone_t *qm_dqzone;
56kmem_zone_t *qm_dqtrxzone; 56kmem_zone_t *qm_dqtrxzone;
57 57
58static cred_t xfs_zerocr;
59
60STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 58STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
61STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 59STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
62 60
@@ -837,7 +835,7 @@ xfs_qm_dqattach_locked(
837 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, 835 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
838 flags & XFS_QMOPT_DQALLOC, 836 flags & XFS_QMOPT_DQALLOC,
839 ip->i_udquot, &ip->i_gdquot) : 837 ip->i_udquot, &ip->i_gdquot) :
840 xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, 838 xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
841 flags & XFS_QMOPT_DQALLOC, 839 flags & XFS_QMOPT_DQALLOC,
842 ip->i_udquot, &ip->i_gdquot); 840 ip->i_udquot, &ip->i_gdquot);
843 /* 841 /*
@@ -1199,87 +1197,6 @@ xfs_qm_list_destroy(
1199 mutex_destroy(&(list->qh_lock)); 1197 mutex_destroy(&(list->qh_lock));
1200} 1198}
1201 1199
1202
1203/*
1204 * Stripped down version of dqattach. This doesn't attach, or even look at the
1205 * dquots attached to the inode. The rationale is that there won't be any
1206 * attached at the time this is called from quotacheck.
1207 */
1208STATIC int
1209xfs_qm_dqget_noattach(
1210 xfs_inode_t *ip,
1211 xfs_dquot_t **O_udqpp,
1212 xfs_dquot_t **O_gdqpp)
1213{
1214 int error;
1215 xfs_mount_t *mp;
1216 xfs_dquot_t *udqp, *gdqp;
1217
1218 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1219 mp = ip->i_mount;
1220 udqp = NULL;
1221 gdqp = NULL;
1222
1223 if (XFS_IS_UQUOTA_ON(mp)) {
1224 ASSERT(ip->i_udquot == NULL);
1225 /*
1226 * We want the dquot allocated if it doesn't exist.
1227 */
1228 if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
1229 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
1230 &udqp))) {
1231 /*
1232 * Shouldn't be able to turn off quotas here.
1233 */
1234 ASSERT(error != ESRCH);
1235 ASSERT(error != ENOENT);
1236 return error;
1237 }
1238 ASSERT(udqp);
1239 }
1240
1241 if (XFS_IS_OQUOTA_ON(mp)) {
1242 ASSERT(ip->i_gdquot == NULL);
1243 if (udqp)
1244 xfs_dqunlock(udqp);
1245 error = XFS_IS_GQUOTA_ON(mp) ?
1246 xfs_qm_dqget(mp, ip,
1247 ip->i_d.di_gid, XFS_DQ_GROUP,
1248 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1249 &gdqp) :
1250 xfs_qm_dqget(mp, ip,
1251 ip->i_d.di_projid, XFS_DQ_PROJ,
1252 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1253 &gdqp);
1254 if (error) {
1255 if (udqp)
1256 xfs_qm_dqrele(udqp);
1257 ASSERT(error != ESRCH);
1258 ASSERT(error != ENOENT);
1259 return error;
1260 }
1261 ASSERT(gdqp);
1262
1263 /* Reacquire the locks in the right order */
1264 if (udqp) {
1265 if (! xfs_qm_dqlock_nowait(udqp)) {
1266 xfs_dqunlock(gdqp);
1267 xfs_dqlock(udqp);
1268 xfs_dqlock(gdqp);
1269 }
1270 }
1271 }
1272
1273 *O_udqpp = udqp;
1274 *O_gdqpp = gdqp;
1275
1276#ifdef QUOTADEBUG
1277 if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
1278 if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
1279#endif
1280 return 0;
1281}
1282
1283/* 1200/*
1284 * Create an inode and return with a reference already taken, but unlocked 1201 * Create an inode and return with a reference already taken, but unlocked
1285 * This is how we create quota inodes 1202 * This is how we create quota inodes
@@ -1305,8 +1222,8 @@ xfs_qm_qino_alloc(
1305 return error; 1222 return error;
1306 } 1223 }
1307 1224
1308 if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 1225 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
1309 &xfs_zerocr, 0, 1, ip, &committed))) { 1226 if (error) {
1310 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 1227 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
1311 XFS_TRANS_ABORT); 1228 XFS_TRANS_ABORT);
1312 return error; 1229 return error;
@@ -1516,7 +1433,7 @@ xfs_qm_dqiterate(
1516 rablkcnt = map[i+1].br_blockcount; 1433 rablkcnt = map[i+1].br_blockcount;
1517 rablkno = map[i+1].br_startblock; 1434 rablkno = map[i+1].br_startblock;
1518 while (rablkcnt--) { 1435 while (rablkcnt--) {
1519 xfs_baread(mp->m_ddev_targp, 1436 xfs_buf_readahead(mp->m_ddev_targp,
1520 XFS_FSB_TO_DADDR(mp, rablkno), 1437 XFS_FSB_TO_DADDR(mp, rablkno),
1521 mp->m_quotainfo->qi_dqchunklen); 1438 mp->m_quotainfo->qi_dqchunklen);
1522 rablkno++; 1439 rablkno++;
@@ -1546,18 +1463,34 @@ xfs_qm_dqiterate(
1546 1463
1547/* 1464/*
1548 * Called by dqusage_adjust in doing a quotacheck. 1465 * Called by dqusage_adjust in doing a quotacheck.
1549 * Given the inode, and a dquot (either USR or GRP, doesn't matter), 1466 *
1550 * this updates its incore copy as well as the buffer copy. This is 1467 * Given the inode and a dquot id, this updates both the incore dquot as well
1551 * so that once the quotacheck is done, we can just log all the buffers, 1468 * as the buffer copy. This is so that once the quotacheck is done, we can
1552 * as opposed to logging numerous updates to individual dquots. 1469 * just log all the buffers, as opposed to logging numerous updates to
1470 * individual dquots.
1553 */ 1471 */
1554STATIC void 1472STATIC int
1555xfs_qm_quotacheck_dqadjust( 1473xfs_qm_quotacheck_dqadjust(
1556 xfs_dquot_t *dqp, 1474 struct xfs_inode *ip,
1475 xfs_dqid_t id,
1476 uint type,
1557 xfs_qcnt_t nblks, 1477 xfs_qcnt_t nblks,
1558 xfs_qcnt_t rtblks) 1478 xfs_qcnt_t rtblks)
1559{ 1479{
1560 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1480 struct xfs_mount *mp = ip->i_mount;
1481 struct xfs_dquot *dqp;
1482 int error;
1483
1484 error = xfs_qm_dqget(mp, ip, id, type,
1485 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
1486 if (error) {
1487 /*
1488 * Shouldn't be able to turn off quotas here.
1489 */
1490 ASSERT(error != ESRCH);
1491 ASSERT(error != ENOENT);
1492 return error;
1493 }
1561 1494
1562 trace_xfs_dqadjust(dqp); 1495 trace_xfs_dqadjust(dqp);
1563 1496
@@ -1582,11 +1515,13 @@ xfs_qm_quotacheck_dqadjust(
1582 * There are no timers for the default values set in the root dquot. 1515 * There are no timers for the default values set in the root dquot.
1583 */ 1516 */
1584 if (dqp->q_core.d_id) { 1517 if (dqp->q_core.d_id) {
1585 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); 1518 xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
1586 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); 1519 xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
1587 } 1520 }
1588 1521
1589 dqp->dq_flags |= XFS_DQ_DIRTY; 1522 dqp->dq_flags |= XFS_DQ_DIRTY;
1523 xfs_qm_dqput(dqp);
1524 return 0;
1590} 1525}
1591 1526
1592STATIC int 1527STATIC int
@@ -1629,8 +1564,7 @@ xfs_qm_dqusage_adjust(
1629 int *res) /* result code value */ 1564 int *res) /* result code value */
1630{ 1565{
1631 xfs_inode_t *ip; 1566 xfs_inode_t *ip;
1632 xfs_dquot_t *udqp, *gdqp; 1567 xfs_qcnt_t nblks, rtblks = 0;
1633 xfs_qcnt_t nblks, rtblks;
1634 int error; 1568 int error;
1635 1569
1636 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1570 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -1650,51 +1584,24 @@ xfs_qm_dqusage_adjust(
1650 * the case in all other instances. It's OK that we do this because 1584 * the case in all other instances. It's OK that we do this because
1651 * quotacheck is done only at mount time. 1585 * quotacheck is done only at mount time.
1652 */ 1586 */
1653 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) { 1587 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
1588 if (error) {
1654 *res = BULKSTAT_RV_NOTHING; 1589 *res = BULKSTAT_RV_NOTHING;
1655 return error; 1590 return error;
1656 } 1591 }
1657 1592
1658 /* 1593 ASSERT(ip->i_delayed_blks == 0);
1659 * Obtain the locked dquots. In case of an error (eg. allocation
1660 * fails for ENOSPC), we return the negative of the error number
1661 * to bulkstat, so that it can get propagated to quotacheck() and
1662 * making us disable quotas for the file system.
1663 */
1664 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
1665 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1666 IRELE(ip);
1667 *res = BULKSTAT_RV_GIVEUP;
1668 return error;
1669 }
1670 1594
1671 rtblks = 0; 1595 if (XFS_IS_REALTIME_INODE(ip)) {
1672 if (! XFS_IS_REALTIME_INODE(ip)) {
1673 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
1674 } else {
1675 /* 1596 /*
1676 * Walk thru the extent list and count the realtime blocks. 1597 * Walk thru the extent list and count the realtime blocks.
1677 */ 1598 */
1678 if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { 1599 error = xfs_qm_get_rtblks(ip, &rtblks);
1679 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1600 if (error)
1680 IRELE(ip); 1601 goto error0;
1681 if (udqp)
1682 xfs_qm_dqput(udqp);
1683 if (gdqp)
1684 xfs_qm_dqput(gdqp);
1685 *res = BULKSTAT_RV_GIVEUP;
1686 return error;
1687 }
1688 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1689 } 1602 }
1690 ASSERT(ip->i_delayed_blks == 0);
1691 1603
1692 /* 1604 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1693 * We can't release the inode while holding its dquot locks.
1694 * The inode can go into inactive and might try to acquire the dquotlocks.
1695 * So, just unlock here and do a vn_rele at the end.
1696 */
1697 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1698 1605
1699 /* 1606 /*
1700 * Add the (disk blocks and inode) resources occupied by this 1607 * Add the (disk blocks and inode) resources occupied by this
@@ -1709,26 +1616,36 @@ xfs_qm_dqusage_adjust(
1709 * and quotaoffs don't race. (Quotachecks happen at mount time only). 1616 * and quotaoffs don't race. (Quotachecks happen at mount time only).
1710 */ 1617 */
1711 if (XFS_IS_UQUOTA_ON(mp)) { 1618 if (XFS_IS_UQUOTA_ON(mp)) {
1712 ASSERT(udqp); 1619 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
1713 xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks); 1620 XFS_DQ_USER, nblks, rtblks);
1714 xfs_qm_dqput(udqp); 1621 if (error)
1622 goto error0;
1715 } 1623 }
1716 if (XFS_IS_OQUOTA_ON(mp)) { 1624
1717 ASSERT(gdqp); 1625 if (XFS_IS_GQUOTA_ON(mp)) {
1718 xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks); 1626 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
1719 xfs_qm_dqput(gdqp); 1627 XFS_DQ_GROUP, nblks, rtblks);
1628 if (error)
1629 goto error0;
1720 } 1630 }
1721 /*
1722 * Now release the inode. This will send it to 'inactive', and
1723 * possibly even free blocks.
1724 */
1725 IRELE(ip);
1726 1631
1727 /* 1632 if (XFS_IS_PQUOTA_ON(mp)) {
1728 * Goto next inode. 1633 error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
1729 */ 1634 XFS_DQ_PROJ, nblks, rtblks);
1635 if (error)
1636 goto error0;
1637 }
1638
1639 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1640 IRELE(ip);
1730 *res = BULKSTAT_RV_DIDONE; 1641 *res = BULKSTAT_RV_DIDONE;
1731 return 0; 1642 return 0;
1643
1644error0:
1645 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1646 IRELE(ip);
1647 *res = BULKSTAT_RV_GIVEUP;
1648 return error;
1732} 1649}
1733 1650
1734/* 1651/*
@@ -2224,7 +2141,7 @@ xfs_qm_write_sb_changes(
2224 2141
2225 2142
2226/* 2143/*
2227 * Given an inode, a uid and gid (from cred_t) make sure that we have 2144 * Given an inode, a uid, gid and prid, make sure that we have
2228 * allocated relevant dquot(s) on disk, and that we won't exceed inode 2145 * allocated relevant dquot(s) on disk, and that we won't exceed inode
2229 * quotas by creating this file. 2146 * quotas by creating this file.
2230 * This also attaches dquot(s) to the given inode after locking it, 2147 * This also attaches dquot(s) to the given inode after locking it,
@@ -2332,7 +2249,7 @@ xfs_qm_vop_dqalloc(
2332 xfs_dqunlock(gq); 2249 xfs_dqunlock(gq);
2333 } 2250 }
2334 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { 2251 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
2335 if (ip->i_d.di_projid != prid) { 2252 if (xfs_get_projid(ip) != prid) {
2336 xfs_iunlock(ip, lockflags); 2253 xfs_iunlock(ip, lockflags);
2337 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, 2254 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
2338 XFS_DQ_PROJ, 2255 XFS_DQ_PROJ,
@@ -2454,7 +2371,7 @@ xfs_qm_vop_chown_reserve(
2454 } 2371 }
2455 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { 2372 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
2456 if (XFS_IS_PQUOTA_ON(ip->i_mount) && 2373 if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
2457 ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id)) 2374 xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
2458 prjflags = XFS_QMOPT_ENOSPC; 2375 prjflags = XFS_QMOPT_ENOSPC;
2459 2376
2460 if (prjflags || 2377 if (prjflags ||
@@ -2558,7 +2475,7 @@ xfs_qm_vop_create_dqattach(
2558 ip->i_gdquot = gdqp; 2475 ip->i_gdquot = gdqp;
2559 ASSERT(XFS_IS_OQUOTA_ON(mp)); 2476 ASSERT(XFS_IS_OQUOTA_ON(mp));
2560 ASSERT((XFS_IS_GQUOTA_ON(mp) ? 2477 ASSERT((XFS_IS_GQUOTA_ON(mp) ?
2561 ip->i_d.di_gid : ip->i_d.di_projid) == 2478 ip->i_d.di_gid : xfs_get_projid(ip)) ==
2562 be32_to_cpu(gdqp->q_core.d_id)); 2479 be32_to_cpu(gdqp->q_core.d_id));
2563 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 2480 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
2564 } 2481 }
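
The rework above collapses quotacheck's dquot handling into one helper: instead of pre-attaching locked user and group dquots to the inode, each quota type is now fetched by id, adjusted, dirtied, and released in turn, so all three quota types share a single path. A condensed, illustrative sketch of that per-dquot lifecycle (the counter names are assumed from the q_core disk-dquot layout; this is not runnable outside the kernel and is not quoted from the patch):

	error = xfs_qm_dqget(mp, ip, id, type,
			     XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
	if (error)
		return error;		/* propagates up as BULKSTAT_RV_GIVEUP */

	be64_add_cpu(&dqp->q_core.d_icount, 1);		/* one more inode */
	be64_add_cpu(&dqp->q_core.d_bcount, nblks);	/* data blocks */
	be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks);	/* realtime blocks */

	dqp->dq_flags |= XFS_DQ_DIRTY;	/* the buffer gets logged once, later */
	xfs_qm_dqput(dqp);
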
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bea02d786c5d..45b5cb1788ab 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -81,7 +81,7 @@ xfs_qm_statvfs(
81 xfs_mount_t *mp = ip->i_mount; 81 xfs_mount_t *mp = ip->i_mount;
82 xfs_dquot_t *dqp; 82 xfs_dquot_t *dqp;
83 83
84 if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) { 84 if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); 85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
86 xfs_qm_dqput(dqp); 86 xfs_qm_dqput(dqp);
87 } 87 }
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 45e5849df238..bdebc183223e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -276,7 +276,7 @@ xfs_qm_scall_trunc_qfile(
276 goto out_unlock; 276 goto out_unlock;
277 } 277 }
278 278
279 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 279 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
281 281
282out_unlock: 282out_unlock:
@@ -875,21 +875,14 @@ xfs_dqrele_inode(
875 struct xfs_perag *pag, 875 struct xfs_perag *pag,
876 int flags) 876 int flags)
877{ 877{
878 int error;
879
880 /* skip quota inodes */ 878 /* skip quota inodes */
881 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 879 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
882 ip == ip->i_mount->m_quotainfo->qi_gquotaip) { 880 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
883 ASSERT(ip->i_udquot == NULL); 881 ASSERT(ip->i_udquot == NULL);
884 ASSERT(ip->i_gdquot == NULL); 882 ASSERT(ip->i_gdquot == NULL);
885 read_unlock(&pag->pag_ici_lock);
886 return 0; 883 return 0;
887 } 884 }
888 885
889 error = xfs_sync_inode_valid(ip, pag);
890 if (error)
891 return error;
892
893 xfs_ilock(ip, XFS_ILOCK_EXCL); 886 xfs_ilock(ip, XFS_ILOCK_EXCL);
894 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { 887 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
895 xfs_qm_dqrele(ip->i_udquot); 888 xfs_qm_dqrele(ip->i_udquot);
@@ -900,8 +893,6 @@ xfs_dqrele_inode(
900 ip->i_gdquot = NULL; 893 ip->i_gdquot = NULL;
901 } 894 }
902 xfs_iunlock(ip, XFS_ILOCK_EXCL); 895 xfs_iunlock(ip, XFS_ILOCK_EXCL);
903
904 IRELE(ip);
905 return 0; 896 return 0;
906} 897}
907 898
@@ -918,8 +909,7 @@ xfs_qm_dqrele_all_inodes(
918 uint flags) 909 uint flags)
919{ 910{
920 ASSERT(mp->m_quotainfo); 911 ASSERT(mp->m_quotainfo);
921 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, 912 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
922 XFS_ICI_NO_TAG, 0, NULL);
923} 913}
924 914
925/*------------------------------------------------------------------------*/ 915/*------------------------------------------------------------------------*/
@@ -1175,7 +1165,7 @@ xfs_qm_internalqcheck_adjust(
1175 } 1165 }
1176 xfs_qm_internalqcheck_get_dquots(mp, 1166 xfs_qm_internalqcheck_get_dquots(mp,
1177 (xfs_dqid_t) ip->i_d.di_uid, 1167 (xfs_dqid_t) ip->i_d.di_uid,
1178 (xfs_dqid_t) ip->i_d.di_projid, 1168 (xfs_dqid_t) xfs_get_projid(ip),
1179 (xfs_dqid_t) ip->i_d.di_gid, 1169 (xfs_dqid_t) ip->i_d.di_gid,
1180 &ud, &gd); 1170 &ud, &gd);
1181 if (XFS_IS_UQUOTA_ON(mp)) { 1171 if (XFS_IS_UQUOTA_ON(mp)) {
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4917d4eed4ed..63c7a1a6c022 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -230,6 +230,15 @@ typedef struct xfs_perag {
230 rwlock_t pag_ici_lock; /* incore inode lock */ 230 rwlock_t pag_ici_lock; /* incore inode lock */
231 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
232 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
233 struct mutex pag_ici_reclaim_lock; /* serialisation point */
234 unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
235
236 /* buffer cache index */
237 spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
238 struct rb_root pag_buf_tree; /* ordered tree of active buffers */
239
240 /* for rcu-safe freeing */
241 struct rcu_head rcu_head;
233#endif 242#endif
234 int pagb_count; /* pagb slots in use */ 243 int pagb_count; /* pagb slots in use */
235} xfs_perag_t; 244} xfs_perag_t;
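
The new pag_buf_lock/pag_buf_tree fields give each allocation group its own ordered index of active buffers. A hedged sketch of the lookup pattern this enables, keyed on disk block number, follows; the helper name and the xfs_buf field names (b_rbnode, b_bn) are assumptions for illustration, not lines from the patch:

	static struct xfs_buf *
	pag_buf_find(struct xfs_perag *pag, xfs_daddr_t blkno)
	{
		struct rb_node	*n;
		struct xfs_buf	*bp = NULL;

		spin_lock(&pag->pag_buf_lock);
		n = pag->pag_buf_tree.rb_node;
		while (n) {
			struct xfs_buf *b = rb_entry(n, struct xfs_buf, b_rbnode);

			if (blkno < b->b_bn)
				n = n->rb_left;
			else if (blkno > b->b_bn)
				n = n->rb_right;
			else {
				bp = b;		/* found; caller takes a hold */
				break;
			}
		}
		spin_unlock(&pag->pag_buf_lock);
		return bp;
	}
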
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index af168faccc7a..112abc439ca5 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -675,7 +675,7 @@ xfs_alloc_ag_vextent_near(
675 xfs_agblock_t gtbnoa; /* aligned ... */ 675 xfs_agblock_t gtbnoa; /* aligned ... */
676 xfs_extlen_t gtdiff; /* difference to right side entry */ 676 xfs_extlen_t gtdiff; /* difference to right side entry */
677 xfs_extlen_t gtlen; /* length of right side entry */ 677 xfs_extlen_t gtlen; /* length of right side entry */
678 xfs_extlen_t gtlena; /* aligned ... */ 678 xfs_extlen_t gtlena = 0; /* aligned ... */
679 xfs_agblock_t gtnew; /* useful start bno of right side */ 679 xfs_agblock_t gtnew; /* useful start bno of right side */
680 int error; /* error code */ 680 int error; /* error code */
681 int i; /* result code, temporary */ 681 int i; /* result code, temporary */
@@ -684,7 +684,7 @@ xfs_alloc_ag_vextent_near(
684 xfs_agblock_t ltbnoa; /* aligned ... */ 684 xfs_agblock_t ltbnoa; /* aligned ... */
685 xfs_extlen_t ltdiff; /* difference to left side entry */ 685 xfs_extlen_t ltdiff; /* difference to left side entry */
686 xfs_extlen_t ltlen; /* length of left side entry */ 686 xfs_extlen_t ltlen; /* length of left side entry */
687 xfs_extlen_t ltlena; /* aligned ... */ 687 xfs_extlen_t ltlena = 0; /* aligned ... */
688 xfs_agblock_t ltnew; /* useful start bno of left side */ 688 xfs_agblock_t ltnew; /* useful start bno of left side */
689 xfs_extlen_t rlen; /* length of returned extent */ 689 xfs_extlen_t rlen; /* length of returned extent */
690#if defined(DEBUG) && defined(__KERNEL__) 690#if defined(DEBUG) && defined(__KERNEL__)
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 97f7328967fd..3916925e2584 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -280,38 +280,6 @@ xfs_allocbt_key_diff(
280 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; 280 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
281} 281}
282 282
283STATIC int
284xfs_allocbt_kill_root(
285 struct xfs_btree_cur *cur,
286 struct xfs_buf *bp,
287 int level,
288 union xfs_btree_ptr *newroot)
289{
290 int error;
291
292 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
293 XFS_BTREE_STATS_INC(cur, killroot);
294
295 /*
296 * Update the root pointer, decreasing the level by 1 and then
297 * free the old root.
298 */
299 xfs_allocbt_set_root(cur, newroot, -1);
300 error = xfs_allocbt_free_block(cur, bp);
301 if (error) {
302 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
303 return error;
304 }
305
306 XFS_BTREE_STATS_INC(cur, free);
307
308 xfs_btree_setbuf(cur, level, NULL);
309 cur->bc_nlevels--;
310
311 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
312 return 0;
313}
314
315#ifdef DEBUG 283#ifdef DEBUG
316STATIC int 284STATIC int
317xfs_allocbt_keys_inorder( 285xfs_allocbt_keys_inorder(
@@ -423,7 +391,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
423 391
424 .dup_cursor = xfs_allocbt_dup_cursor, 392 .dup_cursor = xfs_allocbt_dup_cursor,
425 .set_root = xfs_allocbt_set_root, 393 .set_root = xfs_allocbt_set_root,
426 .kill_root = xfs_allocbt_kill_root,
427 .alloc_block = xfs_allocbt_alloc_block, 394 .alloc_block = xfs_allocbt_alloc_block,
428 .free_block = xfs_allocbt_free_block, 395 .free_block = xfs_allocbt_free_block,
429 .update_lastrec = xfs_allocbt_update_lastrec, 396 .update_lastrec = xfs_allocbt_update_lastrec,
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index c2568242a901..c86375378810 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -355,16 +355,15 @@ xfs_attr_set_int(
355 if (mp->m_flags & XFS_MOUNT_WSYNC) { 355 if (mp->m_flags & XFS_MOUNT_WSYNC) {
356 xfs_trans_set_sync(args.trans); 356 xfs_trans_set_sync(args.trans);
357 } 357 }
358
359 if (!error && (flags & ATTR_KERNOTIME) == 0) {
360 xfs_trans_ichgtime(args.trans, dp,
361 XFS_ICHGTIME_CHG);
362 }
358 err2 = xfs_trans_commit(args.trans, 363 err2 = xfs_trans_commit(args.trans,
359 XFS_TRANS_RELEASE_LOG_RES); 364 XFS_TRANS_RELEASE_LOG_RES);
360 xfs_iunlock(dp, XFS_ILOCK_EXCL); 365 xfs_iunlock(dp, XFS_ILOCK_EXCL);
361 366
362 /*
363 * Hit the inode change time.
364 */
365 if (!error && (flags & ATTR_KERNOTIME) == 0) {
366 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
367 }
368 return(error == 0 ? err2 : error); 367 return(error == 0 ? err2 : error);
369 } 368 }
370 369
@@ -420,6 +419,9 @@ xfs_attr_set_int(
420 xfs_trans_set_sync(args.trans); 419 xfs_trans_set_sync(args.trans);
421 } 420 }
422 421
422 if ((flags & ATTR_KERNOTIME) == 0)
423 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
424
423 /* 425 /*
424 * Commit the last in the sequence of transactions. 426 * Commit the last in the sequence of transactions.
425 */ 427 */
@@ -427,13 +429,6 @@ xfs_attr_set_int(
427 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 429 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
428 xfs_iunlock(dp, XFS_ILOCK_EXCL); 430 xfs_iunlock(dp, XFS_ILOCK_EXCL);
429 431
430 /*
431 * Hit the inode change time.
432 */
433 if (!error && (flags & ATTR_KERNOTIME) == 0) {
434 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
435 }
436
437 return(error); 432 return(error);
438 433
439out: 434out:
@@ -567,6 +562,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
567 xfs_trans_set_sync(args.trans); 562 xfs_trans_set_sync(args.trans);
568 } 563 }
569 564
565 if ((flags & ATTR_KERNOTIME) == 0)
566 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
567
570 /* 568 /*
571 * Commit the last in the sequence of transactions. 569 * Commit the last in the sequence of transactions.
572 */ 570 */
@@ -574,13 +572,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
574 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 572 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
575 xfs_iunlock(dp, XFS_ILOCK_EXCL); 573 xfs_iunlock(dp, XFS_ILOCK_EXCL);
576 574
577 /*
578 * Hit the inode change time.
579 */
580 if (!error && (flags & ATTR_KERNOTIME) == 0) {
581 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
582 }
583
584 return(error); 575 return(error);
585 576
586out: 577out:
@@ -1995,7 +1986,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1995 1986
1996 tmp = (valuelen < XFS_BUF_SIZE(bp)) 1987 tmp = (valuelen < XFS_BUF_SIZE(bp))
1997 ? valuelen : XFS_BUF_SIZE(bp); 1988 ? valuelen : XFS_BUF_SIZE(bp);
1998 xfs_biomove(bp, 0, tmp, dst, XBF_READ); 1989 xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
1999 xfs_buf_relse(bp); 1990 xfs_buf_relse(bp);
2000 dst += tmp; 1991 dst += tmp;
2001 valuelen -= tmp; 1992 valuelen -= tmp;
@@ -2125,9 +2116,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2125 2116
2126 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2117 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2127 XFS_BUF_SIZE(bp); 2118 XFS_BUF_SIZE(bp);
2128 xfs_biomove(bp, 0, tmp, src, XBF_WRITE); 2119 xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
2129 if (tmp < XFS_BUF_SIZE(bp)) 2120 if (tmp < XFS_BUF_SIZE(bp))
2130 xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); 2121 xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
2131 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ 2122 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
2132 return (error); 2123 return (error);
2133 } 2124 }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index f90dadd5a968..8abd12e32e13 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -614,7 +614,7 @@ xfs_bmap_add_extent(
614 nblks += cur->bc_private.b.allocated; 614 nblks += cur->bc_private.b.allocated;
615 ASSERT(nblks <= da_old); 615 ASSERT(nblks <= da_old);
616 if (nblks < da_old) 616 if (nblks < da_old)
617 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, 617 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
618 (int64_t)(da_old - nblks), rsvd); 618 (int64_t)(da_old - nblks), rsvd);
619 } 619 }
620 /* 620 /*
@@ -1079,7 +1079,8 @@ xfs_bmap_add_extent_delay_real(
1079 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - 1079 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
1080 (cur ? cur->bc_private.b.allocated : 0)); 1080 (cur ? cur->bc_private.b.allocated : 0));
1081 if (diff > 0 && 1081 if (diff > 0 &&
1082 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { 1082 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1083 -((int64_t)diff), rsvd)) {
1083 /* 1084 /*
1084 * Ick gross gag me with a spoon. 1085 * Ick gross gag me with a spoon.
1085 */ 1086 */
@@ -1089,16 +1090,18 @@ xfs_bmap_add_extent_delay_real(
1089 temp--; 1090 temp--;
1090 diff--; 1091 diff--;
1091 if (!diff || 1092 if (!diff ||
1092 !xfs_mod_incore_sb(ip->i_mount, 1093 !xfs_icsb_modify_counters(ip->i_mount,
1093 XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) 1094 XFS_SBS_FDBLOCKS,
1095 -((int64_t)diff), rsvd))
1094 break; 1096 break;
1095 } 1097 }
1096 if (temp2) { 1098 if (temp2) {
1097 temp2--; 1099 temp2--;
1098 diff--; 1100 diff--;
1099 if (!diff || 1101 if (!diff ||
1100 !xfs_mod_incore_sb(ip->i_mount, 1102 !xfs_icsb_modify_counters(ip->i_mount,
1101 XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) 1103 XFS_SBS_FDBLOCKS,
1104 -((int64_t)diff), rsvd))
1102 break; 1105 break;
1103 } 1106 }
1104 } 1107 }
@@ -1766,7 +1769,7 @@ xfs_bmap_add_extent_hole_delay(
1766 } 1769 }
1767 if (oldlen != newlen) { 1770 if (oldlen != newlen) {
1768 ASSERT(oldlen > newlen); 1771 ASSERT(oldlen > newlen);
1769 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, 1772 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1770 (int64_t)(oldlen - newlen), rsvd); 1773 (int64_t)(oldlen - newlen), rsvd);
1771 /* 1774 /*
1772 * Nothing to do for disk quota accounting here. 1775 * Nothing to do for disk quota accounting here.
@@ -3111,9 +3114,10 @@ xfs_bmap_del_extent(
3111 * Nothing to do for disk quota accounting here. 3114 * Nothing to do for disk quota accounting here.
3112 */ 3115 */
3113 ASSERT(da_old >= da_new); 3116 ASSERT(da_old >= da_new);
3114 if (da_old > da_new) 3117 if (da_old > da_new) {
3115 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new), 3118 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
3116 rsvd); 3119 (int64_t)(da_old - da_new), rsvd);
3120 }
3117done: 3121done:
3118 *logflagsp = flags; 3122 *logflagsp = flags;
3119 return error; 3123 return error;
@@ -4526,13 +4530,13 @@ xfs_bmapi(
4526 -((int64_t)extsz), (flags & 4530 -((int64_t)extsz), (flags &
4527 XFS_BMAPI_RSVBLOCKS)); 4531 XFS_BMAPI_RSVBLOCKS));
4528 } else { 4532 } else {
4529 error = xfs_mod_incore_sb(mp, 4533 error = xfs_icsb_modify_counters(mp,
4530 XFS_SBS_FDBLOCKS, 4534 XFS_SBS_FDBLOCKS,
4531 -((int64_t)alen), (flags & 4535 -((int64_t)alen), (flags &
4532 XFS_BMAPI_RSVBLOCKS)); 4536 XFS_BMAPI_RSVBLOCKS));
4533 } 4537 }
4534 if (!error) { 4538 if (!error) {
4535 error = xfs_mod_incore_sb(mp, 4539 error = xfs_icsb_modify_counters(mp,
4536 XFS_SBS_FDBLOCKS, 4540 XFS_SBS_FDBLOCKS,
4537 -((int64_t)indlen), (flags & 4541 -((int64_t)indlen), (flags &
4538 XFS_BMAPI_RSVBLOCKS)); 4542 XFS_BMAPI_RSVBLOCKS));
@@ -4542,7 +4546,7 @@ xfs_bmapi(
4542 (int64_t)extsz, (flags & 4546 (int64_t)extsz, (flags &
4543 XFS_BMAPI_RSVBLOCKS)); 4547 XFS_BMAPI_RSVBLOCKS));
4544 else if (error) 4548 else if (error)
4545 xfs_mod_incore_sb(mp, 4549 xfs_icsb_modify_counters(mp,
4546 XFS_SBS_FDBLOCKS, 4550 XFS_SBS_FDBLOCKS,
4547 (int64_t)alen, (flags & 4551 (int64_t)alen, (flags &
4548 XFS_BMAPI_RSVBLOCKS)); 4552 XFS_BMAPI_RSVBLOCKS));
@@ -4744,8 +4748,12 @@ xfs_bmapi(
4744 * Check if writing previously allocated but 4748 * Check if writing previously allocated but
4745 * unwritten extents. 4749 * unwritten extents.
4746 */ 4750 */
4747 if (wr && mval->br_state == XFS_EXT_UNWRITTEN && 4751 if (wr &&
4748 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) { 4752 ((mval->br_state == XFS_EXT_UNWRITTEN &&
4753 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
4754 (mval->br_state == XFS_EXT_NORM &&
4755 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
4756 (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
4749 /* 4757 /*
4750 * Modify (by adding) the state flag, if writing. 4758 * Modify (by adding) the state flag, if writing.
4751 */ 4759 */
@@ -4757,7 +4765,9 @@ xfs_bmapi(
4757 *firstblock; 4765 *firstblock;
4758 cur->bc_private.b.flist = flist; 4766 cur->bc_private.b.flist = flist;
4759 } 4767 }
4760 mval->br_state = XFS_EXT_NORM; 4768 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4769 ? XFS_EXT_NORM
4770 : XFS_EXT_UNWRITTEN;
4761 error = xfs_bmap_add_extent(ip, lastx, &cur, mval, 4771 error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
4762 firstblock, flist, &tmp_logflags, 4772 firstblock, flist, &tmp_logflags,
4763 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4773 whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
@@ -5200,7 +5210,7 @@ xfs_bunmapi(
5200 ip, -((long)del.br_blockcount), 0, 5210 ip, -((long)del.br_blockcount), 0,
5201 XFS_QMOPT_RES_RTBLKS); 5211 XFS_QMOPT_RES_RTBLKS);
5202 } else { 5212 } else {
5203 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, 5213 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
5204 (int64_t)del.br_blockcount, rsvd); 5214 (int64_t)del.br_blockcount, rsvd);
5205 (void)xfs_trans_reserve_quota_nblks(NULL, 5215 (void)xfs_trans_reserve_quota_nblks(NULL,
5206 ip, -((long)del.br_blockcount), 0, 5216 ip, -((long)del.br_blockcount), 0,
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index b13569a6179b..71ec9b6ecdfc 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -74,9 +74,12 @@ typedef struct xfs_bmap_free
74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ 74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
75 /* combine contig. space */ 75 /* combine contig. space */
76#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ 76#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */
77#define XFS_BMAPI_CONVERT 0x200 /* unwritten extent conversion - */ 77/*
78 /* need write cache flushing and no */ 78 * unwritten extent conversion - this needs write cache flushing and no additional
79 /* additional allocation alignments */ 79 * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
80 * from written to unwritten, otherwise converts from unwritten to written.
81 */
82#define XFS_BMAPI_CONVERT 0x200
80 83
81#define XFS_BMAPI_FLAGS \ 84#define XFS_BMAPI_FLAGS \
82 { XFS_BMAPI_WRITE, "WRITE" }, \ 85 { XFS_BMAPI_WRITE, "WRITE" }, \
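
With the clarified XFS_BMAPI_CONVERT semantics above, the conversion direction is selected purely by flag combination. Illustrative only, not lines from the patch:

	/* convert unwritten -> written (e.g. after a write into preallocation) */
	flags = XFS_BMAPI_WRITE | XFS_BMAPI_CONVERT;

	/* convert written -> unwritten (e.g. for a zero-range operation) */
	flags = XFS_BMAPI_WRITE | XFS_BMAPI_CONVERT | XFS_BMAPI_PREALLOC;
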
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 829af92f0fba..04f9cca8da7e 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -217,7 +217,7 @@ xfs_btree_del_cursor(
217 */ 217 */
218 for (i = 0; i < cur->bc_nlevels; i++) { 218 for (i = 0; i < cur->bc_nlevels; i++) {
219 if (cur->bc_bufs[i]) 219 if (cur->bc_bufs[i])
220 xfs_btree_setbuf(cur, i, NULL); 220 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
221 else if (!error) 221 else if (!error)
222 break; 222 break;
223 } 223 }
@@ -656,7 +656,7 @@ xfs_btree_reada_bufl(
656 656
657 ASSERT(fsbno != NULLFSBLOCK); 657 ASSERT(fsbno != NULLFSBLOCK);
658 d = XFS_FSB_TO_DADDR(mp, fsbno); 658 d = XFS_FSB_TO_DADDR(mp, fsbno);
659 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 659 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
660} 660}
661 661
662/* 662/*
@@ -676,7 +676,7 @@ xfs_btree_reada_bufs(
676 ASSERT(agno != NULLAGNUMBER); 676 ASSERT(agno != NULLAGNUMBER);
677 ASSERT(agbno != NULLAGBLOCK); 677 ASSERT(agbno != NULLAGBLOCK);
678 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 678 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
679 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 679 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
680} 680}
681 681
682STATIC int 682STATIC int
@@ -763,22 +763,19 @@ xfs_btree_readahead(
763 * Set the buffer for level "lev" in the cursor to bp, releasing 763 * Set the buffer for level "lev" in the cursor to bp, releasing
764 * any previous buffer. 764 * any previous buffer.
765 */ 765 */
766void 766STATIC void
767xfs_btree_setbuf( 767xfs_btree_setbuf(
768 xfs_btree_cur_t *cur, /* btree cursor */ 768 xfs_btree_cur_t *cur, /* btree cursor */
769 int lev, /* level in btree */ 769 int lev, /* level in btree */
770 xfs_buf_t *bp) /* new buffer to set */ 770 xfs_buf_t *bp) /* new buffer to set */
771{ 771{
772 struct xfs_btree_block *b; /* btree block */ 772 struct xfs_btree_block *b; /* btree block */
773 xfs_buf_t *obp; /* old buffer pointer */
774 773
775 obp = cur->bc_bufs[lev]; 774 if (cur->bc_bufs[lev])
776 if (obp) 775 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
777 xfs_trans_brelse(cur->bc_tp, obp);
778 cur->bc_bufs[lev] = bp; 776 cur->bc_bufs[lev] = bp;
779 cur->bc_ra[lev] = 0; 777 cur->bc_ra[lev] = 0;
780 if (!bp) 778
781 return;
782 b = XFS_BUF_TO_BLOCK(bp); 779 b = XFS_BUF_TO_BLOCK(bp);
783 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 780 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
784 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) 781 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
@@ -3011,6 +3008,43 @@ out0:
3011 return 0; 3008 return 0;
3012} 3009}
3013 3010
3011/*
3012 * Kill the current root node and replace it with its only child node.
3013 */
3014STATIC int
3015xfs_btree_kill_root(
3016 struct xfs_btree_cur *cur,
3017 struct xfs_buf *bp,
3018 int level,
3019 union xfs_btree_ptr *newroot)
3020{
3021 int error;
3022
3023 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3024 XFS_BTREE_STATS_INC(cur, killroot);
3025
3026 /*
3027 * Update the root pointer, decreasing the level by 1 and then
3028 * free the old root.
3029 */
3030 cur->bc_ops->set_root(cur, newroot, -1);
3031
3032 error = cur->bc_ops->free_block(cur, bp);
3033 if (error) {
3034 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3035 return error;
3036 }
3037
3038 XFS_BTREE_STATS_INC(cur, free);
3039
3040 cur->bc_bufs[level] = NULL;
3041 cur->bc_ra[level] = 0;
3042 cur->bc_nlevels--;
3043
3044 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3045 return 0;
3046}
3047
3014STATIC int 3048STATIC int
3015xfs_btree_dec_cursor( 3049xfs_btree_dec_cursor(
3016 struct xfs_btree_cur *cur, 3050 struct xfs_btree_cur *cur,
@@ -3195,7 +3229,7 @@ xfs_btree_delrec(
3195 * Make it the new root of the btree. 3229 * Make it the new root of the btree.
3196 */ 3230 */
3197 pp = xfs_btree_ptr_addr(cur, 1, block); 3231 pp = xfs_btree_ptr_addr(cur, 1, block);
3198 error = cur->bc_ops->kill_root(cur, bp, level, pp); 3232 error = xfs_btree_kill_root(cur, bp, level, pp);
3199 if (error) 3233 if (error)
3200 goto error0; 3234 goto error0;
3201 } else if (level > 0) { 3235 } else if (level > 0) {
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7fa07062bdda..82fafc66bd1f 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -152,9 +152,7 @@ struct xfs_btree_ops {
152 152
153 /* update btree root pointer */ 153 /* update btree root pointer */
154 void (*set_root)(struct xfs_btree_cur *cur, 154 void (*set_root)(struct xfs_btree_cur *cur,
155 union xfs_btree_ptr *nptr, int level_change); 155 union xfs_btree_ptr *nptr, int level_change);
156 int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
157 int level, union xfs_btree_ptr *newroot);
158 156
159 /* block allocation / freeing */ 157 /* block allocation / freeing */
160 int (*alloc_block)(struct xfs_btree_cur *cur, 158 int (*alloc_block)(struct xfs_btree_cur *cur,
@@ -399,16 +397,6 @@ xfs_btree_reada_bufs(
399 xfs_agblock_t agbno, /* allocation group block number */ 397 xfs_agblock_t agbno, /* allocation group block number */
400 xfs_extlen_t count); /* count of filesystem blocks */ 398 xfs_extlen_t count); /* count of filesystem blocks */
401 399
402/*
403 * Set the buffer for level "lev" in the cursor to bp, releasing
404 * any previous buffer.
405 */
406void
407xfs_btree_setbuf(
408 xfs_btree_cur_t *cur, /* btree cursor */
409 int lev, /* level in btree */
410 struct xfs_buf *bp); /* new buffer to set */
411
412 400
413/* 401/*
414 * Common btree core entry points. 402 * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1b09d7a280df..2686d0d54c5b 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -692,8 +692,7 @@ xfs_buf_item_init(
692 * the first. If we do already have one, there is 692 * the first. If we do already have one, there is
693 * nothing to do here so return. 693 * nothing to do here so return.
694 */ 694 */
695 if (bp->b_mount != mp) 695 ASSERT(bp->b_target->bt_mount == mp);
696 bp->b_mount = mp;
697 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 696 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
698 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 697 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
699 if (lip->li_type == XFS_LI_BUF) { 698 if (lip->li_type == XFS_LI_BUF) {
@@ -974,7 +973,7 @@ xfs_buf_iodone_callbacks(
974 xfs_buf_do_callbacks(bp, lip); 973 xfs_buf_do_callbacks(bp, lip);
975 XFS_BUF_SET_FSPRIVATE(bp, NULL); 974 XFS_BUF_SET_FSPRIVATE(bp, NULL);
976 XFS_BUF_CLR_IODONE_FUNC(bp); 975 XFS_BUF_CLR_IODONE_FUNC(bp);
977 xfs_biodone(bp); 976 xfs_buf_ioend(bp, 0);
978 return; 977 return;
979 } 978 }
980 979
@@ -1033,7 +1032,7 @@ xfs_buf_iodone_callbacks(
1033 xfs_buf_do_callbacks(bp, lip); 1032 xfs_buf_do_callbacks(bp, lip);
1034 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1033 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1035 XFS_BUF_CLR_IODONE_FUNC(bp); 1034 XFS_BUF_CLR_IODONE_FUNC(bp);
1036 xfs_biodone(bp); 1035 xfs_buf_ioend(bp, 0);
1037} 1036}
1038 1037
1039/* 1038/*
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 30fa0e206fba..1c00bedb3175 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2042,7 +2042,7 @@ xfs_da_do_buf(
2042 mappedbno, nmapped, 0, &bp); 2042 mappedbno, nmapped, 0, &bp);
2043 break; 2043 break;
2044 case 3: 2044 case 3:
2045 xfs_baread(mp->m_ddev_targp, mappedbno, nmapped); 2045 xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped);
2046 error = 0; 2046 error = 0;
2047 bp = NULL; 2047 bp = NULL;
2048 break; 2048 break;
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5b153b2e6a3..dffba9ba0db6 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -49,8 +49,9 @@ typedef struct xfs_dinode {
49 __be32 di_uid; /* owner's user id */ 49 __be32 di_uid; /* owner's user id */
50 __be32 di_gid; /* owner's group id */ 50 __be32 di_gid; /* owner's group id */
51 __be32 di_nlink; /* number of links to file */ 51 __be32 di_nlink; /* number of links to file */
52 __be16 di_projid; /* owner's project id */ 52 __be16 di_projid_lo; /* lower part of owner's project id */
53 __u8 di_pad[8]; /* unused, zeroed space */ 53 __be16 di_projid_hi; /* higher part of owner's project id */
54 __u8 di_pad[6]; /* unused, zeroed space */
54 __be16 di_flushiter; /* incremented on flush */ 55 __be16 di_flushiter; /* incremented on flush */
55 xfs_timestamp_t di_atime; /* time last accessed */ 56 xfs_timestamp_t di_atime; /* time last accessed */
56 xfs_timestamp_t di_mtime; /* time last modified */ 57 xfs_timestamp_t di_mtime; /* time last modified */
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 504be8640e91..ae891223be90 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -961,7 +961,7 @@ xfs_dir2_leaf_getdents(
961 if (i > ra_current && 961 if (i > ra_current &&
962 map[ra_index].br_blockcount >= 962 map[ra_index].br_blockcount >=
963 mp->m_dirblkfsbs) { 963 mp->m_dirblkfsbs) {
964 xfs_baread(mp->m_ddev_targp, 964 xfs_buf_readahead(mp->m_ddev_targp,
965 XFS_FSB_TO_DADDR(mp, 965 XFS_FSB_TO_DADDR(mp,
966 map[ra_index].br_startblock + 966 map[ra_index].br_startblock +
967 ra_offset), 967 ra_offset),
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 87c2e9d02288..8f6fc1a96386 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -293,9 +293,11 @@ typedef struct xfs_bstat {
293 __s32 bs_extsize; /* extent size */ 293 __s32 bs_extsize; /* extent size */
294 __s32 bs_extents; /* number of extents */ 294 __s32 bs_extents; /* number of extents */
295 __u32 bs_gen; /* generation count */ 295 __u32 bs_gen; /* generation count */
296 __u16 bs_projid; /* project id */ 296 __u16 bs_projid_lo; /* lower part of project id */
297#define bs_projid bs_projid_lo /* (previously just bs_projid) */
297 __u16 bs_forkoff; /* inode fork offset in bytes */ 298 __u16 bs_forkoff; /* inode fork offset in bytes */
298 unsigned char bs_pad[12]; /* pad space, unused */ 299 __u16 bs_projid_hi; /* higher part of project id */
300 unsigned char bs_pad[10]; /* pad space, unused */
299 __u32 bs_dmevmask; /* DMIG event mask */ 301 __u32 bs_dmevmask; /* DMIG event mask */
300 __u16 bs_dmstate; /* DMIG state info */ 302 __u16 bs_dmstate; /* DMIG state info */
301 __u16 bs_aextents; /* attribute number of extents */ 303 __u16 bs_aextents; /* attribute number of extents */
@@ -448,6 +450,7 @@ typedef struct xfs_handle {
448/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ 450/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */
449/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ 451/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
450#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) 452#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
453#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
451 454
452/* 455/*
453 * ioctl commands that replace IRIX syssgi()'s 456 * ioctl commands that replace IRIX syssgi()'s
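
A minimal userspace sketch of invoking the new XFS_IOC_ZERO_RANGE ioctl, assuming the struct xfs_flock64 layout from this header (l_whence, l_start, l_len) and that <xfs/xfs.h> from xfsprogs exposes the definitions; it is not part of the patch:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs.h>	/* assumed: XFS_IOC_ZERO_RANGE, xfs_flock64 */

	/* Zero [off, off+len) of an XFS file without freeing the blocks. */
	static int
	zero_range(int fd, long long off, long long len)
	{
		struct xfs_flock64 fl;

		memset(&fl, 0, sizeof(fl));
		fl.l_whence = 0;	/* SEEK_SET */
		fl.l_start = off;
		fl.l_len = len;
		return ioctl(fd, XFS_IOC_ZERO_RANGE, &fl);
	}
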
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 43b1d5699335..a7c116e814af 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -144,12 +144,11 @@ xfs_growfs_data_private(
144 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) 144 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
145 return error; 145 return error;
146 dpct = pct - mp->m_sb.sb_imax_pct; 146 dpct = pct - mp->m_sb.sb_imax_pct;
147 error = xfs_read_buf(mp, mp->m_ddev_targp, 147 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
148 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), 148 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
149 XFS_FSS_TO_BB(mp, 1), 0, &bp); 149 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
150 if (error) 150 if (!bp)
151 return error; 151 return EIO;
152 ASSERT(bp);
153 xfs_buf_relse(bp); 152 xfs_buf_relse(bp);
154 153
155 new = nb; /* use new as a temporary here */ 154 new = nb; /* use new as a temporary here */
@@ -597,7 +596,8 @@ out:
597 * the extra reserve blocks from the reserve..... 596 * the extra reserve blocks from the reserve.....
598 */ 597 */
599 int error; 598 int error;
600 error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0); 599 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
600 fdblks_delta, 0);
601 if (error == ENOSPC) 601 if (error == ENOSPC)
602 goto retry; 602 goto retry;
603 } 603 }
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5371d2dc360e..0626a32c3447 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -212,7 +212,7 @@ xfs_ialloc_inode_init(
212 * to log a whole cluster of inodes instead of all the 212 * to log a whole cluster of inodes instead of all the
213 * individual transactions causing a lot of log traffic. 213 * individual transactions causing a lot of log traffic.
214 */ 214 */
215 xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); 215 xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
216 for (i = 0; i < ninodes; i++) { 216 for (i = 0; i < ninodes; i++) {
217 int ioffset = i << mp->m_sb.sb_inodelog; 217 int ioffset = i << mp->m_sb.sb_inodelog;
218 uint isize = sizeof(struct xfs_dinode); 218 uint isize = sizeof(struct xfs_dinode);
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index d352862cefa0..16921f55c542 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -183,38 +183,6 @@ xfs_inobt_key_diff(
183 cur->bc_rec.i.ir_startino; 183 cur->bc_rec.i.ir_startino;
184} 184}
185 185
186STATIC int
187xfs_inobt_kill_root(
188 struct xfs_btree_cur *cur,
189 struct xfs_buf *bp,
190 int level,
191 union xfs_btree_ptr *newroot)
192{
193 int error;
194
195 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
196 XFS_BTREE_STATS_INC(cur, killroot);
197
198 /*
199 * Update the root pointer, decreasing the level by 1 and then
200 * free the old root.
201 */
202 xfs_inobt_set_root(cur, newroot, -1);
203 error = xfs_inobt_free_block(cur, bp);
204 if (error) {
205 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
206 return error;
207 }
208
209 XFS_BTREE_STATS_INC(cur, free);
210
211 cur->bc_bufs[level] = NULL;
212 cur->bc_nlevels--;
213
214 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
215 return 0;
216}
217
218#ifdef DEBUG 186#ifdef DEBUG
219STATIC int 187STATIC int
220xfs_inobt_keys_inorder( 188xfs_inobt_keys_inorder(
@@ -309,7 +277,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
309 277
310 .dup_cursor = xfs_inobt_dup_cursor, 278 .dup_cursor = xfs_inobt_dup_cursor,
311 .set_root = xfs_inobt_set_root, 279 .set_root = xfs_inobt_set_root,
312 .kill_root = xfs_inobt_kill_root,
313 .alloc_block = xfs_inobt_alloc_block, 280 .alloc_block = xfs_inobt_alloc_block,
314 .free_block = xfs_inobt_free_block, 281 .free_block = xfs_inobt_free_block,
315 .get_minrecs = xfs_inobt_get_minrecs, 282 .get_minrecs = xfs_inobt_get_minrecs,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b1ecc6f97ade..0cdd26932d8e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -365,8 +365,8 @@ xfs_iget(
365 xfs_perag_t *pag; 365 xfs_perag_t *pag;
366 xfs_agino_t agino; 366 xfs_agino_t agino;
367 367
368 /* the radix tree exists only in inode capable AGs */ 368 /* reject inode numbers outside existing AGs */
369 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) 369 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
370 return EINVAL; 370 return EINVAL;
371 371
372 /* get the perag structure and ensure that it's inode capable */ 372 /* get the perag structure and ensure that it's inode capable */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 34798f391c49..108c7a085f94 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -660,7 +660,8 @@ xfs_dinode_from_disk(
660 to->di_uid = be32_to_cpu(from->di_uid); 660 to->di_uid = be32_to_cpu(from->di_uid);
661 to->di_gid = be32_to_cpu(from->di_gid); 661 to->di_gid = be32_to_cpu(from->di_gid);
662 to->di_nlink = be32_to_cpu(from->di_nlink); 662 to->di_nlink = be32_to_cpu(from->di_nlink);
663 to->di_projid = be16_to_cpu(from->di_projid); 663 to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
664 to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
664 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 665 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
665 to->di_flushiter = be16_to_cpu(from->di_flushiter); 666 to->di_flushiter = be16_to_cpu(from->di_flushiter);
666 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); 667 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
@@ -695,7 +696,8 @@ xfs_dinode_to_disk(
695 to->di_uid = cpu_to_be32(from->di_uid); 696 to->di_uid = cpu_to_be32(from->di_uid);
696 to->di_gid = cpu_to_be32(from->di_gid); 697 to->di_gid = cpu_to_be32(from->di_gid);
697 to->di_nlink = cpu_to_be32(from->di_nlink); 698 to->di_nlink = cpu_to_be32(from->di_nlink);
698 to->di_projid = cpu_to_be16(from->di_projid); 699 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
700 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
699 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 701 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
700 to->di_flushiter = cpu_to_be16(from->di_flushiter); 702 to->di_flushiter = cpu_to_be16(from->di_flushiter);
701 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 703 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
@@ -874,7 +876,7 @@ xfs_iread(
874 if (ip->i_d.di_version == 1) { 876 if (ip->i_d.di_version == 1) {
875 ip->i_d.di_nlink = ip->i_d.di_onlink; 877 ip->i_d.di_nlink = ip->i_d.di_onlink;
876 ip->i_d.di_onlink = 0; 878 ip->i_d.di_onlink = 0;
877 ip->i_d.di_projid = 0; 879 xfs_set_projid(ip, 0);
878 } 880 }
879 881
880 ip->i_delayed_blks = 0; 882 ip->i_delayed_blks = 0;
@@ -982,8 +984,7 @@ xfs_ialloc(
982 mode_t mode, 984 mode_t mode,
983 xfs_nlink_t nlink, 985 xfs_nlink_t nlink,
984 xfs_dev_t rdev, 986 xfs_dev_t rdev,
985 cred_t *cr, 987 prid_t prid,
986 xfs_prid_t prid,
987 int okalloc, 988 int okalloc,
988 xfs_buf_t **ialloc_context, 989 xfs_buf_t **ialloc_context,
989 boolean_t *call_again, 990 boolean_t *call_again,
@@ -1027,7 +1028,7 @@ xfs_ialloc(
1027 ASSERT(ip->i_d.di_nlink == nlink); 1028 ASSERT(ip->i_d.di_nlink == nlink);
1028 ip->i_d.di_uid = current_fsuid(); 1029 ip->i_d.di_uid = current_fsuid();
1029 ip->i_d.di_gid = current_fsgid(); 1030 ip->i_d.di_gid = current_fsgid();
1030 ip->i_d.di_projid = prid; 1031 xfs_set_projid(ip, prid);
1031 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 1032 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1032 1033
1033 /* 1034 /*
@@ -2725,7 +2726,7 @@ cluster_corrupt_out:
2725 XFS_BUF_UNDONE(bp); 2726 XFS_BUF_UNDONE(bp);
2726 XFS_BUF_STALE(bp); 2727 XFS_BUF_STALE(bp);
2727 XFS_BUF_ERROR(bp,EIO); 2728 XFS_BUF_ERROR(bp,EIO);
2728 xfs_biodone(bp); 2729 xfs_buf_ioend(bp, 0);
2729 } else { 2730 } else {
2730 XFS_BUF_STALE(bp); 2731 XFS_BUF_STALE(bp);
2731 xfs_buf_relse(bp); 2732 xfs_buf_relse(bp);
@@ -3008,7 +3009,7 @@ xfs_iflush_int(
3008 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3009 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3009 memset(&(dip->di_pad[0]), 0, 3010 memset(&(dip->di_pad[0]), 0,
3010 sizeof(dip->di_pad)); 3011 sizeof(dip->di_pad));
3011 ASSERT(ip->i_d.di_projid == 0); 3012 ASSERT(xfs_get_projid(ip) == 0);
3012 } 3013 }
3013 } 3014 }
3014 3015
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0898c5417d12..fb2ca2e4cdc9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -134,8 +134,9 @@ typedef struct xfs_icdinode {
134 __uint32_t di_uid; /* owner's user id */ 134 __uint32_t di_uid; /* owner's user id */
135 __uint32_t di_gid; /* owner's group id */ 135 __uint32_t di_gid; /* owner's group id */
136 __uint32_t di_nlink; /* number of links to file */ 136 __uint32_t di_nlink; /* number of links to file */
137 __uint16_t di_projid; /* owner's project id */ 137 __uint16_t di_projid_lo; /* lower part of owner's project id */
138 __uint8_t di_pad[8]; /* unused, zeroed space */ 138 __uint16_t di_projid_hi; /* higher part of owner's project id */
139 __uint8_t di_pad[6]; /* unused, zeroed space */
139 __uint16_t di_flushiter; /* incremented on flush */ 140 __uint16_t di_flushiter; /* incremented on flush */
140 xfs_ictimestamp_t di_atime; /* time last accessed */ 141 xfs_ictimestamp_t di_atime; /* time last accessed */
141 xfs_ictimestamp_t di_mtime; /* time last modified */ 142 xfs_ictimestamp_t di_mtime; /* time last modified */
@@ -212,7 +213,6 @@ typedef struct xfs_icdinode {
212#ifdef __KERNEL__ 213#ifdef __KERNEL__
213 214
214struct bhv_desc; 215struct bhv_desc;
215struct cred;
216struct xfs_buf; 216struct xfs_buf;
217struct xfs_bmap_free; 217struct xfs_bmap_free;
218struct xfs_bmbt_irec; 218struct xfs_bmbt_irec;
@@ -335,6 +335,25 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
335} 335}
336 336
337/* 337/*
338 * Project quota id helpers (projid was previously 16 bits only;
339 * using two 16bit values to hold the new 32bit projid was chosen
340 * to retain compatibility with "old" filesystems).
341 */
342static inline prid_t
343xfs_get_projid(struct xfs_inode *ip)
344{
345 return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
346}
347
348static inline void
349xfs_set_projid(struct xfs_inode *ip,
350 prid_t projid)
351{
352 ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16);
353 ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
354}
355
356/*
338 * Manage the i_flush queue embedded in the inode. This completion 357 * Manage the i_flush queue embedded in the inode. This completion
339 * queue synchronizes processes attempting to flush the in-core 358 * queue synchronizes processes attempting to flush the in-core
340 * inode back to disk. 359 * inode back to disk.
@@ -456,8 +475,8 @@ void xfs_inode_free(struct xfs_inode *ip);
456 * xfs_inode.c prototypes. 475 * xfs_inode.c prototypes.
457 */ 476 */
458int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, 477int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
459 xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, 478 xfs_nlink_t, xfs_dev_t, prid_t, int,
460 int, struct xfs_buf **, boolean_t *, xfs_inode_t **); 479 struct xfs_buf **, boolean_t *, xfs_inode_t **);
461 480
462uint xfs_ip2xflags(struct xfs_inode *); 481uint xfs_ip2xflags(struct xfs_inode *);
463uint xfs_dic2xflags(struct xfs_dinode *); 482uint xfs_dic2xflags(struct xfs_dinode *);
@@ -471,7 +490,6 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
471void xfs_iext_realloc(xfs_inode_t *, int, int); 490void xfs_iext_realloc(xfs_inode_t *, int, int);
472void xfs_iunpin_wait(xfs_inode_t *); 491void xfs_iunpin_wait(xfs_inode_t *);
473int xfs_iflush(xfs_inode_t *, uint); 492int xfs_iflush(xfs_inode_t *, uint);
474void xfs_ichgtime(xfs_inode_t *, int);
475void xfs_lock_inodes(xfs_inode_t **, int, uint); 493void xfs_lock_inodes(xfs_inode_t **, int, uint);
476void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 494void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
477 495
@@ -482,7 +500,7 @@ void xfs_mark_inode_dirty_sync(xfs_inode_t *);
482#define IHOLD(ip) \ 500#define IHOLD(ip) \
483do { \ 501do { \
484 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 502 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
485 atomic_inc(&(VFS_I(ip)->i_count)); \ 503 ihold(VFS_I(ip)); \
486 trace_xfs_ihold(ip, _THIS_IP_); \ 504 trace_xfs_ihold(ip, _THIS_IP_); \
487} while (0) 505} while (0)
488 506
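
The projid helpers added above split one 32-bit project id across two on-disk 16-bit fields so that old filesystems, which only know di_projid_lo, keep working. A stand-alone round-trip demonstration of the same arithmetic, not part of the patch:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t projid = 0x00012345;		   /* any 32-bit project id */
		uint16_t hi = (uint16_t)(projid >> 16);	   /* di_projid_hi */
		uint16_t lo = (uint16_t)(projid & 0xffff); /* di_projid_lo */

		/* xfs_get_projid() recombines the halves losslessly */
		assert(((uint32_t)hi << 16 | lo) == projid);
		return 0;
	}
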
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fe00777e2796..c7ac020705df 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -223,15 +223,6 @@ xfs_inode_item_format(
223 nvecs = 1; 223 nvecs = 1;
224 224
225 /* 225 /*
226 * Make sure the linux inode is dirty. We do this before
227 * clearing i_update_core as the VFS will call back into
228 * XFS here and set i_update_core, so we need to dirty the
229 * inode first so that the ordering of i_update_core and
230 * unlogged modifications still works as described below.
231 */
232 xfs_mark_inode_dirty_sync(ip);
233
234 /*
235 * Clear i_update_core if the timestamps (or any other 226 * Clear i_update_core if the timestamps (or any other
236 * non-transactional modification) need flushing/logging 227 * non-transactional modification) need flushing/logging
237 * and we're about to log them with the rest of the core. 228 * and we're about to log them with the rest of the core.
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7e3626e5925c..dc1882adaf54 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -92,7 +92,8 @@ xfs_bulkstat_one_int(
92 * further change. 92 * further change.
93 */ 93 */
94 buf->bs_nlink = dic->di_nlink; 94 buf->bs_nlink = dic->di_nlink;
95 buf->bs_projid = dic->di_projid; 95 buf->bs_projid_lo = dic->di_projid_lo;
96 buf->bs_projid_hi = dic->di_projid_hi;
96 buf->bs_ino = ino; 97 buf->bs_ino = ino;
97 buf->bs_mode = dic->di_mode; 98 buf->bs_mode = dic->di_mode;
98 buf->bs_uid = dic->di_uid; 99 buf->bs_uid = dic->di_uid;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 33f718f92a48..cee4ab9f8a9e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -917,19 +917,6 @@ xlog_iodone(xfs_buf_t *bp)
917 l = iclog->ic_log; 917 l = iclog->ic_log;
918 918
919 /* 919 /*
920 * If the _XFS_BARRIER_FAILED flag was set by a lower
921 * layer, it means the underlying device no longer supports
922 * barrier I/O. Warn loudly and turn off barriers.
923 */
924 if (bp->b_flags & _XFS_BARRIER_FAILED) {
925 bp->b_flags &= ~_XFS_BARRIER_FAILED;
926 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
927 xfs_fs_cmn_err(CE_WARN, l->l_mp,
928 "xlog_iodone: Barriers are no longer supported"
929 " by device. Disabling barriers\n");
930 }
931
932 /*
933 * Race to shutdown the filesystem if we see an error. 920 * Race to shutdown the filesystem if we see an error.
934 */ 921 */
935 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, 922 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
@@ -1131,7 +1118,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1131 iclog->ic_prev = prev_iclog; 1118 iclog->ic_prev = prev_iclog;
1132 prev_iclog = iclog; 1119 prev_iclog = iclog;
1133 1120
1134 bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp); 1121 bp = xfs_buf_get_uncached(mp->m_logdev_targp,
1122 log->l_iclog_size, 0);
1135 if (!bp) 1123 if (!bp)
1136 goto out_free_iclog; 1124 goto out_free_iclog;
1137 if (!XFS_BUF_CPSEMA(bp)) 1125 if (!XFS_BUF_CPSEMA(bp))
@@ -1309,7 +1297,7 @@ xlog_bdstrat(
1309 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1297 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1310 XFS_BUF_ERROR(bp, EIO); 1298 XFS_BUF_ERROR(bp, EIO);
1311 XFS_BUF_STALE(bp); 1299 XFS_BUF_STALE(bp);
1312 xfs_biodone(bp); 1300 xfs_buf_ioend(bp, 0);
1313 /* 1301 /*
1314 * It would seem logical to return EIO here, but we rely on 1302 * It would seem logical to return EIO here, but we rely on
1315 * the log state machine to propagate I/O errors instead of 1303 * the log state machine to propagate I/O errors instead of
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ed575fb4b495..23d6ceb5e97b 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -146,102 +146,6 @@ xlog_cil_init_post_recovery(
146} 146}
147 147
148/* 148/*
149 * Insert the log item into the CIL and calculate the difference in space
150 * consumed by the item. Add the space to the checkpoint ticket and calculate
151 * if the change requires additional log metadata. If it does, take that space
152 * as well. Remove the amount of space we added to the checkpoint ticket from
153 * the current transaction ticket so that the accounting works out correctly.
154 *
155 * If this is the first time the item is being placed into the CIL in this
156 * context, pin it so it can't be written to disk until the CIL is flushed to
157 * the iclog and the iclog written to disk.
158 */
159static void
160xlog_cil_insert(
161 struct log *log,
162 struct xlog_ticket *ticket,
163 struct xfs_log_item *item,
164 struct xfs_log_vec *lv)
165{
166 struct xfs_cil *cil = log->l_cilp;
167 struct xfs_log_vec *old = lv->lv_item->li_lv;
168 struct xfs_cil_ctx *ctx = cil->xc_ctx;
169 int len;
170 int diff_iovecs;
171 int iclog_space;
172
173 if (old) {
174 /* existing lv on log item, space used is a delta */
175 ASSERT(!list_empty(&item->li_cil));
176 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
177
178 len = lv->lv_buf_len - old->lv_buf_len;
179 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
180 kmem_free(old->lv_buf);
181 kmem_free(old);
182 } else {
183 /* new lv, must pin the log item */
184 ASSERT(!lv->lv_item->li_lv);
185 ASSERT(list_empty(&item->li_cil));
186
187 len = lv->lv_buf_len;
188 diff_iovecs = lv->lv_niovecs;
189 IOP_PIN(lv->lv_item);
190
191 }
192 len += diff_iovecs * sizeof(xlog_op_header_t);
193
194 /* attach new log vector to log item */
195 lv->lv_item->li_lv = lv;
196
197 spin_lock(&cil->xc_cil_lock);
198 list_move_tail(&item->li_cil, &cil->xc_cil);
199 ctx->nvecs += diff_iovecs;
200
201 /*
202 * If this is the first time the item is being committed to the CIL,
203 * store the sequence number on the log item so we can tell
204 * in future commits whether this is the first checkpoint the item is
205 * being committed into.
206 */
207 if (!item->li_seq)
208 item->li_seq = ctx->sequence;
209
210 /*
211 * Now transfer enough transaction reservation to the context ticket
212 * for the checkpoint. The context ticket is special - the unit
213 * reservation has to grow as well as the current reservation as we
214 * steal from tickets so we can correctly determine the space used
215 * during the transaction commit.
216 */
217 if (ctx->ticket->t_curr_res == 0) {
218 /* first commit in checkpoint, steal the header reservation */
219 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
220 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
221 ticket->t_curr_res -= ctx->ticket->t_unit_res;
222 }
223
224 /* do we need space for more log record headers? */
225 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
226 if (len > 0 && (ctx->space_used / iclog_space !=
227 (ctx->space_used + len) / iclog_space)) {
228 int hdrs;
229
230 hdrs = (len + iclog_space - 1) / iclog_space;
231 /* need to take into account split region headers, too */
232 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
233 ctx->ticket->t_unit_res += hdrs;
234 ctx->ticket->t_curr_res += hdrs;
235 ticket->t_curr_res -= hdrs;
236 ASSERT(ticket->t_curr_res >= len);
237 }
238 ticket->t_curr_res -= len;
239 ctx->space_used += len;
240
241 spin_unlock(&cil->xc_cil_lock);
242}
243
244/*
245 * Format log item into flat buffers 149 * Format log item into flat buffers
246 * 150 *
247 * For delayed logging, we need to hold a formatted buffer containing all the 151 * For delayed logging, we need to hold a formatted buffer containing all the
@@ -286,7 +190,7 @@ xlog_cil_format_items(
286 len += lv->lv_iovecp[index].i_len; 190 len += lv->lv_iovecp[index].i_len;
287 191
288 lv->lv_buf_len = len; 192 lv->lv_buf_len = len;
289 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); 193 lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
290 ptr = lv->lv_buf; 194 ptr = lv->lv_buf;
291 195
292 for (index = 0; index < lv->lv_niovecs; index++) { 196 for (index = 0; index < lv->lv_niovecs; index++) {
@@ -300,21 +204,136 @@ xlog_cil_format_items(
300 } 204 }
301} 205}
302 206
207/*
208 * Prepare the log item for insertion into the CIL. Calculate the difference in
209 * log space and vectors it will consume, and if it is a new item pin it as
210 * well.
211 */
212STATIC void
213xfs_cil_prepare_item(
214 struct log *log,
215 struct xfs_log_vec *lv,
216 int *len,
217 int *diff_iovecs)
218{
219 struct xfs_log_vec *old = lv->lv_item->li_lv;
220
221 if (old) {
222 /* existing lv on log item, space used is a delta */
223 ASSERT(!list_empty(&lv->lv_item->li_cil));
224 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
225
226 *len += lv->lv_buf_len - old->lv_buf_len;
227 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
228 kmem_free(old->lv_buf);
229 kmem_free(old);
230 } else {
231 /* new lv, must pin the log item */
232 ASSERT(!lv->lv_item->li_lv);
233 ASSERT(list_empty(&lv->lv_item->li_cil));
234
235 *len += lv->lv_buf_len;
236 *diff_iovecs += lv->lv_niovecs;
237 IOP_PIN(lv->lv_item);
238
239 }
240
241 /* attach new log vector to log item */
242 lv->lv_item->li_lv = lv;
243
244 /*
245 * If this is the first time the item is being committed to the
246 * CIL, store the sequence number on the log item so we can
247 * tell in future commits whether this is the first checkpoint
248 * the item is being committed into.
249 */
250 if (!lv->lv_item->li_seq)
251 lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
252}
253
254/*
255 * Insert the log items into the CIL and calculate the difference in space
256 * consumed by the item. Add the space to the checkpoint ticket and calculate
257 * if the change requires additional log metadata. If it does, take that space
258 * as well. Remove the amount of space we added to the checkpoint ticket from
259 * the current transaction ticket so that the accounting works out correctly.
260 */
303static void 261static void
304xlog_cil_insert_items( 262xlog_cil_insert_items(
305 struct log *log, 263 struct log *log,
306 struct xfs_log_vec *log_vector, 264 struct xfs_log_vec *log_vector,
307 struct xlog_ticket *ticket, 265 struct xlog_ticket *ticket)
308 xfs_lsn_t *start_lsn)
309{ 266{
310 struct xfs_log_vec *lv; 267 struct xfs_cil *cil = log->l_cilp;
311 268 struct xfs_cil_ctx *ctx = cil->xc_ctx;
312 if (start_lsn) 269 struct xfs_log_vec *lv;
313 *start_lsn = log->l_cilp->xc_ctx->sequence; 270 int len = 0;
271 int diff_iovecs = 0;
272 int iclog_space;
314 273
315 ASSERT(log_vector); 274 ASSERT(log_vector);
275
276 /*
277 * Do all the accounting aggregation and switching of log vectors
278 * around in a separate loop to the insertion of items into the CIL.
279 * Then we can do a separate loop to update the CIL within a single
280 * lock/unlock pair. This reduces the number of round trips on the CIL
281 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
282 * hold time for the transaction commit.
283 *
284 * If this is the first time the item is being placed into the CIL in
285 * this context, pin it so it can't be written to disk until the CIL is
286 * flushed to the iclog and the iclog written to disk.
287 *
288 * We can do this safely because the context can't checkpoint until we
289 * are done so it doesn't matter exactly how we update the CIL.
290 */
316 for (lv = log_vector; lv; lv = lv->lv_next) 291 for (lv = log_vector; lv; lv = lv->lv_next)
317 xlog_cil_insert(log, ticket, lv->lv_item, lv); 292 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
293
294 /* account for space used by new iovec headers */
295 len += diff_iovecs * sizeof(xlog_op_header_t);
296
297 spin_lock(&cil->xc_cil_lock);
298
299 /* move the items to the tail of the CIL */
300 for (lv = log_vector; lv; lv = lv->lv_next)
301 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
302
303 ctx->nvecs += diff_iovecs;
304
305 /*
306 * Now transfer enough transaction reservation to the context ticket
307 * for the checkpoint. The context ticket is special - the unit
308 * reservation has to grow as well as the current reservation as we
309 * steal from tickets so we can correctly determine the space used
310 * during the transaction commit.
311 */
312 if (ctx->ticket->t_curr_res == 0) {
313 /* first commit in checkpoint, steal the header reservation */
314 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
315 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
316 ticket->t_curr_res -= ctx->ticket->t_unit_res;
317 }
318
319 /* do we need space for more log record headers? */
320 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
321 if (len > 0 && (ctx->space_used / iclog_space !=
322 (ctx->space_used + len) / iclog_space)) {
323 int hdrs;
324
325 hdrs = (len + iclog_space - 1) / iclog_space;
326 /* need to take into account split region headers, too */
327 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
328 ctx->ticket->t_unit_res += hdrs;
329 ctx->ticket->t_curr_res += hdrs;
330 ticket->t_curr_res -= hdrs;
331 ASSERT(ticket->t_curr_res >= len);
332 }
333 ticket->t_curr_res -= len;
334 ctx->space_used += len;
335
336 spin_unlock(&cil->xc_cil_lock);
318} 337}
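The two-pass structure described in the comment in xlog_cil_insert_items() — aggregate all the per-item work outside the lock, then publish everything under a single lock/unlock pair — is a general pattern worth seeing in isolation. A hedged pthread model follows (not kernel code; the kernel uses a spinlock and list_move_tail, this sketch uses a mutex and a singly linked list):

#include <pthread.h>
#include <stdio.h>

struct item { int size; struct item *next; };

static pthread_mutex_t cil_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *cil_head;
static int cil_space_used;

/* One lock round-trip per commit, not one per item. */
static void insert_items(struct item *batch, int nitems)
{
    int len = 0;

    /* pass 1: aggregate accounting with no lock held */
    for (int i = 0; i < nitems; i++)
        len += batch[i].size;

    /* pass 2: single critical section to publish the batch */
    pthread_mutex_lock(&cil_lock);
    for (int i = 0; i < nitems; i++) {
        batch[i].next = cil_head;
        cil_head = &batch[i];
    }
    cil_space_used += len;
    pthread_mutex_unlock(&cil_lock);
}

int main(void)
{
    struct item batch[3] = { {100}, {200}, {300} };

    insert_items(batch, 3);
    printf("space used: %d\n", cil_space_used);
    return 0;
}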
319 338
320static void 339static void
@@ -405,9 +424,15 @@ xlog_cil_push(
405 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 424 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
406 new_ctx->ticket = xlog_cil_ticket_alloc(log); 425 new_ctx->ticket = xlog_cil_ticket_alloc(log);
407 426
408 /* lock out transaction commit, but don't block on background push */ 427 /*
428 * Lock out transaction commit, but don't block for background pushes
429 * unless we are well over the CIL space limit. See the definition of
430 * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
431 * used here.
432 */
409 if (!down_write_trylock(&cil->xc_ctx_lock)) { 433 if (!down_write_trylock(&cil->xc_ctx_lock)) {
410 if (!push_seq) 434 if (!push_seq &&
435 cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
411 goto out_free_ticket; 436 goto out_free_ticket;
412 down_write(&cil->xc_ctx_lock); 437 down_write(&cil->xc_ctx_lock);
413 } 438 }
@@ -422,7 +447,7 @@ xlog_cil_push(
422 goto out_skip; 447 goto out_skip;
423 448
424 /* check for a previously pushed sequence */ 449 /* check for a previously pushed sequence */
425 if (push_seq < cil->xc_ctx->sequence) 450 if (push_seq && push_seq < cil->xc_ctx->sequence)
426 goto out_skip; 451 goto out_skip;
427 452
428 /* 453 /*
@@ -632,7 +657,10 @@ xfs_log_commit_cil(
632 657
633 /* lock out background commit */ 658 /* lock out background commit */
634 down_read(&log->l_cilp->xc_ctx_lock); 659 down_read(&log->l_cilp->xc_ctx_lock);
635 xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn); 660 if (commit_lsn)
661 *commit_lsn = log->l_cilp->xc_ctx->sequence;
662
663 xlog_cil_insert_items(log, log_vector, tp->t_ticket);
636 664
637 /* check we didn't blow the reservation */ 665 /* check we didn't blow the reservation */
638 if (tp->t_ticket->t_curr_res < 0) 666 if (tp->t_ticket->t_curr_res < 0)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index ced52b98b322..edcdfe01617f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -426,13 +426,13 @@ struct xfs_cil {
426}; 426};
427 427
428/* 428/*
429 * The amount of log space we should the CIL to aggregate is difficult to size. 429 * The amount of log space we allow the CIL to aggregate is difficult to size.
430 * Whatever we chose we have to make we can get a reservation for the log space 430 * Whatever we choose, we have to make sure we can get a reservation for the
431 * effectively, that it is large enough to capture sufficient relogging to 431 * log space effectively, that it is large enough to capture sufficient
432 * reduce log buffer IO significantly, but it is not too large for the log or 432 * relogging to reduce log buffer IO significantly, but it is not too large for
433 * induces too much latency when writing out through the iclogs. We track both 433 * the log or induces too much latency when writing out through the iclogs. We
434 * space consumed and the number of vectors in the checkpoint context, so we 434 * track both space consumed and the number of vectors in the checkpoint
435 * need to decide which to use for limiting. 435 * context, so we need to decide which to use for limiting.
436 * 436 *
437 * Every log buffer we write out during a push needs a header reserved, which 437 * Every log buffer we write out during a push needs a header reserved, which
438 * is at least one sector and more for v2 logs. Hence we need a reservation of 438 * is at least one sector and more for v2 logs. Hence we need a reservation of
@@ -459,16 +459,21 @@ struct xfs_cil {
459 * checkpoint transaction ticket is specific to the checkpoint context, rather 459 * checkpoint transaction ticket is specific to the checkpoint context, rather
460 * than the CIL itself. 460 * than the CIL itself.
461 * 461 *
462 * With dynamic reservations, we can basically make up arbitrary limits for the 462 * With dynamic reservations, we can effectively make up arbitrary limits for
463 * checkpoint size so long as they don't violate any other size rules. Hence 463 * the checkpoint size so long as they don't violate any other size rules.
464 * the initial maximum size for the checkpoint transaction will be set to a 464 * Recovery imposes a rule that no transaction exceed half the log, so we are
465 * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit 465 * limited by that. Furthermore, the log transaction reservation subsystem
466 * right now based on the latency of writing out a large amount of data through 466 * tries to keep 25% of the log free, so we need to keep below that limit or we
467 * the circular iclog buffers. 467 * risk running out of free log space to start any new transactions.
468 *
469 * In order to keep background CIL push efficient, we will set a lower
470 * threshold at which background pushing is attempted without blocking current
471 * transaction commits. A separate, higher bound defines when CIL pushes are
472 * enforced to ensure we stay within our maximum checkpoint size bounds.
473 * This keeps background pushes cheap to trigger, yet gives us plenty of space for aggregation on large logs.
468 */ 474 */
469 475#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
470#define XLOG_CIL_SPACE_LIMIT(log) \ 476#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
471 (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
472 477
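Plugging a concrete log size into the two new macros makes the soft/hard split obvious. For an assumed 128MB log (illustrative value only):

#include <stdio.h>

/* The new limits as plain arithmetic: background pushes start at 1/8 of
 * the log, commits are throttled at the hard limit of 3/16 of the log. */
int main(void)
{
    long logsize = 128L * 1024 * 1024;   /* assume a 128MB log */
    long soft = logsize >> 3;            /* XLOG_CIL_SPACE_LIMIT */
    long hard = 3 * (logsize >> 4);      /* XLOG_CIL_HARD_SPACE_LIMIT */

    printf("background push at %ldMB, hard limit at %ldMB\n",
           soft >> 20, hard >> 20);      /* prints 16MB and 24MB */
    return 0;
}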
473/* 478/*
474 * The reservation head lsn is not made up of a cycle number and block number. 479 * The reservation head lsn is not made up of a cycle number and block number.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6f3f5fa37acf..966d3f97458c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -107,7 +107,8 @@ xlog_get_bp(
107 nbblks += log->l_sectBBsize; 107 nbblks += log->l_sectBBsize;
108 nbblks = round_up(nbblks, log->l_sectBBsize); 108 nbblks = round_up(nbblks, log->l_sectBBsize);
109 109
110 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 110 return xfs_buf_get_uncached(log->l_mp->m_logdev_targp,
111 BBTOB(nbblks), 0);
111} 112}
112 113
113STATIC void 114STATIC void
@@ -167,7 +168,7 @@ xlog_bread_noalign(
167 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 168 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
168 169
169 xfsbdstrat(log->l_mp, bp); 170 xfsbdstrat(log->l_mp, bp);
170 error = xfs_iowait(bp); 171 error = xfs_buf_iowait(bp);
171 if (error) 172 if (error)
172 xfs_ioerror_alert("xlog_bread", log->l_mp, 173 xfs_ioerror_alert("xlog_bread", log->l_mp,
173 bp, XFS_BUF_ADDR(bp)); 174 bp, XFS_BUF_ADDR(bp));
@@ -321,12 +322,13 @@ xlog_recover_iodone(
321 * this during recovery. One strike! 322 * this during recovery. One strike!
322 */ 323 */
323 xfs_ioerror_alert("xlog_recover_iodone", 324 xfs_ioerror_alert("xlog_recover_iodone",
324 bp->b_mount, bp, XFS_BUF_ADDR(bp)); 325 bp->b_target->bt_mount, bp,
325 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); 326 XFS_BUF_ADDR(bp));
327 xfs_force_shutdown(bp->b_target->bt_mount,
328 SHUTDOWN_META_IO_ERROR);
326 } 329 }
327 bp->b_mount = NULL;
328 XFS_BUF_CLR_IODONE_FUNC(bp); 330 XFS_BUF_CLR_IODONE_FUNC(bp);
329 xfs_biodone(bp); 331 xfs_buf_ioend(bp, 0);
330} 332}
331 333
332/* 334/*
@@ -2275,8 +2277,7 @@ xlog_recover_do_buffer_trans(
2275 XFS_BUF_STALE(bp); 2277 XFS_BUF_STALE(bp);
2276 error = xfs_bwrite(mp, bp); 2278 error = xfs_bwrite(mp, bp);
2277 } else { 2279 } else {
2278 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2280 ASSERT(bp->b_target->bt_mount == mp);
2279 bp->b_mount = mp;
2280 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2281 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2281 xfs_bdwrite(mp, bp); 2282 xfs_bdwrite(mp, bp);
2282 } 2283 }
@@ -2540,8 +2541,7 @@ xlog_recover_do_inode_trans(
2540 } 2541 }
2541 2542
2542write_inode_buffer: 2543write_inode_buffer:
2543 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2544 ASSERT(bp->b_target->bt_mount == mp);
2544 bp->b_mount = mp;
2545 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2545 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2546 xfs_bdwrite(mp, bp); 2546 xfs_bdwrite(mp, bp);
2547error: 2547error:
@@ -2678,8 +2678,7 @@ xlog_recover_do_dquot_trans(
2678 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2678 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2679 2679
2680 ASSERT(dq_f->qlf_size == 2); 2680 ASSERT(dq_f->qlf_size == 2);
2681 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2681 ASSERT(bp->b_target->bt_mount == mp);
2682 bp->b_mount = mp;
2683 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2682 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2684 xfs_bdwrite(mp, bp); 2683 xfs_bdwrite(mp, bp);
2685 2684
@@ -3817,7 +3816,7 @@ xlog_do_recover(
3817 XFS_BUF_READ(bp); 3816 XFS_BUF_READ(bp);
3818 XFS_BUF_UNASYNC(bp); 3817 XFS_BUF_UNASYNC(bp);
3819 xfsbdstrat(log->l_mp, bp); 3818 xfsbdstrat(log->l_mp, bp);
3820 error = xfs_iowait(bp); 3819 error = xfs_buf_iowait(bp);
3821 if (error) { 3820 if (error) {
3822 xfs_ioerror_alert("xlog_do_recover", 3821 xfs_ioerror_alert("xlog_do_recover",
3823 log->l_mp, bp, XFS_BUF_ADDR(bp)); 3822 log->l_mp, bp, XFS_BUF_ADDR(bp));
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aeb9d72ebf6e..b1498ab5a399 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -52,16 +52,11 @@ STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
52 int); 52 int);
53STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, 53STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
54 int); 54 int);
55STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
56 int64_t, int);
57STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); 55STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
58
59#else 56#else
60 57
61#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) 58#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
62#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) 59#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
63#define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0)
64
65#endif 60#endif
66 61
67static const struct { 62static const struct {
@@ -199,6 +194,8 @@ xfs_uuid_unmount(
199 194
200/* 195/*
201 * Reference counting access wrappers to the perag structures. 196 * Reference counting access wrappers to the perag structures.
197 * Because we never free per-ag structures, the only thing we
198 * have to protect against changes is the tree structure itself.
202 */ 199 */
203struct xfs_perag * 200struct xfs_perag *
204xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) 201xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
@@ -206,19 +203,43 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
206 struct xfs_perag *pag; 203 struct xfs_perag *pag;
207 int ref = 0; 204 int ref = 0;
208 205
209 spin_lock(&mp->m_perag_lock); 206 rcu_read_lock();
210 pag = radix_tree_lookup(&mp->m_perag_tree, agno); 207 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
211 if (pag) { 208 if (pag) {
212 ASSERT(atomic_read(&pag->pag_ref) >= 0); 209 ASSERT(atomic_read(&pag->pag_ref) >= 0);
213 /* catch leaks in the positive direction during testing */
214 ASSERT(atomic_read(&pag->pag_ref) < 1000);
215 ref = atomic_inc_return(&pag->pag_ref); 210 ref = atomic_inc_return(&pag->pag_ref);
216 } 211 }
217 spin_unlock(&mp->m_perag_lock); 212 rcu_read_unlock();
218 trace_xfs_perag_get(mp, agno, ref, _RET_IP_); 213 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
219 return pag; 214 return pag;
220} 215}
221 216
217/*
218 * Search from @first to find the next perag with the given tag set.
219 */
220struct xfs_perag *
221xfs_perag_get_tag(
222 struct xfs_mount *mp,
223 xfs_agnumber_t first,
224 int tag)
225{
226 struct xfs_perag *pag;
227 int found;
228 int ref;
229
230 rcu_read_lock();
231 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
232 (void **)&pag, first, 1, tag);
233 if (found <= 0) {
234 rcu_read_unlock();
235 return NULL;
236 }
237 ref = atomic_inc_return(&pag->pag_ref);
238 rcu_read_unlock();
239 trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
240 return pag;
241}
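Both lookup paths follow the same discipline: find the object inside the RCU read-side critical section, take a reference before leaving it, and never touch the object without one. A rough userspace analogue using C11 atomics (a plain array stands in for the radix tree, and comments mark where rcu_read_lock()/rcu_read_unlock() would sit in the kernel):

#include <stdatomic.h>
#include <stdio.h>

/* Userspace model of the RCU lookup pattern: pin the object with a
 * reference count before dropping the read-side guard, so it cannot be
 * freed out from under the caller (illustrative, not kernel code). */
struct perag { atomic_int ref; int agno; };

static struct perag *lookup(struct perag *tree[], int n, int agno)
{
    /* rcu_read_lock() would go here */
    for (int i = 0; i < n; i++) {
        if (tree[i] && tree[i]->agno == agno) {
            atomic_fetch_add(&tree[i]->ref, 1);  /* pin before "unlock" */
            /* rcu_read_unlock() */
            return tree[i];
        }
    }
    /* rcu_read_unlock() */
    return NULL;
}

int main(void)
{
    struct perag ag = { .agno = 7 };
    struct perag *tree[8] = { [0] = &ag };
    struct perag *pag = lookup(tree, 8, 7);

    if (pag)
        printf("found agno=%d ref=%d\n", pag->agno, atomic_load(&pag->ref));
    return 0;
}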
242
222void 243void
223xfs_perag_put(struct xfs_perag *pag) 244xfs_perag_put(struct xfs_perag *pag)
224{ 245{
@@ -229,10 +250,18 @@ xfs_perag_put(struct xfs_perag *pag)
229 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); 250 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
230} 251}
231 252
253STATIC void
254__xfs_free_perag(
255 struct rcu_head *head)
256{
257 struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
258
259 ASSERT(atomic_read(&pag->pag_ref) == 0);
260 kmem_free(pag);
261}
262
232/* 263/*
233 * Free up the resources associated with a mount structure. Assume that 264 * Free up the per-ag resources associated with the mount structure.
234 * the structure was initially zeroed, so we can tell which fields got
235 * initialized.
236 */ 265 */
237STATIC void 266STATIC void
238xfs_free_perag( 267xfs_free_perag(
@@ -244,10 +273,9 @@ xfs_free_perag(
244 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 273 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
245 spin_lock(&mp->m_perag_lock); 274 spin_lock(&mp->m_perag_lock);
246 pag = radix_tree_delete(&mp->m_perag_tree, agno); 275 pag = radix_tree_delete(&mp->m_perag_tree, agno);
247 ASSERT(pag);
248 ASSERT(atomic_read(&pag->pag_ref) == 0);
249 spin_unlock(&mp->m_perag_lock); 276 spin_unlock(&mp->m_perag_lock);
250 kmem_free(pag); 277 ASSERT(pag);
278 call_rcu(&pag->rcu_head, __xfs_free_perag);
251 } 279 }
252} 280}
253 281
@@ -444,7 +472,10 @@ xfs_initialize_perag(
444 pag->pag_agno = index; 472 pag->pag_agno = index;
445 pag->pag_mount = mp; 473 pag->pag_mount = mp;
446 rwlock_init(&pag->pag_ici_lock); 474 rwlock_init(&pag->pag_ici_lock);
475 mutex_init(&pag->pag_ici_reclaim_lock);
447 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 476 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
477 spin_lock_init(&pag->pag_buf_lock);
478 pag->pag_buf_tree = RB_ROOT;
448 479
449 if (radix_tree_preload(GFP_NOFS)) 480 if (radix_tree_preload(GFP_NOFS))
450 goto out_unwind; 481 goto out_unwind;
@@ -639,7 +670,6 @@ int
639xfs_readsb(xfs_mount_t *mp, int flags) 670xfs_readsb(xfs_mount_t *mp, int flags)
640{ 671{
641 unsigned int sector_size; 672 unsigned int sector_size;
642 unsigned int extra_flags;
643 xfs_buf_t *bp; 673 xfs_buf_t *bp;
644 int error; 674 int error;
645 675
@@ -652,28 +682,24 @@ xfs_readsb(xfs_mount_t *mp, int flags)
652 * access to the superblock. 682 * access to the superblock.
653 */ 683 */
654 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); 684 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
655 extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
656 685
657 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), 686reread:
658 extra_flags); 687 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
659 if (!bp || XFS_BUF_ISERROR(bp)) { 688 XFS_SB_DADDR, sector_size, 0);
660 xfs_fs_mount_cmn_err(flags, "SB read failed"); 689 if (!bp) {
661 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; 690 xfs_fs_mount_cmn_err(flags, "SB buffer read failed");
662 goto fail; 691 return EIO;
663 } 692 }
664 ASSERT(XFS_BUF_ISBUSY(bp));
665 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
666 693
667 /* 694 /*
668 * Initialize the mount structure from the superblock. 695 * Initialize the mount structure from the superblock.
669 * But first do some basic consistency checking. 696 * But first do some basic consistency checking.
670 */ 697 */
671 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 698 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
672
673 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 699 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
674 if (error) { 700 if (error) {
675 xfs_fs_mount_cmn_err(flags, "SB validate failed"); 701 xfs_fs_mount_cmn_err(flags, "SB validate failed");
676 goto fail; 702 goto release_buf;
677 } 703 }
678 704
679 /* 705 /*
@@ -684,7 +710,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
684 "device supports only %u byte sectors (not %u)", 710 "device supports only %u byte sectors (not %u)",
685 sector_size, mp->m_sb.sb_sectsize); 711 sector_size, mp->m_sb.sb_sectsize);
686 error = ENOSYS; 712 error = ENOSYS;
687 goto fail; 713 goto release_buf;
688 } 714 }
689 715
690 /* 716 /*
@@ -692,33 +718,20 @@ xfs_readsb(xfs_mount_t *mp, int flags)
692 * re-read the superblock so the buffer is correctly sized. 718 * re-read the superblock so the buffer is correctly sized.
693 */ 719 */
694 if (sector_size < mp->m_sb.sb_sectsize) { 720 if (sector_size < mp->m_sb.sb_sectsize) {
695 XFS_BUF_UNMANAGE(bp);
696 xfs_buf_relse(bp); 721 xfs_buf_relse(bp);
697 sector_size = mp->m_sb.sb_sectsize; 722 sector_size = mp->m_sb.sb_sectsize;
698 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, 723 goto reread;
699 BTOBB(sector_size), extra_flags);
700 if (!bp || XFS_BUF_ISERROR(bp)) {
701 xfs_fs_mount_cmn_err(flags, "SB re-read failed");
702 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
703 goto fail;
704 }
705 ASSERT(XFS_BUF_ISBUSY(bp));
706 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
707 } 724 }
708 725
709 /* Initialize per-cpu counters */ 726 /* Initialize per-cpu counters */
710 xfs_icsb_reinit_counters(mp); 727 xfs_icsb_reinit_counters(mp);
711 728
712 mp->m_sb_bp = bp; 729 mp->m_sb_bp = bp;
713 xfs_buf_relse(bp); 730 xfs_buf_unlock(bp);
714 ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
715 return 0; 731 return 0;
716 732
717 fail: 733release_buf:
718 if (bp) { 734 xfs_buf_relse(bp);
719 XFS_BUF_UNMANAGE(bp);
720 xfs_buf_relse(bp);
721 }
722 return error; 735 return error;
723} 736}
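The reread label above replaces the old duplicated read path: read the superblock at a guessed sector size, and if the superblock reports a larger device sector size, release the buffer and jump back to read it correctly sized. A toy C model of the control flow (the sizes are made up):

#include <stdio.h>

/* Model of the reread loop: guess a sector size, then go around again
 * once the superblock reveals the real one (illustrative only). */
int main(void)
{
    int sector_size = 512;     /* initial guess */
    int sb_sectsize = 4096;    /* value found in the superblock */

reread:
    printf("reading superblock with %d-byte buffer\n", sector_size);
    if (sector_size < sb_sectsize) {
        sector_size = sb_sectsize;   /* buffer was too small: go again */
        goto reread;
    }
    printf("superblock buffer correctly sized at %d bytes\n", sector_size);
    return 0;
}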
724 737
@@ -991,42 +1004,35 @@ xfs_check_sizes(xfs_mount_t *mp)
991{ 1004{
992 xfs_buf_t *bp; 1005 xfs_buf_t *bp;
993 xfs_daddr_t d; 1006 xfs_daddr_t d;
994 int error;
995 1007
996 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1008 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
997 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1009 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
998 cmn_err(CE_WARN, "XFS: size check 1 failed"); 1010 cmn_err(CE_WARN, "XFS: filesystem size mismatch detected");
999 return XFS_ERROR(EFBIG); 1011 return XFS_ERROR(EFBIG);
1000 } 1012 }
1001 error = xfs_read_buf(mp, mp->m_ddev_targp, 1013 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1002 d - XFS_FSS_TO_BB(mp, 1), 1014 d - XFS_FSS_TO_BB(mp, 1),
1003 XFS_FSS_TO_BB(mp, 1), 0, &bp); 1015 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1004 if (!error) { 1016 if (!bp) {
1005 xfs_buf_relse(bp); 1017 cmn_err(CE_WARN, "XFS: last sector read failed");
1006 } else { 1018 return EIO;
1007 cmn_err(CE_WARN, "XFS: size check 2 failed");
1008 if (error == ENOSPC)
1009 error = XFS_ERROR(EFBIG);
1010 return error;
1011 } 1019 }
1020 xfs_buf_relse(bp);
1012 1021
1013 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1022 if (mp->m_logdev_targp != mp->m_ddev_targp) {
1014 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1023 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1015 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1024 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1016 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1025 cmn_err(CE_WARN, "XFS: log size mismatch detected");
1017 return XFS_ERROR(EFBIG); 1026 return XFS_ERROR(EFBIG);
1018 } 1027 }
1019 error = xfs_read_buf(mp, mp->m_logdev_targp, 1028 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1020 d - XFS_FSB_TO_BB(mp, 1), 1029 d - XFS_FSB_TO_BB(mp, 1),
1021 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1030 XFS_FSB_TO_B(mp, 1), 0);
1022 if (!error) { 1031 if (!bp) {
1023 xfs_buf_relse(bp); 1032 cmn_err(CE_WARN, "XFS: log device read failed");
1024 } else { 1033 return EIO;
1025 cmn_err(CE_WARN, "XFS: size check 3 failed");
1026 if (error == ENOSPC)
1027 error = XFS_ERROR(EFBIG);
1028 return error;
1029 } 1034 }
1035 xfs_buf_relse(bp);
1030 } 1036 }
1031 return 0; 1037 return 0;
1032} 1038}
@@ -1601,7 +1607,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1601 XFS_BUF_UNASYNC(sbp); 1607 XFS_BUF_UNASYNC(sbp);
1602 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); 1608 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
1603 xfsbdstrat(mp, sbp); 1609 xfsbdstrat(mp, sbp);
1604 error = xfs_iowait(sbp); 1610 error = xfs_buf_iowait(sbp);
1605 if (error) 1611 if (error)
1606 xfs_ioerror_alert("xfs_unmountfs_writesb", 1612 xfs_ioerror_alert("xfs_unmountfs_writesb",
1607 mp, sbp, XFS_BUF_ADDR(sbp)); 1613 mp, sbp, XFS_BUF_ADDR(sbp));
@@ -1832,135 +1838,72 @@ xfs_mod_incore_sb_unlocked(
1832 */ 1838 */
1833int 1839int
1834xfs_mod_incore_sb( 1840xfs_mod_incore_sb(
1835 xfs_mount_t *mp, 1841 struct xfs_mount *mp,
1836 xfs_sb_field_t field, 1842 xfs_sb_field_t field,
1837 int64_t delta, 1843 int64_t delta,
1838 int rsvd) 1844 int rsvd)
1839{ 1845{
1840 int status; 1846 int status;
1841 1847
1842 /* check for per-cpu counters */
1843 switch (field) {
1844#ifdef HAVE_PERCPU_SB 1848#ifdef HAVE_PERCPU_SB
1845 case XFS_SBS_ICOUNT: 1849 ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
1846 case XFS_SBS_IFREE:
1847 case XFS_SBS_FDBLOCKS:
1848 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1849 status = xfs_icsb_modify_counters(mp, field,
1850 delta, rsvd);
1851 break;
1852 }
1853 /* FALLTHROUGH */
1854#endif 1850#endif
1855 default: 1851 spin_lock(&mp->m_sb_lock);
1856 spin_lock(&mp->m_sb_lock); 1852 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1857 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); 1853 spin_unlock(&mp->m_sb_lock);
1858 spin_unlock(&mp->m_sb_lock);
1859 break;
1860 }
1861 1854
1862 return status; 1855 return status;
1863} 1856}
1864 1857
1865/* 1858/*
1866 * xfs_mod_incore_sb_batch() is used to change more than one field 1859 * Change more than one field in the in-core superblock structure at a time.
1867 * in the in-core superblock structure at a time. This modification
1868 * is protected by a lock internal to this module. The fields and
1869 * changes to those fields are specified in the array of xfs_mod_sb
1870 * structures passed in.
1871 * 1860 *
1872 * Either all of the specified deltas will be applied or none of 1861 * The fields and changes to those fields are specified in the array of
1873 * them will. If any modified field dips below 0, then all modifications 1862 * xfs_mod_sb structures passed in. Either all of the specified deltas
1874 * will be backed out and EINVAL will be returned. 1863 * will be applied or none of them will. If any modified field dips below 0,
1864 * then all modifications will be backed out and EINVAL will be returned.
1865 *
1866 * Note that this function may not be used for the superblock values that
1867 * are tracked with the in-memory per-cpu counters - a direct call to
1868 * xfs_icsb_modify_counters is required for these.
1875 */ 1869 */
1876int 1870int
1877xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) 1871xfs_mod_incore_sb_batch(
1872 struct xfs_mount *mp,
1873 xfs_mod_sb_t *msb,
1874 uint nmsb,
1875 int rsvd)
1878{ 1876{
1879 int status=0; 1877 xfs_mod_sb_t *msbp = &msb[0];
1880 xfs_mod_sb_t *msbp; 1878 int error = 0;
1881 1879
1882 /* 1880 /*
1883 * Loop through the array of mod structures and apply each 1881 * Loop through the array of mod structures and apply each individually.
1884 * individually. If any fail, then back out all those 1882 * If any fail, then back out all those which have already been applied.
1885 * which have already been applied. Do all of this within 1883 * Do all of this within the scope of the m_sb_lock so that all of the
1886 * the scope of the m_sb_lock so that all of the changes will 1884 * changes will be atomic.
1887 * be atomic.
1888 */ 1885 */
1889 spin_lock(&mp->m_sb_lock); 1886 spin_lock(&mp->m_sb_lock);
1890 msbp = &msb[0];
1891 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { 1887 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
1892 /* 1888 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
1893 * Apply the delta at index n. If it fails, break 1889 msbp->msb_field > XFS_SBS_FDBLOCKS);
1894 * from the loop so we'll fall into the undo loop
1895 * below.
1896 */
1897 switch (msbp->msb_field) {
1898#ifdef HAVE_PERCPU_SB
1899 case XFS_SBS_ICOUNT:
1900 case XFS_SBS_IFREE:
1901 case XFS_SBS_FDBLOCKS:
1902 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1903 spin_unlock(&mp->m_sb_lock);
1904 status = xfs_icsb_modify_counters(mp,
1905 msbp->msb_field,
1906 msbp->msb_delta, rsvd);
1907 spin_lock(&mp->m_sb_lock);
1908 break;
1909 }
1910 /* FALLTHROUGH */
1911#endif
1912 default:
1913 status = xfs_mod_incore_sb_unlocked(mp,
1914 msbp->msb_field,
1915 msbp->msb_delta, rsvd);
1916 break;
1917 }
1918 1890
1919 if (status != 0) { 1891 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1920 break; 1892 msbp->msb_delta, rsvd);
1921 } 1893 if (error)
1894 goto unwind;
1922 } 1895 }
1896 spin_unlock(&mp->m_sb_lock);
1897 return 0;
1923 1898
1924 /* 1899unwind:
1925 * If we didn't complete the loop above, then back out 1900 while (--msbp >= msb) {
1926 * any changes made to the superblock. If you add code 1901 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1927 * between the loop above and here, make sure that you 1902 -msbp->msb_delta, rsvd);
1928 * preserve the value of status. Loop back until 1903 ASSERT(error == 0);
1929 * we step below the beginning of the array. Make sure
1930 * we don't touch anything back there.
1931 */
1932 if (status != 0) {
1933 msbp--;
1934 while (msbp >= msb) {
1935 switch (msbp->msb_field) {
1936#ifdef HAVE_PERCPU_SB
1937 case XFS_SBS_ICOUNT:
1938 case XFS_SBS_IFREE:
1939 case XFS_SBS_FDBLOCKS:
1940 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1941 spin_unlock(&mp->m_sb_lock);
1942 status = xfs_icsb_modify_counters(mp,
1943 msbp->msb_field,
1944 -(msbp->msb_delta),
1945 rsvd);
1946 spin_lock(&mp->m_sb_lock);
1947 break;
1948 }
1949 /* FALLTHROUGH */
1950#endif
1951 default:
1952 status = xfs_mod_incore_sb_unlocked(mp,
1953 msbp->msb_field,
1954 -(msbp->msb_delta),
1955 rsvd);
1956 break;
1957 }
1958 ASSERT(status == 0);
1959 msbp--;
1960 }
1961 } 1904 }
1962 spin_unlock(&mp->m_sb_lock); 1905 spin_unlock(&mp->m_sb_lock);
1963 return status; 1906 return error;
1964} 1907}
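The rewritten batch function is an instance of the apply-or-unwind idiom: apply the deltas in order, and on the first failure walk backwards re-applying negated deltas so the counters are left exactly as they were. A self-contained C sketch of the idiom (no locking shown; the field and function names are illustrative, not the kernel's):

#include <stdio.h>

struct mod { int *field; int delta; };

static int apply(int *field, int delta)
{
    if (*field + delta < 0)
        return -1;               /* would go negative: reject */
    *field += delta;
    return 0;
}

/* Apply all deltas or none: unwind the ones already applied on failure. */
static int apply_batch(struct mod *msb, int n)
{
    int i;

    for (i = 0; i < n; i++)
        if (apply(msb[i].field, msb[i].delta))
            goto unwind;
    return 0;

unwind:
    while (--i >= 0)
        apply(msb[i].field, -msb[i].delta);
    return -1;
}

int main(void)
{
    int icount = 10, ifree = 2;
    struct mod batch[] = { { &icount, -5 }, { &ifree, -5 } };

    if (apply_batch(batch, 2))   /* second delta fails, first is undone */
        printf("batch rejected, icount=%d ifree=%d\n", icount, ifree);
    return 0;
}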
1965 1908
1966/* 1909/*
@@ -1998,18 +1941,13 @@ xfs_getsb(
1998 */ 1941 */
1999void 1942void
2000xfs_freesb( 1943xfs_freesb(
2001 xfs_mount_t *mp) 1944 struct xfs_mount *mp)
2002{ 1945{
2003 xfs_buf_t *bp; 1946 struct xfs_buf *bp = mp->m_sb_bp;
2004 1947
2005 /* 1948 xfs_buf_lock(bp);
2006 * Use xfs_getsb() so that the buffer will be locked
2007 * when we call xfs_buf_relse().
2008 */
2009 bp = xfs_getsb(mp, 0);
2010 XFS_BUF_UNMANAGE(bp);
2011 xfs_buf_relse(bp);
2012 mp->m_sb_bp = NULL; 1949 mp->m_sb_bp = NULL;
1950 xfs_buf_relse(bp);
2013} 1951}
2014 1952
2015/* 1953/*
@@ -2496,7 +2434,7 @@ xfs_icsb_balance_counter(
2496 spin_unlock(&mp->m_sb_lock); 2434 spin_unlock(&mp->m_sb_lock);
2497} 2435}
2498 2436
2499STATIC int 2437int
2500xfs_icsb_modify_counters( 2438xfs_icsb_modify_counters(
2501 xfs_mount_t *mp, 2439 xfs_mount_t *mp,
2502 xfs_sb_field_t field, 2440 xfs_sb_field_t field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 622da2179a57..5861b4980740 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,7 +53,6 @@ typedef struct xfs_trans_reservations {
53 53
54#include "xfs_sync.h" 54#include "xfs_sync.h"
55 55
56struct cred;
57struct log; 56struct log;
58struct xfs_mount_args; 57struct xfs_mount_args;
59struct xfs_inode; 58struct xfs_inode;
@@ -91,6 +90,8 @@ extern void xfs_icsb_reinit_counters(struct xfs_mount *);
91extern void xfs_icsb_destroy_counters(struct xfs_mount *); 90extern void xfs_icsb_destroy_counters(struct xfs_mount *);
92extern void xfs_icsb_sync_counters(struct xfs_mount *, int); 91extern void xfs_icsb_sync_counters(struct xfs_mount *, int);
93extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); 92extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
93extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
94 int64_t, int);
94 95
95#else 96#else
96#define xfs_icsb_init_counters(mp) (0) 97#define xfs_icsb_init_counters(mp) (0)
@@ -98,6 +99,8 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
98#define xfs_icsb_reinit_counters(mp) do { } while (0) 99#define xfs_icsb_reinit_counters(mp) do { } while (0)
99#define xfs_icsb_sync_counters(mp, flags) do { } while (0) 100#define xfs_icsb_sync_counters(mp, flags) do { } while (0)
100#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) 101#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
102#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
103 xfs_mod_incore_sb(mp, field, delta, rsvd)
101#endif 104#endif
102 105
103typedef struct xfs_mount { 106typedef struct xfs_mount {
@@ -232,8 +235,6 @@ typedef struct xfs_mount {
232#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ 235#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
233#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred 236#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred
234 * I/O size in stat() */ 237 * I/O size in stat() */
235#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock
236 counters */
237#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams 238#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
238 allocator */ 239 allocator */
239#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ 240#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
@@ -327,6 +328,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
327 * perag get/put wrappers for ref counting 328 * perag get/put wrappers for ref counting
328 */ 329 */
329struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); 330struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
331struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
332 int tag);
330void xfs_perag_put(struct xfs_perag *pag); 333void xfs_perag_put(struct xfs_perag *pag);
331 334
332/* 335/*
diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h
deleted file mode 100644
index 2dec79edb510..000000000000
--- a/fs/xfs/xfs_refcache.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_REFCACHE_H__
19#define __XFS_REFCACHE_H__
20
21#ifdef HAVE_REFCACHE
22/*
23 * Maximum size (in inodes) for the NFS reference cache
24 */
25#define XFS_REFCACHE_SIZE_MAX 512
26
27struct xfs_inode;
28struct xfs_mount;
29
30extern void xfs_refcache_insert(struct xfs_inode *);
31extern void xfs_refcache_purge_ip(struct xfs_inode *);
32extern void xfs_refcache_purge_mp(struct xfs_mount *);
33extern void xfs_refcache_purge_some(struct xfs_mount *);
34extern void xfs_refcache_resize(int);
35extern void xfs_refcache_destroy(void);
36
37extern void xfs_refcache_iunlock(struct xfs_inode *, uint);
38
39#else
40
41#define xfs_refcache_insert(ip) do { } while (0)
42#define xfs_refcache_purge_ip(ip) do { } while (0)
43#define xfs_refcache_purge_mp(mp) do { } while (0)
44#define xfs_refcache_purge_some(mp) do { } while (0)
45#define xfs_refcache_resize(size) do { } while (0)
46#define xfs_refcache_destroy() do { } while (0)
47
48#define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags)
49
50#endif
51
52#endif /* __XFS_REFCACHE_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 8fca957200df..d2af0a8381a6 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -183,7 +183,7 @@ xfs_rename(
183 * tree quota mechanism would be circumvented. 183 * tree quota mechanism would be circumvented.
184 */ 184 */
185 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 185 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
186 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { 186 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
187 error = XFS_ERROR(EXDEV); 187 error = XFS_ERROR(EXDEV);
188 goto error_return; 188 goto error_return;
189 } 189 }
@@ -211,7 +211,9 @@ xfs_rename(
211 goto error_return; 211 goto error_return;
212 if (error) 212 if (error)
213 goto abort_return; 213 goto abort_return;
214 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 214
215 xfs_trans_ichgtime(tp, target_dp,
216 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
215 217
216 if (new_parent && src_is_directory) { 218 if (new_parent && src_is_directory) {
217 error = xfs_bumplink(tp, target_dp); 219 error = xfs_bumplink(tp, target_dp);
@@ -249,7 +251,9 @@ xfs_rename(
249 &first_block, &free_list, spaceres); 251 &first_block, &free_list, spaceres);
250 if (error) 252 if (error)
251 goto abort_return; 253 goto abort_return;
252 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 254
255 xfs_trans_ichgtime(tp, target_dp,
256 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
253 257
254 /* 258 /*
255 * Decrement the link count on the target since the target 259 * Decrement the link count on the target since the target
@@ -292,7 +296,7 @@ xfs_rename(
292 * inode isn't really being changed, but old unix file systems did 296 * inode isn't really being changed, but old unix file systems did
293 * it and some incremental backup programs won't work without it. 297 * it and some incremental backup programs won't work without it.
294 */ 298 */
295 xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); 299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
296 300
297 /* 301 /*
298 * Adjust the link count on src_dp. This is necessary when 302 * Adjust the link count on src_dp. This is necessary when
@@ -315,7 +319,7 @@ xfs_rename(
315 if (error) 319 if (error)
316 goto abort_return; 320 goto abort_return;
317 321
318 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 322 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
319 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 323 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
320 if (new_parent) 324 if (new_parent)
321 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 325 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 891260fea11e..12a191385310 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -39,6 +39,7 @@
39#include "xfs_trans_space.h" 39#include "xfs_trans_space.h"
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_buf.h"
42 43
43 44
44/* 45/*
@@ -1883,13 +1884,13 @@ xfs_growfs_rt(
1883 /* 1884 /*
1884 * Read in the last block of the device, make sure it exists. 1885 * Read in the last block of the device, make sure it exists.
1885 */ 1886 */
1886 error = xfs_read_buf(mp, mp->m_rtdev_targp, 1887 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
1887 XFS_FSB_TO_BB(mp, nrblocks - 1), 1888 XFS_FSB_TO_BB(mp, nrblocks - 1),
1888 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1889 XFS_FSB_TO_B(mp, 1), 0);
1889 if (error) 1890 if (!bp)
1890 return error; 1891 return EIO;
1891 ASSERT(bp);
1892 xfs_buf_relse(bp); 1892 xfs_buf_relse(bp);
1893
1893 /* 1894 /*
1894 * Calculate new parameters. These are the final values to be reached. 1895 * Calculate new parameters. These are the final values to be reached.
1895 */ 1896 */
@@ -2215,7 +2216,6 @@ xfs_rtmount_init(
2215{ 2216{
2216 xfs_buf_t *bp; /* buffer for last block of subvolume */ 2217 xfs_buf_t *bp; /* buffer for last block of subvolume */
2217 xfs_daddr_t d; /* address of last block of subvolume */ 2218 xfs_daddr_t d; /* address of last block of subvolume */
2218 int error; /* error return value */
2219 xfs_sb_t *sbp; /* filesystem superblock copy in mount */ 2219 xfs_sb_t *sbp; /* filesystem superblock copy in mount */
2220 2220
2221 sbp = &mp->m_sb; 2221 sbp = &mp->m_sb;
@@ -2242,15 +2242,12 @@ xfs_rtmount_init(
2242 (unsigned long long) mp->m_sb.sb_rblocks); 2242 (unsigned long long) mp->m_sb.sb_rblocks);
2243 return XFS_ERROR(EFBIG); 2243 return XFS_ERROR(EFBIG);
2244 } 2244 }
2245 error = xfs_read_buf(mp, mp->m_rtdev_targp, 2245 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
2246 d - XFS_FSB_TO_BB(mp, 1), 2246 d - XFS_FSB_TO_BB(mp, 1),
2247 XFS_FSB_TO_BB(mp, 1), 0, &bp); 2247 XFS_FSB_TO_B(mp, 1), 0);
2248 if (error) { 2248 if (!bp) {
2249 cmn_err(CE_WARN, 2249 cmn_err(CE_WARN, "XFS: realtime device size check failed");
2250 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); 2250 return EIO;
2251 if (error == ENOSPC)
2252 return XFS_ERROR(EFBIG);
2253 return error;
2254 } 2251 }
2255 xfs_buf_relse(bp); 2252 xfs_buf_relse(bp);
2256 return 0; 2253 return 0;
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1b017c657494..1eb2ba586814 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -80,10 +80,12 @@ struct xfs_mount;
80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
83 84
84#define XFS_SB_VERSION2_OKREALFBITS \ 85#define XFS_SB_VERSION2_OKREALFBITS \
85 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 86 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
86 XFS_SB_VERSION2_ATTR2BIT) 87 XFS_SB_VERSION2_ATTR2BIT | \
88 XFS_SB_VERSION2_PROJID32BIT)
87#define XFS_SB_VERSION2_OKSASHFBITS \ 89#define XFS_SB_VERSION2_OKSASHFBITS \
88 (0) 90 (0)
89#define XFS_SB_VERSION2_OKREALBITS \ 91#define XFS_SB_VERSION2_OKREALBITS \
@@ -495,6 +497,12 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
495 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; 497 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
496} 498}
497 499
500static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
501{
502 return xfs_sb_version_hasmorebits(sbp) &&
503 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
504}
505
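The new helper follows the usual XFS feature-bit pattern: a version-2 feature counts as present only when the MOREBITS escape bit is set in the version number and the flag appears in sb_features2. A hedged standalone model (only PROJID32BIT's 0x80 comes from the hunk above; the MOREBITS constant and field names below are assumptions for the sketch):

#include <stdio.h>

#define VERSION_MOREBITSBIT  0x00008000   /* assumed value for the sketch */
#define VERSION2_PROJID32BIT 0x00000080   /* from the hunk above */

struct sb { unsigned versionnum; unsigned features2; };

static int has_morebits(const struct sb *sbp)
{
    return (sbp->versionnum & VERSION_MOREBITSBIT) != 0;
}

/* Feature present only if MOREBITS is set AND the bit is in features2. */
static int has_projid32bit(const struct sb *sbp)
{
    return has_morebits(sbp) &&
           (sbp->features2 & VERSION2_PROJID32BIT);
}

int main(void)
{
    struct sb sb = { .versionnum = VERSION_MOREBITSBIT,
                     .features2 = VERSION2_PROJID32BIT };

    printf("projid32bit: %s\n", has_projid32bit(&sb) ? "yes" : "no");
    return 0;
}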
498/* 506/*
499 * end of superblock version macros 507 * end of superblock version macros
500 */ 508 */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 1c47edaea0d2..f6d956b7711e 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -696,7 +696,7 @@ xfs_trans_reserve(
696 * fail if the count would go below zero. 696 * fail if the count would go below zero.
697 */ 697 */
698 if (blocks > 0) { 698 if (blocks > 0) {
699 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, 699 error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
700 -((int64_t)blocks), rsvd); 700 -((int64_t)blocks), rsvd);
701 if (error != 0) { 701 if (error != 0) {
702 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 702 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -767,7 +767,7 @@ undo_log:
767 767
768undo_blocks: 768undo_blocks:
769 if (blocks > 0) { 769 if (blocks > 0) {
770 (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, 770 xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
771 (int64_t)blocks, rsvd); 771 (int64_t)blocks, rsvd);
772 tp->t_blk_res = 0; 772 tp->t_blk_res = 0;
773 } 773 }
@@ -1009,7 +1009,7 @@ void
1009xfs_trans_unreserve_and_mod_sb( 1009xfs_trans_unreserve_and_mod_sb(
1010 xfs_trans_t *tp) 1010 xfs_trans_t *tp)
1011{ 1011{
1012 xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ 1012 xfs_mod_sb_t msb[9]; /* If you add cases, add entries */
1013 xfs_mod_sb_t *msbp; 1013 xfs_mod_sb_t *msbp;
1014 xfs_mount_t *mp = tp->t_mountp; 1014 xfs_mount_t *mp = tp->t_mountp;
1015 /* REFERENCED */ 1015 /* REFERENCED */
@@ -1017,55 +1017,61 @@ xfs_trans_unreserve_and_mod_sb(
1017 int rsvd; 1017 int rsvd;
1018 int64_t blkdelta = 0; 1018 int64_t blkdelta = 0;
1019 int64_t rtxdelta = 0; 1019 int64_t rtxdelta = 0;
1020 int64_t idelta = 0;
1021 int64_t ifreedelta = 0;
1020 1022
1021 msbp = msb; 1023 msbp = msb;
1022 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; 1024 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
1023 1025
1024 /* calculate free blocks delta */ 1026 /* calculate deltas */
1025 if (tp->t_blk_res > 0) 1027 if (tp->t_blk_res > 0)
1026 blkdelta = tp->t_blk_res; 1028 blkdelta = tp->t_blk_res;
1027
1028 if ((tp->t_fdblocks_delta != 0) && 1029 if ((tp->t_fdblocks_delta != 0) &&
1029 (xfs_sb_version_haslazysbcount(&mp->m_sb) || 1030 (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1030 (tp->t_flags & XFS_TRANS_SB_DIRTY))) 1031 (tp->t_flags & XFS_TRANS_SB_DIRTY)))
1031 blkdelta += tp->t_fdblocks_delta; 1032 blkdelta += tp->t_fdblocks_delta;
1032 1033
1033 if (blkdelta != 0) {
1034 msbp->msb_field = XFS_SBS_FDBLOCKS;
1035 msbp->msb_delta = blkdelta;
1036 msbp++;
1037 }
1038
1039 /* calculate free realtime extents delta */
1040 if (tp->t_rtx_res > 0) 1034 if (tp->t_rtx_res > 0)
1041 rtxdelta = tp->t_rtx_res; 1035 rtxdelta = tp->t_rtx_res;
1042
1043 if ((tp->t_frextents_delta != 0) && 1036 if ((tp->t_frextents_delta != 0) &&
1044 (tp->t_flags & XFS_TRANS_SB_DIRTY)) 1037 (tp->t_flags & XFS_TRANS_SB_DIRTY))
1045 rtxdelta += tp->t_frextents_delta; 1038 rtxdelta += tp->t_frextents_delta;
1046 1039
1040 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1041 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
1042 idelta = tp->t_icount_delta;
1043 ifreedelta = tp->t_ifree_delta;
1044 }
1045
1046 /* apply the per-cpu counters */
1047 if (blkdelta) {
1048 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
1049 blkdelta, rsvd);
1050 if (error)
1051 goto out;
1052 }
1053
1054 if (idelta) {
1055 error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
1056 idelta, rsvd);
1057 if (error)
1058 goto out_undo_fdblocks;
1059 }
1060
1061 if (ifreedelta) {
1062 error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
1063 ifreedelta, rsvd);
1064 if (error)
1065 goto out_undo_icount;
1066 }
1067
1068 /* apply remaining deltas */
1047 if (rtxdelta != 0) { 1069 if (rtxdelta != 0) {
1048 msbp->msb_field = XFS_SBS_FREXTENTS; 1070 msbp->msb_field = XFS_SBS_FREXTENTS;
1049 msbp->msb_delta = rtxdelta; 1071 msbp->msb_delta = rtxdelta;
1050 msbp++; 1072 msbp++;
1051 } 1073 }
1052 1074
1053 /* apply remaining deltas */
1054
1055 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1056 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
1057 if (tp->t_icount_delta != 0) {
1058 msbp->msb_field = XFS_SBS_ICOUNT;
1059 msbp->msb_delta = tp->t_icount_delta;
1060 msbp++;
1061 }
1062 if (tp->t_ifree_delta != 0) {
1063 msbp->msb_field = XFS_SBS_IFREE;
1064 msbp->msb_delta = tp->t_ifree_delta;
1065 msbp++;
1066 }
1067 }
1068
1069 if (tp->t_flags & XFS_TRANS_SB_DIRTY) { 1075 if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
1070 if (tp->t_dblocks_delta != 0) { 1076 if (tp->t_dblocks_delta != 0) {
1071 msbp->msb_field = XFS_SBS_DBLOCKS; 1077 msbp->msb_field = XFS_SBS_DBLOCKS;
@@ -1115,8 +1121,24 @@ xfs_trans_unreserve_and_mod_sb(
1115 if (msbp > msb) { 1121 if (msbp > msb) {
1116 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, 1122 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
1117 (uint)(msbp - msb), rsvd); 1123 (uint)(msbp - msb), rsvd);
1118 ASSERT(error == 0); 1124 if (error)
1125 goto out_undo_ifreecount;
1119 } 1126 }
1127
1128 return;
1129
1130out_undo_ifreecount:
1131 if (ifreedelta)
1132 xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
1133out_undo_icount:
1134 if (idelta)
1135 xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
1136out_undo_fdblocks:
1137 if (blkdelta)
1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
1139out:
1140 ASSERT(error == 0);
1141 return;
1120} 1142}
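The error path added here uses cascading undo labels: each successfully applied step has a matching label, and a failure jumps to the label that unwinds exactly the steps applied so far. A compact C illustration of the shape (the step names are placeholders, not the kernel's counters):

#include <stdio.h>

static int step(const char *name, int fail)
{
    if (fail) {
        printf("%s failed\n", name);
        return -1;
    }
    printf("%s applied\n", name);
    return 0;
}

/* Each label undoes everything applied before the failing step. */
static int do_all(int fail_at)
{
    if (step("fdblocks", fail_at == 1))
        goto out;
    if (step("icount", fail_at == 2))
        goto out_undo_fdblocks;
    if (step("ifree", fail_at == 3))
        goto out_undo_icount;
    return 0;

out_undo_icount:
    printf("undo icount\n");
out_undo_fdblocks:
    printf("undo fdblocks\n");
out:
    return -1;
}

int main(void)
{
    return do_all(3) ? 1 : 0;   /* fails at ifree: undoes icount, fdblocks */
}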
1121 1143
1122/* 1144/*
@@ -1389,15 +1411,12 @@ xfs_trans_item_committed(
1389 */ 1411 */
1390STATIC void 1412STATIC void
1391xfs_trans_committed( 1413xfs_trans_committed(
1392 struct xfs_trans *tp, 1414 void *arg,
1393 int abortflag) 1415 int abortflag)
1394{ 1416{
1417 struct xfs_trans *tp = arg;
1395 struct xfs_log_item_desc *lidp, *next; 1418 struct xfs_log_item_desc *lidp, *next;
1396 1419
1397 /* Call the transaction's completion callback if there is one. */
1398 if (tp->t_callback != NULL)
1399 tp->t_callback(tp, tp->t_callarg);
1400
1401 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { 1420 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
1402 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); 1421 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
1403 xfs_trans_free_item_desc(lidp); 1422 xfs_trans_free_item_desc(lidp);
@@ -1525,7 +1544,7 @@ xfs_trans_commit_iclog(
1525 * running in simulation mode (the log is explicitly turned 1544 * running in simulation mode (the log is explicitly turned
1526 * off). 1545 * off).
1527 */ 1546 */
1528 tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed; 1547 tp->t_logcb.cb_func = xfs_trans_committed;
1529 tp->t_logcb.cb_arg = tp; 1548 tp->t_logcb.cb_arg = tp;
1530 1549
1531 /* 1550 /*
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c13c0f97b494..246286b77a86 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -399,8 +399,6 @@ typedef struct xfs_trans {
399 * transaction. */ 399 * transaction. */
400 struct xfs_mount *t_mountp; /* ptr to fs mount struct */ 400 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
401 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ 401 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
402 xfs_trans_callback_t t_callback; /* transaction callback */
403 void *t_callarg; /* callback arg */
404 unsigned int t_flags; /* misc flags */ 402 unsigned int t_flags; /* misc flags */
405 int64_t t_icount_delta; /* superblock icount change */ 403 int64_t t_icount_delta; /* superblock icount change */
406 int64_t t_ifree_delta; /* superblock ifree change */ 404 int64_t t_ifree_delta; /* superblock ifree change */
@@ -473,6 +471,7 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
473void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
474int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, 472int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
475 xfs_ino_t , uint, uint, struct xfs_inode **); 473 xfs_ino_t , uint, uint, struct xfs_inode **);
474void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
476void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 475void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
477void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); 476void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
478void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); 477void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 90af025e6839..c47918c302a5 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -336,7 +336,7 @@ xfs_trans_read_buf(
336 ASSERT(!XFS_BUF_ISASYNC(bp)); 336 ASSERT(!XFS_BUF_ISASYNC(bp));
337 XFS_BUF_READ(bp); 337 XFS_BUF_READ(bp);
338 xfsbdstrat(tp->t_mountp, bp); 338 xfsbdstrat(tp->t_mountp, bp);
339 error = xfs_iowait(bp); 339 error = xfs_buf_iowait(bp);
340 if (error) { 340 if (error) {
341 xfs_ioerror_alert("xfs_trans_read_buf", mp, 341 xfs_ioerror_alert("xfs_trans_read_buf", mp,
342 bp, blkno); 342 bp, blkno);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index cdc53a1050c5..ccb34532768b 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -118,6 +118,36 @@ xfs_trans_ijoin_ref(
118} 118}
119 119
120/* 120/*
121 * Transactional inode timestamp update. Requires the inode to be locked and
122 * joined to the transaction supplied. Relies on the transaction subsystem to
123 * track dirty state and update/writeback the inode accordingly.
124 */
125void
126xfs_trans_ichgtime(
127 struct xfs_trans *tp,
128 struct xfs_inode *ip,
129 int flags)
130{
131 struct inode *inode = VFS_I(ip);
132 timespec_t tv;
133
134 ASSERT(tp);
135 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
136 ASSERT(ip->i_transp == tp);
137
138 tv = current_fs_time(inode->i_sb);
139
140 if ((flags & XFS_ICHGTIME_MOD) &&
141 !timespec_equal(&inode->i_mtime, &tv)) {
142 inode->i_mtime = tv;
143 }
144 if ((flags & XFS_ICHGTIME_CHG) &&
145 !timespec_equal(&inode->i_ctime, &tv)) {
146 inode->i_ctime = tv;
147 }
148}
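xfs_trans_ichgtime() samples the timestamp once and then updates mtime and/or ctime only when the corresponding flag is set and the value actually changes, avoiding needless inode dirtying. A userspace approximation, with clock_gettime() standing in for current_fs_time() and arbitrary flag values:

#include <stdio.h>
#include <time.h>

#define ICHGTIME_MOD 1   /* assumed flag values for the sketch */
#define ICHGTIME_CHG 2

struct inode_times { struct timespec mtime, ctime; };

static int ts_equal(const struct timespec *a, const struct timespec *b)
{
    return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
}

/* Take "now" once, update only the requested and actually-changed fields. */
static void ichgtime(struct inode_times *it, int flags)
{
    struct timespec tv;

    clock_gettime(CLOCK_REALTIME, &tv);
    if ((flags & ICHGTIME_MOD) && !ts_equal(&it->mtime, &tv))
        it->mtime = tv;
    if ((flags & ICHGTIME_CHG) && !ts_equal(&it->ctime, &tv))
        it->ctime = tv;
}

int main(void)
{
    struct inode_times it = { {0, 0}, {0, 0} };

    ichgtime(&it, ICHGTIME_MOD | ICHGTIME_CHG);
    printf("mtime=%ld ctime=%ld\n", (long)it.mtime.tv_sec,
           (long)it.ctime.tv_sec);
    return 0;
}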
149
150/*
121 * This is called to mark the fields indicated in fieldmask as needing 151 * This is called to mark the fields indicated in fieldmask as needing
122 * to be logged when the transaction is committed. The inode must 152 * to be logged when the transaction is committed. The inode must
123 * already be associated with the given transaction. 153 * already be associated with the given transaction.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 320775295e32..26d1867d8156 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */
73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ 73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ 74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
77
78typedef __uint32_t xlog_tid_t; /* transaction ID type */ 76typedef __uint32_t xlog_tid_t; /* transaction ID type */
79 77
80/* 78/*
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index b7d5769d2df0..8b32d1a4c5a1 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -56,7 +56,6 @@ xfs_dir_ialloc(
56 mode_t mode, 56 mode_t mode,
57 xfs_nlink_t nlink, 57 xfs_nlink_t nlink,
58 xfs_dev_t rdev, 58 xfs_dev_t rdev,
59 cred_t *credp,
60 prid_t prid, /* project id */ 59 prid_t prid, /* project id */
61 int okalloc, /* ok to allocate new space */ 60 int okalloc, /* ok to allocate new space */
62 xfs_inode_t **ipp, /* pointer to inode; it will be 61 xfs_inode_t **ipp, /* pointer to inode; it will be
@@ -93,7 +92,7 @@ xfs_dir_ialloc(
93 * transaction commit so that no other process can steal 92 * transaction commit so that no other process can steal
94 * the inode(s) that we've just allocated. 93 * the inode(s) that we've just allocated.
95 */ 94 */
96 code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc, 95 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
97 &ialloc_context, &call_again, &ip); 96 &ialloc_context, &call_again, &ip);
98 97
99 /* 98 /*
@@ -197,7 +196,7 @@ xfs_dir_ialloc(
197 * other allocations in this allocation group, 196 * other allocations in this allocation group,
198 * this call should always succeed. 197 * this call should always succeed.
199 */ 198 */
200 code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, 199 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
201 okalloc, &ialloc_context, &call_again, &ip); 200 okalloc, &ialloc_context, &call_again, &ip);
202 201
203 /* 202 /*
@@ -235,7 +234,7 @@ xfs_droplink(
235{ 234{
236 int error; 235 int error;
237 236
238 xfs_ichgtime(ip, XFS_ICHGTIME_CHG); 237 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
239 238
240 ASSERT (ip->i_d.di_nlink > 0); 239 ASSERT (ip->i_d.di_nlink > 0);
241 ip->i_d.di_nlink--; 240 ip->i_d.di_nlink--;
@@ -299,7 +298,7 @@ xfs_bumplink(
299{ 298{
300 if (ip->i_d.di_nlink >= XFS_MAXLINK) 299 if (ip->i_d.di_nlink >= XFS_MAXLINK)
301 return XFS_ERROR(EMLINK); 300 return XFS_ERROR(EMLINK);
302 xfs_ichgtime(ip, XFS_ICHGTIME_CHG); 301 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
303 302
304 ASSERT(ip->i_d.di_nlink > 0); 303 ASSERT(ip->i_d.di_nlink > 0);
305 ip->i_d.di_nlink++; 304 ip->i_d.di_nlink++;
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f55b9678264f..456fca314933 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -19,8 +19,7 @@
19#define __XFS_UTILS_H__ 19#define __XFS_UTILS_H__
20 20
21extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, 21extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
22 xfs_dev_t, cred_t *, prid_t, int, 22 xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
23 xfs_inode_t **, int *);
24extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); 23extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
25extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); 24extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
26extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); 25extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4c7c7bfb2b2f..8e4a63c4151a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -114,7 +114,7 @@ xfs_setattr(
 	 */
 	ASSERT(udqp == NULL);
 	ASSERT(gdqp == NULL);
-	code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
+	code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
 				 qflags, &udqp, &gdqp);
 	if (code)
 		return code;
@@ -184,8 +184,11 @@ xfs_setattr(
 		    ip->i_size == 0 && ip->i_d.di_nextents == 0) {
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 			lock_flags &= ~XFS_ILOCK_EXCL;
-			if (mask & ATTR_CTIME)
-				xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+			if (mask & ATTR_CTIME) {
+				inode->i_mtime = inode->i_ctime =
+						current_fs_time(inode->i_sb);
+				xfs_mark_inode_dirty_sync(ip);
+			}
 			code = 0;
 			goto error_return;
 		}
@@ -1253,8 +1256,7 @@ xfs_create(
 	struct xfs_name		*name,
 	mode_t			mode,
 	xfs_dev_t		rdev,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
+	xfs_inode_t		**ipp)
 {
 	int			is_dir = S_ISDIR(mode);
 	struct xfs_mount	*mp = dp->i_mount;
@@ -1266,7 +1268,7 @@ xfs_create(
 	boolean_t		unlock_dp_on_error = B_FALSE;
 	uint			cancel_flags;
 	int			committed;
-	xfs_prid_t		prid;
+	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
 	uint			resblks;
@@ -1279,9 +1281,9 @@ xfs_create(
 		return XFS_ERROR(EIO);
 
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
+		prid = xfs_get_projid(dp);
 	else
-		prid = dfltprid;
+		prid = XFS_PROJID_DEFAULT;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
@@ -1360,7 +1362,7 @@ xfs_create(
 	 * entry pointing to them, but a directory also the "." entry
 	 * pointing to itself.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
+	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
 			prid, resblks > 0, &ip, &committed);
 	if (error) {
 		if (error == ENOSPC)
@@ -1391,7 +1393,7 @@ xfs_create(
 		ASSERT(error != ENOSPC);
 		goto out_trans_abort;
 	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
 	if (is_dir) {
@@ -1742,7 +1744,7 @@ xfs_remove(
 		ASSERT(error != ENOENT);
 		goto out_bmap_cancel;
 	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 	if (is_dir) {
 		/*
@@ -1880,7 +1882,7 @@ xfs_link(
 	 * the tree quota mechanism could be circumvented.
 	 */
 	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-		     (tdp->i_d.di_projid != sip->i_d.di_projid))) {
+		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
 		error = XFS_ERROR(EXDEV);
 		goto error_return;
 	}
@@ -1895,7 +1897,7 @@ xfs_link(
 					&first_block, &free_list, resblks);
 	if (error)
 		goto abort_return;
-	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
 
 	error = xfs_bumplink(tp, sip);
@@ -1933,8 +1935,7 @@ xfs_symlink(
 	struct xfs_name		*link_name,
 	const char		*target_path,
 	mode_t			mode,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
+	xfs_inode_t		**ipp)
 {
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_trans_t		*tp;
@@ -1955,7 +1956,7 @@ xfs_symlink(
 	int			byte_cnt;
 	int			n;
 	xfs_buf_t		*bp;
-	xfs_prid_t		prid;
+	prid_t			prid;
 	struct xfs_dquot	*udqp, *gdqp;
 	uint			resblks;
 
@@ -1978,9 +1979,9 @@ xfs_symlink(
 
 	udqp = gdqp = NULL;
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
+		prid = xfs_get_projid(dp);
 	else
-		prid = (xfs_prid_t)dfltprid;
+		prid = XFS_PROJID_DEFAULT;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
@@ -2046,8 +2047,8 @@ xfs_symlink(
 	/*
 	 * Allocate an inode for the symlink.
 	 */
-	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
-			1, 0, credp, prid, resblks > 0, &ip, NULL);
+	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
+			       prid, resblks > 0, &ip, NULL);
 	if (error) {
 		if (error == ENOSPC)
 			goto error_return;
@@ -2129,7 +2130,7 @@ xfs_symlink(
 					&first_block, &free_list, resblks);
 	if (error)
 		goto error1;
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
 	/*
@@ -2272,7 +2273,7 @@ xfs_alloc_file_space(
 	count = len;
 	imapp = &imaps[0];
 	nimaps = 1;
-	bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
+	bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
 	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
@@ -2431,9 +2432,9 @@ xfs_zero_remaining_bytes(
 	if (endoff > ip->i_size)
 		endoff = ip->i_size;
 
-	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
-				XFS_IS_REALTIME_INODE(ip) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp);
+	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
+					mp->m_rtdev_targp : mp->m_ddev_targp,
+				  mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
 	if (!bp)
 		return XFS_ERROR(ENOMEM);
 
@@ -2459,7 +2460,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 		xfsbdstrat(mp, bp);
-		error = xfs_iowait(bp);
+		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
 					  mp, bp, XFS_BUF_ADDR(bp));
@@ -2472,7 +2473,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
 		xfsbdstrat(mp, bp);
-		error = xfs_iowait(bp);
+		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
 					  mp, bp, XFS_BUF_ADDR(bp));
@@ -2711,6 +2712,7 @@ xfs_change_file_space(
 	xfs_off_t	llen;
 	xfs_trans_t	*tp;
 	struct iattr	iattr;
+	int		prealloc_type;
 
 	if (!S_ISREG(ip->i_d.di_mode))
 		return XFS_ERROR(EINVAL);
@@ -2753,12 +2755,17 @@ xfs_change_file_space(
 	 * size to be changed.
 	 */
 	setprealloc = clrprealloc = 0;
+	prealloc_type = XFS_BMAPI_PREALLOC;
 
 	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+		prealloc_type |= XFS_BMAPI_CONVERT;
+		xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
+		/* FALLTHRU */
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
 		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-						1, attr_flags);
+						prealloc_type, attr_flags);
 		if (error)
 			return error;
 		setprealloc = 1;
@@ -2827,7 +2834,7 @@ xfs_change_file_space(
 		if (ip->i_d.di_mode & S_IXGRP)
 			ip->i_d.di_mode &= ~S_ISGID;
 
-		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	}
 	if (setprealloc)
 		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index d8dfa8d0dadd..f6702927eee4 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -2,7 +2,6 @@
 #define _XFS_VNODEOPS_H 1
 
 struct attrlist_cursor_kern;
-struct cred;
 struct file;
 struct iattr;
 struct inode;
@@ -26,7 +25,7 @@ int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode **ipp, struct xfs_name *ci_name);
 int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
-		xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp);
+		xfs_dev_t rdev, struct xfs_inode **ipp);
 int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
@@ -34,8 +33,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
 		xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
-		const char *target_path, mode_t mode, struct xfs_inode **ipp,
-		cred_t *credp);
+		const char *target_path, mode_t mode, struct xfs_inode **ipp);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 		xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);