author    Jiri Kosina <jkosina@suse.cz>  2010-05-03 09:25:35 -0400
committer Jiri Kosina <jkosina@suse.cz>  2010-05-03 09:25:35 -0400
commit    d6d53cbc6b10d28646fb6184d1069f336ec76dc4 (patch)
tree      1c844b3ce8bd430becbbb74875898b08d9f89bb5 /fs
parent    0b5adf92ec793c665b0de63ac146d190a921c391 (diff)
parent    6a740aa4f47b9f29bad5292cf51f008f3edad9b1 (diff)
Merge branch 'hid-suspend' into picolcd
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/cache.c | 1
-rw-r--r--  fs/9p/fid.c | 13
-rw-r--r--  fs/9p/v9fs.c | 22
-rw-r--r--  fs/9p/v9fs.h | 1
-rw-r--r--  fs/9p/vfs_dentry.c | 1
-rw-r--r--  fs/9p/vfs_dir.c | 3
-rw-r--r--  fs/9p/vfs_inode.c | 10
-rw-r--r--  fs/9p/vfs_super.c | 4
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/Makefile | 1
-rw-r--r--  fs/adfs/super.c | 1
-rw-r--r--  fs/affs/bitmap.c | 1
-rw-r--r--  fs/affs/inode.c | 1
-rw-r--r--  fs/affs/super.c | 1
-rw-r--r--  fs/afs/cache.c | 1
-rw-r--r--  fs/afs/cmservice.c | 1
-rw-r--r--  fs/afs/dir.c | 1
-rw-r--r--  fs/afs/file.c | 2
-rw-r--r--  fs/afs/fsclient.c | 1
-rw-r--r--  fs/afs/inode.c | 1
-rw-r--r--  fs/afs/mntpt.c | 2
-rw-r--r--  fs/afs/rxrpc.c | 1
-rw-r--r--  fs/afs/security.c | 5
-rw-r--r--  fs/afs/vlclient.c | 1
-rw-r--r--  fs/afs/vlocation.c | 1
-rw-r--r--  fs/afs/vnode.c | 1
-rw-r--r--  fs/anon_inodes.c | 1
-rw-r--r--  fs/autofs/root.c | 1
-rw-r--r--  fs/autofs4/dev-ioctl.c | 1
-rw-r--r--  fs/autofs4/root.c | 1
-rw-r--r--  fs/befs/datastream.c | 1
-rw-r--r--  fs/binfmt_aout.c | 16
-rw-r--r--  fs/binfmt_elf_fdpic.c | 2
-rw-r--r--  fs/binfmt_em86.c | 1
-rw-r--r--  fs/binfmt_script.c | 1
-rw-r--r--  fs/bio-integrity.c | 1
-rw-r--r--  fs/btrfs/acl.c | 1
-rw-r--r--  fs/btrfs/async-thread.c | 1
-rw-r--r--  fs/btrfs/compression.c | 23
-rw-r--r--  fs/btrfs/ctree.c | 5
-rw-r--r--  fs/btrfs/ctree.h | 2
-rw-r--r--  fs/btrfs/delayed-ref.c | 1
-rw-r--r--  fs/btrfs/disk-io.c | 13
-rw-r--r--  fs/btrfs/extent-tree.c | 24
-rw-r--r--  fs/btrfs/extent_io.c | 16
-rw-r--r--  fs/btrfs/extent_map.c | 1
-rw-r--r--  fs/btrfs/file-item.c | 1
-rw-r--r--  fs/btrfs/file.c | 1
-rw-r--r--  fs/btrfs/free-space-cache.c | 1
-rw-r--r--  fs/btrfs/inode.c | 60
-rw-r--r--  fs/btrfs/ioctl.c | 8
-rw-r--r--  fs/btrfs/locking.c | 1
-rw-r--r--  fs/btrfs/ordered-data.c | 7
-rw-r--r--  fs/btrfs/ref-cache.c | 1
-rw-r--r--  fs/btrfs/relocation.c | 1
-rw-r--r--  fs/btrfs/super.c | 24
-rw-r--r--  fs/btrfs/transaction.c | 113
-rw-r--r--  fs/btrfs/tree-log.c | 1
-rw-r--r--  fs/btrfs/volumes.c | 11
-rw-r--r--  fs/cachefiles/interface.c | 1
-rw-r--r--  fs/cachefiles/namei.c | 1
-rw-r--r--  fs/cachefiles/rdwr.c | 1
-rw-r--r--  fs/cachefiles/xattr.c | 1
-rw-r--r--  fs/ceph/Kconfig | 27
-rw-r--r--  fs/ceph/Makefile | 39
-rw-r--r--  fs/ceph/README | 20
-rw-r--r--  fs/ceph/addr.c | 1195
-rw-r--r--  fs/ceph/armor.c | 99
-rw-r--r--  fs/ceph/auth.c | 258
-rw-r--r--  fs/ceph/auth.h | 84
-rw-r--r--  fs/ceph/auth_none.c | 122
-rw-r--r--  fs/ceph/auth_none.h | 28
-rw-r--r--  fs/ceph/auth_x.c | 680
-rw-r--r--  fs/ceph/auth_x.h | 49
-rw-r--r--  fs/ceph/auth_x_protocol.h | 90
-rw-r--r--  fs/ceph/buffer.c | 81
-rw-r--r--  fs/ceph/buffer.h | 39
-rw-r--r--  fs/ceph/caps.c | 2933
-rw-r--r--  fs/ceph/ceph_debug.h | 37
-rw-r--r--  fs/ceph/ceph_frag.c | 21
-rw-r--r--  fs/ceph/ceph_frag.h | 109
-rw-r--r--  fs/ceph/ceph_fs.c | 74
-rw-r--r--  fs/ceph/ceph_fs.h | 650
-rw-r--r--  fs/ceph/ceph_hash.c | 118
-rw-r--r--  fs/ceph/ceph_hash.h | 13
-rw-r--r--  fs/ceph/ceph_strings.c | 176
-rw-r--r--  fs/ceph/crush/crush.c | 151
-rw-r--r--  fs/ceph/crush/crush.h | 180
-rw-r--r--  fs/ceph/crush/hash.c | 149
-rw-r--r--  fs/ceph/crush/hash.h | 17
-rw-r--r--  fs/ceph/crush/mapper.c | 596
-rw-r--r--  fs/ceph/crush/mapper.h | 20
-rw-r--r--  fs/ceph/crypto.c | 409
-rw-r--r--  fs/ceph/crypto.h | 48
-rw-r--r--  fs/ceph/debugfs.c | 484
-rw-r--r--  fs/ceph/decode.h | 194
-rw-r--r--  fs/ceph/dir.c | 1223
-rw-r--r--  fs/ceph/export.c | 224
-rw-r--r--  fs/ceph/file.c | 938
-rw-r--r--  fs/ceph/inode.c | 1766
-rw-r--r--  fs/ceph/ioctl.c | 160
-rw-r--r--  fs/ceph/ioctl.h | 40
-rw-r--r--  fs/ceph/mds_client.c | 3043
-rw-r--r--  fs/ceph/mds_client.h | 335
-rw-r--r--  fs/ceph/mdsmap.c | 174
-rw-r--r--  fs/ceph/mdsmap.h | 54
-rw-r--r--  fs/ceph/messenger.c | 2240
-rw-r--r--  fs/ceph/messenger.h | 255
-rw-r--r--  fs/ceph/mon_client.c | 835
-rw-r--r--  fs/ceph/mon_client.h | 119
-rw-r--r--  fs/ceph/msgpool.c | 186
-rw-r--r--  fs/ceph/msgpool.h | 27
-rw-r--r--  fs/ceph/msgr.h | 158
-rw-r--r--  fs/ceph/osd_client.c | 1550
-rw-r--r--  fs/ceph/osd_client.h | 166
-rw-r--r--  fs/ceph/osdmap.c | 1024
-rw-r--r--  fs/ceph/osdmap.h | 125
-rw-r--r--  fs/ceph/pagelist.c | 55
-rw-r--r--  fs/ceph/pagelist.h | 54
-rw-r--r--  fs/ceph/rados.h | 374
-rw-r--r--  fs/ceph/snap.c | 907
-rw-r--r--  fs/ceph/super.c | 1031
-rw-r--r--  fs/ceph/super.h | 902
-rw-r--r--  fs/ceph/types.h | 29
-rw-r--r--  fs/ceph/xattr.c | 845
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 1
-rw-r--r--  fs/cifs/cifs_spnego.c | 1
-rw-r--r--  fs/cifs/cifs_unicode.c | 1
-rw-r--r--  fs/cifs/cifsacl.c | 1
-rw-r--r--  fs/cifs/cifsencrypt.c | 1
-rw-r--r--  fs/cifs/cifsfs.c | 3
-rw-r--r--  fs/cifs/cifsfs.h | 3
-rw-r--r--  fs/cifs/cifsglob.h | 2
-rw-r--r--  fs/cifs/cifsproto.h | 6
-rw-r--r--  fs/cifs/cifssmb.c | 136
-rw-r--r--  fs/cifs/connect.c | 1
-rw-r--r--  fs/cifs/dir.c | 2
-rw-r--r--  fs/cifs/dns_resolve.c | 1
-rw-r--r--  fs/cifs/file.c | 8
-rw-r--r--  fs/cifs/inode.c | 298
-rw-r--r--  fs/cifs/link.c | 1
-rw-r--r--  fs/cifs/readdir.c | 1
-rw-r--r--  fs/cifs/sess.c | 1
-rw-r--r--  fs/cifs/smbencrypt.c | 1
-rw-r--r--  fs/cifs/transport.c | 1
-rw-r--r--  fs/cifs/xattr.c | 1
-rw-r--r--  fs/coda/dir.c | 1
-rw-r--r--  fs/coda/file.c | 1
-rw-r--r--  fs/coda/inode.c | 1
-rw-r--r--  fs/coda/upcall.c | 1
-rw-r--r--  fs/compat.c | 1
-rw-r--r--  fs/compat_ioctl.c | 2
-rw-r--r--  fs/configfs/inode.c | 1
-rw-r--r--  fs/configfs/mount.c | 1
-rw-r--r--  fs/configfs/symlink.c | 1
-rw-r--r--  fs/debugfs/inode.c | 1
-rw-r--r--  fs/devpts/inode.c | 1
-rw-r--r--  fs/dlm/config.c | 1
-rw-r--r--  fs/dlm/debug_fs.c | 1
-rw-r--r--  fs/dlm/lock.c | 1
-rw-r--r--  fs/dlm/lowcomms.c | 1
-rw-r--r--  fs/dlm/netlink.c | 1
-rw-r--r--  fs/dlm/plock.c | 1
-rw-r--r--  fs/dlm/user.c | 1
-rw-r--r--  fs/ecryptfs/crypto.c | 1
-rw-r--r--  fs/ecryptfs/dentry.c | 1
-rw-r--r--  fs/ecryptfs/file.c | 1
-rw-r--r--  fs/ecryptfs/inode.c | 1
-rw-r--r--  fs/ecryptfs/keystore.c | 1
-rw-r--r--  fs/ecryptfs/kthread.c | 1
-rw-r--r--  fs/ecryptfs/main.c | 1
-rw-r--r--  fs/ecryptfs/messaging.c | 1
-rw-r--r--  fs/ecryptfs/miscdev.c | 1
-rw-r--r--  fs/ecryptfs/mmap.c | 1
-rw-r--r--  fs/ecryptfs/super.c | 1
-rw-r--r--  fs/eventfd.c | 1
-rw-r--r--  fs/exofs/inode.c | 1
-rw-r--r--  fs/exofs/ios.c | 1
-rw-r--r--  fs/exofs/super.c | 1
-rw-r--r--  fs/ext2/balloc.c | 1
-rw-r--r--  fs/ext2/xattr_security.c | 1
-rw-r--r--  fs/ext3/balloc.c | 1
-rw-r--r--  fs/ext3/ialloc.c | 4
-rw-r--r--  fs/ext3/inode.c | 2
-rw-r--r--  fs/ext3/xattr_security.c | 1
-rw-r--r--  fs/ext4/block_validity.c | 1
-rw-r--r--  fs/ext4/ialloc.c | 4
-rw-r--r--  fs/ext4/inode.c | 5
-rw-r--r--  fs/ext4/mballoc.c | 1
-rw-r--r--  fs/ext4/migrate.c | 1
-rw-r--r--  fs/ext4/move_extent.c | 1
-rw-r--r--  fs/ext4/super.c | 29
-rw-r--r--  fs/ext4/xattr_security.c | 1
-rw-r--r--  fs/fat/cache.c | 1
-rw-r--r--  fs/fat/namei_vfat.c | 6
-rw-r--r--  fs/fifo.c | 1
-rw-r--r--  fs/filesystems.c | 2
-rw-r--r--  fs/freevxfs/vxfs_subr.c | 1
-rw-r--r--  fs/fs-writeback.c | 1
-rw-r--r--  fs/fscache/object-list.c | 1
-rw-r--r--  fs/fscache/object.c | 6
-rw-r--r--  fs/fscache/operation.c | 5
-rw-r--r--  fs/fscache/page.c | 2
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/generic_acl.c | 1
-rw-r--r--  fs/gfs2/bmap.c | 1
-rw-r--r--  fs/gfs2/dentry.c | 1
-rw-r--r--  fs/gfs2/export.c | 1
-rw-r--r--  fs/gfs2/glops.c | 1
-rw-r--r--  fs/gfs2/lock_dlm.c | 1
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/sys.c | 1
-rw-r--r--  fs/gfs2/util.c | 1
-rw-r--r--  fs/hfs/bnode.c | 1
-rw-r--r--  fs/hfs/btree.c | 1
-rw-r--r--  fs/hfs/mdb.c | 1
-rw-r--r--  fs/hfs/super.c | 1
-rw-r--r--  fs/hfsplus/options.c | 1
-rw-r--r--  fs/hostfs/hostfs_kern.c | 1
-rw-r--r--  fs/hpfs/buffer.c | 1
-rw-r--r--  fs/hpfs/dir.c | 1
-rw-r--r--  fs/hpfs/inode.c | 1
-rw-r--r--  fs/hpfs/super.c | 1
-rw-r--r--  fs/ioprio.c | 1
-rw-r--r--  fs/isofs/dir.c | 1
-rw-r--r--  fs/isofs/namei.c | 1
-rw-r--r--  fs/jbd/commit.c | 1
-rw-r--r--  fs/jbd/recovery.c | 1
-rw-r--r--  fs/jbd2/recovery.c | 1
-rw-r--r--  fs/jffs2/compr_lzo.c | 1
-rw-r--r--  fs/jffs2/compr_zlib.c | 1
-rw-r--r--  fs/jffs2/debug.c | 1
-rw-r--r--  fs/jffs2/file.c | 1
-rw-r--r--  fs/jffs2/nodelist.c | 1
-rw-r--r--  fs/jffs2/nodemgmt.c | 1
-rw-r--r--  fs/jffs2/symlink.c | 1
-rw-r--r--  fs/jffs2/write.c | 1
-rw-r--r--  fs/jfs/acl.c | 1
-rw-r--r--  fs/jfs/jfs_dmap.c | 1
-rw-r--r--  fs/jfs/jfs_dtree.c | 1
-rw-r--r--  fs/jfs/jfs_imap.c | 1
-rw-r--r--  fs/jfs/jfs_logmgr.c | 1
-rw-r--r--  fs/jfs/jfs_metapage.c | 1
-rw-r--r--  fs/jfs/jfs_unicode.h | 1
-rw-r--r--  fs/jfs/super.c | 1
-rw-r--r--  fs/jfs/xattr.c | 1
-rw-r--r--  fs/libfs.c | 1
-rw-r--r--  fs/lockd/clntlock.c | 1
-rw-r--r--  fs/lockd/clntproc.c | 1
-rw-r--r--  fs/lockd/mon.c | 1
-rw-r--r--  fs/lockd/svc.c | 1
-rw-r--r--  fs/lockd/svc4proc.c | 1
-rw-r--r--  fs/lockd/svclock.c | 1
-rw-r--r--  fs/lockd/svcproc.c | 1
-rw-r--r--  fs/lockd/svcsubs.c | 1
-rw-r--r--  fs/logfs/dev_bdev.c | 10
-rw-r--r--  fs/logfs/dir.c | 6
-rw-r--r--  fs/logfs/gc.c | 1
-rw-r--r--  fs/logfs/inode.c | 1
-rw-r--r--  fs/logfs/journal.c | 8
-rw-r--r--  fs/logfs/logfs.h | 1
-rw-r--r--  fs/logfs/readwrite.c | 14
-rw-r--r--  fs/logfs/segment.c | 55
-rw-r--r--  fs/logfs/super.c | 16
-rw-r--r--  fs/minix/itree_v1.c | 1
-rw-r--r--  fs/mpage.c | 1
-rw-r--r--  fs/namei.c | 18
-rw-r--r--  fs/ncpfs/dir.c | 1
-rw-r--r--  fs/ncpfs/file.c | 1
-rw-r--r--  fs/ncpfs/ioctl.c | 1
-rw-r--r--  fs/ncpfs/mmap.c | 2
-rw-r--r--  fs/ncpfs/sock.c | 1
-rw-r--r--  fs/ncpfs/symlink.c | 1
-rw-r--r--  fs/nfs/cache_lib.c | 1
-rw-r--r--  fs/nfs/callback_proc.c | 1
-rw-r--r--  fs/nfs/callback_xdr.c | 1
-rw-r--r--  fs/nfs/client.c | 1
-rw-r--r--  fs/nfs/delegation.c | 1
-rw-r--r--  fs/nfs/direct.c | 1
-rw-r--r--  fs/nfs/dns_resolve.c | 1
-rw-r--r--  fs/nfs/file.c | 5
-rw-r--r--  fs/nfs/fscache.c | 1
-rw-r--r--  fs/nfs/inode.c | 1
-rw-r--r--  fs/nfs/namespace.c | 1
-rw-r--r--  fs/nfs/nfs2xdr.c | 1
-rw-r--r--  fs/nfs/nfs3acl.c | 1
-rw-r--r--  fs/nfs/nfs3proc.c | 1
-rw-r--r--  fs/nfs/nfs3xdr.c | 1
-rw-r--r--  fs/nfs/nfs4namespace.c | 1
-rw-r--r--  fs/nfs/nfs4proc.c | 1
-rw-r--r--  fs/nfs/nfs4xdr.c | 3
-rw-r--r--  fs/nfs/proc.c | 1
-rw-r--r--  fs/nfs/super.c | 1
-rw-r--r--  fs/nfs/symlink.c | 1
-rw-r--r--  fs/nfs_common/nfsacl.c | 1
-rw-r--r--  fs/nfsd/export.c | 1
-rw-r--r--  fs/nfsd/nfs2acl.c | 1
-rw-r--r--  fs/nfsd/nfs3acl.c | 1
-rw-r--r--  fs/nfsd/nfs4acl.c | 1
-rw-r--r--  fs/nfsd/nfs4callback.c | 1
-rw-r--r--  fs/nfsd/nfs4idmap.c | 1
-rw-r--r--  fs/nfsd/nfs4proc.c | 1
-rw-r--r--  fs/nfsd/nfs4recover.c | 1
-rw-r--r--  fs/nfsd/nfs4state.c | 1
-rw-r--r--  fs/nfsd/nfs4xdr.c | 1
-rw-r--r--  fs/nfsd/nfscache.c | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 1
-rw-r--r--  fs/nfsd/vfs.c | 1
-rw-r--r--  fs/nilfs2/alloc.c | 1
-rw-r--r--  fs/nilfs2/btnode.c | 1
-rw-r--r--  fs/nilfs2/gcinode.c | 1
-rw-r--r--  fs/nilfs2/inode.c | 1
-rw-r--r--  fs/nilfs2/ioctl.c | 1
-rw-r--r--  fs/nilfs2/mdt.c | 1
-rw-r--r--  fs/nilfs2/page.c | 1
-rw-r--r--  fs/nilfs2/recovery.c | 1
-rw-r--r--  fs/nilfs2/segbuf.c | 9
-rw-r--r--  fs/nilfs2/segment.c | 16
-rw-r--r--  fs/nilfs2/the_nilfs.h | 1
-rw-r--r--  fs/notify/fsnotify.c | 1
-rw-r--r--  fs/notify/inode_mark.c | 1
-rw-r--r--  fs/ntfs/aops.c | 1
-rw-r--r--  fs/ntfs/attrib.c | 1
-rw-r--r--  fs/ntfs/compress.c | 1
-rw-r--r--  fs/ntfs/dir.c | 1
-rw-r--r--  fs/ntfs/file.c | 1
-rw-r--r--  fs/ntfs/index.c | 2
-rw-r--r--  fs/ntfs/mft.c | 1
-rw-r--r--  fs/ntfs/namei.c | 1
-rw-r--r--  fs/ocfs2/acl.c | 78
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 1
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 1
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 1
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 1
-rw-r--r--  fs/ocfs2/extent_map.c | 1
-rw-r--r--  fs/ocfs2/heartbeat.c | 1
-rw-r--r--  fs/ocfs2/inode.c | 16
-rw-r--r--  fs/ocfs2/localalloc.c | 10
-rw-r--r--  fs/ocfs2/locks.c | 2
-rw-r--r--  fs/ocfs2/mmap.c | 1
-rw-r--r--  fs/ocfs2/namei.c | 28
-rw-r--r--  fs/ocfs2/ocfs2.h | 14
-rw-r--r--  fs/ocfs2/quota_global.c | 1
-rw-r--r--  fs/ocfs2/quota_local.c | 1
-rw-r--r--  fs/ocfs2/refcounttree.c | 2
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 1
-rw-r--r--  fs/ocfs2/stack_user.c | 1
-rw-r--r--  fs/ocfs2/suballoc.c | 129
-rw-r--r--  fs/ocfs2/suballoc.h | 5
-rw-r--r--  fs/ocfs2/sysfile.c | 1
-rw-r--r--  fs/ocfs2/xattr.c | 12
-rw-r--r--  fs/omfs/inode.c | 1
-rw-r--r--  fs/open.c | 2
-rw-r--r--  fs/partitions/check.c | 1
-rw-r--r--  fs/partitions/efi.c | 1
-rw-r--r--  fs/partitions/msdos.c | 85
-rw-r--r--  fs/proc/array.c | 1
-rw-r--r--  fs/proc/base.c | 6
-rw-r--r--  fs/proc/generic.c | 1
-rw-r--r--  fs/proc/inode.c | 1
-rw-r--r--  fs/proc/kcore.c | 3
-rw-r--r--  fs/proc/nommu.c | 1
-rw-r--r--  fs/proc/proc_devtree.c | 1
-rw-r--r--  fs/proc/proc_net.c | 1
-rw-r--r--  fs/proc/stat.c | 1
-rw-r--r--  fs/proc/task_mmu.c | 88
-rw-r--r--  fs/proc/task_nommu.c | 1
-rw-r--r--  fs/proc/vmcore.c | 1
-rw-r--r--  fs/quota/netlink.c | 1
-rw-r--r--  fs/ramfs/file-nommu.c | 1
-rw-r--r--  fs/ramfs/inode.c | 1
-rw-r--r--  fs/read_write.c | 2
-rw-r--r--  fs/reiserfs/dir.c | 1
-rw-r--r--  fs/reiserfs/fix_node.c | 1
-rw-r--r--  fs/reiserfs/inode.c | 1
-rw-r--r--  fs/reiserfs/journal.c | 16
-rw-r--r--  fs/reiserfs/namei.c | 1
-rw-r--r--  fs/reiserfs/super.c | 11
-rw-r--r--  fs/reiserfs/xattr.c | 1
-rw-r--r--  fs/reiserfs/xattr_acl.c | 1
-rw-r--r--  fs/reiserfs/xattr_security.c | 3
-rw-r--r--  fs/signalfd.c | 1
-rw-r--r--  fs/smbfs/file.c | 1
-rw-r--r--  fs/smbfs/smbiod.c | 1
-rw-r--r--  fs/smbfs/symlink.c | 1
-rw-r--r--  fs/splice.c | 1
-rw-r--r--  fs/squashfs/symlink.c | 1
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 1
-rw-r--r--  fs/sync.c | 1
-rw-r--r--  fs/sysfs/inode.c | 1
-rw-r--r--  fs/sysfs/mount.c | 1
-rw-r--r--  fs/sysfs/symlink.c | 1
-rw-r--r--  fs/timerfd.c | 1
-rw-r--r--  fs/ubifs/commit.c | 1
-rw-r--r--  fs/ubifs/debug.c | 1
-rw-r--r--  fs/ubifs/file.c | 1
-rw-r--r--  fs/ubifs/gc.c | 1
-rw-r--r--  fs/ubifs/io.c | 1
-rw-r--r--  fs/ubifs/lpt.c | 1
-rw-r--r--  fs/ubifs/lpt_commit.c | 1
-rw-r--r--  fs/ubifs/recovery.c | 1
-rw-r--r--  fs/ubifs/sb.c | 1
-rw-r--r--  fs/ubifs/tnc.c | 1
-rw-r--r--  fs/ubifs/ubifs.h | 1
-rw-r--r--  fs/ubifs/xattr.c | 1
-rw-r--r--  fs/udf/partition.c | 1
-rw-r--r--  fs/udf/symlink.c | 1
-rw-r--r--  fs/udf/unicode.c | 1
-rw-r--r--  fs/xattr_acl.c | 2
-rw-r--r--  fs/xfs/linux-2.6/kmem.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 1
422 files changed, 29239 insertions(+), 670 deletions(-)
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index e777961939f3..0dbe0d139ac2 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -22,6 +22,7 @@
 
 #include <linux/jiffies.h>
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 08b2eb157048..7317b39b2815 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -110,7 +111,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
 	int i, n, l, clone, any, access;
 	u32 uid;
-	struct p9_fid *fid;
+	struct p9_fid *fid, *old_fid = NULL;
 	struct dentry *d, *ds;
 	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
@@ -183,10 +184,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 		l = min(n - i, P9_MAXWELEM);
 		fid = p9_client_walk(fid, l, &wnames[i], clone);
 		if (IS_ERR(fid)) {
+			if (old_fid) {
+				/*
+				 * If we fail, clunk fid which are mapping
+				 * to path component and not the last component
+				 * of the path.
+				 */
+				p9_client_clunk(old_fid);
+			}
 			kfree(wnames);
 			return fid;
 		}
-
+		old_fid = fid;
 		i += l;
 		clone = 0;
 	}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6c7f6a251115..5c5bc8480070 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
@@ -241,7 +242,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER;
+	v9ses->flags = V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
@@ -262,8 +263,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		goto error;
 	}
 
-	if (!p9_is_proto_dotu(v9ses->clnt))
-		v9ses->flags &= ~V9FS_PROTO_2000U;
+	if (p9_is_proto_dotl(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000L;
+	else if (p9_is_proto_dotu(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000U;
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
@@ -340,6 +343,19 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 	p9_client_disconnect(v9ses->clnt);
 }
 
+/**
+ * v9fs_session_begin_cancel - Begin terminate of a session
+ * @v9ses: session to terminate
+ *
+ * After this call we don't allow any request other than clunk.
+ */
+
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
+{
+	P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+	p9_client_begin_disconnect(v9ses->clnt);
+}
+
 extern int v9fs_error_init(void);
 
 static struct kobject *v9fs_kobj;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 6b801d1ddf4b..a0a8d3dd1361 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -108,6 +108,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 				 char *);
 void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
 
 #define V9FS_MAGIC 0x01021997
 
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d74325295b1e..cbf4e50f3933 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index d8a3afe4ff72..0adfd64dfcee 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -32,6 +32,7 @@
 #include <linux/sched.h>
 #include <linux/inet.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -130,6 +131,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	rdir = (struct p9_rdir *) fid->rdir;
 
 	err = mutex_lock_interruptible(&rdir->mutex);
+	if (err)
+		return err;
 	while (err == 0) {
 		if (rdir->tail == rdir->head) {
 			err = v9fs_file_readn(filp, rdir->buf, NULL,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5fe45d692c9f..f2434fc9d2c4 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -431,6 +432,7 @@ error:
 
 static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
+	int retval;
 	struct inode *file_inode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *v9fid;
@@ -444,7 +446,10 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 	if (IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
 
-	return p9_client_remove(v9fid);
+	retval = p9_client_remove(v9fid);
+	if (!retval)
+		drop_nlink(file_inode);
+	return retval;
 }
 
 static int
@@ -656,6 +661,9 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
 		dir, dentry->d_name.name, dentry, nameidata);
 
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
 	dfid = v9fs_fid_lookup(dentry->d_parent);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 69357c0d9899..491108bd6e0d 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,7 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -193,6 +194,7 @@ static void v9fs_kill_super(struct super_block *s)
 
 	kill_anon_super(s);
 
+	v9fs_session_cancel(v9ses);
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
 	s->s_fs_info = NULL;
@@ -205,7 +207,7 @@ v9fs_umount_begin(struct super_block *sb)
 	struct v9fs_session_info *v9ses;
 
 	v9ses = sb->s_fs_info;
-	v9fs_session_cancel(v9ses);
+	v9fs_session_begin_cancel(v9ses);
 }
 
 static const struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index 7405f071be67..5f85b5947613 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config NFS_COMMON
 
 source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index c3633aa46911..97f340f14ba2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS)	+= btrfs/
 obj-$(CONFIG_GFS2_FS)	+= gfs2/
 obj-$(CONFIG_EXOFS_FS)	+= exofs/
+obj-$(CONFIG_CEPH_FS)	+= ceph/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 6910a98bd73c..4a3af7075c1d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/parser.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/statfs.h>
 #include "adfs.h"
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index 8306d53307ed..3e262711ae06 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -7,6 +7,7 @@
  * block allocation, deallocation, calculation of free space.
  */
 
+#include <linux/slab.h>
 #include "affs.h"
 
 /* This is, of course, shamelessly stolen from fs/minix */
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index c9744d771d98..f4b2a4ee4f91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -10,6 +10,7 @@
  * (C) 1991 Linus Torvalds - minix filesystem
  */
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include "affs.h"
 
 extern const struct inode_operations affs_symlink_inode_operations;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d41e9673cd97..16a3e4765f68 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index e2b1d3f16519..0fb315dd4d2a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/slab.h>
 #include <linux/sched.h>
 #include "internal.h"
 
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index eb765489164f..a3bcec75c54a 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/ip.h>
 #include "internal.h"
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 88067f36e5e7..adc1cb771b57 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 39b301662f22..0df9bc2b724d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -12,10 +12,10 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 023b95b0d9d7..4bd0218473a9 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/circ_buf.h>
 #include "internal.h"
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c048f0658751..d00b312e3110 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -16,7 +16,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/sched.h>
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5ffb570cd3a8..5e813a816ce4 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -12,11 +12,11 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bde3f19c0995..67cf810e0fd6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/slab.h>
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include <rxrpc/packet.h>
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
 	if (!permits)
 		goto out_unlock;
 
-	memcpy(permits->permits, xpermits->permits,
-	       count * sizeof(struct afs_permit));
+	if (xpermits)
+		memcpy(permits->permits, xpermits->permits,
+		       count * sizeof(struct afs_permit));
 
 	_debug("key %x access %x",
 	       key_serial(key), vnode->status.caller_access);
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 36c1306e09e0..340afd0cd182 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 6e689208def2..9ac260d1361d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 2f05c4fc2a70..25cf4c3f4ff7 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 2de009565d8e..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -12,7 +12,6 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 4a1401cea0a1..8713c7cfbc79 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -13,6 +13,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/smp_lock.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index c8a80dffb455..d29b7f6df862 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -22,6 +22,7 @@
 #include <linux/magic.h>
 #include <linux/dcache.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include "autofs_i.h"
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index a015b49891df..109a6c606d92 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -15,6 +15,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include "autofs_i.h"
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index e3287d0d1a58..59096b5e0fc7 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -11,7 +11,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/string.h>
 
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 15d80bb35d6f..f96eff04e11a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -20,11 +20,11 @@
 #include <linux/fcntl.h>
 #include <linux/ptrace.h>
 #include <linux/user.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/personality.h>
 #include <linux/init.h>
 #include <linux/coredump.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -75,14 +75,16 @@ static int aout_core_dump(struct coredump_params *cprm)
 	struct file *file = cprm->file;
 	mm_segment_t fs;
 	int has_dumped = 0;
-	unsigned long dump_start, dump_size;
+	void __user *dump_start;
+	int dump_size;
 	struct user dump;
 #ifdef __alpha__
-# define START_DATA(u)	(u.start_data)
+# define START_DATA(u)	((void __user *)u.start_data)
 #else
-# define START_DATA(u)	((u.u_tsize << PAGE_SHIFT) + u.start_code)
+# define START_DATA(u)	((void __user *)((u.u_tsize << PAGE_SHIFT) + \
+				 u.start_code))
 #endif
-# define START_STACK(u)	(u.start_stack)
+# define START_STACK(u)	((void __user *)u.start_stack)
 
 	fs = get_fs();
 	set_fs(KERNEL_DS);
@@ -104,9 +106,9 @@ static int aout_core_dump(struct coredump_params *cprm)
 
 /* make sure we actually have a data and stack area to dump */
 	set_fs(USER_DS);
-	if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
 		dump.u_dsize = 0;
-	if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
 		dump.u_ssize = 0;
 
 	set_fs(KERNEL_DS);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c32d00a6690..7ab23e006e4c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1590,7 +1590,7 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
 	struct vm_area_struct *vma;
 	size_t size = 0;
 
-	for (vma = current->mm->mmap; vma; vma->vm_next)
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next)
 		if (maydump(vma, mm_flags))
 			size += vma->vm_end - vma->vm_start;
 	return size;
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 32fb00b52cd0..b8e8b0acf9bd 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -11,7 +11,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/elf.h>
 #include <linux/init.h>
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 08343505e184..aca9d55afb22 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -8,7 +8,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/init.h>
 #include <linux/file.h>
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index a16f29e888cd..612a5c38d3c1 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -24,6 +24,7 @@
 #include <linux/mempool.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
+#include <linux/slab.h>
 
 struct integrity_slab {
 	struct kmem_cache *slab;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6df6d6ed74fd..6ef7b26724ec 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include "ctree.h"
 #include "btrfs_inode.h"
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..462859a30141 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 28b92a7218ab..396039b3a8a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,7 +31,7 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
-#include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -445,7 +445,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	unsigned long nr_pages = 0;
 	struct extent_map *em;
 	struct address_space *mapping = inode->i_mapping;
-	struct pagevec pvec;
 	struct extent_map_tree *em_tree;
 	struct extent_io_tree *tree;
 	u64 end;
@@ -461,7 +460,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 
 	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
 
-	pagevec_init(&pvec, 0);
 	while (last_offset < compressed_end) {
 		page_index = last_offset >> PAGE_CACHE_SHIFT;
 
@@ -478,26 +476,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			goto next;
 		}
 
-		page = alloc_page(mapping_gfp_mask(mapping) & ~__GFP_FS);
+		page = __page_cache_alloc(mapping_gfp_mask(mapping) &
+								~__GFP_FS);
 		if (!page)
 			break;
 
-		page->index = page_index;
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (add_to_page_cache(page, mapping,
-						page->index, GFP_NOFS)) {
+		if (add_to_page_cache_lru(page, mapping, page_index,
+								GFP_NOFS)) {
 			page_cache_release(page);
 			goto next;
 		}
 
-		/* open coding of lru_cache_add, also not exported */
-		page_cache_get(page);
-		if (!pagevec_add(&pvec, page))
-			__pagevec_lru_add_file(&pvec);
-
 		end = last_offset + PAGE_CACHE_SIZE - 1;
 		/*
 		 * at this point, we have a locked page in the page cache
@@ -551,8 +540,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 next:
 		last_offset += PAGE_CACHE_SIZE;
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	return 0;
 }
 
558 545
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..6795a713b205 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -3040,6 +3041,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 	if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
 		goto err;
 
+	/* the leaf has changed, it now has room.  return now */
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
+		goto err;
+
 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0af2e3868573..746a7248678e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
@@ -834,7 +835,6 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
 	unsigned long mount_opt;
-	u64 max_extent;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..902ce507c4e3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/sort.h>
 #include "ctree.h"
 #include "delayed-ref.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 11d0ad30e203..e7b8f2c89ccb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/crc32c.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -1634,7 +1635,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	fs_info->sb = sb;
-	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
 
@@ -1922,7 +1922,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	csum_root->track_dirty = 1;
 
-	btrfs_read_block_groups(extent_root);
+	ret = btrfs_read_block_groups(extent_root);
+	if (ret) {
+		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
+		goto fail_block_groups;
+	}
 
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
@@ -1932,7 +1936,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (IS_ERR(fs_info->cleaner_kthread))
-		goto fail_csum_root;
+		goto fail_block_groups;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
@@ -2020,7 +2024,8 @@ fail_cleaner:
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
-fail_csum_root:
+fail_block_groups:
+	btrfs_free_block_groups(fs_info);
 	free_extent_buffer(csum_root->node);
 	free_extent_buffer(csum_root->commit_root);
 fail_dev_root:
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1727b26fb194..9e23ffea7f54 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -2676,6 +2677,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 	INIT_LIST_HEAD(&found->block_groups);
 	init_rwsem(&found->groups_sem);
+	init_waitqueue_head(&found->flush_wait);
+	init_waitqueue_head(&found->allocate_wait);
 	spin_lock_init(&found->lock);
 	found->flags = flags;
 	found->total_bytes = total_bytes;
@@ -2846,7 +2849,7 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
 	}
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
-	BTRFS_I(inode)->reserved_extents--;
+	BTRFS_I(inode)->reserved_extents -= num_items;
 	BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
 
 	if (meta_sinfo->bytes_delalloc < num_bytes) {
@@ -2944,12 +2947,10 @@ static void flush_delalloc(struct btrfs_root *root,
 
 	spin_lock(&info->lock);
 
-	if (!info->flushing) {
+	if (!info->flushing)
 		info->flushing = 1;
-		init_waitqueue_head(&info->flush_wait);
-	} else {
+	else
 		wait = true;
-	}
 
 	spin_unlock(&info->lock);
 
@@ -3011,7 +3012,6 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
 	if (!info->allocating_chunk) {
 		info->force_alloc = 1;
 		info->allocating_chunk = 1;
-		init_waitqueue_head(&info->allocate_wait);
 	} else {
 		wait = true;
 	}
@@ -3111,7 +3111,7 @@ again:
 		return -ENOSPC;
 	}
 
-	BTRFS_I(inode)->reserved_extents++;
+	BTRFS_I(inode)->reserved_extents += num_items;
 	check_force_delalloc(meta_sinfo);
 	spin_unlock(&meta_sinfo->lock);
 
@@ -4170,6 +4170,10 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	ins->offset = 0;
 
 	space_info = __find_space_info(root->fs_info, data);
+	if (!space_info) {
+		printk(KERN_ERR "No space info for %d\n", data);
+		return -ENOSPC;
+	}
 
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
@@ -5205,6 +5209,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	next = btrfs_find_tree_block(root, bytenr, blocksize);
 	if (!next) {
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+		if (!next)
+			return -ENOMEM;
 		reada = 1;
 	}
 	btrfs_tree_lock(next);
@@ -5417,7 +5423,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 			if (ret > 0) {
 				path->slots[level]++;
 				continue;
-			}
+			} else if (ret < 0)
+				return ret;
 			level = wc->level;
 		}
 		return 0;
@@ -7369,7 +7376,6 @@ static int find_first_block_group(struct btrfs_root *root,
 		}
 		path->slots[0]++;
 	}
-	ret = -ENOENT;
 out:
 	return ret;
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c99121ac5d6b..d2d03684fab2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
 #include <linux/slab.h>
 #include <linux/bio.h>
 #include <linux/mm.h>
-#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/page-flags.h>
 #include <linux/module.h>
@@ -2679,33 +2678,20 @@ int extent_readpages(struct extent_io_tree *tree,
 {
 	struct bio *bio = NULL;
 	unsigned page_idx;
-	struct pagevec pvec;
 	unsigned long bio_flags = 0;
 
-	pagevec_init(&pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_entry(pages->prev, struct page, lru);
 
 		prefetchw(&page->flags);
 		list_del(&page->lru);
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (!add_to_page_cache(page, mapping,
+		if (!add_to_page_cache_lru(page, mapping,
 					page->index, GFP_KERNEL)) {
-
-			/* open coding of lru_cache_add, also not exported */
-			page_cache_get(page);
-			if (!pagevec_add(&pvec, page))
-				__pagevec_lru_add_file(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
 						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
 		submit_one_bio(READ, bio, 0, bio_flags);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 28d87ba60ce8..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
 #include <linux/err.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..54a255065aa3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include "ctree.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ee3323c7fc1c..29ff749ff4ca 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index dd831ed31eea..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
 
 #include <linux/pagemap.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/math64.h>
 #include "ctree.h"
 #include "free-space-cache.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 02bb099845fd..2bfdc641d4e3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/falloc.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -796,7 +797,7 @@ static noinline int cow_file_range(struct inode *inode,
 	while (disk_num_bytes > 0) {
 		unsigned long op;
 
-		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
+		cur_alloc_size = disk_num_bytes;
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
@@ -1227,30 +1228,9 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 static int btrfs_split_extent_hook(struct inode *inode,
 				   struct extent_state *orig, u64 split)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 size;
-
 	if (!(orig->state & EXTENT_DELALLOC))
 		return 0;
 
-	size = orig->end - orig->start + 1;
-	if (size > root->fs_info->max_extent) {
-		u64 num_extents;
-		u64 new_size;
-
-		new_size = orig->end - split + 1;
-		num_extents = div64_u64(size + root->fs_info->max_extent - 1,
-					root->fs_info->max_extent);
-
-		/*
-		 * if we break a large extent up then leave oustanding_extents
-		 * be, since we've already accounted for the large extent.
-		 */
-		if (div64_u64(new_size + root->fs_info->max_extent - 1,
-			      root->fs_info->max_extent) < num_extents)
-			return 0;
-	}
-
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	BTRFS_I(inode)->outstanding_extents++;
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1268,38 +1248,10 @@ static int btrfs_merge_extent_hook(struct inode *inode,
 				   struct extent_state *new,
 				   struct extent_state *other)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 new_size, old_size;
-	u64 num_extents;
-
 	/* not delalloc, ignore it */
 	if (!(other->state & EXTENT_DELALLOC))
 		return 0;
 
-	old_size = other->end - other->start + 1;
-	if (new->start < other->start)
-		new_size = other->end - new->start + 1;
-	else
-		new_size = new->end - other->start + 1;
-
-	/* we're not bigger than the max, unreserve the space and go */
-	if (new_size <= root->fs_info->max_extent) {
-		spin_lock(&BTRFS_I(inode)->accounting_lock);
-		BTRFS_I(inode)->outstanding_extents--;
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		return 0;
-	}
-
-	/*
-	 * If we grew by another max_extent, just return, we want to keep that
-	 * reserved amount.
-	 */
-	num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
-				root->fs_info->max_extent);
-	if (div64_u64(new_size + root->fs_info->max_extent - 1,
-		      root->fs_info->max_extent) > num_extents)
-		return 0;
-
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	BTRFS_I(inode)->outstanding_extents--;
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1328,6 +1280,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		BTRFS_I(inode)->outstanding_extents++;
 		spin_unlock(&BTRFS_I(inode)->accounting_lock);
 		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
@@ -1356,6 +1309,7 @@ static int btrfs_clear_bit_hook(struct inode *inode,
 
 	if (bits & EXTENT_DO_ACCOUNTING) {
 		spin_lock(&BTRFS_I(inode)->accounting_lock);
+		WARN_ON(!BTRFS_I(inode)->outstanding_extents);
 		BTRFS_I(inode)->outstanding_extents--;
 		spin_unlock(&BTRFS_I(inode)->accounting_lock);
 		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -5384,7 +5338,6 @@ free:
 void btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-
 	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
 		generic_delete_inode(inode);
 	else
@@ -5788,18 +5741,15 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
-	u64 alloc_size;
 	u64 cur_offset = start;
 	u64 num_bytes = end - start;
 	int ret = 0;
 	u64 i_size;
 
 	while (num_bytes > 0) {
-		alloc_size = min(num_bytes, root->fs_info->max_extent);
-
 		trans = btrfs_start_transaction(root, 1);
 
-		ret = btrfs_reserve_extent(trans, root, alloc_size,
+		ret = btrfs_reserve_extent(trans, root, num_bytes,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
 		if (ret) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2845c6ceecd2..e84ef60ffe35 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
 #include <linux/security.h>
 #include <linux/xattr.h>
 #include <linux/vmalloc.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -48,7 +49,6 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "locking.h"
-#include "ctree.h"
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -511,7 +511,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 		unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
 
-		if (!em)
+		if (IS_ERR(em))
 			return 0;
 	}
 
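
The fix above matters because btrfs_get_extent() reports failure as an encoded error pointer, not NULL, so the old `!em` test never fired and the error pointer could be dereferenced later. A minimal sketch of the ERR_PTR/IS_ERR idiom from <linux/err.h> (the surrounding locals are assumed):

	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
	if (IS_ERR(em))			/* catches ERR_PTR(-ENOMEM) etc. */
		return 0;		/* treat the range as not defraggable */
	/* PTR_ERR(em) would recover the errno if we wanted to report it */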
@@ -1212,6 +1212,9 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 		return -EPERM;
 
 	args = kmalloc(sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
 	if (copy_from_user(args, argp, sizeof(*args))) {
 		kfree(args);
 		return -EFAULT;
@@ -1375,6 +1378,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 				   sizeof(*range))) {
 			ret = -EFAULT;
 			kfree(range);
+			goto out;
 		}
 		/* compression requires us to start the IO */
 		if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 #include <linux/sched.h>
-#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 #include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a8ffecd0b491..a127c0ebb2dc 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/writeback.h>
@@ -303,6 +302,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
 				       struct btrfs_ordered_extent *entry)
 {
 	struct btrfs_ordered_inode_tree *tree;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct rb_node *node;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
@@ -312,12 +312,13 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
 
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
+	WARN_ON(!BTRFS_I(inode)->outstanding_extents);
 	BTRFS_I(inode)->outstanding_extents--;
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 	btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
 					      inode, 1);
 
-	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+	spin_lock(&root->fs_info->ordered_extent_lock);
 	list_del_init(&entry->root_extent_list);
 
 	/*
@@ -329,7 +330,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
 	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
 		list_del_init(&BTRFS_I(inode)->ordered_operations);
 	}
-	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+	spin_unlock(&root->fs_info->ordered_extent_lock);
 
 	return 0;
 }
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/sort.h>
 #include "ctree.h"
 #include "ref-cache.h"
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0b23942cbc0d..e558dd941ded 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/rbtree.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9ac612e6ca60..1866dff0538e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
 #include <linux/namei.h>
 #include <linux/miscdevice.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -64,10 +65,9 @@ static void btrfs_put_super(struct super_block *sb)
 
 enum {
 	Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
-	Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start,
-	Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool,
-	Opt_noacl, Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio,
-	Opt_flushoncommit,
+	Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
+	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
+	Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
 	Opt_discard, Opt_err,
 };
 
@@ -79,7 +79,6 @@ static match_table_t tokens = {
 	{Opt_nodatasum, "nodatasum"},
 	{Opt_nodatacow, "nodatacow"},
 	{Opt_nobarrier, "nobarrier"},
-	{Opt_max_extent, "max_extent=%s"},
 	{Opt_max_inline, "max_inline=%s"},
 	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_thread_pool, "thread_pool=%d"},
@@ -188,18 +187,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 					  info->thread_pool_size);
 			}
 			break;
-		case Opt_max_extent:
-			num = match_strdup(&args[0]);
-			if (num) {
-				info->max_extent = memparse(num, NULL);
-				kfree(num);
-
-				info->max_extent = max_t(u64,
-					info->max_extent, root->sectorsize);
-				printk(KERN_INFO "btrfs: max_extent at %llu\n",
-				       (unsigned long long)info->max_extent);
-			}
-			break;
 		case Opt_max_inline:
 			num = match_strdup(&args[0]);
 			if (num) {
@@ -529,9 +516,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nodatacow");
 	if (btrfs_test_opt(root, NOBARRIER))
 		seq_puts(seq, ",nobarrier");
-	if (info->max_extent != (u64)-1)
-		seq_printf(seq, ",max_extent=%llu",
-			   (unsigned long long)info->max_extent);
 	if (info->max_inline != 8192 * 1024)
 		seq_printf(seq, ",max_inline=%llu",
 			   (unsigned long long)info->max_inline);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2d654c1c794d..2cb116099b90 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/pagemap.h>
@@ -147,18 +148,13 @@ static void wait_current_trans(struct btrfs_root *root)
 	while (1) {
 		prepare_to_wait(&root->fs_info->transaction_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
-		if (cur_trans->blocked) {
-			mutex_unlock(&root->fs_info->trans_mutex);
-			schedule();
-			mutex_lock(&root->fs_info->trans_mutex);
-			finish_wait(&root->fs_info->transaction_wait,
-				    &wait);
-		} else {
-			finish_wait(&root->fs_info->transaction_wait,
-				    &wait);
+		if (!cur_trans->blocked)
 			break;
-		}
+		mutex_unlock(&root->fs_info->trans_mutex);
+		schedule();
+		mutex_lock(&root->fs_info->trans_mutex);
 	}
+	finish_wait(&root->fs_info->transaction_wait, &wait);
 	put_transaction(cur_trans);
 	}
 }
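
The rewrite above reduces wait_current_trans() to the standard open-coded wait-queue loop with a single exit path. A generic sketch of that idiom (the queue, mutex, and condition here are placeholders, not btrfs names):

	DEFINE_WAIT(wait);

	while (1) {
		prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
		if (done_condition)		/* re-check after queuing ourselves */
			break;
		mutex_unlock(&lock);		/* never sleep holding the lock */
		schedule();
		mutex_lock(&lock);
	}
	finish_wait(&wq, &wait);		/* one exit, one cleanup */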
@@ -760,10 +756,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct btrfs_root_item *new_root_item;
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *root = pending->root;
+	struct btrfs_root *parent_root;
+	struct inode *parent_inode;
 	struct extent_buffer *tmp;
 	struct extent_buffer *old;
 	int ret;
 	u64 objectid;
+	int namelen;
+	u64 index = 0;
+
+	parent_inode = pending->dentry->d_parent->d_inode;
+	parent_root = BTRFS_I(parent_inode)->root;
 
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 	if (!new_root_item) {
@@ -774,79 +777,59 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto fail;
 
-	record_root_in_trans(trans, root);
-	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
-	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
-
 	key.objectid = objectid;
 	/* record when the snapshot was created in key.offset */
 	key.offset = trans->transid;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
-	old = btrfs_lock_root_node(root);
-	btrfs_cow_block(trans, root, old, NULL, 0, &old);
-	btrfs_set_lock_blocking(old);
-
-	btrfs_copy_root(trans, root, old, &tmp, objectid);
-	btrfs_tree_unlock(old);
-	free_extent_buffer(old);
-
-	btrfs_set_root_node(new_root_item, tmp);
-	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
-				new_root_item);
-	btrfs_tree_unlock(tmp);
-	free_extent_buffer(tmp);
-	if (ret)
-		goto fail;
-
-	key.offset = (u64)-1;
 	memcpy(&pending->root_key, &key, sizeof(key));
-fail:
-	kfree(new_root_item);
-	return ret;
-}
-
-static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
-				   struct btrfs_pending_snapshot *pending)
-{
-	int ret;
-	int namelen;
-	u64 index = 0;
-	struct btrfs_trans_handle *trans;
-	struct inode *parent_inode;
-	struct btrfs_root *parent_root;
-
-	parent_inode = pending->dentry->d_parent->d_inode;
-	parent_root = BTRFS_I(parent_inode)->root;
-	trans = btrfs_join_transaction(parent_root, 1);
+	pending->root_key.offset = (u64)-1;
 
+	record_root_in_trans(trans, parent_root);
 	/*
 	 * insert the directory item
 	 */
 	namelen = strlen(pending->name);
 	ret = btrfs_set_inode_index(parent_inode, &index);
+	BUG_ON(ret);
 	ret = btrfs_insert_dir_item(trans, parent_root,
 				    pending->name, namelen,
 				    parent_inode->i_ino,
 				    &pending->root_key, BTRFS_FT_DIR, index);
-
-	if (ret)
-		goto fail;
+	BUG_ON(ret);
 
 	btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
 	ret = btrfs_update_inode(trans, parent_root, parent_inode);
 	BUG_ON(ret);
 
+	record_root_in_trans(trans, root);
+	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
+	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+
+	old = btrfs_lock_root_node(root);
+	btrfs_cow_block(trans, root, old, NULL, 0, &old);
+	btrfs_set_lock_blocking(old);
+
+	btrfs_copy_root(trans, root, old, &tmp, objectid);
+	btrfs_tree_unlock(old);
+	free_extent_buffer(old);
+
+	btrfs_set_root_node(new_root_item, tmp);
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+				new_root_item);
+	BUG_ON(ret);
+	btrfs_tree_unlock(tmp);
+	free_extent_buffer(tmp);
+
 	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
 				 pending->root_key.objectid,
 				 parent_root->root_key.objectid,
 				 parent_inode->i_ino, index, pending->name,
 				 namelen);
-
 	BUG_ON(ret);
 
 fail:
-	btrfs_end_transaction(trans, fs_info->fs_root);
+	kfree(new_root_item);
 	return ret;
 }
 
852 835
@@ -867,25 +850,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
-				struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_pending_snapshot *pending;
-	struct list_head *head = &trans->transaction->pending_snapshots;
-	int ret;
-
-	while (!list_empty(head)) {
-		pending = list_entry(head->next,
-				     struct btrfs_pending_snapshot, list);
-		ret = finish_pending_snapshot(fs_info, pending);
-		BUG_ON(ret);
-		list_del(&pending->list);
-		kfree(pending->name);
-		kfree(pending);
-	}
-	return 0;
-}
-
 static void update_super_roots(struct btrfs_root *root)
 {
 	struct btrfs_root_item *root_item;
@@ -1097,9 +1061,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_finish_extent_commit(trans, root);
 
-	/* do the directory inserts of any pending snapshot creations */
-	finish_pending_snapshots(trans, root->fs_info);
-
 	mutex_lock(&root->fs_info->trans_mutex);
 
 	cur_trans->commit_done = 1;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1255fcc8ade5..af57dd2b43d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9df8e3f1ccab..aa7dc36dac78 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
  */
 #include <linux/sched.h>
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/random.h>
@@ -2198,9 +2199,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
-		num_stripes = min_t(u64, 2, fs_devices->rw_devices);
-		if (num_stripes < 2)
+		if (fs_devices->rw_devices < 2)
 			return -ENOSPC;
+		num_stripes = 2;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -2244,8 +2245,10 @@ again:
 		do_div(calc_size, stripe_len);
 		calc_size *= stripe_len;
 	}
+
 	/* we don't want tiny stripes */
-	calc_size = max_t(u64, min_stripe_size, calc_size);
+	if (!looped)
+		calc_size = max_t(u64, min_stripe_size, calc_size);
 
 	do_div(calc_size, stripe_len);
 	calc_size *= stripe_len;
@@ -3389,6 +3392,8 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
 	key.type = 0;
 again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
 	while (1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 27089311fbea..37fe101a4e0d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
 #include "internal.h"
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index eeb4986ea7db..d5db84a1ee0d 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -19,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/security.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 #define CACHEFILES_KEYBUF_SIZE 512
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 1d8332563863..0f0d41fbb03f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <linux/file.h>
 #include "internal.h"
 
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index f3e7a0bf068b..e18b183b47e1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -16,6 +16,7 @@
 #include <linux/fsnotify.h>
 #include <linux/quotaops.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 static const char cachefiles_xattr_cache[] =
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..04b8280582a9
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
5	select CRYPTO_AES
6 help
7 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely
9 scalable file system designed to provide high performance,
10 reliable access to petabytes of storage.
11
12 More information at http://ceph.newdream.net/.
13
14 If unsure, say N.
15
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
22	  line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o
8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
22# Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..aa3cd7cc3e40
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1195 @@
1#include "ceph_debug.h"
2
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/slab.h>
9#include <linux/pagevec.h>
10#include <linux/task_io_accounting_ops.h>
11
12#include "super.h"
13#include "osd_client.h"
14
15/*
16 * Ceph address space ops.
17 *
18 * There are a few funny things going on here.
19 *
20 * The page->private field is used to reference a struct
21 * ceph_snap_context for _every_ dirty page. This indicates which
22 * snapshot the page was logically dirtied in, and thus which snap
23 * context needs to be associated with the osd write during writeback.
24 *
25 * Similarly, struct ceph_inode_info maintains a set of counters to
26 * count dirty pages on the inode. In the absence of snapshots,
27 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
28 *
29 * When a snapshot is taken (that is, when the client receives
30 * notification that a snapshot was taken), each inode with caps and
31 * with dirty pages (dirty pages implies there is a cap) gets a new
32 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
33 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
34 * moved to capsnap->dirty. (Unless a sync write is currently in
35 * progress. In that case, the capsnap is said to be "pending", new
36 * writes cannot start, and the capsnap isn't "finalized" until the
37 * write completes (or fails) and a final size/mtime for the inode for
38 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
39 *
40 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
41 * we look for the first capsnap in i_cap_snaps and write out pages in
42 * that snap context _only_. Then we move on to the next capsnap,
43 * eventually reaching the "live" or "head" context (i.e., pages that
44 * are not yet snapped) and are writing the most recently dirtied
45 * pages.
46 *
47 * Invalidate and so forth must take care to ensure the dirty page
48 * accounting is preserved.
49 */
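/*
 * (Worked example of the counters above, added for clarity: dirty three
 * pages with no snaps outstanding, so i_wrbuffer_ref == i_wrbuffer_ref_head
 * == 3.  When a snapshot notification arrives, the head count (3) moves to
 * the new capsnap->dirty and i_wrbuffer_ref_head resets to 0, while
 * i_wrbuffer_ref stays 3 until those pages are written back, in snap order.)
 */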
50
51#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
52#define CONGESTION_OFF_THRESH(congestion_kb) \
53 (CONGESTION_ON_THRESH(congestion_kb) - \
54 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
55
56
57
58/*
59 * Dirty a page. Optimistically adjust accounting, on the assumption
60 * that we won't race with invalidate. If we do, readjust.
61 */
62static int ceph_set_page_dirty(struct page *page)
63{
64 struct address_space *mapping = page->mapping;
65 struct inode *inode;
66 struct ceph_inode_info *ci;
67 int undo = 0;
68 struct ceph_snap_context *snapc;
69
70 if (unlikely(!mapping))
71 return !TestSetPageDirty(page);
72
73 if (TestSetPageDirty(page)) {
74 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
75 mapping->host, page, page->index);
76 return 0;
77 }
78
79 inode = mapping->host;
80 ci = ceph_inode(inode);
81
82 /*
83 * Note that we're grabbing a snapc ref here without holding
84 * any locks!
85 */
86 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
87
88 /* dirty the head */
89 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0)
91 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0)
94 igrab(inode);
95 ++ci->i_wrbuffer_ref;
96 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
97 "snapc %p seq %lld (%d snaps)\n",
98 mapping->host, page, page->index,
99 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
100 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
101 snapc, snapc->seq, snapc->num_snaps);
102 spin_unlock(&inode->i_lock);
103
104 /* now adjust page */
105 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page));
108
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY);
117
118 /*
119 * Reference snap context in page->private. Also set
120 * PagePrivate so that we get invalidatepage callback.
121 */
122 page->private = (unsigned long)snapc;
123 SetPagePrivate(page);
124 } else {
125 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
126 undo = 1;
127 }
128
129 spin_unlock_irq(&mapping->tree_lock);
130
131 if (undo)
132 /* whoops, we failed to dirty the page */
133 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
134
135 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
136
137 BUG_ON(!PageDirty(page));
138 return 1;
139}
140
141/*
142 * If we are truncating the full page (i.e. offset == 0), adjust the
143 * dirty page counters appropriately. Only called if there is private
144 * data on the page.
145 */
146static void ceph_invalidatepage(struct page *page, unsigned long offset)
147{
148 struct inode *inode;
149 struct ceph_inode_info *ci;
150 struct ceph_snap_context *snapc = (void *)page->private;
151
152 BUG_ON(!PageLocked(page));
153 BUG_ON(!page->private);
154 BUG_ON(!PagePrivate(page));
155 BUG_ON(!page->mapping);
156
157 inode = page->mapping->host;
158
159 /*
160 * We can get non-dirty pages here due to races between
161 * set_page_dirty and truncate_complete_page; just spit out a
162 * warning, in case we end up with accounting problems later.
163 */
164 if (!PageDirty(page))
165 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
166
167 if (offset == 0)
168 ClearPageChecked(page);
169
170 ci = ceph_inode(inode);
171 if (offset == 0) {
172 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
173 inode, page, page->index, offset);
174 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
175 ceph_put_snap_context(snapc);
176 page->private = 0;
177 ClearPagePrivate(page);
178 } else {
179 dout("%p invalidatepage %p idx %lu partial dirty page\n",
180 inode, page, page->index);
181 }
182}
183
184/* just a sanity check */
185static int ceph_releasepage(struct page *page, gfp_t g)
186{
187 struct inode *inode = page->mapping ? page->mapping->host : NULL;
188 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
189 WARN_ON(PageDirty(page));
190 WARN_ON(page->private);
191 WARN_ON(PagePrivate(page));
192 return 0;
193}
194
195/*
196 * read a single page, without unlocking it.
197 */
198static int readpage_nounlock(struct file *filp, struct page *page)
199{
200 struct inode *inode = filp->f_dentry->d_inode;
201 struct ceph_inode_info *ci = ceph_inode(inode);
202 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
203 int err = 0;
204 u64 len = PAGE_CACHE_SIZE;
205
206 dout("readpage inode %p file %p page %p index %lu\n",
207 inode, filp, page, page->index);
208 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
209 page->index << PAGE_CACHE_SHIFT, &len,
210 ci->i_truncate_seq, ci->i_truncate_size,
211 &page, 1);
212 if (err == -ENOENT)
213 err = 0;
214 if (err < 0) {
215 SetPageError(page);
216 goto out;
217 } else if (err < PAGE_CACHE_SIZE) {
218 /* zero fill remainder of page */
219 zero_user_segment(page, err, PAGE_CACHE_SIZE);
220 }
221 SetPageUptodate(page);
222
223out:
224 return err < 0 ? err : 0;
225}
226
227static int ceph_readpage(struct file *filp, struct page *page)
228{
229 int r = readpage_nounlock(filp, page);
230 unlock_page(page);
231 return r;
232}
233
234/*
235 * Build a vector of contiguous pages from the provided page list.
236 */
237static struct page **page_vector_from_list(struct list_head *page_list,
238 unsigned *nr_pages)
239{
240 struct page **pages;
241 struct page *page;
242 int next_index, contig_pages = 0;
243
244 /* build page vector */
245 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
246 if (!pages)
247 return ERR_PTR(-ENOMEM);
248
249 BUG_ON(list_empty(page_list));
250 next_index = list_entry(page_list->prev, struct page, lru)->index;
251 list_for_each_entry_reverse(page, page_list, lru) {
252 if (page->index == next_index) {
253 dout("readpages page %d %p\n", contig_pages, page);
254 pages[contig_pages] = page;
255 contig_pages++;
256 next_index++;
257 } else {
258 break;
259 }
260 }
261 *nr_pages = contig_pages;
262 return pages;
263}
264
265/*
266 * Read multiple pages. Leave pages we don't read + unlock in page_list;
267 * the caller (VM) cleans them up.
268 */
269static int ceph_readpages(struct file *file, struct address_space *mapping,
270 struct list_head *page_list, unsigned nr_pages)
271{
272 struct inode *inode = file->f_dentry->d_inode;
273 struct ceph_inode_info *ci = ceph_inode(inode);
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0;
276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset;
279 u64 len;
280
281 dout("readpages %p file %p nr_pages %d\n",
282 inode, file, nr_pages);
283
284 pages = page_vector_from_list(page_list, &nr_pages);
285 if (IS_ERR(pages))
286 return PTR_ERR(pages);
287
288 /* guess read extent */
289 offset = pages[0]->index << PAGE_CACHE_SHIFT;
290 len = nr_pages << PAGE_CACHE_SHIFT;
291 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
292 offset, &len,
293 ci->i_truncate_seq, ci->i_truncate_size,
294 pages, nr_pages);
295 if (rc == -ENOENT)
296 rc = 0;
297 if (rc < 0)
298 goto out;
299
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page =
305 list_entry(page_list->prev, struct page, lru);
306
307 list_del(&page->lru);
308
309 if (rc < (int)PAGE_CACHE_SIZE) {
310 /* zero (remainder of) page */
311 int s = rc < 0 ? 0 : rc;
312 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 }
314
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page);
319 continue;
320 }
321 dout("readpages %p adding %p idx %lu\n", inode, page,
322 page->index);
323 flush_dcache_page(page);
324 SetPageUptodate(page);
325 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0)
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0;
331
332out:
333 kfree(pages);
334 return rc;
335}
336
337/*
338 * Get ref for the oldest snapc for an inode with dirty data... that is, the
339 * only snap context we are allowed to write back.
340 *
341 * Caller holds i_lock.
342 */
343static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
344 u64 *snap_size)
345{
346 struct ceph_inode_info *ci = ceph_inode(inode);
347 struct ceph_snap_context *snapc = NULL;
348 struct ceph_cap_snap *capsnap = NULL;
349
350 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
351 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
352 capsnap->context, capsnap->dirty_pages);
353 if (capsnap->dirty_pages) {
354 snapc = ceph_get_snap_context(capsnap->context);
355 if (snap_size)
356 *snap_size = capsnap->size;
357 break;
358 }
359 }
360 if (!snapc && ci->i_snap_realm) {
361 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
362 dout(" head snapc %p has %d dirty pages\n",
363 snapc, ci->i_wrbuffer_ref_head);
364 }
365 return snapc;
366}
367
368static struct ceph_snap_context *get_oldest_context(struct inode *inode,
369 u64 *snap_size)
370{
371 struct ceph_snap_context *snapc = NULL;
372
373 spin_lock(&inode->i_lock);
374 snapc = __get_oldest_context(inode, snap_size);
375 spin_unlock(&inode->i_lock);
376 return snapc;
377}
378
379/*
380 * Write a single page, but leave the page locked.
381 *
382 * If we get a write error, set the page error bit, but still adjust the
383 * dirty page accounting (i.e., page is no longer dirty).
384 */
385static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
386{
387 struct inode *inode;
388 struct ceph_inode_info *ci;
389 struct ceph_client *client;
390 struct ceph_osd_client *osdc;
391 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
392 int len = PAGE_CACHE_SIZE;
393 loff_t i_size;
394 int err = 0;
395 struct ceph_snap_context *snapc;
396 u64 snap_size = 0;
397 long writeback_stat;
398
399 dout("writepage %p idx %lu\n", page, page->index);
400
401 if (!page->mapping || !page->mapping->host) {
402 dout("writepage %p - no mapping\n", page);
403 return -EFAULT;
404 }
405 inode = page->mapping->host;
406 ci = ceph_inode(inode);
407 client = ceph_inode_to_client(inode);
408 osdc = &client->osdc;
409
410 /* verify this is a writeable snap context */
411 snapc = (void *)page->private;
412 if (snapc == NULL) {
413 dout("writepage %p page %p not dirty?\n", inode, page);
414 goto out;
415 }
416 if (snapc != get_oldest_context(inode, &snap_size)) {
417 dout("writepage %p page %p snapc %p not writeable - noop\n",
418 inode, page, (void *)page->private);
419 /* we should only noop if called by kswapd */
420 WARN_ON((current->flags & PF_MEMALLOC) == 0);
421 goto out;
422 }
423
424 /* is this a partial page at end of file? */
425 if (snap_size)
426 i_size = snap_size;
427 else
428 i_size = i_size_read(inode);
429 if (i_size < page_off + len)
430 len = i_size - page_off;
431
432 dout("writepage %p page %p index %lu on %llu~%u\n",
433 inode, page, page->index, page_off, len);
434
435 writeback_stat = atomic_long_inc_return(&client->writeback_count);
436 if (writeback_stat >
437 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
438 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
439
440 set_page_writeback(page);
441 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
442 &ci->i_layout, snapc,
443 page_off, len,
444 ci->i_truncate_seq, ci->i_truncate_size,
445 &inode->i_mtime,
446 &page, 1, 0, 0, true);
447 if (err < 0) {
448 dout("writepage setting page/mapping error %d %p\n", err, page);
449 SetPageError(page);
450 mapping_set_error(&inode->i_data, err);
451 if (wbc)
452 wbc->pages_skipped++;
453 } else {
454 dout("writepage cleaned page %p\n", page);
455 err = 0; /* vfs expects us to return 0 */
456 }
457 page->private = 0;
458 ClearPagePrivate(page);
459 end_page_writeback(page);
460 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
461 ceph_put_snap_context(snapc);
462out:
463 return err;
464}
465
466static int ceph_writepage(struct page *page, struct writeback_control *wbc)
467{
468 int err;
469 struct inode *inode = page->mapping->host;
470 BUG_ON(!inode);
471 igrab(inode);
472 err = writepage_nounlock(page, wbc);
473 unlock_page(page);
474 iput(inode);
475 return err;
476}
477
478
479/*
480 * lame release_pages helper. release_pages() isn't exported to
481 * modules.
482 */
483static void ceph_release_pages(struct page **pages, int num)
484{
485 struct pagevec pvec;
486 int i;
487
488 pagevec_init(&pvec, 0);
489 for (i = 0; i < num; i++) {
490 if (pagevec_add(&pvec, pages[i]) == 0)
491 pagevec_release(&pvec);
492 }
493 pagevec_release(&pvec);
494}
495
496
497/*
498 * async writeback completion handler.
499 *
500 * If we get an error, set the mapping error bit, but not the individual
501 * page error bits.
502 */
503static void writepages_finish(struct ceph_osd_request *req,
504 struct ceph_msg *msg)
505{
506 struct inode *inode = req->r_inode;
507 struct ceph_osd_reply_head *replyhead;
508 struct ceph_osd_op *op;
509 struct ceph_inode_info *ci = ceph_inode(inode);
510 unsigned wrote;
511 struct page *page;
512 int i;
513 struct ceph_snap_context *snapc = req->r_snapc;
514 struct address_space *mapping = inode->i_mapping;
515 struct writeback_control *wbc = req->r_wbc;
516 __s32 rc = -EIO;
517 u64 bytes = 0;
518 struct ceph_client *client = ceph_inode_to_client(inode);
519 long writeback_stat;
520 unsigned issued = __ceph_caps_issued(ci, NULL);
521
522 /* parse reply */
523 replyhead = msg->front.iov_base;
524 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
525 op = (void *)(replyhead + 1);
526 rc = le32_to_cpu(replyhead->result);
527 bytes = le64_to_cpu(op->extent.length);
528
529 if (rc >= 0) {
530 /*
531 * Assume we wrote the pages we originally sent. The
532 * osd might reply with fewer pages if our writeback
533 * raced with a truncation and was adjusted at the osd,
534 * so don't believe the reply.
535 */
536 wrote = req->r_num_pages;
537 } else {
538 wrote = 0;
539 mapping_set_error(mapping, rc);
540 }
541 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
542 inode, rc, bytes, wrote);
543
544 /* clean all pages */
545 for (i = 0; i < req->r_num_pages; i++) {
546 page = req->r_pages[i];
547 BUG_ON(!page);
548 WARN_ON(!PageUptodate(page));
549
550 writeback_stat =
551 atomic_long_dec_return(&client->writeback_count);
552 if (writeback_stat <
553 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
554 clear_bdi_congested(&client->backing_dev_info,
555 BLK_RW_ASYNC);
556
557 if (i >= wrote) {
558 dout("inode %p skipping page %p\n", inode, page);
559 wbc->pages_skipped++;
560 }
561 page->private = 0;
562 ClearPagePrivate(page);
563 ceph_put_snap_context(snapc);
564 dout("unlocking %d %p\n", i, page);
565 end_page_writeback(page);
566
567 /*
568 * We lost the cache cap, need to truncate the page before
569 * it is unlocked, otherwise we'd truncate it later in the
570 * page truncation thread, possibly losing some data that
571 * raced its way in
572 */
573 if ((issued & CEPH_CAP_FILE_CACHE) == 0)
574 generic_error_remove_page(inode->i_mapping, page);
575
576 unlock_page(page);
577 }
578 dout("%p wrote+cleaned %d pages\n", inode, wrote);
579 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
580
581 ceph_release_pages(req->r_pages, req->r_num_pages);
582 if (req->r_pages_from_pool)
583 mempool_free(req->r_pages,
584 ceph_client(inode->i_sb)->wb_pagevec_pool);
585 else
586 kfree(req->r_pages);
587 ceph_osdc_put_request(req);
588}
589
590/*
591 * allocate a page vec, either directly, or if necessary, via the
592 * mempool. we avoid the mempool if we can because req->r_num_pages
593 * may be less than the maximum write size.
594 */
595static void alloc_page_vec(struct ceph_client *client,
596 struct ceph_osd_request *req)
597{
598 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
599 GFP_NOFS);
600 if (!req->r_pages) {
601 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
602 req->r_pages_from_pool = 1;
603 WARN_ON(!req->r_pages);
604 }
605}
606
607/*
608 * initiate async writeback
609 */
610static int ceph_writepages_start(struct address_space *mapping,
611 struct writeback_control *wbc)
612{
613 struct inode *inode = mapping->host;
614 struct backing_dev_info *bdi = mapping->backing_dev_info;
615 struct ceph_inode_info *ci = ceph_inode(inode);
616 struct ceph_client *client;
617 pgoff_t index, start, end;
618 int range_whole = 0;
619 int should_loop = 1;
620 pgoff_t max_pages = 0, max_pages_ever = 0;
621 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
622 struct pagevec pvec;
623 int done = 0;
624 int rc = 0;
625 unsigned wsize = 1 << inode->i_blkbits;
626 struct ceph_osd_request *req = NULL;
627 int do_sync;
628 u64 snap_size = 0;
629
630 /*
631 * Include a 'sync' in the OSD request if this is a data
632 * integrity write (e.g., O_SYNC write or fsync()), or if our
633 * cap is being revoked.
634 */
635 do_sync = wbc->sync_mode == WB_SYNC_ALL;
636 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
637 do_sync = 1;
638 dout("writepages_start %p dosync=%d (mode=%s)\n",
639 inode, do_sync,
640 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
641 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
642
643 client = ceph_inode_to_client(inode);
644 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
645 pr_warning("writepage_start %p on forced umount\n", inode);
646 return -EIO; /* we're in a forced umount, don't write! */
647 }
648 if (client->mount_args->wsize && client->mount_args->wsize < wsize)
649 wsize = client->mount_args->wsize;
650 if (wsize < PAGE_CACHE_SIZE)
651 wsize = PAGE_CACHE_SIZE;
652 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
653
654 pagevec_init(&pvec, 0);
655
656 /* ?? */
657 if (wbc->nonblocking && bdi_write_congested(bdi)) {
658 dout(" writepages congested\n");
659 wbc->encountered_congestion = 1;
660 goto out_final;
661 }
662
663 /* where to start/end? */
664 if (wbc->range_cyclic) {
665 start = mapping->writeback_index; /* Start from prev offset */
666 end = -1;
667 dout(" cyclic, start at %lu\n", start);
668 } else {
669 start = wbc->range_start >> PAGE_CACHE_SHIFT;
670 end = wbc->range_end >> PAGE_CACHE_SHIFT;
671 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
672 range_whole = 1;
673 should_loop = 0;
674 dout(" not cyclic, %lu to %lu\n", start, end);
675 }
676 index = start;
677
678retry:
679 /* find oldest snap context with dirty data */
680 ceph_put_snap_context(snapc);
681 snapc = get_oldest_context(inode, &snap_size);
682 if (!snapc) {
683 /* hmm, why does writepages get called when there
684 is no dirty data? */
685 dout(" no snap context with dirty data?\n");
686 goto out;
687 }
688 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
689 snapc, snapc->seq, snapc->num_snaps);
690 if (last_snapc && snapc != last_snapc) {
691 /* if we switched to a newer snapc, restart our scan at the
692 * start of the original file range. */
693 dout(" snapc differs from last pass, restarting at %lu\n",
694 index);
695 index = start;
696 }
697 last_snapc = snapc;
698
699 while (!done && index <= end) {
700 unsigned i;
701 int first;
702 pgoff_t next;
703 int pvec_pages, locked_pages;
704 struct page *page;
705 int want;
706 u64 offset, len;
707 struct ceph_osd_request_head *reqhead;
708 struct ceph_osd_op *op;
709 long writeback_stat;
710
711 next = 0;
712 locked_pages = 0;
713 max_pages = max_pages_ever;
714
715get_more_pages:
716 first = -1;
717 want = min(end - index,
718 min((pgoff_t)PAGEVEC_SIZE,
719 max_pages - (pgoff_t)locked_pages) - 1)
720 + 1;
721 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
722 PAGECACHE_TAG_DIRTY,
723 want);
724 dout("pagevec_lookup_tag got %d\n", pvec_pages);
725 if (!pvec_pages && !locked_pages)
726 break;
727 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
728 page = pvec.pages[i];
729 dout("? %p idx %lu\n", page, page->index);
730 if (locked_pages == 0)
731 lock_page(page); /* first page */
732 else if (!trylock_page(page))
733 break;
734
735 /* only dirty pages, or our accounting breaks */
736 if (unlikely(!PageDirty(page)) ||
737 unlikely(page->mapping != mapping)) {
738 dout("!dirty or !mapping %p\n", page);
739 unlock_page(page);
740 break;
741 }
742 if (!wbc->range_cyclic && page->index > end) {
743 dout("end of range %p\n", page);
744 done = 1;
745 unlock_page(page);
746 break;
747 }
748 if (next && (page->index != next)) {
749 dout("not consecutive %p\n", page);
750 unlock_page(page);
751 break;
752 }
753 if (wbc->sync_mode != WB_SYNC_NONE) {
754 dout("waiting on writeback %p\n", page);
755 wait_on_page_writeback(page);
756 }
757 if ((snap_size && page_offset(page) > snap_size) ||
758 (!snap_size &&
759 page_offset(page) > i_size_read(inode))) {
760 dout("%p page eof %llu\n", page, snap_size ?
761 snap_size : i_size_read(inode));
762 done = 1;
763 unlock_page(page);
764 break;
765 }
766 if (PageWriteback(page)) {
767 dout("%p under writeback\n", page);
768 unlock_page(page);
769 break;
770 }
771
772 /* only if matching snap context */
773 if (snapc != (void *)page->private) {
774 dout("page snapc %p != oldest %p\n",
775 (void *)page->private, snapc);
776 unlock_page(page);
777 if (!locked_pages)
778 continue; /* keep looking for snap */
779 break;
780 }
781
782 if (!clear_page_dirty_for_io(page)) {
783 dout("%p !clear_page_dirty_for_io\n", page);
784 unlock_page(page);
785 break;
786 }
787
788 /* ok */
789 if (locked_pages == 0) {
790 /* prepare async write request */
791 offset = page->index << PAGE_CACHE_SHIFT;
792 len = wsize;
793 req = ceph_osdc_new_request(&client->osdc,
794 &ci->i_layout,
795 ceph_vino(inode),
796 offset, &len,
797 CEPH_OSD_OP_WRITE,
798 CEPH_OSD_FLAG_WRITE |
799 CEPH_OSD_FLAG_ONDISK,
800 snapc, do_sync,
801 ci->i_truncate_seq,
802 ci->i_truncate_size,
803 &inode->i_mtime, true, 1);
804 max_pages = req->r_num_pages;
805
806 alloc_page_vec(client, req);
807 req->r_callback = writepages_finish;
808 req->r_inode = inode;
809 req->r_wbc = wbc;
810 }
811
812 /* note position of first page in pvec */
813 if (first < 0)
814 first = i;
815 dout("%p will write page %p idx %lu\n",
816 inode, page, page->index);
817
818 writeback_stat = atomic_long_inc_return(&client->writeback_count);
819 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
820 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
821 }
822
823 set_page_writeback(page);
824 req->r_pages[locked_pages] = page;
825 locked_pages++;
826 next = page->index + 1;
827 }
828
829 /* did we get anything? */
830 if (!locked_pages)
831 goto release_pvec_pages;
832 if (i) {
833 int j;
834 BUG_ON(!locked_pages || first < 0);
835
836 if (pvec_pages && i == pvec_pages &&
837 locked_pages < max_pages) {
838 dout("reached end pvec, trying for more\n");
839 pagevec_reinit(&pvec);
840 goto get_more_pages;
841 }
842
843 /* shift unused pages over in the pvec... we
844 * will need to release them below. */
845 for (j = i; j < pvec_pages; j++) {
846 dout(" pvec leftover page %p\n",
847 pvec.pages[j]);
848 pvec.pages[j-i+first] = pvec.pages[j];
849 }
850 pvec.nr -= i-first;
851 }
852
853 /* submit the write */
854 offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
855 len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
856 (u64)locked_pages << PAGE_CACHE_SHIFT);
857 dout("writepages got %d pages at %llu~%llu\n",
858 locked_pages, offset, len);
859
860 /* revise final length, page count */
861 req->r_num_pages = locked_pages;
862 reqhead = req->r_request->front.iov_base;
863 op = (void *)(reqhead + 1);
864 op->extent.length = cpu_to_le64(len);
865 op->payload_len = cpu_to_le32(len);
866 req->r_request->hdr.data_len = cpu_to_le32(len);
867
868 ceph_osdc_start_request(&client->osdc, req, true);
869 req = NULL;
870
871 /* continue? */
872 index = next;
873 wbc->nr_to_write -= locked_pages;
874 if (wbc->nr_to_write <= 0)
875 done = 1;
876
877release_pvec_pages:
878 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
879 pvec.nr ? pvec.pages[0] : NULL);
880 pagevec_release(&pvec);
881
882 if (locked_pages && !done)
883 goto retry;
884 }
885
886 if (should_loop && !done) {
887 /* more to do; loop back to beginning of file */
888 dout("writepages looping back to beginning of file\n");
889 should_loop = 0;
890 index = 0;
891 goto retry;
892 }
893
894 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
895 mapping->writeback_index = index;
896
897out:
898 if (req)
899 ceph_osdc_put_request(req);
900 if (rc > 0)
901 rc = 0; /* vfs expects us to return 0 */
902 ceph_put_snap_context(snapc);
903 dout("writepages done, rc = %d\n", rc);
904out_final:
905 return rc;
906}
907
908
909
910/*
911 * See if a given @snapc is either writeable, or already written.
912 */
913static int context_is_writeable_or_written(struct inode *inode,
914 struct ceph_snap_context *snapc)
915{
916 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
917 return !oldest || snapc->seq <= oldest->seq;
918}
919
920/*
921 * We are only allowed to write into/dirty the page if the page is
922 * clean, or already dirty within the same snap context.
923 *
924 * called with page locked.
925 * return success with page locked,
926 * or any failure (incl -EAGAIN) with page unlocked.
927 */
928static int ceph_update_writeable_page(struct file *file,
929 loff_t pos, unsigned len,
930 struct page *page)
931{
932 struct inode *inode = file->f_dentry->d_inode;
933 struct ceph_inode_info *ci = ceph_inode(inode);
934 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
935 loff_t page_off = pos & PAGE_CACHE_MASK;
936 int pos_in_page = pos & ~PAGE_CACHE_MASK;
937 int end_in_page = pos_in_page + len;
938 loff_t i_size;
939 struct ceph_snap_context *snapc;
940 int r;
941
942retry_locked:
943 /* writepages currently holds the page lock, but if that changes later we still want to wait for any writeback to finish */
944 wait_on_page_writeback(page);
945
946 /* check snap context */
947 BUG_ON(!ci->i_snap_realm);
948 down_read(&mdsc->snap_rwsem);
949 BUG_ON(!ci->i_snap_realm->cached_context);
950 if (page->private &&
951 (void *)page->private != ci->i_snap_realm->cached_context) {
952 /*
953 * this page is already dirty in another (older) snap
954 * context! is it writeable now?
955 */
956 snapc = get_oldest_context(inode, NULL);
957 up_read(&mdsc->snap_rwsem);
958
959 if (snapc != (void *)page->private) {
960 dout(" page %p snapc %p not current or oldest\n",
961 page, (void *)page->private);
962 /*
963 * queue for writeback, and wait for snapc to
964 * be writeable or written
965 */
966 snapc = ceph_get_snap_context((void *)page->private);
967 unlock_page(page);
968 ceph_queue_writeback(inode);
969 r = wait_event_interruptible(ci->i_cap_wq,
970 context_is_writeable_or_written(inode, snapc));
971 ceph_put_snap_context(snapc);
972 if (r == -ERESTARTSYS)
973 return r;
974 return -EAGAIN;
975 }
976
977 /* yay, writeable, do it now (without dropping page lock) */
978 dout(" page %p snapc %p not current, but oldest\n",
979 page, snapc);
980 if (!clear_page_dirty_for_io(page))
981 goto retry_locked;
982 r = writepage_nounlock(page, NULL);
983 if (r < 0)
984 goto fail_nosnap;
985 goto retry_locked;
986 }
987
988 if (PageUptodate(page)) {
989 dout(" page %p already uptodate\n", page);
990 return 0;
991 }
992
993 /* full page? */
994 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
995 return 0;
996
997 /* past end of file? */
998 i_size = inode->i_size; /* caller holds i_mutex */
999
1000 if (i_size + len > inode->i_sb->s_maxbytes) {
1001 /* file is too big */
1002 r = -EINVAL;
1003 goto fail;
1004 }
1005
1006 if (page_off >= i_size ||
1007 (pos_in_page == 0 && (pos+len) >= i_size &&
1008 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1009 dout(" zeroing %p 0 - %d and %d - %d\n",
1010 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1011 zero_user_segments(page,
1012 0, pos_in_page,
1013 end_in_page, PAGE_CACHE_SIZE);
1014 return 0;
1015 }
1016
1017 /* we need to read it. */
1018 up_read(&mdsc->snap_rwsem);
1019 r = readpage_nounlock(file, page);
1020 if (r < 0)
1021 goto fail_nosnap;
1022 goto retry_locked;
1023
1024fail:
1025 up_read(&mdsc->snap_rwsem);
1026fail_nosnap:
1027 unlock_page(page);
1028 return r;
1029}
1030
1031/*
1032 * We are only allowed to write into/dirty the page if the page is
1033 * clean, or already dirty within the same snap context.
1034 */
1035static int ceph_write_begin(struct file *file, struct address_space *mapping,
1036 loff_t pos, unsigned len, unsigned flags,
1037 struct page **pagep, void **fsdata)
1038{
1039 struct inode *inode = file->f_dentry->d_inode;
1040 struct page *page;
1041 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1042 int r;
1043
1044 do {
1045 /* get a page */
1046 page = grab_cache_page_write_begin(mapping, index, 0);
1047 if (!page)
1048 return -ENOMEM;
1049 *pagep = page;
1050
1051 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1052 inode, page, (int)pos, (int)len);
1053
1054 r = ceph_update_writeable_page(file, pos, len, page);
1055 } while (r == -EAGAIN);
1056
1057 return r;
1058}
1059
1060/*
1061 * we don't do anything in here that simple_write_end doesn't do
1062 * except adjust dirty page accounting and drop the read lock on
1063 * mdsc->snap_rwsem.
1064 */
1065static int ceph_write_end(struct file *file, struct address_space *mapping,
1066 loff_t pos, unsigned len, unsigned copied,
1067 struct page *page, void *fsdata)
1068{
1069 struct inode *inode = file->f_dentry->d_inode;
1070 struct ceph_client *client = ceph_inode_to_client(inode);
1071 struct ceph_mds_client *mdsc = &client->mdsc;
1072 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1073 int check_cap = 0;
1074
1075 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1076 inode, page, (int)pos, (int)copied, (int)len);
1077
1078 /* zero the stale part of the page if we did a short copy */
1079 if (copied < len)
1080 zero_user_segment(page, from+copied, len);
1081
1082 /* did file size increase? */
1083	/* no need for i_size_read(); the caller holds i_mutex */
1084 if (pos+copied > inode->i_size)
1085 check_cap = ceph_inode_set_size(inode, pos+copied);
1086
1087 if (!PageUptodate(page))
1088 SetPageUptodate(page);
1089
1090 set_page_dirty(page);
1091
1092 unlock_page(page);
1093 up_read(&mdsc->snap_rwsem);
1094 page_cache_release(page);
1095
1096 if (check_cap)
1097 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1098
1099 return copied;
1100}
1101
1102/*
1103 * we set .direct_IO to indicate direct io is supported, but since we
1104 * intercept O_DIRECT reads and writes early, this function should
1105 * never get called.
1106 */
1107static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1108 const struct iovec *iov,
1109 loff_t pos, unsigned long nr_segs)
1110{
1111 WARN_ON(1);
1112 return -EINVAL;
1113}
1114
1115const struct address_space_operations ceph_aops = {
1116 .readpage = ceph_readpage,
1117 .readpages = ceph_readpages,
1118 .writepage = ceph_writepage,
1119 .writepages = ceph_writepages_start,
1120 .write_begin = ceph_write_begin,
1121 .write_end = ceph_write_end,
1122 .set_page_dirty = ceph_set_page_dirty,
1123 .invalidatepage = ceph_invalidatepage,
1124 .releasepage = ceph_releasepage,
1125 .direct_IO = ceph_direct_io,
1126};
1127
1128
1129/*
1130 * vm ops
1131 */
1132
1133/*
1134 * Reuse write_begin here for simplicity.
1135 */
1136static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1137{
1138 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1139 struct page *page = vmf->page;
1140 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1141 loff_t off = page->index << PAGE_CACHE_SHIFT;
1142 loff_t size, len;
1143 int ret;
1144
1145 size = i_size_read(inode);
1146 if (off + PAGE_CACHE_SIZE <= size)
1147 len = PAGE_CACHE_SIZE;
1148 else
1149 len = size & ~PAGE_CACHE_MASK;
1150
1151 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
1152 off, len, page, page->index);
1153
1154 lock_page(page);
1155
1156 ret = VM_FAULT_NOPAGE;
1157 if ((off > size) ||
1158 (page->mapping != inode->i_mapping))
1159 goto out;
1160
1161 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1162 if (ret == 0) {
1163 /* success. we'll keep the page locked. */
1164 set_page_dirty(page);
1165 up_read(&mdsc->snap_rwsem);
1166 ret = VM_FAULT_LOCKED;
1167 } else {
1168 if (ret == -ENOMEM)
1169 ret = VM_FAULT_OOM;
1170 else
1171 ret = VM_FAULT_SIGBUS;
1172 }
1173out:
1174 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
1175 if (ret != VM_FAULT_LOCKED)
1176 unlock_page(page);
1177 return ret;
1178}
1179
1180static struct vm_operations_struct ceph_vmops = {
1181 .fault = filemap_fault,
1182 .page_mkwrite = ceph_page_mkwrite,
1183};
1184
1185int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1186{
1187 struct address_space *mapping = file->f_mapping;
1188
1189 if (!mapping->a_ops->readpage)
1190 return -ENOEXEC;
1191 file_accessed(file);
1192 vma->vm_ops = &ceph_vmops;
1193 vma->vm_flags |= VM_CAN_NONLINEAR;
1194 return 0;
1195}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
1
2#include <linux/errno.h>
3
4/*
5 * base64 encode/decode.
6 */
7
8const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9
10static int encode_bits(int c)
11{
12 return pem_key[c];
13}
14
15static int decode_bits(char c)
16{
17 if (c >= 'A' && c <= 'Z')
18 return c - 'A';
19 if (c >= 'a' && c <= 'z')
20 return c - 'a' + 26;
21 if (c >= '0' && c <= '9')
22 return c - '0' + 52;
23 if (c == '+')
24 return 62;
25 if (c == '/')
26 return 63;
27 if (c == '=')
28 return 0; /* just non-negative, please */
29 return -EINVAL;
30}
31
32int ceph_armor(char *dst, const char *src, const char *end)
33{
34 int olen = 0;
35 int line = 0;
36
37 while (src < end) {
38 unsigned char a, b, c;
39
40 a = *src++;
41 *dst++ = encode_bits(a >> 2);
42 if (src < end) {
43 b = *src++;
44 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
45 if (src < end) {
46 c = *src++;
47 *dst++ = encode_bits(((b & 15) << 2) |
48 (c >> 6));
49 *dst++ = encode_bits(c & 63);
50 } else {
51 *dst++ = encode_bits((b & 15) << 2);
52 *dst++ = '=';
53 }
54 } else {
55 *dst++ = encode_bits(((a & 3) << 4));
56 *dst++ = '=';
57 *dst++ = '=';
58 }
59 olen += 4;
60 line += 4;
61 if (line == 64) {
62 line = 0;
63 *(dst++) = '\n';
64 olen++;
65 }
66 }
67 return olen;
68}
69
70int ceph_unarmor(char *dst, const char *src, const char *end)
71{
72 int olen = 0;
73
74 while (src < end) {
75 int a, b, c, d;
76
77 if (src < end && src[0] == '\n')
78 src++;
79 if (src + 4 > end)
80 return -EINVAL;
81 a = decode_bits(src[0]);
82 b = decode_bits(src[1]);
83 c = decode_bits(src[2]);
84 d = decode_bits(src[3]);
85 if (a < 0 || b < 0 || c < 0 || d < 0)
86 return -EINVAL;
87
88 *dst++ = (a << 2) | (b >> 4);
89 if (src[2] == '=')
90 return olen + 1;
91 *dst++ = ((b & 15) << 4) | (c >> 2);
92 if (src[3] == '=')
93 return olen + 2;
94 *dst++ = ((c & 3) << 6) | d;
95 olen += 3;
96 src += 4;
97 }
98 return olen;
99}
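
A quick userspace harness can sanity-check the encode/decode pair above.
This is a hypothetical sketch, not part of the patch: the buffer sizing
(4/3 growth plus newlines) is inferred from the code, and outside the
kernel armor.c only needs <errno.h> in place of <linux/errno.h>:

	#include <stdio.h>
	#include <string.h>

	int ceph_armor(char *dst, const char *src, const char *end);
	int ceph_unarmor(char *dst, const char *src, const char *end);

	int main(void)
	{
		const char msg[] = "ceph secret key";  /* 15 payload bytes */
		char enc[2 * sizeof(msg) + 8];         /* 4/3 growth + '\n's */
		char dec[sizeof(msg)];
		int elen, dlen;

		elen = ceph_armor(enc, msg, msg + sizeof(msg) - 1);
		dlen = ceph_unarmor(dec, enc, enc + elen);
		printf("encoded %d, decoded %d, match %d\n",
		       elen, dlen, dlen == 15 && !memcmp(msg, dec, dlen));
		return 0;
	}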
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..f6394b94b866
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,258 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h>
6
7#include "types.h"
8#include "auth_none.h"
9#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12
13#include "messenger.h"
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
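/*
 * For reference, the hello payload assembled above lays out as follows
 * (field order and sizes read straight off the encode calls; a reading
 * aid, not an independent protocol spec):
 *
 *   struct ceph_mon_request_header
 *   __le32  protocol        (0: none chosen yet)
 *   __le32  payload_len     (back-filled through lenp)
 *   __u8    struct_v        (1)
 *   __le32  num_protocols
 *   __le32  protocol[num]   (CEPH_AUTH_NONE, CEPH_AUTH_CEPHX)
 *   __le32  entity_type, __le32 name_len, name bytes
 *   __le64  global_id
 */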
135
136int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building request\n", ret);
153 return ret;
154 }
155 dout(" built request %d bytes\n", ret);
156 ceph_encode_32(&p, ret);
157 return p + ret - msg_buf;
158}
159
160/*
161 * Handle auth message from monitor.
162 */
163int ceph_handle_auth_reply(struct ceph_auth_client *ac,
164 void *buf, size_t len,
165 void *reply_buf, size_t reply_len)
166{
167 void *p = buf;
168 void *end = buf + len;
169 int protocol;
170 s32 result;
171 u64 global_id;
172 void *payload, *payload_end;
173 int payload_len;
174 char *result_msg;
175 int result_msg_len;
176 int ret = -EINVAL;
177
178 dout("handle_auth_reply %p %p\n", p, end);
179 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
180 protocol = ceph_decode_32(&p);
181 result = ceph_decode_32(&p);
182 global_id = ceph_decode_64(&p);
183 payload_len = ceph_decode_32(&p);
184 payload = p;
185 p += payload_len;
186 ceph_decode_need(&p, end, sizeof(u32), bad);
187 result_msg_len = ceph_decode_32(&p);
188 result_msg = p;
189 p += result_msg_len;
190 if (p != end)
191 goto bad;
192
193 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
194 result_msg, global_id, payload_len);
195
196 payload_end = payload + payload_len;
197
198 if (global_id && ac->global_id != global_id) {
199 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
200 ac->global_id = global_id;
201 }
202
203 if (ac->negotiating) {
204 /* server does not support our protocols? */
205 if (!protocol && result < 0) {
206 ret = result;
207 goto out;
208 }
209 /* set up (new) protocol handler? */
210 if (ac->protocol && ac->protocol != protocol) {
211 ac->ops->destroy(ac);
212 ac->protocol = 0;
213 ac->ops = NULL;
214 }
215 if (ac->protocol != protocol) {
216 ret = ceph_auth_init_protocol(ac, protocol);
217 if (ret) {
218 pr_err("error %d on auth protocol %d init\n",
219 ret, protocol);
220 goto out;
221 }
222 }
223
224 ac->negotiating = false;
225 }
226
227 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
228 if (ret == -EAGAIN) {
229 return ceph_build_auth_request(ac, reply_buf, reply_len);
230 } else if (ret) {
231 pr_err("authentication error %d\n", ret);
232 return ret;
233 }
234 return 0;
235
236bad:
237 pr_err("failed to decode auth msg\n");
238out:
239 return ret;
240}
241
242int ceph_build_auth(struct ceph_auth_client *ac,
243 void *msg_buf, size_t msg_len)
244{
245 if (!ac->protocol)
246 return ceph_auth_build_hello(ac, msg_buf, msg_len);
247 BUG_ON(!ac->ops);
248 if (!ac->ops->is_authenticated(ac))
249 return ceph_build_auth_request(ac, msg_buf, msg_len);
250 return 0;
251}
252
253int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
254{
255 if (!ac->ops)
256 return 0;
257 return ac->ops->is_authenticated(ac);
258}
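
The entry points above are meant to be driven in a loop by the monitor
client, which is outside this hunk. A hedged sketch of that driver, with
send_to_monitor()/recv_from_monitor() as stand-ins for the real
messenger calls (both return lengths here purely for illustration):

	char buf[512], reply[512];
	int len, ret;

	ret = ceph_build_auth(ac, buf, sizeof(buf));  /* hello or request */
	while (ret > 0) {
		send_to_monitor(buf, ret);
		len = recv_from_monitor(reply, sizeof(reply));
		ret = ceph_handle_auth_reply(ac, reply, len,
					     buf, sizeof(buf));
	}
	/* ret == 0: authenticated (or nothing to do); ret < 0: error */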
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..ca4f57cfb267
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 /*
19 * true if we are authenticated and can connect to
20 * services.
21 */
22 int (*is_authenticated)(struct ceph_auth_client *ac);
23
24 /*
25 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request.
28 */
29 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
30 int (*handle_reply)(struct ceph_auth_client *ac, int result,
31 void *buf, void *end);
32
33 /*
34 * Create authorizer for connecting to a service, and verify
35 * the response to authenticate the service.
36 */
37 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
38 struct ceph_authorizer **a,
39 void **buf, size_t *len,
40 void **reply_buf, size_t *reply_len);
41 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
42 struct ceph_authorizer *a, size_t len);
43 void (*destroy_authorizer)(struct ceph_auth_client *ac,
44 struct ceph_authorizer *a);
45 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
46 int peer_type);
47
48 /* reset when we (re)connect to a monitor */
49 void (*reset)(struct ceph_auth_client *ac);
50
51 void (*destroy)(struct ceph_auth_client *ac);
52};
53
54struct ceph_auth_client {
55 u32 protocol; /* CEPH_AUTH_* */
56 void *private; /* for use by protocol implementation */
57 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
58
59 bool negotiating; /* true if negotiating protocol */
60 const char *name; /* entity name */
61 u64 global_id; /* our unique id in system */
62 const char *secret; /* our secret key */
63 unsigned want_keys; /* which services we want */
64};
65
66extern struct ceph_auth_client *ceph_auth_init(const char *name,
67 const char *secret);
68extern void ceph_auth_destroy(struct ceph_auth_client *ac);
69
70extern void ceph_auth_reset(struct ceph_auth_client *ac);
71
72extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
73 void *buf, size_t len);
74extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
75 void *buf, size_t len,
76 void *reply_buf, size_t reply_len);
77extern int ceph_entity_name_encode(const char *name, void **p, void *end);
78
79extern int ceph_build_auth(struct ceph_auth_client *ac,
80 void *msg_buf, size_t msg_len);
81
82extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
83
84#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..8cd9e3af07f7
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,122 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34/*
35 * the generic auth code decodes the global_id, and we carry no actual
36 * authentication state, so nothing else happens here.
37 */
38static int handle_reply(struct ceph_auth_client *ac, int result,
39 void *buf, void *end)
40{
41 struct ceph_auth_none_info *xi = ac->private;
42
43 xi->starting = false;
44 return result;
45}
46
47/*
48 * build an 'authorizer' with our entity_name and global_id. we can
49 * reuse a single static copy since it is identical for all services
50 * we connect to.
51 */
52static int ceph_auth_none_create_authorizer(
53 struct ceph_auth_client *ac, int peer_type,
54 struct ceph_authorizer **a,
55 void **buf, size_t *len,
56 void **reply_buf, size_t *reply_len)
57{
58 struct ceph_auth_none_info *ai = ac->private;
59 struct ceph_none_authorizer *au = &ai->au;
60 void *p, *end;
61 int ret;
62
63 if (!ai->built_authorizer) {
64 p = au->buf;
65 end = p + sizeof(au->buf);
66 ceph_encode_8(&p, 1);
67 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
68 if (ret < 0)
69 goto bad;
70 ceph_decode_need(&p, end, sizeof(u64), bad2);
71 ceph_encode_64(&p, ac->global_id);
72 au->buf_len = p - (void *)au->buf;
73 ai->built_authorizer = true;
74 dout("built authorizer len %d\n", au->buf_len);
75 }
76
77 *a = (struct ceph_authorizer *)au;
78 *buf = au->buf;
79 *len = au->buf_len;
80 *reply_buf = au->reply_buf;
81 *reply_len = sizeof(au->reply_buf);
82 return 0;
83
84bad2:
85 ret = -ERANGE;
86bad:
87 return ret;
88}
89
90static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
91 struct ceph_authorizer *a)
92{
93 /* nothing to do */
94}
95
96static const struct ceph_auth_client_ops ceph_auth_none_ops = {
97 .reset = reset,
98 .destroy = destroy,
99 .is_authenticated = is_authenticated,
100 .handle_reply = handle_reply,
101 .create_authorizer = ceph_auth_none_create_authorizer,
102 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
103};
104
105int ceph_auth_none_init(struct ceph_auth_client *ac)
106{
107 struct ceph_auth_none_info *xi;
108
109 dout("ceph_auth_none_init %p\n", ac);
110 xi = kzalloc(sizeof(*xi), GFP_NOFS);
111 if (!xi)
112 return -ENOMEM;
113
114 xi->starting = true;
115 xi->built_authorizer = false;
116
117 ac->protocol = CEPH_AUTH_NONE;
118 ac->private = xi;
119 ac->ops = &ceph_auth_none_ops;
120 return 0;
121}
122
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..56c05533a31c
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include "auth.h"
5
6/*
7 * null security mode.
8 *
9 * we use a single static authorizer that simply encodes our entity name
10 * and global id.
11 */
12
13struct ceph_none_authorizer {
14 char buf[128];
15 int buf_len;
16 char reply_buf[0];
17};
18
19struct ceph_auth_none_info {
20 bool starting;
21 bool built_authorizer;
22 struct ceph_none_authorizer au; /* we only need one; it's static */
23};
24
25extern int ceph_auth_none_init(struct ceph_auth_client *ac);
26
27#endif
28
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..d9001a4dc8cc
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,680 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15struct kmem_cache *ceph_x_ticketbuf_cachep;
16
17#define TEMP_TICKET_BUF_LEN 256
18
19static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
20
21static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
22{
23 struct ceph_x_info *xi = ac->private;
24 int need;
25
26 ceph_x_validate_tickets(ac, &need);
27 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
28 ac->want_keys, need, xi->have_keys);
29 return (ac->want_keys & xi->have_keys) == ac->want_keys;
30}
31
32static int ceph_x_encrypt_buflen(int ilen)
33{
34 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
35 sizeof(u32);
36}
37
38static int ceph_x_encrypt(struct ceph_crypto_key *secret,
39 void *ibuf, int ilen, void *obuf, size_t olen)
40{
41 struct ceph_x_encrypt_header head = {
42 .struct_v = 1,
43 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
44 };
45 size_t len = olen - sizeof(u32);
46 int ret;
47
48 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
49 &head, sizeof(head), ibuf, ilen);
50 if (ret)
51 return ret;
52 ceph_encode_32(&obuf, len);
53 return len + sizeof(u32);
54}
55
56static int ceph_x_decrypt(struct ceph_crypto_key *secret,
57 void **p, void *end, void *obuf, size_t olen)
58{
59 struct ceph_x_encrypt_header head;
60 size_t head_len = sizeof(head);
61 int len, ret;
62
63 len = ceph_decode_32(p);
64 if (*p + len > end)
65 return -EINVAL;
66
67 dout("ceph_x_decrypt len %d\n", len);
68 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
69 *p, len);
70 if (ret)
71 return ret;
72 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
73 return -EPERM;
74 *p += len;
75 return olen;
76}
77
78/*
79 * get existing (or insert new) ticket handler
80 */
81struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
82 int service)
83{
84 struct ceph_x_ticket_handler *th;
85 struct ceph_x_info *xi = ac->private;
86 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
87
88 while (*p) {
89 parent = *p;
90 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
91 if (service < th->service)
92 p = &(*p)->rb_left;
93 else if (service > th->service)
94 p = &(*p)->rb_right;
95 else
96 return th;
97 }
98
99 /* add it */
100 th = kzalloc(sizeof(*th), GFP_NOFS);
101 if (!th)
102 return ERR_PTR(-ENOMEM);
103 th->service = service;
104 rb_link_node(&th->node, parent, p);
105 rb_insert_color(&th->node, &xi->ticket_handlers);
106 return th;
107}
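/*
 * NB: on allocation failure this returns ERR_PTR(-ENOMEM), never NULL,
 * so callers must test the result with IS_ERR(), not against NULL.
 */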
108
109static void remove_ticket_handler(struct ceph_auth_client *ac,
110 struct ceph_x_ticket_handler *th)
111{
112 struct ceph_x_info *xi = ac->private;
113
114 dout("remove_ticket_handler %p %d\n", th, th->service);
115 rb_erase(&th->node, &xi->ticket_handlers);
116 ceph_crypto_key_destroy(&th->session_key);
117 if (th->ticket_blob)
118 ceph_buffer_put(th->ticket_blob);
119 kfree(th);
120}
121
122static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
123 struct ceph_crypto_key *secret,
124 void *buf, void *end)
125{
126 struct ceph_x_info *xi = ac->private;
127 int num;
128 void *p = buf;
129 int ret;
130 char *dbuf;
131 char *ticket_buf;
132 u8 struct_v;
133
134 dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
135 if (!dbuf)
136 return -ENOMEM;
137
138 ret = -ENOMEM;
139 ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
140 GFP_NOFS | GFP_ATOMIC);
141 if (!ticket_buf)
142 goto out_dbuf;
143
144 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
145 struct_v = ceph_decode_8(&p);
146 if (struct_v != 1)
147 goto bad;
148 num = ceph_decode_32(&p);
149 dout("%d tickets\n", num);
150 while (num--) {
151 int type;
152 u8 struct_v;
153 struct ceph_x_ticket_handler *th;
154 void *dp, *dend;
155 int dlen;
156 char is_enc;
157 struct timespec validity;
158 struct ceph_crypto_key old_key;
159 void *tp, *tpend;
160 struct ceph_timespec new_validity;
161 struct ceph_crypto_key new_session_key;
162 struct ceph_buffer *new_ticket_blob;
163 unsigned long new_expires, new_renew_after;
164 u64 new_secret_id;
165
166 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
167
168 type = ceph_decode_32(&p);
169 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
170
171 struct_v = ceph_decode_8(&p);
172 if (struct_v != 1)
173 goto bad;
174
175 th = get_ticket_handler(ac, type);
176 if (IS_ERR(th)) {
177 ret = PTR_ERR(th);
178 goto out;
179 }
180
181 /* blob for me */
182 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
183 TEMP_TICKET_BUF_LEN);
184 if (dlen <= 0) {
185 ret = dlen;
186 goto out;
187 }
188 dout(" decrypted %d bytes\n", dlen);
189 dend = dbuf + dlen;
190 dp = dbuf;
191
192 struct_v = ceph_decode_8(&dp);
193 if (struct_v != 1)
194 goto bad;
195
196 memcpy(&old_key, &th->session_key, sizeof(old_key));
197 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
198 if (ret)
199 goto out;
200
201 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
202 ceph_decode_timespec(&validity, &new_validity);
203 new_expires = get_seconds() + validity.tv_sec;
204 new_renew_after = new_expires - (validity.tv_sec / 4);
205 dout(" expires=%lu renew_after=%lu\n", new_expires,
206 new_renew_after);
207
208 /* ticket blob for service */
209 ceph_decode_8_safe(&p, end, is_enc, bad);
210 tp = ticket_buf;
211 if (is_enc) {
212 /* encrypted */
213 dout(" encrypted ticket\n");
214 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
215 TEMP_TICKET_BUF_LEN);
216 if (dlen < 0) {
217 ret = dlen;
218 goto out;
219 }
220 dlen = ceph_decode_32(&tp);
221 } else {
222 /* unencrypted */
223 ceph_decode_32_safe(&p, end, dlen, bad);
224 ceph_decode_need(&p, end, dlen, bad);
225 ceph_decode_copy(&p, ticket_buf, dlen);
226 }
227 tpend = tp + dlen;
228 dout(" ticket blob is %d bytes\n", dlen);
229 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
230 struct_v = ceph_decode_8(&tp);
231 new_secret_id = ceph_decode_64(&tp);
232 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
233 if (ret)
234 goto out;
235
236 /* all is well, update our ticket */
237 ceph_crypto_key_destroy(&th->session_key);
238 if (th->ticket_blob)
239 ceph_buffer_put(th->ticket_blob);
240 th->session_key = new_session_key;
241 th->ticket_blob = new_ticket_blob;
242 th->validity = new_validity;
243 th->secret_id = new_secret_id;
244 th->expires = new_expires;
245 th->renew_after = new_renew_after;
246 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
247 type, ceph_entity_type_name(type), th->secret_id,
248 (int)th->ticket_blob->vec.iov_len);
249 xi->have_keys |= th->service;
250 }
251
252 ret = 0;
253out:
254 kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
255out_dbuf:
256 kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
257 return ret;
258
259bad:
260 ret = -EINVAL;
261 goto out;
262}
263
264static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
265 struct ceph_x_ticket_handler *th,
266 struct ceph_x_authorizer *au)
267{
268 int maxlen;
269 struct ceph_x_authorize_a *msg_a;
270 struct ceph_x_authorize_b msg_b;
271 void *p, *end;
272 int ret;
273 int ticket_blob_len =
274 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
275
276 dout("build_authorizer for %s %p\n",
277 ceph_entity_type_name(th->service), au);
278
279 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
280 ceph_x_encrypt_buflen(ticket_blob_len);
281 dout(" need len %d\n", maxlen);
282 if (au->buf && au->buf->alloc_len < maxlen) {
283 ceph_buffer_put(au->buf);
284 au->buf = NULL;
285 }
286 if (!au->buf) {
287 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
288 if (!au->buf)
289 return -ENOMEM;
290 }
291 au->service = th->service;
292
293 msg_a = au->buf->vec.iov_base;
294 msg_a->struct_v = 1;
295 msg_a->global_id = cpu_to_le64(ac->global_id);
296 msg_a->service_id = cpu_to_le32(th->service);
297 msg_a->ticket_blob.struct_v = 1;
298 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
299 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
300 if (ticket_blob_len) {
301 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
302 th->ticket_blob->vec.iov_len);
303 }
304 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
305 le64_to_cpu(msg_a->ticket_blob.secret_id));
306
307 p = msg_a + 1;
308 p += ticket_blob_len;
309 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
310
311 get_random_bytes(&au->nonce, sizeof(au->nonce));
312 msg_b.struct_v = 1;
313 msg_b.nonce = cpu_to_le64(au->nonce);
314 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
315 p, end - p);
316 if (ret < 0)
317 goto out_buf;
318 p += ret;
319 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
320 dout(" built authorizer nonce %llx len %d\n", au->nonce,
321 (int)au->buf->vec.iov_len);
322 BUG_ON(au->buf->vec.iov_len > maxlen);
323 return 0;
324
325out_buf:
326 ceph_buffer_put(au->buf);
327 au->buf = NULL;
328 return ret;
329}
330
331static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
332 void **p, void *end)
333{
334 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
335 ceph_encode_8(p, 1);
336 ceph_encode_64(p, th->secret_id);
337 if (th->ticket_blob) {
338 const char *buf = th->ticket_blob->vec.iov_base;
339 u32 len = th->ticket_blob->vec.iov_len;
340
341 ceph_encode_32_safe(p, end, len, bad);
342 ceph_encode_copy_safe(p, end, buf, len, bad);
343 } else {
344 ceph_encode_32_safe(p, end, 0, bad);
345 }
346
347 return 0;
348bad:
349 return -ERANGE;
350}
351
352static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
353{
354 int want = ac->want_keys;
355 struct ceph_x_info *xi = ac->private;
356 int service;
357
358 *pneed = ac->want_keys & ~(xi->have_keys);
359
360 for (service = 1; service <= want; service <<= 1) {
361 struct ceph_x_ticket_handler *th;
362
363 if (!(ac->want_keys & service))
364 continue;
365
366 if (*pneed & service)
367 continue;
368
369 th = get_ticket_handler(ac, service);
370
371		if (IS_ERR(th)) {
372 *pneed |= service;
373 continue;
374 }
375
376 if (get_seconds() >= th->renew_after)
377 *pneed |= service;
378 if (get_seconds() >= th->expires)
379 xi->have_keys &= ~service;
380 }
381}
382
383
384static int ceph_x_build_request(struct ceph_auth_client *ac,
385 void *buf, void *end)
386{
387 struct ceph_x_info *xi = ac->private;
388 int need;
389 struct ceph_x_request_header *head = buf;
390 int ret;
391 struct ceph_x_ticket_handler *th =
392 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
393
394 ceph_x_validate_tickets(ac, &need);
395
396 dout("build_request want %x have %x need %x\n",
397 ac->want_keys, xi->have_keys, need);
398
399 if (need & CEPH_ENTITY_TYPE_AUTH) {
400 struct ceph_x_authenticate *auth = (void *)(head + 1);
401 void *p = auth + 1;
402 struct ceph_x_challenge_blob tmp;
403 char tmp_enc[40];
404 u64 *u;
405
406 if (p > end)
407 return -ERANGE;
408
409 dout(" get_auth_session_key\n");
410 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
411
412 /* encrypt and hash */
413 get_random_bytes(&auth->client_challenge, sizeof(u64));
414 tmp.client_challenge = auth->client_challenge;
415 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
416 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
417 tmp_enc, sizeof(tmp_enc));
418 if (ret < 0)
419 return ret;
420
421 auth->struct_v = 1;
422 auth->key = 0;
423 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
424 auth->key ^= *u;
425 dout(" server_challenge %llx client_challenge %llx key %llx\n",
426 xi->server_challenge, le64_to_cpu(auth->client_challenge),
427 le64_to_cpu(auth->key));
428
429		/* now encode the old ticket, if it exists */
430 ret = ceph_x_encode_ticket(th, &p, end);
431 if (ret < 0)
432 return ret;
433
434 return p - buf;
435 }
436
437 if (need) {
438 void *p = head + 1;
439 struct ceph_x_service_ticket_request *req;
440
441 if (p > end)
442 return -ERANGE;
443 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
444
445		BUG_ON(IS_ERR(th));
446 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
447 if (ret)
448 return ret;
449 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
450 xi->auth_authorizer.buf->vec.iov_len);
451
452 req = p;
453 req->keys = cpu_to_le32(need);
454 p += sizeof(*req);
455 return p - buf;
456 }
457
458 return 0;
459}
460
461static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
462 void *buf, void *end)
463{
464 struct ceph_x_info *xi = ac->private;
465 struct ceph_x_reply_header *head = buf;
466 struct ceph_x_ticket_handler *th;
467 int len = end - buf;
468 int op;
469 int ret;
470
471 if (result)
472 return result; /* XXX hmm? */
473
474 if (xi->starting) {
475 /* it's a hello */
476 struct ceph_x_server_challenge *sc = buf;
477
478 if (len != sizeof(*sc))
479 return -EINVAL;
480 xi->server_challenge = le64_to_cpu(sc->server_challenge);
481 dout("handle_reply got server challenge %llx\n",
482 xi->server_challenge);
483 xi->starting = false;
484 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
485 return -EAGAIN;
486 }
487
488 op = le32_to_cpu(head->op);
489 result = le32_to_cpu(head->result);
490 dout("handle_reply op %d result %d\n", op, result);
491 switch (op) {
492 case CEPHX_GET_AUTH_SESSION_KEY:
493 /* verify auth key */
494 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
495 buf + sizeof(*head), end);
496 break;
497
498 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
499 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
500		BUG_ON(IS_ERR(th));
501 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
502 buf + sizeof(*head), end);
503 break;
504
505 default:
506 return -EINVAL;
507 }
508 if (ret)
509 return ret;
510 if (ac->want_keys == xi->have_keys)
511 return 0;
512 return -EAGAIN;
513}
514
515static int ceph_x_create_authorizer(
516 struct ceph_auth_client *ac, int peer_type,
517 struct ceph_authorizer **a,
518 void **buf, size_t *len,
519 void **reply_buf, size_t *reply_len)
520{
521 struct ceph_x_authorizer *au;
522 struct ceph_x_ticket_handler *th;
523 int ret;
524
525 th = get_ticket_handler(ac, peer_type);
526 if (IS_ERR(th))
527 return PTR_ERR(th);
528
529 au = kzalloc(sizeof(*au), GFP_NOFS);
530 if (!au)
531 return -ENOMEM;
532
533 ret = ceph_x_build_authorizer(ac, th, au);
534 if (ret) {
535 kfree(au);
536 return ret;
537 }
538
539 *a = (struct ceph_authorizer *)au;
540 *buf = au->buf->vec.iov_base;
541 *len = au->buf->vec.iov_len;
542 *reply_buf = au->reply_buf;
543 *reply_len = sizeof(au->reply_buf);
544 return 0;
545}
546
547static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
548 struct ceph_authorizer *a, size_t len)
549{
550 struct ceph_x_authorizer *au = (void *)a;
551 struct ceph_x_ticket_handler *th;
552 int ret = 0;
553 struct ceph_x_authorize_reply reply;
554 void *p = au->reply_buf;
555 void *end = p + sizeof(au->reply_buf);
556
557 th = get_ticket_handler(ac, au->service);
558	if (IS_ERR(th))
559 return -EIO; /* hrm! */
560 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
561 if (ret < 0)
562 return ret;
563 if (ret != sizeof(reply))
564 return -EPERM;
565
566 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
567 ret = -EPERM;
568 else
569 ret = 0;
570 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
571 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
572 return ret;
573}
574
575static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
576 struct ceph_authorizer *a)
577{
578 struct ceph_x_authorizer *au = (void *)a;
579
580 ceph_buffer_put(au->buf);
581 kfree(au);
582}
583
584
585static void ceph_x_reset(struct ceph_auth_client *ac)
586{
587 struct ceph_x_info *xi = ac->private;
588
589 dout("reset\n");
590 xi->starting = true;
591 xi->server_challenge = 0;
592}
593
594static void ceph_x_destroy(struct ceph_auth_client *ac)
595{
596 struct ceph_x_info *xi = ac->private;
597 struct rb_node *p;
598
599 dout("ceph_x_destroy %p\n", ac);
600 ceph_crypto_key_destroy(&xi->secret);
601
602 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
603 struct ceph_x_ticket_handler *th =
604 rb_entry(p, struct ceph_x_ticket_handler, node);
605 remove_ticket_handler(ac, th);
606 }
607
608 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
609
610 kfree(ac->private);
611 ac->private = NULL;
612}
613
614static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
615 int peer_type)
616{
617 struct ceph_x_ticket_handler *th;
618
619 th = get_ticket_handler(ac, peer_type);
620 if (th && !IS_ERR(th))
621 remove_ticket_handler(ac, th);
622}
623
624
625static const struct ceph_auth_client_ops ceph_x_ops = {
626 .is_authenticated = ceph_x_is_authenticated,
627 .build_request = ceph_x_build_request,
628 .handle_reply = ceph_x_handle_reply,
629 .create_authorizer = ceph_x_create_authorizer,
630 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
631 .destroy_authorizer = ceph_x_destroy_authorizer,
632 .invalidate_authorizer = ceph_x_invalidate_authorizer,
633 .reset = ceph_x_reset,
634 .destroy = ceph_x_destroy,
635};
636
637
638int ceph_x_init(struct ceph_auth_client *ac)
639{
640 struct ceph_x_info *xi;
641 int ret;
642
643 dout("ceph_x_init %p\n", ac);
644 xi = kzalloc(sizeof(*xi), GFP_NOFS);
645 if (!xi)
646 return -ENOMEM;
647
648 ret = -ENOMEM;
649 ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
650 TEMP_TICKET_BUF_LEN, 8,
651 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
652 NULL);
653 if (!ceph_x_ticketbuf_cachep)
654 goto done_nomem;
655 ret = -EINVAL;
656 if (!ac->secret) {
657 pr_err("no secret set (for auth_x protocol)\n");
658 goto done_nomem;
659 }
660
661 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
662 if (ret)
663 goto done_nomem;
664
665 xi->starting = true;
666 xi->ticket_handlers = RB_ROOT;
667
668 ac->protocol = CEPH_AUTH_CEPHX;
669 ac->private = xi;
670 ac->ops = &ceph_x_ops;
671 return 0;
672
673done_nomem:
674 kfree(xi);
675 if (ceph_x_ticketbuf_cachep)
676 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
677 return ret;
678}
679
680
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..c67535d70aa6
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,81 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{
68 size_t len;
69
70 ceph_decode_need(p, end, sizeof(u32), bad);
71 len = ceph_decode_32(p);
72 dout("decode_buffer len %d\n", (int)len);
73 ceph_decode_need(p, end, len, bad);
74 *b = ceph_buffer_new(len, GFP_NOFS);
75 if (!*b)
76 return -ENOMEM;
77 ceph_decode_copy(p, (*b)->vec.iov_base, len);
78 return 0;
79bad:
80 return -EINVAL;
81}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
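
For orientation, the intended get/put discipline for these buffers — a
hypothetical snippet assuming kernel context (GFP_NOFS, as the callers
in this patch use):

	struct ceph_buffer *b = ceph_buffer_new(1024, GFP_NOFS);
	if (b) {
		struct ceph_buffer *shared = ceph_buffer_get(b); /* ref 2 */
		memcpy(b->vec.iov_base, "payload", 8);
		ceph_buffer_put(shared);   /* ref 1 */
		ceph_buffer_put(b);        /* ref 0: ceph_buffer_release */
	}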
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..3710e077a857
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2933 @@
1#include "ceph_debug.h"
2
3#include <linux/fs.h>
4#include <linux/kernel.h>
5#include <linux/sched.h>
6#include <linux/slab.h>
7#include <linux/vmalloc.h>
8#include <linux/wait.h>
9#include <linux/writeback.h>
10
11#include "super.h"
12#include "decode.h"
13#include "messenger.h"
14
15/*
16 * Capability management
17 *
18 * The Ceph metadata servers control client access to inode metadata
19 * and file data by issuing capabilities, granting clients permission
20 * to read and/or write both inode fields and file data to OSDs
21 * (storage nodes). Each capability consists of a set of bits
22 * indicating which operations are allowed.
23 *
24 * If the client holds a *_SHARED cap, the client has a coherent value
25 * that can be safely read from the cached inode.
26 *
27 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
28 * client is allowed to change inode attributes (e.g., file size,
29 * mtime), note its dirty state in the ceph_cap, and asynchronously
30 * flush that metadata change to the MDS.
31 *
32 * In the event of a conflicting operation (perhaps by another
33 * client), the MDS will revoke the conflicting client capabilities.
34 *
35 * In order for a client to cache an inode, it must hold a capability
36 * with at least one MDS server. When inodes are released, release
37 * notifications are batched and periodically sent en masse to the MDS
38 * cluster to release server state.
39 */
40
41
42/*
43 * Generate readable cap strings for debugging output.
44 */
45#define MAX_CAP_STR 20
46static char cap_str[MAX_CAP_STR][40];
47static DEFINE_SPINLOCK(cap_str_lock);
48static int last_cap_str;
49
50static char *gcap_string(char *s, int c)
51{
52 if (c & CEPH_CAP_GSHARED)
53 *s++ = 's';
54 if (c & CEPH_CAP_GEXCL)
55 *s++ = 'x';
56 if (c & CEPH_CAP_GCACHE)
57 *s++ = 'c';
58 if (c & CEPH_CAP_GRD)
59 *s++ = 'r';
60 if (c & CEPH_CAP_GWR)
61 *s++ = 'w';
62 if (c & CEPH_CAP_GBUFFER)
63 *s++ = 'b';
64 if (c & CEPH_CAP_GLAZYIO)
65 *s++ = 'l';
66 return s;
67}
68
69const char *ceph_cap_string(int caps)
70{
71 int i;
72 char *s;
73 int c;
74
75 spin_lock(&cap_str_lock);
76 i = last_cap_str++;
77 if (last_cap_str == MAX_CAP_STR)
78 last_cap_str = 0;
79 spin_unlock(&cap_str_lock);
80
81 s = cap_str[i];
82
83 if (caps & CEPH_CAP_PIN)
84 *s++ = 'p';
85
86 c = (caps >> CEPH_CAP_SAUTH) & 3;
87 if (c) {
88 *s++ = 'A';
89 s = gcap_string(s, c);
90 }
91
92 c = (caps >> CEPH_CAP_SLINK) & 3;
93 if (c) {
94 *s++ = 'L';
95 s = gcap_string(s, c);
96 }
97
98 c = (caps >> CEPH_CAP_SXATTR) & 3;
99 if (c) {
100 *s++ = 'X';
101 s = gcap_string(s, c);
102 }
103
104 c = caps >> CEPH_CAP_SFILE;
105 if (c) {
106 *s++ = 'F';
107 s = gcap_string(s, c);
108 }
109
110 if (s == cap_str[i])
111 *s++ = '-';
112 *s = 0;
113 return cap_str[i];
114}
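/*
 * A few example encodings, worked out by hand from the shifts above
 * (illustrative, not captured from a live log):
 *
 *   CEPH_CAP_PIN                            -> "p"
 *   CEPH_CAP_FILE_SHARED|CEPH_CAP_FILE_RD   -> "Fsr"
 *   CEPH_CAP_AUTH_EXCL|CEPH_CAP_FILE_BUFFER -> "AxFb"
 *   no bits set                             -> "-"
 */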
115
116/*
117 * Cap reservations
118 *
119 * Maintain a global pool of preallocated struct ceph_caps, referenced
120 * by struct ceph_cap_reservation. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{
137 INIT_LIST_HEAD(&caps_list);
138 spin_lock_init(&caps_list_lock);
139}
140
141void ceph_caps_finalize(void)
142{
143 struct ceph_cap *cap;
144
145 spin_lock(&caps_list_lock);
146 while (!list_empty(&caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
148 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap);
150 }
151 caps_total_count = 0;
152 caps_avail_count = 0;
153 caps_use_count = 0;
154 caps_reserve_count = 0;
155 caps_min_count = 0;
156 spin_unlock(&caps_list_lock);
157}
158
159void ceph_adjust_min_caps(int delta)
160{
161 spin_lock(&caps_list_lock);
162 caps_min_count += delta;
163 BUG_ON(caps_min_count < 0);
164 spin_unlock(&caps_list_lock);
165}
166
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
168{
169 int i;
170 struct ceph_cap *cap;
171 int have;
172 int alloc = 0;
173 LIST_HEAD(newcaps);
174 int ret = 0;
175
176 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177
178 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock);
180 if (caps_avail_count >= need)
181 have = need;
182 else
183 have = caps_avail_count;
184 caps_avail_count -= have;
185 caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
187 caps_avail_count);
188 spin_unlock(&caps_list_lock);
189
190 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
192 if (!cap) {
193 ret = -ENOMEM;
194 goto out_alloc_count;
195 }
196 list_add(&cap->caps_item, &newcaps);
197 alloc++;
198 }
199 BUG_ON(have + alloc != need);
200
201 spin_lock(&caps_list_lock);
202 caps_total_count += alloc;
203 caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list);
205
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
207 caps_avail_count);
208 spin_unlock(&caps_list_lock);
209
210 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count,
213 caps_avail_count);
214 return 0;
215
216out_alloc_count:
217 /* we didn't manage to reserve as much as we needed */
218 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
219 ctx, need, have);
220 return ret;
221}
222
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
224{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) {
227 spin_lock(&caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count;
231 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count,
234 caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
236 caps_avail_count);
237 spin_unlock(&caps_list_lock);
238 }
239 return 0;
240}
241
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
243{
244 struct ceph_cap *cap = NULL;
245
246 /* temporary, until we do something about cap import/export */
247 if (!ctx)
248 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
249
250 spin_lock(&caps_list_lock);
251 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
252 ctx, ctx->count, caps_total_count, caps_use_count,
253 caps_reserve_count, caps_avail_count);
254 BUG_ON(!ctx->count);
255 BUG_ON(ctx->count > caps_reserve_count);
256 BUG_ON(list_empty(&caps_list));
257
258 ctx->count--;
259 caps_reserve_count--;
260 caps_use_count++;
261
262 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
263 list_del(&cap->caps_item);
264
265 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
266 caps_avail_count);
267 spin_unlock(&caps_list_lock);
268 return cap;
269}
270
271void ceph_put_cap(struct ceph_cap *cap)
272{
273 spin_lock(&caps_list_lock);
274 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
275 cap, caps_total_count, caps_use_count,
276 caps_reserve_count, caps_avail_count);
277 caps_use_count--;
278 /*
279	 * Keep some preallocated caps around (caps_min_count), to
280 * avoid lots of free/alloc churn.
281 */
282 if (caps_avail_count >= caps_reserve_count + caps_min_count) {
283 caps_total_count--;
284 kmem_cache_free(ceph_cap_cachep, cap);
285 } else {
286 caps_avail_count++;
287 list_add(&cap->caps_item, &caps_list);
288 }
289
290 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
291 caps_avail_count);
292 spin_unlock(&caps_list_lock);
293}
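/*
 * Lifecycle sketch for the reservation machinery above (illustrative;
 * the real callers live in the MDS client code, outside this hunk):
 *
 *	struct ceph_cap_reservation ctx = { 0 };
 *
 *	ceph_reserve_caps(&ctx, need);    before decoding an MDS reply
 *	cap = get_cap(&ctx);              cannot fail: drawn from ctx
 *	...                               fill cap, __insert_cap_node()
 *	ceph_put_cap(cap);                back to the pool (or freed)
 *	ceph_unreserve_caps(&ctx);        return any unused reservations
 */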
294
295void ceph_reservation_status(struct ceph_client *client,
296 int *total, int *avail, int *used, int *reserved,
297 int *min)
298{
299 if (total)
300 *total = caps_total_count;
301 if (avail)
302 *avail = caps_avail_count;
303 if (used)
304 *used = caps_use_count;
305 if (reserved)
306 *reserved = caps_reserve_count;
307 if (min)
308 *min = caps_min_count;
309}
310
311/*
312 * Find ceph_cap for given mds, if any.
313 *
314 * Called with i_lock held.
315 */
316static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
317{
318 struct ceph_cap *cap;
319 struct rb_node *n = ci->i_caps.rb_node;
320
321 while (n) {
322 cap = rb_entry(n, struct ceph_cap, ci_node);
323 if (mds < cap->mds)
324 n = n->rb_left;
325 else if (mds > cap->mds)
326 n = n->rb_right;
327 else
328 return cap;
329 }
330 return NULL;
331}
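/*
 * ci->i_caps is an rbtree keyed by mds id; __insert_cap_node() below
 * keeps the same ordering and BUGs on a duplicate key, so an inode
 * holds at most one cap per mds.
 */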
332
333/*
334 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
335 * -1.
336 */
337static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
338{
339 struct ceph_cap *cap;
340 int mds = -1;
341 struct rb_node *p;
342
343 /* prefer mds with WR|WRBUFFER|EXCL caps */
344 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
345 cap = rb_entry(p, struct ceph_cap, ci_node);
346 mds = cap->mds;
347 if (mseq)
348 *mseq = cap->mseq;
349 if (cap->issued & (CEPH_CAP_FILE_WR |
350 CEPH_CAP_FILE_BUFFER |
351 CEPH_CAP_FILE_EXCL))
352 break;
353 }
354 return mds;
355}
356
357int ceph_get_cap_mds(struct inode *inode)
358{
359 int mds;
360 spin_lock(&inode->i_lock);
361 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
362 spin_unlock(&inode->i_lock);
363 return mds;
364}
365
366/*
367 * Called under i_lock.
368 */
369static void __insert_cap_node(struct ceph_inode_info *ci,
370 struct ceph_cap *new)
371{
372 struct rb_node **p = &ci->i_caps.rb_node;
373 struct rb_node *parent = NULL;
374 struct ceph_cap *cap = NULL;
375
376 while (*p) {
377 parent = *p;
378 cap = rb_entry(parent, struct ceph_cap, ci_node);
379 if (new->mds < cap->mds)
380 p = &(*p)->rb_left;
381 else if (new->mds > cap->mds)
382 p = &(*p)->rb_right;
383 else
384 BUG();
385 }
386
387 rb_link_node(&new->ci_node, parent, p);
388 rb_insert_color(&new->ci_node, &ci->i_caps);
389}
390
391/*
392 * (re)set cap hold timeouts, which control the delayed release
393 * of unused caps back to the MDS. Should be called on cap use.
394 */
395static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
396 struct ceph_inode_info *ci)
397{
398 struct ceph_mount_args *ma = mdsc->client->mount_args;
399
400 ci->i_hold_caps_min = round_jiffies(jiffies +
401 ma->caps_wanted_delay_min * HZ);
402 ci->i_hold_caps_max = round_jiffies(jiffies +
403 ma->caps_wanted_delay_max * HZ);
404 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
405 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
406}
407
408/*
409 * (Re)queue cap at the end of the delayed cap release list.
410 *
411 * If CEPH_I_FLUSH is set, leave the inode where it is (near the front).
412 *
413 * Caller holds i_lock
414 * -> we take mdsc->cap_delay_lock
415 */
416static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
417 struct ceph_inode_info *ci)
418{
419 __cap_set_timeouts(mdsc, ci);
420 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
421 ci->i_ceph_flags, ci->i_hold_caps_max);
422 if (!mdsc->stopping) {
423 spin_lock(&mdsc->cap_delay_lock);
424 if (!list_empty(&ci->i_cap_delay_list)) {
425 if (ci->i_ceph_flags & CEPH_I_FLUSH)
426 goto no_change;
427 list_del_init(&ci->i_cap_delay_list);
428 }
429 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
430no_change:
431 spin_unlock(&mdsc->cap_delay_lock);
432 }
433}
434
435/*
436 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
437 * indicating we should send a cap message to flush dirty metadata
438 * asap, and move to the front of the delayed cap list.
439 */
440static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
441 struct ceph_inode_info *ci)
442{
443 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
444 spin_lock(&mdsc->cap_delay_lock);
445 ci->i_ceph_flags |= CEPH_I_FLUSH;
446 if (!list_empty(&ci->i_cap_delay_list))
447 list_del_init(&ci->i_cap_delay_list);
448 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
449 spin_unlock(&mdsc->cap_delay_lock);
450}
451
452/*
453 * Cancel delayed work on cap.
454 *
455 * Caller must hold i_lock.
456 */
457static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
458 struct ceph_inode_info *ci)
459{
460 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
461 if (list_empty(&ci->i_cap_delay_list))
462 return;
463 spin_lock(&mdsc->cap_delay_lock);
464 list_del_init(&ci->i_cap_delay_list);
465 spin_unlock(&mdsc->cap_delay_lock);
466}
467
468/*
469 * Common issue checks for add_cap, handle_cap_grant.
470 */
471static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
472 unsigned issued)
473{
474 unsigned had = __ceph_caps_issued(ci, NULL);
475
476 /*
477 * Each time we receive FILE_CACHE anew, we increment
478 * i_rdcache_gen.
479 */
480 if ((issued & CEPH_CAP_FILE_CACHE) &&
481 (had & CEPH_CAP_FILE_CACHE) == 0)
482 ci->i_rdcache_gen++;
483
484 /*
485 * if we are newly issued FILE_SHARED, clear CEPH_I_COMPLETE; we
486 * don't know what happened to this directory while we didn't
487 * have the cap.
488 */
489 if ((issued & CEPH_CAP_FILE_SHARED) &&
490 (had & CEPH_CAP_FILE_SHARED) == 0) {
491 ci->i_shared_gen++;
492 if (S_ISDIR(ci->vfs_inode.i_mode)) {
493 dout(" marking %p NOT complete\n", &ci->vfs_inode);
494 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
495 }
496 }
497}
498
499/*
500 * Add a capability under the given MDS session.
501 *
502 * Caller should hold session snap_rwsem (read) and s_mutex.
503 *
504 * @fmode is the open file mode, if we are opening a file, otherwise
505 * it is < 0. (This is so we can atomically add the cap and add an
506 * open file reference to it.)
507 */
508int ceph_add_cap(struct inode *inode,
509 struct ceph_mds_session *session, u64 cap_id,
510 int fmode, unsigned issued, unsigned wanted,
511 unsigned seq, unsigned mseq, u64 realmino, int flags,
512 struct ceph_cap_reservation *caps_reservation)
513{
514 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
515 struct ceph_inode_info *ci = ceph_inode(inode);
516 struct ceph_cap *new_cap = NULL;
517 struct ceph_cap *cap;
518 int mds = session->s_mds;
519 int actual_wanted;
520
521 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
522 session->s_mds, cap_id, ceph_cap_string(issued), seq);
523
524 /*
525 * If we are opening the file, include file mode wanted bits
526 * in wanted.
527 */
528 if (fmode >= 0)
529 wanted |= ceph_caps_for_mode(fmode);
530
531retry:
532 spin_lock(&inode->i_lock);
533 cap = __get_cap_for_mds(ci, mds);
534 if (!cap) {
535 if (new_cap) {
536 cap = new_cap;
537 new_cap = NULL;
538 } else {
539 spin_unlock(&inode->i_lock);
540 new_cap = get_cap(caps_reservation);
541 if (new_cap == NULL)
542 return -ENOMEM;
543 goto retry;
544 }
545
546 cap->issued = 0;
547 cap->implemented = 0;
548 cap->mds = mds;
549 cap->mds_wanted = 0;
550
551 cap->ci = ci;
552 __insert_cap_node(ci, cap);
553
554 /* clear out old exporting info? (i.e. on cap import) */
555 if (ci->i_cap_exporting_mds == mds) {
556 ci->i_cap_exporting_issued = 0;
557 ci->i_cap_exporting_mseq = 0;
558 ci->i_cap_exporting_mds = -1;
559 }
560
561 /* add to session cap list */
562 cap->session = session;
563 spin_lock(&session->s_cap_lock);
564 list_add_tail(&cap->session_caps, &session->s_caps);
565 session->s_nr_caps++;
566 spin_unlock(&session->s_cap_lock);
567 }
568
569 if (!ci->i_snap_realm) {
570 /*
571 * add this inode to the appropriate snap realm
572 */
573 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
574 realmino);
575 if (realm) {
576 ceph_get_snap_realm(mdsc, realm);
577 spin_lock(&realm->inodes_with_caps_lock);
578 ci->i_snap_realm = realm;
579 list_add(&ci->i_snap_realm_item,
580 &realm->inodes_with_caps);
581 spin_unlock(&realm->inodes_with_caps_lock);
582 } else {
583 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
584 realmino);
585 }
586 }
587
588 __check_cap_issue(ci, cap, issued);
589
590 /*
591 * If we are issued caps we don't want, or the mds' wanted
592 * value appears to be off, queue a check so we'll release
593 * later and/or update the mds wanted value.
594 */
595 actual_wanted = __ceph_caps_wanted(ci);
596 if ((wanted & ~actual_wanted) ||
597 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
598 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
599 ceph_cap_string(issued), ceph_cap_string(wanted),
600 ceph_cap_string(actual_wanted));
601 __cap_delay_requeue(mdsc, ci);
602 }
603
604 if (flags & CEPH_CAP_FLAG_AUTH)
605 ci->i_auth_cap = cap;
606 else if (ci->i_auth_cap == cap)
607 ci->i_auth_cap = NULL;
608
609 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
610 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
611 ceph_cap_string(issued|cap->issued), seq, mds);
612 cap->cap_id = cap_id;
613 cap->issued = issued;
614 cap->implemented |= issued;
615 cap->mds_wanted |= wanted;
616 cap->seq = seq;
617 cap->issue_seq = seq;
618 cap->mseq = mseq;
619 cap->cap_gen = session->s_cap_gen;
620
621 if (fmode >= 0)
622 __ceph_get_fmode(ci, fmode);
623 spin_unlock(&inode->i_lock);
624 wake_up(&ci->i_cap_wq);
625 return 0;
626}
627
628/*
629 * Return true if cap has not timed out and belongs to the current
630 * generation of the MDS session (i.e. has not gone 'stale' due to
631 * us losing touch with the mds).
632 */
633static int __cap_is_valid(struct ceph_cap *cap)
634{
635 unsigned long ttl;
636 u32 gen;
637
638 spin_lock(&cap->session->s_cap_lock);
639 gen = cap->session->s_cap_gen;
640 ttl = cap->session->s_cap_ttl;
641 spin_unlock(&cap->session->s_cap_lock);
642
643 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
644 dout("__cap_is_valid %p cap %p issued %s "
645 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
646 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
647 return 0;
648 }
649
650 return 1;
651}
652
653/*
654 * Return set of valid cap bits issued to us. Note that caps time
655 * out, and may be invalidated in bulk if the client session times out
656 * and session->s_cap_gen is bumped.
657 */
658int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
659{
660 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
661 struct ceph_cap *cap;
662 struct rb_node *p;
663
664 if (implemented)
665 *implemented = 0;
666 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
667 cap = rb_entry(p, struct ceph_cap, ci_node);
668 if (!__cap_is_valid(cap))
669 continue;
670 dout("__ceph_caps_issued %p cap %p issued %s\n",
671 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
672 have |= cap->issued;
673 if (implemented)
674 *implemented |= cap->implemented;
675 }
676 return have;
677}
678
679/*
680 * Get cap bits issued by caps other than @ocap
681 */
682int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
683{
684 int have = ci->i_snap_caps;
685 struct ceph_cap *cap;
686 struct rb_node *p;
687
688 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
689 cap = rb_entry(p, struct ceph_cap, ci_node);
690 if (cap == ocap)
691 continue;
692 if (!__cap_is_valid(cap))
693 continue;
694 have |= cap->issued;
695 }
696 return have;
697}
698
699/*
700 * Move a cap to the end of the LRU (oldest caps at list head, newest
701 * at list tail).
702 */
703static void __touch_cap(struct ceph_cap *cap)
704{
705 struct ceph_mds_session *s = cap->session;
706
707 spin_lock(&s->s_cap_lock);
708 if (s->s_cap_iterator == NULL) {
709 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
710 s->s_mds);
711 list_move_tail(&cap->session_caps, &s->s_caps);
712 } else {
713 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
714 &cap->ci->vfs_inode, cap, s->s_mds);
715 }
716 spin_unlock(&s->s_cap_lock);
717}
718
719/*
720 * Check if we hold the given mask. If so, move the cap(s) to the
721 * tail (most-recently-used end) of their session cap LRUs. (This is
722 * the preferred way for callers to check for caps they want.)
723 */
724int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
725{
726 struct ceph_cap *cap;
727 struct rb_node *p;
728 int have = ci->i_snap_caps;
729
730 if ((have & mask) == mask) {
731 dout("__ceph_caps_issued_mask %p snap issued %s"
732 " (mask %s)\n", &ci->vfs_inode,
733 ceph_cap_string(have),
734 ceph_cap_string(mask));
735 return 1;
736 }
737
738 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
739 cap = rb_entry(p, struct ceph_cap, ci_node);
740 if (!__cap_is_valid(cap))
741 continue;
742 if ((cap->issued & mask) == mask) {
743 dout("__ceph_caps_issued_mask %p cap %p issued %s"
744 " (mask %s)\n", &ci->vfs_inode, cap,
745 ceph_cap_string(cap->issued),
746 ceph_cap_string(mask));
747 if (touch)
748 __touch_cap(cap);
749 return 1;
750 }
751
752 /* does a combination of caps satisfy mask? */
753 have |= cap->issued;
754 if ((have & mask) == mask) {
755 dout("__ceph_caps_issued_mask %p combo issued %s"
756 " (mask %s)\n", &ci->vfs_inode,
757 ceph_cap_string(cap->issued),
758 ceph_cap_string(mask));
759 if (touch) {
760 struct rb_node *q;
761
762 /* touch this + preceding caps */
763 __touch_cap(cap);
764 for (q = rb_first(&ci->i_caps); q != p;
765 q = rb_next(q)) {
766 cap = rb_entry(q, struct ceph_cap,
767 ci_node);
768 if (!__cap_is_valid(cap))
769 continue;
770 __touch_cap(cap);
771 }
772 }
773 return 1;
774 }
775 }
776
777 return 0;
778}
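/*
 * Example: a caller needing FILE_RD|FILE_CACHE is satisfied either by
 * one cap issuing both bits or by the union of several caps; in the
 * combination case every contributing cap is touched so none of them
 * ages out of the session LRU prematurely.
 */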
779
780/*
781 * Return true if mask caps are currently being revoked by an MDS.
782 */
783int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
784{
785 struct inode *inode = &ci->vfs_inode;
786 struct ceph_cap *cap;
787 struct rb_node *p;
788 int ret = 0;
789
790 spin_lock(&inode->i_lock);
791 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
792 cap = rb_entry(p, struct ceph_cap, ci_node);
793 if (__cap_is_valid(cap) &&
794 (cap->implemented & ~cap->issued & mask)) {
795 ret = 1;
796 break;
797 }
798 }
799 spin_unlock(&inode->i_lock);
800 dout("ceph_caps_revoking %p %s = %d\n", inode,
801 ceph_cap_string(mask), ret);
802 return ret;
803}
804
805int __ceph_caps_used(struct ceph_inode_info *ci)
806{
807 int used = 0;
808 if (ci->i_pin_ref)
809 used |= CEPH_CAP_PIN;
810 if (ci->i_rd_ref)
811 used |= CEPH_CAP_FILE_RD;
812 if (ci->i_rdcache_ref || ci->i_rdcache_gen)
813 used |= CEPH_CAP_FILE_CACHE;
814 if (ci->i_wr_ref)
815 used |= CEPH_CAP_FILE_WR;
816 if (ci->i_wrbuffer_ref)
817 used |= CEPH_CAP_FILE_BUFFER;
818 return used;
819}
820
821/*
822 * wanted, by virtue of open file modes
823 */
824int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
825{
826 int want = 0;
827 int mode;
828 for (mode = 0; mode < 4; mode++)
829 if (ci->i_nr_by_mode[mode])
830 want |= ceph_caps_for_mode(mode);
831 return want;
832}
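/*
 * The four i_nr_by_mode buckets presumably mirror the ceph file open
 * modes that ceph_caps_for_mode() understands (pin/rd/wr/rdwr).
 */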
833
834/*
835 * Return caps we have registered with the MDS(s) as 'wanted'.
836 */
837int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
838{
839 struct ceph_cap *cap;
840 struct rb_node *p;
841 int mds_wanted = 0;
842
843 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
844 cap = rb_entry(p, struct ceph_cap, ci_node);
845 if (!__cap_is_valid(cap))
846 continue;
847 mds_wanted |= cap->mds_wanted;
848 }
849 return mds_wanted;
850}
851
852/*
853 * called under i_lock
854 */
855static int __ceph_is_any_caps(struct ceph_inode_info *ci)
856{
857 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
858}
859
860/*
861 * caller should hold i_lock.
862 * caller will not hold session s_mutex if called from destroy_inode.
863 */
864void __ceph_remove_cap(struct ceph_cap *cap)
865{
866 struct ceph_mds_session *session = cap->session;
867 struct ceph_inode_info *ci = cap->ci;
868 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
869
870 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
871
872 /* remove from inode list */
873 rb_erase(&cap->ci_node, &ci->i_caps);
874 cap->ci = NULL;
875 if (ci->i_auth_cap == cap)
876 ci->i_auth_cap = NULL;
877
878 /* remove from session list */
879 spin_lock(&session->s_cap_lock);
880 if (session->s_cap_iterator == cap) {
881 /* not yet, we are iterating over this very cap */
882 dout("__ceph_remove_cap delaying %p removal from session %p\n",
883 cap, cap->session);
884 } else {
885 list_del_init(&cap->session_caps);
886 session->s_nr_caps--;
887 cap->session = NULL;
888 }
889 spin_unlock(&session->s_cap_lock);
890
891 if (cap->session == NULL)
892 ceph_put_cap(cap);
893
894 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
895 struct ceph_snap_realm *realm = ci->i_snap_realm;
896 spin_lock(&realm->inodes_with_caps_lock);
897 list_del_init(&ci->i_snap_realm_item);
898 ci->i_snap_realm_counter++;
899 ci->i_snap_realm = NULL;
900 spin_unlock(&realm->inodes_with_caps_lock);
901 ceph_put_snap_realm(mdsc, realm);
902 }
903 if (!__ceph_is_any_real_caps(ci))
904 __cap_delay_cancel(mdsc, ci);
905}
906
907/*
908 * Build and send a cap message to the given MDS.
909 *
910 * Caller should be holding s_mutex.
911 */
912static int send_cap_msg(struct ceph_mds_session *session,
913 u64 ino, u64 cid, int op,
914 int caps, int wanted, int dirty,
915 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
916 u64 size, u64 max_size,
917 struct timespec *mtime, struct timespec *atime,
918 u64 time_warp_seq,
919 uid_t uid, gid_t gid, mode_t mode,
920 u64 xattr_version,
921 struct ceph_buffer *xattrs_buf,
922 u64 follows)
923{
924 struct ceph_mds_caps *fc;
925 struct ceph_msg *msg;
926
927 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
928 " seq %u/%u mseq %u follows %lld size %llu/%llu"
929 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
930 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
931 ceph_cap_string(dirty),
932 seq, issue_seq, mseq, follows, size, max_size,
933 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
934
935 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
936 if (IS_ERR(msg))
937 return PTR_ERR(msg);
938
939 msg->hdr.tid = cpu_to_le64(flush_tid);
940
941 fc = msg->front.iov_base;
942 memset(fc, 0, sizeof(*fc));
943
944 fc->cap_id = cpu_to_le64(cid);
945 fc->op = cpu_to_le32(op);
946 fc->seq = cpu_to_le32(seq);
947 fc->issue_seq = cpu_to_le32(issue_seq);
948 fc->migrate_seq = cpu_to_le32(mseq);
949 fc->caps = cpu_to_le32(caps);
950 fc->wanted = cpu_to_le32(wanted);
951 fc->dirty = cpu_to_le32(dirty);
952 fc->ino = cpu_to_le64(ino);
953 fc->snap_follows = cpu_to_le64(follows);
954
955 fc->size = cpu_to_le64(size);
956 fc->max_size = cpu_to_le64(max_size);
957 if (mtime)
958 ceph_encode_timespec(&fc->mtime, mtime);
959 if (atime)
960 ceph_encode_timespec(&fc->atime, atime);
961 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
962
963 fc->uid = cpu_to_le32(uid);
964 fc->gid = cpu_to_le32(gid);
965 fc->mode = cpu_to_le32(mode);
966
967 fc->xattr_version = cpu_to_le64(xattr_version);
968 if (xattrs_buf) {
969 msg->middle = ceph_buffer_get(xattrs_buf);
970 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
971 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
972 }
973
974 ceph_con_send(&session->s_con, msg);
975 return 0;
976}
977
978/*
979 * Queue cap releases when an inode is dropped from our cache. Since
980 * the inode is about to be destroyed, there is no need for i_lock.
981 */
982void ceph_queue_caps_release(struct inode *inode)
983{
984 struct ceph_inode_info *ci = ceph_inode(inode);
985 struct rb_node *p;
986
987 p = rb_first(&ci->i_caps);
988 while (p) {
989 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
990 struct ceph_mds_session *session = cap->session;
991 struct ceph_msg *msg;
992 struct ceph_mds_cap_release *head;
993 struct ceph_mds_cap_item *item;
994
995 spin_lock(&session->s_cap_lock);
996 BUG_ON(!session->s_num_cap_releases);
997 msg = list_first_entry(&session->s_cap_releases,
998 struct ceph_msg, list_head);
999
1000 dout(" adding %p release to mds%d msg %p (%d left)\n",
1001 inode, session->s_mds, msg, session->s_num_cap_releases);
1002
1003 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1004 head = msg->front.iov_base;
1005 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1006 item = msg->front.iov_base + msg->front.iov_len;
1007 item->ino = cpu_to_le64(ceph_ino(inode));
1008 item->cap_id = cpu_to_le64(cap->cap_id);
1009 item->migrate_seq = cpu_to_le32(cap->mseq);
1010 item->seq = cpu_to_le32(cap->issue_seq);
1011
1012 session->s_num_cap_releases--;
1013
1014 msg->front.iov_len += sizeof(*item);
1015 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1016 dout(" release msg %p full\n", msg);
1017 list_move_tail(&msg->list_head,
1018 &session->s_cap_releases_done);
1019 } else {
1020 dout(" release msg %p at %d/%d (%d)\n", msg,
1021 (int)le32_to_cpu(head->num),
1022 (int)CEPH_CAPS_PER_RELEASE,
1023 (int)msg->front.iov_len);
1024 }
1025 spin_unlock(&session->s_cap_lock);
1026 p = rb_next(p);
1027 __ceph_remove_cap(cap);
1028 }
1029}
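/*
 * Releases are batched: each preallocated message on s_cap_releases
 * holds up to CEPH_CAPS_PER_RELEASE items; once full, it moves to
 * s_cap_releases_done, presumably to be sent to the mds later rather
 * than from this path.
 */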
1030
1031/*
1032 * Send a cap msg on the given inode. Update our caps state, then
1033 * drop i_lock and send the message.
1034 *
1035 * Make note of max_size reported/requested from mds, revoked caps
1036 * that have now been implemented.
1037 *
1038 * Make a half-hearted attempt to invalidate the page cache if we are
1039 * dropping RDCACHE. Note that this will leave behind locked pages
1040 * that we'll then need to deal with elsewhere.
1041 *
1042 * Return non-zero if delayed release, or we experienced an error
1043 * such that the caller should requeue + retry later.
1044 *
1045 * called with i_lock, then drops it.
1046 * caller should hold snap_rwsem (read), s_mutex.
1047 */
1048static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1049 int op, int used, int want, int retain, int flushing,
1050 unsigned *pflush_tid)
1051 __releases(cap->ci->vfs_inode->i_lock)
1052{
1053 struct ceph_inode_info *ci = cap->ci;
1054 struct inode *inode = &ci->vfs_inode;
1055 u64 cap_id = cap->cap_id;
1056 int held, revoking, dropping, keep;
1057 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1058 u64 size, max_size;
1059 struct timespec mtime, atime;
1060 int wake = 0;
1061 mode_t mode;
1062 uid_t uid;
1063 gid_t gid;
1064 struct ceph_mds_session *session;
1065 u64 xattr_version = 0;
1066 int delayed = 0;
1067 u64 flush_tid = 0;
1068 int i;
1069 int ret;
1070
1071 held = cap->issued | cap->implemented;
1072 revoking = cap->implemented & ~cap->issued;
1073 retain &= ~revoking;
1074 dropping = cap->issued & ~retain;
1075
1076 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1077 inode, cap, cap->session,
1078 ceph_cap_string(held), ceph_cap_string(held & retain),
1079 ceph_cap_string(revoking));
1080 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1081
1082 session = cap->session;
1083
1084 /* don't release wanted unless we've waited a bit. */
1085 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1086 time_before(jiffies, ci->i_hold_caps_min)) {
1087 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1088 ceph_cap_string(cap->issued),
1089 ceph_cap_string(cap->issued & retain),
1090 ceph_cap_string(cap->mds_wanted),
1091 ceph_cap_string(want));
1092 want |= cap->mds_wanted;
1093 retain |= cap->issued;
1094 delayed = 1;
1095 }
1096 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1097
1098 cap->issued &= retain; /* drop bits we don't want */
1099 if (cap->implemented & ~cap->issued) {
1100 /*
1101 * Wake up any waiters on wanted -> needed transition.
1102 * This is due to the weird transition from buffered
1103 * to sync IO... we need to flush dirty pages _before_
1104 * allowing sync writes to avoid reordering.
1105 */
1106 wake = 1;
1107 }
1108 cap->implemented &= cap->issued | used;
1109 cap->mds_wanted = want;
1110
1111 if (flushing) {
1112 /*
1113 * assign a tid for flush operations so we can avoid
1114 * flush1 -> dirty1 -> flush2 -> flushack1 -> "mark
1115 * clean" races. Track the latest tid for every bit
1116 * so we can handle flush AxFw, flush Fw, and have the
1117 * first ack clean Ax.
1118 */
1119 flush_tid = ++ci->i_cap_flush_last_tid;
1120 if (pflush_tid)
1121 *pflush_tid = flush_tid;
1122 dout(" cap_flush_tid %d\n", (int)flush_tid);
1123 for (i = 0; i < CEPH_CAP_BITS; i++)
1124 if (flushing & (1 << i))
1125 ci->i_cap_flush_tid[i] = flush_tid;
1126 }
1127
1128 keep = cap->implemented;
1129 seq = cap->seq;
1130 issue_seq = cap->issue_seq;
1131 mseq = cap->mseq;
1132 size = inode->i_size;
1133 ci->i_reported_size = size;
1134 max_size = ci->i_wanted_max_size;
1135 ci->i_requested_max_size = max_size;
1136 mtime = inode->i_mtime;
1137 atime = inode->i_atime;
1138 time_warp_seq = ci->i_time_warp_seq;
1139 follows = ci->i_snap_realm->cached_context->seq;
1140 uid = inode->i_uid;
1141 gid = inode->i_gid;
1142 mode = inode->i_mode;
1143
1144 if (dropping & CEPH_CAP_XATTR_EXCL) {
1145 __ceph_build_xattrs_blob(ci);
1146 xattr_version = ci->i_xattrs.version + 1;
1147 }
1148
1149 spin_unlock(&inode->i_lock);
1150
1151 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1152 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1153 size, max_size, &mtime, &atime, time_warp_seq,
1154 uid, gid, mode,
1155 xattr_version,
1156 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1157 follows);
1158 if (ret < 0) {
1159 dout("error sending cap msg, must requeue %p\n", inode);
1160 delayed = 1;
1161 }
1162
1163 if (wake)
1164 wake_up(&ci->i_cap_wq);
1165
1166 return delayed;
1167}
1168
1169/*
1170 * When a snapshot is taken, clients accumulate dirty metadata on
1171 * inodes with capabilities in ceph_cap_snaps to describe the file
1172 * state at the time the snapshot was taken. This must be flushed
1173 * asynchronously back to the MDS once sync writes complete and dirty
1174 * data is written out.
1175 *
1176 * Called under i_lock. Takes s_mutex as needed.
1177 */
1178void __ceph_flush_snaps(struct ceph_inode_info *ci,
1179 struct ceph_mds_session **psession)
1180{
1181 struct inode *inode = &ci->vfs_inode;
1182 int mds;
1183 struct ceph_cap_snap *capsnap;
1184 u32 mseq;
1185 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1186 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1187 session->s_mutex */
1188 u64 next_follows = 0; /* keep track of how far we've gotten through the
1189 i_cap_snaps list, and skip these entries next time
1190 around to avoid an infinite loop */
1191
1192 if (psession)
1193 session = *psession;
1194
1195 dout("__flush_snaps %p\n", inode);
1196retry:
1197 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1198 /* avoid an infinite loop after a retry */
1199 if (capsnap->follows < next_follows)
1200 continue;
1201 /*
1202 * we need to wait for sync writes to complete and for dirty
1203 * pages to be written out.
1204 */
1205 if (capsnap->dirty_pages || capsnap->writing)
1206 continue;
1207
1208 /* pick mds, take s_mutex */
1209 mds = __ceph_get_cap_mds(ci, &mseq);
1210 if (session && session->s_mds != mds) {
1211 dout("oops, wrong session %p mutex\n", session);
1212 mutex_unlock(&session->s_mutex);
1213 ceph_put_mds_session(session);
1214 session = NULL;
1215 }
1216 if (!session) {
1217 spin_unlock(&inode->i_lock);
1218 mutex_lock(&mdsc->mutex);
1219 session = __ceph_lookup_mds_session(mdsc, mds);
1220 mutex_unlock(&mdsc->mutex);
1221 if (session) {
1222 dout("inverting session/ino locks on %p\n",
1223 session);
1224 mutex_lock(&session->s_mutex);
1225 }
1226 /*
1227 * if session == NULL, we raced against a cap
1228 * deletion. retry, and we'll get a better
1229 * @mds value next time.
1230 */
1231 spin_lock(&inode->i_lock);
1232 goto retry;
1233 }
1234
1235 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1236 atomic_inc(&capsnap->nref);
1237 if (!list_empty(&capsnap->flushing_item))
1238 list_del_init(&capsnap->flushing_item);
1239 list_add_tail(&capsnap->flushing_item,
1240 &session->s_cap_snaps_flushing);
1241 spin_unlock(&inode->i_lock);
1242
1243 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
1244 inode, capsnap, next_follows, capsnap->size);
1245 send_cap_msg(session, ceph_vino(inode).ino, 0,
1246 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1247 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1248 capsnap->size, 0,
1249 &capsnap->mtime, &capsnap->atime,
1250 capsnap->time_warp_seq,
1251 capsnap->uid, capsnap->gid, capsnap->mode,
1252 0, NULL,
1253 capsnap->follows);
1254
1255 next_follows = capsnap->follows + 1;
1256 ceph_put_cap_snap(capsnap);
1257
1258 spin_lock(&inode->i_lock);
1259 goto retry;
1260 }
1261
1262 /* we flushed them all; remove this inode from the queue */
1263 spin_lock(&mdsc->snap_flush_lock);
1264 list_del_init(&ci->i_snap_flush_item);
1265 spin_unlock(&mdsc->snap_flush_lock);
1266
1267 if (psession)
1268 *psession = session;
1269 else if (session) {
1270 mutex_unlock(&session->s_mutex);
1271 ceph_put_mds_session(session);
1272 }
1273}
1274
1275static void ceph_flush_snaps(struct ceph_inode_info *ci)
1276{
1277 struct inode *inode = &ci->vfs_inode;
1278
1279 spin_lock(&inode->i_lock);
1280 __ceph_flush_snaps(ci, NULL);
1281 spin_unlock(&inode->i_lock);
1282}
1283
1284/*
1285 * Mark caps dirty. If inode is newly dirty, add to the global dirty
1286 * list.
1287 */
1288void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1289{
1290 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
1291 struct inode *inode = &ci->vfs_inode;
1292 int was = ci->i_dirty_caps;
1293 int dirty = 0;
1294
1295 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1296 ceph_cap_string(mask), ceph_cap_string(was),
1297 ceph_cap_string(was | mask));
1298 ci->i_dirty_caps |= mask;
1299 if (was == 0) {
1300 dout(" inode %p now dirty\n", &ci->vfs_inode);
1301 BUG_ON(!list_empty(&ci->i_dirty_item));
1302 spin_lock(&mdsc->cap_dirty_lock);
1303 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1304 spin_unlock(&mdsc->cap_dirty_lock);
1305 if (ci->i_flushing_caps == 0) {
1306 igrab(inode);
1307 dirty |= I_DIRTY_SYNC;
1308 }
1309 }
1310 BUG_ON(list_empty(&ci->i_dirty_item));
1311 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1312 (mask & CEPH_CAP_FILE_BUFFER))
1313 dirty |= I_DIRTY_DATASYNC;
1314 if (dirty)
1315 __mark_inode_dirty(inode, dirty);
1316 __cap_delay_requeue(mdsc, ci);
1317}
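/*
 * Note the igrab() above: the first time the inode goes dirty with
 * nothing flushing, we pin it in memory so it cannot be evicted with
 * dirty metadata outstanding; the matching iput() happens elsewhere,
 * presumably once the flush completes.
 */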
1318
1319/*
1320 * Add dirty inode to the flushing list. Assign it a seq number so we
1321 * can wait for caps to flush without starving.
1322 *
1323 * Called under i_lock.
1324 */
1325static int __mark_caps_flushing(struct inode *inode,
1326 struct ceph_mds_session *session)
1327{
1328 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1329 struct ceph_inode_info *ci = ceph_inode(inode);
1330 int flushing;
1331
1332 BUG_ON(ci->i_dirty_caps == 0);
1333 BUG_ON(list_empty(&ci->i_dirty_item));
1334
1335 flushing = ci->i_dirty_caps;
1336 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1337 ceph_cap_string(flushing),
1338 ceph_cap_string(ci->i_flushing_caps),
1339 ceph_cap_string(ci->i_flushing_caps | flushing));
1340 ci->i_flushing_caps |= flushing;
1341 ci->i_dirty_caps = 0;
1342 dout(" inode %p now !dirty\n", inode);
1343
1344 spin_lock(&mdsc->cap_dirty_lock);
1345 list_del_init(&ci->i_dirty_item);
1346
1347 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1348 if (list_empty(&ci->i_flushing_item)) {
1349 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1350 mdsc->num_cap_flushing++;
1351 dout(" inode %p now flushing seq %lld\n", inode,
1352 ci->i_cap_flush_seq);
1353 } else {
1354 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1355 dout(" inode %p now flushing (more) seq %lld\n", inode,
1356 ci->i_cap_flush_seq);
1357 }
1358 spin_unlock(&mdsc->cap_dirty_lock);
1359
1360 return flushing;
1361}
1362
1363/*
1364 * try to invalidate mapping pages without blocking.
1365 */
1366static int mapping_is_empty(struct address_space *mapping)
1367{
1368 struct page *page = find_get_page(mapping, 0);
1369
1370 if (!page)
1371 return 1;
1372
1373 put_page(page);
1374 return 0;
1375}
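/*
 * Note this probes only page index 0, so it is a cheap heuristic
 * rather than an exhaustive emptiness check.
 */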
1376
1377static int try_nonblocking_invalidate(struct inode *inode)
1378{
1379 struct ceph_inode_info *ci = ceph_inode(inode);
1380 u32 invalidating_gen = ci->i_rdcache_gen;
1381
1382 spin_unlock(&inode->i_lock);
1383 invalidate_mapping_pages(&inode->i_data, 0, -1);
1384 spin_lock(&inode->i_lock);
1385
1386 if (mapping_is_empty(&inode->i_data) &&
1387 invalidating_gen == ci->i_rdcache_gen) {
1388 /* success. */
1389 dout("try_nonblocking_invalidate %p success\n", inode);
1390 ci->i_rdcache_gen = 0;
1391 ci->i_rdcache_revoking = 0;
1392 return 0;
1393 }
1394 dout("try_nonblocking_invalidate %p failed\n", inode);
1395 return -1;
1396}
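/*
 * The i_rdcache_gen snapshot/re-check catches a FILE_CACHE re-issue
 * (which bumps the gen in __check_cap_issue()) that raced with the
 * invalidate while i_lock was dropped; in that case we cannot claim
 * the cache is clean.
 */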
1397
1398/*
1399 * Swiss army knife function to examine currently used and wanted
1400 * versus held caps. Release, flush, ack revoked caps to mds as
1401 * appropriate.
1402 *
1403 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1404 * cap release further.
1405 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1406 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1407 * further delay.
1408 */
1409void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1410 struct ceph_mds_session *session)
1411 __releases(session->s_mutex)
1412{
1413 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1414 struct ceph_mds_client *mdsc = &client->mdsc;
1415 struct inode *inode = &ci->vfs_inode;
1416 struct ceph_cap *cap;
1417 int file_wanted, used;
1418 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1419 int issued, implemented, want, retain, revoking, flushing = 0;
1420 int mds = -1; /* keep track of how far we've gone through i_caps list
1421 to avoid an infinite loop on retry */
1422 struct rb_node *p;
1423 int tried_invalidate = 0;
1424 int delayed = 0, sent = 0, force_requeue = 0, num;
1425 int queue_invalidate = 0;
1426 int is_delayed = flags & CHECK_CAPS_NODELAY;
1427
1428 /* if we are unmounting, flush any unused caps immediately. */
1429 if (mdsc->stopping)
1430 is_delayed = 1;
1431
1432 spin_lock(&inode->i_lock);
1433
1434 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1435 flags |= CHECK_CAPS_FLUSH;
1436
1437 /* flush snaps first time around only */
1438 if (!list_empty(&ci->i_cap_snaps))
1439 __ceph_flush_snaps(ci, &session);
1440 goto retry_locked;
1441retry:
1442 spin_lock(&inode->i_lock);
1443retry_locked:
1444 file_wanted = __ceph_caps_file_wanted(ci);
1445 used = __ceph_caps_used(ci);
1446 want = file_wanted | used;
1447 issued = __ceph_caps_issued(ci, &implemented);
1448 revoking = implemented & ~issued;
1449
1450 retain = want | CEPH_CAP_PIN;
1451 if (!mdsc->stopping && inode->i_nlink > 0) {
1452 if (want) {
1453 retain |= CEPH_CAP_ANY; /* be greedy */
1454 } else {
1455 retain |= CEPH_CAP_ANY_SHARED;
1456 /*
1457 * keep RD only if we didn't have the file open RW,
1458 * because then the mds would revoke it anyway to
1459 * journal max_size=0.
1460 */
1461 if (ci->i_max_size == 0)
1462 retain |= CEPH_CAP_ANY_RD;
1463 }
1464 }
1465
1466 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1467 " issued %s revoking %s retain %s %s%s%s\n", inode,
1468 ceph_cap_string(file_wanted),
1469 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1470 ceph_cap_string(ci->i_flushing_caps),
1471 ceph_cap_string(issued), ceph_cap_string(revoking),
1472 ceph_cap_string(retain),
1473 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1474 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1475 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1476
1477 /*
1478 * If we no longer need to hold onto our old caps, and we may have
1479 * cached pages that we no longer want, then try to invalidate.
1480 * If we fail, it's because pages are locked... try again later.
1481 */
1482 if ((!is_delayed || mdsc->stopping) &&
1483 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1484 ci->i_rdcache_gen && /* may have cached pages */
1485 (file_wanted == 0 || /* no open files */
1486 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
1487 !tried_invalidate) {
1488 dout("check_caps trying to invalidate on %p\n", inode);
1489 if (try_nonblocking_invalidate(inode) < 0) {
1490 if (revoking & CEPH_CAP_FILE_CACHE) {
1491 dout("check_caps queuing invalidate\n");
1492 queue_invalidate = 1;
1493 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1494 } else {
1495 dout("check_caps failed to invalidate pages\n");
1496 /* we failed to invalidate pages. check these
1497 caps again later. */
1498 force_requeue = 1;
1499 __cap_set_timeouts(mdsc, ci);
1500 }
1501 }
1502 tried_invalidate = 1;
1503 goto retry_locked;
1504 }
1505
1506 num = 0;
1507 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1508 cap = rb_entry(p, struct ceph_cap, ci_node);
1509 num++;
1510
1511 /* avoid looping forever */
1512 if (mds >= cap->mds ||
1513 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1514 continue;
1515
1516 /* NOTE: no side-effects allowed, until we take s_mutex */
1517
1518 revoking = cap->implemented & ~cap->issued;
1519 if (revoking)
1520 dout(" mds%d revoking %s\n", cap->mds,
1521 ceph_cap_string(revoking));
1522
1523 if (cap == ci->i_auth_cap &&
1524 (cap->issued & CEPH_CAP_FILE_WR)) {
1525 /* request larger max_size from MDS? */
1526 if (ci->i_wanted_max_size > ci->i_max_size &&
1527 ci->i_wanted_max_size > ci->i_requested_max_size) {
1528 dout("requesting new max_size\n");
1529 goto ack;
1530 }
1531
1532 /* approaching file_max? */
1533 if ((inode->i_size << 1) >= ci->i_max_size &&
1534 (ci->i_reported_size << 1) < ci->i_max_size) {
1535 dout("i_size approaching max_size\n");
1536 goto ack;
1537 }
1538 }
1539 /* flush anything dirty? */
1540 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1541 ci->i_dirty_caps) {
1542 dout("flushing dirty caps\n");
1543 goto ack;
1544 }
1545
1546 /* completed revocation? going down and there are no caps? */
1547 if (revoking && (revoking & used) == 0) {
1548 dout("completed revocation of %s\n",
1549 ceph_cap_string(cap->implemented & ~cap->issued));
1550 goto ack;
1551 }
1552
1553 /* want more caps from mds? */
1554 if (want & ~(cap->mds_wanted | cap->issued))
1555 goto ack;
1556
1557 /* things we might delay */
1558 if ((cap->issued & ~retain) == 0 &&
1559 cap->mds_wanted == want)
1560 continue; /* nope, all good */
1561
1562 if (is_delayed)
1563 goto ack;
1564
1565 /* delay? */
1566 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1567 time_before(jiffies, ci->i_hold_caps_max)) {
1568 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1569 ceph_cap_string(cap->issued),
1570 ceph_cap_string(cap->issued & retain),
1571 ceph_cap_string(cap->mds_wanted),
1572 ceph_cap_string(want));
1573 delayed++;
1574 continue;
1575 }
1576
1577ack:
1578 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1579 dout(" skipping %p I_NOFLUSH set\n", inode);
1580 continue;
1581 }
1582
1583 if (session && session != cap->session) {
1584 dout("oops, wrong session %p mutex\n", session);
1585 mutex_unlock(&session->s_mutex);
1586 session = NULL;
1587 }
1588 if (!session) {
1589 session = cap->session;
1590 if (mutex_trylock(&session->s_mutex) == 0) {
1591 dout("inverting session/ino locks on %p\n",
1592 session);
1593 spin_unlock(&inode->i_lock);
1594 if (took_snap_rwsem) {
1595 up_read(&mdsc->snap_rwsem);
1596 took_snap_rwsem = 0;
1597 }
1598 mutex_lock(&session->s_mutex);
1599 goto retry;
1600 }
1601 }
1602 /* take snap_rwsem after session mutex */
1603 if (!took_snap_rwsem) {
1604 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1605 dout("inverting snap/in locks on %p\n",
1606 inode);
1607 spin_unlock(&inode->i_lock);
1608 down_read(&mdsc->snap_rwsem);
1609 took_snap_rwsem = 1;
1610 goto retry;
1611 }
1612 took_snap_rwsem = 1;
1613 }
1614
1615 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1616 flushing = __mark_caps_flushing(inode, session);
1617
1618 mds = cap->mds; /* remember mds, so we don't repeat */
1619 sent++;
1620
1621 /* __send_cap drops i_lock */
1622 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1623 retain, flushing, NULL);
1624 goto retry; /* retake i_lock and restart our cap scan. */
1625 }
1626
1627 /*
1628 * Reschedule delayed caps release if we delayed anything,
1629 * otherwise cancel.
1630 */
1631 if (delayed && is_delayed)
1632 force_requeue = 1; /* __send_cap delayed release; requeue */
1633 if (!delayed && !is_delayed)
1634 __cap_delay_cancel(mdsc, ci);
1635 else if (!is_delayed || force_requeue)
1636 __cap_delay_requeue(mdsc, ci);
1637
1638 spin_unlock(&inode->i_lock);
1639
1640 if (queue_invalidate)
1641 ceph_queue_invalidate(inode);
1642
1643 if (session)
1644 mutex_unlock(&session->s_mutex);
1645 if (took_snap_rwsem)
1646 up_read(&mdsc->snap_rwsem);
1647}
1648
1649/*
1650 * Try to flush dirty caps back to the auth mds.
1651 */
1652static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1653 unsigned *flush_tid)
1654{
1655 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1656 struct ceph_inode_info *ci = ceph_inode(inode);
1657 int unlock_session = session ? 0 : 1;
1658 int flushing = 0;
1659
1660retry:
1661 spin_lock(&inode->i_lock);
1662 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1663 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1664 goto out;
1665 }
1666 if (ci->i_dirty_caps && ci->i_auth_cap) {
1667 struct ceph_cap *cap = ci->i_auth_cap;
1668 int used = __ceph_caps_used(ci);
1669 int want = __ceph_caps_wanted(ci);
1670 int delayed;
1671
1672 if (!session) {
1673 spin_unlock(&inode->i_lock);
1674 session = cap->session;
1675 mutex_lock(&session->s_mutex);
1676 goto retry;
1677 }
1678 BUG_ON(session != cap->session);
1679 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1680 goto out;
1681
1682 flushing = __mark_caps_flushing(inode, session);
1683
1684 /* __send_cap drops i_lock */
1685 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1686 cap->issued | cap->implemented, flushing,
1687 flush_tid);
1688 if (!delayed)
1689 goto out_unlocked;
1690
1691 spin_lock(&inode->i_lock);
1692 __cap_delay_requeue(mdsc, ci);
1693 }
1694out:
1695 spin_unlock(&inode->i_lock);
1696out_unlocked:
1697 if (session && unlock_session)
1698 mutex_unlock(&session->s_mutex);
1699 return flushing;
1700}
1701
1702/*
1703 * Return true if we've flushed caps through the given flush_tid.
1704 */
1705static int caps_are_flushed(struct inode *inode, unsigned tid)
1706{
1707 struct ceph_inode_info *ci = ceph_inode(inode);
1708 int dirty, i, ret = 1;
1709
1710 spin_lock(&inode->i_lock);
1711 dirty = __ceph_caps_dirty(ci);
1712 for (i = 0; i < CEPH_CAP_BITS; i++)
1713 if ((ci->i_flushing_caps & (1 << i)) &&
1714 ci->i_cap_flush_tid[i] <= tid) {
1715 /* still flushing this bit */
1716 ret = 0;
1717 break;
1718 }
1719 spin_unlock(&inode->i_lock);
1720 return ret;
1721}
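/*
 * Each cap bit carries its own most recent flush tid (assigned in
 * __send_cap()), so a newer flush of some other bit does not make an
 * older, still-pending flush of this bit look complete.
 */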
1722
1723/*
1724 * Wait on any unsafe replies for the given inode. First wait on the
1725 * newest request, and make that the upper bound. Then, if there are
1726 * more requests, keep waiting on the oldest as long as its tid is
1727 * still older than that upper bound.
1728 */
1729static void sync_write_wait(struct inode *inode)
1730{
1731 struct ceph_inode_info *ci = ceph_inode(inode);
1732 struct list_head *head = &ci->i_unsafe_writes;
1733 struct ceph_osd_request *req;
1734 u64 last_tid;
1735
1736 spin_lock(&ci->i_unsafe_lock);
1737 if (list_empty(head))
1738 goto out;
1739
1740 /* set upper bound as _last_ entry in chain */
1741 req = list_entry(head->prev, struct ceph_osd_request,
1742 r_unsafe_item);
1743 last_tid = req->r_tid;
1744
1745 do {
1746 ceph_osdc_get_request(req);
1747 spin_unlock(&ci->i_unsafe_lock);
1748 dout("sync_write_wait on tid %llu (until %llu)\n",
1749 req->r_tid, last_tid);
1750 wait_for_completion(&req->r_safe_completion);
1751 spin_lock(&ci->i_unsafe_lock);
1752 ceph_osdc_put_request(req);
1753
1754 /*
1755 * from here on look at first entry in chain, since we
1756 * only want to wait for anything older than last_tid
1757 */
1758 if (list_empty(head))
1759 break;
1760 req = list_entry(head->next, struct ceph_osd_request,
1761 r_unsafe_item);
1762 } while (req->r_tid < last_tid);
1763out:
1764 spin_unlock(&ci->i_unsafe_lock);
1765}
1766
1767int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
1768{
1769 struct inode *inode = dentry->d_inode;
1770 struct ceph_inode_info *ci = ceph_inode(inode);
1771 unsigned flush_tid;
1772 int ret;
1773 int dirty;
1774
1775 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1776 sync_write_wait(inode);
1777
1778 ret = filemap_write_and_wait(inode->i_mapping);
1779 if (ret < 0)
1780 return ret;
1781
1782 dirty = try_flush_caps(inode, NULL, &flush_tid);
1783 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1784
1785 /*
1786 * only wait on non-file metadata writeback (the mds
1787 * can recover size and mtime, so we don't need to
1788 * wait for that)
1789 */
1790 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1791 dout("fsync waiting for flush_tid %u\n", flush_tid);
1792 ret = wait_event_interruptible(ci->i_cap_wq,
1793 caps_are_flushed(inode, flush_tid));
1794 }
1795
1796 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1797 return ret;
1798}
1799
1800/*
1801 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1802 * queue inode for flush but don't do so immediately, because we can
1803 * get by with fewer MDS messages if we wait for data writeback to
1804 * complete first.
1805 */
1806int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1807{
1808 struct ceph_inode_info *ci = ceph_inode(inode);
1809 unsigned flush_tid;
1810 int err = 0;
1811 int dirty;
1812 int wait = wbc->sync_mode == WB_SYNC_ALL;
1813
1814 dout("write_inode %p wait=%d\n", inode, wait);
1815 if (wait) {
1816 dirty = try_flush_caps(inode, NULL, &flush_tid);
1817 if (dirty)
1818 err = wait_event_interruptible(ci->i_cap_wq,
1819 caps_are_flushed(inode, flush_tid));
1820 } else {
1821 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1822
1823 spin_lock(&inode->i_lock);
1824 if (__ceph_caps_dirty(ci))
1825 __cap_delay_requeue_front(mdsc, ci);
1826 spin_unlock(&inode->i_lock);
1827 }
1828 return err;
1829}
1830
1831/*
1832 * After a recovering MDS goes active, we need to resend any caps
1833 * we were flushing.
1834 *
1835 * Caller holds session->s_mutex.
1836 */
1837static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1838 struct ceph_mds_session *session)
1839{
1840 struct ceph_cap_snap *capsnap;
1841
1842 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1843 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1844 flushing_item) {
1845 struct ceph_inode_info *ci = capsnap->ci;
1846 struct inode *inode = &ci->vfs_inode;
1847 struct ceph_cap *cap;
1848
1849 spin_lock(&inode->i_lock);
1850 cap = ci->i_auth_cap;
1851 if (cap && cap->session == session) {
1852 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1853 cap, capsnap);
1854 __ceph_flush_snaps(ci, &session);
1855 } else {
1856 pr_err("%p auth cap %p not mds%d ???\n", inode,
1857 cap, session->s_mds);
1858 spin_unlock(&inode->i_lock);
1859 }
1860 }
1861}
1862
1863void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1864 struct ceph_mds_session *session)
1865{
1866 struct ceph_inode_info *ci;
1867
1868 kick_flushing_capsnaps(mdsc, session);
1869
1870 dout("kick_flushing_caps mds%d\n", session->s_mds);
1871 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1872 struct inode *inode = &ci->vfs_inode;
1873 struct ceph_cap *cap;
1874 int delayed = 0;
1875
1876 spin_lock(&inode->i_lock);
1877 cap = ci->i_auth_cap;
1878 if (cap && cap->session == session) {
1879 dout("kick_flushing_caps %p cap %p %s\n", inode,
1880 cap, ceph_cap_string(ci->i_flushing_caps));
1881 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1882 __ceph_caps_used(ci),
1883 __ceph_caps_wanted(ci),
1884 cap->issued | cap->implemented,
1885 ci->i_flushing_caps, NULL);
1886 if (delayed) {
1887 spin_lock(&inode->i_lock);
1888 __cap_delay_requeue(mdsc, ci);
1889 spin_unlock(&inode->i_lock);
1890 }
1891 } else {
1892 pr_err("%p auth cap %p not mds%d ???\n", inode,
1893 cap, session->s_mds);
1894 spin_unlock(&inode->i_lock);
1895 }
1896 }
1897}
1898
1899
1900/*
1901 * Take references to capabilities we hold, so that we don't release
1902 * them to the MDS prematurely.
1903 *
1904 * Protected by i_lock.
1905 */
1906static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1907{
1908 if (got & CEPH_CAP_PIN)
1909 ci->i_pin_ref++;
1910 if (got & CEPH_CAP_FILE_RD)
1911 ci->i_rd_ref++;
1912 if (got & CEPH_CAP_FILE_CACHE)
1913 ci->i_rdcache_ref++;
1914 if (got & CEPH_CAP_FILE_WR)
1915 ci->i_wr_ref++;
1916 if (got & CEPH_CAP_FILE_BUFFER) {
1917 if (ci->i_wrbuffer_ref == 0)
1918 igrab(&ci->vfs_inode);
1919 ci->i_wrbuffer_ref++;
1920 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1921 &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1922 }
1923}
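/*
 * The first FILE_BUFFER ref takes an inode reference (igrab) so the
 * inode cannot go away while dirty pages reference it; the matching
 * iput() is done in ceph_put_cap_refs() when the last buffer ref is
 * dropped.
 */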
1924
1925/*
1926 * Try to grab cap references. Specify those refs we @want, and the
1927 * minimal set we @need. Also include the larger offset we are writing
1928 * to (when applicable), and check against max_size here as well.
1929 * Note that caller is responsible for ensuring max_size increases are
1930 * requested from the MDS.
1931 */
1932static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
1933 int *got, loff_t endoff, int *check_max, int *err)
1934{
1935 struct inode *inode = &ci->vfs_inode;
1936 int ret = 0;
1937 int have, implemented;
1938 int file_wanted;
1939
1940 dout("get_cap_refs %p need %s want %s\n", inode,
1941 ceph_cap_string(need), ceph_cap_string(want));
1942 spin_lock(&inode->i_lock);
1943
1944 /* make sure file is actually open */
1945 file_wanted = __ceph_caps_file_wanted(ci);
1946 if ((file_wanted & need) == 0) {
1947 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
1948 ceph_cap_string(need), ceph_cap_string(file_wanted));
1949 *err = -EBADF;
1950 ret = 1;
1951 goto out;
1952 }
1953
1954 if (need & CEPH_CAP_FILE_WR) {
1955 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
1956 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
1957 inode, endoff, ci->i_max_size);
1958 if (endoff > ci->i_wanted_max_size) {
1959 *check_max = 1;
1960 ret = 1;
1961 }
1962 goto out;
1963 }
1964 /*
1965 * If a sync write is in progress, we must wait, so that we
1966 * can get a final snapshot value for size+mtime.
1967 */
1968 if (__ceph_have_pending_cap_snap(ci)) {
1969 dout("get_cap_refs %p cap_snap_pending\n", inode);
1970 goto out;
1971 }
1972 }
1973 have = __ceph_caps_issued(ci, &implemented);
1974
1975 /*
1976 * disallow writes while a truncate is pending
1977 */
1978 if (ci->i_truncate_pending)
1979 have &= ~CEPH_CAP_FILE_WR;
1980
1981 if ((have & need) == need) {
1982 /*
1983 * Look at (implemented & ~have & not) so that we keep waiting
1984 * on transition from wanted -> needed caps. This is needed
1985 * for WRBUFFER|WR -> WR, to prevent a new WR sync write from
1986 * proceeding before a prior buffered writeback completes.
1987 */
1988 int not = want & ~(have & need);
1989 int revoking = implemented & ~have;
1990 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
1991 inode, ceph_cap_string(have), ceph_cap_string(not),
1992 ceph_cap_string(revoking));
1993 if ((revoking & not) == 0) {
1994 *got = need | (have & want);
1995 __take_cap_refs(ci, *got);
1996 ret = 1;
1997 }
1998 } else {
1999 dout("get_cap_refs %p have %s needed %s\n", inode,
2000 ceph_cap_string(have), ceph_cap_string(need));
2001 }
2002out:
2003 spin_unlock(&inode->i_lock);
2004 dout("get_cap_refs %p ret %d got %s\n", inode,
2005 ret, ceph_cap_string(*got));
2006 return ret;
2007}
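/*
 * try_get_cap_refs() is shaped as a wait_event_interruptible()
 * condition: any nonzero return means "stop waiting", and the caller
 * then tells success from failure via *err (e.g. -EBADF) and
 * *check_max (max_size must be grown first).
 */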
2008
2009/*
2010 * Check the offset we are writing up to against our current
2011 * max_size. If necessary, tell the MDS we want to write to
2012 * a larger offset.
2013 */
2014static void check_max_size(struct inode *inode, loff_t endoff)
2015{
2016 struct ceph_inode_info *ci = ceph_inode(inode);
2017 int check = 0;
2018
2019 /* do we need to explicitly request a larger max_size? */
2020 spin_lock(&inode->i_lock);
2021 if ((endoff >= ci->i_max_size ||
2022 endoff > (inode->i_size << 1)) &&
2023 endoff > ci->i_wanted_max_size) {
2024 dout("write %p at large endoff %llu, req max_size\n",
2025 inode, endoff);
2026 ci->i_wanted_max_size = endoff;
2027 check = 1;
2028 }
2029 spin_unlock(&inode->i_lock);
2030 if (check)
2031 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2032}
2033
2034/*
2035 * Wait for caps, and take cap references. If we can't get a WR cap
2036 * due to a small max_size, make sure we check_max_size (and possibly
2037 * ask the mds) so we don't get hung up indefinitely.
2038 */
2039int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2040 loff_t endoff)
2041{
2042 int check_max, ret, err;
2043
2044retry:
2045 if (endoff > 0)
2046 check_max_size(&ci->vfs_inode, endoff);
2047 check_max = 0;
2048 err = 0;
2049 ret = wait_event_interruptible(ci->i_cap_wq,
2050 try_get_cap_refs(ci, need, want,
2051 got, endoff,
2052 &check_max, &err));
2053 if (err)
2054 ret = err;
2055 if (check_max)
2056 goto retry;
2057 return ret;
2058}
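/*
 * Hypothetical caller, for illustration only (the real users are the
 * read/write paths, e.g. fs/ceph/file.c):
 *
 *	int got = 0, err;
 *
 *	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
 *			    &got, endoff);
 *	if (err < 0)
 *		return err;
 *	... write, buffered if (got & CEPH_CAP_FILE_BUFFER) is set ...
 *	ceph_put_cap_refs(ci, got);
 */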
2059
2060/*
2061 * Take cap refs. Caller must already know we hold at least one ref
2062 * on the caps in question or we don't know this is safe.
2063 */
2064void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2065{
2066 spin_lock(&ci->vfs_inode.i_lock);
2067 __take_cap_refs(ci, caps);
2068 spin_unlock(&ci->vfs_inode.i_lock);
2069}
2070
2071/*
2072 * Release cap refs.
2073 *
2074 * If we released the last ref on any given cap, call ceph_check_caps
2075 * to release (or schedule a release).
2076 *
2077 * If we are releasing a WR cap (from a sync write), finalize any affected
2078 * cap_snap, and wake up any waiters.
2079 */
2080void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2081{
2082 struct inode *inode = &ci->vfs_inode;
2083 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2084 struct ceph_cap_snap *capsnap;
2085
2086 spin_lock(&inode->i_lock);
2087 if (had & CEPH_CAP_PIN)
2088 --ci->i_pin_ref;
2089 if (had & CEPH_CAP_FILE_RD)
2090 if (--ci->i_rd_ref == 0)
2091 last++;
2092 if (had & CEPH_CAP_FILE_CACHE)
2093 if (--ci->i_rdcache_ref == 0)
2094 last++;
2095 if (had & CEPH_CAP_FILE_BUFFER) {
2096 if (--ci->i_wrbuffer_ref == 0) {
2097 last++;
2098 put++;
2099 }
2100 dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
2101 inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
2102 }
2103 if (had & CEPH_CAP_FILE_WR)
2104 if (--ci->i_wr_ref == 0) {
2105 last++;
2106 if (!list_empty(&ci->i_cap_snaps)) {
2107 capsnap = list_first_entry(&ci->i_cap_snaps,
2108 struct ceph_cap_snap,
2109 ci_item);
2110 if (capsnap->writing) {
2111 capsnap->writing = 0;
2112 flushsnaps =
2113 __ceph_finish_cap_snap(ci,
2114 capsnap);
2115 wake = 1;
2116 }
2117 }
2118 }
2119 spin_unlock(&inode->i_lock);
2120
2121 dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
2122 last ? "last" : "");
2123
2124 if (last && !flushsnaps)
2125 ceph_check_caps(ci, 0, NULL);
2126 else if (flushsnaps)
2127 ceph_flush_snaps(ci);
2128 if (wake)
2129 wake_up(&ci->i_cap_wq);
2130 if (put)
2131 iput(inode);
2132}
2133
2134/*
2135 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2136 * context. Adjust per-snap dirty page accounting as appropriate.
2137 * Once all dirty data for a cap_snap is flushed, flush snapped file
2138 * metadata back to the MDS. If we dropped the last ref, call
2139 * ceph_check_caps.
2140 */
2141void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2142 struct ceph_snap_context *snapc)
2143{
2144 struct inode *inode = &ci->vfs_inode;
2145 int last = 0;
2146 int last_snap = 0;
2147 int found = 0;
2148 struct ceph_cap_snap *capsnap = NULL;
2149
2150 spin_lock(&inode->i_lock);
2151 ci->i_wrbuffer_ref -= nr;
2152 last = !ci->i_wrbuffer_ref;
2153
2154 if (ci->i_head_snapc == snapc) {
2155 ci->i_wrbuffer_ref_head -= nr;
2156 if (!ci->i_wrbuffer_ref_head) {
2157 ceph_put_snap_context(ci->i_head_snapc);
2158 ci->i_head_snapc = NULL;
2159 }
2160 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2161 inode,
2162 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2163 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2164 last ? " LAST" : "");
2165 } else {
2166 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2167 if (capsnap->context == snapc) {
2168 found = 1;
2169 capsnap->dirty_pages -= nr;
2170 last_snap = !capsnap->dirty_pages;
2171 break;
2172 }
2173 }
2174 BUG_ON(!found);
2175 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2176 " snap %lld %d/%d -> %d/%d %s%s\n",
2177 inode, capsnap, capsnap->context->seq,
2178 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2179 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2180 last ? " (wrbuffer last)" : "",
2181 last_snap ? " (capsnap last)" : "");
2182 }
2183
2184 spin_unlock(&inode->i_lock);
2185
2186 if (last) {
2187 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2188 iput(inode);
2189 } else if (last_snap) {
2190 ceph_flush_snaps(ci);
2191 wake_up(&ci->i_cap_wq);
2192 }
2193}
2194
2195/*
2196 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2197 * actually be a revocation if it specifies a smaller cap set.)
2198 *
2199 * caller holds s_mutex and i_lock, we drop both.
2200 *
2201 * check_caps disposition (the function itself returns void):
2202 * 0 - ok
2203 * 1 - check_caps on auth cap only (writeback)
2204 * 2 - check_caps (ack revoke)
2205 */
2206static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2207 struct ceph_mds_session *session,
2208 struct ceph_cap *cap,
2209 struct ceph_buffer *xattr_buf)
2210 __releases(inode->i_lock)
2211 __releases(session->s_mutex)
2212{
2213 struct ceph_inode_info *ci = ceph_inode(inode);
2214 int mds = session->s_mds;
2215 int seq = le32_to_cpu(grant->seq);
2216 int newcaps = le32_to_cpu(grant->caps);
2217 int issued, implemented, used, wanted, dirty;
2218 u64 size = le64_to_cpu(grant->size);
2219 u64 max_size = le64_to_cpu(grant->max_size);
2220 struct timespec mtime, atime, ctime;
2221 int check_caps = 0;
2222 int wake = 0;
2223 int writeback = 0;
2224 int revoked_rdcache = 0;
2225 int queue_invalidate = 0;
2226
2227 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2228 inode, cap, mds, seq, ceph_cap_string(newcaps));
2229 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2230 inode->i_size);
2231
2232 /*
2233 * If CACHE is being revoked, and we have no dirty buffers,
2234 * try to invalidate (once). (If there are dirty buffers, we
2235 * will invalidate _after_ writeback.)
2236 */
2237 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2238 !ci->i_wrbuffer_ref) {
2239 if (try_nonblocking_invalidate(inode) == 0) {
2240 revoked_rdcache = 1;
2241 } else {
2242 /* there were locked pages.. invalidate later
2243 in a separate thread. */
2244 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2245 queue_invalidate = 1;
2246 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2247 }
2248 }
2249 }
2250
2251 /* side effects now are allowed */
2252
2253 issued = __ceph_caps_issued(ci, &implemented);
2254 issued |= implemented | __ceph_caps_dirty(ci);
2255
2256 cap->cap_gen = session->s_cap_gen;
2257
2258 __check_cap_issue(ci, cap, newcaps);
2259
2260 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2261 inode->i_mode = le32_to_cpu(grant->mode);
2262 inode->i_uid = le32_to_cpu(grant->uid);
2263 inode->i_gid = le32_to_cpu(grant->gid);
2264 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2265 inode->i_uid, inode->i_gid);
2266 }
2267
2268 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2269 inode->i_nlink = le32_to_cpu(grant->nlink);
2270
2271 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2272 int len = le32_to_cpu(grant->xattr_len);
2273 u64 version = le64_to_cpu(grant->xattr_version);
2274
2275 if (version > ci->i_xattrs.version) {
2276 dout(" got new xattrs v%llu on %p len %d\n",
2277 version, inode, len);
2278 if (ci->i_xattrs.blob)
2279 ceph_buffer_put(ci->i_xattrs.blob);
2280 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2281 ci->i_xattrs.version = version;
2282 }
2283 }
2284
2285 /* size/ctime/mtime/atime? */
2286 ceph_fill_file_size(inode, issued,
2287 le32_to_cpu(grant->truncate_seq),
2288 le64_to_cpu(grant->truncate_size), size);
2289 ceph_decode_timespec(&mtime, &grant->mtime);
2290 ceph_decode_timespec(&atime, &grant->atime);
2291 ceph_decode_timespec(&ctime, &grant->ctime);
2292 ceph_fill_file_time(inode, issued,
2293 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2294 &atime);
2295
2296 /* max size increase? */
2297 if (max_size != ci->i_max_size) {
2298 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2299 ci->i_max_size = max_size;
2300 if (max_size >= ci->i_wanted_max_size) {
2301 ci->i_wanted_max_size = 0; /* reset */
2302 ci->i_requested_max_size = 0;
2303 }
2304 wake = 1;
2305 }
2306
2307 /* check cap bits */
2308 wanted = __ceph_caps_wanted(ci);
2309 used = __ceph_caps_used(ci);
2310 dirty = __ceph_caps_dirty(ci);
2311 dout(" my wanted = %s, used = %s, dirty %s\n",
2312 ceph_cap_string(wanted),
2313 ceph_cap_string(used),
2314 ceph_cap_string(dirty));
2315 if (wanted != le32_to_cpu(grant->wanted)) {
2316 dout("mds wanted %s -> %s\n",
2317 ceph_cap_string(le32_to_cpu(grant->wanted)),
2318 ceph_cap_string(wanted));
2319 grant->wanted = cpu_to_le32(wanted);
2320 }
2321
2322 cap->seq = seq;
2323
2324 /* file layout may have changed */
2325 ci->i_layout = grant->layout;
2326
2327 /* revocation, grant, or no-op? */
2328 if (cap->issued & ~newcaps) {
2329 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
2330 ceph_cap_string(newcaps));
2331 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
2332 writeback = 1; /* will delay ack */
2333 else if (dirty & ~newcaps)
2334 check_caps = 1; /* initiate writeback in check_caps */
2335 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
2336 revoked_rdcache)
2337 check_caps = 2; /* send revoke ack in check_caps */
2338 cap->issued = newcaps;
2339 cap->implemented |= newcaps;
2340 } else if (cap->issued == newcaps) {
2341 dout("caps unchanged: %s -> %s\n",
2342 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2343 } else {
2344 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2345 ceph_cap_string(newcaps));
2346 cap->issued = newcaps;
2347 cap->implemented |= newcaps; /* add bits only, to
2348 * avoid stepping on a
2349 * pending revocation */
2350 wake = 1;
2351 }
2352 BUG_ON(cap->issued & ~cap->implemented);
2353
2354 spin_unlock(&inode->i_lock);
2355 if (writeback)
2356 /*
2357 * queue inode for writeback: we can't actually call
2358 * filemap_write_and_wait, etc. from message handler
2359 * context.
2360 */
2361 ceph_queue_writeback(inode);
2362 if (queue_invalidate)
2363 ceph_queue_invalidate(inode);
2364 if (wake)
2365 wake_up(&ci->i_cap_wq);
2366
2367 if (check_caps == 1)
2368 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2369 session);
2370 else if (check_caps == 2)
2371 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
2372 else
2373 mutex_unlock(&session->s_mutex);
2374}
2375
2376/*
2377 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2378 * MDS has been safely committed.
2379 */
2380static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2381 struct ceph_mds_caps *m,
2382 struct ceph_mds_session *session,
2383 struct ceph_cap *cap)
2384 __releases(inode->i_lock)
2385{
2386 struct ceph_inode_info *ci = ceph_inode(inode);
2387 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
2388 unsigned seq = le32_to_cpu(m->seq);
2389 int dirty = le32_to_cpu(m->dirty);
2390 int cleaned = 0;
2391 int drop = 0;
2392 int i;
2393
2394 for (i = 0; i < CEPH_CAP_BITS; i++)
2395 if ((dirty & (1 << i)) &&
2396 flush_tid == ci->i_cap_flush_tid[i])
2397 cleaned |= 1 << i;
2398
2399 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2400 " flushing %s -> %s\n",
2401 inode, session->s_mds, seq, ceph_cap_string(dirty),
2402 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2403 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2404
2405 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2406 goto out;
2407
2408 ci->i_flushing_caps &= ~cleaned;
2409
2410 spin_lock(&mdsc->cap_dirty_lock);
2411 if (ci->i_flushing_caps == 0) {
2412 list_del_init(&ci->i_flushing_item);
2413 if (!list_empty(&session->s_cap_flushing))
2414 dout(" mds%d still flushing cap on %p\n",
2415 session->s_mds,
2416 &list_entry(session->s_cap_flushing.next,
2417 struct ceph_inode_info,
2418 i_flushing_item)->vfs_inode);
2419 mdsc->num_cap_flushing--;
2420 wake_up(&mdsc->cap_flushing_wq);
2421 dout(" inode %p now !flushing\n", inode);
2422
2423 if (ci->i_dirty_caps == 0) {
2424 dout(" inode %p now clean\n", inode);
2425 BUG_ON(!list_empty(&ci->i_dirty_item));
2426 drop = 1;
2427 } else {
2428 BUG_ON(list_empty(&ci->i_dirty_item));
2429 }
2430 }
2431 spin_unlock(&mdsc->cap_dirty_lock);
2432 wake_up(&ci->i_cap_wq);
2433
2434out:
2435 spin_unlock(&inode->i_lock);
2436 if (drop)
2437 iput(inode);
2438}
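
The ack handler above clears only the dirty bits whose recorded flush tid matches the acked tid, so a cap re-flushed under a newer tid stays marked dirty. A minimal userspace sketch of that per-bit matching (hypothetical names, plain C, not the kernel types):

#include <stdint.h>
#include <stdio.h>

#define CAP_BITS 16

/* an ack for (dirty, tid) cleans exactly the bits whose recorded
 * flush tid matches; a bit re-flushed under a newer tid is kept */
static unsigned acked_clean(unsigned dirty, uint64_t tid,
			    const uint64_t flush_tid[CAP_BITS])
{
	unsigned cleaned = 0;
	int i;

	for (i = 0; i < CAP_BITS; i++)
		if ((dirty & (1u << i)) && flush_tid[i] == tid)
			cleaned |= 1u << i;
	return cleaned;
}

int main(void)
{
	uint64_t tids[CAP_BITS] = { [0] = 7, [1] = 7, [2] = 9 };

	/* ack for tid 7 covers bits 0-2, but bit 2 was re-flushed as tid 9 */
	printf("cleaned %#x\n", acked_clean(0x7, 7, tids));	/* prints 0x3 */
	return 0;
}
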
2439
2440/*
2441 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2442 * throw away our cap_snap.
2443 *
2444 * Caller holds s_mutex.
2445 */
2446static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2447 struct ceph_mds_caps *m,
2448 struct ceph_mds_session *session)
2449{
2450 struct ceph_inode_info *ci = ceph_inode(inode);
2451 u64 follows = le64_to_cpu(m->snap_follows);
2452 struct ceph_cap_snap *capsnap;
2453 int drop = 0;
2454
2455 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2456 inode, ci, session->s_mds, follows);
2457
2458 spin_lock(&inode->i_lock);
2459 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2460 if (capsnap->follows == follows) {
2461 if (capsnap->flush_tid != flush_tid) {
2462 dout(" cap_snap %p follows %lld tid %lld !="
2463 " %lld\n", capsnap, follows,
2464 flush_tid, capsnap->flush_tid);
2465 break;
2466 }
2467 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2468 dout(" removing cap_snap %p follows %lld\n",
2469 capsnap, follows);
2470 ceph_put_snap_context(capsnap->context);
2471 list_del(&capsnap->ci_item);
2472 list_del(&capsnap->flushing_item);
2473 ceph_put_cap_snap(capsnap);
2474 drop = 1;
2475 break;
2476 } else {
2477 dout(" skipping cap_snap %p follows %lld\n",
2478 capsnap, capsnap->follows);
2479 }
2480 }
2481 spin_unlock(&inode->i_lock);
2482 if (drop)
2483 iput(inode);
2484}
2485
2486/*
2487 * Handle TRUNC from MDS, indicating file truncation.
2488 *
2489 * caller holds s_mutex.
2490 */
2491static void handle_cap_trunc(struct inode *inode,
2492 struct ceph_mds_caps *trunc,
2493 struct ceph_mds_session *session)
2494 __releases(inode->i_lock)
2495{
2496 struct ceph_inode_info *ci = ceph_inode(inode);
2497 int mds = session->s_mds;
2498 int seq = le32_to_cpu(trunc->seq);
2499 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2500 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2501 u64 size = le64_to_cpu(trunc->size);
2502 int implemented = 0;
2503 int dirty = __ceph_caps_dirty(ci);
2504 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2505 int queue_trunc = 0;
2506
2507 issued |= implemented | dirty;
2508
2509 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2510 inode, mds, seq, truncate_size, truncate_seq);
2511 queue_trunc = ceph_fill_file_size(inode, issued,
2512 truncate_seq, truncate_size, size);
2513 spin_unlock(&inode->i_lock);
2514
2515 if (queue_trunc)
2516 ceph_queue_vmtruncate(inode);
2517}
2518
2519/*
2520 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2521 * different one. If we are the most recent migration we've seen (as
2522 * indicated by mseq), make note of the migrating cap bits for the
2523 * duration (until we see the corresponding IMPORT).
2524 *
2525 * caller holds s_mutex
2526 */
2527static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2528 struct ceph_mds_session *session)
2529{
2530 struct ceph_inode_info *ci = ceph_inode(inode);
2531 int mds = session->s_mds;
2532 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2533 struct ceph_cap *cap = NULL, *t;
2534 struct rb_node *p;
2535 int remember = 1;
2536
2537 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2538 inode, ci, mds, mseq);
2539
2540 spin_lock(&inode->i_lock);
2541
2542 /* make sure we haven't seen a higher mseq */
2543 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2544 t = rb_entry(p, struct ceph_cap, ci_node);
2545 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2546 dout(" higher mseq on cap from mds%d\n",
2547 t->session->s_mds);
2548 remember = 0;
2549 }
2550 if (t->session->s_mds == mds)
2551 cap = t;
2552 }
2553
2554 if (cap) {
2555 if (remember) {
2556 /* make note */
2557 ci->i_cap_exporting_mds = mds;
2558 ci->i_cap_exporting_mseq = mseq;
2559 ci->i_cap_exporting_issued = cap->issued;
2560 }
2561 __ceph_remove_cap(cap);
2562 }
2563 /* else, we already released it */
2564
2565 spin_unlock(&inode->i_lock);
2566}
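
handle_cap_export keeps the migration note only if no other cap carries a higher mseq, and ceph_seq_cmp makes that comparison robust to 32-bit sequence wraparound. Assuming the usual signed-difference definition of the comparator, a standalone illustration:

#include <assert.h>
#include <stdint.h>

/* order two sequence numbers modulo 2^32: positive iff a is newer */
static int seq_cmp(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b);
}

int main(void)
{
	assert(seq_cmp(5, 3) > 0);		/* plain case */
	assert(seq_cmp(2, 0xfffffffe) > 0);	/* 2 is newer across the wrap */
	assert(seq_cmp(0xfffffffe, 2) < 0);
	return 0;
}
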
2567
2568/*
2569 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2570 * clean them up.
2571 *
2572 * caller holds s_mutex.
2573 */
2574static void handle_cap_import(struct ceph_mds_client *mdsc,
2575 struct inode *inode, struct ceph_mds_caps *im,
2576 struct ceph_mds_session *session,
2577 void *snaptrace, int snaptrace_len)
2578{
2579 struct ceph_inode_info *ci = ceph_inode(inode);
2580 int mds = session->s_mds;
2581 unsigned issued = le32_to_cpu(im->caps);
2582 unsigned wanted = le32_to_cpu(im->wanted);
2583 unsigned seq = le32_to_cpu(im->seq);
2584 unsigned mseq = le32_to_cpu(im->migrate_seq);
2585 u64 realmino = le64_to_cpu(im->realm);
2586 u64 cap_id = le64_to_cpu(im->cap_id);
2587
2588 if (ci->i_cap_exporting_mds >= 0 &&
2589 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2590 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2591 " - cleared exporting from mds%d\n",
2592 inode, ci, mds, mseq,
2593 ci->i_cap_exporting_mds);
2594 ci->i_cap_exporting_issued = 0;
2595 ci->i_cap_exporting_mseq = 0;
2596 ci->i_cap_exporting_mds = -1;
2597 } else {
2598 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2599 inode, ci, mds, mseq);
2600 }
2601
2602 down_write(&mdsc->snap_rwsem);
2603 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2604 false);
2605 downgrade_write(&mdsc->snap_rwsem);
2606 ceph_add_cap(inode, session, cap_id, -1,
2607 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2608 NULL /* no caps context */);
2609 try_flush_caps(inode, session, NULL);
2610 up_read(&mdsc->snap_rwsem);
2611}
2612
2613/*
2614 * Handle a caps message from the MDS.
2615 *
2616 * Identify the appropriate session, inode, and call the right handler
2617 * based on the cap op.
2618 */
2619void ceph_handle_caps(struct ceph_mds_session *session,
2620 struct ceph_msg *msg)
2621{
2622 struct ceph_mds_client *mdsc = session->s_mdsc;
2623 struct super_block *sb = mdsc->client->sb;
2624 struct inode *inode;
2625 struct ceph_cap *cap;
2626 struct ceph_mds_caps *h;
2627 int mds = session->s_mds;
2628 int op;
2629 u32 seq;
2630 struct ceph_vino vino;
2631 u64 cap_id;
2632 u64 size, max_size;
2633 u64 tid;
2634 void *snaptrace;
2635
2636 dout("handle_caps from mds%d\n", mds);
2637
2638 /* decode */
2639 tid = le64_to_cpu(msg->hdr.tid);
2640 if (msg->front.iov_len < sizeof(*h))
2641 goto bad;
2642 h = msg->front.iov_base;
2643 snaptrace = h + 1;
2644 op = le32_to_cpu(h->op);
2645 vino.ino = le64_to_cpu(h->ino);
2646 vino.snap = CEPH_NOSNAP;
2647 cap_id = le64_to_cpu(h->cap_id);
2648 seq = le32_to_cpu(h->seq);
2649 size = le64_to_cpu(h->size);
2650 max_size = le64_to_cpu(h->max_size);
2651
2652 mutex_lock(&session->s_mutex);
2653 session->s_seq++;
2654 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2655 (unsigned)seq);
2656
2657 /* lookup ino */
2658 inode = ceph_find_inode(sb, vino);
2659 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2660 vino.snap, inode);
2661 if (!inode) {
2662 dout(" i don't have ino %llx\n", vino.ino);
2663 goto done;
2664 }
2665
2666 /* these will work even if we don't have a cap yet */
2667 switch (op) {
2668 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2669 handle_cap_flushsnap_ack(inode, tid, h, session);
2670 goto done;
2671
2672 case CEPH_CAP_OP_EXPORT:
2673 handle_cap_export(inode, h, session);
2674 goto done;
2675
2676 case CEPH_CAP_OP_IMPORT:
2677 handle_cap_import(mdsc, inode, h, session,
2678 snaptrace, le32_to_cpu(h->snap_trace_len));
2679 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2680 session);
2681 goto done_unlocked;
2682 }
2683
2684 /* the rest require a cap */
2685 spin_lock(&inode->i_lock);
2686 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2687 if (!cap) {
2688 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
2689 inode, ceph_ino(inode), ceph_snap(inode), mds);
2690 spin_unlock(&inode->i_lock);
2691 goto done;
2692 }
2693
2694 /* note that each of these drops i_lock for us */
2695 switch (op) {
2696 case CEPH_CAP_OP_REVOKE:
2697 case CEPH_CAP_OP_GRANT:
2698 handle_cap_grant(inode, h, session, cap, msg->middle);
2699 goto done_unlocked;
2700
2701 case CEPH_CAP_OP_FLUSH_ACK:
2702 handle_cap_flush_ack(inode, tid, h, session, cap);
2703 break;
2704
2705 case CEPH_CAP_OP_TRUNC:
2706 handle_cap_trunc(inode, h, session);
2707 break;
2708
2709 default:
2710 spin_unlock(&inode->i_lock);
2711 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2712 ceph_cap_op_name(op));
2713 }
2714
2715done:
2716 mutex_unlock(&session->s_mutex);
2717done_unlocked:
2718 if (inode)
2719 iput(inode);
2720 return;
2721
2722bad:
2723 pr_err("ceph_handle_caps: corrupt message\n");
2724 ceph_msg_dump(msg);
2725 return;
2726}
2727
2728/*
2729 * Delayed work handler to process end of delayed cap release LRU list.
2730 */
2731void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2732{
2733 struct ceph_inode_info *ci;
2734 int flags = CHECK_CAPS_NODELAY;
2735
2736 dout("check_delayed_caps\n");
2737 while (1) {
2738 spin_lock(&mdsc->cap_delay_lock);
2739 if (list_empty(&mdsc->cap_delay_list))
2740 break;
2741 ci = list_first_entry(&mdsc->cap_delay_list,
2742 struct ceph_inode_info,
2743 i_cap_delay_list);
2744 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2745 time_before(jiffies, ci->i_hold_caps_max))
2746 break;
2747 list_del_init(&ci->i_cap_delay_list);
2748 spin_unlock(&mdsc->cap_delay_lock);
2749 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2750 ceph_check_caps(ci, flags, NULL);
2751 }
2752 spin_unlock(&mdsc->cap_delay_lock);
2753}
2754
2755/*
2756 * Flush all dirty caps to the mds
2757 */
2758void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2759{
2760 struct ceph_inode_info *ci, *nci = NULL;
2761 struct inode *inode, *ninode = NULL;
2762 struct list_head *p, *n;
2763
2764 dout("flush_dirty_caps\n");
2765 spin_lock(&mdsc->cap_dirty_lock);
2766 list_for_each_safe(p, n, &mdsc->cap_dirty) {
2767 if (nci) {
2768 ci = nci;
2769 inode = ninode;
2770 ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
2771 dout("flush_dirty_caps inode %p (was next inode)\n",
2772 inode);
2773 } else {
2774 ci = list_entry(p, struct ceph_inode_info,
2775 i_dirty_item);
2776 inode = igrab(&ci->vfs_inode);
2777 BUG_ON(!inode);
2778 dout("flush_dirty_caps inode %p\n", inode);
2779 }
2780 if (n != &mdsc->cap_dirty) {
2781 nci = list_entry(n, struct ceph_inode_info,
2782 i_dirty_item);
2783 ninode = igrab(&nci->vfs_inode);
2784 BUG_ON(!ninode);
2785 nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2786 dout("flush_dirty_caps next inode %p, noflush\n",
2787 ninode);
2788 } else {
2789 nci = NULL;
2790 ninode = NULL;
2791 }
2792 spin_unlock(&mdsc->cap_dirty_lock);
2793 if (inode) {
2794 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
2795 NULL);
2796 iput(inode);
2797 }
2798 spin_lock(&mdsc->cap_dirty_lock);
2799 }
2800 spin_unlock(&mdsc->cap_dirty_lock);
2801}
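
Because each check_caps call must run with cap_dirty_lock dropped, the loop above pins the next list entry (igrab plus CEPH_I_NOFLUSH) before unlocking, so that entry can neither be freed nor flushed out from under the walk. A generic userspace sketch of that pin-ahead pattern, with hypothetical types standing in for the inode machinery:

#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; int refs; };

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;

static void work(struct node *n) { (void)n; /* e.g. flush state; may sleep */ }

static void put(struct node *n)		/* drop a pinned reference */
{
	if (--n->refs == 0)
		free(n);
}

static void walk(struct node *head)
{
	struct node *n = head, *next;

	pthread_mutex_lock(&lk);
	if (n)
		n->refs++;			/* pin the first element */
	while (n) {
		next = n->next;
		if (next)
			next->refs++;		/* pin successor before unlocking */
		pthread_mutex_unlock(&lk);
		work(n);			/* list may change meanwhile */
		pthread_mutex_lock(&lk);
		put(n);				/* unpin the processed element */
		n = next;
	}
	pthread_mutex_unlock(&lk);
}

int main(void)
{
	struct node *b = calloc(1, sizeof(*b));
	struct node *a = calloc(1, sizeof(*a));

	a->next = b;
	a->refs = b->refs = 1;			/* the list's own references */
	walk(a);
	put(b);					/* tear down */
	put(a);
	return 0;
}
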
2802
2803/*
2804 * Drop open file reference. If we were the last open file,
2805 * we may need to release capabilities to the MDS (or schedule
2806 * their delayed release).
2807 */
2808void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2809{
2810 struct inode *inode = &ci->vfs_inode;
2811 int last = 0;
2812
2813 spin_lock(&inode->i_lock);
2814 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2815 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2816 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2817 if (--ci->i_nr_by_mode[fmode] == 0)
2818 last++;
2819 spin_unlock(&inode->i_lock);
2820
2821 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2822 ceph_check_caps(ci, 0, NULL);
2823}
2824
2825/*
2826 * Helpers for embedding cap and dentry lease releases into mds
2827 * requests.
2828 *
2829 * @force is used by dentry_release (below) to force inclusion of a
2830 * record for the directory inode, even when there aren't any caps to
2831 * drop.
2832 */
2833int ceph_encode_inode_release(void **p, struct inode *inode,
2834 int mds, int drop, int unless, int force)
2835{
2836 struct ceph_inode_info *ci = ceph_inode(inode);
2837 struct ceph_cap *cap;
2838 struct ceph_mds_request_release *rel = *p;
2839 int ret = 0;
2840 int used = 0;
2841
2842 spin_lock(&inode->i_lock);
2843 used = __ceph_caps_used(ci);
2844
2845 dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
2846 mds, ceph_cap_string(used), ceph_cap_string(drop),
2847 ceph_cap_string(unless));
2848
2849 /* only drop unused caps */
2850 drop &= ~used;
2851
2852 cap = __get_cap_for_mds(ci, mds);
2853 if (cap && __cap_is_valid(cap)) {
2854 if (force ||
2855 ((cap->issued & drop) &&
2856 (cap->issued & unless) == 0)) {
2857 if ((cap->issued & drop) &&
2858 (cap->issued & unless) == 0) {
2859 dout("encode_inode_release %p cap %p %s -> "
2860 "%s\n", inode, cap,
2861 ceph_cap_string(cap->issued),
2862 ceph_cap_string(cap->issued & ~drop));
2863 cap->issued &= ~drop;
2864 cap->implemented &= ~drop;
2865 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2866 int wanted = __ceph_caps_wanted(ci);
2867 dout(" wanted %s -> %s (act %s)\n",
2868 ceph_cap_string(cap->mds_wanted),
2869 ceph_cap_string(cap->mds_wanted &
2870 ~wanted),
2871 ceph_cap_string(wanted));
2872 cap->mds_wanted &= wanted;
2873 }
2874 } else {
2875 dout("encode_inode_release %p cap %p %s"
2876 " (force)\n", inode, cap,
2877 ceph_cap_string(cap->issued));
2878 }
2879
2880 rel->ino = cpu_to_le64(ceph_ino(inode));
2881 rel->cap_id = cpu_to_le64(cap->cap_id);
2882 rel->seq = cpu_to_le32(cap->seq);
2883 rel->issue_seq = cpu_to_le32(cap->issue_seq);
2884 rel->mseq = cpu_to_le32(cap->mseq);
2885 rel->caps = cpu_to_le32(cap->issued);
2886 rel->wanted = cpu_to_le32(cap->mds_wanted);
2887 rel->dname_len = 0;
2888 rel->dname_seq = 0;
2889 *p += sizeof(*rel);
2890 ret = 1;
2891 } else {
2892 dout("encode_inode_release %p cap %p %s\n",
2893 inode, cap, ceph_cap_string(cap->issued));
2894 }
2895 }
2896 spin_unlock(&inode->i_lock);
2897 return ret;
2898}
2899
2900int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2901 int mds, int drop, int unless)
2902{
2903 struct inode *dir = dentry->d_parent->d_inode;
2904 struct ceph_mds_request_release *rel = *p;
2905 struct ceph_dentry_info *di = ceph_dentry(dentry);
2906 int force = 0;
2907 int ret;
2908
2909 /*
2910 * force a record for the directory caps if we have a dentry lease.
2911 * this is racy (can't take i_lock and d_lock together), but it
2912 * doesn't have to be perfect; the mds will revoke anything we don't
2913 * release.
2914 */
2915 spin_lock(&dentry->d_lock);
2916 if (di->lease_session && di->lease_session->s_mds == mds)
2917 force = 1;
2918 spin_unlock(&dentry->d_lock);
2919
2920 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
2921
2922 spin_lock(&dentry->d_lock);
2923 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
2924 dout("encode_dentry_release %p mds%d seq %d\n",
2925 dentry, mds, (int)di->lease_seq);
2926 rel->dname_len = cpu_to_le32(dentry->d_name.len);
2927 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2928 *p += dentry->d_name.len;
2929 rel->dname_seq = cpu_to_le32(di->lease_seq);
2930 }
2931 spin_unlock(&dentry->d_lock);
2932 return ret;
2933}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
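
A userspace analogue of the "faux printk" fallback above: the if (0) compiles the call away while the format string and arguments are still type-checked (##__VA_ARGS__ is the GCC/clang extension the kernel relies on):

#include <stdio.h>

#define dout(fmt, ...) do { if (0) printf(fmt, ##__VA_ARGS__); } while (0)

int main(void)
{
	int err = -5;

	dout("would log err %d\n", err);  /* no output; %d vs. int still checked */
	return 0;
}
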
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
1/*
2 * Ceph 'frag' type
3 */
4#include "types.h"
5
6int ceph_frag_compare(__u32 a, __u32 b)
7{
8 unsigned va = ceph_frag_value(a);
9 unsigned vb = ceph_frag_value(b);
10 if (va < vb)
11 return -1;
12 if (va > vb)
13 return 1;
14 va = ceph_frag_bits(a);
15 vb = ceph_frag_bits(b);
16 if (va < vb)
17 return -1;
18 if (va > vb)
19 return 1;
20 return 0;
21}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef _FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
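
A quick standalone check of the 8+24-bit encoding described at the top of this header (uint32_t standing in for __u32; only ceph_frag_make is reproduced):

#include <assert.h>
#include <stdint.h>

static uint32_t frag_make(uint32_t b, uint32_t v)
{
	return (b << 24) | (v & (0xffffffu << (24 - b)) & 0xffffffu);
}

int main(void)
{
	uint32_t left  = frag_make(1, 0);		/* half with value bit 23 = 0 */
	uint32_t right = frag_make(1, 0x800000);	/* half with value bit 23 = 1 */

	assert((left >> 24) == 1);			/* ceph_frag_bits */
	assert((right & 0xffffffu) == 0x800000);	/* ceph_frag_value */
	/* frag_contains_value: 0x9abcde matches "right" under mask 0x800000 */
	assert((0x9abcdeu & 0x800000u) == (right & 0xffffffu));
	return 0;
}
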
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
28
29
30int ceph_flags_to_mode(int flags)
31{
32#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN;
35#endif
36#ifdef O_LAZY
37 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY;
39#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42
43 flags &= O_ACCMODE;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49}
50
51int ceph_caps_for_mode(int mode)
52{
53 switch (mode) {
54 case CEPH_FILE_MODE_PIN:
55 return CEPH_CAP_PIN;
56 case CEPH_FILE_MODE_RD:
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR:
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 }
73 return 0;
74}
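
The append-implies-write and O_ACCMODE masking in ceph_flags_to_mode can be mirrored standalone; a sketch with simplified constants (the enum values happen to line up with CEPH_FILE_MODE_PIN/RD/WR/RDWR):

#include <fcntl.h>
#include <stdio.h>

enum { MODE_PIN, MODE_RD, MODE_WR, MODE_RDWR };

static int flags_to_mode(int flags)
{
	if (flags & O_APPEND)
		flags |= O_WRONLY;		/* append implies write intent */
	flags &= O_ACCMODE;
	if ((flags & O_RDWR) == O_RDWR)
		return MODE_RDWR;
	if ((flags & O_WRONLY) == O_WRONLY)
		return MODE_WR;
	return MODE_RD;
}

int main(void)
{
	printf("%d %d %d\n",
	       flags_to_mode(O_RDONLY),			/* 1 (RD)   */
	       flags_to_mode(O_WRONLY | O_APPEND),	/* 2 (WR)   */
	       flags_to_mode(O_RDWR));			/* 3 (RDWR) */
	return 0;
}
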
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..0c2241ef3653
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific message types or high-level
34 * protocols change, bump the affected components. we rev
35 * internal cluster protocols separately from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_SUPPORTED 0
57#define CEPH_FEATURE_REQUIRED 0
58
59
60/*
61 * ceph_file_layout - describe data layout for a file/inode
62 */
63struct ceph_file_layout {
64 /* file -> object mapping */
65 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
66 of page size. */
67 __le32 fl_stripe_count; /* over this many objects */
68 __le32 fl_object_size; /* until objects are this big, then move to
69 new objects */
70 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
71
72 /* pg -> disk layout */
73 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
74
75 /* object -> pg layout */
76 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
77 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
78} __attribute__ ((packed));
79
80#define CEPH_MIN_STRIPE_UNIT 65536
81
82int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
83
84
85/* crypto algorithms */
86#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1
88
89/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2
93
94
95/*********************************************
96 * message layer
97 */
98
99/*
100 * message types
101 */
102
103/* misc */
104#define CEPH_MSG_SHUTDOWN 1
105#define CEPH_MSG_PING 2
106
107/* client <-> monitor */
108#define CEPH_MSG_MON_MAP 4
109#define CEPH_MSG_MON_GET_MAP 5
110#define CEPH_MSG_STATFS 13
111#define CEPH_MSG_STATFS_REPLY 14
112#define CEPH_MSG_MON_SUBSCRIBE 15
113#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
114#define CEPH_MSG_AUTH 17
115#define CEPH_MSG_AUTH_REPLY 18
116
117/* client <-> mds */
118#define CEPH_MSG_MDS_MAP 21
119
120#define CEPH_MSG_CLIENT_SESSION 22
121#define CEPH_MSG_CLIENT_RECONNECT 23
122
123#define CEPH_MSG_CLIENT_REQUEST 24
124#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
125#define CEPH_MSG_CLIENT_REPLY 26
126#define CEPH_MSG_CLIENT_CAPS 0x310
127#define CEPH_MSG_CLIENT_LEASE 0x311
128#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130
131/* osd */
132#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43
135
136struct ceph_mon_request_header {
137 __le64 have_version;
138 __le16 session_mon;
139 __le64 session_mon_tid;
140} __attribute__ ((packed));
141
142struct ceph_mon_statfs {
143 struct ceph_mon_request_header monhdr;
144 struct ceph_fsid fsid;
145} __attribute__ ((packed));
146
147struct ceph_statfs {
148 __le64 kb, kb_used, kb_avail;
149 __le64 num_objects;
150} __attribute__ ((packed));
151
152struct ceph_mon_statfs_reply {
153 struct ceph_fsid fsid;
154 __le64 version;
155 struct ceph_statfs st;
156} __attribute__ ((packed));
157
158struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid;
161 __le32 start;
162} __attribute__ ((packed));
163
164struct ceph_mds_getmap {
165 struct ceph_mon_request_header monhdr;
166 struct ceph_fsid fsid;
167} __attribute__ ((packed));
168
169struct ceph_client_mount {
170 struct ceph_mon_request_header monhdr;
171} __attribute__ ((packed));
172
173struct ceph_mon_subscribe_item {
174 __le64 have_version; __le64 have;
175 __u8 onetime;
176} __attribute__ ((packed));
177
178struct ceph_mon_subscribe_ack {
179 __le32 duration; /* seconds */
180 struct ceph_fsid fsid;
181} __attribute__ ((packed));
182
183/*
184 * mds states
185 * > 0 -> in
186 * <= 0 -> out
187 */
188#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
189#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
190 empty log. */
191#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
192#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
193#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
194#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
195#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
196
197#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
198#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
199 operations (import, rename, etc.) */
200#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
201#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
202#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
203#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
204#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
205
206extern const char *ceph_mds_state_name(int s);
207
208
209/*
210 * metadata lock types.
211 * - these are bitmasks.. we can compose them
212 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds
214 */
215#define CEPH_LOCK_DN 1
216#define CEPH_LOCK_ISNAP 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */
218#define CEPH_LOCK_IFILE 8 /* mds internal */
219#define CEPH_LOCK_IAUTH 32
220#define CEPH_LOCK_ILINK 64
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */
222#define CEPH_LOCK_INEST 256 /* mds internal */
223#define CEPH_LOCK_IXATTR 512
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
225
226/* client_session ops */
227enum {
228 CEPH_SESSION_REQUEST_OPEN,
229 CEPH_SESSION_OPEN,
230 CEPH_SESSION_REQUEST_CLOSE,
231 CEPH_SESSION_CLOSE,
232 CEPH_SESSION_REQUEST_RENEWCAPS,
233 CEPH_SESSION_RENEWCAPS,
234 CEPH_SESSION_STALE,
235 CEPH_SESSION_RECALL_STATE,
236};
237
238extern const char *ceph_session_op_name(int op);
239
240struct ceph_mds_session_head {
241 __le32 op;
242 __le64 seq;
243 struct ceph_timespec stamp;
244 __le32 max_caps, max_leases;
245} __attribute__ ((packed));
246
247/* client_request */
248/*
249 * metadata ops.
250 * & 0x001000 -> write op
251 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
252 * & 0x100000 -> use weird ino/path trace
253 */
254#define CEPH_MDS_OP_WRITE 0x001000
255enum {
256 CEPH_MDS_OP_LOOKUP = 0x00100,
257 CEPH_MDS_OP_GETATTR = 0x00101,
258 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
259 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
260
261 CEPH_MDS_OP_SETXATTR = 0x01105,
262 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108,
265
266 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202,
268 CEPH_MDS_OP_UNLINK = 0x01203,
269 CEPH_MDS_OP_RENAME = 0x01204,
270 CEPH_MDS_OP_MKDIR = 0x01220,
271 CEPH_MDS_OP_RMDIR = 0x01221,
272 CEPH_MDS_OP_SYMLINK = 0x01222,
273
274 CEPH_MDS_OP_CREATE = 0x01301,
275 CEPH_MDS_OP_OPEN = 0x00302,
276 CEPH_MDS_OP_READDIR = 0x00305,
277
278 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
279 CEPH_MDS_OP_MKSNAP = 0x01400,
280 CEPH_MDS_OP_RMSNAP = 0x01401,
281 CEPH_MDS_OP_LSSNAP = 0x00402,
282};
283
284extern const char *ceph_mds_op_name(int op);
285
286
287#define CEPH_SETATTR_MODE 1
288#define CEPH_SETATTR_UID 2
289#define CEPH_SETATTR_GID 4
290#define CEPH_SETATTR_MTIME 8
291#define CEPH_SETATTR_ATIME 16
292#define CEPH_SETATTR_SIZE 32
293#define CEPH_SETATTR_CTIME 64
294
295union ceph_mds_request_args {
296 struct {
297 __le32 mask; /* CEPH_CAP_* */
298 } __attribute__ ((packed)) getattr;
299 struct {
300 __le32 mode;
301 __le32 uid;
302 __le32 gid;
303 struct ceph_timespec mtime;
304 struct ceph_timespec atime;
305 __le64 size, old_size; /* old_size needed by truncate */
306 __le32 mask; /* CEPH_SETATTR_* */
307 } __attribute__ ((packed)) setattr;
308 struct {
309 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */
311 } __attribute__ ((packed)) readdir;
312 struct {
313 __le32 mode;
314 __le32 rdev;
315 } __attribute__ ((packed)) mknod;
316 struct {
317 __le32 mode;
318 } __attribute__ ((packed)) mkdir;
319 struct {
320 __le32 flags;
321 __le32 mode;
322 __le32 stripe_unit; /* layout for newly created file */
323 __le32 stripe_count; /* ... */
324 __le32 object_size;
325 __le32 file_replication;
326 __le32 preferred;
327 } __attribute__ ((packed)) open;
328 struct {
329 __le32 flags;
330 } __attribute__ ((packed)) setxattr;
331 struct {
332 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout;
334} __attribute__ ((packed));
335
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
337#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
338
339struct ceph_mds_request_head {
340 __le64 oldest_client_tid;
341 __le32 mdsmap_epoch; /* on client */
342 __le32 flags; /* CEPH_MDS_FLAG_* */
343 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
344 __le16 num_releases; /* # include cap/lease release records */
345 __le32 op; /* mds op code */
346 __le32 caller_uid, caller_gid;
347 __le64 ino; /* use this ino for openc, mkdir, mknod,
348 etc. (if replaying) */
349 union ceph_mds_request_args args;
350} __attribute__ ((packed));
351
352/* cap/lease release record */
353struct ceph_mds_request_release {
354 __le64 ino, cap_id; /* ino and unique cap id */
355 __le32 caps, wanted; /* new issued, wanted */
356 __le32 seq, issue_seq, mseq;
357 __le32 dname_seq; /* if releasing a dentry lease, a */
358 __le32 dname_len; /* string follows. */
359} __attribute__ ((packed));
360
361/* client reply */
362struct ceph_mds_reply_head {
363 __le32 op;
364 __le32 result;
365 __le32 mdsmap_epoch;
366 __u8 safe; /* true if committed to disk */
367 __u8 is_dentry, is_target; /* true if dentry, target inode records
368 are included with reply */
369} __attribute__ ((packed));
370
371/* one for each node split */
372struct ceph_frag_tree_split {
373 __le32 frag; /* this frag splits... */
374 __le32 by; /* ...by this many bits */
375} __attribute__ ((packed));
376
377struct ceph_frag_tree_head {
378 __le32 nsplits; /* num ceph_frag_tree_split records */
379 struct ceph_frag_tree_split splits[];
380} __attribute__ ((packed));
381
382/* capability issue, for bundling with mds reply */
383struct ceph_mds_reply_cap {
384 __le32 caps, wanted; /* caps issued, wanted */
385 __le64 cap_id;
386 __le32 seq, mseq;
387 __le64 realm; /* snap realm */
388 __u8 flags; /* CEPH_CAP_FLAG_* */
389} __attribute__ ((packed));
390
391#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
392
393/* inode record, for bundling with mds reply */
394struct ceph_mds_reply_inode {
395 __le64 ino;
396 __le64 snapid;
397 __le32 rdev;
398 __le64 version; /* inode version */
399 __le64 xattr_version; /* version for xattr blob */
400 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
401 struct ceph_file_layout layout;
402 struct ceph_timespec ctime, mtime, atime;
403 __le32 time_warp_seq;
404 __le64 size, max_size, truncate_size;
405 __le32 truncate_seq;
406 __le32 mode, uid, gid;
407 __le32 nlink;
408 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
409 struct ceph_timespec rctime;
410 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
411} __attribute__ ((packed));
412/* followed by frag array, then symlink string, then xattr blob */
413
414/* reply_lease follows dname, and reply_inode */
415struct ceph_mds_reply_lease {
416 __le16 mask; /* lease type(s) */
417 __le32 duration_ms; /* lease duration */
418 __le32 seq;
419} __attribute__ ((packed));
420
421struct ceph_mds_reply_dirfrag {
422 __le32 frag; /* fragment */
423 __le32 auth; /* auth mds, if this is a delegation point */
424 __le32 ndist; /* number of mds' this is replicated on */
425 __le32 dist[];
426} __attribute__ ((packed));
427
428/* file access modes */
429#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1
431#define CEPH_FILE_MODE_WR 2
432#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
433#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
434#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
435
436int ceph_flags_to_mode(int flags);
437
438
439/* capability bits */
440#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
441
442/* generic cap bits */
443#define CEPH_CAP_GSHARED 1 /* client can read */
444#define CEPH_CAP_GEXCL 2 /* client can read and update */
445#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
446#define CEPH_CAP_GRD 8 /* (file) client can read */
447#define CEPH_CAP_GWR 16 /* (file) client can write */
448#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
449#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
450#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
451
452/* per-lock shift */
453#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
457
458#define CEPH_CAP_BITS 16
459
460/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
462#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
463#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
464#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
465#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
466#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
467#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
468#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
469#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
470#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
471#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
472#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
476
477/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
479#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
480#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
481#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
482#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
483#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
484#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
485#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
486#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
487#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
488#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
489#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
490#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
491 CEPH_CAP_AUTH_SHARED | \
492 CEPH_CAP_LINK_SHARED | \
493 CEPH_CAP_FILE_SHARED | \
494 CEPH_CAP_XATTR_SHARED)
495
496#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
497 CEPH_CAP_LINK_SHARED | \
498 CEPH_CAP_XATTR_SHARED | \
499 CEPH_CAP_FILE_SHARED)
500#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
501 CEPH_CAP_FILE_CACHE)
502
503#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
504 CEPH_CAP_LINK_EXCL | \
505 CEPH_CAP_XATTR_EXCL | \
506 CEPH_CAP_FILE_EXCL)
507#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
508 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
512
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR)
515
516int ceph_caps_for_mode(int mode);
517
518enum {
519 CEPH_CAP_OP_GRANT, /* mds->client grant */
520 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
521 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
522 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
523 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
524 CEPH_CAP_OP_UPDATE, /* client->mds update */
525 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
526 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
527 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
528 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
529 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
530 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
531 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
532};
533
534extern const char *ceph_cap_op_name(int op);
535
536/*
537 * caps message, used for capability callbacks, acks, requests, etc.
538 */
539struct ceph_mds_caps {
540 __le32 op; /* CEPH_CAP_OP_* */
541 __le64 ino, realm;
542 __le64 cap_id;
543 __le32 seq, issue_seq;
544 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
545 __le32 migrate_seq;
546 __le64 snap_follows;
547 __le32 snap_trace_len;
548
549 /* authlock */
550 __le32 uid, gid, mode;
551
552 /* linklock */
553 __le32 nlink;
554
555 /* xattrlock */
556 __le32 xattr_len;
557 __le64 xattr_version;
558
559 /* filelock */
560 __le64 size, max_size, truncate_size;
561 __le32 truncate_seq;
562 struct ceph_timespec mtime, atime, ctime;
563 struct ceph_file_layout layout;
564 __le32 time_warp_seq;
565} __attribute__ ((packed));
566
567/* cap release msg head */
568struct ceph_mds_cap_release {
569 __le32 num; /* number of cap_items that follow */
570} __attribute__ ((packed));
571
572struct ceph_mds_cap_item {
573 __le64 ino;
574 __le64 cap_id;
575 __le32 migrate_seq, seq;
576} __attribute__ ((packed));
577
578#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
579#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
580#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
581#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
582
583extern const char *ceph_lease_op_name(int o);
584
585/* lease msg header */
586struct ceph_mds_lease {
587 __u8 action; /* CEPH_MDS_LEASE_* */
588 __le16 mask; /* which lease */
589 __le64 ino;
590 __le64 first, last; /* snap range */
591 __le32 seq;
592 __le32 duration_ms; /* duration of renewal */
593} __attribute__ ((packed));
594/* followed by a __le32+string for dname */
595
596/* client reconnect */
597struct ceph_mds_cap_reconnect {
598 __le64 cap_id;
599 __le32 wanted;
600 __le32 issued;
601 __le64 size;
602 struct ceph_timespec mtime, atime;
603 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed));
606/* followed by encoded string */
607
608struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */
610 __le64 seq; /* snap seq for this snap realm */
611 __le64 parent; /* parent realm */
612} __attribute__ ((packed));
613
614/*
615 * snaps
616 */
617enum {
618 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
619 CEPH_SNAP_OP_CREATE,
620 CEPH_SNAP_OP_DESTROY,
621 CEPH_SNAP_OP_SPLIT,
622};
623
624extern const char *ceph_snap_op_name(int o);
625
626/* snap msg header */
627struct ceph_mds_snap_head {
628 __le32 op; /* CEPH_SNAP_OP_* */
629 __le64 split; /* ino to split off, if any */
630 __le32 num_split_inos; /* # inos belonging to new child realm */
631 __le32 num_split_realms; /* # child realms under new child realm */
632 __le32 trace_len; /* size of snap trace blob */
633} __attribute__ ((packed));
634/* followed by split ino list, then split realms, then the trace blob */
635
636/*
637 * encode info about a snaprealm, as viewed by a client
638 */
639struct ceph_mds_snap_realm {
640 __le64 ino; /* ino */
641 __le64 created; /* snap: when created */
642 __le64 parent; /* ino: parent realm */
643 __le64 parent_since; /* snap: same parent since */
644 __le64 seq; /* snap: version */
645 __le32 num_snaps;
646 __le32 num_prior_parent_snaps;
647} __attribute__ ((packed));
648/* followed by my snap list, then prior parent snap list */
649
650#endif
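
The composed cap words above are just the generic bits shifted by the per-lock offsets, so a cap set decodes field by field. A small demonstration using the shifts as defined above (SAUTH=2, SLINK=4, SXATTR=6, SFILE=8):

#include <stdio.h>

int main(void)
{
	/* FILE_SHARED|FILE_RD|FILE_CACHE|AUTH_SHARED, built from the shifts */
	unsigned caps = (1u << 8) | (8u << 8) | (4u << 8) | (1u << 2);

	printf("auth %x link %x xattr %x file %x\n",
	       (caps >> 2) & 3,		/* 1: GSHARED */
	       (caps >> 4) & 3,		/* 0 */
	       (caps >> 6) & 3,		/* 0 */
	       (caps >> 8) & 0xff);	/* 0xd: GSHARED|GCACHE|GRD */
	return 0;
}
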
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkin's hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
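
For reference, the dcache-style hash above is easy to exercise standalone; a minimal check (the three-byte input stays well under 32 bits, so the result is the same for 32- and 64-bit unsigned long):

#include <stdio.h>

static unsigned str_hash_linux(const char *str, unsigned length)
{
	unsigned long hash = 0;
	unsigned char c;

	while (length--) {
		c = *str++;
		hash = (hash + (c << 4) + (c >> 4)) * 11;
	}
	return hash;
}

int main(void)
{
	printf("%u\n", str_hash_linux("foo", 3));	/* prints 2415402 */
	return 0;
}
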
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68
69const char *ceph_mds_state_name(int s)
70{
71 switch (s) {
72 /* down and out */
73 case CEPH_MDS_STATE_DNE: return "down:dne";
74 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
75 /* up and out */
76 case CEPH_MDS_STATE_BOOT: return "up:boot";
77 case CEPH_MDS_STATE_STANDBY: return "up:standby";
78 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
79 case CEPH_MDS_STATE_CREATING: return "up:creating";
80 case CEPH_MDS_STATE_STARTING: return "up:starting";
81 /* up and in */
82 case CEPH_MDS_STATE_REPLAY: return "up:replay";
83 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
84 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
85 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
86 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
87 case CEPH_MDS_STATE_ACTIVE: return "up:active";
88 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
89 }
90 return "???";
91}
92
93const char *ceph_session_op_name(int op)
94{
95 switch (op) {
96 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
97 case CEPH_SESSION_OPEN: return "open";
98 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
99 case CEPH_SESSION_CLOSE: return "close";
100 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
101 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
102 case CEPH_SESSION_STALE: return "stale";
103 case CEPH_SESSION_RECALL_STATE: return "recall_state";
104 }
105 return "???";
106}
107
108const char *ceph_mds_op_name(int op)
109{
110 switch (op) {
111 case CEPH_MDS_OP_LOOKUP: return "lookup";
112 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
113 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
114 case CEPH_MDS_OP_GETATTR: return "getattr";
115 case CEPH_MDS_OP_SETXATTR: return "setxattr";
116 case CEPH_MDS_OP_SETATTR: return "setattr";
117 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
118 case CEPH_MDS_OP_READDIR: return "readdir";
119 case CEPH_MDS_OP_MKNOD: return "mknod";
120 case CEPH_MDS_OP_LINK: return "link";
121 case CEPH_MDS_OP_UNLINK: return "unlink";
122 case CEPH_MDS_OP_RENAME: return "rename";
123 case CEPH_MDS_OP_MKDIR: return "mkdir";
124 case CEPH_MDS_OP_RMDIR: return "rmdir";
125 case CEPH_MDS_OP_SYMLINK: return "symlink";
126 case CEPH_MDS_OP_CREATE: return "create";
127 case CEPH_MDS_OP_OPEN: return "open";
128 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
129 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
132 }
133 return "???";
134}
135
136const char *ceph_cap_op_name(int op)
137{
138 switch (op) {
139 case CEPH_CAP_OP_GRANT: return "grant";
140 case CEPH_CAP_OP_REVOKE: return "revoke";
141 case CEPH_CAP_OP_TRUNC: return "trunc";
142 case CEPH_CAP_OP_EXPORT: return "export";
143 case CEPH_CAP_OP_IMPORT: return "import";
144 case CEPH_CAP_OP_UPDATE: return "update";
145 case CEPH_CAP_OP_DROP: return "drop";
146 case CEPH_CAP_OP_FLUSH: return "flush";
147 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
148 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
149 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
150 case CEPH_CAP_OP_RELEASE: return "release";
151 case CEPH_CAP_OP_RENEW: return "renew";
152 }
153 return "???";
154}
155
156const char *ceph_lease_op_name(int o)
157{
158 switch (o) {
159 case CEPH_MDS_LEASE_REVOKE: return "revoke";
160 case CEPH_MDS_LEASE_RELEASE: return "release";
161 case CEPH_MDS_LEASE_RENEW: return "renew";
162 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
163 }
164 return "???";
165}
166
167const char *ceph_snap_op_name(int o)
168{
169 switch (o) {
170 case CEPH_SNAP_OP_UPDATE: return "update";
171 case CEPH_SNAP_OP_CREATE: return "create";
172 case CEPH_SNAP_OP_DESTROY: return "destroy";
173 case CEPH_SNAP_OP_SPLIT: return "split";
174 }
175 return "???";
176}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
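
A note on the id convention crush_calc_parents() relies on: devices get
non-negative ids and buckets strictly negative ids, so bucket id b lives at
array index -1-b. A tiny standalone sketch of that mapping (the helper name
is ours):

#include <assert.h>
#include <stdio.h>

/* buckets are numbered -1, -2, ...; bucket id b is stored at slot -1-b */
static int bucket_index(int id)
{
	assert(id < 0);
	return -1 - id;
}

int main(void)
{
	printf("bucket -1 -> slot %d\n", bucket_index(-1)); /* 0 */
	printf("bucket -8 -> slot %d\n", bucket_index(-8)); /* 7 */
	return 0;
}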
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of a sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers to identify the parent bucket of a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
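
The weights throughout these structures are 16-bit fixed point, i.e. 0x10000
represents 1.0 (the debugfs code later in this series prints osd weights as
(w*100) >> 16 percent). A minimal conversion sketch, with helper names of our
own choosing:

#include <stdio.h>

typedef unsigned int u32;

static u32 to_fixed(double w)   { return (u32)(w * 0x10000); }
static double from_fixed(u32 f) { return (double)f / 0x10000; }

int main(void)
{
	printf("1.0    -> 0x%x\n", to_fixed(1.0));      /* 0x10000 */
	printf("0.5    -> 0x%x\n", to_fixed(0.5));      /* 0x8000  */
	printf("0x4000 -> %.2f\n", from_fixed(0x4000)); /* 0.25    */
	return 0;
}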
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
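
Since the mixer and its callers above have no kernel dependencies, they can be
checked in userspace; this sketch copies crush_hashmix and the two-input
variant verbatim (only the main() harness is ours):

#include <stdio.h>
#include <stdint.h>

#define crush_hashmix(a, b, c) do {		\
	a = a-b; a = a-c; a = a^(c>>13);	\
	b = b-c; b = b-a; b = b^(a<<8);		\
	c = c-a; c = c-b; c = c^(b>>13);	\
	a = a-b; a = a-c; a = a^(c>>12);	\
	b = b-c; b = b-a; b = b^(a<<16);	\
	c = c-a; c = c-b; c = c^(b>>5);		\
	a = a-b; a = a-c; a = a^(c>>3);		\
	b = b-c; b = b-a; b = b^(a<<10);	\
	c = c-a; c = c-b; c = c^(b>>15);	\
} while (0)

#define crush_hash_seed 1315423911

static uint32_t hash32_2(uint32_t a, uint32_t b)
{
	uint32_t hash = crush_hash_seed ^ a ^ b;
	uint32_t x = 231232;
	uint32_t y = 1232;

	crush_hashmix(a, b, hash);
	crush_hashmix(x, a, hash);
	crush_hashmix(b, y, hash);
	return hash;
}

int main(void)
{
	/* the same inputs always produce the same draw */
	printf("hash32_2(1, 2) = %u\n", hash32_2(1, 2));
	return 0;
}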
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..9ba54efb6543
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" (failed, fully offloaded)
263 * of the cluster
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x1000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
309
310 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */
312 ftotal = 0;
313 skip_rep = 0;
314 do {
315 retry_descent = 0;
316 in = bucket; /* initial bucket */
317
318 /* choose through intervening buckets */
319 flocal = 0;
320 do {
321 collide = 0;
322 retry_bucket = 0;
323 r = rep;
324 if (in->alg == CRUSH_BUCKET_UNIFORM) {
325 /* be careful */
326 if (firstn || numrep >= in->size)
327 /* r' = r + f_total */
328 r += ftotal;
329 else if (in->size % numrep == 0)
330 /* r'=r+(n+1)*f_local */
331 r += (numrep+1) *
332 (flocal+ftotal);
333 else
334 /* r' = r + n*f_local */
335 r += numrep * (flocal+ftotal);
336 } else {
337 if (firstn)
338 /* r' = r + f_total */
339 r += ftotal;
340 else
341 /* r' = r + n*f_local */
342 r += numrep * (flocal+ftotal);
343 }
344
345 /* bucket choose */
346 if (in->size == 0) {
347 reject = 1;
348 goto reject;
349 }
350 if (flocal >= (in->size>>1) &&
351 flocal > orig_tries)
352 item = bucket_perm_choose(in, x, r);
353 else
354 item = crush_bucket_choose(in, x, r);
355 BUG_ON(item >= map->max_devices);
356
357 /* desired type? */
358 if (item < 0)
359 itemtype = map->buckets[-1-item]->type;
360 else
361 itemtype = 0;
362 dprintk(" item %d type %d\n", item, itemtype);
363
364 /* keep going? */
365 if (itemtype != type) {
366 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item];
369 continue;
370 }
371
372 /* collision? */
373 for (i = 0; i < outpos; i++) {
374 if (out[i] == item) {
375 collide = 1;
376 break;
377 }
378 }
379
380 if (recurse_to_leaf &&
381 item < 0 &&
382 crush_choose(map, map->buckets[-1-item],
383 weight,
384 x, outpos+1, 0,
385 out2, outpos,
386 firstn, 0, NULL) <= outpos) {
387 reject = 1;
388 } else {
389 /* out? */
390 if (itemtype == 0)
391 reject = is_out(map, weight,
392 item, x);
393 else
394 reject = 0;
395 }
396
397reject:
398 if (reject || collide) {
399 ftotal++;
400 flocal++;
401
402 if (collide && flocal < 3)
403 /* retry locally a few times */
404 retry_bucket = 1;
405 else if (flocal < in->size + orig_tries)
406 /* exhaustive bucket search */
407 retry_bucket = 1;
408 else if (ftotal < 20)
409 /* then retry descent */
410 retry_descent = 1;
411 else
412 /* else give up */
413 skip_rep = 1;
414 dprintk(" reject %d collide %d "
415 "ftotal %d flocal %d\n",
416 reject, collide, ftotal,
417 flocal);
418 }
419 } while (retry_bucket);
420 } while (retry_descent);
421
422 if (skip_rep) {
423 dprintk("skip rep\n");
424 continue;
425 }
426
427 dprintk("choose got %d\n", item);
428 out[outpos] = item;
429 outpos++;
430 }
431
432 dprintk("choose returns %d\n", outpos);
433 return outpos;
434}
435
436
437/**
438 * crush_do_rule - calculate a mapping with the given input and rule
439 * @map: the crush_map
440 * @ruleno: the rule id
441 * @x: hash input
442 * @result: pointer to result vector
443 * @result_max: maximum result size
444 * @force: force initial replica choice; -1 for none
445 */
446int crush_do_rule(struct crush_map *map,
447 int ruleno, int x, int *result, int result_max,
448 int force, __u32 *weight)
449{
450 int result_len;
451 int force_context[CRUSH_MAX_DEPTH];
452 int force_pos = -1;
453 int a[CRUSH_MAX_SET];
454 int b[CRUSH_MAX_SET];
455 int c[CRUSH_MAX_SET];
456 int recurse_to_leaf;
457 int *w;
458 int wsize = 0;
459 int *o;
460 int osize;
461 int *tmp;
462 struct crush_rule *rule;
463 int step;
464 int i, j;
465 int numrep;
466 int firstn;
467 int rc = -1;
468
469 BUG_ON(ruleno >= map->max_rules);
470
471 rule = map->rules[ruleno];
472 result_len = 0;
473 w = a;
474 o = b;
475
476 /*
477 * determine hierarchical context of force, if any. note
478 * that this may or may not correspond to the specific types
479 * referenced by the crush rule.
480 */
481 if (force >= 0) {
482 if (force >= map->max_devices ||
483 map->device_parents[force] == 0) {
484 /*dprintk("CRUSH: forcefed device dne\n");*/
485 rc = -1; /* force fed device dne */
486 goto out;
487 }
488 if (!is_out(map, weight, force, x)) {
489 while (1) {
490 force_context[++force_pos] = force;
491 if (force >= 0)
492 force = map->device_parents[force];
493 else
494 force = map->bucket_parents[-1-force];
495 if (force == 0)
496 break;
497 }
498 }
499 }
500
501 for (step = 0; step < rule->len; step++) {
502 firstn = 0;
503 switch (rule->steps[step].op) {
504 case CRUSH_RULE_TAKE:
505 w[0] = rule->steps[step].arg1;
506 if (force_pos >= 0) {
507 BUG_ON(force_context[force_pos] != w[0]);
508 force_pos--;
509 }
510 wsize = 1;
511 break;
512
513 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
514 case CRUSH_RULE_CHOOSE_FIRSTN:
515 firstn = 1;
516 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
517 case CRUSH_RULE_CHOOSE_INDEP:
518 BUG_ON(wsize == 0);
519
520 recurse_to_leaf =
521 rule->steps[step].op ==
522 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
523 rule->steps[step].op ==
524 CRUSH_RULE_CHOOSE_LEAF_INDEP;
525
526 /* reset output */
527 osize = 0;
528
529 for (i = 0; i < wsize; i++) {
530 /*
531 * see CRUSH_CHOOSE_N, CRUSH_CHOOSE_N_MINUS macros.
532 * basically, numrep <= 0 means relative to
533 * the provided result_max
534 */
535 numrep = rule->steps[step].arg1;
536 if (numrep <= 0) {
537 numrep += result_max;
538 if (numrep <= 0)
539 continue;
540 }
541 j = 0;
542 if (osize == 0 && force_pos >= 0) {
543 /* skip any intermediate types */
544 while (force_pos &&
545 force_context[force_pos] < 0 &&
546 rule->steps[step].arg2 !=
547 map->buckets[-1 -
548 force_context[force_pos]]->type)
549 force_pos--;
550 o[osize] = force_context[force_pos];
551 if (recurse_to_leaf)
552 c[osize] = force_context[0];
553 j++;
554 force_pos--;
555 }
556 osize += crush_choose(map,
557 map->buckets[-1-w[i]],
558 weight,
559 x, numrep,
560 rule->steps[step].arg2,
561 o+osize, j,
562 firstn,
563 recurse_to_leaf, c+osize);
564 }
565
566 if (recurse_to_leaf)
567 /* copy final _leaf_ values to output set */
568 memcpy(o, c, osize*sizeof(*o));
569
570 /* swap t and w arrays */
571 tmp = o;
572 o = w;
573 w = tmp;
574 wsize = osize;
575 break;
576
577
578 case CRUSH_RULE_EMIT:
579 for (i = 0; i < wsize && result_len < result_max; i++) {
580 result[result_len] = w[i];
581 result_len++;
582 }
583 wsize = 0;
584 break;
585
586 default:
587 BUG_ON(1);
588 }
589 }
590 rc = result_len;
591
592out:
593 return rc;
594}
595
596
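
A way to see the straw bucket's stability: bucket_straw_choose() above is an
argmax over independent per-item draws, so adding or reweighting one item only
moves inputs whose winning draw changes. A standalone sketch of the selection,
using a stand-in mixer of our own instead of crush_hash32_3():

#include <stdio.h>
#include <stdint.h>

/* stand-in for crush_hash32_3(); any decent mixing function works here */
static uint32_t toy_hash(uint32_t x, uint32_t item, uint32_t r)
{
	uint32_t h = 1315423911u ^ x ^ (item * 2654435761u) ^ (r * 40503u);

	h ^= h >> 13;
	h *= 2654435761u;
	h ^= h >> 16;
	return h;
}

/* pick the item with the highest weighted draw, as in bucket_straw_choose() */
static int straw_choose(const int *items, const uint32_t *straws, int size,
			int x, int r)
{
	uint64_t high_draw = 0, draw;
	int i, high = 0;

	for (i = 0; i < size; i++) {
		draw = toy_hash(x, items[i], r) & 0xffff;
		draw *= straws[i];
		if (i == 0 || draw > high_draw) {
			high = i;
			high_draw = draw;
		}
	}
	return items[high];
}

int main(void)
{
	int items[] = { 0, 1, 2 };
	uint32_t straws[] = { 0x10000, 0x10000, 0x8000 }; /* 16-bit fixed point */
	int x;

	for (x = 0; x < 5; x++)
		printf("x=%d -> osd%d\n", x, straw_choose(items, straws, 3, x, 0));
	return 0;
}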
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
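
Tying the two entry points together, a caller first resolves a rule id and
then executes it. A hypothetical wrapper (the helper name and error
convention are ours; the map, weights, and the ruleset/type/size triple would
come from the osdmap):

/* hypothetical wrapper: map input x to its OSDs under one pool's rule */
static int map_input(struct crush_map *map, int ruleset, int type, int size,
		     int x, __u32 *osd_weights, int *osds)
{
	int ruleno = crush_find_rule(map, ruleset, type, size);

	if (ruleno < 0)
		return -1;		/* no rule matches this pool */
	return crush_do_rule(map, ruleno, x, osds, size,
			     -1 /* no forcefed device */, osd_weights);
}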
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..f704b3b62424
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,409 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78const u8 *aes_iv = "cephsageyudagreg";
79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
81 const void *src, size_t src_len)
82{
83 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
85 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
86 int ret;
87 void *iv;
88 int ivsize;
89 size_t zero_padding = (0x10 - (src_len & 0x0f));
90 char pad[16];
91
92 if (IS_ERR(tfm))
93 return PTR_ERR(tfm);
94
95 memset(pad, zero_padding, zero_padding);
96
97 *dst_len = src_len + zero_padding;
98
99 crypto_blkcipher_setkey((void *)tfm, key, key_len);
100 sg_init_table(sg_in, 2);
101 sg_set_buf(&sg_in[0], src, src_len);
102 sg_set_buf(&sg_in[1], pad, zero_padding);
103 sg_init_table(sg_out, 1);
104 sg_set_buf(sg_out, dst, *dst_len);
105 iv = crypto_blkcipher_crt(tfm)->iv;
106 ivsize = crypto_blkcipher_ivsize(tfm);
107
108 memcpy(iv, aes_iv, ivsize);
109 /*
110 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
111 key, key_len, 1);
112 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
113 src, src_len, 1);
114 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
115 pad, zero_padding, 1);
116 */
117 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
118 src_len + zero_padding);
119 crypto_free_blkcipher(tfm);
120 if (ret < 0)
121 pr_err("ceph_aes_crypt failed %d\n", ret);
122 /*
123 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
124 dst, *dst_len, 1);
125 */
126 return ret;
127}
128
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
130 const void *src1, size_t src1_len,
131 const void *src2, size_t src2_len)
132{
133 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
135 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
136 int ret;
137 void *iv;
138 int ivsize;
139 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
140 char pad[16];
141
142 if (IS_ERR(tfm))
143 return PTR_ERR(tfm);
144
145 memset(pad, zero_padding, zero_padding);
146
147 *dst_len = src1_len + src2_len + zero_padding;
148
149 crypto_blkcipher_setkey((void *)tfm, key, key_len);
150 sg_init_table(sg_in, 3);
151 sg_set_buf(&sg_in[0], src1, src1_len);
152 sg_set_buf(&sg_in[1], src2, src2_len);
153 sg_set_buf(&sg_in[2], pad, zero_padding);
154 sg_init_table(sg_out, 1);
155 sg_set_buf(sg_out, dst, *dst_len);
156 iv = crypto_blkcipher_crt(tfm)->iv;
157 ivsize = crypto_blkcipher_ivsize(tfm);
158
159 memcpy(iv, aes_iv, ivsize);
160 /*
161 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
162 key, key_len, 1);
163 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
164 src1, src1_len, 1);
165 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
166 src2, src2_len, 1);
167 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
168 pad, zero_padding, 1);
169 */
170 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
171 src1_len + src2_len + zero_padding);
172 crypto_free_blkcipher(tfm);
173 if (ret < 0)
174 pr_err("ceph_aes_crypt2 failed %d\n", ret);
175 /*
176 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
177 dst, *dst_len, 1);
178 */
179 return ret;
180}
181
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
183 const void *src, size_t src_len)
184{
185 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
187 struct blkcipher_desc desc = { .tfm = tfm };
188 char pad[16];
189 void *iv;
190 int ivsize;
191 int ret;
192 int last_byte;
193
194 if (IS_ERR(tfm))
195 return PTR_ERR(tfm);
196
197 crypto_blkcipher_setkey((void *)tfm, key, key_len);
198 sg_init_table(sg_in, 1);
199 sg_init_table(sg_out, 2);
200 sg_set_buf(sg_in, src, src_len);
201 sg_set_buf(&sg_out[0], dst, *dst_len);
202 sg_set_buf(&sg_out[1], pad, sizeof(pad));
203
204 iv = crypto_blkcipher_crt(tfm)->iv;
205 ivsize = crypto_blkcipher_ivsize(tfm);
206
207 memcpy(iv, aes_iv, ivsize);
208
209 /*
210 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
211 key, key_len, 1);
212 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
213 src, src_len, 1);
214 */
215
216 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
217 crypto_free_blkcipher(tfm);
218 if (ret < 0) {
219 pr_err("ceph_aes_decrypt failed %d\n", ret);
220 return ret;
221 }
222
223 if (src_len <= *dst_len)
224 last_byte = ((char *)dst)[src_len - 1];
225 else
226 last_byte = pad[src_len - *dst_len - 1];
227 if (last_byte <= 16 && src_len >= last_byte) {
228 *dst_len = src_len - last_byte;
229 } else {
230 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
231 last_byte, (int)src_len);
232 return -EPERM; /* bad padding */
233 }
234 /*
235 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
236 dst, *dst_len, 1);
237 */
238 return 0;
239}
240
241int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len)
245{
246 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
248 struct blkcipher_desc desc = { .tfm = tfm };
249 char pad[16];
250 void *iv;
251 int ivsize;
252 int ret;
253 int last_byte;
254
255 if (IS_ERR(tfm))
256 return PTR_ERR(tfm);
257
258 sg_init_table(sg_in, 1);
259 sg_set_buf(sg_in, src, src_len);
260 sg_init_table(sg_out, 3);
261 sg_set_buf(&sg_out[0], dst1, *dst1_len);
262 sg_set_buf(&sg_out[1], dst2, *dst2_len);
263 sg_set_buf(&sg_out[2], pad, sizeof(pad));
264
265 crypto_blkcipher_setkey((void *)tfm, key, key_len);
266 iv = crypto_blkcipher_crt(tfm)->iv;
267 ivsize = crypto_blkcipher_ivsize(tfm);
268
269 memcpy(iv, aes_iv, ivsize);
270
271 /*
272 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
273 key, key_len, 1);
274 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
275 src, src_len, 1);
276 */
277
278 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
279 crypto_free_blkcipher(tfm);
280 if (ret < 0) {
281 pr_err("ceph_aes_decrypt failed %d\n", ret);
282 return ret;
283 }
284
285 if (src_len <= *dst1_len)
286 last_byte = ((char *)dst1)[src_len - 1];
287 else if (src_len <= *dst1_len + *dst2_len)
288 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
289 else
290 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
291 if (last_byte <= 16 && src_len >= last_byte) {
292 src_len -= last_byte;
293 } else {
294 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
295 last_byte, (int)src_len);
296 return -EPERM; /* bad padding */
297 }
298
299 if (src_len < *dst1_len) {
300 *dst1_len = src_len;
301 *dst2_len = 0;
302 } else {
303 *dst2_len = src_len - *dst1_len;
304 }
305 /*
306 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
307 dst1, *dst1_len, 1);
308 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
309 dst2, *dst2_len, 1);
310 */
311
312 return 0;
313}
314
315
316int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
317 const void *src, size_t src_len)
318{
319 switch (secret->type) {
320 case CEPH_CRYPTO_NONE:
321 if (*dst_len < src_len)
322 return -ERANGE;
323 memcpy(dst, src, src_len);
324 *dst_len = src_len;
325 return 0;
326
327 case CEPH_CRYPTO_AES:
328 return ceph_aes_decrypt(secret->key, secret->len, dst,
329 dst_len, src, src_len);
330
331 default:
332 return -EINVAL;
333 }
334}
335
336int ceph_decrypt2(struct ceph_crypto_key *secret,
337 void *dst1, size_t *dst1_len,
338 void *dst2, size_t *dst2_len,
339 const void *src, size_t src_len)
340{
341 size_t t;
342
343 switch (secret->type) {
344 case CEPH_CRYPTO_NONE:
345 if (*dst1_len + *dst2_len < src_len)
346 return -ERANGE;
347 t = min(*dst1_len, src_len);
348 memcpy(dst1, src, t);
349 *dst1_len = t;
350 src += t;
351 src_len -= t;
352 if (src_len) {
353 t = min(*dst2_len, src_len);
354 memcpy(dst2, src, t);
355 *dst2_len = t;
356 }
357 return 0;
358
359 case CEPH_CRYPTO_AES:
360 return ceph_aes_decrypt2(secret->key, secret->len,
361 dst1, dst1_len, dst2, dst2_len,
362 src, src_len);
363
364 default:
365 return -EINVAL;
366 }
367}
368
369int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
370 const void *src, size_t src_len)
371{
372 switch (secret->type) {
373 case CEPH_CRYPTO_NONE:
374 if (*dst_len < src_len)
375 return -ERANGE;
376 memcpy(dst, src, src_len);
377 *dst_len = src_len;
378 return 0;
379
380 case CEPH_CRYPTO_AES:
381 return ceph_aes_encrypt(secret->key, secret->len, dst,
382 dst_len, src, src_len);
383
384 default:
385 return -EINVAL;
386 }
387}
388
389int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
390 const void *src1, size_t src1_len,
391 const void *src2, size_t src2_len)
392{
393 switch (secret->type) {
394 case CEPH_CRYPTO_NONE:
395 if (*dst_len < src1_len + src2_len)
396 return -ERANGE;
397 memcpy(dst, src1, src1_len);
398 memcpy(dst + src1_len, src2, src2_len);
399 *dst_len = src1_len + src2_len;
400 return 0;
401
402 case CEPH_CRYPTO_AES:
403 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
404 src1, src1_len, src2, src2_len);
405
406 default:
407 return -EINVAL;
408 }
409}
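
The padding scheme above always appends between 1 and 16 bytes, each byte
holding the pad length, so decrypt can recover the plaintext length from the
last decrypted byte. A standalone sketch of just the arithmetic:

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t src_len;

	for (src_len = 14; src_len <= 17; src_len++) {
		size_t pad = 0x10 - (src_len & 0x0f); /* as in ceph_aes_encrypt() */

		printf("src %2zu -> pad %2zu -> cipher %2zu\n",
		       src_len, pad, src_len + pad);
	}
	return 0;
}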
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
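
Callers are expected to go through the type dispatchers rather than the AES
helpers directly; a hypothetical round trip (the function is ours, with buffer
sizes chosen to leave room for up to 16 pad bytes in the AES case):

/* hypothetical round trip through the secret-type dispatchers above */
static int roundtrip(struct ceph_crypto_key *secret)
{
	char plain[] = "payload", out[64], back[64];
	size_t out_len = sizeof(out), back_len = sizeof(back);
	int ret;

	ret = ceph_encrypt(secret, out, &out_len, plain, sizeof(plain));
	if (ret)
		return ret;
	return ceph_decrypt(secret, back, &back_len, out, out_len);
}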
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..f7048da92acc
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,484 @@
1#include "ceph_debug.h"
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53
54static int mdsmap_show(struct seq_file *s, void *p)
55{
56 int i;
57 struct ceph_client *client = s->private;
58
59 if (client->mdsc.mdsmap == NULL)
60 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state;
71
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state));
74 }
75 return 0;
76}
77
78static int osdmap_show(struct seq_file *s, void *p)
79{
80 int i;
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node);
131 seq_printf(s, "%lld statfs\n", req->tid);
132 }
133
134 mutex_unlock(&monc->mutex);
135 return 0;
136}
137
138static int mdsc_show(struct seq_file *s, void *p)
139{
140 struct ceph_client *client = s->private;
141 struct ceph_mds_client *mdsc = &client->mdsc;
142 struct ceph_mds_request *req;
143 struct rb_node *rp;
144 int pathlen;
145 u64 pathbase;
146 char *path;
147
148 mutex_lock(&mdsc->mutex);
149 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
150 req = rb_entry(rp, struct ceph_mds_request, r_node);
151
152 if (req->r_request)
153 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
154 else
155 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
156
157 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
158
159 if (req->r_got_unsafe)
160 seq_printf(s, "\t(unsafe)");
161 else
162 seq_printf(s, "\t");
163
164 if (req->r_inode) {
165 seq_printf(s, " #%llx", ceph_ino(req->r_inode));
166 } else if (req->r_dentry) {
167 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
168 &pathbase, 0);
169 spin_lock(&req->r_dentry->d_lock);
170 seq_printf(s, " #%llx/%.*s (%s)",
171 ceph_ino(req->r_dentry->d_parent->d_inode),
172 req->r_dentry->d_name.len,
173 req->r_dentry->d_name.name,
174 path ? path : "");
175 spin_unlock(&req->r_dentry->d_lock);
176 kfree(path);
177 } else if (req->r_path1) {
178 seq_printf(s, " #%llx/%s", req->r_ino1.ino,
179 req->r_path1);
180 }
181
182 if (req->r_old_dentry) {
183 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
184 &pathbase, 0);
185 spin_lock(&req->r_old_dentry->d_lock);
186 seq_printf(s, " #%llx/%.*s (%s)",
187 ceph_ino(req->r_old_dentry->d_parent->d_inode),
188 req->r_old_dentry->d_name.len,
189 req->r_old_dentry->d_name.name,
190 path ? path : "");
191 spin_unlock(&req->r_old_dentry->d_lock);
192 kfree(path);
193 } else if (req->r_path2) {
194 if (req->r_ino2.ino)
195 seq_printf(s, " #%llx/%s", req->r_ino2.ino,
196 req->r_path2);
197 else
198 seq_printf(s, " %s", req->r_path2);
199 }
200
201 seq_printf(s, "\n");
202 }
203 mutex_unlock(&mdsc->mutex);
204
205 return 0;
206}
207
208static int osdc_show(struct seq_file *s, void *pp)
209{
210 struct ceph_client *client = s->private;
211 struct ceph_osd_client *osdc = &client->osdc;
212 struct rb_node *p;
213
214 mutex_lock(&osdc->request_mutex);
215 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
216 struct ceph_osd_request *req;
217 struct ceph_osd_request_head *head;
218 struct ceph_osd_op *op;
219 int num_ops;
220 int opcode, olen;
221 int i;
222
223 req = rb_entry(p, struct ceph_osd_request, r_node);
224
225 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
226 req->r_osd ? req->r_osd->o_osd : -1,
227 le32_to_cpu(req->r_pgid.pool),
228 le16_to_cpu(req->r_pgid.ps));
229
230 head = req->r_request->front.iov_base;
231 op = (void *)(head + 1);
232
233 num_ops = le16_to_cpu(head->num_ops);
234 olen = le32_to_cpu(head->object_len);
235 seq_printf(s, "%.*s", olen,
236 (const char *)(head->ops + num_ops));
237
238 if (req->r_reassert_version.epoch)
239 seq_printf(s, "\t%u'%llu",
240 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
241 le64_to_cpu(req->r_reassert_version.version));
242 else
243 seq_printf(s, "\t");
244
245 for (i = 0; i < num_ops; i++) {
246 opcode = le16_to_cpu(op->op);
247 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
248 op++;
249 }
250
251 seq_printf(s, "\n");
252 }
253 mutex_unlock(&osdc->request_mutex);
254 return 0;
255}
256
257static int caps_show(struct seq_file *s, void *p)
258{
259 struct ceph_client *client = s->private;
260 int total, avail, used, reserved, min;
261
262 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
263 seq_printf(s, "total\t\t%d\n"
264 "avail\t\t%d\n"
265 "used\t\t%d\n"
266 "reserved\t%d\n"
267 "min\t%d\n",
268 total, avail, used, reserved, min);
269 return 0;
270}
271
272static int dentry_lru_show(struct seq_file *s, void *ptr)
273{
274 struct ceph_client *client = s->private;
275 struct ceph_mds_client *mdsc = &client->mdsc;
276 struct ceph_dentry_info *di;
277
278 spin_lock(&mdsc->dentry_lru_lock);
279 list_for_each_entry(di, &mdsc->dentry_lru, lru) {
280 struct dentry *dentry = di->dentry;
281 seq_printf(s, "%p %p\t%.*s\n",
282 di, dentry, dentry->d_name.len, dentry->d_name.name);
283 }
284 spin_unlock(&mdsc->dentry_lru_lock);
285
286 return 0;
287}
288
289#define DEFINE_SHOW_FUNC(name) \
290static int name##_open(struct inode *inode, struct file *file) \
291{ \
292 struct seq_file *sf; \
293 int ret; \
294 \
295 ret = single_open(file, name, NULL); \
296 sf = file->private_data; \
297 sf->private = inode->i_private; \
298 return ret; \
299} \
300 \
301static const struct file_operations name##_fops = { \
302 .open = name##_open, \
303 .read = seq_read, \
304 .llseek = seq_lseek, \
305 .release = single_release, \
306};
307
308DEFINE_SHOW_FUNC(monmap_show)
309DEFINE_SHOW_FUNC(mdsmap_show)
310DEFINE_SHOW_FUNC(osdmap_show)
311DEFINE_SHOW_FUNC(monc_show)
312DEFINE_SHOW_FUNC(mdsc_show)
313DEFINE_SHOW_FUNC(osdc_show)
314DEFINE_SHOW_FUNC(dentry_lru_show)
315DEFINE_SHOW_FUNC(caps_show)
316
317static int congestion_kb_set(void *data, u64 val)
318{
319 struct ceph_client *client = (struct ceph_client *)data;
320
321 if (client)
322 client->mount_args->congestion_kb = (int)val;
323
324 return 0;
325}
326
327static int congestion_kb_get(void *data, u64 *val)
328{
329 struct ceph_client *client = (struct ceph_client *)data;
330
331 if (client)
332 *val = (u64)client->mount_args->congestion_kb;
333
334 return 0;
335}
336
337
338DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
339 congestion_kb_set, "%llu\n");
340
341int __init ceph_debugfs_init(void)
342{
343 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
344 if (!ceph_debugfs_dir)
345 return -ENOMEM;
346 return 0;
347}
348
349void ceph_debugfs_cleanup(void)
350{
351 debugfs_remove(ceph_debugfs_dir);
352}
353
354int ceph_debugfs_client_init(struct ceph_client *client)
355{
356 int ret = 0;
357 char name[80];
358
359 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
360 PR_FSID(&client->fsid), client->monc.auth->global_id);
361
362 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
363 if (!client->debugfs_dir)
364 goto out;
365
366 client->monc.debugfs_file = debugfs_create_file("monc",
367 0600,
368 client->debugfs_dir,
369 client,
370 &monc_show_fops);
371 if (!client->monc.debugfs_file)
372 goto out;
373
374 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
375 0600,
376 client->debugfs_dir,
377 client,
378 &mdsc_show_fops);
379 if (!client->mdsc.debugfs_file)
380 goto out;
381
382 client->osdc.debugfs_file = debugfs_create_file("osdc",
383 0600,
384 client->debugfs_dir,
385 client,
386 &osdc_show_fops);
387 if (!client->osdc.debugfs_file)
388 goto out;
389
390 client->debugfs_monmap = debugfs_create_file("monmap",
391 0600,
392 client->debugfs_dir,
393 client,
394 &monmap_show_fops);
395 if (!client->debugfs_monmap)
396 goto out;
397
398 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
399 0600,
400 client->debugfs_dir,
401 client,
402 &mdsmap_show_fops);
403 if (!client->debugfs_mdsmap)
404 goto out;
405
406 client->debugfs_osdmap = debugfs_create_file("osdmap",
407 0600,
408 client->debugfs_dir,
409 client,
410 &osdmap_show_fops);
411 if (!client->debugfs_osdmap)
412 goto out;
413
414 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
415 0600,
416 client->debugfs_dir,
417 client,
418 &dentry_lru_show_fops);
419 if (!client->debugfs_dentry_lru)
420 goto out;
421
422 client->debugfs_caps = debugfs_create_file("caps",
423 0400,
424 client->debugfs_dir,
425 client,
426 &caps_show_fops);
427 if (!client->debugfs_caps)
428 goto out;
429
430 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
431 0600,
432 client->debugfs_dir,
433 client,
434 &congestion_kb_fops);
435 if (!client->debugfs_congestion_kb)
436 goto out;
437
438 snprintf(name, sizeof(name), "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
439 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
440 name);
441
442 return 0;
443
444out:
445 ceph_debugfs_client_cleanup(client);
446 return ret;
447}
448
449void ceph_debugfs_client_cleanup(struct ceph_client *client)
450{
451 debugfs_remove(client->debugfs_bdi);
452 debugfs_remove(client->debugfs_caps);
453 debugfs_remove(client->debugfs_dentry_lru);
454 debugfs_remove(client->debugfs_osdmap);
455 debugfs_remove(client->debugfs_mdsmap);
456 debugfs_remove(client->debugfs_monmap);
457 debugfs_remove(client->osdc.debugfs_file);
458 debugfs_remove(client->mdsc.debugfs_file);
459 debugfs_remove(client->monc.debugfs_file);
460 debugfs_remove(client->debugfs_congestion_kb);
461 debugfs_remove(client->debugfs_dir);
462}
463
464#else /* CONFIG_DEBUG_FS */
465
466int __init ceph_debugfs_init(void)
467{
468 return 0;
469}
470
471void ceph_debugfs_cleanup(void)
472{
473}
474
475int ceph_debugfs_client_init(struct ceph_client *client)
476{
477 return 0;
478}
479
480void ceph_debugfs_client_cleanup(struct ceph_client *client)
481{
482}
483
484#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
80
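/*
 * Usage sketch (illustrative only; parse_name and its buffer are
 * hypothetical, not part of this header): decode a little-endian
 * { le32 len; char data[len]; } string with bounds checking.
 */
static inline int parse_name(void **p, void *end, char *buf, size_t buflen)
{
	u32 len;

	ceph_decode_32_safe(p, end, len, bad);	/* length prefix */
	ceph_decode_need(p, end, len, bad);	/* payload fully in bounds? */
	if (len >= buflen)
		goto bad;
	ceph_decode_copy(p, buf, len);
	buf[len] = '\0';
	return 0;
bad:
	return -ERANGE;
}
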
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family);
103}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
107 WARN_ON(a->in_addr.ss_family == 512); /* 512 == ntohs(AF_INET): family was never byte-swapped */
108}
109
110/*
111 * encoders
112 */
113static inline void ceph_encode_64(void **p, u64 v)
114{
115 put_unaligned_le64(v, (__le64 *)*p);
116 *p += sizeof(u64);
117}
118static inline void ceph_encode_32(void **p, u32 v)
119{
120 put_unaligned_le32(v, (__le32 *)*p);
121 *p += sizeof(u32);
122}
123static inline void ceph_encode_16(void **p, u16 v)
124{
125 put_unaligned_le16(v, (__le16 *)*p);
126 *p += sizeof(u16);
127}
128static inline void ceph_encode_8(void **p, u8 v)
129{
130 *(u8 *)*p = v;
131 (*p)++;
132}
133static inline void ceph_encode_copy(void **p, const void *s, int len)
134{
135 memcpy(*p, s, len);
136 *p += len;
137}
138
139/*
140 * filepath, string encoders
141 */
142static inline void ceph_encode_filepath(void **p, void *end,
143 u64 ino, const char *path)
144{
145 u32 len = path ? strlen(path) : 0;
146 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
147 ceph_encode_8(p, 1);
148 ceph_encode_64(p, ino);
149 ceph_encode_32(p, len);
150 if (len)
151 memcpy(*p, path, len);
152 *p += len;
153}
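
/*
 * ceph_encode_filepath() above emits: u8 version (always 1 here), the
 * le64 ino, an le32 length, then the path bytes with no NUL terminator.
 */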
154
155static inline void ceph_encode_string(void **p, void *end,
156 const char *s, u32 len)
157{
158 BUG_ON(*p + sizeof(len) + len > end);
159 ceph_encode_32(p, len);
160 if (len)
161 memcpy(*p, s, len);
162 *p += len;
163}
164
165#define ceph_encode_need(p, end, n, bad) \
166 do { \
167 if (unlikely(*(p) + (n) > (end))) \
168 goto bad; \
169 } while (0)
170
171#define ceph_encode_64_safe(p, end, v, bad) \
172 do { \
173 ceph_encode_need(p, end, sizeof(u64), bad); \
174 ceph_encode_64(p, v); \
175 } while (0)
176#define ceph_encode_32_safe(p, end, v, bad) \
177 do { \
178 ceph_encode_need(p, end, sizeof(u32), bad); \
179 ceph_encode_32(p, v); \
180 } while (0)
181#define ceph_encode_16_safe(p, end, v, bad) \
182 do { \
183 ceph_encode_need(p, end, sizeof(u16), bad); \
184 ceph_encode_16(p, v); \
185 } while (0)
186
187#define ceph_encode_copy_safe(p, end, pv, n, bad) \
188 do { \
189 ceph_encode_need(p, end, n, bad); \
190 ceph_encode_copy(p, pv, n); \
191 } while (0)
192
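/*
 * Usage sketch (illustrative only; emit_flags is a hypothetical
 * caller, not part of this header): encode a version tag and a flag
 * word into a bounded buffer.
 */
static inline int emit_flags(void **p, void *end, u32 flags)
{
	ceph_encode_32_safe(p, end, 1, bad);	/* struct version */
	ceph_encode_32_safe(p, end, flags, bad);
	return 0;
bad:
	return -ERANGE;	/* ran out of buffer */
}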
193
194#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..7261dc6c2ead
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1223 @@
1#include "ceph_debug.h"
2
3#include <linux/spinlock.h>
4#include <linux/fs_struct.h>
5#include <linux/namei.h>
6#include <linux/slab.h>
7#include <linux/sched.h>
8
9#include "super.h"
10
11/*
12 * Directory operations: readdir, lookup, create, link, unlink,
13 * rename, etc.
14 */
15
16/*
17 * Ceph MDS operations are specified in terms of a base ino and
18 * relative path. Thus, the client can specify an operation on a
19 * specific inode (e.g., a getattr due to fstat(2)), or as a path
20 * relative to, say, the root directory.
21 *
22 * Normally, we limit ourselves to strict inode ops (no path component)
23 * or dentry operations (a single path component relative to an ino). The
24 * exception to this is open_root_dentry(), which will open the mount
25 * point by name.
26 */
27
28const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops;
31
32/*
33 * Initialize ceph dentry state.
34 */
35int ceph_init_dentry(struct dentry *dentry)
36{
37 struct ceph_dentry_info *di;
38
39 if (dentry->d_fsdata)
40 return 0;
41
42 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
43 dentry->d_op = &ceph_dentry_ops;
44 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
45 dentry->d_op = &ceph_snapdir_dentry_ops;
46 else
47 dentry->d_op = &ceph_snap_dentry_ops;
48
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
50 if (!di)
51 return -ENOMEM; /* oh well */
52
53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */
55 goto out_unlock;
56 di->dentry = dentry;
57 di->lease_session = NULL;
58 dentry->d_fsdata = di;
59 dentry->d_time = jiffies;
60 ceph_dentry_lru_add(dentry);
61out_unlock:
62 spin_unlock(&dentry->d_lock);
63 return 0;
64}
65
66
67
68/*
69 * for readdir, we encode the directory frag and offset within that
70 * frag into f_pos.
71 */
72static unsigned fpos_frag(loff_t p)
73{
74 return p >> 32;
75}
76static unsigned fpos_off(loff_t p)
77{
78 return p & 0xffffffff;
79}
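
/*
 * The inverse, ceph_make_fpos() (defined in super.h), packs the pair
 * back into an loff_t, roughly ((loff_t)frag << 32) | off, so that
 * fpos_frag(ceph_make_fpos(f, o)) == f and fpos_off(...) == o.
 */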
80
81/*
82 * When possible, we try to satisfy a readdir by peeking at the
83 * dcache. We make this work by carefully ordering dentries on
84 * d_u.d_child when we initially get results back from the MDS, and
85 * falling back to a "normal" sync readdir if any dentries in the dir
86 * are dropped.
87 *
88 * I_COMPLETE indicates we have all dentries in the dir. It is
89 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
90 * the MDS if/when the directory is modified).
91 */
92static int __dcache_readdir(struct file *filp,
93 void *dirent, filldir_t filldir)
94{
95 struct inode *inode = filp->f_dentry->d_inode;
96 struct ceph_file_info *fi = filp->private_data;
97 struct dentry *parent = filp->f_dentry;
98 struct inode *dir = parent->d_inode;
99 struct list_head *p;
100 struct dentry *dentry, *last;
101 struct ceph_dentry_info *di;
102 int err = 0;
103
104 /* claim ref on last dentry we returned */
105 last = fi->dentry;
106 fi->dentry = NULL;
107
108 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
109 last);
110
111 spin_lock(&dcache_lock);
112
113 /* start at beginning? */
114 if (filp->f_pos == 2 || (last &&
115 filp->f_pos < ceph_dentry(last)->offset)) {
116 if (list_empty(&parent->d_subdirs))
117 goto out_unlock;
118 p = parent->d_subdirs.prev;
119 dout(" initial p %p/%p\n", p->prev, p->next);
120 } else {
121 p = last->d_u.d_child.prev;
122 }
123
124more:
125 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry);
127 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
129 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) {
131 fi->at_end = 1;
132 goto out_unlock;
133 }
134 if (!d_unhashed(dentry) && dentry->d_inode &&
135 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
136 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
137 filp->f_pos <= di->offset)
138 break;
139 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
140 dentry->d_name.len, dentry->d_name.name, di->offset,
141 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
142 !dentry->d_inode ? " null" : "");
143 p = p->prev;
144 dentry = list_entry(p, struct dentry, d_u.d_child);
145 di = ceph_dentry(dentry);
146 }
147
148 atomic_inc(&dentry->d_count);
149 spin_unlock(&dcache_lock);
150 spin_unlock(&inode->i_lock);
151
152 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
153 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
154 filp->f_pos = di->offset;
155 err = filldir(dirent, dentry->d_name.name,
156 dentry->d_name.len, di->offset,
157 dentry->d_inode->i_ino,
158 dentry->d_inode->i_mode >> 12);
159
160 if (last) {
161 if (err < 0) {
162 /* remember our position */
163 fi->dentry = last;
164 fi->next_offset = di->offset;
165 } else {
166 dput(last);
167 }
168 last = NULL;
169 }
170
171 spin_lock(&inode->i_lock);
172 spin_lock(&dcache_lock);
173
174 if (err < 0)
175 goto out_unlock;
176
177 last = dentry;
178
179 p = p->prev;
180 filp->f_pos++;
181
182 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
183 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
184 goto more;
185 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
186 err = -EAGAIN;
187
188out_unlock:
189 spin_unlock(&dcache_lock);
190
191 if (last) {
192 spin_unlock(&inode->i_lock);
193 dput(last);
194 spin_lock(&inode->i_lock);
195 }
196
197 return err;
198}
199
200/*
201 * make note of the last dentry we read, so we can
202 * continue at the same lexicographical point,
203 * regardless of what dir changes take place on the
204 * server.
205 */
206static int note_last_dentry(struct ceph_file_info *fi, const char *name,
207 int len)
208{
209 kfree(fi->last_name);
210 fi->last_name = kmalloc(len+1, GFP_NOFS);
211 if (!fi->last_name)
212 return -ENOMEM;
213 memcpy(fi->last_name, name, len);
214 fi->last_name[len] = 0;
215 dout("note_last_dentry '%s'\n", fi->last_name);
216 return 0;
217}
218
219static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
220{
221 struct ceph_file_info *fi = filp->private_data;
222 struct inode *inode = filp->f_dentry->d_inode;
223 struct ceph_inode_info *ci = ceph_inode(inode);
224 struct ceph_client *client = ceph_inode_to_client(inode);
225 struct ceph_mds_client *mdsc = &client->mdsc;
226 unsigned frag = fpos_frag(filp->f_pos);
227 int off = fpos_off(filp->f_pos);
228 int err;
229 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir;
232
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end)
235 return 0;
236
237 /* always start with . and .. */
238 if (filp->f_pos == 0) {
239 /* note dir version at start of readdir so we can tell
240 * if any dentries get dropped */
241 fi->dir_release_count = ci->i_release_count;
242
243 dout("readdir off 0 -> '.'\n");
244 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
245 inode->i_ino, inode->i_mode >> 12) < 0)
246 return 0;
247 filp->f_pos = 1;
248 off = 1;
249 }
250 if (filp->f_pos == 1) {
251 dout("readdir off 1 -> '..'\n");
252 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
253 filp->f_dentry->d_parent->d_inode->i_ino,
254 inode->i_mode >> 12) < 0)
255 return 0;
256 filp->f_pos = 2;
257 off = 2;
258 }
259
260 /* can we use the dcache? */
261 spin_lock(&inode->i_lock);
262 if ((filp->f_pos == 2 || fi->dentry) &&
263 !ceph_test_opt(client, NOASYNCREADDIR) &&
264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 err = __dcache_readdir(filp, dirent, filldir);
267 if (err != -EAGAIN) {
268 spin_unlock(&inode->i_lock);
269 return err;
270 }
271 }
272 spin_unlock(&inode->i_lock);
273 if (fi->dentry) {
274 err = note_last_dentry(fi, fi->dentry->d_name.name,
275 fi->dentry->d_name.len);
276 if (err)
277 return err;
278 dput(fi->dentry);
279 fi->dentry = NULL;
280 }
281
282 /* proceed with a normal readdir */
283
284more:
285 /* do we have the correct frag content buffered? */
286 if (fi->frag != frag || fi->last_readdir == NULL) {
287 struct ceph_mds_request *req;
288 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
289 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
290
291 /* discard old result, if any */
292 if (fi->last_readdir) {
293 ceph_mdsc_put_request(fi->last_readdir);
294 fi->last_readdir = NULL;
295 }
296
297 /* requery frag tree, as the frag topology may have changed */
298 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
299
300 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
301 ceph_vinop(inode), frag, fi->last_name);
302 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
303 if (IS_ERR(req))
304 return PTR_ERR(req);
305 req->r_inode = igrab(inode);
306 req->r_dentry = dget(filp->f_dentry);
307 /* hints to request -> mds selection code */
308 req->r_direct_mode = USE_AUTH_MDS;
309 req->r_direct_hash = ceph_frag_value(frag);
310 req->r_direct_is_hash = true;
311 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
312 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
315 req->r_num_caps = max_entries;
316 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) {
318 ceph_mdsc_put_request(req);
319 return err;
320 }
321 dout("readdir got and parsed readdir result=%d"
322 " on frag %x, end=%d, complete=%d\n", err, frag,
323 (int)req->r_reply_info.dir_end,
324 (int)req->r_reply_info.dir_complete);
325
326 if (!req->r_did_prepopulate) {
327 dout("readdir !did_prepopulate\n");
328 fi->dir_release_count--; /* preclude I_COMPLETE */
329 }
330
331 /* note next offset and last dentry name */
332 fi->offset = fi->next_offset;
333 fi->last_readdir = req;
334
335 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name);
337 fi->last_name = NULL;
338 fi->next_offset = 0;
339 } else {
340 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi,
342 rinfo->dir_dname[rinfo->dir_nr-1],
343 rinfo->dir_dname_len[rinfo->dir_nr-1]);
344 if (err)
345 return err;
346 fi->next_offset += rinfo->dir_nr;
347 }
348 }
349
350 rinfo = &fi->last_readdir->r_reply_info;
351 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
352 rinfo->dir_nr, off, fi->offset);
353 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
354 u64 pos = ceph_make_fpos(frag, off);
355 struct ceph_mds_reply_inode *in =
356 rinfo->dir_in[off - fi->offset].in;
357 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
358 off, off - fi->offset, rinfo->dir_nr, pos,
359 rinfo->dir_dname_len[off - fi->offset],
360 rinfo->dir_dname[off - fi->offset], in);
361 BUG_ON(!in);
362 ftype = le32_to_cpu(in->mode) >> 12;
363 if (filldir(dirent,
364 rinfo->dir_dname[off - fi->offset],
365 rinfo->dir_dname_len[off - fi->offset],
366 pos,
367 le64_to_cpu(in->ino),
368 ftype) < 0) {
369 dout("filldir stopping us...\n");
370 return 0;
371 }
372 off++;
373 filp->f_pos = pos + 1;
374 }
375
376 if (fi->last_name) {
377 ceph_mdsc_put_request(fi->last_readdir);
378 fi->last_readdir = NULL;
379 goto more;
380 }
381
382 /* more frags? */
383 if (!ceph_frag_is_rightmost(frag)) {
384 frag = ceph_frag_next(frag);
385 off = 0;
386 filp->f_pos = ceph_make_fpos(frag, off);
387 dout("readdir next frag is %x\n", frag);
388 goto more;
389 }
390 fi->at_end = 1;
391
392 /*
393 * if dir_release_count still matches the dir, no dentries
394 * were released during the whole readdir, and we should have
395 * the complete dir contents in our cache.
396 */
397 spin_lock(&inode->i_lock);
398 if (ci->i_release_count == fi->dir_release_count) {
399 dout(" marking %p complete\n", inode);
400 ci->i_ceph_flags |= CEPH_I_COMPLETE;
401 ci->i_max_offset = filp->f_pos;
402 }
403 spin_unlock(&inode->i_lock);
404
405 dout("readdir %p filp %p done.\n", inode, filp);
406 return 0;
407}
408
409static void reset_readdir(struct ceph_file_info *fi)
410{
411 if (fi->last_readdir) {
412 ceph_mdsc_put_request(fi->last_readdir);
413 fi->last_readdir = NULL;
414 }
415 kfree(fi->last_name);
416 fi->next_offset = 2; /* compensate for . and .. */
417 if (fi->dentry) {
418 dput(fi->dentry);
419 fi->dentry = NULL;
420 }
421 fi->at_end = 0;
422}
423
424static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
425{
426 struct ceph_file_info *fi = file->private_data;
427 struct inode *inode = file->f_mapping->host;
428 loff_t old_offset = offset;
429 loff_t retval;
430
431 mutex_lock(&inode->i_mutex);
432 switch (origin) {
433 case SEEK_END:
434 offset += inode->i_size + 2; /* FIXME */
435 break;
436 case SEEK_CUR:
437 offset += file->f_pos;
438 }
439 retval = -EINVAL;
440 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
441 if (offset != file->f_pos) {
442 file->f_pos = offset;
443 file->f_version = 0;
444 fi->at_end = 0;
445 }
446 retval = offset;
447
448 /*
449 * discard buffered readdir content on seekdir(0), or
450 * seek to new frag, or seek prior to current chunk.
451 */
452 if (offset == 0 ||
453 fpos_frag(offset) != fpos_frag(old_offset) ||
454 fpos_off(offset) < fi->offset) {
455 dout("dir_llseek dropping %p content\n", file);
456 reset_readdir(fi);
457 }
458
459 /* bump dir_release_count if we did a forward seek */
460 if (offset > old_offset)
461 fi->dir_release_count--;
462 }
463 mutex_unlock(&inode->i_mutex);
464 return retval;
465}
466
467/*
468 * Process result of a lookup/open request.
469 *
470 * Mainly, make sure we return the final req->r_dentry (if it already
471 * existed) in place of the original VFS-provided dentry when they
472 * differ.
473 *
474 * Gracefully handle the case where the MDS replies with -ENOENT and
475 * no trace (which it may do, at its discretion, e.g., if it doesn't
476 * care to issue a lease on the negative dentry).
477 */
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err)
480{
481 struct ceph_client *client = ceph_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode;
483
484 /* .snap dir? */
485 if (err == -ENOENT &&
486 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
487 strcmp(dentry->d_name.name,
488 client->mount_args->snapdir_name) == 0) {
489 struct inode *inode = ceph_get_snapdir(parent);
490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
492 d_add(dentry, inode);
493 err = 0;
494 }
495
496 if (err == -ENOENT) {
497 /* no trace? */
498 err = 0;
499 if (!req->r_reply_info.head->is_dentry) {
500 dout("ENOENT and no trace, dentry %p inode %p\n",
501 dentry, dentry->d_inode);
502 if (dentry->d_inode) {
503 d_drop(dentry);
504 err = -ENOENT;
505 } else {
506 d_add(dentry, NULL);
507 }
508 }
509 }
510 if (err)
511 dentry = ERR_PTR(err);
512 else if (dentry != req->r_dentry)
513 dentry = dget(req->r_dentry); /* we got spliced */
514 else
515 dentry = NULL;
516 return dentry;
517}
518
519static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
520{
521 return ceph_ino(inode) == CEPH_INO_ROOT &&
522 strcmp(dentry->d_name.name, ".ceph") == 0;
523}
524
525/*
526 * Look up a single dir entry. If there is a lookup intent, inform
527 * the MDS so that it gets our 'caps wanted' value in a single op.
528 */
529static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
530 struct nameidata *nd)
531{
532 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
533 struct ceph_mds_client *mdsc = &client->mdsc;
534 struct ceph_mds_request *req;
535 int op;
536 int err;
537
538 dout("lookup %p dentry %p '%.*s'\n",
539 dir, dentry, dentry->d_name.len, dentry->d_name.name);
540
541 if (dentry->d_name.len > NAME_MAX)
542 return ERR_PTR(-ENAMETOOLONG);
543
544 err = ceph_init_dentry(dentry);
545 if (err < 0)
546 return ERR_PTR(err);
547
548 /* open (but not create!) intent? */
549 if (nd &&
550 (nd->flags & LOOKUP_OPEN) &&
551 (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
552 !(nd->intent.open.flags & O_CREAT)) {
553 int mode = nd->intent.open.create_mode & ~current->fs->umask;
554 return ceph_lookup_open(dir, dentry, nd, mode, 1);
555 }
556
557 /* can we conclude ENOENT locally? */
558 if (dentry->d_inode == NULL) {
559 struct ceph_inode_info *ci = ceph_inode(dir);
560 struct ceph_dentry_info *di = ceph_dentry(dentry);
561
562 spin_lock(&dir->i_lock);
563 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
564 if (strncmp(dentry->d_name.name,
565 client->mount_args->snapdir_name,
566 dentry->d_name.len) &&
567 !is_root_ceph_dentry(dir, dentry) &&
568 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
569 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
570 di->offset = ci->i_max_offset++;
571 spin_unlock(&dir->i_lock);
572 dout(" dir %p complete, -ENOENT\n", dir);
573 d_add(dentry, NULL);
574 di->lease_shared_gen = ci->i_shared_gen;
575 return NULL;
576 }
577 spin_unlock(&dir->i_lock);
578 }
579
580 op = ceph_snap(dir) == CEPH_SNAPDIR ?
581 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
582 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
583 if (IS_ERR(req))
584 return ERR_CAST(req);
585 req->r_dentry = dget(dentry);
586 req->r_num_caps = 2;
587 /* we only need inode linkage */
588 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
589 req->r_locked_dir = dir;
590 err = ceph_mdsc_do_request(mdsc, NULL, req);
591 dentry = ceph_finish_lookup(req, dentry, err);
592 ceph_mdsc_put_request(req); /* will dput(dentry) */
593 dout("lookup result=%p\n", dentry);
594 return dentry;
595}
596
597/*
598 * If we do a create but get no trace back from the MDS, follow up with
599 * a lookup (the VFS expects us to link up the provided dentry).
600 */
601int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
602{
603 struct dentry *result = ceph_lookup(dir, dentry, NULL);
604
605 if (result && !IS_ERR(result)) {
606 /*
607 * We created the item, then did a lookup, and found
608 * it was already linked to another inode we already
609 * had in our cache (and thus got spliced). Link our
610 * dentry to that inode, but don't hash it, just in
611 * case the VFS wants to dereference it.
612 */
613 BUG_ON(!result->d_inode);
614 d_instantiate(dentry, result->d_inode);
615 return 0;
616 }
617 return PTR_ERR(result);
618}
619
620static int ceph_mknod(struct inode *dir, struct dentry *dentry,
621 int mode, dev_t rdev)
622{
623 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
624 struct ceph_mds_client *mdsc = &client->mdsc;
625 struct ceph_mds_request *req;
626 int err;
627
628 if (ceph_snap(dir) != CEPH_NOSNAP)
629 return -EROFS;
630
631 dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
632 dir, dentry, mode, rdev);
633 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
634 if (IS_ERR(req)) {
635 d_drop(dentry);
636 return PTR_ERR(req);
637 }
638 req->r_dentry = dget(dentry);
639 req->r_num_caps = 2;
640 req->r_locked_dir = dir;
641 req->r_args.mknod.mode = cpu_to_le32(mode);
642 req->r_args.mknod.rdev = cpu_to_le32(rdev);
643 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
644 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
645 err = ceph_mdsc_do_request(mdsc, dir, req);
646 if (!err && !req->r_reply_info.head->is_dentry)
647 err = ceph_handle_notrace_create(dir, dentry);
648 ceph_mdsc_put_request(req);
649 if (err)
650 d_drop(dentry);
651 return err;
652}
653
654static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
655 struct nameidata *nd)
656{
657 dout("create in dir %p dentry %p name '%.*s'\n",
658 dir, dentry, dentry->d_name.len, dentry->d_name.name);
659
660 if (ceph_snap(dir) != CEPH_NOSNAP)
661 return -EROFS;
662
663 if (nd) {
664 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
665 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
666 /* hrm, what should i do here if we get aliased? */
667 if (IS_ERR(dentry))
668 return PTR_ERR(dentry);
669 return 0;
670 }
671
672 /* fall back to mknod */
673 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
674}
675
676static int ceph_symlink(struct inode *dir, struct dentry *dentry,
677 const char *dest)
678{
679 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
680 struct ceph_mds_client *mdsc = &client->mdsc;
681 struct ceph_mds_request *req;
682 int err;
683
684 if (ceph_snap(dir) != CEPH_NOSNAP)
685 return -EROFS;
686
687 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
688 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
689 if (IS_ERR(req)) {
690 d_drop(dentry);
691 return PTR_ERR(req);
692 }
693 req->r_dentry = dget(dentry);
694 req->r_num_caps = 2;
695 req->r_path2 = kstrdup(dest, GFP_NOFS);
696 req->r_locked_dir = dir;
697 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
698 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
699 err = ceph_mdsc_do_request(mdsc, dir, req);
700 if (!err && !req->r_reply_info.head->is_dentry)
701 err = ceph_handle_notrace_create(dir, dentry);
702 ceph_mdsc_put_request(req);
703 if (err)
704 d_drop(dentry);
705 return err;
706}
707
708static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
709{
710 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
711 struct ceph_mds_client *mdsc = &client->mdsc;
712 struct ceph_mds_request *req;
713 int err = -EROFS;
714 int op;
715
716 if (ceph_snap(dir) == CEPH_SNAPDIR) {
717 /* mkdir .snap/foo is a MKSNAP */
718 op = CEPH_MDS_OP_MKSNAP;
719 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
720 dentry->d_name.len, dentry->d_name.name, dentry);
721 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
722 dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
723 op = CEPH_MDS_OP_MKDIR;
724 } else {
725 goto out;
726 }
727 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
728 if (IS_ERR(req)) {
729 err = PTR_ERR(req);
730 goto out;
731 }
732
733 req->r_dentry = dget(dentry);
734 req->r_num_caps = 2;
735 req->r_locked_dir = dir;
736 req->r_args.mkdir.mode = cpu_to_le32(mode);
737 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
738 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
739 err = ceph_mdsc_do_request(mdsc, dir, req);
740 if (!err && !req->r_reply_info.head->is_dentry)
741 err = ceph_handle_notrace_create(dir, dentry);
742 ceph_mdsc_put_request(req);
743out:
744 if (err < 0)
745 d_drop(dentry);
746 return err;
747}
748
749static int ceph_link(struct dentry *old_dentry, struct inode *dir,
750 struct dentry *dentry)
751{
752 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
753 struct ceph_mds_client *mdsc = &client->mdsc;
754 struct ceph_mds_request *req;
755 int err;
756
757 if (ceph_snap(dir) != CEPH_NOSNAP)
758 return -EROFS;
759
760 dout("link in dir %p old_dentry %p dentry %p\n", dir,
761 old_dentry, dentry);
762 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
763 if (IS_ERR(req)) {
764 d_drop(dentry);
765 return PTR_ERR(req);
766 }
767 req->r_dentry = dget(dentry);
768 req->r_num_caps = 2;
769 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
770 req->r_locked_dir = dir;
771 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
772 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
773 err = ceph_mdsc_do_request(mdsc, dir, req);
774 if (err)
775 d_drop(dentry);
776 else if (!req->r_reply_info.head->is_dentry)
777 d_instantiate(dentry, igrab(old_dentry->d_inode));
778 ceph_mdsc_put_request(req);
779 return err;
780}
781
782/*
783 * For a soon-to-be unlinked file, drop the LINK caps. If it
784 * looks like the link count will hit 0, drop any other caps (other
785 * than PIN) we don't specifically want (due to the file still being
786 * open).
787 */
788static int drop_caps_for_unlink(struct inode *inode)
789{
790 struct ceph_inode_info *ci = ceph_inode(inode);
791 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
792
793 spin_lock(&inode->i_lock);
794 if (inode->i_nlink == 1) {
795 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
796 ci->i_ceph_flags |= CEPH_I_NODELAY;
797 }
798 spin_unlock(&inode->i_lock);
799 return drop;
800}
801
802/*
803 * rmdir and unlink differ only by the metadata op code
804 */
805static int ceph_unlink(struct inode *dir, struct dentry *dentry)
806{
807 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
808 struct ceph_mds_client *mdsc = &client->mdsc;
809 struct inode *inode = dentry->d_inode;
810 struct ceph_mds_request *req;
811 int err = -EROFS;
812 int op;
813
814 if (ceph_snap(dir) == CEPH_SNAPDIR) {
815 /* rmdir .snap/foo is RMSNAP */
816 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
817 dentry->d_name.name, dentry);
818 op = CEPH_MDS_OP_RMSNAP;
819 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
820 dout("unlink/rmdir dir %p dn %p inode %p\n",
821 dir, dentry, inode);
822 op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
823 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
824 } else
825 goto out;
826 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
827 if (IS_ERR(req)) {
828 err = PTR_ERR(req);
829 goto out;
830 }
831 req->r_dentry = dget(dentry);
832 req->r_num_caps = 2;
833 req->r_locked_dir = dir;
834 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
835 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
836 req->r_inode_drop = drop_caps_for_unlink(inode);
837 err = ceph_mdsc_do_request(mdsc, dir, req);
838 if (!err && !req->r_reply_info.head->is_dentry)
839 d_delete(dentry);
840 ceph_mdsc_put_request(req);
841out:
842 return err;
843}
844
845static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
846 struct inode *new_dir, struct dentry *new_dentry)
847{
848 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
849 struct ceph_mds_client *mdsc = &client->mdsc;
850 struct ceph_mds_request *req;
851 int err;
852
853 if (ceph_snap(old_dir) != ceph_snap(new_dir))
854 return -EXDEV;
855 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
856 ceph_snap(new_dir) != CEPH_NOSNAP)
857 return -EROFS;
858 dout("rename dir %p dentry %p to dir %p dentry %p\n",
859 old_dir, old_dentry, new_dir, new_dentry);
860 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
861 if (IS_ERR(req))
862 return PTR_ERR(req);
863 req->r_dentry = dget(new_dentry);
864 req->r_num_caps = 2;
865 req->r_old_dentry = dget(old_dentry);
866 req->r_locked_dir = new_dir;
867 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
868 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
869 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
870 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
871 /* release LINK_RDCACHE on source inode (mds will lock it) */
872 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
873 if (new_dentry->d_inode)
874 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
875 err = ceph_mdsc_do_request(mdsc, old_dir, req);
876 if (!err && !req->r_reply_info.head->is_dentry) {
877 /*
878 * Normally d_move() is done by fill_trace (called by
879 * do_request, above). If there is no trace, we need
880 * to do it here.
881 */
882 d_move(old_dentry, new_dentry);
883 }
884 ceph_mdsc_put_request(req);
885 return err;
886}
887
888
889/*
890 * Check if dentry lease is valid. If not, delete the lease. Try to
891 * renew if the lease is more than half up.
892 */
893static int dentry_lease_is_valid(struct dentry *dentry)
894{
895 struct ceph_dentry_info *di;
896 struct ceph_mds_session *s;
897 int valid = 0;
898 u32 gen;
899 unsigned long ttl;
900 struct ceph_mds_session *session = NULL;
901 struct inode *dir = NULL;
902 u32 seq = 0;
903
904 spin_lock(&dentry->d_lock);
905 di = ceph_dentry(dentry);
906 if (di && di->lease_session) {
907 s = di->lease_session;
908 spin_lock(&s->s_cap_lock);
909 gen = s->s_cap_gen;
910 ttl = s->s_cap_ttl;
911 spin_unlock(&s->s_cap_lock);
912
913 if (di->lease_gen == gen &&
914 time_before(jiffies, dentry->d_time) &&
915 time_before(jiffies, ttl)) {
916 valid = 1;
917 if (di->lease_renew_after &&
918 time_after(jiffies, di->lease_renew_after)) {
919 /* we should renew */
920 dir = dentry->d_parent->d_inode;
921 session = ceph_get_mds_session(s);
922 seq = di->lease_seq;
923 di->lease_renew_after = 0;
924 di->lease_renew_from = jiffies;
925 }
926 }
927 }
928 spin_unlock(&dentry->d_lock);
929
930 if (session) {
931 ceph_mdsc_lease_send_msg(session, dir, dentry,
932 CEPH_MDS_LEASE_RENEW, seq);
933 ceph_put_mds_session(session);
934 }
935 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
936 return valid;
937}
938
939/*
940 * Check if directory-wide content lease/cap is valid.
941 */
942static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
943{
944 struct ceph_inode_info *ci = ceph_inode(dir);
945 struct ceph_dentry_info *di = ceph_dentry(dentry);
946 int valid = 0;
947
948 spin_lock(&dir->i_lock);
949 if (ci->i_shared_gen == di->lease_shared_gen)
950 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
951 spin_unlock(&dir->i_lock);
952 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
953 dir, (unsigned)ci->i_shared_gen, dentry,
954 (unsigned)di->lease_shared_gen, valid);
955 return valid;
956}
957
958/*
959 * Check if cached dentry can be trusted.
960 */
961static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
962{
963 struct inode *dir = dentry->d_parent->d_inode;
964
965 dout("d_revalidate %p '%.*s' inode %p\n", dentry,
966 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
967
968 /* always trust cached snapped dentries, snapdir dentry */
969 if (ceph_snap(dir) != CEPH_NOSNAP) {
970 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
971 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
972 goto out_touch;
973 }
974 if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
975 goto out_touch;
976
977 if (dentry_lease_is_valid(dentry) ||
978 dir_lease_is_valid(dir, dentry))
979 goto out_touch;
980
981 dout("d_revalidate %p invalid\n", dentry);
982 d_drop(dentry);
983 return 0;
984out_touch:
985 ceph_dentry_lru_touch(dentry);
986 return 1;
987}
988
989/*
990 * When a dentry is released, clear the dir I_COMPLETE if it was part
991 * of the current dir gen.
992 */
993static void ceph_dentry_release(struct dentry *dentry)
994{
995 struct ceph_dentry_info *di = ceph_dentry(dentry);
996 struct inode *parent_inode = dentry->d_parent->d_inode;
997
998 if (parent_inode) {
999 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1000
1001 spin_lock(&parent_inode->i_lock);
1002 if (ci->i_shared_gen == di->lease_shared_gen) {
1003 dout(" clearing %p complete (d_release)\n",
1004 parent_inode);
1005 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1006 ci->i_release_count++;
1007 }
1008 spin_unlock(&parent_inode->i_lock);
1009 }
1010 if (di) {
1011 ceph_dentry_lru_del(dentry);
1012 if (di->lease_session)
1013 ceph_put_mds_session(di->lease_session);
1014 kmem_cache_free(ceph_dentry_cachep, di);
1015 dentry->d_fsdata = NULL;
1016 }
1017}
1018
1019static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1020 struct nameidata *nd)
1021{
1022 /*
1023 * Eventually, we'll want to revalidate snapped metadata
1024 * too... probably...
1025 */
1026 return 1;
1027}
1028
1029
1030
1031/*
1032 * read() on a dir. This weird interface hack only works if mounted
1033 * with '-o dirstat'.
1034 */
1035static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1036 loff_t *ppos)
1037{
1038 struct ceph_file_info *cf = file->private_data;
1039 struct inode *inode = file->f_dentry->d_inode;
1040 struct ceph_inode_info *ci = ceph_inode(inode);
1041 int left;
1042
1043 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
1044 return -EISDIR;
1045
1046 if (!cf->dir_info) {
1047 cf->dir_info = kmalloc(1024, GFP_NOFS);
1048 if (!cf->dir_info)
1049 return -ENOMEM;
1050 cf->dir_info_len =
1051 sprintf(cf->dir_info,
1052 "entries: %20lld\n"
1053 " files: %20lld\n"
1054 " subdirs: %20lld\n"
1055 "rentries: %20lld\n"
1056 " rfiles: %20lld\n"
1057 " rsubdirs: %20lld\n"
1058 "rbytes: %20lld\n"
1059 "rctime: %10ld.%09ld\n",
1060 ci->i_files + ci->i_subdirs,
1061 ci->i_files,
1062 ci->i_subdirs,
1063 ci->i_rfiles + ci->i_rsubdirs,
1064 ci->i_rfiles,
1065 ci->i_rsubdirs,
1066 ci->i_rbytes,
1067 (long)ci->i_rctime.tv_sec,
1068 (long)ci->i_rctime.tv_nsec);
1069 }
1070
1071 if (*ppos >= cf->dir_info_len)
1072 return 0;
1073 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1074 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1075 if (left == size)
1076 return -EFAULT;
1077 *ppos += (size - left);
1078 return size - left;
1079}
1080
1081/*
1082 * an fsync() on a dir will wait for any uncommitted directory
1083 * operations to commit.
1084 */
1085static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
1086 int datasync)
1087{
1088 struct inode *inode = dentry->d_inode;
1089 struct ceph_inode_info *ci = ceph_inode(inode);
1090 struct list_head *head = &ci->i_unsafe_dirops;
1091 struct ceph_mds_request *req;
1092 u64 last_tid;
1093 int ret = 0;
1094
1095 dout("dir_fsync %p\n", inode);
1096 spin_lock(&ci->i_unsafe_lock);
1097 if (list_empty(head))
1098 goto out;
1099
1100 req = list_entry(head->prev,
1101 struct ceph_mds_request, r_unsafe_dir_item);
1102 last_tid = req->r_tid;
1103
1104 do {
1105 ceph_mdsc_get_request(req);
1106 spin_unlock(&ci->i_unsafe_lock);
1107 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1108 inode, req->r_tid, last_tid);
1109 if (req->r_timeout) {
1110 ret = wait_for_completion_timeout(
1111 &req->r_safe_completion, req->r_timeout);
1112 if (ret > 0)
1113 ret = 0;
1114 else if (ret == 0)
1115 ret = -EIO; /* timed out */
1116 } else {
1117 wait_for_completion(&req->r_safe_completion);
1118 }
1119 spin_lock(&ci->i_unsafe_lock);
1120 ceph_mdsc_put_request(req);
1121
1122 if (ret || list_empty(head))
1123 break;
1124 req = list_entry(head->next,
1125 struct ceph_mds_request, r_unsafe_dir_item);
1126 } while (req->r_tid < last_tid);
1127out:
1128 spin_unlock(&ci->i_unsafe_lock);
1129 return ret;
1130}
1131
1132/*
1133 * We maintain a private dentry LRU.
1134 *
1135 * FIXME: this needs to be changed to a per-mds lru to be useful.
1136 */
1137void ceph_dentry_lru_add(struct dentry *dn)
1138{
1139 struct ceph_dentry_info *di = ceph_dentry(dn);
1140 struct ceph_mds_client *mdsc;
1141
1142 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1143 dn->d_name.len, dn->d_name.name);
1144 if (di) {
1145 mdsc = &ceph_client(dn->d_sb)->mdsc;
1146 spin_lock(&mdsc->dentry_lru_lock);
1147 list_add_tail(&di->lru, &mdsc->dentry_lru);
1148 mdsc->num_dentry++;
1149 spin_unlock(&mdsc->dentry_lru_lock);
1150 }
1151}
1152
1153void ceph_dentry_lru_touch(struct dentry *dn)
1154{
1155 struct ceph_dentry_info *di = ceph_dentry(dn);
1156 struct ceph_mds_client *mdsc;
1157
1158 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
1159 dn->d_name.len, dn->d_name.name);
1160 if (di) {
1161 mdsc = &ceph_client(dn->d_sb)->mdsc;
1162 spin_lock(&mdsc->dentry_lru_lock);
1163 list_move_tail(&di->lru, &mdsc->dentry_lru);
1164 spin_unlock(&mdsc->dentry_lru_lock);
1165 }
1166}
1167
1168void ceph_dentry_lru_del(struct dentry *dn)
1169{
1170 struct ceph_dentry_info *di = ceph_dentry(dn);
1171 struct ceph_mds_client *mdsc;
1172
1173 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1174 dn->d_name.len, dn->d_name.name);
1175 if (di) {
1176 mdsc = &ceph_client(dn->d_sb)->mdsc;
1177 spin_lock(&mdsc->dentry_lru_lock);
1178 list_del_init(&di->lru);
1179 mdsc->num_dentry--;
1180 spin_unlock(&mdsc->dentry_lru_lock);
1181 }
1182}
1183
1184const struct file_operations ceph_dir_fops = {
1185 .read = ceph_read_dir,
1186 .readdir = ceph_readdir,
1187 .llseek = ceph_dir_llseek,
1188 .open = ceph_open,
1189 .release = ceph_release,
1190 .unlocked_ioctl = ceph_ioctl,
1191 .fsync = ceph_dir_fsync,
1192};
1193
1194const struct inode_operations ceph_dir_iops = {
1195 .lookup = ceph_lookup,
1196 .permission = ceph_permission,
1197 .getattr = ceph_getattr,
1198 .setattr = ceph_setattr,
1199 .setxattr = ceph_setxattr,
1200 .getxattr = ceph_getxattr,
1201 .listxattr = ceph_listxattr,
1202 .removexattr = ceph_removexattr,
1203 .mknod = ceph_mknod,
1204 .symlink = ceph_symlink,
1205 .mkdir = ceph_mkdir,
1206 .link = ceph_link,
1207 .unlink = ceph_unlink,
1208 .rmdir = ceph_unlink,
1209 .rename = ceph_rename,
1210 .create = ceph_create,
1211};
1212
1213struct dentry_operations ceph_dentry_ops = {
1214 .d_revalidate = ceph_d_revalidate,
1215 .d_release = ceph_dentry_release,
1216};
1217
1218struct dentry_operations ceph_snapdir_dentry_ops = {
1219 .d_revalidate = ceph_snapdir_d_revalidate,
1220};
1221
1222struct dentry_operations ceph_snap_dentry_ops = {
1223};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..9d67572fb328
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,224 @@
1#include "ceph_debug.h"
2
3#include <linux/exportfs.h>
4#include <linux/slab.h>
5#include <asm/unaligned.h>
6
7#include "super.h"
8
9/*
10 * NFS export support
11 *
12 * NFS re-export of a ceph mount is, at present, only semireliable.
13 * The basic issue is that the Ceph architecture doesn't lend itself
14 * well to generating filehandles that will remain valid forever.
15 *
16 * So, we do our best. If you're lucky, your inode will be in the
17 * client's cache. If it's not, and you have a connectable fh, then
18 * the MDS server may be able to find it for you. Otherwise, you get
19 * ESTALE.
20 *
21 * There are ways to make this more reliable, but in the non-connectable
22 * fh case, it won't ever work perfectly, and in the connectable case,
23 * some changes are needed on the MDS side to work better.
24 */
25
26/*
27 * Basic fh
28 */
29struct ceph_nfs_fh {
30 u64 ino;
31} __attribute__ ((packed));
32
33/*
34 * Larger 'connectable' fh that includes parent ino and name hash.
35 * Use this whenever possible, as it works more reliably.
36 */
37struct ceph_nfs_confh {
38 u64 ino, parent_ino;
39 u32 parent_name_hash;
40} __attribute__ ((packed));
41
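/*
 * With the packed attribute the basic fh is 8 bytes and the
 * connectable fh is 20; ceph_encode_fh() below emits whichever form
 * fits in the caller-supplied *max_len.
 */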
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable)
44{
45 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode;
49 int type;
50
51 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL;
54
55 if (*max_len >= sizeof(*cfh)) {
56 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh);
61 type = 2;
62 } else if (*max_len >= sizeof(*fh)) {
63 if (connectable)
64 return -ENOSPC;
65 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh);
68 type = 1;
69 } else {
70 return -ENOSPC;
71 }
72 return type;
73}
74
75/*
76 * convert regular fh to dentry
77 *
78 * FIXME: we should try harder by querying the mds for the ino.
79 */
80static struct dentry *__fh_to_dentry(struct super_block *sb,
81 struct ceph_nfs_fh *fh)
82{
83 struct inode *inode;
84 struct dentry *dentry;
85 struct ceph_vino vino;
86 int err;
87
88 dout("__fh_to_dentry %llx\n", fh->ino);
89 vino.ino = fh->ino;
90 vino.snap = CEPH_NOSNAP;
91 inode = ceph_find_inode(sb, vino);
92 if (!inode)
93 return ERR_PTR(-ESTALE);
94
95 dentry = d_obtain_alias(inode);
96 if (IS_ERR(dentry)) {
97 pr_err("fh_to_dentry %llx -- inode %p, d_obtain_alias failed\n",
98 fh->ino, inode);
99 /* d_obtain_alias() already dropped the inode reference */
100 return ERR_CAST(dentry);
101 }
102 err = ceph_init_dentry(dentry);
103
104 if (err < 0) {
105 dput(dentry); /* the alias holds our inode reference */
106 return ERR_PTR(err);
107 }
108 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
109 return dentry;
110}
111
112/*
113 * convert connectable fh to dentry
114 */
115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh)
117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
119 struct inode *inode;
120 struct dentry *dentry;
121 struct ceph_vino vino;
122 int err;
123
124 dout("__cfh_to_dentry %llx (%llx/%x)\n",
125 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
126
127 vino.ino = cfh->ino;
128 vino.snap = CEPH_NOSNAP;
129 inode = ceph_find_inode(sb, vino);
130 if (!inode) {
131 struct ceph_mds_request *req;
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS);
135 if (IS_ERR(req))
136 return ERR_CAST(req);
137
138 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino;
140 req->r_ino2.snap = CEPH_NOSNAP;
141 req->r_path2 = kmalloc(16, GFP_NOFS);
if (!req->r_path2) {
	ceph_mdsc_put_request(req);
	return ERR_PTR(-ENOMEM);
}
142 snprintf(req->r_path2, 16, "%u", cfh->parent_name_hash);
143 req->r_num_caps = 1;
144 err = ceph_mdsc_do_request(mdsc, NULL, req);
145 ceph_mdsc_put_request(req);
146 inode = ceph_find_inode(sb, vino);
147 if (!inode)
148 return ERR_PTR(err ? err : -ESTALE);
149 }
150
151 dentry = d_obtain_alias(inode);
152 if (IS_ERR(dentry)) {
153 pr_err("cfh_to_dentry %llx -- inode %p, d_obtain_alias failed\n",
154 cfh->ino, inode);
155 /* d_obtain_alias() already dropped the inode reference */
156 return ERR_CAST(dentry);
157 }
158 err = ceph_init_dentry(dentry);
159 if (err < 0) {
160 dput(dentry); /* the alias holds our inode reference */
161 return ERR_PTR(err);
162 }
163 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
164 return dentry;
165}
166
167static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
168 int fh_len, int fh_type)
169{
170 if (fh_type == 1)
171 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
172 else
173 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
174}
175
176/*
177 * get parent, if possible.
178 *
179 * FIXME: we could do better by querying the mds to discover the
180 * parent.
181 */
182static struct dentry *ceph_fh_to_parent(struct super_block *sb,
183 struct fid *fid,
184 int fh_len, int fh_type)
185{
186 struct ceph_nfs_confh *cfh = (void *)fid->raw;
187 struct ceph_vino vino;
188 struct inode *inode;
189 struct dentry *dentry;
190 int err;
191
192 if (fh_type == 1)
193 return ERR_PTR(-ESTALE);
194
195 pr_debug("fh_to_parent %llx/%x\n", cfh->parent_ino,
196 cfh->parent_name_hash);
197
198 vino.ino = cfh->parent_ino; /* look up the parent, not the child */
199 vino.snap = CEPH_NOSNAP;
200 inode = ceph_find_inode(sb, vino);
201 if (!inode)
202 return ERR_PTR(-ESTALE);
203
204 dentry = d_obtain_alias(inode);
205 if (IS_ERR(dentry)) {
206 pr_err("fh_to_parent %llx -- inode %p, d_obtain_alias failed\n",
207 cfh->parent_ino, inode);
208 /* d_obtain_alias() already dropped the inode reference */
209 return ERR_CAST(dentry);
210 }
211 err = ceph_init_dentry(dentry);
212 if (err < 0) {
213 dput(dentry); /* the alias holds our inode reference */
214 return ERR_PTR(err);
215 }
216 dout("fh_to_parent %llx %p dentry %p\n", cfh->parent_ino, inode, dentry);
217 return dentry;
218}
219
220const struct export_operations ceph_export_ops = {
221 .encode_fh = ceph_encode_fh,
222 .fh_to_dentry = ceph_fh_to_dentry,
223 .fh_to_parent = ceph_fh_to_parent,
224};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..4add3d5da2c1
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,938 @@
1#include "ceph_debug.h"
2
3#include <linux/sched.h>
4#include <linux/slab.h>
5#include <linux/file.h>
6#include <linux/namei.h>
7#include <linux/writeback.h>
8
9#include "super.h"
10#include "mds_client.h"
11
12/*
13 * Ceph file operations
14 *
15 * Implement basic open/close functionality, and implement
16 * read/write.
17 *
18 * We implement three modes of file I/O:
19 * - buffered uses the generic_file_aio_{read,write} helpers
20 *
21 * - synchronous is used when there is multi-client read/write
22 * sharing, avoids the page cache, and synchronously waits for an
23 * ack from the OSD.
24 *
25 * - direct io takes the variant of the sync path that references
26 * user pages directly.
27 *
28 * fsync() flushes and waits on dirty pages, but just queues metadata
29 * for writeback: since the MDS can recover size and mtime there is no
30 * need to wait for MDS acknowledgement.
31 */
32
33
34/*
35 * Prepare an open request. Preallocate ceph_cap to avoid an
36 * inopportune ENOMEM later.
37 */
38static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{
41 struct ceph_client *client = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc;
43 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
46
47 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
48 want_auth = USE_AUTH_MDS;
49
50 req = ceph_mdsc_create_request(mdsc, op, want_auth);
51 if (IS_ERR(req))
52 goto out;
53 req->r_fmode = ceph_flags_to_mode(flags);
54 req->r_args.open.flags = cpu_to_le32(flags);
55 req->r_args.open.mode = cpu_to_le32(create_mode);
56 req->r_args.open.preferred = cpu_to_le32(-1);
57out:
58 return req;
59}
60
61/*
62 * initialize private struct file data.
63 * if we fail, clean up by dropping the fmode reference on the ceph_inode
64 */
65static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
66{
67 struct ceph_file_info *cf;
68 int ret = 0;
69
70 switch (inode->i_mode & S_IFMT) {
71 case S_IFREG:
72 case S_IFDIR:
73 dout("init_file %p %p 0%o (regular)\n", inode, file,
74 inode->i_mode);
75 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
76 if (cf == NULL) {
77 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
78 return -ENOMEM;
79 }
80 cf->fmode = fmode;
81 cf->next_offset = 2;
82 file->private_data = cf;
83 BUG_ON(inode->i_fop->release != ceph_release);
84 break;
85
86 case S_IFLNK:
87 dout("init_file %p %p 0%o (symlink)\n", inode, file,
88 inode->i_mode);
89 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
90 break;
91
92 default:
93 dout("init_file %p %p 0%o (special)\n", inode, file,
94 inode->i_mode);
95 /*
96 * we need to drop the open ref now, since we don't
97 * have .release set to ceph_release.
98 */
99 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
100 BUG_ON(inode->i_fop->release == ceph_release);
101
102 /* call the proper open fop */
103 ret = inode->i_fop->open(inode, file);
104 }
105 return ret;
106}
107
108/*
109 * If the filp already has private_data, that means the file was
110 * already opened by intent during lookup, and we do nothing.
111 *
112 * If we already have the requisite capabilities, we can satisfy
113 * the open request locally (no need to request new caps from the
114 * MDS). We do, however, need to inform the MDS (asynchronously)
115 * if our wanted caps set expands.
116 */
117int ceph_open(struct inode *inode, struct file *file)
118{
119 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc;
122 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
125 int err;
126 int flags, fmode, wanted;
127
128 if (cf) {
129 dout("open file %p is already opened\n", file);
130 return 0;
131 }
132
133 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
134 flags = file->f_flags & ~(O_CREAT|O_EXCL);
135 if (S_ISDIR(inode->i_mode))
136 flags = O_DIRECTORY; /* mds likes to know */
137
138 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
139 ceph_vinop(inode), file, flags, file->f_flags);
140 fmode = ceph_flags_to_mode(flags);
141 wanted = ceph_caps_for_mode(fmode);
142
143 /* snapped files are read-only */
144 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
145 return -EROFS;
146
147 /* trivially open snapdir */
148 if (ceph_snap(inode) == CEPH_SNAPDIR) {
149 spin_lock(&inode->i_lock);
150 __ceph_get_fmode(ci, fmode);
151 spin_unlock(&inode->i_lock);
152 return ceph_init_file(inode, file, fmode);
153 }
154
155 /*
156 * No need to block if we have any caps. Update wanted set
157 * asynchronously.
158 */
159 spin_lock(&inode->i_lock);
160 if (__ceph_is_any_real_caps(ci)) {
161 int mds_wanted = __ceph_caps_mds_wanted(ci);
162 int issued = __ceph_caps_issued(ci, NULL);
163
164 dout("open %p fmode %d want %s issued %s using existing\n",
165 inode, fmode, ceph_cap_string(wanted),
166 ceph_cap_string(issued));
167 __ceph_get_fmode(ci, fmode);
168 spin_unlock(&inode->i_lock);
169
170 /* adjust wanted? */
171 if ((issued & wanted) != wanted &&
172 (mds_wanted & wanted) != wanted &&
173 ceph_snap(inode) != CEPH_SNAPDIR)
174 ceph_check_caps(ci, 0, NULL);
175
176 return ceph_init_file(inode, file, fmode);
177 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
178 (ci->i_snap_caps & wanted) == wanted) {
179 __ceph_get_fmode(ci, fmode);
180 spin_unlock(&inode->i_lock);
181 return ceph_init_file(inode, file, fmode);
182 }
183 spin_unlock(&inode->i_lock);
184
185 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
186 req = prepare_open_request(inode->i_sb, flags, 0);
187 if (IS_ERR(req)) {
188 err = PTR_ERR(req);
189 goto out;
190 }
191 req->r_inode = igrab(inode);
192 req->r_num_caps = 1;
193 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
194 if (!err)
195 err = ceph_init_file(inode, file, req->r_fmode);
196 ceph_mdsc_put_request(req);
197 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
198out:
199 return err;
200}
201
202
203/*
204 * Do a lookup + open with a single request.
205 *
206 * If this succeeds, but some subsequent check in the vfs
207 * may_open() fails, the struct *file gets cleaned up (i.e.
208 * ceph_release gets called). So fear not!
209 */
210/*
211 * flags
212 * path_lookup_open -> LOOKUP_OPEN
213 * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
214 */
215struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode,
217 int locked_dir)
218{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc;
221 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req;
224 int err;
225 int flags = nd->intent.open.flags - 1; /* silly vfs! */
226
227 dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
228 dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
229
230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req))
233 return ERR_CAST(req);
234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2;
236 if (flags & O_CREAT) {
237 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
238 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
239 }
240 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
241 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
242 dentry = ceph_finish_lookup(req, dentry, err);
243 if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
244 err = ceph_handle_notrace_create(dir, dentry);
245 if (!err)
246 err = ceph_init_file(req->r_dentry->d_inode, file,
247 req->r_fmode);
248 ceph_mdsc_put_request(req);
249 dout("ceph_lookup_open result=%p\n", dentry);
250 return dentry;
251}
252
253int ceph_release(struct inode *inode, struct file *file)
254{
255 struct ceph_inode_info *ci = ceph_inode(inode);
256 struct ceph_file_info *cf = file->private_data;
257
258 dout("release inode %p file %p\n", inode, file);
259 ceph_put_fmode(ci, cf->fmode);
260 if (cf->last_readdir)
261 ceph_mdsc_put_request(cf->last_readdir);
262 kfree(cf->last_name);
263 kfree(cf->dir_info);
264 dput(cf->dentry);
265 kmem_cache_free(ceph_file_cachep, cf);
266
267 /* wake up anyone waiting for caps on this inode */
268 wake_up(&ci->i_cap_wq);
269 return 0;
270}
271
272/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
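/* pages pinned by get_direct_page_vector() are released via put_page_vector() */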
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
318 * allocate a vector of new pages
319 */
320static struct page **alloc_page_vector(int num_pages)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
365
366/*
367 * copy data from a page vector into a user buffer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
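/*
 * For example (assuming 4 KB pages): off=1000, len=10000 zeroes bytes
 * 1000..4095 of page 0, all of page 1, and bytes 0..2807 of page 2.
 */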
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.)
432 *
433 * If we get a short result from the OSD, check against i_size; we need to
434 * only return a short read to the caller if we hit EOF.
435 */
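/*
 * For example (hypothetical 4 MB object layout): a 6 MB read at offset
 * 0 becomes a 4 MB read of the first object followed by a 2 MB read of
 * the next; gaps and short tails that fall inside i_size are
 * zero-filled below.
 */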
436static int striped_read(struct inode *inode,
437 u64 off, u64 len,
438 struct page **pages, int num_pages,
439 int *checkeof)
440{
441 struct ceph_client *client = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
445 int left, pages_left;
446 int read;
447 struct page **page_pos;
448 int ret;
449 bool hit_stripe, was_short;
450
451 /*
452 * we may need to do multiple reads. not atomic, unfortunately.
453 */
454 pos = off;
455 left = len;
456 page_pos = pages;
457 pages_left = num_pages;
458 read = 0;
459
460more:
461 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq,
465 ci->i_truncate_size,
466 page_pos, pages_left);
467 hit_stripe = this_len < left;
468 was_short = ret >= 0 && ret < this_len;
469 if (ret == -ENOENT)
470 ret = 0;
471 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
472 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
473
474 if (ret > 0) {
475 int didpages =
476 ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
477
478 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read,
481 pos - off - read, pages);
482 }
483 pos += ret;
484 read = pos - off;
485 left -= ret;
486 page_pos += didpages;
487 pages_left -= didpages;
488
489 /* hit stripe? */
490 if (left && hit_stripe)
491 goto more;
492 }
493
494 if (was_short) {
495 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) {
497 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read,
499 pages);
500 read = len;
501 goto out;
502 }
503
504 /* check i_size */
505 *checkeof = 1;
506 }
507
508out:
509 if (ret >= 0)
510 ret = read;
511 dout("striped_read returns %d\n", ret);
512 return ret;
513}
514
515/*
516 * Completely synchronous read and write methods. Direct from __user
517 * buffer to osd, or directly to user pages (if O_DIRECT).
518 *
519 * If the read spans object boundary, just do multiple reads.
520 */
521static ssize_t ceph_sync_read(struct file *file, char __user *data,
522 unsigned len, loff_t *poff, int *checkeof)
523{
524 struct inode *inode = file->f_dentry->d_inode;
525 struct page **pages;
526 u64 off = *poff;
527 int num_pages = calc_pages_for(off, len);
528 int ret;
529
530 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532
533 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len);
535
536		/*
537		 * Flush any page cache pages in this range (done just
538		 * below via filemap_write_and_wait).  This makes mixed
539		 * buffered and O_DIRECT io slow, but at least it behaves
540		 * sensibly when the two are used in sequence.
541		 */
542 } else {
543 pages = alloc_page_vector(num_pages);
544 }
545 if (IS_ERR(pages))
546 return PTR_ERR(pages);
547
548 ret = filemap_write_and_wait(inode->i_mapping);
549 if (ret < 0)
550 goto done;
551
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0)
557 *poff = off + ret;
558
559done:
560 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages);
562 else
563 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret);
565 return ret;
566}
567
568/*
569 * Write commit callback, called if we requested both an ACK and
570 * ONDISK commit reply from the OSD.
571 */
572static void sync_write_commit(struct ceph_osd_request *req,
573 struct ceph_msg *msg)
574{
575 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
576
577 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
578 spin_lock(&ci->i_unsafe_lock);
579 list_del_init(&req->r_unsafe_item);
580 spin_unlock(&ci->i_unsafe_lock);
581 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
582}
583
584/*
585 * Synchronous write, straight from __user pointer or user pages (if
586 * O_DIRECT).
587 *
588 * If write spans object boundary, just do multiple writes. (For a
589 * correct atomic write, we should e.g. take write locks on all
590 * objects, rollback on failure, etc.)
591 */
592static ssize_t ceph_sync_write(struct file *file, const char __user *data,
593 size_t left, loff_t *offset)
594{
595 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req;
599 struct page **pages;
600 int num_pages;
601	unsigned long long pos;
602 u64 len;
603 int written = 0;
604 int flags;
605 int do_sync = 0;
606 int check_caps = 0;
607 int ret;
608 struct timespec mtime = CURRENT_TIME;
609
610 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
611 return -EROFS;
612
613 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
614 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
615
616 if (file->f_flags & O_APPEND)
617 pos = i_size_read(inode);
618 else
619 pos = *offset;
620
621 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
622 if (ret < 0)
623 return ret;
624
625 ret = invalidate_inode_pages2_range(inode->i_mapping,
626 pos >> PAGE_CACHE_SHIFT,
627 (pos + left) >> PAGE_CACHE_SHIFT);
628 if (ret < 0)
629 dout("invalidate_inode_pages2_range returned %d\n", ret);
630
631 flags = CEPH_OSD_FLAG_ORDERSNAP |
632 CEPH_OSD_FLAG_ONDISK |
633 CEPH_OSD_FLAG_WRITE;
634 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
635 flags |= CEPH_OSD_FLAG_ACK;
636 else
637 do_sync = 1;
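	/*
	 * With ACK set we may return once the OSD acknowledges the
	 * write, learning of the final commit via the safe callback;
	 * for O_SYNC/O_DIRECT we wait for the ONDISK reply instead.
	 */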
638
639 /*
640 * we may need to do multiple writes here if we span an object
641 * boundary. this isn't atomic, unfortunately. :(
642 */
643more:
644 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context,
649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2);
652 if (IS_ERR(req))
653 return PTR_ERR(req);
654
655 num_pages = calc_pages_for(pos, len);
656
657 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages);
661 goto out;
662 }
663
664 /*
665 * throw out any page cache pages in this range. this
666 * may block.
667 */
668 truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
669 } else {
670 pages = alloc_page_vector(num_pages);
671 if (IS_ERR(pages)) {
672 ret = PTR_ERR(pages);
673 goto out;
674 }
675 ret = copy_user_to_page_vector(pages, data, pos, len);
676 if (ret < 0) {
677 ceph_release_page_vector(pages, num_pages);
678 goto out;
679 }
680
681 if ((file->f_flags & O_SYNC) == 0) {
682 /* get a second commit callback */
683 req->r_safe_callback = sync_write_commit;
684 req->r_own_pages = 1;
685 }
686 }
687 req->r_pages = pages;
688 req->r_num_pages = num_pages;
689 req->r_inode = inode;
690
691 ret = ceph_osdc_start_request(&client->osdc, req, false);
692 if (!ret) {
693 if (req->r_safe_callback) {
694 /*
695 * Add to inode unsafe list only after we
696 * start_request so that a tid has been assigned.
697 */
698 spin_lock(&ci->i_unsafe_lock);
699			list_add_tail(&req->r_unsafe_item, &ci->i_unsafe_writes);
700 spin_unlock(&ci->i_unsafe_lock);
701 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
702 }
703 ret = ceph_osdc_wait_request(&client->osdc, req);
704 }
705
706 if (file->f_flags & O_DIRECT)
707 put_page_vector(pages, num_pages);
708 else if (file->f_flags & O_SYNC)
709 ceph_release_page_vector(pages, num_pages);
710
711out:
712 ceph_osdc_put_request(req);
713 if (ret == 0) {
714 pos += len;
715 written += len;
716 left -= len;
717 if (left)
718 goto more;
719
720 ret = written;
721 *offset = pos;
722 if (pos > i_size_read(inode))
723 check_caps = ceph_inode_set_size(inode, pos);
724 if (check_caps)
725 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
726 NULL);
727 }
728 return ret;
729}
730
731/*
732 * Wrap generic_file_aio_read with checks for cap bits on the inode.
733 * Atomically grab references, so that those bits are not released
734 * back to the MDS mid-read.
735 *
736 * Hmm, the sync read case isn't actually async... should it be?
737 */
738static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
739 unsigned long nr_segs, loff_t pos)
740{
741 struct file *filp = iocb->ki_filp;
742 loff_t *ppos = &iocb->ki_pos;
743 size_t len = iov->iov_len;
744 struct inode *inode = filp->f_dentry->d_inode;
745 struct ceph_inode_info *ci = ceph_inode(inode);
746 void *base = iov->iov_base;
747 ssize_t ret;
748 int got = 0;
749 int checkeof = 0, read = 0;
750
751 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
752 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
753again:
754 __ceph_do_pending_vmtruncate(inode);
755 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
756 &got, -1);
757 if (ret < 0)
758 goto out;
759 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
760 inode, ceph_vinop(inode), pos, (unsigned)len,
761 ceph_cap_string(got));
762
763 if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
764 (iocb->ki_filp->f_flags & O_DIRECT) ||
765 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
766 /* hmm, this isn't really async... */
767 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
768 else
769 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
770
771out:
772 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
773 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
774 ceph_put_cap_refs(ci, got);
775
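	/*
	 * A short sync read may mean we raced with a truncate, or that
	 * we read a sparse hole; re-check the authoritative size and,
	 * if the file extends past what we got, loop and read more.
	 */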
776 if (checkeof && ret >= 0) {
777 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
778
779 /* hit EOF or hole? */
780 if (statret == 0 && *ppos < inode->i_size) {
781 dout("aio_read sync_read hit hole, reading more\n");
782 read += ret;
783 base += ret;
784 len -= ret;
785 checkeof = 0;
786 goto again;
787 }
788 }
789 if (ret >= 0)
790 ret += read;
791
792 return ret;
793}
794
795/*
796 * Take cap references to avoid releasing caps to MDS mid-write.
797 *
798 * If we are synchronous, and write with an old snap context, the OSD
799 * may return EOLDSNAPC. In that case, retry the write.. _after_
800 * dropping our cap refs and allowing the pending snap to logically
801 * complete _before_ this write occurs.
802 *
803 * If we are near ENOSPC, write synchronously.
804 */
805static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
806 unsigned long nr_segs, loff_t pos)
807{
808 struct file *file = iocb->ki_filp;
809 struct inode *inode = file->f_dentry->d_inode;
810 struct ceph_inode_info *ci = ceph_inode(inode);
811 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
812 loff_t endoff = pos + iov->iov_len;
813 int got = 0;
814 int ret, err;
815
816 if (ceph_snap(inode) != CEPH_NOSNAP)
817 return -EROFS;
818
819retry_snap:
820 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
821 return -ENOSPC;
822 __ceph_do_pending_vmtruncate(inode);
823 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
824 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
825 inode->i_size);
826 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
827 &got, endoff);
828 if (ret < 0)
829 goto out;
830
831 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
832 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
833 ceph_cap_string(got));
834
835 if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
836 (iocb->ki_filp->f_flags & O_DIRECT) ||
837 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
838 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
839 &iocb->ki_pos);
840 } else {
841 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
842
843 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
844 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
845 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
846 err = vfs_fsync_range(file, file->f_path.dentry,
847 pos, pos + ret - 1, 1);
848 if (err < 0)
849 ret = err;
850 }
851 }
852 if (ret >= 0) {
853 spin_lock(&inode->i_lock);
854 __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
855 spin_unlock(&inode->i_lock);
856 }
857
858out:
859 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
860 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
861 ceph_cap_string(got));
862 ceph_put_cap_refs(ci, got);
863
864 if (ret == -EOLDSNAPC) {
865 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
866 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
867 goto retry_snap;
868 }
869
870 return ret;
871}
872
873/*
874 * llseek. be sure to verify file size on SEEK_END.
875 */
876static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
877{
878 struct inode *inode = file->f_mapping->host;
879 int ret;
880
881 mutex_lock(&inode->i_mutex);
882 __ceph_do_pending_vmtruncate(inode);
883 switch (origin) {
884 case SEEK_END:
885 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
886 if (ret < 0) {
887 offset = ret;
888 goto out;
889 }
890 offset += inode->i_size;
891 break;
892 case SEEK_CUR:
893 /*
894 * Here we special-case the lseek(fd, 0, SEEK_CUR)
895 * position-querying operation. Avoid rewriting the "same"
896 * f_pos value back to the file because a concurrent read(),
897		 * write() or lseek() might have altered it.
898 */
899 if (offset == 0) {
900 offset = file->f_pos;
901 goto out;
902 }
903 offset += file->f_pos;
904 break;
905 }
906
907 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
908 offset = -EINVAL;
909 goto out;
910 }
911
912 /* Special lock needed here? */
913 if (offset != file->f_pos) {
914 file->f_pos = offset;
915 file->f_version = 0;
916 }
917
918out:
919 mutex_unlock(&inode->i_mutex);
920 return offset;
921}
922
923const struct file_operations ceph_file_fops = {
924 .open = ceph_open,
925 .release = ceph_release,
926 .llseek = ceph_llseek,
927 .read = do_sync_read,
928 .write = do_sync_write,
929 .aio_read = ceph_aio_read,
930 .aio_write = ceph_aio_write,
931 .mmap = ceph_mmap,
932 .fsync = ceph_fsync,
933 .splice_read = generic_file_splice_read,
934 .splice_write = generic_file_splice_write,
935 .unlocked_ioctl = ceph_ioctl,
936 .compat_ioctl = ceph_ioctl,
937};
938
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..aca82d55cc53
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1766 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
10#include <linux/namei.h>
11#include <linux/writeback.h>
12#include <linux/vmalloc.h>
13#include <linux/pagevec.h>
14
15#include "super.h"
16#include "decode.h"
17
18/*
19 * Ceph inode operations
20 *
21 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
22 * setattr, etc.), xattr helpers, and helpers for assimilating
23 * metadata returned by the MDS into our cache.
24 *
25 * Also define helpers for doing asynchronous writeback, invalidation,
26 * and truncation for the benefit of those who can't afford to block
27 * (typically because they are in the message handler path).
28 */
29
30static const struct inode_operations ceph_symlink_iops;
31
32static void ceph_invalidate_work(struct work_struct *work);
33static void ceph_writeback_work(struct work_struct *work);
34static void ceph_vmtruncate_work(struct work_struct *work);
35
36/*
37 * find or create an inode, given the ceph ino number
38 */
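/*
 * The (ino, snap) pair is mapped down to an ino_t key for
 * iget5_locked(); ceph_ino_compare() then matches on the full vino.
 */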
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{
41 struct inode *inode;
42 ino_t t = ceph_vino_to_ino(vino);
43
44 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
45 if (inode == NULL)
46 return ERR_PTR(-ENOMEM);
47 if (inode->i_state & I_NEW) {
48 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
49 inode, ceph_vinop(inode), (u64)inode->i_ino);
50 unlock_new_inode(inode);
51 }
52
53 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
54 vino.snap, inode);
55 return inode;
56}
57
58/*
59 * get/construct snapdir inode for a given directory
60 */
61struct inode *ceph_get_snapdir(struct inode *parent)
62{
63 struct ceph_vino vino = {
64 .ino = ceph_ino(parent),
65 .snap = CEPH_SNAPDIR,
66 };
67 struct inode *inode = ceph_get_inode(parent->i_sb, vino);
68 struct ceph_inode_info *ci = ceph_inode(inode);
69
70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode))
72		return ERR_CAST(inode);
73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid;
76 inode->i_op = &ceph_dir_iops;
77 inode->i_fop = &ceph_dir_fops;
78 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
79 ci->i_rbytes = 0;
80 return inode;
81}
82
83const struct inode_operations ceph_file_iops = {
84 .permission = ceph_permission,
85 .setattr = ceph_setattr,
86 .getattr = ceph_getattr,
87 .setxattr = ceph_setxattr,
88 .getxattr = ceph_getxattr,
89 .listxattr = ceph_listxattr,
90 .removexattr = ceph_removexattr,
91};
92
93
94/*
95 * We use a 'frag tree' to keep track of the MDS's directory fragments
96 * for a given inode (usually there is just a single fragment). We
97 * need to know when a child frag is delegated to a new MDS, or when
98 * it is flagged as replicated, so we can direct our requests
99 * accordingly.
100 */
101
102/*
103 * find/create a frag in the tree
104 */
105static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
106 u32 f)
107{
108 struct rb_node **p;
109 struct rb_node *parent = NULL;
110 struct ceph_inode_frag *frag;
111 int c;
112
113 p = &ci->i_fragtree.rb_node;
114 while (*p) {
115 parent = *p;
116 frag = rb_entry(parent, struct ceph_inode_frag, node);
117 c = ceph_frag_compare(f, frag->frag);
118 if (c < 0)
119 p = &(*p)->rb_left;
120 else if (c > 0)
121 p = &(*p)->rb_right;
122 else
123 return frag;
124 }
125
126 frag = kmalloc(sizeof(*frag), GFP_NOFS);
127 if (!frag) {
128 pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
129 "frag %x\n", &ci->vfs_inode,
130 ceph_vinop(&ci->vfs_inode), f);
131 return ERR_PTR(-ENOMEM);
132 }
133 frag->frag = f;
134 frag->split_by = 0;
135 frag->mds = -1;
136 frag->ndist = 0;
137
138 rb_link_node(&frag->node, parent, p);
139 rb_insert_color(&frag->node, &ci->i_fragtree);
140
141 dout("get_or_create_frag added %llx.%llx frag %x\n",
142 ceph_vinop(&ci->vfs_inode), f);
143 return frag;
144}
145
146/*
147 * find a specific frag @f
148 */
149struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
150{
151 struct rb_node *n = ci->i_fragtree.rb_node;
152
153 while (n) {
154 struct ceph_inode_frag *frag =
155 rb_entry(n, struct ceph_inode_frag, node);
156 int c = ceph_frag_compare(f, frag->frag);
157 if (c < 0)
158 n = n->rb_left;
159 else if (c > 0)
160 n = n->rb_right;
161 else
162 return frag;
163 }
164 return NULL;
165}
166
167/*
168 * Choose frag containing the given value @v. If @pfrag is
169 * specified, copy the frag delegation info to the caller if
170 * it is present.
171 */
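/*
 * For example, if the root frag is split by 2 bits, @v descends into
 * one of the four children ceph_frag_make_child(t, 2, 0..3), and so
 * on, until an unsplit (leaf) frag is reached.
 */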
172u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
173 struct ceph_inode_frag *pfrag,
174 int *found)
175{
176 u32 t = ceph_frag_make(0, 0);
177 struct ceph_inode_frag *frag;
178 unsigned nway, i;
179 u32 n;
180
181 if (found)
182 *found = 0;
183
184 mutex_lock(&ci->i_fragtree_mutex);
185 while (1) {
186 WARN_ON(!ceph_frag_contains_value(t, v));
187 frag = __ceph_find_frag(ci, t);
188 if (!frag)
189 break; /* t is a leaf */
190 if (frag->split_by == 0) {
191 if (pfrag)
192 memcpy(pfrag, frag, sizeof(*pfrag));
193 if (found)
194 *found = 1;
195 break;
196 }
197
198 /* choose child */
199 nway = 1 << frag->split_by;
200 dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
201 frag->split_by, nway);
202 for (i = 0; i < nway; i++) {
203 n = ceph_frag_make_child(t, frag->split_by, i);
204 if (ceph_frag_contains_value(n, v)) {
205 t = n;
206 break;
207 }
208 }
209 BUG_ON(i == nway);
210 }
211 dout("choose_frag(%x) = %x\n", v, t);
212
213 mutex_unlock(&ci->i_fragtree_mutex);
214 return t;
215}
216
217/*
218 * Process dirfrag (delegation) info from the mds.  Include a leaf
219 * fragment in the tree ONLY if ndist > 0.  Otherwise, only
220 * branches/splits are kept in i_fragtree.
221 */
222static int ceph_fill_dirfrag(struct inode *inode,
223 struct ceph_mds_reply_dirfrag *dirinfo)
224{
225 struct ceph_inode_info *ci = ceph_inode(inode);
226 struct ceph_inode_frag *frag;
227 u32 id = le32_to_cpu(dirinfo->frag);
228 int mds = le32_to_cpu(dirinfo->auth);
229 int ndist = le32_to_cpu(dirinfo->ndist);
230 int i;
231 int err = 0;
232
233 mutex_lock(&ci->i_fragtree_mutex);
234 if (ndist == 0) {
235 /* no delegation info needed. */
236 frag = __ceph_find_frag(ci, id);
237 if (!frag)
238 goto out;
239 if (frag->split_by == 0) {
240 /* tree leaf, remove */
241 dout("fill_dirfrag removed %llx.%llx frag %x"
242 " (no ref)\n", ceph_vinop(inode), id);
243 rb_erase(&frag->node, &ci->i_fragtree);
244 kfree(frag);
245 } else {
246 /* tree branch, keep and clear */
247 dout("fill_dirfrag cleared %llx.%llx frag %x"
248 " referral\n", ceph_vinop(inode), id);
249 frag->mds = -1;
250 frag->ndist = 0;
251 }
252 goto out;
253 }
254
255
256 /* find/add this frag to store mds delegation info */
257 frag = __get_or_create_frag(ci, id);
258 if (IS_ERR(frag)) {
259 /* this is not the end of the world; we can continue
260 with bad/inaccurate delegation info */
261 pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
262 ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
263 err = -ENOMEM;
264 goto out;
265 }
266
267 frag->mds = mds;
268 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
269 for (i = 0; i < frag->ndist; i++)
270 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
271 dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
272 ceph_vinop(inode), frag->frag, frag->ndist);
273
274out:
275 mutex_unlock(&ci->i_fragtree_mutex);
276 return err;
277}
278
279
280/*
281 * initialize a newly allocated inode.
282 */
283struct inode *ceph_alloc_inode(struct super_block *sb)
284{
285 struct ceph_inode_info *ci;
286 int i;
287
288 ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
289 if (!ci)
290 return NULL;
291
292 dout("alloc_inode %p\n", &ci->vfs_inode);
293
294 ci->i_version = 0;
295 ci->i_time_warp_seq = 0;
296 ci->i_ceph_flags = 0;
297 ci->i_release_count = 0;
298 ci->i_symlink = NULL;
299
300 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex);
302
303 ci->i_xattrs.blob = NULL;
304 ci->i_xattrs.prealloc_blob = NULL;
305 ci->i_xattrs.dirty = false;
306 ci->i_xattrs.index = RB_ROOT;
307 ci->i_xattrs.count = 0;
308 ci->i_xattrs.names_size = 0;
309 ci->i_xattrs.vals_size = 0;
310 ci->i_xattrs.version = 0;
311 ci->i_xattrs.index_version = 0;
312
313 ci->i_caps = RB_ROOT;
314 ci->i_auth_cap = NULL;
315 ci->i_dirty_caps = 0;
316 ci->i_flushing_caps = 0;
317 INIT_LIST_HEAD(&ci->i_dirty_item);
318 INIT_LIST_HEAD(&ci->i_flushing_item);
319 ci->i_cap_flush_seq = 0;
320 ci->i_cap_flush_last_tid = 0;
321 memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
322 init_waitqueue_head(&ci->i_cap_wq);
323 ci->i_hold_caps_min = 0;
324 ci->i_hold_caps_max = 0;
325 INIT_LIST_HEAD(&ci->i_cap_delay_list);
326 ci->i_cap_exporting_mds = 0;
327 ci->i_cap_exporting_mseq = 0;
328 ci->i_cap_exporting_issued = 0;
329 INIT_LIST_HEAD(&ci->i_cap_snaps);
330 ci->i_head_snapc = NULL;
331 ci->i_snap_caps = 0;
332
333 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
334 ci->i_nr_by_mode[i] = 0;
335
336 ci->i_truncate_seq = 0;
337 ci->i_truncate_size = 0;
338 ci->i_truncate_pending = 0;
339
340 ci->i_max_size = 0;
341 ci->i_reported_size = 0;
342 ci->i_wanted_max_size = 0;
343 ci->i_requested_max_size = 0;
344
345 ci->i_pin_ref = 0;
346 ci->i_rd_ref = 0;
347 ci->i_rdcache_ref = 0;
348 ci->i_wr_ref = 0;
349 ci->i_wrbuffer_ref = 0;
350 ci->i_wrbuffer_ref_head = 0;
351 ci->i_shared_gen = 0;
352 ci->i_rdcache_gen = 0;
353 ci->i_rdcache_revoking = 0;
354
355 INIT_LIST_HEAD(&ci->i_unsafe_writes);
356 INIT_LIST_HEAD(&ci->i_unsafe_dirops);
357 spin_lock_init(&ci->i_unsafe_lock);
358
359 ci->i_snap_realm = NULL;
360 INIT_LIST_HEAD(&ci->i_snap_realm_item);
361 INIT_LIST_HEAD(&ci->i_snap_flush_item);
362
363 INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
364 INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
365
366 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
367
368 return &ci->vfs_inode;
369}
370
371void ceph_destroy_inode(struct inode *inode)
372{
373 struct ceph_inode_info *ci = ceph_inode(inode);
374 struct ceph_inode_frag *frag;
375 struct rb_node *n;
376
377 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
378
379 ceph_queue_caps_release(inode);
380
381 /*
382 * we may still have a snap_realm reference if there are stray
383 * caps in i_cap_exporting_issued or i_snap_caps.
384 */
385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389
390 dout(" dropping residual ref to snap realm %p\n", realm);
391 spin_lock(&realm->inodes_with_caps_lock);
392 list_del_init(&ci->i_snap_realm_item);
393 spin_unlock(&realm->inodes_with_caps_lock);
394 ceph_put_snap_realm(mdsc, realm);
395 }
396
397 kfree(ci->i_symlink);
398 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
399 frag = rb_entry(n, struct ceph_inode_frag, node);
400 rb_erase(n, &ci->i_fragtree);
401 kfree(frag);
402 }
403
404 __ceph_destroy_xattrs(ci);
405 if (ci->i_xattrs.blob)
406 ceph_buffer_put(ci->i_xattrs.blob);
407 if (ci->i_xattrs.prealloc_blob)
408 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
409
410 kmem_cache_free(ceph_inode_cachep, ci);
411}
412
413
414/*
415 * Helpers to fill in size, ctime, mtime, and atime. We have to be
416 * careful because either the client or MDS may have more up to date
417 * info, depending on which capabilities are held, and whether
418 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
419 * and size are monotonically increasing, except when utimes() or
420 * truncate() increments the corresponding _seq values.)
421 */
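/*
 * For example: if another client truncated the file (bumping
 * truncate_seq), we adopt the MDS's size and may queue a vmtruncate to
 * drop stale pages; if we buffered an append under our own caps, we
 * keep our larger local i_size.
 */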
422int ceph_fill_file_size(struct inode *inode, int issued,
423 u32 truncate_seq, u64 truncate_size, u64 size)
424{
425 struct ceph_inode_info *ci = ceph_inode(inode);
426 int queue_trunc = 0;
427
428 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
429 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
430 dout("size %lld -> %llu\n", inode->i_size, size);
431 inode->i_size = size;
432 inode->i_blocks = (size + (1<<9) - 1) >> 9;
433 ci->i_reported_size = size;
434 if (truncate_seq != ci->i_truncate_seq) {
435 dout("truncate_seq %u -> %u\n",
436 ci->i_truncate_seq, truncate_seq);
437 ci->i_truncate_seq = truncate_seq;
438 /*
439 * If we hold relevant caps, or in the case where we're
440 * not the only client referencing this file and we
441 * don't hold those caps, then we need to check whether
442			 * the file is either open or mmapped
443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) ||
447 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++;
450 queue_trunc = 1;
451 }
452 }
453 }
454 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
455 ci->i_truncate_size != truncate_size) {
456 dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
457 truncate_size);
458 ci->i_truncate_size = truncate_size;
459 }
460 return queue_trunc;
461}
462
463void ceph_fill_file_time(struct inode *inode, int issued,
464 u64 time_warp_seq, struct timespec *ctime,
465 struct timespec *mtime, struct timespec *atime)
466{
467 struct ceph_inode_info *ci = ceph_inode(inode);
468 int warn = 0;
469
470 if (issued & (CEPH_CAP_FILE_EXCL|
471 CEPH_CAP_FILE_WR|
472 CEPH_CAP_FILE_BUFFER)) {
473 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
474 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
475 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
476 ctime->tv_sec, ctime->tv_nsec);
477 inode->i_ctime = *ctime;
478 }
479 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
480 /* the MDS did a utimes() */
481 dout("mtime %ld.%09ld -> %ld.%09ld "
482 "tw %d -> %d\n",
483 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
484 mtime->tv_sec, mtime->tv_nsec,
485 ci->i_time_warp_seq, (int)time_warp_seq);
486
487 inode->i_mtime = *mtime;
488 inode->i_atime = *atime;
489 ci->i_time_warp_seq = time_warp_seq;
490 } else if (time_warp_seq == ci->i_time_warp_seq) {
491 /* nobody did utimes(); take the max */
492 if (timespec_compare(mtime, &inode->i_mtime) > 0) {
493 dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
494 inode->i_mtime.tv_sec,
495 inode->i_mtime.tv_nsec,
496 mtime->tv_sec, mtime->tv_nsec);
497 inode->i_mtime = *mtime;
498 }
499 if (timespec_compare(atime, &inode->i_atime) > 0) {
500 dout("atime %ld.%09ld -> %ld.%09ld inc\n",
501 inode->i_atime.tv_sec,
502 inode->i_atime.tv_nsec,
503 atime->tv_sec, atime->tv_nsec);
504 inode->i_atime = *atime;
505 }
506 } else if (issued & CEPH_CAP_FILE_EXCL) {
507 /* we did a utimes(); ignore mds values */
508 } else {
509 warn = 1;
510 }
511 } else {
512 /* we have no write caps; whatever the MDS says is true */
513 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
514 inode->i_ctime = *ctime;
515 inode->i_mtime = *mtime;
516 inode->i_atime = *atime;
517 ci->i_time_warp_seq = time_warp_seq;
518 } else {
519 warn = 1;
520 }
521 }
522 if (warn) /* time_warp_seq shouldn't go backwards */
523 dout("%p mds time_warp_seq %llu < %u\n",
524 inode, time_warp_seq, ci->i_time_warp_seq);
525}
526
527/*
528 * Populate an inode based on info from mds. May be called on new or
529 * existing inodes.
530 */
531static int fill_inode(struct inode *inode,
532 struct ceph_mds_reply_info_in *iinfo,
533 struct ceph_mds_reply_dirfrag *dirinfo,
534 struct ceph_mds_session *session,
535 unsigned long ttl_from, int cap_fmode,
536 struct ceph_cap_reservation *caps_reservation)
537{
538 struct ceph_mds_reply_inode *info = iinfo->in;
539 struct ceph_inode_info *ci = ceph_inode(inode);
540 int i;
541 int issued, implemented;
542 struct timespec mtime, atime, ctime;
543 u32 nsplits;
544 struct ceph_buffer *xattr_blob = NULL;
545 int err = 0;
546 int queue_trunc = 0;
547
548 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
549 inode, ceph_vinop(inode), le64_to_cpu(info->version),
550 ci->i_version);
551
552 /*
553 * prealloc xattr data, if it looks like we'll need it. only
554 * if len > 4 (meaning there are actually xattrs; the first 4
555 * bytes are the xattr count).
556 */
557 if (iinfo->xattr_len > 4) {
558 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
559 if (!xattr_blob)
560 pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
561 iinfo->xattr_len);
562 }
563
564 spin_lock(&inode->i_lock);
565
566 /*
567	 * The provided version will be odd if the inode value is
568	 * projected, and even if it is stable.  Skip the update if we
569	 * have newer info (e.g., due to inode info racing from multiple
570	 * MDSs), or if we are getting projected (unstable) inode info.
571 */
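	/* (masking off the low bit compares against our stable version base) */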
572 if (le64_to_cpu(info->version) > 0 &&
573 (ci->i_version & ~1) > le64_to_cpu(info->version))
574 goto no_change;
575
576 issued = __ceph_caps_issued(ci, &implemented);
577 issued |= implemented | __ceph_caps_dirty(ci);
578
579 /* update inode */
580 ci->i_version = le64_to_cpu(info->version);
581 inode->i_version++;
582 inode->i_rdev = le32_to_cpu(info->rdev);
583
584 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
585 inode->i_mode = le32_to_cpu(info->mode);
586 inode->i_uid = le32_to_cpu(info->uid);
587 inode->i_gid = le32_to_cpu(info->gid);
588 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
589 inode->i_uid, inode->i_gid);
590 }
591
592 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
593 inode->i_nlink = le32_to_cpu(info->nlink);
594
595 /* be careful with mtime, atime, size */
596 ceph_decode_timespec(&atime, &info->atime);
597 ceph_decode_timespec(&mtime, &info->mtime);
598 ceph_decode_timespec(&ctime, &info->ctime);
599 queue_trunc = ceph_fill_file_size(inode, issued,
600 le32_to_cpu(info->truncate_seq),
601 le64_to_cpu(info->truncate_size),
602 le64_to_cpu(info->size));
603 ceph_fill_file_time(inode, issued,
604 le32_to_cpu(info->time_warp_seq),
605 &ctime, &mtime, &atime);
606
607 ci->i_max_size = le64_to_cpu(info->max_size);
608 ci->i_layout = info->layout;
609 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
610
611 /* xattrs */
612 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
613 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
614 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
615 if (ci->i_xattrs.blob)
616 ceph_buffer_put(ci->i_xattrs.blob);
617 ci->i_xattrs.blob = xattr_blob;
618 if (xattr_blob)
619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 }
623
624 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info;
627
628 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO:
630 case S_IFBLK:
631 case S_IFCHR:
632 case S_IFSOCK:
633 init_special_inode(inode, inode->i_mode, inode->i_rdev);
634 inode->i_op = &ceph_file_iops;
635 break;
636 case S_IFREG:
637 inode->i_op = &ceph_file_iops;
638 inode->i_fop = &ceph_file_fops;
639 break;
640 case S_IFLNK:
641 inode->i_op = &ceph_symlink_iops;
642 if (!ci->i_symlink) {
643 int symlen = iinfo->symlink_len;
644 char *sym;
645
646 BUG_ON(symlen != inode->i_size);
647 spin_unlock(&inode->i_lock);
648
649 err = -ENOMEM;
650 sym = kmalloc(symlen+1, GFP_NOFS);
651 if (!sym)
652 goto out;
653 memcpy(sym, iinfo->symlink, symlen);
654 sym[symlen] = 0;
655
656 spin_lock(&inode->i_lock);
657 if (!ci->i_symlink)
658 ci->i_symlink = sym;
659 else
660 kfree(sym); /* lost a race */
661 }
662 break;
663 case S_IFDIR:
664 inode->i_op = &ceph_dir_iops;
665 inode->i_fop = &ceph_dir_fops;
666
667 ci->i_files = le64_to_cpu(info->files);
668 ci->i_subdirs = le64_to_cpu(info->subdirs);
669 ci->i_rbytes = le64_to_cpu(info->rbytes);
670 ci->i_rfiles = le64_to_cpu(info->rfiles);
671 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
672 ceph_decode_timespec(&ci->i_rctime, &info->rctime);
673
674 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
678 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2;
681 }
682
683 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes;
686 break;
687 default:
688 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
689 ceph_vinop(inode), inode->i_mode);
690 }
691
692no_change:
693 spin_unlock(&inode->i_lock);
694
695 /* queue truncate if we saw i_size decrease */
696 if (queue_trunc)
697 ceph_queue_vmtruncate(inode);
698
699 /* populate frag tree */
700 /* FIXME: move me up, if/when version reflects fragtree changes */
701 nsplits = le32_to_cpu(info->fragtree.nsplits);
702 mutex_lock(&ci->i_fragtree_mutex);
703 for (i = 0; i < nsplits; i++) {
704 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
705 struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
706
707 if (IS_ERR(frag))
708 continue;
709 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
710 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
711 }
712 mutex_unlock(&ci->i_fragtree_mutex);
713
714 /* were we issued a capability? */
715 if (info->cap.caps) {
716 if (ceph_snap(inode) == CEPH_NOSNAP) {
717 ceph_add_cap(inode, session,
718 le64_to_cpu(info->cap.cap_id),
719 cap_fmode,
720 le32_to_cpu(info->cap.caps),
721 le32_to_cpu(info->cap.wanted),
722 le32_to_cpu(info->cap.seq),
723 le32_to_cpu(info->cap.mseq),
724 le64_to_cpu(info->cap.realm),
725 info->cap.flags,
726 caps_reservation);
727 } else {
728 spin_lock(&inode->i_lock);
729 dout(" %p got snap_caps %s\n", inode,
730 ceph_cap_string(le32_to_cpu(info->cap.caps)));
731 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
732 if (cap_fmode >= 0)
733 __ceph_get_fmode(ci, cap_fmode);
734 spin_unlock(&inode->i_lock);
735 }
736 }
737
738 /* update delegation info? */
739 if (dirinfo)
740 ceph_fill_dirfrag(inode, dirinfo);
741
742 err = 0;
743
744out:
745 if (xattr_blob)
746 ceph_buffer_put(xattr_blob);
747 return err;
748}
749
750/*
751 * caller should hold session s_mutex.
752 */
753static void update_dentry_lease(struct dentry *dentry,
754 struct ceph_mds_reply_lease *lease,
755 struct ceph_mds_session *session,
756 unsigned long from_time)
757{
758 struct ceph_dentry_info *di = ceph_dentry(dentry);
759	unsigned long duration = le32_to_cpu(lease->duration_ms);
760	unsigned long ttl = from_time + (duration * HZ) / 1000;
761	unsigned long half_ttl = from_time + (duration * HZ / 2) / 1000;
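	/* lease durations arrive in ms; ttl/half_ttl are jiffies deadlines */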
762 struct inode *dir;
763
764 /* only track leases on regular dentries */
765 if (dentry->d_op != &ceph_dentry_ops)
766 return;
767
768 spin_lock(&dentry->d_lock);
769 dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
770 dentry, le16_to_cpu(lease->mask), duration, ttl);
771
772	/* make lease_shared_gen match directory */
773 dir = dentry->d_parent->d_inode;
774 di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
775
776 if (lease->mask == 0)
777 goto out_unlock;
778
779 if (di->lease_gen == session->s_cap_gen &&
780 time_before(ttl, dentry->d_time))
781 goto out_unlock; /* we already have a newer lease. */
782
783 if (di->lease_session && di->lease_session != session)
784 goto out_unlock;
785
786 ceph_dentry_lru_touch(dentry);
787
788 if (!di->lease_session)
789 di->lease_session = ceph_get_mds_session(session);
790 di->lease_gen = session->s_cap_gen;
791 di->lease_seq = le32_to_cpu(lease->seq);
792 di->lease_renew_after = half_ttl;
793 di->lease_renew_from = 0;
794 dentry->d_time = ttl;
795out_unlock:
796 spin_unlock(&dentry->d_lock);
797 return;
798}
799
800/*
801 * splice a dentry to an inode.
802 * caller must hold directory i_mutex for this to be safe.
803 *
804 * we will only rehash the resulting dentry if @prehash is
805 * true; @prehash will be set to false (for the benefit of
806 * the caller) if we fail.
807 */
808static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
809 bool *prehash)
810{
811 struct dentry *realdn;
812
813 /* dn must be unhashed */
814 if (!d_unhashed(dn))
815 d_drop(dn);
816 realdn = d_materialise_unique(dn, in);
817 if (IS_ERR(realdn)) {
818 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
819 dn, in, ceph_vinop(in));
820 if (prehash)
821 *prehash = false; /* don't rehash on error */
822 dn = realdn; /* note realdn contains the error */
823 goto out;
824 } else if (realdn) {
825 dout("dn %p (%d) spliced with %p (%d) "
826 "inode %p ino %llx.%llx\n",
827 dn, atomic_read(&dn->d_count),
828 realdn, atomic_read(&realdn->d_count),
829 realdn->d_inode, ceph_vinop(realdn->d_inode));
830 dput(dn);
831 dn = realdn;
832 } else {
833 BUG_ON(!ceph_dentry(dn));
834
835 dout("dn %p attached to %p ino %llx.%llx\n",
836 dn, dn->d_inode, ceph_vinop(dn->d_inode));
837 }
838 if ((!prehash || *prehash) && d_unhashed(dn))
839 d_rehash(dn);
840out:
841 return dn;
842}
843
844/*
845 * Set dentry's directory position based on the current dir's max, and
846 * order it in d_subdirs, so that dcache_readdir behaves.
847 */
848static void ceph_set_dentry_offset(struct dentry *dn)
849{
850 struct dentry *dir = dn->d_parent;
851 struct inode *inode = dn->d_parent->d_inode;
852 struct ceph_dentry_info *di;
853
854 BUG_ON(!inode);
855
856 di = ceph_dentry(dn);
857
858 spin_lock(&inode->i_lock);
859 di->offset = ceph_inode(inode)->i_max_offset++;
860 spin_unlock(&inode->i_lock);
861
862 spin_lock(&dcache_lock);
863 spin_lock(&dn->d_lock);
864 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
865 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
866 dn->d_u.d_child.prev, dn->d_u.d_child.next);
867 spin_unlock(&dn->d_lock);
868 spin_unlock(&dcache_lock);
869}
870
871/*
872 * Incorporate results into the local cache. This is either just
873 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
874 * after a lookup).
875 *
876 * A reply may contain:
877 *   a directory inode along with a dentry,
878 *   and/or a target inode.
879 *
880 * Called with snap_rwsem (read).
881 */
882int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
883 struct ceph_mds_session *session)
884{
885 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
886 struct inode *in = NULL;
887 struct ceph_mds_reply_inode *ininfo;
888 struct ceph_vino vino;
889 int i = 0;
890 int err = 0;
891
892 dout("fill_trace %p is_dentry %d is_target %d\n", req,
893 rinfo->head->is_dentry, rinfo->head->is_target);
894
895#if 0
896 /*
897 * Debugging hook:
898 *
899 * If we resend completed ops to a recovering mds, we get no
900 * trace. Since that is very rare, pretend this is the case
901 * to ensure the 'no trace' handlers in the callers behave.
902 *
903 * Fill in inodes unconditionally to avoid breaking cap
904 * invariants.
905 */
906 if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
907 pr_info("fill_trace faking empty trace on %lld %s\n",
908 req->r_tid, ceph_mds_op_name(rinfo->head->op));
909 if (rinfo->head->is_dentry) {
910 rinfo->head->is_dentry = 0;
911 err = fill_inode(req->r_locked_dir,
912 &rinfo->diri, rinfo->dirfrag,
913					 session, req->r_request_started, -1, &req->r_caps_reservation);
914 }
915 if (rinfo->head->is_target) {
916 rinfo->head->is_target = 0;
917 ininfo = rinfo->targeti.in;
918 vino.ino = le64_to_cpu(ininfo->ino);
919 vino.snap = le64_to_cpu(ininfo->snapid);
920 in = ceph_get_inode(sb, vino);
921 err = fill_inode(in, &rinfo->targeti, NULL,
922 session, req->r_request_started,
923					 req->r_fmode, &req->r_caps_reservation);
924 iput(in);
925 }
926 }
927#endif
928
929 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
930 dout("fill_trace reply is empty!\n");
931 if (rinfo->head->result == 0 && req->r_locked_dir) {
932 struct ceph_inode_info *ci =
933 ceph_inode(req->r_locked_dir);
934 dout(" clearing %p complete (empty trace)\n",
935 req->r_locked_dir);
936 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
937 ci->i_release_count++;
938 }
939 return 0;
940 }
941
942 if (rinfo->head->is_dentry) {
943 struct inode *dir = req->r_locked_dir;
944
945 err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
946 session, req->r_request_started, -1,
947 &req->r_caps_reservation);
948 if (err < 0)
949 return err;
950 }
951
952 if (rinfo->head->is_dentry && !req->r_aborted) {
953 /*
954 * lookup link rename : null -> possibly existing inode
955 * mknod symlink mkdir : null -> new inode
956 * unlink : linked -> null
957 */
958 struct inode *dir = req->r_locked_dir;
959 struct dentry *dn = req->r_dentry;
960 bool have_dir_cap, have_lease;
961
962 BUG_ON(!dn);
963 BUG_ON(!dir);
964 BUG_ON(dn->d_parent->d_inode != dir);
965 BUG_ON(ceph_ino(dir) !=
966 le64_to_cpu(rinfo->diri.in->ino));
967 BUG_ON(ceph_snap(dir) !=
968 le64_to_cpu(rinfo->diri.in->snapid));
969
970 /* do we have a lease on the whole dir? */
971 have_dir_cap =
972 (le32_to_cpu(rinfo->diri.in->cap.caps) &
973 CEPH_CAP_FILE_SHARED);
974
975 /* do we have a dn lease? */
976 have_lease = have_dir_cap ||
977 (le16_to_cpu(rinfo->dlease->mask) &
978 CEPH_LOCK_DN);
979
980 if (!have_lease)
981 dout("fill_trace no dentry lease or dir cap\n");
982
983 /* rename? */
984 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
985 dout(" src %p '%.*s' dst %p '%.*s'\n",
986 req->r_old_dentry,
987 req->r_old_dentry->d_name.len,
988 req->r_old_dentry->d_name.name,
989 dn, dn->d_name.len, dn->d_name.name);
990 dout("fill_trace doing d_move %p -> %p\n",
991 req->r_old_dentry, dn);
992 d_move(req->r_old_dentry, dn);
993 dout(" src %p '%.*s' dst %p '%.*s'\n",
994 req->r_old_dentry,
995 req->r_old_dentry->d_name.len,
996 req->r_old_dentry->d_name.name,
997 dn, dn->d_name.len, dn->d_name.name);
998 /* ensure target dentry is invalidated, despite
999 rehashing bug in vfs_rename_dir */
1000 dn->d_time = jiffies;
1001 ceph_dentry(dn)->lease_shared_gen = 0;
1002 /* take overwritten dentry's readdir offset */
1003 ceph_dentry(req->r_old_dentry)->offset =
1004 ceph_dentry(dn)->offset;
1005 dn = req->r_old_dentry; /* use old_dentry */
1006 in = dn->d_inode;
1007 }
1008
1009 /* null dentry? */
1010 if (!rinfo->head->is_target) {
1011 dout("fill_trace null dentry\n");
1012 if (dn->d_inode) {
1013 dout("d_delete %p\n", dn);
1014 d_delete(dn);
1015 } else {
1016 dout("d_instantiate %p NULL\n", dn);
1017 d_instantiate(dn, NULL);
1018 if (have_lease && d_unhashed(dn))
1019 d_rehash(dn);
1020 update_dentry_lease(dn, rinfo->dlease,
1021 session,
1022 req->r_request_started);
1023 }
1024 goto done;
1025 }
1026
1027 /* attach proper inode */
1028 ininfo = rinfo->targeti.in;
1029 vino.ino = le64_to_cpu(ininfo->ino);
1030 vino.snap = le64_to_cpu(ininfo->snapid);
1031 if (!dn->d_inode) {
1032 in = ceph_get_inode(sb, vino);
1033 if (IS_ERR(in)) {
1034 pr_err("fill_trace bad get_inode "
1035 "%llx.%llx\n", vino.ino, vino.snap);
1036 err = PTR_ERR(in);
1037 d_delete(dn);
1038 goto done;
1039 }
1040 dn = splice_dentry(dn, in, &have_lease);
1041 if (IS_ERR(dn)) {
1042 err = PTR_ERR(dn);
1043 goto done;
1044 }
1045 req->r_dentry = dn; /* may have spliced */
1046 ceph_set_dentry_offset(dn);
1047 igrab(in);
1048 } else if (ceph_ino(in) == vino.ino &&
1049 ceph_snap(in) == vino.snap) {
1050 igrab(in);
1051 } else {
1052 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1053 dn, in, ceph_ino(in), ceph_snap(in),
1054 vino.ino, vino.snap);
1055 have_lease = false;
1056 in = NULL;
1057 }
1058
1059 if (have_lease)
1060 update_dentry_lease(dn, rinfo->dlease, session,
1061 req->r_request_started);
1062 dout(" final dn %p\n", dn);
1063 i++;
1064 } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1065 req->r_op == CEPH_MDS_OP_MKSNAP) {
1066 struct dentry *dn = req->r_dentry;
1067
1068 /* fill out a snapdir LOOKUPSNAP dentry */
1069 BUG_ON(!dn);
1070 BUG_ON(!req->r_locked_dir);
1071 BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
1072 ininfo = rinfo->targeti.in;
1073 vino.ino = le64_to_cpu(ininfo->ino);
1074 vino.snap = le64_to_cpu(ininfo->snapid);
1075 in = ceph_get_inode(sb, vino);
1076 if (IS_ERR(in)) {
1077 pr_err("fill_inode get_inode badness %llx.%llx\n",
1078 vino.ino, vino.snap);
1079 err = PTR_ERR(in);
1080 d_delete(dn);
1081 goto done;
1082 }
1083 dout(" linking snapped dir %p to dn %p\n", in, dn);
1084 dn = splice_dentry(dn, in, NULL);
1085 if (IS_ERR(dn)) {
1086 err = PTR_ERR(dn);
1087 goto done;
1088 }
1089 ceph_set_dentry_offset(dn);
1090 req->r_dentry = dn; /* may have spliced */
1091 igrab(in);
1092 rinfo->head->is_dentry = 1; /* fool notrace handlers */
1093 }
1094
1095 if (rinfo->head->is_target) {
1096 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1097 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1098
1099 if (in == NULL || ceph_ino(in) != vino.ino ||
1100 ceph_snap(in) != vino.snap) {
1101 in = ceph_get_inode(sb, vino);
1102 if (IS_ERR(in)) {
1103 err = PTR_ERR(in);
1104 goto done;
1105 }
1106 }
1107 req->r_target_inode = in;
1108
1109 err = fill_inode(in,
1110 &rinfo->targeti, NULL,
1111 session, req->r_request_started,
1112 (le32_to_cpu(rinfo->head->result) == 0) ?
1113 req->r_fmode : -1,
1114 &req->r_caps_reservation);
1115 if (err < 0) {
1116 pr_err("fill_inode badness %p %llx.%llx\n",
1117 in, ceph_vinop(in));
1118 goto done;
1119 }
1120 }
1121
1122done:
1123 dout("fill_trace done err=%d\n", err);
1124 return err;
1125}
1126
1127/*
1128 * Prepopulate our cache with readdir results, leases, etc.
1129 */
1130int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1131 struct ceph_mds_session *session)
1132{
1133 struct dentry *parent = req->r_dentry;
1134 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1135 struct qstr dname;
1136 struct dentry *dn;
1137 struct inode *in;
1138 int err = 0, i;
1139 struct inode *snapdir = NULL;
1140 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1141 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1142 struct ceph_dentry_info *di;
1143
1144 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1145 snapdir = ceph_get_snapdir(parent->d_inode);
1146 parent = d_find_alias(snapdir);
1147 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1148 rinfo->dir_nr, parent);
1149 } else {
1150 dout("readdir_prepopulate %d items under dn %p\n",
1151 rinfo->dir_nr, parent);
1152 if (rinfo->dir_dir)
1153 ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1154 }
1155
1156 for (i = 0; i < rinfo->dir_nr; i++) {
1157 struct ceph_vino vino;
1158
1159 dname.name = rinfo->dir_dname[i];
1160 dname.len = rinfo->dir_dname_len[i];
1161 dname.hash = full_name_hash(dname.name, dname.len);
1162
1163 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1164 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1165
1166retry_lookup:
1167 dn = d_lookup(parent, &dname);
1168 dout("d_lookup on parent=%p name=%.*s got %p\n",
1169 parent, dname.len, dname.name, dn);
1170
1171 if (!dn) {
1172 dn = d_alloc(parent, &dname);
1173 dout("d_alloc %p '%.*s' = %p\n", parent,
1174 dname.len, dname.name, dn);
1175 if (dn == NULL) {
1176 dout("d_alloc badness\n");
1177 err = -ENOMEM;
1178 goto out;
1179 }
1180 err = ceph_init_dentry(dn);
1181 if (err < 0)
1182 goto out;
1183 } else if (dn->d_inode &&
1184 (ceph_ino(dn->d_inode) != vino.ino ||
1185 ceph_snap(dn->d_inode) != vino.snap)) {
1186 dout(" dn %p points to wrong inode %p\n",
1187 dn, dn->d_inode);
1188 d_delete(dn);
1189 dput(dn);
1190 goto retry_lookup;
1191 } else {
1192 /* reorder parent's d_subdirs */
1193 spin_lock(&dcache_lock);
1194 spin_lock(&dn->d_lock);
1195 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1196 spin_unlock(&dn->d_lock);
1197 spin_unlock(&dcache_lock);
1198 }
1199
1200 di = dn->d_fsdata;
1201 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1202
1203 /* inode */
1204 if (dn->d_inode) {
1205 in = dn->d_inode;
1206 } else {
1207 in = ceph_get_inode(parent->d_sb, vino);
1208			if (IS_ERR(in)) {
1209 dout("new_inode badness\n");
1210 d_delete(dn);
1211 dput(dn);
1212				err = PTR_ERR(in);
1213 goto out;
1214 }
1215 dn = splice_dentry(dn, in, NULL);
1216 }
1217
1218 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1219 req->r_request_started, -1,
1220 &req->r_caps_reservation) < 0) {
1221 pr_err("fill_inode badness on %p\n", in);
1222 dput(dn);
1223 continue;
1224 }
1225 update_dentry_lease(dn, rinfo->dir_dlease[i],
1226 req->r_session, req->r_request_started);
1227 dput(dn);
1228 }
1229 req->r_did_prepopulate = true;
1230
1231out:
1232 if (snapdir) {
1233 iput(snapdir);
1234 dput(parent);
1235 }
1236 dout("readdir_prepopulate done\n");
1237 return err;
1238}
1239
1240int ceph_inode_set_size(struct inode *inode, loff_t size)
1241{
1242 struct ceph_inode_info *ci = ceph_inode(inode);
1243 int ret = 0;
1244
1245 spin_lock(&inode->i_lock);
1246 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1247 inode->i_size = size;
1248 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1249
1250 /* tell the MDS if we are approaching max_size */
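	/* (i.e., i_size just crossed max_size/2 and our last report had not) */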
1251 if ((size << 1) >= ci->i_max_size &&
1252 (ci->i_reported_size << 1) < ci->i_max_size)
1253 ret = 1;
1254
1255 spin_unlock(&inode->i_lock);
1256 return ret;
1257}
1258
1259/*
1260 * Write back inode data in a worker thread. (This can't be done
1261 * in the message handler context.)
1262 */
1263void ceph_queue_writeback(struct inode *inode)
1264{
1265 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1266 &ceph_inode(inode)->i_wb_work)) {
1267 dout("ceph_queue_writeback %p\n", inode);
1268 igrab(inode);
1269 } else {
1270 dout("ceph_queue_writeback %p failed\n", inode);
1271 }
1272}
1273
1274static void ceph_writeback_work(struct work_struct *work)
1275{
1276 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1277 i_wb_work);
1278 struct inode *inode = &ci->vfs_inode;
1279
1280 dout("writeback %p\n", inode);
1281 filemap_fdatawrite(&inode->i_data);
1282 iput(inode);
1283}
1284
1285/*
1286 * queue an async invalidation
1287 */
1288void ceph_queue_invalidate(struct inode *inode)
1289{
1290 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1291 &ceph_inode(inode)->i_pg_inv_work)) {
1292 dout("ceph_queue_invalidate %p\n", inode);
1293 igrab(inode);
1294 } else {
1295 dout("ceph_queue_invalidate %p failed\n", inode);
1296 }
1297}
1298
1299/*
1300 * invalidate any pages that are not dirty or under writeback. this
1301 * includes pages that are clean and mapped.
1302 */
1303static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1304{
1305 struct pagevec pvec;
1306 pgoff_t next = 0;
1307 int i;
1308
1309 pagevec_init(&pvec, 0);
1310 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1311 for (i = 0; i < pagevec_count(&pvec); i++) {
1312 struct page *page = pvec.pages[i];
1313 pgoff_t index;
1314 int skip_page =
1315 (PageDirty(page) || PageWriteback(page));
1316
1317 if (!skip_page)
1318 skip_page = !trylock_page(page);
1319
1320 /*
1321 * We really shouldn't be looking at the ->index of an
1322 * unlocked page. But we're not allowed to lock these
1323 * pages. So we rely upon nobody altering the ->index
1324 * of this (pinned-by-us) page.
1325 */
1326 index = page->index;
1327 if (index > next)
1328 next = index;
1329 next++;
1330
1331 if (skip_page)
1332 continue;
1333
1334 generic_error_remove_page(mapping, page);
1335 unlock_page(page);
1336 }
1337 pagevec_release(&pvec);
1338 cond_resched();
1339 }
1340}
1341
1342/*
1343 * Invalidate inode pages in a worker thread. (This can't be done
1344 * in the message handler context.)
1345 */
1346static void ceph_invalidate_work(struct work_struct *work)
1347{
1348 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1349 i_pg_inv_work);
1350 struct inode *inode = &ci->vfs_inode;
1351 u32 orig_gen;
1352 int check = 0;
1353
1354 spin_lock(&inode->i_lock);
1355 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1356 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1357 if (ci->i_rdcache_gen == 0 ||
1358 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1359 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1360 /* nevermind! */
1361 ci->i_rdcache_revoking = 0;
1362 spin_unlock(&inode->i_lock);
1363 goto out;
1364 }
1365 orig_gen = ci->i_rdcache_gen;
1366 spin_unlock(&inode->i_lock);
1367
1368 ceph_invalidate_nondirty_pages(inode->i_mapping);
1369
1370 spin_lock(&inode->i_lock);
1371 if (orig_gen == ci->i_rdcache_gen) {
1372 dout("invalidate_pages %p gen %d successful\n", inode,
1373 ci->i_rdcache_gen);
1374 ci->i_rdcache_gen = 0;
1375 ci->i_rdcache_revoking = 0;
1376 check = 1;
1377 } else {
1378 dout("invalidate_pages %p gen %d raced, gen now %d\n",
1379 inode, orig_gen, ci->i_rdcache_gen);
1380 }
1381 spin_unlock(&inode->i_lock);
1382
1383 if (check)
1384 ceph_check_caps(ci, 0, NULL);
1385out:
1386 iput(inode);
1387}
1388
1389
1390/*
1391 * called by trunc_wq; take i_mutex ourselves
1392 *
1393 * We also truncate in a separate thread.
1394 */
1395static void ceph_vmtruncate_work(struct work_struct *work)
1396{
1397 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1398 i_vmtruncate_work);
1399 struct inode *inode = &ci->vfs_inode;
1400
1401 dout("vmtruncate_work %p\n", inode);
1402 mutex_lock(&inode->i_mutex);
1403 __ceph_do_pending_vmtruncate(inode);
1404 mutex_unlock(&inode->i_mutex);
1405 iput(inode);
1406}
1407
1408/*
1409 * Queue an async vmtruncate. If we fail to queue work, we will handle
1410 * the truncation the next time we call __ceph_do_pending_vmtruncate.
1411 */
1412void ceph_queue_vmtruncate(struct inode *inode)
1413{
1414 struct ceph_inode_info *ci = ceph_inode(inode);
1415
1416 if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
1417 &ci->i_vmtruncate_work)) {
1418 dout("ceph_queue_vmtruncate %p\n", inode);
1419 igrab(inode);
1420 } else {
1421 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1422 inode, ci->i_truncate_pending);
1423 }
1424}
1425
1426/*
1427 * called with i_mutex held.
1428 *
1429 * Make sure any pending truncation is applied before doing anything
1430 * that may depend on it.
1431 */
1432void __ceph_do_pending_vmtruncate(struct inode *inode)
1433{
1434 struct ceph_inode_info *ci = ceph_inode(inode);
1435 u64 to;
1436 int wrbuffer_refs, wake = 0;
1437
1438retry:
1439 spin_lock(&inode->i_lock);
1440 if (ci->i_truncate_pending == 0) {
1441 dout("__do_pending_vmtruncate %p none pending\n", inode);
1442 spin_unlock(&inode->i_lock);
1443 return;
1444 }
1445
1446 /*
1447 * make sure any dirty snapped pages are flushed before we
1448 * possibly truncate them.. so write AND block!
1449 */
1450 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1451 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1452 inode);
1453 spin_unlock(&inode->i_lock);
1454 filemap_write_and_wait_range(&inode->i_data, 0,
1455 inode->i_sb->s_maxbytes);
1456 goto retry;
1457 }
1458
1459 to = ci->i_truncate_size;
1460 wrbuffer_refs = ci->i_wrbuffer_ref;
1461 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1462 ci->i_truncate_pending, to);
1463 spin_unlock(&inode->i_lock);
1464
1465 truncate_inode_pages(inode->i_mapping, to);
1466
1467 spin_lock(&inode->i_lock);
1468 ci->i_truncate_pending--;
1469 if (ci->i_truncate_pending == 0)
1470 wake = 1;
1471 spin_unlock(&inode->i_lock);
1472
1473 if (wrbuffer_refs == 0)
1474 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1475 if (wake)
1476 wake_up(&ci->i_cap_wq);
1477}
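Note the shape of the retry loop above: i_lock is a spinlock, so it must be dropped before filemap_write_and_wait_range() (which sleeps); after the flush, control jumps back to retry and re-checks everything, since the snap and truncate state may have changed while the lock was not held.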
1478
1479
1480/*
1481 * symlinks
1482 */
1483static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1484{
1485 struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1486 nd_set_link(nd, ci->i_symlink);
1487 return NULL;
1488}
1489
1490static const struct inode_operations ceph_symlink_iops = {
1491 .readlink = generic_readlink,
1492 .follow_link = ceph_sym_follow_link,
1493};
1494
1495/*
1496 * setattr
1497 */
1498int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1499{
1500 struct inode *inode = dentry->d_inode;
1501 struct ceph_inode_info *ci = ceph_inode(inode);
1502 struct inode *parent_inode = dentry->d_parent->d_inode;
1503 const unsigned int ia_valid = attr->ia_valid;
1504 struct ceph_mds_request *req;
1505 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
1506 int issued;
1507 int release = 0, dirtied = 0;
1508 int mask = 0;
1509 int err = 0;
1510
1511 if (ceph_snap(inode) != CEPH_NOSNAP)
1512 return -EROFS;
1513
1514 __ceph_do_pending_vmtruncate(inode);
1515
1516 err = inode_change_ok(inode, attr);
1517 if (err != 0)
1518 return err;
1519
1520 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
1521 USE_AUTH_MDS);
1522 if (IS_ERR(req))
1523 return PTR_ERR(req);
1524
1525 spin_lock(&inode->i_lock);
1526 issued = __ceph_caps_issued(ci, NULL);
1527 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
1528
1529 if (ia_valid & ATTR_UID) {
1530 dout("setattr %p uid %d -> %d\n", inode,
1531 inode->i_uid, attr->ia_uid);
1532 if (issued & CEPH_CAP_AUTH_EXCL) {
1533 inode->i_uid = attr->ia_uid;
1534 dirtied |= CEPH_CAP_AUTH_EXCL;
1535 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1536 attr->ia_uid != inode->i_uid) {
1537 req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
1538 mask |= CEPH_SETATTR_UID;
1539 release |= CEPH_CAP_AUTH_SHARED;
1540 }
1541 }
1542 if (ia_valid & ATTR_GID) {
1543 dout("setattr %p gid %d -> %d\n", inode,
1544 inode->i_gid, attr->ia_gid);
1545 if (issued & CEPH_CAP_AUTH_EXCL) {
1546 inode->i_gid = attr->ia_gid;
1547 dirtied |= CEPH_CAP_AUTH_EXCL;
1548 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1549 attr->ia_gid != inode->i_gid) {
1550 req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
1551 mask |= CEPH_SETATTR_GID;
1552 release |= CEPH_CAP_AUTH_SHARED;
1553 }
1554 }
1555 if (ia_valid & ATTR_MODE) {
1556 dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
1557 attr->ia_mode);
1558 if (issued & CEPH_CAP_AUTH_EXCL) {
1559 inode->i_mode = attr->ia_mode;
1560 dirtied |= CEPH_CAP_AUTH_EXCL;
1561 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1562 attr->ia_mode != inode->i_mode) {
1563 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1564 mask |= CEPH_SETATTR_MODE;
1565 release |= CEPH_CAP_AUTH_SHARED;
1566 }
1567 }
1568
1569 if (ia_valid & ATTR_ATIME) {
1570 dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
1571 inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
1572 attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
1573 if (issued & CEPH_CAP_FILE_EXCL) {
1574 ci->i_time_warp_seq++;
1575 inode->i_atime = attr->ia_atime;
1576 dirtied |= CEPH_CAP_FILE_EXCL;
1577 } else if ((issued & CEPH_CAP_FILE_WR) &&
1578 timespec_compare(&inode->i_atime,
1579 &attr->ia_atime) < 0) {
1580 inode->i_atime = attr->ia_atime;
1581 dirtied |= CEPH_CAP_FILE_WR;
1582 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1583 !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
1584 ceph_encode_timespec(&req->r_args.setattr.atime,
1585 &attr->ia_atime);
1586 mask |= CEPH_SETATTR_ATIME;
1587 release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
1588 CEPH_CAP_FILE_WR;
1589 }
1590 }
1591 if (ia_valid & ATTR_MTIME) {
1592 dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
1593 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
1594 attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
1595 if (issued & CEPH_CAP_FILE_EXCL) {
1596 ci->i_time_warp_seq++;
1597 inode->i_mtime = attr->ia_mtime;
1598 dirtied |= CEPH_CAP_FILE_EXCL;
1599 } else if ((issued & CEPH_CAP_FILE_WR) &&
1600 timespec_compare(&inode->i_mtime,
1601 &attr->ia_mtime) < 0) {
1602 inode->i_mtime = attr->ia_mtime;
1603 dirtied |= CEPH_CAP_FILE_WR;
1604 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1605 !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
1606 ceph_encode_timespec(&req->r_args.setattr.mtime,
1607 &attr->ia_mtime);
1608 mask |= CEPH_SETATTR_MTIME;
1609 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1610 CEPH_CAP_FILE_WR;
1611 }
1612 }
1613 if (ia_valid & ATTR_SIZE) {
1614 dout("setattr %p size %lld -> %lld\n", inode,
1615 inode->i_size, attr->ia_size);
1616 if (attr->ia_size > inode->i_sb->s_maxbytes) {
1617 err = -EINVAL;
1618 goto out;
1619 }
1620 if ((issued & CEPH_CAP_FILE_EXCL) &&
1621 attr->ia_size > inode->i_size) {
1622 inode->i_size = attr->ia_size;
1623 inode->i_blocks =
1624 (attr->ia_size + (1 << 9) - 1) >> 9;
1625 inode->i_ctime = attr->ia_ctime;
1626 ci->i_reported_size = attr->ia_size;
1627 dirtied |= CEPH_CAP_FILE_EXCL;
1628 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1629 attr->ia_size != inode->i_size) {
1630 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
1631 req->r_args.setattr.old_size =
1632 cpu_to_le64(inode->i_size);
1633 mask |= CEPH_SETATTR_SIZE;
1634 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1635 CEPH_CAP_FILE_WR;
1636 }
1637 }
1638
1639 /* these do nothing */
1640 if (ia_valid & ATTR_CTIME) {
1641 bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
1642 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
1643 dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
1644 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
1645 attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
1646 only ? "ctime only" : "ignored");
1647 inode->i_ctime = attr->ia_ctime;
1648 if (only) {
1649 /*
 1650			 * if the kernel wants to dirty ctime but nothing else,
 1651			 * we need to choose a cap to dirty under, or do
 1652			 * an almost-no-op setattr
1653 */
1654 if (issued & CEPH_CAP_AUTH_EXCL)
1655 dirtied |= CEPH_CAP_AUTH_EXCL;
1656 else if (issued & CEPH_CAP_FILE_EXCL)
1657 dirtied |= CEPH_CAP_FILE_EXCL;
1658 else if (issued & CEPH_CAP_XATTR_EXCL)
1659 dirtied |= CEPH_CAP_XATTR_EXCL;
1660 else
1661 mask |= CEPH_SETATTR_CTIME;
1662 }
1663 }
1664 if (ia_valid & ATTR_FILE)
1665 dout("setattr %p ATTR_FILE ... hrm!\n", inode);
1666
1667 if (dirtied) {
1668 __ceph_mark_dirty_caps(ci, dirtied);
1669 inode->i_ctime = CURRENT_TIME;
1670 }
1671
1672 release &= issued;
1673 spin_unlock(&inode->i_lock);
1674
1675 if (mask) {
1676 req->r_inode = igrab(inode);
1677 req->r_inode_drop = release;
1678 req->r_args.setattr.mask = cpu_to_le32(mask);
1679 req->r_num_caps = 1;
1680 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1681 }
1682 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1683 ceph_cap_string(dirtied), mask);
1684
1685 ceph_mdsc_put_request(req);
1686 __ceph_do_pending_vmtruncate(inode);
1687 return err;
1688out:
1689 spin_unlock(&inode->i_lock);
1690 ceph_mdsc_put_request(req);
1691 return err;
1692}
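The per-field pattern used throughout ceph_setattr() above: if we hold the relevant EXCL capability, apply the change locally and mark the cap dirty (it will be flushed to the MDS later); if we hold SHARED and the value is unchanged, nothing needs to happen; otherwise the field is encoded into the SETATTR request for the MDS to apply, and the caps accumulated in 'release' are dropped along with the request.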
1693
1694/*
1695 * Verify that we have a lease on the given mask. If not,
1696 * do a getattr against an mds.
1697 */
1698int ceph_do_getattr(struct inode *inode, int mask)
1699{
1700 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
1701 struct ceph_mds_client *mdsc = &client->mdsc;
1702 struct ceph_mds_request *req;
1703 int err;
1704
1705 if (ceph_snap(inode) == CEPH_SNAPDIR) {
1706 dout("do_getattr inode %p SNAPDIR\n", inode);
1707 return 0;
1708 }
1709
1710 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
1711 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1712 return 0;
1713
1714 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1715 if (IS_ERR(req))
1716 return PTR_ERR(req);
1717 req->r_inode = igrab(inode);
1718 req->r_num_caps = 1;
1719 req->r_args.getattr.mask = cpu_to_le32(mask);
1720 err = ceph_mdsc_do_request(mdsc, NULL, req);
1721 ceph_mdsc_put_request(req);
1722 dout("do_getattr result=%d\n", err);
1723 return err;
1724}
1725
1726
1727/*
1728 * Check inode permissions. We verify we have a valid value for
1729 * the AUTH cap, then call the generic handler.
1730 */
1731int ceph_permission(struct inode *inode, int mask)
1732{
1733 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1734
1735 if (!err)
1736 err = generic_permission(inode, mask, NULL);
1737 return err;
1738}
1739
1740/*
1741 * Get all attributes. Hopefully someday we'll have a statlite()
1742 * and can limit the fields we require to be accurate.
1743 */
1744int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1745 struct kstat *stat)
1746{
1747 struct inode *inode = dentry->d_inode;
1748 struct ceph_inode_info *ci = ceph_inode(inode);
1749 int err;
1750
1751 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1752 if (!err) {
1753 generic_fillattr(inode, stat);
1754 stat->ino = inode->i_ino;
1755 if (ceph_snap(inode) != CEPH_NOSNAP)
1756 stat->dev = ceph_snap(inode);
1757 else
1758 stat->dev = 0;
1759 if (S_ISDIR(inode->i_mode)) {
1760 stat->size = ci->i_rbytes;
1761 stat->blocks = 0;
1762 stat->blksize = 65536;
1763 }
1764 }
1765 return err;
1766}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..8a5bcae62846
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
1#include <linux/in.h>
2
3#include "ioctl.h"
4#include "super.h"
5#include "ceph_debug.h"
6
7
8/*
9 * ioctls
10 */
11
12/*
13 * get and set the file layout
14 */
15static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
16{
17 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
18 struct ceph_ioctl_layout l;
19 int err;
20
21 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
22 if (!err) {
23 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
24 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
25 l.object_size = ceph_file_layout_object_size(ci->i_layout);
26 l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
27 l.preferred_osd =
28 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
29 if (copy_to_user(arg, &l, sizeof(l)))
30 return -EFAULT;
31 }
32
33 return err;
34}
35
36static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{
38 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l;
43 int err, i;
44
45 /* copy and validate */
46 if (copy_from_user(&l, arg, sizeof(l)))
47 return -EFAULT;
48
49 if ((l.object_size & ~PAGE_MASK) ||
50 (l.stripe_unit & ~PAGE_MASK) ||
51 !l.stripe_unit ||
52 (l.object_size &&
53 (unsigned)l.object_size % (unsigned)l.stripe_unit))
54 return -EINVAL;
55
56 /* make sure it's a valid data pool */
57 if (l.data_pool > 0) {
58 mutex_lock(&mdsc->mutex);
59 err = -EINVAL;
60 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
61 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
62 err = 0;
63 break;
64 }
65 mutex_unlock(&mdsc->mutex);
66 if (err)
67 return err;
68 }
69
70 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
71 USE_AUTH_MDS);
72 if (IS_ERR(req))
73 return PTR_ERR(req);
74 req->r_inode = igrab(inode);
75 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
76
77 req->r_args.setlayout.layout.fl_stripe_unit =
78 cpu_to_le32(l.stripe_unit);
79 req->r_args.setlayout.layout.fl_stripe_count =
80 cpu_to_le32(l.stripe_count);
81 req->r_args.setlayout.layout.fl_object_size =
82 cpu_to_le32(l.object_size);
83 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
84 req->r_args.setlayout.layout.fl_pg_preferred =
85 cpu_to_le32(l.preferred_osd);
86
87 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
88 ceph_mdsc_put_request(req);
89 return err;
90}
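The validation above, restated as a standalone predicate (a userspace sketch; 4096-byte pages are assumed here in place of PAGE_MASK):

#include <stdio.h>

/* CEPH_IOC_SET_LAYOUT sanity checks: both sizes page-aligned,
 * stripe_unit nonzero, and a nonzero object_size must be a whole
 * multiple of stripe_unit */
static int layout_valid(unsigned long long object_size,
                        unsigned long long stripe_unit)
{
        const unsigned long long page_size = 4096;      /* assumption */

        if (object_size % page_size || stripe_unit % page_size)
                return 0;
        if (stripe_unit == 0)
                return 0;
        if (object_size != 0 && object_size % stripe_unit != 0)
                return 0;
        return 1;
}

int main(void)
{
        printf("%d\n", layout_valid(4194304, 65536));   /* 1: 4M/64K ok */
        printf("%d\n", layout_valid(4194304, 65537));   /* 0: unaligned */
        return 0;
}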
91
92/*
93 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset.
95 */
96static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
97{
98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
102 u64 len = 1, olen;
103 u64 tmp;
104 struct ceph_object_layout ol;
105 struct ceph_pg pgid;
106
107 /* copy and validate */
108 if (copy_from_user(&dl, arg, sizeof(dl)))
109 return -EFAULT;
110
111 down_read(&osdc->map_sem);
112 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
113 &dl.object_no, &dl.object_offset, &olen);
114 dl.file_offset -= dl.object_offset;
115 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
116 dl.block_size = ceph_file_layout_su(ci->i_layout);
117
118 /* block_offset = object_offset % block_size */
119 tmp = dl.object_offset;
120 dl.block_offset = do_div(tmp, dl.block_size);
121
122 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
123 ceph_ino(inode), dl.object_no);
124 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
125 osdc->osdmap);
126
127 pgid = ol.ol_pgid;
128 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
129 if (dl.osd >= 0) {
130 struct ceph_entity_addr *a =
131 ceph_osd_addr(osdc->osdmap, dl.osd);
132 if (a)
133 memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
134 } else {
135 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
136 }
137 up_read(&osdc->map_sem);
138
139 /* send result back to user */
140 if (copy_to_user(arg, &dl, sizeof(dl)))
141 return -EFAULT;
142
143 return 0;
144}
145
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
149 switch (cmd) {
150 case CEPH_IOC_GET_LAYOUT:
151 return ceph_ioctl_get_layout(file, (void __user *)arg);
152
153 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg);
155
156 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
158 }
159 return -ENOTTY;
160}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
1#ifndef FS_CEPH_IOCTL_H
2#define FS_CEPH_IOCTL_H
3
4#include <linux/ioctl.h>
5#include <linux/types.h>
6
7#define CEPH_IOCTL_MAGIC 0x97
8
9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout {
11 __u64 stripe_unit, stripe_count, object_size;
12 __u64 data_pool;
13 __s64 preferred_osd;
14};
15
16#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout)
20
21/*
22 * Extract identity, address of the OSD and object storing a given
23 * file offset.
24 */
25struct ceph_ioctl_dataloc {
26 __u64 file_offset; /* in+out: file offset */
27 __u64 object_offset; /* out: offset in object */
28 __u64 object_no; /* out: object # */
29 __u64 object_size; /* out: object size */
30 char object_name[64]; /* out: object name */
31 __u64 block_offset; /* out: offset in block */
32 __u64 block_size; /* out: block length */
33 __s64 osd; /* out: osd # */
34 struct sockaddr_storage osd_addr; /* out: osd address */
35};
36
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc)
39
40#endif
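A userspace sketch exercising CEPH_IOC_GET_DATALOC from the header above (the mount path is hypothetical; error handling is minimal). Note that file_offset is in+out: the kernel rewinds it to the start of the containing object before copying the result back:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>         /* sockaddr_storage, used by ioctl.h */
#include "ioctl.h"

int main(void)
{
        struct ceph_ioctl_dataloc dl = { .file_offset = 4096 };
        int fd = open("/mnt/ceph/somefile", O_RDONLY);  /* hypothetical */

        if (fd < 0)
                return 1;
        if (ioctl(fd, CEPH_IOC_GET_DATALOC, &dl) == 0)
                printf("object %s, offset %llu, osd%lld\n",
                       dl.object_name,
                       (unsigned long long)dl.object_offset,
                       (long long)dl.osd);
        close(fd);
        return 0;
}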
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..60a9a4ae47be
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3043 @@
1#include "ceph_debug.h"
2
3#include <linux/wait.h>
4#include <linux/slab.h>
5#include <linux/sched.h>
6
7#include "mds_client.h"
8#include "mon_client.h"
9#include "super.h"
10#include "messenger.h"
11#include "decode.h"
12#include "auth.h"
13#include "pagelist.h"
14
15/*
16 * A cluster of MDS (metadata server) daemons is responsible for
17 * managing the file system namespace (the directory hierarchy and
18 * inodes) and for coordinating shared access to storage. Metadata is
 19 * partitioned hierarchically across a number of servers, and that
20 * partition varies over time as the cluster adjusts the distribution
21 * in order to balance load.
22 *
 23 * The MDS client is primarily responsible for managing synchronous
24 * metadata requests for operations like open, unlink, and so forth.
 25 * If there is an MDS failure, we find out about it when we (possibly
26 * request and) receive a new MDS map, and can resubmit affected
27 * requests.
28 *
29 * For the most part, though, we take advantage of a lossless
30 * communications channel to the MDS, and do not need to worry about
31 * timing out or resubmitting requests.
32 *
33 * We maintain a stateful "session" with each MDS we interact with.
 34 * Within each session, we send periodic heartbeat messages to ensure
 35 * any capabilities or leases we have been issued remain valid. If
36 * the session times out and goes stale, our leases and capabilities
37 * are no longer valid.
38 */
39
40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head);
42
 43static const struct ceph_connection_operations mds_con_ops;
44
45
46/*
47 * mds reply parsing
48 */
49
50/*
51 * parse individual inode info
52 */
53static int parse_reply_info_in(void **p, void *end,
54 struct ceph_mds_reply_info_in *info)
55{
56 int err = -EIO;
57
58 info->in = *p;
59 *p += sizeof(struct ceph_mds_reply_inode) +
60 sizeof(*info->in->fragtree.splits) *
61 le32_to_cpu(info->in->fragtree.nsplits);
62
63 ceph_decode_32_safe(p, end, info->symlink_len, bad);
64 ceph_decode_need(p, end, info->symlink_len, bad);
65 info->symlink = *p;
66 *p += info->symlink_len;
67
68 ceph_decode_32_safe(p, end, info->xattr_len, bad);
69 ceph_decode_need(p, end, info->xattr_len, bad);
70 info->xattr_data = *p;
71 *p += info->xattr_len;
72 return 0;
73bad:
74 return err;
75}
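For reference, the _safe decode helpers used above (defined in decode.h) follow this shape — a userspace sketch, not the actual macros, which also byte-swap with le32_to_cpu() and jump to a caller-supplied label on a short buffer:

#include <string.h>

static int decode_u32(void **p, void *end, unsigned int *v)
{
        if ((char *)*p + sizeof(*v) > (char *)end)
                return -1;              /* short buffer */
        memcpy(v, *p, sizeof(*v));      /* real code: le32_to_cpu() */
        *p = (char *)*p + sizeof(*v);
        return 0;
}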
76
77/*
78 * parse a normal reply, which may contain a (dir+)dentry and/or a
79 * target inode.
80 */
81static int parse_reply_info_trace(void **p, void *end,
82 struct ceph_mds_reply_info_parsed *info)
83{
84 int err;
85
86 if (info->head->is_dentry) {
87 err = parse_reply_info_in(p, end, &info->diri);
88 if (err < 0)
89 goto out_bad;
90
91 if (unlikely(*p + sizeof(*info->dirfrag) > end))
92 goto bad;
93 info->dirfrag = *p;
94 *p += sizeof(*info->dirfrag) +
95 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
96 if (unlikely(*p > end))
97 goto bad;
98
99 ceph_decode_32_safe(p, end, info->dname_len, bad);
100 ceph_decode_need(p, end, info->dname_len, bad);
101 info->dname = *p;
102 *p += info->dname_len;
103 info->dlease = *p;
104 *p += sizeof(*info->dlease);
105 }
106
107 if (info->head->is_target) {
108 err = parse_reply_info_in(p, end, &info->targeti);
109 if (err < 0)
110 goto out_bad;
111 }
112
113 if (unlikely(*p != end))
114 goto bad;
115 return 0;
116
117bad:
118 err = -EIO;
119out_bad:
120 pr_err("problem parsing mds trace %d\n", err);
121 return err;
122}
123
124/*
125 * parse readdir results
126 */
127static int parse_reply_info_dir(void **p, void *end,
128 struct ceph_mds_reply_info_parsed *info)
129{
130 u32 num, i = 0;
131 int err;
132
133 info->dir_dir = *p;
134 if (*p + sizeof(*info->dir_dir) > end)
135 goto bad;
136 *p += sizeof(*info->dir_dir) +
137 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
138 if (*p > end)
139 goto bad;
140
141 ceph_decode_need(p, end, sizeof(num) + 2, bad);
142 num = ceph_decode_32(p);
143 info->dir_end = ceph_decode_8(p);
144 info->dir_complete = ceph_decode_8(p);
145 if (num == 0)
146 goto done;
147
148 /* alloc large array */
149 info->dir_nr = num;
150 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
151 sizeof(*info->dir_dname) +
152 sizeof(*info->dir_dname_len) +
153 sizeof(*info->dir_dlease),
154 GFP_NOFS);
155 if (info->dir_in == NULL) {
156 err = -ENOMEM;
157 goto out_bad;
158 }
159 info->dir_dname = (void *)(info->dir_in + num);
160 info->dir_dname_len = (void *)(info->dir_dname + num);
161 info->dir_dlease = (void *)(info->dir_dname_len + num);
162
163 while (num) {
164 /* dentry */
165 ceph_decode_need(p, end, sizeof(u32)*2, bad);
166 info->dir_dname_len[i] = ceph_decode_32(p);
167 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
168 info->dir_dname[i] = *p;
169 *p += info->dir_dname_len[i];
170 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
171 info->dir_dname[i]);
172 info->dir_dlease[i] = *p;
173 *p += sizeof(struct ceph_mds_reply_lease);
174
175 /* inode */
176 err = parse_reply_info_in(p, end, &info->dir_in[i]);
177 if (err < 0)
178 goto out_bad;
179 i++;
180 num--;
181 }
182
183done:
184 if (*p != end)
185 goto bad;
186 return 0;
187
188bad:
189 err = -EIO;
190out_bad:
191 pr_err("problem parsing dir contents %d\n", err);
192 return err;
193}
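The single kcalloc() above deliberately sizes each of the 'num' elements as the sum of four element sizes, so one block holds four parallel arrays; the base pointers are then carved out by stepping 'num' entries at a time:

/* layout of the info->dir_in allocation (num entries per array):
 *   [ dir_in[0..num-1] | dir_dname[0..num-1] |
 *     dir_dname_len[0..num-1] | dir_dlease[0..num-1] ]
 */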
194
195/*
196 * parse entire mds reply
197 */
198static int parse_reply_info(struct ceph_msg *msg,
199 struct ceph_mds_reply_info_parsed *info)
200{
201 void *p, *end;
202 u32 len;
203 int err;
204
205 info->head = msg->front.iov_base;
206 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
207 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
208
209 /* trace */
210 ceph_decode_32_safe(&p, end, len, bad);
211 if (len > 0) {
212 err = parse_reply_info_trace(&p, p+len, info);
213 if (err < 0)
214 goto out_bad;
215 }
216
217 /* dir content */
218 ceph_decode_32_safe(&p, end, len, bad);
219 if (len > 0) {
220 err = parse_reply_info_dir(&p, p+len, info);
221 if (err < 0)
222 goto out_bad;
223 }
224
225 /* snap blob */
226 ceph_decode_32_safe(&p, end, len, bad);
227 info->snapblob_len = len;
228 info->snapblob = p;
229 p += len;
230
231 if (p != end)
232 goto bad;
233 return 0;
234
235bad:
236 err = -EIO;
237out_bad:
238 pr_err("mds parse_reply err %d\n", err);
239 return err;
240}
241
242static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
243{
244 kfree(info->dir_in);
245}
246
247
248/*
249 * sessions
250 */
251static const char *session_state_name(int s)
252{
253 switch (s) {
254 case CEPH_MDS_SESSION_NEW: return "new";
255 case CEPH_MDS_SESSION_OPENING: return "opening";
256 case CEPH_MDS_SESSION_OPEN: return "open";
257 case CEPH_MDS_SESSION_HUNG: return "hung";
258 case CEPH_MDS_SESSION_CLOSING: return "closing";
259 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
260 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
261 default: return "???";
262 }
263}
264
265static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
266{
267 if (atomic_inc_not_zero(&s->s_ref)) {
268 dout("mdsc get_session %p %d -> %d\n", s,
269 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
270 return s;
271 } else {
 272		dout("mdsc get_session %p 0 -- FAIL\n", s);
273 return NULL;
274 }
275}
276
277void ceph_put_mds_session(struct ceph_mds_session *s)
278{
279 dout("mdsc put_session %p %d -> %d\n", s,
280 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
281 if (atomic_dec_and_test(&s->s_ref)) {
282 if (s->s_authorizer)
283 s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
284 s->s_mdsc->client->monc.auth, s->s_authorizer);
285 kfree(s);
286 }
287}
288
289/*
290 * called under mdsc->mutex
291 */
292struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
293 int mds)
294{
295 struct ceph_mds_session *session;
296
297 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
298 return NULL;
299 session = mdsc->sessions[mds];
300 dout("lookup_mds_session %p %d\n", session,
301 atomic_read(&session->s_ref));
302 get_session(session);
303 return session;
304}
305
306static bool __have_session(struct ceph_mds_client *mdsc, int mds)
307{
308 if (mds >= mdsc->max_sessions)
309 return false;
310 return mdsc->sessions[mds];
311}
312
313static int __verify_registered_session(struct ceph_mds_client *mdsc,
314 struct ceph_mds_session *s)
315{
316 if (s->s_mds >= mdsc->max_sessions ||
317 mdsc->sessions[s->s_mds] != s)
318 return -ENOENT;
319 return 0;
320}
321
322/*
323 * create+register a new session for given mds.
324 * called under mdsc->mutex.
325 */
326static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
327 int mds)
328{
329 struct ceph_mds_session *s;
330
331 s = kzalloc(sizeof(*s), GFP_NOFS);
332 if (!s)
333 return ERR_PTR(-ENOMEM);
334 s->s_mdsc = mdsc;
335 s->s_mds = mds;
336 s->s_state = CEPH_MDS_SESSION_NEW;
337 s->s_ttl = 0;
338 s->s_seq = 0;
339 mutex_init(&s->s_mutex);
340
341 ceph_con_init(mdsc->client->msgr, &s->s_con);
342 s->s_con.private = s;
343 s->s_con.ops = &mds_con_ops;
344 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
345 s->s_con.peer_name.num = cpu_to_le64(mds);
346
347 spin_lock_init(&s->s_cap_lock);
348 s->s_cap_gen = 0;
349 s->s_cap_ttl = 0;
350 s->s_renew_requested = 0;
351 s->s_renew_seq = 0;
352 INIT_LIST_HEAD(&s->s_caps);
353 s->s_nr_caps = 0;
354 s->s_trim_caps = 0;
355 atomic_set(&s->s_ref, 1);
356 INIT_LIST_HEAD(&s->s_waiting);
357 INIT_LIST_HEAD(&s->s_unsafe);
358 s->s_num_cap_releases = 0;
359 s->s_cap_iterator = NULL;
360 INIT_LIST_HEAD(&s->s_cap_releases);
361 INIT_LIST_HEAD(&s->s_cap_releases_done);
362 INIT_LIST_HEAD(&s->s_cap_flushing);
363 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
364
365 dout("register_session mds%d\n", mds);
366 if (mds >= mdsc->max_sessions) {
367 int newmax = 1 << get_count_order(mds+1);
368 struct ceph_mds_session **sa;
369
370 dout("register_session realloc to %d\n", newmax);
371 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
372 if (sa == NULL)
373 goto fail_realloc;
374 if (mdsc->sessions) {
375 memcpy(sa, mdsc->sessions,
376 mdsc->max_sessions * sizeof(void *));
377 kfree(mdsc->sessions);
378 }
379 mdsc->sessions = sa;
380 mdsc->max_sessions = newmax;
381 }
382 mdsc->sessions[mds] = s;
383 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
384
385 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
386
387 return s;
388
389fail_realloc:
390 kfree(s);
391 return ERR_PTR(-ENOMEM);
392}
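A note on the resize above: get_count_order(mds + 1) gives the order of the next power of two, so the sessions array grows geometrically; e.g. registering mds 5 yields newmax = 1 << get_count_order(6) = 8.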
393
394/*
395 * called under mdsc->mutex
396 */
397static void __unregister_session(struct ceph_mds_client *mdsc,
398 struct ceph_mds_session *s)
399{
400 dout("__unregister_session mds%d %p\n", s->s_mds, s);
401 BUG_ON(mdsc->sessions[s->s_mds] != s);
402 mdsc->sessions[s->s_mds] = NULL;
403 ceph_con_close(&s->s_con);
404 ceph_put_mds_session(s);
405}
406
407/*
408 * drop session refs in request.
409 *
410 * should be last request ref, or hold mdsc->mutex
411 */
412static void put_request_session(struct ceph_mds_request *req)
413{
414 if (req->r_session) {
415 ceph_put_mds_session(req->r_session);
416 req->r_session = NULL;
417 }
418}
419
420void ceph_mdsc_release_request(struct kref *kref)
421{
422 struct ceph_mds_request *req = container_of(kref,
423 struct ceph_mds_request,
424 r_kref);
425 if (req->r_request)
426 ceph_msg_put(req->r_request);
427 if (req->r_reply) {
428 ceph_msg_put(req->r_reply);
429 destroy_reply_info(&req->r_reply_info);
430 }
431 if (req->r_inode) {
432 ceph_put_cap_refs(ceph_inode(req->r_inode),
433 CEPH_CAP_PIN);
434 iput(req->r_inode);
435 }
436 if (req->r_locked_dir)
437 ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
438 CEPH_CAP_PIN);
439 if (req->r_target_inode)
440 iput(req->r_target_inode);
441 if (req->r_dentry)
442 dput(req->r_dentry);
443 if (req->r_old_dentry) {
444 ceph_put_cap_refs(
445 ceph_inode(req->r_old_dentry->d_parent->d_inode),
446 CEPH_CAP_PIN);
447 dput(req->r_old_dentry);
448 }
449 kfree(req->r_path1);
450 kfree(req->r_path2);
451 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation);
453 kfree(req);
454}
455
456/*
 457 * lookup request, bump ref if found.
458 *
459 * called under mdsc->mutex.
460 */
461static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
462 u64 tid)
463{
464 struct ceph_mds_request *req;
465 struct rb_node *n = mdsc->request_tree.rb_node;
466
467 while (n) {
468 req = rb_entry(n, struct ceph_mds_request, r_node);
469 if (tid < req->r_tid)
470 n = n->rb_left;
471 else if (tid > req->r_tid)
472 n = n->rb_right;
473 else {
474 ceph_mdsc_get_request(req);
475 return req;
476 }
477 }
478 return NULL;
479}
480
481static void __insert_request(struct ceph_mds_client *mdsc,
482 struct ceph_mds_request *new)
483{
484 struct rb_node **p = &mdsc->request_tree.rb_node;
485 struct rb_node *parent = NULL;
486 struct ceph_mds_request *req = NULL;
487
488 while (*p) {
489 parent = *p;
490 req = rb_entry(parent, struct ceph_mds_request, r_node);
491 if (new->r_tid < req->r_tid)
492 p = &(*p)->rb_left;
493 else if (new->r_tid > req->r_tid)
494 p = &(*p)->rb_right;
495 else
496 BUG();
497 }
498
499 rb_link_node(&new->r_node, parent, p);
500 rb_insert_color(&new->r_node, &mdsc->request_tree);
501}
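This is the standard kernel rbtree insertion idiom: walk down from the root comparing tids to find the null leaf link, then rb_link_node() splices the new node in and rb_insert_color() rebalances. Duplicate tids cannot occur here, since tids are handed out monotonically in __register_request(), hence the BUG().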
502
503/*
 504 * Register an in-flight request, and assign a tid. Link to the
 505 * directory we are modifying (if any).
506 *
507 * Called under mdsc->mutex.
508 */
509static void __register_request(struct ceph_mds_client *mdsc,
510 struct ceph_mds_request *req,
511 struct inode *dir)
512{
513 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req);
519
520 if (dir) {
521 struct ceph_inode_info *ci = ceph_inode(dir);
522
523 spin_lock(&ci->i_unsafe_lock);
524 req->r_unsafe_dir = dir;
525 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
526 spin_unlock(&ci->i_unsafe_lock);
527 }
528}
529
530static void __unregister_request(struct ceph_mds_client *mdsc,
531 struct ceph_mds_request *req)
532{
533 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
534 rb_erase(&req->r_node, &mdsc->request_tree);
535 RB_CLEAR_NODE(&req->r_node);
536
537 if (req->r_unsafe_dir) {
538 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
539
540 spin_lock(&ci->i_unsafe_lock);
541 list_del_init(&req->r_unsafe_dir_item);
542 spin_unlock(&ci->i_unsafe_lock);
543 }
544
545 ceph_mdsc_put_request(req);
546}
547
548/*
549 * Choose mds to send request to next. If there is a hint set in the
550 * request (e.g., due to a prior forward hint from the mds), use that.
551 * Otherwise, consult frag tree and/or caps to identify the
552 * appropriate mds. If all else fails, choose randomly.
553 *
554 * Called under mdsc->mutex.
555 */
556static int __choose_mds(struct ceph_mds_client *mdsc,
557 struct ceph_mds_request *req)
558{
559 struct inode *inode;
560 struct ceph_inode_info *ci;
561 struct ceph_cap *cap;
562 int mode = req->r_direct_mode;
563 int mds = -1;
564 u32 hash = req->r_direct_hash;
565 bool is_hash = req->r_direct_is_hash;
566
567 /*
568 * is there a specific mds we should try? ignore hint if we have
569 * no session and the mds is not up (active or recovering).
570 */
571 if (req->r_resend_mds >= 0 &&
572 (__have_session(mdsc, req->r_resend_mds) ||
573 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
574 dout("choose_mds using resend_mds mds%d\n",
575 req->r_resend_mds);
576 return req->r_resend_mds;
577 }
578
579 if (mode == USE_RANDOM_MDS)
580 goto random;
581
582 inode = NULL;
583 if (req->r_inode) {
584 inode = req->r_inode;
585 } else if (req->r_dentry) {
586 if (req->r_dentry->d_inode) {
587 inode = req->r_dentry->d_inode;
588 } else {
589 inode = req->r_dentry->d_parent->d_inode;
590 hash = req->r_dentry->d_name.hash;
591 is_hash = true;
592 }
593 }
594 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
595 (int)hash, mode);
596 if (!inode)
597 goto random;
598 ci = ceph_inode(inode);
599
600 if (is_hash && S_ISDIR(inode->i_mode)) {
601 struct ceph_inode_frag frag;
602 int found;
603
604 ceph_choose_frag(ci, hash, &frag, &found);
605 if (found) {
606 if (mode == USE_ANY_MDS && frag.ndist > 0) {
607 u8 r;
608
609 /* choose a random replica */
610 get_random_bytes(&r, 1);
611 r %= frag.ndist;
612 mds = frag.dist[r];
613 dout("choose_mds %p %llx.%llx "
614 "frag %u mds%d (%d/%d)\n",
615 inode, ceph_vinop(inode),
616 frag.frag, frag.mds,
617 (int)r, frag.ndist);
618 return mds;
619 }
620
621 /* since this file/dir wasn't known to be
 622		 * replicated, we want to look for the
623 * authoritative mds. */
624 mode = USE_AUTH_MDS;
625 if (frag.mds >= 0) {
626 /* choose auth mds */
627 mds = frag.mds;
628 dout("choose_mds %p %llx.%llx "
629 "frag %u mds%d (auth)\n",
630 inode, ceph_vinop(inode), frag.frag, mds);
631 return mds;
632 }
633 }
634 }
635
636 spin_lock(&inode->i_lock);
637 cap = NULL;
638 if (mode == USE_AUTH_MDS)
639 cap = ci->i_auth_cap;
640 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
641 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
642 if (!cap) {
643 spin_unlock(&inode->i_lock);
644 goto random;
645 }
646 mds = cap->session->s_mds;
647 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
648 inode, ceph_vinop(inode), mds,
649 cap == ci->i_auth_cap ? "auth " : "", cap);
650 spin_unlock(&inode->i_lock);
651 return mds;
652
653random:
654 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
655 dout("choose_mds chose random mds%d\n", mds);
656 return mds;
657}
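To summarize the selection order above: (1) honor a resend/forward hint if that mds is usable; (2) USE_RANDOM_MDS short-circuits to a random mds; (3) for hashed names in a directory, consult the frag tree, picking a random replica under USE_ANY_MDS or the authoritative mds otherwise; (4) fall back to an mds we already hold a cap from (the auth cap if USE_AUTH_MDS); (5) failing all that, pick randomly from the mdsmap.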
658
659
660/*
661 * session messages
662 */
663static struct ceph_msg *create_session_msg(u32 op, u64 seq)
664{
665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h;
667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
669 if (IS_ERR(msg)) {
670 pr_err("create_session_msg ENOMEM creating msg\n");
 671		return msg; /* already an ERR_PTR */
672 }
673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op);
675 h->seq = cpu_to_le64(seq);
676 return msg;
677}
678
679/*
680 * send session open request.
681 *
682 * called under mdsc->mutex
683 */
684static int __open_session(struct ceph_mds_client *mdsc,
685 struct ceph_mds_session *session)
686{
687 struct ceph_msg *msg;
688 int mstate;
689 int mds = session->s_mds;
690 int err = 0;
691
692 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
694 dout("open_session to mds%d (%s)\n", mds,
695 ceph_mds_state_name(mstate));
696 session->s_state = CEPH_MDS_SESSION_OPENING;
697 session->s_renew_requested = jiffies;
698
699 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) {
702 err = PTR_ERR(msg);
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg);
706
707out:
 708	return err;
709}
710
711/*
712 * session caps
713 */
714
715/*
716 * Free preallocated cap messages assigned to this session
717 */
718static void cleanup_cap_releases(struct ceph_mds_session *session)
719{
720 struct ceph_msg *msg;
721
722 spin_lock(&session->s_cap_lock);
723 while (!list_empty(&session->s_cap_releases)) {
724 msg = list_first_entry(&session->s_cap_releases,
725 struct ceph_msg, list_head);
726 list_del_init(&msg->list_head);
727 ceph_msg_put(msg);
728 }
729 while (!list_empty(&session->s_cap_releases_done)) {
730 msg = list_first_entry(&session->s_cap_releases_done,
731 struct ceph_msg, list_head);
732 list_del_init(&msg->list_head);
733 ceph_msg_put(msg);
734 }
735 spin_unlock(&session->s_cap_lock);
736}
737
738/*
739 * Helper to safely iterate over all caps associated with a session.
740 *
741 * caller must hold session s_mutex
742 */
743static int iterate_session_caps(struct ceph_mds_session *session,
744 int (*cb)(struct inode *, struct ceph_cap *,
745 void *), void *arg)
746{
747 struct list_head *p;
748 struct ceph_cap *cap;
749 struct inode *inode, *last_inode = NULL;
750 struct ceph_cap *old_cap = NULL;
751 int ret;
752
753 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
754 spin_lock(&session->s_cap_lock);
755 p = session->s_caps.next;
756 while (p != &session->s_caps) {
757 cap = list_entry(p, struct ceph_cap, session_caps);
758 inode = igrab(&cap->ci->vfs_inode);
759 if (!inode) {
760 p = p->next;
761 continue;
762 }
763 session->s_cap_iterator = cap;
764 spin_unlock(&session->s_cap_lock);
765
766 if (last_inode) {
767 iput(last_inode);
768 last_inode = NULL;
769 }
770 if (old_cap) {
771 ceph_put_cap(old_cap);
772 old_cap = NULL;
773 }
774
775 ret = cb(inode, cap, arg);
776 last_inode = inode;
777
778 spin_lock(&session->s_cap_lock);
779 p = p->next;
780 if (cap->ci == NULL) {
781 dout("iterate_session_caps finishing cap %p removal\n",
782 cap);
783 BUG_ON(cap->session != session);
784 list_del_init(&cap->session_caps);
785 session->s_nr_caps--;
786 cap->session = NULL;
787 old_cap = cap; /* put_cap it w/o locks held */
788 }
789 if (ret < 0)
790 goto out;
791 }
792 ret = 0;
793out:
794 session->s_cap_iterator = NULL;
795 spin_unlock(&session->s_cap_lock);
796
797 if (last_inode)
798 iput(last_inode);
799 if (old_cap)
800 ceph_put_cap(old_cap);
801
802 return ret;
803}
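The last_inode/old_cap juggling above exists because iput() may sleep and so must not be called under the s_cap_lock spinlock; ceph_put_cap() is deferred for the same reason (see the "put_cap it w/o locks held" note). Each iteration therefore releases the previous entry's references only after the lock has been dropped, while s_cap_iterator pins the current position so removals elsewhere cannot invalidate it.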
804
805static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
806 void *arg)
807{
808 struct ceph_inode_info *ci = ceph_inode(inode);
809 dout("removing cap %p, ci is %p, inode is %p\n",
810 cap, ci, &ci->vfs_inode);
811 ceph_remove_cap(cap);
812 return 0;
813}
814
815/*
816 * caller must hold session s_mutex
817 */
818static void remove_session_caps(struct ceph_mds_session *session)
819{
820 dout("remove_session_caps on %p\n", session);
821 iterate_session_caps(session, remove_session_caps_cb, NULL);
822 BUG_ON(session->s_nr_caps > 0);
823 cleanup_cap_releases(session);
824}
825
826/*
 827 * wake up any threads waiting on this session's caps. if we are
 828 * reconnecting, also clear stale max_size state so it is re-requested.
829 *
830 * caller must hold s_mutex.
831 */
832static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
833 void *arg)
834{
835 struct ceph_inode_info *ci = ceph_inode(inode);
836
837 wake_up(&ci->i_cap_wq);
838 if (arg) {
839 spin_lock(&inode->i_lock);
840 ci->i_wanted_max_size = 0;
841 ci->i_requested_max_size = 0;
842 spin_unlock(&inode->i_lock);
843 }
844 return 0;
845}
846
847static void wake_up_session_caps(struct ceph_mds_session *session,
848 int reconnect)
849{
850 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
851 iterate_session_caps(session, wake_up_session_cb,
852 (void *)(unsigned long)reconnect);
853}
854
855/*
856 * Send periodic message to MDS renewing all currently held caps. The
857 * ack will reset the expiration for all caps from this session.
858 *
859 * caller holds s_mutex
860 */
861static int send_renew_caps(struct ceph_mds_client *mdsc,
862 struct ceph_mds_session *session)
863{
864 struct ceph_msg *msg;
865 int state;
866
867 if (time_after_eq(jiffies, session->s_cap_ttl) &&
868 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
869 pr_info("mds%d caps stale\n", session->s_mds);
870 session->s_renew_requested = jiffies;
871
872 /* do not try to renew caps until a recovering mds has reconnected
873 * with its clients. */
874 state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
875 if (state < CEPH_MDS_STATE_RECONNECT) {
876 dout("send_renew_caps ignoring mds%d (%s)\n",
877 session->s_mds, ceph_mds_state_name(state));
878 return 0;
879 }
880
881 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
882 ceph_mds_state_name(state));
883 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
884 ++session->s_renew_seq);
885 if (IS_ERR(msg))
886 return PTR_ERR(msg);
887 ceph_con_send(&session->s_con, msg);
888 return 0;
889}
890
891/*
892 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
893 *
894 * Called under session->s_mutex
895 */
896static void renewed_caps(struct ceph_mds_client *mdsc,
897 struct ceph_mds_session *session, int is_renew)
898{
899 int was_stale;
900 int wake = 0;
901
902 spin_lock(&session->s_cap_lock);
903 was_stale = is_renew && (session->s_cap_ttl == 0 ||
904 time_after_eq(jiffies, session->s_cap_ttl));
905
906 session->s_cap_ttl = session->s_renew_requested +
907 mdsc->mdsmap->m_session_timeout*HZ;
908
909 if (was_stale) {
910 if (time_before(jiffies, session->s_cap_ttl)) {
911 pr_info("mds%d caps renewed\n", session->s_mds);
912 wake = 1;
913 } else {
914 pr_info("mds%d caps still stale\n", session->s_mds);
915 }
916 }
917 dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
918 session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
 919	     time_before(jiffies, session->s_cap_ttl) ? "fresh" : "stale");
920 spin_unlock(&session->s_cap_lock);
921
922 if (wake)
923 wake_up_session_caps(session, 0);
924}
925
926/*
927 * send a session close request
928 */
929static int request_close_session(struct ceph_mds_client *mdsc,
930 struct ceph_mds_session *session)
931{
932 struct ceph_msg *msg;
933 int err = 0;
934
935 dout("request_close_session mds%d state %s seq %lld\n",
936 session->s_mds, session_state_name(session->s_state),
937 session->s_seq);
938 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
939 if (IS_ERR(msg))
940 err = PTR_ERR(msg);
941 else
942 ceph_con_send(&session->s_con, msg);
943 return err;
944}
945
946/*
947 * Called with s_mutex held.
948 */
949static int __close_session(struct ceph_mds_client *mdsc,
950 struct ceph_mds_session *session)
951{
952 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
953 return 0;
954 session->s_state = CEPH_MDS_SESSION_CLOSING;
955 return request_close_session(mdsc, session);
956}
957
958/*
959 * Trim old(er) caps.
960 *
961 * Because we can't cache an inode without one or more caps, we do
962 * this indirectly: if a cap is unused, we prune its aliases, at which
 963 * point the inode will hopefully get dropped too.
964 *
965 * Yes, this is a bit sloppy. Our only real goal here is to respond to
966 * memory pressure from the MDS, though, so it needn't be perfect.
967 */
968static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
969{
970 struct ceph_mds_session *session = arg;
971 struct ceph_inode_info *ci = ceph_inode(inode);
972 int used, oissued, mine;
973
974 if (session->s_trim_caps <= 0)
975 return -1;
976
977 spin_lock(&inode->i_lock);
978 mine = cap->issued | cap->implemented;
979 used = __ceph_caps_used(ci);
980 oissued = __ceph_caps_issued_other(ci, cap);
981
982 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
983 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
984 ceph_cap_string(used));
985 if (ci->i_dirty_caps)
986 goto out; /* dirty caps */
987 if ((used & ~oissued) & mine)
988 goto out; /* we need these caps */
989
990 session->s_trim_caps--;
991 if (oissued) {
992 /* we aren't the only cap.. just remove us */
993 __ceph_remove_cap(cap);
994 } else {
995 /* try to drop referring dentries */
996 spin_unlock(&inode->i_lock);
997 d_prune_aliases(inode);
998 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
999 inode, cap, atomic_read(&inode->i_count));
1000 return 0;
1001 }
1002
1003out:
1004 spin_unlock(&inode->i_lock);
1005 return 0;
1006}
1007
1008/*
1009 * Trim session cap count down to some max number.
1010 */
1011static int trim_caps(struct ceph_mds_client *mdsc,
1012 struct ceph_mds_session *session,
1013 int max_caps)
1014{
1015 int trim_caps = session->s_nr_caps - max_caps;
1016
1017 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1018 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1019 if (trim_caps > 0) {
1020 session->s_trim_caps = trim_caps;
1021 iterate_session_caps(session, trim_caps_cb, session);
1022 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1023 session->s_mds, session->s_nr_caps, max_caps,
1024 trim_caps - session->s_trim_caps);
1025 session->s_trim_caps = 0;
1026 }
1027 return 0;
1028}
1029
1030/*
1031 * Allocate cap_release messages. If there is a partially full message
 1032 * in the queue, try to allocate enough to cover its remainder, so that
1033 * we can send it immediately.
1034 *
1035 * Called under s_mutex.
1036 */
1037static int add_cap_releases(struct ceph_mds_client *mdsc,
1038 struct ceph_mds_session *session,
1039 int extra)
1040{
1041 struct ceph_msg *msg;
1042 struct ceph_mds_cap_release *head;
1043 int err = -ENOMEM;
1044
1045 if (extra < 0)
1046 extra = mdsc->client->mount_args->cap_release_safety;
1047
1048 spin_lock(&session->s_cap_lock);
1049
1050 if (!list_empty(&session->s_cap_releases)) {
1051 msg = list_first_entry(&session->s_cap_releases,
1052 struct ceph_msg,
1053 list_head);
1054 head = msg->front.iov_base;
1055 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1056 }
1057
1058 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1059 spin_unlock(&session->s_cap_lock);
1060 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1061 0, 0, NULL);
1062 if (!msg)
1063 goto out_unlocked;
1064 dout("add_cap_releases %p msg %p now %d\n", session, msg,
1065 (int)msg->front.iov_len);
1066 head = msg->front.iov_base;
1067 head->num = cpu_to_le32(0);
1068 msg->front.iov_len = sizeof(*head);
1069 spin_lock(&session->s_cap_lock);
1070 list_add(&msg->list_head, &session->s_cap_releases);
1071 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1072 }
1073
1074 if (!list_empty(&session->s_cap_releases)) {
1075 msg = list_first_entry(&session->s_cap_releases,
1076 struct ceph_msg,
1077 list_head);
1078 head = msg->front.iov_base;
1079 if (head->num) {
1080 dout(" queueing non-full %p (%d)\n", msg,
1081 le32_to_cpu(head->num));
1082 list_move_tail(&msg->list_head,
1083 &session->s_cap_releases_done);
1084 session->s_num_cap_releases -=
1085 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1086 }
1087 }
1088 err = 0;
1089 spin_unlock(&session->s_cap_lock);
1090out_unlocked:
1091 return err;
1092}
1093
1094/*
 1095 * check whether all dirty inode data has been flushed to disk.
1096 *
1097 * returns true if we've flushed through want_flush_seq
1098 */
1099static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1100{
1101 int mds, ret = 1;
1102
1103 dout("check_cap_flush want %lld\n", want_flush_seq);
1104 mutex_lock(&mdsc->mutex);
1105 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
1106 struct ceph_mds_session *session = mdsc->sessions[mds];
1107
1108 if (!session)
1109 continue;
1110 get_session(session);
1111 mutex_unlock(&mdsc->mutex);
1112
1113 mutex_lock(&session->s_mutex);
1114 if (!list_empty(&session->s_cap_flushing)) {
1115 struct ceph_inode_info *ci =
1116 list_entry(session->s_cap_flushing.next,
1117 struct ceph_inode_info,
1118 i_flushing_item);
1119 struct inode *inode = &ci->vfs_inode;
1120
1121 spin_lock(&inode->i_lock);
1122 if (ci->i_cap_flush_seq <= want_flush_seq) {
1123 dout("check_cap_flush still flushing %p "
1124 "seq %lld <= %lld to mds%d\n", inode,
1125 ci->i_cap_flush_seq, want_flush_seq,
1126 session->s_mds);
1127 ret = 0;
1128 }
1129 spin_unlock(&inode->i_lock);
1130 }
1131 mutex_unlock(&session->s_mutex);
1132 ceph_put_mds_session(session);
1133
1134 if (!ret)
1135 return ret;
1136 mutex_lock(&mdsc->mutex);
1137 }
1138
1139 mutex_unlock(&mdsc->mutex);
1140 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1141 return ret;
1142}
1143
1144/*
1145 * called under s_mutex
1146 */
1147static void send_cap_releases(struct ceph_mds_client *mdsc,
1148 struct ceph_mds_session *session)
1149{
1150 struct ceph_msg *msg;
1151
1152 dout("send_cap_releases mds%d\n", session->s_mds);
1153 while (1) {
1154 spin_lock(&session->s_cap_lock);
1155 if (list_empty(&session->s_cap_releases_done))
1156 break;
1157 msg = list_first_entry(&session->s_cap_releases_done,
1158 struct ceph_msg, list_head);
1159 list_del_init(&msg->list_head);
1160 spin_unlock(&session->s_cap_lock);
1161 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1162 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1163 ceph_con_send(&session->s_con, msg);
1164 }
1165 spin_unlock(&session->s_cap_lock);
1166}
1167
1168/*
1169 * requests
1170 */
1171
1172/*
1173 * Create an mds request.
1174 */
1175struct ceph_mds_request *
1176ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1177{
1178 struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1179
1180 if (!req)
1181 return ERR_PTR(-ENOMEM);
1182
1183 req->r_started = jiffies;
1184 req->r_resend_mds = -1;
1185 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1186 req->r_fmode = -1;
1187 kref_init(&req->r_kref);
1188 INIT_LIST_HEAD(&req->r_wait);
1189 init_completion(&req->r_completion);
1190 init_completion(&req->r_safe_completion);
1191 INIT_LIST_HEAD(&req->r_unsafe_item);
1192
1193 req->r_op = op;
1194 req->r_direct_mode = mode;
1195 return req;
1196}
1197
1198/*
 1199 * return the oldest (lowest-tid) request in the request tree; NULL/0 if none.
1200 *
1201 * called under mdsc->mutex.
1202 */
1203static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1204{
1205 if (RB_EMPTY_ROOT(&mdsc->request_tree))
1206 return NULL;
1207 return rb_entry(rb_first(&mdsc->request_tree),
1208 struct ceph_mds_request, r_node);
1209}
1210
1211static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1212{
1213 struct ceph_mds_request *req = __get_oldest_req(mdsc);
1214
1215 if (req)
1216 return req->r_tid;
1217 return 0;
1218}
1219
1220/*
1221 * Build a dentry's path. Allocate on heap; caller must kfree. Based
1222 * on build_path_from_dentry in fs/cifs/dir.c.
1223 *
1224 * If @stop_on_nosnap, generate path relative to the first non-snapped
1225 * inode.
1226 *
1227 * Encode hidden .snap dirs as a double /, i.e.
1228 * foo/.snap/bar -> foo//bar
1229 */
1230char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1231 int stop_on_nosnap)
1232{
1233 struct dentry *temp;
1234 char *path;
1235 int len, pos;
1236
1237 if (dentry == NULL)
1238 return ERR_PTR(-EINVAL);
1239
1240retry:
1241 len = 0;
1242 for (temp = dentry; !IS_ROOT(temp);) {
1243 struct inode *inode = temp->d_inode;
1244 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1245 len++; /* slash only */
1246 else if (stop_on_nosnap && inode &&
1247 ceph_snap(inode) == CEPH_NOSNAP)
1248 break;
1249 else
1250 len += 1 + temp->d_name.len;
1251 temp = temp->d_parent;
1252 if (temp == NULL) {
1253 pr_err("build_path_dentry corrupt dentry %p\n", dentry);
1254 return ERR_PTR(-EINVAL);
1255 }
1256 }
1257 if (len)
1258 len--; /* no leading '/' */
1259
1260 path = kmalloc(len+1, GFP_NOFS);
1261 if (path == NULL)
1262 return ERR_PTR(-ENOMEM);
1263 pos = len;
1264 path[pos] = 0; /* trailing null */
1265 for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1266 struct inode *inode = temp->d_inode;
1267
1268 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1269 dout("build_path_dentry path+%d: %p SNAPDIR\n",
1270 pos, temp);
1271 } else if (stop_on_nosnap && inode &&
1272 ceph_snap(inode) == CEPH_NOSNAP) {
1273 break;
1274 } else {
1275 pos -= temp->d_name.len;
1276 if (pos < 0)
1277 break;
1278 strncpy(path + pos, temp->d_name.name,
1279 temp->d_name.len);
1280 dout("build_path_dentry path+%d: %p '%.*s'\n",
1281 pos, temp, temp->d_name.len, path + pos);
1282 }
1283 if (pos)
1284 path[--pos] = '/';
1285 temp = temp->d_parent;
1286 if (temp == NULL) {
1287 pr_err("build_path_dentry corrupt dentry\n");
1288 kfree(path);
1289 return ERR_PTR(-EINVAL);
1290 }
1291 }
1292 if (pos != 0) {
1293 pr_err("build_path_dentry did not end path lookup where "
1294 "expected, namelen is %d, pos is %d\n", len, pos);
1295 /* presumably this is only possible if racing with a
 1296		   rename of one of the parent directories (we cannot
1297 lock the dentries above us to prevent this, but
1298 retrying should be harmless) */
1299 kfree(path);
1300 goto retry;
1301 }
1302
1303 *base = ceph_ino(temp->d_inode);
1304 *plen = len;
1305 dout("build_path_dentry on %p %d built %llx '%.*s'\n",
1306 dentry, atomic_read(&dentry->d_count), *base, len, path);
1307 return path;
1308}
1309
1310static int build_dentry_path(struct dentry *dentry,
1311 const char **ppath, int *ppathlen, u64 *pino,
1312 int *pfreepath)
1313{
1314 char *path;
1315
1316 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
1317 *pino = ceph_ino(dentry->d_parent->d_inode);
1318 *ppath = dentry->d_name.name;
1319 *ppathlen = dentry->d_name.len;
1320 return 0;
1321 }
1322 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1323 if (IS_ERR(path))
1324 return PTR_ERR(path);
1325 *ppath = path;
1326 *pfreepath = 1;
1327 return 0;
1328}
1329
1330static int build_inode_path(struct inode *inode,
1331 const char **ppath, int *ppathlen, u64 *pino,
1332 int *pfreepath)
1333{
1334 struct dentry *dentry;
1335 char *path;
1336
1337 if (ceph_snap(inode) == CEPH_NOSNAP) {
1338 *pino = ceph_ino(inode);
1339 *ppathlen = 0;
1340 return 0;
1341 }
1342 dentry = d_find_alias(inode);
1343 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1344 dput(dentry);
1345 if (IS_ERR(path))
1346 return PTR_ERR(path);
1347 *ppath = path;
1348 *pfreepath = 1;
1349 return 0;
1350}
1351
1352/*
1353 * request arguments may be specified via an inode *, a dentry *, or
1354 * an explicit ino+path.
1355 */
1356static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1357 const char *rpath, u64 rino,
1358 const char **ppath, int *pathlen,
1359 u64 *ino, int *freepath)
1360{
1361 int r = 0;
1362
1363 if (rinode) {
1364 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1365 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1366 ceph_snap(rinode));
1367 } else if (rdentry) {
1368 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1369 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1370 *ppath);
1371 } else if (rpath) {
1372 *ino = rino;
1373 *ppath = rpath;
1374 *pathlen = strlen(rpath);
1375 dout(" path %.*s\n", *pathlen, rpath);
1376 }
1377
1378 return r;
1379}
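/*
 * For illustration, the three mutually exclusive ways a caller can name
 * the target (hypothetical arguments):
 *
 *	set_request_path_attr(inode, NULL, NULL, 0, ...)      by inode
 *	set_request_path_attr(NULL, dentry, NULL, 0, ...)     by dentry
 *	set_request_path_attr(NULL, NULL, "a/b", ino, ...)    by ino+path
 */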
1380
1381/*
1382 * called under mdsc->mutex
1383 */
1384static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1385 struct ceph_mds_request *req,
1386 int mds)
1387{
1388 struct ceph_msg *msg;
1389 struct ceph_mds_request_head *head;
1390 const char *path1 = NULL;
1391 const char *path2 = NULL;
1392 u64 ino1 = 0, ino2 = 0;
1393 int pathlen1 = 0, pathlen2 = 0;
1394 int freepath1 = 0, freepath2 = 0;
1395 int len;
1396 u16 releases;
1397 void *p, *end;
1398 int ret;
1399
1400 ret = set_request_path_attr(req->r_inode, req->r_dentry,
1401 req->r_path1, req->r_ino1.ino,
1402 &path1, &pathlen1, &ino1, &freepath1);
1403 if (ret < 0) {
1404 msg = ERR_PTR(ret);
1405 goto out;
1406 }
1407
1408 ret = set_request_path_attr(NULL, req->r_old_dentry,
1409 req->r_path2, req->r_ino2.ino,
1410 &path2, &pathlen2, &ino2, &freepath2);
1411 if (ret < 0) {
1412 msg = ERR_PTR(ret);
1413 goto out_free1;
1414 }
1415
1416 len = sizeof(*head) +
1417 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
1418
1419 /* calculate (max) length for cap releases */
1420 len += sizeof(struct ceph_mds_request_release) *
1421 (!!req->r_inode_drop + !!req->r_dentry_drop +
1422 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1423 if (req->r_dentry_drop)
1424 len += req->r_dentry->d_name.len;
1425 if (req->r_old_dentry_drop)
1426 len += req->r_old_dentry->d_name.len;
1427
1428 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
1429 if (IS_ERR(msg))
1430 goto out_free2;
1431
1432 msg->hdr.tid = cpu_to_le64(req->r_tid);
1433
1434 head = msg->front.iov_base;
1435 p = msg->front.iov_base + sizeof(*head);
1436 end = msg->front.iov_base + msg->front.iov_len;
1437
1438 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1439 head->op = cpu_to_le32(req->r_op);
1440 head->caller_uid = cpu_to_le32(current_fsuid());
1441 head->caller_gid = cpu_to_le32(current_fsgid());
1442 head->args = req->r_args;
1443
1444 ceph_encode_filepath(&p, end, ino1, path1);
1445 ceph_encode_filepath(&p, end, ino2, path2);
1446
1447 /* cap releases */
1448 releases = 0;
1449 if (req->r_inode_drop)
1450 releases += ceph_encode_inode_release(&p,
1451 req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1452 mds, req->r_inode_drop, req->r_inode_unless, 0);
1453 if (req->r_dentry_drop)
1454 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1455 mds, req->r_dentry_drop, req->r_dentry_unless);
1456 if (req->r_old_dentry_drop)
1457 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1458 mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1459 if (req->r_old_inode_drop)
1460 releases += ceph_encode_inode_release(&p,
1461 req->r_old_dentry->d_inode,
1462 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1463 head->num_releases = cpu_to_le16(releases);
1464
1465 BUG_ON(p > end);
1466 msg->front.iov_len = p - msg->front.iov_base;
1467 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1468
1469 msg->pages = req->r_pages;
1470 msg->nr_pages = req->r_num_pages;
1471 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1472 msg->hdr.data_off = cpu_to_le16(0);
1473
1474out_free2:
1475 if (freepath2)
1476 kfree((char *)path2);
1477out_free1:
1478 if (freepath1)
1479 kfree((char *)path1);
1480out:
1481 return msg;
1482}
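/*
 * Sketch of the message front as assembled above, in encoding order
 * (field sizes inferred from the length computation above):
 *
 *	struct ceph_mds_request_head head;
 *	filepath1:  u8 version, u64 ino1, u32 len, char path1[len]
 *	filepath2:  u8 version, u64 ino2, u32 len, char path2[len]
 *	0-4 x struct ceph_mds_request_release (plus dentry names)
 */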
1483
1484/*
 1485 * called under mdsc->mutex on error; called with no mutex held on
 1486 * success.
1487 */
1488static void complete_request(struct ceph_mds_client *mdsc,
1489 struct ceph_mds_request *req)
1490{
1491 if (req->r_callback)
1492 req->r_callback(mdsc, req);
1493 else
1494 complete(&req->r_completion);
1495}
1496
1497/*
1498 * called under mdsc->mutex
1499 */
1500static int __prepare_send_request(struct ceph_mds_client *mdsc,
1501 struct ceph_mds_request *req,
1502 int mds)
1503{
1504 struct ceph_mds_request_head *rhead;
1505 struct ceph_msg *msg;
1506 int flags = 0;
1507
1508 req->r_mds = mds;
1509 req->r_attempts++;
1510 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1511 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1512
1513 if (req->r_request) {
1514 ceph_msg_put(req->r_request);
1515 req->r_request = NULL;
1516 }
1517 msg = create_request_message(mdsc, req, mds);
1518 if (IS_ERR(msg)) {
 1519		req->r_reply = ERR_CAST(msg);
 1520		complete_request(mdsc, req);
 1521		return PTR_ERR(msg);
1522 }
1523 req->r_request = msg;
1524
1525 rhead = msg->front.iov_base;
1526 rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1527 if (req->r_got_unsafe)
1528 flags |= CEPH_MDS_FLAG_REPLAY;
1529 if (req->r_locked_dir)
1530 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1531 rhead->flags = cpu_to_le32(flags);
1532 rhead->num_fwd = req->r_num_fwd;
1533 rhead->num_retry = req->r_attempts - 1;
1534
1535 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1536
1537 if (req->r_target_inode && req->r_got_unsafe)
1538 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1539 else
1540 rhead->ino = 0;
1541 return 0;
1542}
1543
1544/*
1545 * send request, or put it on the appropriate wait list.
1546 */
1547static int __do_request(struct ceph_mds_client *mdsc,
1548 struct ceph_mds_request *req)
1549{
1550 struct ceph_mds_session *session = NULL;
1551 int mds = -1;
1552 int err = -EAGAIN;
1553
1554 if (req->r_reply)
1555 goto out;
1556
1557 if (req->r_timeout &&
1558 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1559 dout("do_request timed out\n");
1560 err = -EIO;
1561 goto finish;
1562 }
1563
1564 mds = __choose_mds(mdsc, req);
1565 if (mds < 0 ||
1566 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1567 dout("do_request no mds or not active, waiting for map\n");
1568 list_add(&req->r_wait, &mdsc->waiting_for_map);
1569 goto out;
1570 }
1571
1572 /* get, open session */
1573 session = __ceph_lookup_mds_session(mdsc, mds);
1574 if (!session) {
1575 session = register_session(mdsc, mds);
1576 if (IS_ERR(session)) {
1577 err = PTR_ERR(session);
1578 goto finish;
1579 }
1580 }
1581 dout("do_request mds%d session %p state %s\n", mds, session,
1582 session_state_name(session->s_state));
1583 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1584 session->s_state != CEPH_MDS_SESSION_HUNG) {
1585 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1586 session->s_state == CEPH_MDS_SESSION_CLOSING)
1587 __open_session(mdsc, session);
1588 list_add(&req->r_wait, &session->s_waiting);
1589 goto out_session;
1590 }
1591
1592 /* send request */
1593 req->r_session = get_session(session);
1594 req->r_resend_mds = -1; /* forget any previous mds hint */
1595
1596 if (req->r_request_started == 0) /* note request start time */
1597 req->r_request_started = jiffies;
1598
1599 err = __prepare_send_request(mdsc, req, mds);
1600 if (!err) {
1601 ceph_msg_get(req->r_request);
1602 ceph_con_send(&session->s_con, req->r_request);
1603 }
1604
1605out_session:
1606 ceph_put_mds_session(session);
1607out:
1608 return err;
1609
1610finish:
1611 req->r_reply = ERR_PTR(err);
1612 complete_request(mdsc, req);
1613 goto out;
1614}
1615
1616/*
1617 * called under mdsc->mutex
1618 */
1619static void __wake_requests(struct ceph_mds_client *mdsc,
1620 struct list_head *head)
1621{
1622 struct ceph_mds_request *req, *nreq;
1623
1624 list_for_each_entry_safe(req, nreq, head, r_wait) {
1625 list_del_init(&req->r_wait);
1626 __do_request(mdsc, req);
1627 }
1628}
1629
1630/*
1631 * Wake up threads with requests pending for @mds, so that they can
1632 * resubmit their requests to a possibly different mds. If @all is set,
 1633 * wake them up if their requests have been forwarded to @mds, too.
1634 */
1635static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
1636{
1637 struct ceph_mds_request *req;
1638 struct rb_node *p;
1639
1640 dout("kick_requests mds%d\n", mds);
1641 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
1642 req = rb_entry(p, struct ceph_mds_request, r_node);
1643 if (req->r_got_unsafe)
1644 continue;
1645 if (req->r_session &&
1646 req->r_session->s_mds == mds) {
1647 dout(" kicking tid %llu\n", req->r_tid);
1648 put_request_session(req);
1649 __do_request(mdsc, req);
1650 }
1651 }
1652}
1653
1654void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
1655 struct ceph_mds_request *req)
1656{
1657 dout("submit_request on %p\n", req);
1658 mutex_lock(&mdsc->mutex);
1659 __register_request(mdsc, req, NULL);
1660 __do_request(mdsc, req);
1661 mutex_unlock(&mdsc->mutex);
1662}
1663
1664/*
 1665 * Synchronously perform an mds request, taking care of all of the
 1666 * session setup, forwarding, and retry details.
1667 */
1668int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1669 struct inode *dir,
1670 struct ceph_mds_request *req)
1671{
1672 int err;
1673
1674 dout("do_request on %p\n", req);
1675
1676 /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
1677 if (req->r_inode)
1678 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
1679 if (req->r_locked_dir)
1680 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1681 if (req->r_old_dentry)
1682 ceph_get_cap_refs(
1683 ceph_inode(req->r_old_dentry->d_parent->d_inode),
1684 CEPH_CAP_PIN);
1685
1686 /* issue */
1687 mutex_lock(&mdsc->mutex);
1688 __register_request(mdsc, req, dir);
1689 __do_request(mdsc, req);
1690
1691 /* wait */
1692 if (!req->r_reply) {
1693 mutex_unlock(&mdsc->mutex);
1694 if (req->r_timeout) {
1695 err = (long)wait_for_completion_interruptible_timeout(
1696 &req->r_completion, req->r_timeout);
1697 if (err == 0)
1698 req->r_reply = ERR_PTR(-EIO);
1699 else if (err < 0)
1700 req->r_reply = ERR_PTR(err);
1701 } else {
1702 err = wait_for_completion_interruptible(
1703 &req->r_completion);
1704 if (err)
1705 req->r_reply = ERR_PTR(err);
1706 }
1707 mutex_lock(&mdsc->mutex);
1708 }
1709
1710 if (IS_ERR(req->r_reply)) {
1711 err = PTR_ERR(req->r_reply);
1712 req->r_reply = NULL;
1713
1714 if (err == -ERESTARTSYS) {
1715 /* aborted */
1716 req->r_aborted = true;
1717
1718 if (req->r_locked_dir &&
1719 (req->r_op & CEPH_MDS_OP_WRITE)) {
1720 struct ceph_inode_info *ci =
1721 ceph_inode(req->r_locked_dir);
1722
1723 dout("aborted, clearing I_COMPLETE on %p\n",
1724 req->r_locked_dir);
1725 spin_lock(&req->r_locked_dir->i_lock);
1726 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1727 ci->i_release_count++;
1728 spin_unlock(&req->r_locked_dir->i_lock);
1729 }
1730 } else {
1731 /* clean up this request */
1732 __unregister_request(mdsc, req);
1733 if (!list_empty(&req->r_unsafe_item))
1734 list_del_init(&req->r_unsafe_item);
1735 complete(&req->r_safe_completion);
1736 }
1737 } else if (req->r_err) {
1738 err = req->r_err;
1739 } else {
1740 err = le32_to_cpu(req->r_reply_info.head->result);
1741 }
1742 mutex_unlock(&mdsc->mutex);
1743
1744 dout("do_request %p done, result %d\n", req, err);
1745 return err;
1746}
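/*
 * Putting the pieces together, a typical synchronous caller looks
 * roughly like this (hypothetical op; real callers also take the
 * appropriate inode references before setting r_inode):
 *
 *	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR,
 *				       USE_ANY_MDS);
 *	if (IS_ERR(req))
 *		return PTR_ERR(req);
 *	req->r_inode = inode;
 *	err = ceph_mdsc_do_request(mdsc, NULL, req);
 *	ceph_mdsc_put_request(req);
 */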
1747
1748/*
1749 * Handle mds reply.
1750 *
1751 * We take the session mutex and parse and process the reply immediately.
1752 * This preserves the logical ordering of replies, capabilities, etc., sent
1753 * by the MDS as they are applied to our local cache.
1754 */
1755static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1756{
1757 struct ceph_mds_client *mdsc = session->s_mdsc;
1758 struct ceph_mds_request *req;
1759 struct ceph_mds_reply_head *head = msg->front.iov_base;
1760 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
1761 u64 tid;
1762 int err, result;
1763 int mds = session->s_mds;
1764
1765 if (msg->front.iov_len < sizeof(*head)) {
1766 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
1767 ceph_msg_dump(msg);
1768 return;
1769 }
1770
1771 /* get request, session */
1772 tid = le64_to_cpu(msg->hdr.tid);
1773 mutex_lock(&mdsc->mutex);
1774 req = __lookup_request(mdsc, tid);
1775 if (!req) {
1776 dout("handle_reply on unknown tid %llu\n", tid);
1777 mutex_unlock(&mdsc->mutex);
1778 return;
1779 }
1780 dout("handle_reply %p\n", req);
1781
1782 /* correct session? */
1783 if (req->r_session != session) {
1784 pr_err("mdsc_handle_reply got %llu on session mds%d"
1785 " not mds%d\n", tid, session->s_mds,
1786 req->r_session ? req->r_session->s_mds : -1);
1787 mutex_unlock(&mdsc->mutex);
1788 goto out;
1789 }
1790
1791 /* dup? */
1792 if ((req->r_got_unsafe && !head->safe) ||
1793 (req->r_got_safe && head->safe)) {
1794 pr_warning("got a dup %s reply on %llu from mds%d\n",
1795 head->safe ? "safe" : "unsafe", tid, mds);
1796 mutex_unlock(&mdsc->mutex);
1797 goto out;
1798 }
1799
1800 result = le32_to_cpu(head->result);
1801
1802 /*
1803 * Tolerate 2 consecutive ESTALEs from the same mds.
1804 * FIXME: we should be looking at the cap migrate_seq.
1805 */
1806 if (result == -ESTALE) {
1807 req->r_direct_mode = USE_AUTH_MDS;
1808 req->r_num_stale++;
1809 if (req->r_num_stale <= 2) {
1810 __do_request(mdsc, req);
1811 mutex_unlock(&mdsc->mutex);
1812 goto out;
1813 }
1814 } else {
1815 req->r_num_stale = 0;
1816 }
1817
1818 if (head->safe) {
1819 req->r_got_safe = true;
1820 __unregister_request(mdsc, req);
1821 complete(&req->r_safe_completion);
1822
1823 if (req->r_got_unsafe) {
1824 /*
1825 * We already handled the unsafe response, now do the
1826 * cleanup. No need to examine the response; the MDS
1827 * doesn't include any result info in the safe
1828 * response. And even if it did, there is nothing
1829 * useful we could do with a revised return value.
1830 */
1831 dout("got safe reply %llu, mds%d\n", tid, mds);
1832 list_del_init(&req->r_unsafe_item);
1833
1834 /* last unsafe request during umount? */
1835 if (mdsc->stopping && !__get_oldest_req(mdsc))
1836 complete(&mdsc->safe_umount_waiters);
1837 mutex_unlock(&mdsc->mutex);
1838 goto out;
1839 }
1840 }
1841
1842 BUG_ON(req->r_reply);
1843
1844 if (!head->safe) {
1845 req->r_got_unsafe = true;
1846 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1847 }
1848
1849 dout("handle_reply tid %lld result %d\n", tid, result);
1850 rinfo = &req->r_reply_info;
1851 err = parse_reply_info(msg, rinfo);
1852 mutex_unlock(&mdsc->mutex);
1853
1854 mutex_lock(&session->s_mutex);
1855 if (err < 0) {
1856 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
1857 ceph_msg_dump(msg);
1858 goto out_err;
1859 }
1860
1861 /* snap trace */
1862 if (rinfo->snapblob_len) {
1863 down_write(&mdsc->snap_rwsem);
1864 ceph_update_snap_trace(mdsc, rinfo->snapblob,
1865 rinfo->snapblob + rinfo->snapblob_len,
1866 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
1867 downgrade_write(&mdsc->snap_rwsem);
1868 } else {
1869 down_read(&mdsc->snap_rwsem);
1870 }
1871
1872 /* insert trace into our cache */
1873 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1874 if (err == 0) {
1875 if (result == 0 && rinfo->dir_nr)
1876 ceph_readdir_prepopulate(req, req->r_session);
1877 ceph_unreserve_caps(&req->r_caps_reservation);
1878 }
1879
1880 up_read(&mdsc->snap_rwsem);
1881out_err:
1882 if (err) {
1883 req->r_err = err;
1884 } else {
1885 req->r_reply = msg;
1886 ceph_msg_get(msg);
1887 }
1888
1889 add_cap_releases(mdsc, req->r_session, -1);
1890 mutex_unlock(&session->s_mutex);
1891
1892 /* kick calling process */
1893 complete_request(mdsc, req);
1894out:
1895 ceph_mdsc_put_request(req);
1896 return;
1897}
1898
1899
1900
1901/*
1902 * handle mds notification that our request has been forwarded.
1903 */
1904static void handle_forward(struct ceph_mds_client *mdsc,
1905 struct ceph_mds_session *session,
1906 struct ceph_msg *msg)
1907{
1908 struct ceph_mds_request *req;
1909 u64 tid = le64_to_cpu(msg->hdr.tid);
1910 u32 next_mds;
1911 u32 fwd_seq;
1912 int err = -EINVAL;
1913 void *p = msg->front.iov_base;
1914 void *end = p + msg->front.iov_len;
1915
1916 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1917 next_mds = ceph_decode_32(&p);
1918 fwd_seq = ceph_decode_32(&p);
1919
1920 mutex_lock(&mdsc->mutex);
1921 req = __lookup_request(mdsc, tid);
1922 if (!req) {
1923 dout("forward %llu to mds%d - req dne\n", tid, next_mds);
1924 goto out; /* dup reply? */
1925 }
1926
1927 if (fwd_seq <= req->r_num_fwd) {
1928 dout("forward %llu to mds%d - old seq %d <= %d\n",
1929 tid, next_mds, req->r_num_fwd, fwd_seq);
1930 } else {
1931 /* resend. forward race not possible; mds would drop */
1932 dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
1933 req->r_num_fwd = fwd_seq;
1934 req->r_resend_mds = next_mds;
1935 put_request_session(req);
1936 __do_request(mdsc, req);
1937 }
1938 ceph_mdsc_put_request(req);
1939out:
1940 mutex_unlock(&mdsc->mutex);
1941 return;
1942
1943bad:
1944 pr_err("mdsc_handle_forward decode error err=%d\n", err);
1945}
1946
1947/*
1948 * handle a mds session control message
1949 */
1950static void handle_session(struct ceph_mds_session *session,
1951 struct ceph_msg *msg)
1952{
1953 struct ceph_mds_client *mdsc = session->s_mdsc;
1954 u32 op;
1955 u64 seq;
1956 int mds = session->s_mds;
1957 struct ceph_mds_session_head *h = msg->front.iov_base;
1958 int wake = 0;
1959
1960 /* decode */
1961 if (msg->front.iov_len != sizeof(*h))
1962 goto bad;
1963 op = le32_to_cpu(h->op);
1964 seq = le64_to_cpu(h->seq);
1965
1966 mutex_lock(&mdsc->mutex);
1967 if (op == CEPH_SESSION_CLOSE)
1968 __unregister_session(mdsc, session);
1969 /* FIXME: this ttl calculation is generous */
1970 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
1971 mutex_unlock(&mdsc->mutex);
1972
1973 mutex_lock(&session->s_mutex);
1974
1975 dout("handle_session mds%d %s %p state %s seq %llu\n",
1976 mds, ceph_session_op_name(op), session,
1977 session_state_name(session->s_state), seq);
1978
1979 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
1980 session->s_state = CEPH_MDS_SESSION_OPEN;
1981 pr_info("mds%d came back\n", session->s_mds);
1982 }
1983
1984 switch (op) {
1985 case CEPH_SESSION_OPEN:
1986 session->s_state = CEPH_MDS_SESSION_OPEN;
1987 renewed_caps(mdsc, session, 0);
1988 wake = 1;
1989 if (mdsc->stopping)
1990 __close_session(mdsc, session);
1991 break;
1992
1993 case CEPH_SESSION_RENEWCAPS:
1994 if (session->s_renew_seq == seq)
1995 renewed_caps(mdsc, session, 1);
1996 break;
1997
1998 case CEPH_SESSION_CLOSE:
1999 remove_session_caps(session);
2000 wake = 1; /* for good measure */
2001 complete(&mdsc->session_close_waiters);
2002 kick_requests(mdsc, mds, 0); /* cur only */
2003 break;
2004
2005 case CEPH_SESSION_STALE:
2006 pr_info("mds%d caps went stale, renewing\n",
2007 session->s_mds);
2008 spin_lock(&session->s_cap_lock);
2009 session->s_cap_gen++;
2010 session->s_cap_ttl = 0;
2011 spin_unlock(&session->s_cap_lock);
2012 send_renew_caps(mdsc, session);
2013 break;
2014
2015 case CEPH_SESSION_RECALL_STATE:
2016 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2017 break;
2018
2019 default:
2020 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2021 WARN_ON(1);
2022 }
2023
2024 mutex_unlock(&session->s_mutex);
2025 if (wake) {
2026 mutex_lock(&mdsc->mutex);
2027 __wake_requests(mdsc, &session->s_waiting);
2028 mutex_unlock(&mdsc->mutex);
2029 }
2030 return;
2031
2032bad:
2033 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2034 (int)msg->front.iov_len);
2035 ceph_msg_dump(msg);
2036 return;
2037}
2038
2039
2040/*
2041 * called under session->mutex.
2042 */
2043static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2044 struct ceph_mds_session *session)
2045{
2046 struct ceph_mds_request *req, *nreq;
2047 int err;
2048
2049 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2050
2051 mutex_lock(&mdsc->mutex);
2052 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2053 err = __prepare_send_request(mdsc, req, session->s_mds);
2054 if (!err) {
2055 ceph_msg_get(req->r_request);
2056 ceph_con_send(&session->s_con, req->r_request);
2057 }
2058 }
2059 mutex_unlock(&mdsc->mutex);
2060}
2061
2062/*
2063 * Encode information about a cap for a reconnect with the MDS.
2064 */
2065static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2066 void *arg)
2067{
2068 struct ceph_mds_cap_reconnect rec;
2069 struct ceph_inode_info *ci;
2070 struct ceph_pagelist *pagelist = arg;
2071 char *path;
2072 int pathlen, err;
2073 u64 pathbase;
2074 struct dentry *dentry;
2075
2076 ci = cap->ci;
2077
2078 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2079 inode, ceph_vinop(inode), cap, cap->cap_id,
2080 ceph_cap_string(cap->issued));
2081 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2082 if (err)
2083 return err;
2084
2085 dentry = d_find_alias(inode);
2086 if (dentry) {
2087 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2088 if (IS_ERR(path)) {
2089 err = PTR_ERR(path);
 2090			path = NULL;	/* don't kfree() an ERR_PTR below */
 2091			goto out;
2091 }
2092 } else {
2093 path = NULL;
2094 pathlen = 0;
2095 }
2096 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2097 if (err)
2098 goto out;
2099
2100 spin_lock(&inode->i_lock);
2101 cap->seq = 0; /* reset cap seq */
2102 cap->issue_seq = 0; /* and issue_seq */
2103 rec.cap_id = cpu_to_le64(cap->cap_id);
2104 rec.pathbase = cpu_to_le64(pathbase);
2105 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2106 rec.issued = cpu_to_le32(cap->issued);
2107 rec.size = cpu_to_le64(inode->i_size);
2108 ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
2109 ceph_encode_timespec(&rec.atime, &inode->i_atime);
2110 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2111 spin_unlock(&inode->i_lock);
2112
2113 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
2114
2115out:
2116 kfree(path);
2117 dput(dentry);
2118 return err;
2119}
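/*
 * The per-cap reconnect record encoded above, in wire order:
 *
 *	u64  ino
 *	u32  pathlen, char path[pathlen]
 *	struct ceph_mds_cap_reconnect rec  (cap_id, pathbase, wanted,
 *					    issued, size, mtime, atime,
 *					    snaprealm, as filled above)
 */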
2120
2121
2122/*
2123 * If an MDS fails and recovers, clients need to reconnect in order to
2124 * reestablish shared state. This includes all caps issued through
2125 * this session _and_ the snap_realm hierarchy. Because it's not
2126 * clear which snap realms the mds cares about, we send everything we
 2127 * know about; that ensures we'll then get any new info the
2128 * recovering MDS might have.
2129 *
2130 * This is a relatively heavyweight operation, but it's rare.
2131 *
2132 * called with mdsc->mutex held.
2133 */
2134static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2135{
2136 struct ceph_mds_session *session = NULL;
2137 struct ceph_msg *reply;
2138 struct rb_node *p;
2139 int err;
2140 struct ceph_pagelist *pagelist;
2141
2142 pr_info("reconnect to recovering mds%d\n", mds);
2143
2144 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2145 if (!pagelist)
2146 goto fail_nopagelist;
2147 ceph_pagelist_init(pagelist);
2148
2149 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
2150 if (IS_ERR(reply)) {
2151 err = PTR_ERR(reply);
2152 goto fail_nomsg;
2153 }
2154
2155 /* find session */
2156 session = __ceph_lookup_mds_session(mdsc, mds);
2157 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2158
2159 if (session) {
2160 mutex_lock(&session->s_mutex);
2161
2162 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2163 session->s_seq = 0;
2164
2165 ceph_con_open(&session->s_con,
2166 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2167
2168 /* replay unsafe requests */
2169 replay_unsafe_requests(mdsc, session);
2170 } else {
2171 dout("no session for mds%d, will send short reconnect\n",
2172 mds);
2173 }
2174
2175 down_read(&mdsc->snap_rwsem);
2176
2177 if (!session)
2178 goto send;
2179 dout("session %p state %s\n", session,
2180 session_state_name(session->s_state));
2181
2182 /* traverse this session's caps */
2183 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2184 if (err)
2185 goto fail;
2186 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2187 if (err < 0)
2188 goto out;
2189
2190 /*
2191 * snaprealms. we provide mds with the ino, seq (version), and
2192 * parent for all of our realms. If the mds has any newer info,
2193 * it will tell us.
2194 */
2195 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2196 struct ceph_snap_realm *realm =
2197 rb_entry(p, struct ceph_snap_realm, node);
2198 struct ceph_mds_snaprealm_reconnect sr_rec;
2199
2200 dout(" adding snap realm %llx seq %lld parent %llx\n",
2201 realm->ino, realm->seq, realm->parent_ino);
2202 sr_rec.ino = cpu_to_le64(realm->ino);
2203 sr_rec.seq = cpu_to_le64(realm->seq);
2204 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2205 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2206 if (err)
2207 goto fail;
2208 }
2209
2210send:
2211 reply->pagelist = pagelist;
2212 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2213 reply->nr_pages = calc_pages_for(0, pagelist->length);
 2214	if (session) {
 2215		ceph_con_send(&session->s_con, reply);
 2216		session->s_state = CEPH_MDS_SESSION_OPEN;
 2217		__wake_requests(mdsc, &session->s_waiting);
 2218	} else {
 2219		ceph_msg_put(reply);	/* no session to send to; drop our ref */
 2220	}
2220
2221out:
2222 up_read(&mdsc->snap_rwsem);
2223 if (session) {
2224 mutex_unlock(&session->s_mutex);
2225 ceph_put_mds_session(session);
2226 }
2227 mutex_lock(&mdsc->mutex);
2228 return;
2229
2230fail:
2231 ceph_msg_put(reply);
2232fail_nomsg:
2233 ceph_pagelist_release(pagelist);
2234 kfree(pagelist);
2235fail_nopagelist:
2236 pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
2237 goto out;
2238}
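/*
 * The overall reconnect payload built above, as a sketch (the
 * sessionless "short reconnect" carries none of this):
 *
 *	u32 nr_caps
 *	nr_caps x per-cap record (see encode_caps_cb above)
 *	one struct ceph_mds_snaprealm_reconnect per known snap realm
 */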
2239
2240
2241/*
2242 * compare old and new mdsmaps, kicking requests
2243 * and closing out old connections as necessary
2244 *
2245 * called under mdsc->mutex.
2246 */
2247static void check_new_map(struct ceph_mds_client *mdsc,
2248 struct ceph_mdsmap *newmap,
2249 struct ceph_mdsmap *oldmap)
2250{
2251 int i;
2252 int oldstate, newstate;
2253 struct ceph_mds_session *s;
2254
2255 dout("check_new_map new %u old %u\n",
2256 newmap->m_epoch, oldmap->m_epoch);
2257
2258 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2259 if (mdsc->sessions[i] == NULL)
2260 continue;
2261 s = mdsc->sessions[i];
2262 oldstate = ceph_mdsmap_get_state(oldmap, i);
2263 newstate = ceph_mdsmap_get_state(newmap, i);
2264
2265 dout("check_new_map mds%d state %s -> %s (session %s)\n",
2266 i, ceph_mds_state_name(oldstate),
2267 ceph_mds_state_name(newstate),
2268 session_state_name(s->s_state));
2269
2270 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2271 ceph_mdsmap_get_addr(newmap, i),
2272 sizeof(struct ceph_entity_addr))) {
2273 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2274 /* the session never opened, just close it
2275 * out now */
2276 __wake_requests(mdsc, &s->s_waiting);
2277 __unregister_session(mdsc, s);
2278 } else {
2279 /* just close it */
2280 mutex_unlock(&mdsc->mutex);
2281 mutex_lock(&s->s_mutex);
2282 mutex_lock(&mdsc->mutex);
2283 ceph_con_close(&s->s_con);
2284 mutex_unlock(&s->s_mutex);
2285 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2286 }
2287
2288 /* kick any requests waiting on the recovering mds */
2289 kick_requests(mdsc, i, 1);
2290 } else if (oldstate == newstate) {
2291 continue; /* nothing new with this mds */
2292 }
2293
2294 /*
2295 * send reconnect?
2296 */
2297 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2298 newstate >= CEPH_MDS_STATE_RECONNECT)
2299 send_mds_reconnect(mdsc, i);
2300
2301 /*
2302 * kick requests on any mds that has gone active.
2303 *
2304 * kick requests on cur or forwarder: we may have sent
2305 * the request to mds1, mds1 told us it forwarded it
2306 * to mds2, but then we learn mds1 failed and can't be
2307 * sure it successfully forwarded our request before
2308 * it died.
2309 */
2310 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2311 newstate >= CEPH_MDS_STATE_ACTIVE) {
2312 pr_info("mds%d reconnect completed\n", s->s_mds);
2313 kick_requests(mdsc, i, 1);
2314 ceph_kick_flushing_caps(mdsc, s);
2315 wake_up_session_caps(s, 1);
2316 }
2317 }
2318}
2319
2320
2321
2322/*
2323 * leases
2324 */
2325
2326/*
2327 * caller must hold session s_mutex, dentry->d_lock
2328 */
2329void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2330{
2331 struct ceph_dentry_info *di = ceph_dentry(dentry);
2332
2333 ceph_put_mds_session(di->lease_session);
2334 di->lease_session = NULL;
2335}
2336
2337static void handle_lease(struct ceph_mds_client *mdsc,
2338 struct ceph_mds_session *session,
2339 struct ceph_msg *msg)
2340{
2341 struct super_block *sb = mdsc->client->sb;
2342 struct inode *inode;
2343 struct ceph_inode_info *ci;
2344 struct dentry *parent, *dentry;
2345 struct ceph_dentry_info *di;
2346 int mds = session->s_mds;
2347 struct ceph_mds_lease *h = msg->front.iov_base;
2348 struct ceph_vino vino;
2349 int mask;
2350 struct qstr dname;
2351 int release = 0;
2352
2353 dout("handle_lease from mds%d\n", mds);
2354
2355 /* decode */
2356 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2357 goto bad;
2358 vino.ino = le64_to_cpu(h->ino);
2359 vino.snap = CEPH_NOSNAP;
2360 mask = le16_to_cpu(h->mask);
2361 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2362 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2363 if (dname.len != get_unaligned_le32(h+1))
2364 goto bad;
2365
2366 mutex_lock(&session->s_mutex);
2367 session->s_seq++;
2368
2369 /* lookup inode */
2370 inode = ceph_find_inode(sb, vino);
2371 dout("handle_lease '%s', mask %d, ino %llx %p\n",
2372 ceph_lease_op_name(h->action), mask, vino.ino, inode);
2373 if (inode == NULL) {
2374 dout("handle_lease no inode %llx\n", vino.ino);
2375 goto release;
2376 }
2377 ci = ceph_inode(inode);
2378
2379 /* dentry */
2380 parent = d_find_alias(inode);
2381 if (!parent) {
2382 dout("no parent dentry on inode %p\n", inode);
2383 WARN_ON(1);
2384 goto release; /* hrm... */
2385 }
2386 dname.hash = full_name_hash(dname.name, dname.len);
2387 dentry = d_lookup(parent, &dname);
2388 dput(parent);
2389 if (!dentry)
2390 goto release;
2391
2392 spin_lock(&dentry->d_lock);
2393 di = ceph_dentry(dentry);
2394 switch (h->action) {
2395 case CEPH_MDS_LEASE_REVOKE:
2396 if (di && di->lease_session == session) {
2397 h->seq = cpu_to_le32(di->lease_seq);
2398 __ceph_mdsc_drop_dentry_lease(dentry);
2399 }
2400 release = 1;
2401 break;
2402
2403 case CEPH_MDS_LEASE_RENEW:
2404 if (di && di->lease_session == session &&
2405 di->lease_gen == session->s_cap_gen &&
2406 di->lease_renew_from &&
2407 di->lease_renew_after == 0) {
2408 unsigned long duration =
2409 le32_to_cpu(h->duration_ms) * HZ / 1000;
2410
2411 di->lease_seq = le32_to_cpu(h->seq);
2412 dentry->d_time = di->lease_renew_from + duration;
2413 di->lease_renew_after = di->lease_renew_from +
2414 (duration >> 1);
2415 di->lease_renew_from = 0;
2416 }
2417 break;
2418 }
2419 spin_unlock(&dentry->d_lock);
2420 dput(dentry);
2421
2422 if (!release)
2423 goto out;
2424
2425release:
2426 /* let's just reuse the same message */
2427 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2428 ceph_msg_get(msg);
2429 ceph_con_send(&session->s_con, msg);
2430
2431out:
2432 iput(inode);
2433 mutex_unlock(&session->s_mutex);
2434 return;
2435
2436bad:
2437 pr_err("corrupt lease message\n");
2438 ceph_msg_dump(msg);
2439}
2440
2441void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2442 struct inode *inode,
2443 struct dentry *dentry, char action,
2444 u32 seq)
2445{
2446 struct ceph_msg *msg;
2447 struct ceph_mds_lease *lease;
2448 int len = sizeof(*lease) + sizeof(u32);
2449 int dnamelen = 0;
2450
2451 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2452 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2453 dnamelen = dentry->d_name.len;
2454 len += dnamelen;
2455
2456 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
2457 if (IS_ERR(msg))
2458 return;
2459 lease = msg->front.iov_base;
2460 lease->action = action;
2461 lease->mask = cpu_to_le16(CEPH_LOCK_DN);
2462 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2463 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2464 lease->seq = cpu_to_le32(seq);
2465 put_unaligned_le32(dnamelen, lease + 1);
2466 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2467
2468 /*
2469 * if this is a preemptive lease RELEASE, no need to
2470 * flush request stream, since the actual request will
2471 * soon follow.
2472 */
2473 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2474
2475 ceph_con_send(&session->s_con, msg);
2476}
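/*
 * Lease message front as filled in above:
 *
 *	struct ceph_mds_lease lease   (action, mask, ino, first,
 *				       last, seq)
 *	u32  dnamelen
 *	char dname[dnamelen]
 */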
2477
2478/*
2479 * Preemptively release a lease we expect to invalidate anyway.
 2480 * Pass @inode and @dentry always; @mask must be CEPH_LOCK_DN.
2481 */
2482void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2483 struct dentry *dentry, int mask)
2484{
2485 struct ceph_dentry_info *di;
2486 struct ceph_mds_session *session;
2487 u32 seq;
2488
2489 BUG_ON(inode == NULL);
2490 BUG_ON(dentry == NULL);
2491 BUG_ON(mask != CEPH_LOCK_DN);
2492
2493 /* is dentry lease valid? */
2494 spin_lock(&dentry->d_lock);
2495 di = ceph_dentry(dentry);
2496 if (!di || !di->lease_session ||
2497 di->lease_session->s_mds < 0 ||
2498 di->lease_gen != di->lease_session->s_cap_gen ||
2499 !time_before(jiffies, dentry->d_time)) {
2500 dout("lease_release inode %p dentry %p -- "
2501 "no lease on %d\n",
2502 inode, dentry, mask);
2503 spin_unlock(&dentry->d_lock);
2504 return;
2505 }
2506
2507 /* we do have a lease on this dentry; note mds and seq */
2508 session = ceph_get_mds_session(di->lease_session);
2509 seq = di->lease_seq;
2510 __ceph_mdsc_drop_dentry_lease(dentry);
2511 spin_unlock(&dentry->d_lock);
2512
2513 dout("lease_release inode %p dentry %p mask %d to mds%d\n",
2514 inode, dentry, mask, session->s_mds);
2515 ceph_mdsc_lease_send_msg(session, inode, dentry,
2516 CEPH_MDS_LEASE_RELEASE, seq);
2517 ceph_put_mds_session(session);
2518}
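/*
 * A minimal sketch of the intended call site (hypothetical: a dentry
 * we are about to unlink or rename away anyway):
 *
 *	ceph_mdsc_lease_release(mdsc, dir, dentry, CEPH_LOCK_DN);
 */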
2519
2520/*
2521 * drop all leases (and dentry refs) in preparation for umount
2522 */
2523static void drop_leases(struct ceph_mds_client *mdsc)
2524{
2525 int i;
2526
2527 dout("drop_leases\n");
2528 mutex_lock(&mdsc->mutex);
2529 for (i = 0; i < mdsc->max_sessions; i++) {
2530 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2531 if (!s)
2532 continue;
2533 mutex_unlock(&mdsc->mutex);
2534 mutex_lock(&s->s_mutex);
2535 mutex_unlock(&s->s_mutex);
2536 ceph_put_mds_session(s);
2537 mutex_lock(&mdsc->mutex);
2538 }
2539 mutex_unlock(&mdsc->mutex);
2540}
2541
2542
2543
2544/*
2545 * delayed work -- periodically trim expired leases, renew caps with mds
2546 */
2547static void schedule_delayed(struct ceph_mds_client *mdsc)
2548{
2549 int delay = 5;
2550 unsigned hz = round_jiffies_relative(HZ * delay);
2551 schedule_delayed_work(&mdsc->delayed_work, hz);
2552}
2553
2554static void delayed_work(struct work_struct *work)
2555{
2556 int i;
2557 struct ceph_mds_client *mdsc =
2558 container_of(work, struct ceph_mds_client, delayed_work.work);
2559 int renew_interval;
2560 int renew_caps;
2561
2562 dout("mdsc delayed_work\n");
2563 ceph_check_delayed_caps(mdsc);
2564
2565 mutex_lock(&mdsc->mutex);
2566 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2567 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2568 mdsc->last_renew_caps);
2569 if (renew_caps)
2570 mdsc->last_renew_caps = jiffies;
2571
2572 for (i = 0; i < mdsc->max_sessions; i++) {
2573 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2574 if (s == NULL)
2575 continue;
2576 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2577 dout("resending session close request for mds%d\n",
2578 s->s_mds);
2579 request_close_session(mdsc, s);
2580 ceph_put_mds_session(s);
2581 continue;
2582 }
2583 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2584 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2585 s->s_state = CEPH_MDS_SESSION_HUNG;
2586 pr_info("mds%d hung\n", s->s_mds);
2587 }
2588 }
2589 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2590 /* this mds is failed or recovering, just wait */
2591 ceph_put_mds_session(s);
2592 continue;
2593 }
2594 mutex_unlock(&mdsc->mutex);
2595
2596 mutex_lock(&s->s_mutex);
2597 if (renew_caps)
2598 send_renew_caps(mdsc, s);
2599 else
2600 ceph_con_keepalive(&s->s_con);
2601 add_cap_releases(mdsc, s, -1);
2602 send_cap_releases(mdsc, s);
2603 mutex_unlock(&s->s_mutex);
2604 ceph_put_mds_session(s);
2605
2606 mutex_lock(&mdsc->mutex);
2607 }
2608 mutex_unlock(&mdsc->mutex);
2609
2610 schedule_delayed(mdsc);
2611}
2612
2613
2614int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2615{
2616 mdsc->client = client;
2617 mutex_init(&mdsc->mutex);
2618 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2619 init_completion(&mdsc->safe_umount_waiters);
2620 init_completion(&mdsc->session_close_waiters);
2621 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2622 mdsc->sessions = NULL;
2623 mdsc->max_sessions = 0;
2624 mdsc->stopping = 0;
2625 init_rwsem(&mdsc->snap_rwsem);
2626 mdsc->snap_realms = RB_ROOT;
2627 INIT_LIST_HEAD(&mdsc->snap_empty);
2628 spin_lock_init(&mdsc->snap_empty_lock);
2629 mdsc->last_tid = 0;
2630 mdsc->request_tree = RB_ROOT;
2631 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
2632 mdsc->last_renew_caps = jiffies;
2633 INIT_LIST_HEAD(&mdsc->cap_delay_list);
2634 spin_lock_init(&mdsc->cap_delay_lock);
2635 INIT_LIST_HEAD(&mdsc->snap_flush_list);
2636 spin_lock_init(&mdsc->snap_flush_lock);
2637 mdsc->cap_flush_seq = 0;
2638 INIT_LIST_HEAD(&mdsc->cap_dirty);
2639 mdsc->num_cap_flushing = 0;
2640 spin_lock_init(&mdsc->cap_dirty_lock);
2641 init_waitqueue_head(&mdsc->cap_flushing_wq);
2642 spin_lock_init(&mdsc->dentry_lru_lock);
2643 INIT_LIST_HEAD(&mdsc->dentry_lru);
2644 return 0;
2645}
2646
2647/*
2648 * Wait for safe replies on open mds requests. If we time out, drop
2649 * all requests from the tree to avoid dangling dentry refs.
2650 */
2651static void wait_requests(struct ceph_mds_client *mdsc)
2652{
2653 struct ceph_mds_request *req;
2654 struct ceph_client *client = mdsc->client;
2655
2656 mutex_lock(&mdsc->mutex);
2657 if (__get_oldest_req(mdsc)) {
2658 mutex_unlock(&mdsc->mutex);
2659
2660 dout("wait_requests waiting for requests\n");
2661 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2662 client->mount_args->mount_timeout * HZ);
2663
2664 /* tear down remaining requests */
2665 mutex_lock(&mdsc->mutex);
2666 while ((req = __get_oldest_req(mdsc))) {
2667 dout("wait_requests timed out on tid %llu\n",
2668 req->r_tid);
2669 __unregister_request(mdsc, req);
2670 }
2671 }
2672 mutex_unlock(&mdsc->mutex);
2673 dout("wait_requests done\n");
2674}
2675
2676/*
2677 * called before mount is ro, and before dentries are torn down.
2678 * (hmm, does this still race with new lookups?)
2679 */
2680void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2681{
2682 dout("pre_umount\n");
2683 mdsc->stopping = 1;
2684
2685 drop_leases(mdsc);
2686 ceph_flush_dirty_caps(mdsc);
2687 wait_requests(mdsc);
2688}
2689
2690/*
2691 * wait for all write mds requests to flush.
2692 */
2693static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2694{
2695 struct ceph_mds_request *req = NULL, *nextreq;
2696 struct rb_node *n;
2697
2698 mutex_lock(&mdsc->mutex);
2699 dout("wait_unsafe_requests want %lld\n", want_tid);
2700restart:
2701 req = __get_oldest_req(mdsc);
2702 while (req && req->r_tid <= want_tid) {
2703 /* find next request */
2704 n = rb_next(&req->r_node);
2705 if (n)
2706 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
2707 else
2708 nextreq = NULL;
2709 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2710 /* write op */
2711 ceph_mdsc_get_request(req);
2712 if (nextreq)
2713 ceph_mdsc_get_request(nextreq);
2714 mutex_unlock(&mdsc->mutex);
2715 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2716 req->r_tid, want_tid);
2717 wait_for_completion(&req->r_safe_completion);
2718 mutex_lock(&mdsc->mutex);
2719 ceph_mdsc_put_request(req);
2720 if (!nextreq)
2721 break; /* next dne before, so we're done! */
2722 if (RB_EMPTY_NODE(&nextreq->r_node)) {
2723 /* next request was removed from tree */
2724 ceph_mdsc_put_request(nextreq);
2725 goto restart;
2726 }
2727 ceph_mdsc_put_request(nextreq); /* won't go away */
2728 }
2729 req = nextreq;
2730 }
2731 mutex_unlock(&mdsc->mutex);
2732 dout("wait_unsafe_requests done\n");
2733}
2734
2735void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2736{
2737 u64 want_tid, want_flush;
2738
2739 dout("sync\n");
2740 mutex_lock(&mdsc->mutex);
2741 want_tid = mdsc->last_tid;
2742 want_flush = mdsc->cap_flush_seq;
2743 mutex_unlock(&mdsc->mutex);
2744 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
2745
2746 ceph_flush_dirty_caps(mdsc);
2747
2748 wait_unsafe_requests(mdsc, want_tid);
2749 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2750}
2751
2752
2753/*
2754 * called after sb is ro.
2755 */
2756void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2757{
2758 struct ceph_mds_session *session;
2759 int i;
2760 int n;
2761 struct ceph_client *client = mdsc->client;
2762 unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
2763
2764 dout("close_sessions\n");
2765
2766 mutex_lock(&mdsc->mutex);
2767
2768 /* close sessions */
2769 started = jiffies;
2770 while (time_before(jiffies, started + timeout)) {
2771 dout("closing sessions\n");
2772 n = 0;
2773 for (i = 0; i < mdsc->max_sessions; i++) {
2774 session = __ceph_lookup_mds_session(mdsc, i);
2775 if (!session)
2776 continue;
2777 mutex_unlock(&mdsc->mutex);
2778 mutex_lock(&session->s_mutex);
2779 __close_session(mdsc, session);
2780 mutex_unlock(&session->s_mutex);
2781 ceph_put_mds_session(session);
2782 mutex_lock(&mdsc->mutex);
2783 n++;
2784 }
2785 if (n == 0)
2786 break;
2787
2788 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2789 break;
2790
2791 dout("waiting for sessions to close\n");
2792 mutex_unlock(&mdsc->mutex);
2793 wait_for_completion_timeout(&mdsc->session_close_waiters,
2794 timeout);
2795 mutex_lock(&mdsc->mutex);
2796 }
2797
2798 /* tear down remaining sessions */
2799 for (i = 0; i < mdsc->max_sessions; i++) {
2800 if (mdsc->sessions[i]) {
2801 session = get_session(mdsc->sessions[i]);
2802 __unregister_session(mdsc, session);
2803 mutex_unlock(&mdsc->mutex);
2804 mutex_lock(&session->s_mutex);
2805 remove_session_caps(session);
2806 mutex_unlock(&session->s_mutex);
2807 ceph_put_mds_session(session);
2808 mutex_lock(&mdsc->mutex);
2809 }
2810 }
2811
2812 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2813
2814 mutex_unlock(&mdsc->mutex);
2815
2816 ceph_cleanup_empty_realms(mdsc);
2817
2818 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2819
2820 dout("stopped\n");
2821}
2822
2823void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2824{
2825 dout("stop\n");
2826 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2827 if (mdsc->mdsmap)
2828 ceph_mdsmap_destroy(mdsc->mdsmap);
2829 kfree(mdsc->sessions);
2830}
2831
2832
2833/*
2834 * handle mds map update.
2835 */
2836void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2837{
2838 u32 epoch;
2839 u32 maplen;
2840 void *p = msg->front.iov_base;
2841 void *end = p + msg->front.iov_len;
2842 struct ceph_mdsmap *newmap, *oldmap;
2843 struct ceph_fsid fsid;
2844 int err = -EINVAL;
2845
2846 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
2847 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2848 if (ceph_check_fsid(mdsc->client, &fsid) < 0)
2849 return;
2850 epoch = ceph_decode_32(&p);
2851 maplen = ceph_decode_32(&p);
2852 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
2853
2854 /* do we need it? */
2855 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
2856 mutex_lock(&mdsc->mutex);
2857 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
2858 dout("handle_map epoch %u <= our %u\n",
2859 epoch, mdsc->mdsmap->m_epoch);
2860 mutex_unlock(&mdsc->mutex);
2861 return;
2862 }
2863
2864 newmap = ceph_mdsmap_decode(&p, end);
2865 if (IS_ERR(newmap)) {
2866 err = PTR_ERR(newmap);
2867 goto bad_unlock;
2868 }
2869
2870 /* swap into place */
2871 if (mdsc->mdsmap) {
2872 oldmap = mdsc->mdsmap;
2873 mdsc->mdsmap = newmap;
2874 check_new_map(mdsc, newmap, oldmap);
2875 ceph_mdsmap_destroy(oldmap);
2876 } else {
2877 mdsc->mdsmap = newmap; /* first mds map */
2878 }
2879 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
2880
2881 __wake_requests(mdsc, &mdsc->waiting_for_map);
2882
2883 mutex_unlock(&mdsc->mutex);
2884 schedule_delayed(mdsc);
2885 return;
2886
2887bad_unlock:
2888 mutex_unlock(&mdsc->mutex);
2889bad:
2890 pr_err("error decoding mdsmap %d\n", err);
2891 return;
2892}
2893
2894static struct ceph_connection *con_get(struct ceph_connection *con)
2895{
2896 struct ceph_mds_session *s = con->private;
2897
2898 if (get_session(s)) {
2899 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
2900 return con;
2901 }
2902 dout("mdsc con_get %p FAIL\n", s);
2903 return NULL;
2904}
2905
2906static void con_put(struct ceph_connection *con)
2907{
2908 struct ceph_mds_session *s = con->private;
2909
2910 ceph_put_mds_session(s);
2911 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
2912}
2913
2914/*
2915 * if the client is unresponsive for long enough, the mds will kill
2916 * the session entirely.
2917 */
2918static void peer_reset(struct ceph_connection *con)
2919{
2920 struct ceph_mds_session *s = con->private;
2921
2922 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
2923 s->s_mds);
2924}
2925
2926static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2927{
2928 struct ceph_mds_session *s = con->private;
2929 struct ceph_mds_client *mdsc = s->s_mdsc;
2930 int type = le16_to_cpu(msg->hdr.type);
2931
2932 mutex_lock(&mdsc->mutex);
2933 if (__verify_registered_session(mdsc, s) < 0) {
2934 mutex_unlock(&mdsc->mutex);
2935 goto out;
2936 }
2937 mutex_unlock(&mdsc->mutex);
2938
2939 switch (type) {
2940 case CEPH_MSG_MDS_MAP:
2941 ceph_mdsc_handle_map(mdsc, msg);
2942 break;
2943 case CEPH_MSG_CLIENT_SESSION:
2944 handle_session(s, msg);
2945 break;
2946 case CEPH_MSG_CLIENT_REPLY:
2947 handle_reply(s, msg);
2948 break;
2949 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2950 handle_forward(mdsc, s, msg);
2951 break;
2952 case CEPH_MSG_CLIENT_CAPS:
2953 ceph_handle_caps(s, msg);
2954 break;
2955 case CEPH_MSG_CLIENT_SNAP:
2956 ceph_handle_snap(mdsc, s, msg);
2957 break;
2958 case CEPH_MSG_CLIENT_LEASE:
2959 handle_lease(mdsc, s, msg);
2960 break;
2961
2962 default:
2963 pr_err("received unknown message type %d %s\n", type,
2964 ceph_msg_type_name(type));
2965 }
2966out:
2967 ceph_msg_put(msg);
2968}
2969
2970/*
2971 * authentication
2972 */
2973static int get_authorizer(struct ceph_connection *con,
2974 void **buf, int *len, int *proto,
2975 void **reply_buf, int *reply_len, int force_new)
2976{
2977 struct ceph_mds_session *s = con->private;
2978 struct ceph_mds_client *mdsc = s->s_mdsc;
2979 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2980 int ret = 0;
2981
2982 if (force_new && s->s_authorizer) {
2983 ac->ops->destroy_authorizer(ac, s->s_authorizer);
2984 s->s_authorizer = NULL;
2985 }
2986 if (s->s_authorizer == NULL) {
2987 if (ac->ops->create_authorizer) {
2988 ret = ac->ops->create_authorizer(
2989 ac, CEPH_ENTITY_TYPE_MDS,
2990 &s->s_authorizer,
2991 &s->s_authorizer_buf,
2992 &s->s_authorizer_buf_len,
2993 &s->s_authorizer_reply_buf,
2994 &s->s_authorizer_reply_buf_len);
2995 if (ret)
2996 return ret;
2997 }
2998 }
2999
3000 *proto = ac->protocol;
3001 *buf = s->s_authorizer_buf;
3002 *len = s->s_authorizer_buf_len;
3003 *reply_buf = s->s_authorizer_reply_buf;
3004 *reply_len = s->s_authorizer_reply_buf_len;
3005 return 0;
3006}
3007
3008
3009static int verify_authorizer_reply(struct ceph_connection *con, int len)
3010{
3011 struct ceph_mds_session *s = con->private;
3012 struct ceph_mds_client *mdsc = s->s_mdsc;
3013 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3014
3015 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3016}
3017
3018static int invalidate_authorizer(struct ceph_connection *con)
3019{
3020 struct ceph_mds_session *s = con->private;
3021 struct ceph_mds_client *mdsc = s->s_mdsc;
3022 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3023
3024 if (ac->ops->invalidate_authorizer)
3025 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3026
3027 return ceph_monc_validate_auth(&mdsc->client->monc);
3028}
3029
3030static const struct ceph_connection_operations mds_con_ops = {
3031 .get = con_get,
3032 .put = con_put,
3033 .dispatch = dispatch,
3034 .get_authorizer = get_authorizer,
3035 .verify_authorizer_reply = verify_authorizer_reply,
3036 .invalidate_authorizer = invalidate_authorizer,
3037 .peer_reset = peer_reset,
3038};
3039
3040
3041
3042
3043/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..961cc6f65878
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/rbtree.h>
9#include <linux/spinlock.h>
10
11#include "types.h"
12#include "messenger.h"
13#include "mdsmap.h"
14
15/*
16 * Some lock dependencies:
17 *
18 * session->s_mutex
19 * mdsc->mutex
20 *
21 * mdsc->snap_rwsem
22 *
23 * inode->i_lock
24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock
26 *
27 */
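/*
 * For example, code that needs both of the top two must take them in
 * this order (a sketch, not a real code path):
 *
 *	mutex_lock(&session->s_mutex);
 *	mutex_lock(&mdsc->mutex);
 *	...
 *	mutex_unlock(&mdsc->mutex);
 *	mutex_unlock(&session->s_mutex);
 */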
28
29struct ceph_client;
30struct ceph_cap;
31
32/*
33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload.
35 */
36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in;
38 u32 symlink_len;
39 char *symlink;
40 u32 xattr_len;
41 char *xattr_data;
42};
43
44/*
45 * parsed info about an mds reply, including information about the
46 * target inode and/or its parent directory and dentry, and directory
47 * contents (for readdir results).
48 */
49struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head;
51
52 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname;
55 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease;
57
58 struct ceph_mds_reply_dirfrag *dir_dir;
59 int dir_nr;
60 char **dir_dname;
61 u32 *dir_dname_len;
62 struct ceph_mds_reply_lease **dir_dlease;
63 struct ceph_mds_reply_info_in *dir_in;
64 u8 dir_complete, dir_end;
65
66 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */
68 void *snapblob;
69 int snapblob_len;
70};
71
72
73/*
74 * cap releases are batched and sent to the MDS en masse.
75 */
76#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
77 sizeof(struct ceph_mds_cap_release)) / \
78 sizeof(struct ceph_mds_cap_item))
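/*
 * Example of the arithmetic, assuming (hypothetically) 4 KB pages, an
 * 8-byte ceph_mds_cap_release header and 16-byte cap items:
 *
 *	(4096 - 8) / 16 = 255 cap releases per message (integer division)
 */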
79
80
81/*
82 * state associated with each MDS<->client session
83 */
84enum {
85 CEPH_MDS_SESSION_NEW = 1,
86 CEPH_MDS_SESSION_OPENING = 2,
87 CEPH_MDS_SESSION_OPEN = 3,
88 CEPH_MDS_SESSION_HUNG = 4,
89 CEPH_MDS_SESSION_CLOSING = 5,
90 CEPH_MDS_SESSION_RESTARTING = 6,
91 CEPH_MDS_SESSION_RECONNECTING = 7,
92};
93
94struct ceph_mds_session {
95 struct ceph_mds_client *s_mdsc;
96 int s_mds;
97 int s_state;
98 unsigned long s_ttl; /* time until mds kills us */
99 u64 s_seq; /* incoming msg seq # */
100 struct mutex s_mutex; /* serialize session messages */
101
102 struct ceph_connection s_con;
103
104 struct ceph_authorizer *s_authorizer;
105 void *s_authorizer_buf, *s_authorizer_reply_buf;
106 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
107
108 /* protected by s_cap_lock */
109 spinlock_t s_cap_lock;
110 u32 s_cap_gen; /* inc each time we get mds stale msg */
111 unsigned long s_cap_ttl; /* when session caps expire */
112 struct list_head s_caps; /* all caps issued by this session */
113 int s_nr_caps, s_trim_caps;
114 int s_num_cap_releases;
115 struct list_head s_cap_releases; /* waiting cap_release messages */
116 struct list_head s_cap_releases_done; /* ready to send */
117 struct ceph_cap *s_cap_iterator;
118
119 /* protected by mutex */
120 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
121 struct list_head s_cap_snaps_flushing;
122 unsigned long s_renew_requested; /* last time we sent a renew req */
123 u64 s_renew_seq;
124
125 atomic_t s_ref;
126 struct list_head s_waiting; /* waiting requests */
127 struct list_head s_unsafe; /* unsafe requests */
128};
129
130/*
131 * modes of choosing which MDS to send a request to
132 */
133enum {
134 USE_ANY_MDS,
135 USE_RANDOM_MDS,
136 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
137};
138
139struct ceph_mds_request;
140struct ceph_mds_client;
141
142/*
143 * request completion callback
144 */
145typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
146 struct ceph_mds_request *req);
147
148/*
149 * an in-flight mds request
150 */
151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */
153 struct rb_node r_node;
154
155 int r_op; /* mds op code */
156 int r_mds;
157
158 /* operation on what? */
159 struct inode *r_inode; /* arg1 */
160 struct dentry *r_dentry; /* arg1 */
161 struct dentry *r_old_dentry; /* arg2: rename from or link from */
162 char *r_path1, *r_path2;
163 struct ceph_vino r_ino1, r_ino2;
164
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */
167
168 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */
170
171 /* for choosing which mds to send this request to */
172 int r_direct_mode;
173 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
174 bool r_direct_is_hash; /* true if r_direct_hash is valid */
175
176 /* data payload is used for xattr ops */
177 struct page **r_pages;
178 int r_num_pages;
179 int r_data_len;
180
181 /* what caps shall we drop? */
182 int r_inode_drop, r_inode_unless;
183 int r_dentry_drop, r_dentry_unless;
184 int r_old_dentry_drop, r_old_dentry_unless;
185 struct inode *r_old_inode;
186 int r_old_inode_drop, r_old_inode_unless;
187
188 struct ceph_msg *r_request; /* original request */
189 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err;
192 bool r_aborted;
193
194 unsigned long r_timeout; /* optional. jiffies */
195 unsigned long r_started; /* start time to measure timeout against */
196 unsigned long r_request_started; /* start time for mds request only,
197 used to measure lease durations */
198
199 /* link unsafe requests to parent directory, for fsync */
200 struct inode *r_unsafe_dir;
201 struct list_head r_unsafe_dir_item;
202
203 struct ceph_mds_session *r_session;
204
205 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208 int r_resend_mds; /* mds to resend to next, if any*/
209
210 struct kref r_kref;
211 struct list_head r_wait;
212 struct completion r_completion;
213 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe;
217
218 bool r_did_prepopulate;
219 u32 r_readdir_offset;
220
221 struct ceph_cap_reservation r_caps_reservation;
222 int r_num_caps;
223};
224
225/*
226 * mds client state
227 */
228struct ceph_mds_client {
229 struct ceph_client *client;
230 struct mutex mutex; /* all nested structures */
231
232 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters;
234 struct list_head waiting_for_map;
235
236 struct ceph_mds_session **sessions; /* NULL for mds if no session */
 237	int                     max_sessions;  /* len of sessions array */
238 int stopping; /* true if shutting down */
239
240 /*
241 * snap_rwsem will cover cap linkage into snaprealms, and
242 * realm snap contexts. (later, we can do per-realm snap
243 * contexts locks..) the empty list contains realms with no
244 * references (implying they contain no inodes with caps) that
245 * should be destroyed.
246 */
247 struct rw_semaphore snap_rwsem;
248 struct rb_root snap_realms;
249 struct list_head snap_empty;
250 spinlock_t snap_empty_lock; /* protect snap_empty */
251
252 u64 last_tid; /* most recent mds request */
253 struct rb_root request_tree; /* pending mds requests */
254 struct delayed_work delayed_work; /* delayed work */
255 unsigned long last_renew_caps; /* last time we renewed our caps */
256 struct list_head cap_delay_list; /* caps with delayed release */
257 spinlock_t cap_delay_lock; /* protects cap_delay_list */
258 struct list_head snap_flush_list; /* cap_snaps ready to flush */
259 spinlock_t snap_flush_lock;
260
261 u64 cap_flush_seq;
262 struct list_head cap_dirty; /* inodes with dirty caps */
263 int num_cap_flushing; /* # caps we are flushing */
264 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq;
266
267#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file;
269#endif
270
271 spinlock_t dentry_lru_lock;
272 struct list_head dentry_lru;
273 int num_dentry;
274};
275
276extern const char *ceph_mds_op_name(int op);
277
278extern struct ceph_mds_session *
279__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
280
281static inline struct ceph_mds_session *
282ceph_get_mds_session(struct ceph_mds_session *s)
283{
284 atomic_inc(&s->s_ref);
285 return s;
286}
287
288extern void ceph_put_mds_session(struct ceph_mds_session *s);
289
290extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
291 struct ceph_msg *msg, int mds);
292
293extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
294 struct ceph_client *client);
295extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
296extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
297
298extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
299
300extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode,
302 struct dentry *dn, int mask);
303
304extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
307 struct ceph_mds_request *req);
308extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
309 struct inode *dir,
310 struct ceph_mds_request *req);
311static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
312{
313 kref_get(&req->r_kref);
314}
315extern void ceph_mdsc_release_request(struct kref *kref);
316static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
317{
318 kref_put(&req->r_kref, ceph_mdsc_release_request);
319}
320
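For orientation, here is a minimal sketch of the create/submit/put lifecycle these helpers imply. It is illustrative only: the op code, the NULL parent dir, and the error handling are assumptions, and real callers also take proper references on r_inode.

	struct ceph_mds_request *req;
	int err;

	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR,
				       USE_ANY_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;		/* operate on this inode (arg1) */
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	ceph_mdsc_put_request(req);	/* drops the kref from create */
	return err;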
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
324 int stop_on_nosnap);
325
326extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
327extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
328 struct inode *inode,
329 struct dentry *dentry, char action,
330 u32 seq);
331
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg);
334
335#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
 17 * choose a random mds that is "up" (i.e. has a state > 0), or -1 if none are.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
 23 u8 r; /* u8 keeps r % n non-negative */
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
 32 /* pick the n-th "up" mds, skipping any that are down */
 33 get_random_bytes(&r, 1);
 34 n = r % n;
 35 for (i = 0; m->m_info[i].state <= 0 || n > 0; i++)
 36 if (m->m_info[i].state > 0)
 37 n--;
 38 /* i now indexes an mds with state > 0 */
 39
 40 return i;
41}
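A hedged example of consuming the result; the -EAGAIN fallback is an assumption, not taken from this file:

	int mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);

	if (mds < 0)		/* no mds is up; wait for a newer map */
		return -EAGAIN;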
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 const void *start = *p;
53 int i, j, n;
54 int err = -EINVAL;
55 u16 version;
56
57 m = kzalloc(sizeof(*m), GFP_NOFS);
58 if (m == NULL)
59 return ERR_PTR(-ENOMEM);
60
61 ceph_decode_16_safe(p, end, version, bad);
62
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p);
65 m->m_client_epoch = ceph_decode_32(p);
66 m->m_last_failure = ceph_decode_32(p);
67 m->m_root = ceph_decode_32(p);
68 m->m_session_timeout = ceph_decode_32(p);
69 m->m_session_autoclose = ceph_decode_32(p);
70 m->m_max_file_size = ceph_decode_64(p);
71 m->m_max_mds = ceph_decode_32(p);
72
73 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
74 if (m->m_info == NULL)
75 goto badmem;
76
77 /* pick out active nodes from mds_info (state > 0) */
78 n = ceph_decode_32(p);
79 for (i = 0; i < n; i++) {
80 u64 global_id;
81 u32 namelen;
82 s32 mds, inc, state;
83 u64 state_seq;
84 u8 infoversion;
85 struct ceph_entity_addr addr;
86 u32 num_export_targets;
87 void *pexport_targets = NULL;
88
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p);
91 infoversion = ceph_decode_8(p);
92 *p += sizeof(u64);
93 namelen = ceph_decode_32(p); /* skip mds name */
94 *p += namelen;
95
96 ceph_decode_need(p, end,
97 4*sizeof(u32) + sizeof(u64) +
98 sizeof(addr) + sizeof(struct ceph_timespec),
99 bad);
100 mds = ceph_decode_32(p);
101 inc = ceph_decode_32(p);
102 state = ceph_decode_32(p);
103 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec);
107 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen;
110 if (infoversion >= 2) {
111 ceph_decode_32_safe(p, end, num_export_targets, bad);
112 pexport_targets = *p;
113 *p += num_export_targets * sizeof(u32);
114 } else {
115 num_export_targets = 0;
116 }
117
118 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
119 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
120 ceph_mds_state_name(state));
121 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
122 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr;
125 m->m_info[mds].num_export_targets = num_export_targets;
 126 if (num_export_targets) {
 127 m->m_info[mds].export_targets = kcalloc(num_export_targets, sizeof(u32), GFP_NOFS);
 128 if (m->m_info[mds].export_targets == NULL)
 129 goto badmem;
 130 for (j = 0; j < num_export_targets; j++)
 131 m->m_info[mds].export_targets[j] =
 132 ceph_decode_32(&pexport_targets);
 133 } else {
 134 m->m_info[mds].export_targets = NULL;
 135 }
136 }
137 }
138
139 /* pg_pools */
140 ceph_decode_32_safe(p, end, n, bad);
141 m->m_num_data_pg_pools = n;
142 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
143 if (!m->m_data_pg_pools)
144 goto badmem;
145 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
146 for (i = 0; i < n; i++)
147 m->m_data_pg_pools[i] = ceph_decode_32(p);
148 m->m_cas_pg_pool = ceph_decode_32(p);
149
150 /* ok, we don't care about the rest. */
151 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
152 return m;
153
154badmem:
155 err = -ENOMEM;
156bad:
157 pr_err("corrupt mdsmap\n");
158 print_hex_dump(KERN_DEBUG, "mdsmap: ",
159 DUMP_PREFIX_OFFSET, 16, 1,
160 start, end - start, true);
161 ceph_mdsmap_destroy(m);
162 return ERR_PTR(-EINVAL);
163}
164
165void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
166{
167 int i;
168
169 for (i = 0; i < m->m_max_mds; i++)
170 kfree(m->m_info[i].export_targets);
171 kfree(m->m_info);
172 kfree(m->m_data_pg_pools);
173 kfree(m);
174}
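ceph_mdsmap_decode() takes a cursor pair rather than a buffer/length so it can be embedded in larger message decoders. A hedged sketch of the calling convention (the msg-> names are illustrative):

	void *p = msg->front.iov_base;
	void *end = p + msg->front.iov_len;
	struct ceph_mdsmap *newmap;

	newmap = ceph_mdsmap_decode(&p, end);	/* p advances past the map */
	if (IS_ERR(newmap))
		return PTR_ERR(newmap);
	/* ... install newmap, ceph_mdsmap_destroy() the old one ... */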
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
 9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 u32 *export_targets;
17};
18
19struct ceph_mdsmap {
20 u32 m_epoch, m_client_epoch, m_last_failure;
21 u32 m_root;
22 u32 m_session_timeout; /* seconds */
23 u32 m_session_autoclose; /* seconds */
24 u64 m_max_file_size;
 25 u32 m_max_mds; /* size of m_info array */
26 struct ceph_mds_info *m_info;
27
28 /* which object pools file data can be stored in */
29 int m_num_data_pg_pools;
30 u32 *m_data_pg_pools;
31 u32 m_cas_pg_pool;
32};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
42static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
43{
44 BUG_ON(w < 0);
45 if (w >= m->m_max_mds)
46 return CEPH_MDS_STATE_DNE;
47 return m->m_info[w].state;
48}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..8f1715ffbe4b
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2240 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
27
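/*
 * Rough wire framing, as assembled by the prepare_write_* helpers
 * below (a sketch derived from this file, not an authoritative spec):
 *
 *   message:   [tag CEPH_MSGR_TAG_MSG][ceph_msg_header][front]
 *              [middle (optional)][data pages (optional)][footer w/ crcs]
 *   ack:       [tag CEPH_MSGR_TAG_ACK][le64 seq]
 *   keepalive: [tag CEPH_MSGR_TAG_KEEPALIVE]
 */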
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33
34static void queue_con(struct ceph_connection *con);
35static void con_work(struct work_struct *);
36static void ceph_fault(struct ceph_connection *con);
37
38const char *ceph_name_type_str(int t)
39{
40 switch (t) {
41 case CEPH_ENTITY_TYPE_MON: return "mon";
42 case CEPH_ENTITY_TYPE_MDS: return "mds";
43 case CEPH_ENTITY_TYPE_OSD: return "osd";
44 case CEPH_ENTITY_TYPE_CLIENT: return "client";
45 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
46 default: return "???";
47 }
48}
49
50/*
51 * nicely render a sockaddr as a string.
52 */
53#define MAX_ADDR_STR 20
54static char addr_str[MAX_ADDR_STR][40];
55static DEFINE_SPINLOCK(addr_str_lock);
56static int last_addr_str;
57
58const char *pr_addr(const struct sockaddr_storage *ss)
59{
60 int i;
61 char *s;
62 struct sockaddr_in *in4 = (void *)ss;
63 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
64 struct sockaddr_in6 *in6 = (void *)ss;
65
66 spin_lock(&addr_str_lock);
67 i = last_addr_str++;
68 if (last_addr_str == MAX_ADDR_STR)
69 last_addr_str = 0;
70 spin_unlock(&addr_str_lock);
71 s = addr_str[i];
72
73 switch (ss->ss_family) {
74 case AF_INET:
75 sprintf(s, "%u.%u.%u.%u:%u",
76 (unsigned int)quad[0],
77 (unsigned int)quad[1],
78 (unsigned int)quad[2],
79 (unsigned int)quad[3],
80 (unsigned int)ntohs(in4->sin_port));
81 break;
82
83 case AF_INET6:
84 sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
85 in6->sin6_addr.s6_addr16[0],
86 in6->sin6_addr.s6_addr16[1],
87 in6->sin6_addr.s6_addr16[2],
88 in6->sin6_addr.s6_addr16[3],
89 in6->sin6_addr.s6_addr16[4],
90 in6->sin6_addr.s6_addr16[5],
91 in6->sin6_addr.s6_addr16[6],
92 in6->sin6_addr.s6_addr16[7],
93 (unsigned int)ntohs(in6->sin6_port));
94 break;
95
96 default:
97 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
98 }
99
100 return s;
101}
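/*
 * Note: the returned string lives in a small rotating pool of static
 * buffers, so it is only safe for transient use (e.g. within a single
 * printk); it may be overwritten after MAX_ADDR_STR further calls.
 */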
102
103static void encode_my_addr(struct ceph_messenger *msgr)
104{
105 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
106 ceph_encode_addr(&msgr->my_enc_addr);
107}
108
109/*
110 * work queue for all reading and writing to/from the socket.
111 */
112struct workqueue_struct *ceph_msgr_wq;
113
114int __init ceph_msgr_init(void)
 115{
 116 ceph_msgr_wq = create_workqueue("ceph-msgr");
 117 /* create_workqueue() returns NULL on failure, not an ERR_PTR */
 118 if (ceph_msgr_wq == NULL) {
 119 pr_err("msgr_init failed to create workqueue\n");
 120 return -ENOMEM;
 121 }
 122
 123 return 0;
 124}
125
126void ceph_msgr_exit(void)
127{
128 destroy_workqueue(ceph_msgr_wq);
129}
130
131/*
132 * socket callback functions
133 */
134
135/* data available on socket, or listen socket received a connect */
136static void ceph_data_ready(struct sock *sk, int count_unused)
137{
138 struct ceph_connection *con =
139 (struct ceph_connection *)sk->sk_user_data;
140 if (sk->sk_state != TCP_CLOSE_WAIT) {
141 dout("ceph_data_ready on %p state = %lu, queueing work\n",
142 con, con->state);
143 queue_con(con);
144 }
145}
146
147/* socket has buffer space for writing */
148static void ceph_write_space(struct sock *sk)
149{
150 struct ceph_connection *con =
151 (struct ceph_connection *)sk->sk_user_data;
152
153 /* only queue to workqueue if there is data we want to write. */
154 if (test_bit(WRITE_PENDING, &con->state)) {
155 dout("ceph_write_space %p queueing write work\n", con);
156 queue_con(con);
157 } else {
158 dout("ceph_write_space %p nothing to write\n", con);
159 }
160
161 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
162 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
163}
164
165/* socket's state has changed */
166static void ceph_state_change(struct sock *sk)
167{
168 struct ceph_connection *con =
169 (struct ceph_connection *)sk->sk_user_data;
170
171 dout("ceph_state_change %p state = %lu sk_state = %u\n",
172 con, con->state, sk->sk_state);
173
174 if (test_bit(CLOSED, &con->state))
175 return;
176
177 switch (sk->sk_state) {
178 case TCP_CLOSE:
 179 dout("ceph_state_change TCP_CLOSE\n"); /* fall through */
180 case TCP_CLOSE_WAIT:
181 dout("ceph_state_change TCP_CLOSE_WAIT\n");
182 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
183 if (test_bit(CONNECTING, &con->state))
184 con->error_msg = "connection failed";
185 else
186 con->error_msg = "socket closed";
187 queue_con(con);
188 }
189 break;
190 case TCP_ESTABLISHED:
191 dout("ceph_state_change TCP_ESTABLISHED\n");
192 queue_con(con);
193 break;
194 }
195}
196
197/*
198 * set up socket callbacks
199 */
200static void set_sock_callbacks(struct socket *sock,
201 struct ceph_connection *con)
202{
203 struct sock *sk = sock->sk;
204 sk->sk_user_data = (void *)con;
205 sk->sk_data_ready = ceph_data_ready;
206 sk->sk_write_space = ceph_write_space;
207 sk->sk_state_change = ceph_state_change;
208}
209
210
211/*
212 * socket helpers
213 */
214
215/*
216 * initiate connection to a remote socket.
217 */
218static struct socket *ceph_tcp_connect(struct ceph_connection *con)
219{
220 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
221 struct socket *sock;
222 int ret;
223
224 BUG_ON(con->sock);
225 ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
226 if (ret)
227 return ERR_PTR(ret);
228 con->sock = sock;
229 sock->sk->sk_allocation = GFP_NOFS;
230
231 set_sock_callbacks(sock, con);
232
233 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
234
235 ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
236 if (ret == -EINPROGRESS) {
237 dout("connect %s EINPROGRESS sk_state = %u\n",
238 pr_addr(&con->peer_addr.in_addr),
239 sock->sk->sk_state);
240 ret = 0;
241 }
242 if (ret < 0) {
243 pr_err("connect %s error %d\n",
244 pr_addr(&con->peer_addr.in_addr), ret);
245 sock_release(sock);
246 con->sock = NULL;
247 con->error_msg = "connect error";
248 }
249
250 if (ret < 0)
251 return ERR_PTR(ret);
252 return sock;
253}
254
255static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
256{
257 struct kvec iov = {buf, len};
258 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
259
260 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
261}
262
263/*
264 * write something. @more is true if caller will be sending more data
265 * shortly.
266 */
267static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
268 size_t kvlen, size_t len, int more)
269{
270 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
271
272 if (more)
273 msg.msg_flags |= MSG_MORE;
274 else
275 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
276
277 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
278}
279
280
281/*
282 * Shutdown/close the socket for the given connection.
283 */
284static int con_close_socket(struct ceph_connection *con)
285{
286 int rc;
287
288 dout("con_close_socket on %p sock %p\n", con, con->sock);
289 if (!con->sock)
290 return 0;
291 set_bit(SOCK_CLOSED, &con->state);
292 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
293 sock_release(con->sock);
294 con->sock = NULL;
295 clear_bit(SOCK_CLOSED, &con->state);
296 return rc;
297}
298
299/*
300 * Reset a connection. Discard all incoming and outgoing messages
301 * and clear *_seq state.
302 */
303static void ceph_msg_remove(struct ceph_msg *msg)
304{
305 list_del_init(&msg->list_head);
306 ceph_msg_put(msg);
307}
308static void ceph_msg_remove_list(struct list_head *head)
309{
310 while (!list_empty(head)) {
311 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
312 list_head);
313 ceph_msg_remove(msg);
314 }
315}
316
317static void reset_connection(struct ceph_connection *con)
318{
319 /* reset connection, out_queue, msg_ and connect_seq */
320 /* discard existing out_queue and msg_seq */
321 ceph_msg_remove_list(&con->out_queue);
322 ceph_msg_remove_list(&con->out_sent);
323
324 if (con->in_msg) {
325 ceph_msg_put(con->in_msg);
326 con->in_msg = NULL;
327 }
328
329 con->connect_seq = 0;
330 con->out_seq = 0;
331 if (con->out_msg) {
332 ceph_msg_put(con->out_msg);
333 con->out_msg = NULL;
334 }
335 con->in_seq = 0;
336}
337
338/*
339 * mark a peer down. drop any open connections.
340 */
341void ceph_con_close(struct ceph_connection *con)
342{
343 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
344 set_bit(CLOSED, &con->state); /* in case there's queued work */
345 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
346 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
347 clear_bit(KEEPALIVE_PENDING, &con->state);
348 clear_bit(WRITE_PENDING, &con->state);
349 mutex_lock(&con->mutex);
350 reset_connection(con);
351 cancel_delayed_work(&con->work);
352 mutex_unlock(&con->mutex);
353 queue_con(con);
354}
355
356/*
357 * Reopen a closed connection, with a new peer address.
358 */
359void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
360{
361 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
362 set_bit(OPENING, &con->state);
363 clear_bit(CLOSED, &con->state);
364 memcpy(&con->peer_addr, addr, sizeof(*addr));
365 con->delay = 0; /* reset backoff memory */
366 queue_con(con);
367}
368
369/*
370 * return true if this connection ever successfully opened
371 */
372bool ceph_con_opened(struct ceph_connection *con)
373{
374 return con->connect_seq > 0;
375}
376
377/*
378 * generic get/put
379 */
380struct ceph_connection *ceph_con_get(struct ceph_connection *con)
381{
382 dout("con_get %p nref = %d -> %d\n", con,
383 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
384 if (atomic_inc_not_zero(&con->nref))
385 return con;
386 return NULL;
387}
388
389void ceph_con_put(struct ceph_connection *con)
390{
391 dout("con_put %p nref = %d -> %d\n", con,
392 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
393 BUG_ON(atomic_read(&con->nref) == 0);
394 if (atomic_dec_and_test(&con->nref)) {
395 BUG_ON(con->sock);
396 kfree(con);
397 }
398}
399
400/*
401 * initialize a new connection.
402 */
403void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
404{
405 dout("con_init %p\n", con);
406 memset(con, 0, sizeof(*con));
407 atomic_set(&con->nref, 1);
408 con->msgr = msgr;
409 mutex_init(&con->mutex);
410 INIT_LIST_HEAD(&con->out_queue);
411 INIT_LIST_HEAD(&con->out_sent);
412 INIT_DELAYED_WORK(&con->work, con_work);
413}
414
415
416/*
417 * We maintain a global counter to order connection attempts. Get
418 * a unique seq greater than @gt.
419 */
420static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
421{
422 u32 ret;
423
424 spin_lock(&msgr->global_seq_lock);
425 if (msgr->global_seq < gt)
426 msgr->global_seq = gt;
427 ret = ++msgr->global_seq;
428 spin_unlock(&msgr->global_seq_lock);
429 return ret;
430}
431
432
433/*
434 * Prepare footer for currently outgoing message, and finish things
 435 * off. Assumes out_kvec* are already valid; we just add on to the end.
436 */
437static void prepare_write_message_footer(struct ceph_connection *con, int v)
438{
439 struct ceph_msg *m = con->out_msg;
440
441 dout("prepare_write_message_footer %p\n", con);
442 con->out_kvec_is_msg = true;
443 con->out_kvec[v].iov_base = &m->footer;
444 con->out_kvec[v].iov_len = sizeof(m->footer);
445 con->out_kvec_bytes += sizeof(m->footer);
446 con->out_kvec_left++;
447 con->out_more = m->more_to_follow;
448 con->out_msg_done = true;
449}
450
451/*
452 * Prepare headers for the next outgoing message.
453 */
454static void prepare_write_message(struct ceph_connection *con)
455{
456 struct ceph_msg *m;
457 int v = 0;
458
459 con->out_kvec_bytes = 0;
460 con->out_kvec_is_msg = true;
461 con->out_msg_done = false;
462
463 /* Sneak an ack in there first? If we can get it into the same
464 * TCP packet that's a good thing. */
465 if (con->in_seq > con->in_seq_acked) {
466 con->in_seq_acked = con->in_seq;
467 con->out_kvec[v].iov_base = &tag_ack;
468 con->out_kvec[v++].iov_len = 1;
469 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
470 con->out_kvec[v].iov_base = &con->out_temp_ack;
471 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
472 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
473 }
474
475 m = list_first_entry(&con->out_queue,
476 struct ceph_msg, list_head);
477 con->out_msg = m;
478 if (test_bit(LOSSYTX, &con->state)) {
479 list_del_init(&m->list_head);
480 } else {
481 /* put message on sent list */
482 ceph_msg_get(m);
483 list_move_tail(&m->list_head, &con->out_sent);
484 }
485
486 m->hdr.seq = cpu_to_le64(++con->out_seq);
487
488 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
489 m, con->out_seq, le16_to_cpu(m->hdr.type),
490 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
491 le32_to_cpu(m->hdr.data_len),
492 m->nr_pages);
493 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
494
495 /* tag + hdr + front + middle */
496 con->out_kvec[v].iov_base = &tag_msg;
497 con->out_kvec[v++].iov_len = 1;
498 con->out_kvec[v].iov_base = &m->hdr;
499 con->out_kvec[v++].iov_len = sizeof(m->hdr);
500 con->out_kvec[v++] = m->front;
501 if (m->middle)
502 con->out_kvec[v++] = m->middle->vec;
503 con->out_kvec_left = v;
504 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
505 (m->middle ? m->middle->vec.iov_len : 0);
506 con->out_kvec_cur = con->out_kvec;
507
508 /* fill in crc (except data pages), footer */
509 con->out_msg->hdr.crc =
510 cpu_to_le32(crc32c(0, (void *)&m->hdr,
511 sizeof(m->hdr) - sizeof(m->hdr.crc)));
512 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
513 con->out_msg->footer.front_crc =
514 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
515 if (m->middle)
516 con->out_msg->footer.middle_crc =
517 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
518 m->middle->vec.iov_len));
519 else
520 con->out_msg->footer.middle_crc = 0;
521 con->out_msg->footer.data_crc = 0;
522 dout("prepare_write_message front_crc %u data_crc %u\n",
523 le32_to_cpu(con->out_msg->footer.front_crc),
524 le32_to_cpu(con->out_msg->footer.middle_crc));
525
526 /* is there a data payload? */
527 if (le32_to_cpu(m->hdr.data_len) > 0) {
528 /* initialize page iterator */
529 con->out_msg_pos.page = 0;
530 con->out_msg_pos.page_pos =
531 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
532 con->out_msg_pos.data_pos = 0;
533 con->out_msg_pos.did_page_crc = 0;
534 con->out_more = 1; /* data + footer will follow */
535 } else {
536 /* no, queue up footer too and be done */
537 prepare_write_message_footer(con, v);
538 }
539
540 set_bit(WRITE_PENDING, &con->state);
541}
542
543/*
544 * Prepare an ack.
545 */
546static void prepare_write_ack(struct ceph_connection *con)
547{
548 dout("prepare_write_ack %p %llu -> %llu\n", con,
549 con->in_seq_acked, con->in_seq);
550 con->in_seq_acked = con->in_seq;
551
552 con->out_kvec[0].iov_base = &tag_ack;
553 con->out_kvec[0].iov_len = 1;
554 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
555 con->out_kvec[1].iov_base = &con->out_temp_ack;
556 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
557 con->out_kvec_left = 2;
558 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
559 con->out_kvec_cur = con->out_kvec;
560 con->out_more = 1; /* more will follow.. eventually.. */
561 set_bit(WRITE_PENDING, &con->state);
562}
563
564/*
565 * Prepare to write keepalive byte.
566 */
567static void prepare_write_keepalive(struct ceph_connection *con)
568{
569 dout("prepare_write_keepalive %p\n", con);
570 con->out_kvec[0].iov_base = &tag_keepalive;
571 con->out_kvec[0].iov_len = 1;
572 con->out_kvec_left = 1;
573 con->out_kvec_bytes = 1;
574 con->out_kvec_cur = con->out_kvec;
575 set_bit(WRITE_PENDING, &con->state);
576}
577
578/*
579 * Connection negotiation.
580 */
581
582static void prepare_connect_authorizer(struct ceph_connection *con)
583{
 584 void *auth_buf = NULL;
585 int auth_len = 0;
586 int auth_protocol = 0;
587
588 mutex_unlock(&con->mutex);
589 if (con->ops->get_authorizer)
590 con->ops->get_authorizer(con, &auth_buf, &auth_len,
591 &auth_protocol, &con->auth_reply_buf,
592 &con->auth_reply_buf_len,
593 con->auth_retry);
594 mutex_lock(&con->mutex);
595
596 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
597 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
598
599 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
600 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
601 con->out_kvec_left++;
602 con->out_kvec_bytes += auth_len;
603}
604
605/*
606 * We connected to a peer and are saying hello.
607 */
608static void prepare_write_banner(struct ceph_messenger *msgr,
609 struct ceph_connection *con)
610{
611 int len = strlen(CEPH_BANNER);
612
613 con->out_kvec[0].iov_base = CEPH_BANNER;
614 con->out_kvec[0].iov_len = len;
615 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
616 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
617 con->out_kvec_left = 2;
618 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
619 con->out_kvec_cur = con->out_kvec;
620 con->out_more = 0;
621 set_bit(WRITE_PENDING, &con->state);
622}
623
624static void prepare_write_connect(struct ceph_messenger *msgr,
625 struct ceph_connection *con,
626 int after_banner)
627{
628 unsigned global_seq = get_global_seq(con->msgr, 0);
629 int proto;
630
631 switch (con->peer_name.type) {
632 case CEPH_ENTITY_TYPE_MON:
633 proto = CEPH_MONC_PROTOCOL;
634 break;
635 case CEPH_ENTITY_TYPE_OSD:
636 proto = CEPH_OSDC_PROTOCOL;
637 break;
638 case CEPH_ENTITY_TYPE_MDS:
639 proto = CEPH_MDSC_PROTOCOL;
640 break;
641 default:
642 BUG();
643 }
644
645 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
646 con->connect_seq, global_seq, proto);
647
648 con->out_connect.features = CEPH_FEATURE_SUPPORTED;
649 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
650 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
651 con->out_connect.global_seq = cpu_to_le32(global_seq);
652 con->out_connect.protocol_version = cpu_to_le32(proto);
653 con->out_connect.flags = 0;
654
655 if (!after_banner) {
656 con->out_kvec_left = 0;
657 con->out_kvec_bytes = 0;
658 }
659 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
660 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
661 con->out_kvec_left++;
662 con->out_kvec_bytes += sizeof(con->out_connect);
663 con->out_kvec_cur = con->out_kvec;
664 con->out_more = 0;
665 set_bit(WRITE_PENDING, &con->state);
666
667 prepare_connect_authorizer(con);
668}
669
670
671/*
672 * write as much of pending kvecs to the socket as we can.
673 * 1 -> done
674 * 0 -> socket full, but more to do
675 * <0 -> error
676 */
677static int write_partial_kvec(struct ceph_connection *con)
678{
679 int ret;
680
681 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
682 while (con->out_kvec_bytes > 0) {
683 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
684 con->out_kvec_left, con->out_kvec_bytes,
685 con->out_more);
686 if (ret <= 0)
687 goto out;
688 con->out_kvec_bytes -= ret;
689 if (con->out_kvec_bytes == 0)
690 break; /* done */
691 while (ret > 0) {
692 if (ret >= con->out_kvec_cur->iov_len) {
693 ret -= con->out_kvec_cur->iov_len;
694 con->out_kvec_cur++;
695 con->out_kvec_left--;
696 } else {
697 con->out_kvec_cur->iov_len -= ret;
698 con->out_kvec_cur->iov_base += ret;
699 ret = 0;
700 break;
701 }
702 }
703 }
704 con->out_kvec_left = 0;
705 con->out_kvec_is_msg = false;
706 ret = 1;
707out:
708 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
709 con->out_kvec_bytes, con->out_kvec_left, ret);
 710 return ret;
711}
712
713/*
714 * Write as much message data payload as we can. If we finish, queue
715 * up the footer.
716 * 1 -> done, footer is now queued in out_kvec[].
717 * 0 -> socket full, but more to do
718 * <0 -> error
719 */
720static int write_partial_msg_pages(struct ceph_connection *con)
721{
722 struct ceph_msg *msg = con->out_msg;
723 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
724 size_t len;
725 int crc = con->msgr->nocrc;
726 int ret;
727
728 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
729 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
730 con->out_msg_pos.page_pos);
731
732 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
733 struct page *page = NULL;
734 void *kaddr = NULL;
735
736 /*
737 * if we are calculating the data crc (the default), we need
738 * to map the page. if our pages[] has been revoked, use the
739 * zero page.
740 */
741 if (msg->pages) {
742 page = msg->pages[con->out_msg_pos.page];
743 if (crc)
744 kaddr = kmap(page);
745 } else if (msg->pagelist) {
746 page = list_first_entry(&msg->pagelist->head,
747 struct page, lru);
748 if (crc)
749 kaddr = kmap(page);
750 } else {
751 page = con->msgr->zero_page;
752 if (crc)
753 kaddr = page_address(con->msgr->zero_page);
754 }
755 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
756 (int)(data_len - con->out_msg_pos.data_pos));
757 if (crc && !con->out_msg_pos.did_page_crc) {
758 void *base = kaddr + con->out_msg_pos.page_pos;
759 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
760
761 BUG_ON(kaddr == NULL);
762 con->out_msg->footer.data_crc =
763 cpu_to_le32(crc32c(tmpcrc, base, len));
764 con->out_msg_pos.did_page_crc = 1;
765 }
766
767 ret = kernel_sendpage(con->sock, page,
768 con->out_msg_pos.page_pos, len,
769 MSG_DONTWAIT | MSG_NOSIGNAL |
770 MSG_MORE);
771
772 if (crc && (msg->pages || msg->pagelist))
773 kunmap(page);
774
775 if (ret <= 0)
776 goto out;
777
778 con->out_msg_pos.data_pos += ret;
779 con->out_msg_pos.page_pos += ret;
780 if (ret == len) {
781 con->out_msg_pos.page_pos = 0;
782 con->out_msg_pos.page++;
783 con->out_msg_pos.did_page_crc = 0;
784 if (msg->pagelist)
785 list_move_tail(&page->lru,
786 &msg->pagelist->head);
787 }
788 }
789
790 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
791
792 /* prepare and queue up footer, too */
793 if (!crc)
794 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
795 con->out_kvec_bytes = 0;
796 con->out_kvec_left = 0;
797 con->out_kvec_cur = con->out_kvec;
798 prepare_write_message_footer(con, 0);
799 ret = 1;
800out:
801 return ret;
802}
803
804/*
805 * write some zeros
806 */
807static int write_partial_skip(struct ceph_connection *con)
808{
809 int ret;
810
811 while (con->out_skip > 0) {
812 struct kvec iov = {
813 .iov_base = page_address(con->msgr->zero_page),
814 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
815 };
816
817 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
818 if (ret <= 0)
819 goto out;
820 con->out_skip -= ret;
821 }
822 ret = 1;
823out:
824 return ret;
825}
826
827/*
828 * Prepare to read connection handshake, or an ack.
829 */
830static void prepare_read_banner(struct ceph_connection *con)
831{
832 dout("prepare_read_banner %p\n", con);
833 con->in_base_pos = 0;
834}
835
836static void prepare_read_connect(struct ceph_connection *con)
837{
838 dout("prepare_read_connect %p\n", con);
839 con->in_base_pos = 0;
840}
841
842static void prepare_read_ack(struct ceph_connection *con)
843{
844 dout("prepare_read_ack %p\n", con);
845 con->in_base_pos = 0;
846}
847
848static void prepare_read_tag(struct ceph_connection *con)
849{
850 dout("prepare_read_tag %p\n", con);
851 con->in_base_pos = 0;
852 con->in_tag = CEPH_MSGR_TAG_READY;
853}
854
855/*
856 * Prepare to read a message.
857 */
858static int prepare_read_message(struct ceph_connection *con)
859{
860 dout("prepare_read_message %p\n", con);
861 BUG_ON(con->in_msg != NULL);
862 con->in_base_pos = 0;
863 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
864 return 0;
865}
866
867
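/*
 * Read exactly @size bytes into @object, resuming across short reads.
 * @to is a running end-offset within the current exchange: each call
 * adds @size to *to and then receives until con->in_base_pos catches
 * up, so a fixed sequence of read_partial() calls can safely be
 * re-entered after the socket runs dry.
 */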
868static int read_partial(struct ceph_connection *con,
869 int *to, int size, void *object)
870{
871 *to += size;
872 while (con->in_base_pos < *to) {
873 int left = *to - con->in_base_pos;
874 int have = size - left;
875 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
876 if (ret <= 0)
877 return ret;
878 con->in_base_pos += ret;
879 }
880 return 1;
881}
882
883
884/*
885 * Read all or part of the connect-side handshake on a new connection
886 */
887static int read_partial_banner(struct ceph_connection *con)
888{
889 int ret, to = 0;
890
891 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
892
893 /* peer's banner */
894 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
895 if (ret <= 0)
896 goto out;
897 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
898 &con->actual_peer_addr);
899 if (ret <= 0)
900 goto out;
901 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
902 &con->peer_addr_for_me);
903 if (ret <= 0)
904 goto out;
905out:
906 return ret;
907}
908
909static int read_partial_connect(struct ceph_connection *con)
910{
911 int ret, to = 0;
912
913 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
914
915 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
916 if (ret <= 0)
917 goto out;
918 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
919 con->auth_reply_buf);
920 if (ret <= 0)
921 goto out;
922
923 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
924 con, (int)con->in_reply.tag,
925 le32_to_cpu(con->in_reply.connect_seq),
926 le32_to_cpu(con->in_reply.global_seq));
927out:
928 return ret;
 929}
 930
931
932/*
933 * Verify the hello banner looks okay.
934 */
935static int verify_hello(struct ceph_connection *con)
936{
937 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
938 pr_err("connect to %s got bad banner\n",
939 pr_addr(&con->peer_addr.in_addr));
940 con->error_msg = "protocol error, bad banner";
941 return -1;
942 }
943 return 0;
944}
945
946static bool addr_is_blank(struct sockaddr_storage *ss)
947{
948 switch (ss->ss_family) {
949 case AF_INET:
950 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
951 case AF_INET6:
952 return
953 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
954 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
955 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
956 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
957 }
958 return false;
959}
960
961static int addr_port(struct sockaddr_storage *ss)
962{
963 switch (ss->ss_family) {
964 case AF_INET:
965 return ntohs(((struct sockaddr_in *)ss)->sin_port);
966 case AF_INET6:
967 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
968 }
969 return 0;
970}
971
972static void addr_set_port(struct sockaddr_storage *ss, int p)
973{
974 switch (ss->ss_family) {
975 case AF_INET:
 976 ((struct sockaddr_in *)ss)->sin_port = htons(p); break;
977 case AF_INET6:
978 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
979 }
980}
981
982/*
983 * Parse an ip[:port] list into an addr array. Use the default
984 * monitor port if a port isn't specified.
985 */
986int ceph_parse_ips(const char *c, const char *end,
987 struct ceph_entity_addr *addr,
988 int max_count, int *count)
989{
990 int i;
991 const char *p = c;
992
993 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
994 for (i = 0; i < max_count; i++) {
995 const char *ipend;
996 struct sockaddr_storage *ss = &addr[i].in_addr;
997 struct sockaddr_in *in4 = (void *)ss;
998 struct sockaddr_in6 *in6 = (void *)ss;
999 int port;
1000
1001 memset(ss, 0, sizeof(*ss));
1002 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1003 ',', &ipend)) {
1004 ss->ss_family = AF_INET;
1005 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1006 ',', &ipend)) {
1007 ss->ss_family = AF_INET6;
1008 } else {
1009 goto bad;
1010 }
1011 p = ipend;
1012
1013 /* port? */
1014 if (p < end && *p == ':') {
1015 port = 0;
1016 p++;
1017 while (p < end && *p >= '0' && *p <= '9') {
1018 port = (port * 10) + (*p - '0');
1019 p++;
1020 }
1021 if (port > 65535 || port == 0)
1022 goto bad;
1023 } else {
1024 port = CEPH_MON_PORT;
1025 }
1026
1027 addr_set_port(ss, port);
1028
1029 dout("parse_ips got %s\n", pr_addr(ss));
1030
1031 if (p == end)
1032 break;
1033 if (*p != ',')
1034 goto bad;
1035 p++;
1036 }
1037
1038 if (p != end)
1039 goto bad;
1040
1041 if (count)
1042 *count = i + 1;
1043 return 0;
1044
1045bad:
1046 pr_err("parse_ips bad ip '%s'\n", c);
1047 return -EINVAL;
1048}
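A hedged usage example (the addresses and array size are illustrative):

	struct ceph_entity_addr addrs[2];
	int count;
	const char *s = "192.168.0.1:6789,192.168.0.2";

	if (ceph_parse_ips(s, s + strlen(s), addrs, 2, &count) < 0)
		return -EINVAL;
	/* count == 2; the second addr got the default CEPH_MON_PORT */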
1049
1050static int process_banner(struct ceph_connection *con)
1051{
1052 dout("process_banner on %p\n", con);
1053
1054 if (verify_hello(con) < 0)
1055 return -1;
1056
1057 ceph_decode_addr(&con->actual_peer_addr);
1058 ceph_decode_addr(&con->peer_addr_for_me);
1059
1060 /*
 1061 * Make sure the other end is who we wanted. Note that the other
1062 * end may not yet know their ip address, so if it's 0.0.0.0, give
1063 * them the benefit of the doubt.
1064 */
1065 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1066 sizeof(con->peer_addr)) != 0 &&
1067 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1068 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1069 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
1070 pr_addr(&con->peer_addr.in_addr),
1071 le64_to_cpu(con->peer_addr.nonce),
1072 pr_addr(&con->actual_peer_addr.in_addr),
1073 le64_to_cpu(con->actual_peer_addr.nonce));
1074 con->error_msg = "wrong peer at address";
1075 return -1;
1076 }
1077
1078 /*
1079 * did we learn our address?
1080 */
1081 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1082 int port = addr_port(&con->msgr->inst.addr.in_addr);
1083
1084 memcpy(&con->msgr->inst.addr.in_addr,
1085 &con->peer_addr_for_me.in_addr,
1086 sizeof(con->peer_addr_for_me.in_addr));
1087 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1088 encode_my_addr(con->msgr);
1089 dout("process_banner learned my addr is %s\n",
1090 pr_addr(&con->msgr->inst.addr.in_addr));
1091 }
1092
1093 set_bit(NEGOTIATING, &con->state);
1094 prepare_read_connect(con);
1095 return 0;
1096}
1097
1098static void fail_protocol(struct ceph_connection *con)
1099{
1100 reset_connection(con);
1101 set_bit(CLOSED, &con->state); /* in case there's queued work */
1102
1103 mutex_unlock(&con->mutex);
1104 if (con->ops->bad_proto)
1105 con->ops->bad_proto(con);
1106 mutex_lock(&con->mutex);
1107}
1108
1109static int process_connect(struct ceph_connection *con)
1110{
1111 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1112 u64 req_feat = CEPH_FEATURE_REQUIRED;
1113 u64 server_feat = le64_to_cpu(con->in_reply.features);
1114
1115 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1116
1117 switch (con->in_reply.tag) {
1118 case CEPH_MSGR_TAG_FEATURES:
1119 pr_err("%s%lld %s feature set mismatch,"
1120 " my %llx < server's %llx, missing %llx\n",
1121 ENTITY_NAME(con->peer_name),
1122 pr_addr(&con->peer_addr.in_addr),
1123 sup_feat, server_feat, server_feat & ~sup_feat);
1124 con->error_msg = "missing required protocol features";
1125 fail_protocol(con);
1126 return -1;
1127
1128 case CEPH_MSGR_TAG_BADPROTOVER:
1129 pr_err("%s%lld %s protocol version mismatch,"
1130 " my %d != server's %d\n",
1131 ENTITY_NAME(con->peer_name),
1132 pr_addr(&con->peer_addr.in_addr),
1133 le32_to_cpu(con->out_connect.protocol_version),
1134 le32_to_cpu(con->in_reply.protocol_version));
1135 con->error_msg = "protocol version mismatch";
1136 fail_protocol(con);
1137 return -1;
1138
1139 case CEPH_MSGR_TAG_BADAUTHORIZER:
1140 con->auth_retry++;
1141 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1142 con->auth_retry);
1143 if (con->auth_retry == 2) {
1144 con->error_msg = "connect authorization failure";
1145 reset_connection(con);
1146 set_bit(CLOSED, &con->state);
1147 return -1;
1148 }
1149 con->auth_retry = 1;
1150 prepare_write_connect(con->msgr, con, 0);
1151 prepare_read_connect(con);
1152 break;
1153
1154 case CEPH_MSGR_TAG_RESETSESSION:
1155 /*
1156 * If we connected with a large connect_seq but the peer
1157 * has no record of a session with us (no connection, or
 1158 * connect_seq == 0), they will send RESETSESSION to indicate
1159 * that they must have reset their session, and may have
1160 * dropped messages.
1161 */
1162 dout("process_connect got RESET peer seq %u\n",
1163 le32_to_cpu(con->in_connect.connect_seq));
1164 pr_err("%s%lld %s connection reset\n",
1165 ENTITY_NAME(con->peer_name),
1166 pr_addr(&con->peer_addr.in_addr));
1167 reset_connection(con);
1168 prepare_write_connect(con->msgr, con, 0);
1169 prepare_read_connect(con);
1170
1171 /* Tell ceph about it. */
1172 mutex_unlock(&con->mutex);
1173 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1174 if (con->ops->peer_reset)
1175 con->ops->peer_reset(con);
1176 mutex_lock(&con->mutex);
1177 break;
1178
1179 case CEPH_MSGR_TAG_RETRY_SESSION:
1180 /*
1181 * If we sent a smaller connect_seq than the peer has, try
1182 * again with a larger value.
1183 */
1184 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1185 le32_to_cpu(con->out_connect.connect_seq),
1186 le32_to_cpu(con->in_connect.connect_seq));
1187 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1188 prepare_write_connect(con->msgr, con, 0);
1189 prepare_read_connect(con);
1190 break;
1191
1192 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1193 /*
1194 * If we sent a smaller global_seq than the peer has, try
1195 * again with a larger value.
1196 */
1197 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1198 con->peer_global_seq,
1199 le32_to_cpu(con->in_connect.global_seq));
1200 get_global_seq(con->msgr,
1201 le32_to_cpu(con->in_connect.global_seq));
1202 prepare_write_connect(con->msgr, con, 0);
1203 prepare_read_connect(con);
1204 break;
1205
1206 case CEPH_MSGR_TAG_READY:
1207 if (req_feat & ~server_feat) {
1208 pr_err("%s%lld %s protocol feature mismatch,"
1209 " my required %llx > server's %llx, need %llx\n",
1210 ENTITY_NAME(con->peer_name),
1211 pr_addr(&con->peer_addr.in_addr),
1212 req_feat, server_feat, req_feat & ~server_feat);
1213 con->error_msg = "missing required protocol features";
1214 fail_protocol(con);
1215 return -1;
1216 }
1217 clear_bit(CONNECTING, &con->state);
1218 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1219 con->connect_seq++;
1220 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1221 con->peer_global_seq,
1222 le32_to_cpu(con->in_reply.connect_seq),
1223 con->connect_seq);
1224 WARN_ON(con->connect_seq !=
1225 le32_to_cpu(con->in_reply.connect_seq));
1226
1227 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1228 set_bit(LOSSYTX, &con->state);
1229
1230 prepare_read_tag(con);
1231 break;
1232
1233 case CEPH_MSGR_TAG_WAIT:
1234 /*
1235 * If there is a connection race (we are opening
1236 * connections to each other), one of us may just have
1237 * to WAIT. This shouldn't happen if we are the
1238 * client.
1239 */
1240 pr_err("process_connect peer connecting WAIT\n");
 1241 /* fall through */
1242 default:
1243 pr_err("connect protocol error, will retry\n");
1244 con->error_msg = "protocol error, garbage tag during connect";
1245 return -1;
1246 }
1247 return 0;
1248}
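/*
 * Summary of the reply tags handled above (derived from the cases):
 *
 *   FEATURES / BADPROTOVER / 2nd BADAUTHORIZER -> hard failure
 *   RESETSESSION                               -> reset state, reconnect
 *   RETRY_SESSION / RETRY_GLOBAL               -> bump seq, reconnect
 *   READY                                      -> connection established
 *   WAIT or anything else                      -> error, retry later
 */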
1249
1250
1251/*
1252 * read (part of) an ack
1253 */
1254static int read_partial_ack(struct ceph_connection *con)
1255{
1256 int to = 0;
1257
1258 return read_partial(con, &to, sizeof(con->in_temp_ack),
1259 &con->in_temp_ack);
1260}
1261
1262
1263/*
1264 * We can finally discard anything that's been acked.
1265 */
1266static void process_ack(struct ceph_connection *con)
1267{
1268 struct ceph_msg *m;
1269 u64 ack = le64_to_cpu(con->in_temp_ack);
1270 u64 seq;
1271
1272 while (!list_empty(&con->out_sent)) {
1273 m = list_first_entry(&con->out_sent, struct ceph_msg,
1274 list_head);
1275 seq = le64_to_cpu(m->hdr.seq);
1276 if (seq > ack)
1277 break;
1278 dout("got ack for seq %llu type %d at %p\n", seq,
1279 le16_to_cpu(m->hdr.type), m);
1280 ceph_msg_remove(m);
1281 }
1282 prepare_read_tag(con);
1283}
1284
1285
1286
1287
1288static int read_partial_message_section(struct ceph_connection *con,
1289 struct kvec *section, unsigned int sec_len,
1290 u32 *crc)
1291{
1292 int left;
1293 int ret;
1294
1295 BUG_ON(!section);
1296
1297 while (section->iov_len < sec_len) {
1298 BUG_ON(section->iov_base == NULL);
1299 left = sec_len - section->iov_len;
1300 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1301 section->iov_len, left);
1302 if (ret <= 0)
1303 return ret;
1304 section->iov_len += ret;
1305 if (section->iov_len == sec_len)
1306 *crc = crc32c(0, section->iov_base,
1307 section->iov_len);
1308 }
1309
1310 return 1;
1311}
1312
1313static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1314 struct ceph_msg_header *hdr,
1315 int *skip);
1316/*
1317 * read (part of) a message.
1318 */
1319static int read_partial_message(struct ceph_connection *con)
1320{
1321 struct ceph_msg *m = con->in_msg;
1322 void *p;
1323 int ret;
1324 int to, left;
1325 unsigned front_len, middle_len, data_len, data_off;
1326 int datacrc = con->msgr->nocrc;
1327 int skip;
1328
1329 dout("read_partial_message con %p msg %p\n", con, m);
1330
1331 /* header */
1332 while (con->in_base_pos < sizeof(con->in_hdr)) {
1333 left = sizeof(con->in_hdr) - con->in_base_pos;
1334 ret = ceph_tcp_recvmsg(con->sock,
1335 (char *)&con->in_hdr + con->in_base_pos,
1336 left);
1337 if (ret <= 0)
1338 return ret;
1339 con->in_base_pos += ret;
1340 if (con->in_base_pos == sizeof(con->in_hdr)) {
1341 u32 crc = crc32c(0, (void *)&con->in_hdr,
1342 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1343 if (crc != le32_to_cpu(con->in_hdr.crc)) {
 1344 pr_err("read_partial_message bad hdr crc "
 1345 "%u != expected %u\n",
 1346 crc, le32_to_cpu(con->in_hdr.crc));
1347 return -EBADMSG;
1348 }
1349 }
1350 }
1351 front_len = le32_to_cpu(con->in_hdr.front_len);
1352 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1353 return -EIO;
1354 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1355 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1356 return -EIO;
1357 data_len = le32_to_cpu(con->in_hdr.data_len);
1358 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1359 return -EIO;
1360 data_off = le16_to_cpu(con->in_hdr.data_off);
1361
1362 /* allocate message? */
1363 if (!con->in_msg) {
1364 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1365 con->in_hdr.front_len, con->in_hdr.data_len);
1366 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1367 if (skip) {
1368 /* skip this message */
1369 dout("alloc_msg returned NULL, skipping message\n");
1370 con->in_base_pos = -front_len - middle_len - data_len -
1371 sizeof(m->footer);
1372 con->in_tag = CEPH_MSGR_TAG_READY;
1373 return 0;
1374 }
1375 if (IS_ERR(con->in_msg)) {
1376 ret = PTR_ERR(con->in_msg);
1377 con->in_msg = NULL;
1378 con->error_msg =
1379 "error allocating memory for incoming message";
1380 return ret;
1381 }
1382 m = con->in_msg;
1383 m->front.iov_len = 0; /* haven't read it yet */
1384 if (m->middle)
1385 m->middle->vec.iov_len = 0;
1386
1387 con->in_msg_pos.page = 0;
1388 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1389 con->in_msg_pos.data_pos = 0;
1390 }
1391
1392 /* front */
1393 ret = read_partial_message_section(con, &m->front, front_len,
1394 &con->in_front_crc);
1395 if (ret <= 0)
1396 return ret;
1397
1398 /* middle */
1399 if (m->middle) {
1400 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1401 &con->in_middle_crc);
1402 if (ret <= 0)
1403 return ret;
1404 }
1405
1406 /* (page) data */
1407 while (con->in_msg_pos.data_pos < data_len) {
1408 left = min((int)(data_len - con->in_msg_pos.data_pos),
1409 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1410 BUG_ON(m->pages == NULL);
1411 p = kmap(m->pages[con->in_msg_pos.page]);
1412 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1413 left);
1414 if (ret > 0 && datacrc)
1415 con->in_data_crc =
1416 crc32c(con->in_data_crc,
1417 p + con->in_msg_pos.page_pos, ret);
1418 kunmap(m->pages[con->in_msg_pos.page]);
1419 if (ret <= 0)
1420 return ret;
1421 con->in_msg_pos.data_pos += ret;
1422 con->in_msg_pos.page_pos += ret;
1423 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1424 con->in_msg_pos.page_pos = 0;
1425 con->in_msg_pos.page++;
1426 }
1427 }
1428
1429 /* footer */
1430 to = sizeof(m->hdr) + sizeof(m->footer);
1431 while (con->in_base_pos < to) {
1432 left = to - con->in_base_pos;
1433 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1434 (con->in_base_pos - sizeof(m->hdr)),
1435 left);
1436 if (ret <= 0)
1437 return ret;
1438 con->in_base_pos += ret;
1439 }
 1440 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
 1441 m, front_len, le32_to_cpu(m->footer.front_crc), middle_len,
 1442 le32_to_cpu(m->footer.middle_crc), data_len, le32_to_cpu(m->footer.data_crc));
1443
1444 /* crc ok? */
 1445 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
 1446 pr_err("read_partial_message %p front crc %u != exp. %u\n",
 1447 m, con->in_front_crc, le32_to_cpu(m->footer.front_crc));
 1448 return -EBADMSG;
 1449 }
 1450 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
 1451 pr_err("read_partial_message %p middle crc %u != exp. %u\n",
 1452 m, con->in_middle_crc, le32_to_cpu(m->footer.middle_crc));
 1453 return -EBADMSG;
 1454 }
1455 if (datacrc &&
1456 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1457 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1458 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1459 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1460 return -EBADMSG;
1461 }
1462
1463 return 1; /* done! */
1464}
1465
1466/*
1467 * Process message. This happens in the worker thread. The callback should
1468 * be careful not to do anything that waits on other incoming messages or it
1469 * may deadlock.
1470 */
1471static void process_message(struct ceph_connection *con)
1472{
1473 struct ceph_msg *msg;
1474
1475 msg = con->in_msg;
1476 con->in_msg = NULL;
1477
1478 /* if first message, set peer_name */
1479 if (con->peer_name.type == 0)
1480 con->peer_name = msg->hdr.src.name;
1481
1482 con->in_seq++;
1483 mutex_unlock(&con->mutex);
1484
1485 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1486 msg, le64_to_cpu(msg->hdr.seq),
1487 ENTITY_NAME(msg->hdr.src.name),
1488 le16_to_cpu(msg->hdr.type),
1489 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1490 le32_to_cpu(msg->hdr.front_len),
1491 le32_to_cpu(msg->hdr.data_len),
1492 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1493 con->ops->dispatch(con, msg);
1494
1495 mutex_lock(&con->mutex);
1496 prepare_read_tag(con);
1497}
1498
1499
1500/*
1501 * Write something to the socket. Called in a worker thread when the
1502 * socket appears to be writeable and we have something ready to send.
1503 */
1504static int try_write(struct ceph_connection *con)
1505{
1506 struct ceph_messenger *msgr = con->msgr;
1507 int ret = 1;
1508
1509 dout("try_write start %p state %lu nref %d\n", con, con->state,
1510 atomic_read(&con->nref));
1511
1512 mutex_lock(&con->mutex);
1513more:
1514 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1515
1516 /* open the socket first? */
1517 if (con->sock == NULL) {
1518 /*
1519 * if we were STANDBY and are reconnecting _this_
1520 * connection, bump connect_seq now. Always bump
1521 * global_seq.
1522 */
1523 if (test_and_clear_bit(STANDBY, &con->state))
1524 con->connect_seq++;
1525
1526 prepare_write_banner(msgr, con);
1527 prepare_write_connect(msgr, con, 1);
1528 prepare_read_banner(con);
1529 set_bit(CONNECTING, &con->state);
1530 clear_bit(NEGOTIATING, &con->state);
1531
1532 BUG_ON(con->in_msg);
1533 con->in_tag = CEPH_MSGR_TAG_READY;
1534 dout("try_write initiating connect on %p new state %lu\n",
1535 con, con->state);
1536 con->sock = ceph_tcp_connect(con);
1537 if (IS_ERR(con->sock)) {
1538 con->sock = NULL;
1539 con->error_msg = "connect error";
1540 ret = -1;
1541 goto out;
1542 }
1543 }
1544
1545more_kvec:
1546 /* kvec data queued? */
 1547 if (con->out_skip) {
 1548 ret = write_partial_skip(con);
 1549 if (ret < 0) {
 1550 dout("try_write write_partial_skip err %d\n", ret);
 1551 goto done;
 1552 }
 1553 if (ret == 0)
 1554 goto done;
 1555 }
1556 if (con->out_kvec_left) {
1557 ret = write_partial_kvec(con);
1558 if (ret <= 0)
1559 goto done;
1560 }
1561
1562 /* msg pages? */
1563 if (con->out_msg) {
1564 if (con->out_msg_done) {
1565 ceph_msg_put(con->out_msg);
1566 con->out_msg = NULL; /* we're done with this one */
1567 goto do_next;
1568 }
1569
1570 ret = write_partial_msg_pages(con);
1571 if (ret == 1)
1572 goto more_kvec; /* we need to send the footer, too! */
1573 if (ret == 0)
1574 goto done;
1575 if (ret < 0) {
1576 dout("try_write write_partial_msg_pages err %d\n",
1577 ret);
1578 goto done;
1579 }
1580 }
1581
1582do_next:
1583 if (!test_bit(CONNECTING, &con->state)) {
1584 /* is anything else pending? */
1585 if (!list_empty(&con->out_queue)) {
1586 prepare_write_message(con);
1587 goto more;
1588 }
1589 if (con->in_seq > con->in_seq_acked) {
1590 prepare_write_ack(con);
1591 goto more;
1592 }
1593 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1594 prepare_write_keepalive(con);
1595 goto more;
1596 }
1597 }
1598
1599 /* Nothing to do! */
1600 clear_bit(WRITE_PENDING, &con->state);
1601 dout("try_write nothing else to write.\n");
1602done:
1603 ret = 0;
1604out:
1605 mutex_unlock(&con->mutex);
1606 dout("try_write done on %p\n", con);
1607 return ret;
1608}
1609
1610
1611
1612/*
1613 * Read what we can from the socket.
1614 */
1615static int try_read(struct ceph_connection *con)
1616{
1617 struct ceph_messenger *msgr;
1618 int ret = -1;
1619
1620 if (!con->sock)
1621 return 0;
1622
1623 if (test_bit(STANDBY, &con->state))
1624 return 0;
1625
1626 dout("try_read start on %p\n", con);
1627 msgr = con->msgr;
1628
1629 mutex_lock(&con->mutex);
1630
1631more:
1632 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1633 con->in_base_pos);
1634 if (test_bit(CONNECTING, &con->state)) {
1635 if (!test_bit(NEGOTIATING, &con->state)) {
1636 dout("try_read connecting\n");
1637 ret = read_partial_banner(con);
1638 if (ret <= 0)
1639 goto done;
1640 if (process_banner(con) < 0) {
1641 ret = -1;
1642 goto out;
1643 }
1644 }
1645 ret = read_partial_connect(con);
1646 if (ret <= 0)
1647 goto done;
1648 if (process_connect(con) < 0) {
1649 ret = -1;
1650 goto out;
1651 }
1652 goto more;
1653 }
1654
1655 if (con->in_base_pos < 0) {
1656 /*
1657 * skipping + discarding content.
1658 *
1659 * FIXME: there must be a better way to do this!
1660 */
1661 static char buf[1024];
1662 int skip = min(1024, -con->in_base_pos);
1663 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1664 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1665 if (ret <= 0)
1666 goto done;
1667 con->in_base_pos += ret;
1668 if (con->in_base_pos)
1669 goto more;
1670 }
1671 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1672 /*
1673 * what's next?
1674 */
1675 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1676 if (ret <= 0)
1677 goto done;
1678 dout("try_read got tag %d\n", (int)con->in_tag);
1679 switch (con->in_tag) {
1680 case CEPH_MSGR_TAG_MSG:
1681 prepare_read_message(con);
1682 break;
1683 case CEPH_MSGR_TAG_ACK:
1684 prepare_read_ack(con);
1685 break;
1686 case CEPH_MSGR_TAG_CLOSE:
1687 set_bit(CLOSED, &con->state); /* fixme */
1688 goto done;
1689 default:
1690 goto bad_tag;
1691 }
1692 }
1693 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1694 ret = read_partial_message(con);
1695 if (ret <= 0) {
1696 switch (ret) {
1697 case -EBADMSG:
1698 con->error_msg = "bad crc";
1699 ret = -EIO;
1700 goto out;
1701 case -EIO:
1702 con->error_msg = "io error";
1703 goto out;
1704 default:
1705 goto done;
1706 }
1707 }
1708 if (con->in_tag == CEPH_MSGR_TAG_READY)
1709 goto more;
1710 process_message(con);
1711 goto more;
1712 }
1713 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1714 ret = read_partial_ack(con);
1715 if (ret <= 0)
1716 goto done;
1717 process_ack(con);
1718 goto more;
1719 }
1720
1721done:
1722 ret = 0;
1723out:
1724 mutex_unlock(&con->mutex);
1725 dout("try_read done on %p\n", con);
1726 return ret;
1727
1728bad_tag:
1729 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1730 con->error_msg = "protocol error, garbage tag";
1731 ret = -1;
1732 goto out;
1733}
1734
1735
1736/*
1737 * Atomically queue work on a connection. Bump @con reference to
1738 * avoid races with connection teardown.
1739 *
1740 * There is some trickery going on with QUEUED and BUSY because we
1741 * only want a _single_ thread operating on each connection at any
1742 * point in time, but we want to use all available CPUs.
1743 *
1744 * The worker thread only proceeds if it can atomically set BUSY.  It
1745 * clears QUEUED and does its thing.  When it thinks it's done, it
1746 * clears BUSY, then rechecks QUEUED; if it's set again, it loops
1747 * (tries again to set BUSY).
1748 *
1749 * To queue work, we first set QUEUED, _then_ try to queue the work if
1750 * BUSY isn't set.  If that fails (the work is already queued, or the
1751 * worker is BUSY), we give up but leave QUEUED set so that the worker
1752 * thread will loop again if necessary.
1753 */
1754static void queue_con(struct ceph_connection *con)
1755{
1756 if (test_bit(DEAD, &con->state)) {
1757 dout("queue_con %p ignoring: DEAD\n",
1758 con);
1759 return;
1760 }
1761
1762 if (!con->ops->get(con)) {
1763 dout("queue_con %p ref count 0\n", con);
1764 return;
1765 }
1766
1767 set_bit(QUEUED, &con->state);
1768 if (test_bit(BUSY, &con->state)) {
1769 dout("queue_con %p - already BUSY\n", con);
1770 con->ops->put(con);
1771 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1772 dout("queue_con %p - already queued\n", con);
1773 con->ops->put(con);
1774 } else {
1775 dout("queue_con %p\n", con);
1776 }
1777}
1778
1779/*
1780 * Do some work on a connection. Drop a connection ref when we're done.
1781 */
1782static void con_work(struct work_struct *work)
1783{
1784 struct ceph_connection *con = container_of(work, struct ceph_connection,
1785 work.work);
1786 int backoff = 0;
1787
1788more:
1789 if (test_and_set_bit(BUSY, &con->state) != 0) {
1790 dout("con_work %p BUSY already set\n", con);
1791 goto out;
1792 }
1793 dout("con_work %p start, clearing QUEUED\n", con);
1794 clear_bit(QUEUED, &con->state);
1795
1796 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1797 dout("con_work CLOSED\n");
1798 con_close_socket(con);
1799 goto done;
1800 }
1801 if (test_and_clear_bit(OPENING, &con->state)) {
1802 /* reopen w/ new peer */
1803 dout("con_work OPENING\n");
1804 con_close_socket(con);
1805 }
1806
1807 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1808 try_read(con) < 0 ||
1809 try_write(con) < 0) {
1810 backoff = 1;
1811 ceph_fault(con); /* error/fault path */
1812 }
1813
1814done:
1815 clear_bit(BUSY, &con->state);
1816 dout("con->state=%lu\n", con->state);
1817 if (test_bit(QUEUED, &con->state)) {
1818 if (!backoff || test_bit(OPENING, &con->state)) {
1819 dout("con_work %p QUEUED reset, looping\n", con);
1820 goto more;
1821 }
1822 dout("con_work %p QUEUED reset, but just faulted\n", con);
1823 clear_bit(QUEUED, &con->state);
1824 }
1825 dout("con_work %p done\n", con);
1826
1827out:
1828 con->ops->put(con);
1829}
1830
1831
1832/*
1833 * Generic error/fault handler. A retry mechanism is used with
1834 * exponential backoff.
1835 */
1836static void ceph_fault(struct ceph_connection *con)
1837{
1838 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1839 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1840 dout("fault %p state %lu to peer %s\n",
1841 con, con->state, pr_addr(&con->peer_addr.in_addr));
1842
1843 if (test_bit(LOSSYTX, &con->state)) {
1844 dout("fault on LOSSYTX channel\n");
1845 goto out;
1846 }
1847
1848 mutex_lock(&con->mutex);
1849 if (test_bit(CLOSED, &con->state))
1850 goto out_unlock;
1851
1852 con_close_socket(con);
1853
1854 if (con->in_msg) {
1855 ceph_msg_put(con->in_msg);
1856 con->in_msg = NULL;
1857 }
1858
1859 /* Requeue anything that hasn't been acked */
1860 list_splice_init(&con->out_sent, &con->out_queue);
1861
1862 /* If there are no messages in the queue, place the connection
1863 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1864 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1865 dout("fault setting STANDBY\n");
1866 set_bit(STANDBY, &con->state);
1867 } else {
1868 /* retry after a delay. */
1869 if (con->delay == 0)
1870 con->delay = BASE_DELAY_INTERVAL;
1871 else if (con->delay < MAX_DELAY_INTERVAL)
1872 con->delay *= 2;
1873 dout("fault queueing %p delay %lu\n", con, con->delay);
1874 con->ops->get(con);
1875 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1876 round_jiffies_relative(con->delay)) == 0)
1877 con->ops->put(con);
1878 }
1879
1880out_unlock:
1881 mutex_unlock(&con->mutex);
1882out:
1883 /*
1884 * in case we faulted due to authentication, invalidate our
1885 * current tickets so that we can get new ones.
1886 */
1887 if (con->auth_retry && con->ops->invalidate_authorizer) {
1888 dout("calling invalidate_authorizer()\n");
1889 con->ops->invalidate_authorizer(con);
1890 }
1891
1892 if (con->ops->fault)
1893 con->ops->fault(con);
1894}
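
/*
 * Editorial sketch (not part of this file): the delay progression the
 * fault path above produces, assuming the BASE/MAX intervals from
 * messenger.h.  The delay starts at BASE_DELAY_INTERVAL (HZ/2, i.e.
 * half a second) and doubles on each consecutive fault while still
 * below MAX_DELAY_INTERVAL (5 minutes): 0.5s, 1s, 2s, 4s, ...
 * next_fault_delay() is a hypothetical helper, for illustration only.
 */
static unsigned long next_fault_delay(unsigned long cur)
{
	if (cur == 0)
		return BASE_DELAY_INTERVAL;	/* first fault: 0.5s */
	if (cur < MAX_DELAY_INTERVAL)
		return cur * 2;			/* double until the cap */
	return cur;				/* parked near 5 minutes */
}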
1895
1896
1897
1898/*
1899 * create a new messenger instance
1900 */
1901struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1902{
1903 struct ceph_messenger *msgr;
1904
1905 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1906 if (msgr == NULL)
1907 return ERR_PTR(-ENOMEM);
1908
1909 spin_lock_init(&msgr->global_seq_lock);
1910
1911 /* the zero page is needed if a request is "canceled" while the message
1912 * is being written over the socket */
1913 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1914 if (!msgr->zero_page) {
1915 kfree(msgr);
1916 return ERR_PTR(-ENOMEM);
1917 }
1918 kmap(msgr->zero_page);
1919
1920 if (myaddr)
1921 msgr->inst.addr = *myaddr;
1922
1923	msgr->inst.addr.type = 0;
1924	/* select a random nonce */
1925 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1926 encode_my_addr(msgr);
1927
1928 dout("messenger_create %p\n", msgr);
1929 return msgr;
1930}
1931
1932void ceph_messenger_destroy(struct ceph_messenger *msgr)
1933{
1934 dout("destroy %p\n", msgr);
1935 kunmap(msgr->zero_page);
1936 __free_page(msgr->zero_page);
1937 kfree(msgr);
1938 dout("destroyed messenger %p\n", msgr);
1939}
1940
1941/*
1942 * Queue up an outgoing message on the given connection.
1943 */
1944void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1945{
1946 if (test_bit(CLOSED, &con->state)) {
1947 dout("con_send %p closed, dropping %p\n", con, msg);
1948 ceph_msg_put(msg);
1949 return;
1950 }
1951
1952 /* set src+dst */
1953 msg->hdr.src.name = con->msgr->inst.name;
1954 msg->hdr.src.addr = con->msgr->my_enc_addr;
1955 msg->hdr.orig_src = msg->hdr.src;
1956
1957 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1958
1959 /* queue */
1960 mutex_lock(&con->mutex);
1961 BUG_ON(!list_empty(&msg->list_head));
1962 list_add_tail(&msg->list_head, &con->out_queue);
1963 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
1964 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
1965 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1966 le32_to_cpu(msg->hdr.front_len),
1967 le32_to_cpu(msg->hdr.middle_len),
1968 le32_to_cpu(msg->hdr.data_len));
1969 mutex_unlock(&con->mutex);
1970
1971 /* if there wasn't anything waiting to send before, queue
1972 * new work */
1973 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
1974 queue_con(con);
1975}
1976
1977/*
1978 * Revoke a message that was previously queued for send
1979 */
1980void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
1981{
1982 mutex_lock(&con->mutex);
1983 if (!list_empty(&msg->list_head)) {
1984 dout("con_revoke %p msg %p\n", con, msg);
1985 list_del_init(&msg->list_head);
1986		msg->hdr.seq = 0;
1987		ceph_msg_put(msg);	/* drop ref only after we stop touching msg */
1988 if (con->out_msg == msg) {
1989 ceph_msg_put(con->out_msg);
1990 con->out_msg = NULL;
1991 }
1992 if (con->out_kvec_is_msg) {
1993 con->out_skip = con->out_kvec_bytes;
1994 con->out_kvec_is_msg = false;
1995 }
1996 } else {
1997 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
1998 }
1999 mutex_unlock(&con->mutex);
2000}
2001
2002/*
2003 * Revoke a message that we may be reading data into
2004 */
2005void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2006{
2007 mutex_lock(&con->mutex);
2008 if (con->in_msg && con->in_msg == msg) {
2009 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2010 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2011 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2012
2013 /* skip rest of message */
2014 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2015 con->in_base_pos = con->in_base_pos -
2016 sizeof(struct ceph_msg_header) -
2017 front_len -
2018 middle_len -
2019 data_len -
2020 sizeof(struct ceph_msg_footer);
2021 ceph_msg_put(con->in_msg);
2022 con->in_msg = NULL;
2023 con->in_tag = CEPH_MSGR_TAG_READY;
2024 } else {
2025 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2026 con, con->in_msg, msg);
2027 }
2028 mutex_unlock(&con->mutex);
2029}
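
/*
 * Editorial worked example: the negative in_base_pos computed above
 * feeds the "skipping + discarding" branch in try_read().  If we had
 * read 40 bytes of a message whose full wire size (header + front +
 * middle + data + footer) is 1000 bytes, in_base_pos becomes
 * 40 - 1000 = -960, and try_read() swallows the remaining 960 bytes
 * in chunks of up to 1024 before looking for the next protocol tag.
 */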
2030
2031/*
2032 * Queue a keepalive byte to ensure the tcp connection is alive.
2033 */
2034void ceph_con_keepalive(struct ceph_connection *con)
2035{
2036 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2037 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2038 queue_con(con);
2039}
2040
2041
2042/*
2043 * construct a new message with given type, size
2044 * the new msg has a ref count of 1.
2045 */
2046struct ceph_msg *ceph_msg_new(int type, int front_len,
2047 int page_len, int page_off, struct page **pages)
2048{
2049 struct ceph_msg *m;
2050
2051 m = kmalloc(sizeof(*m), GFP_NOFS);
2052 if (m == NULL)
2053 goto out;
2054 kref_init(&m->kref);
2055 INIT_LIST_HEAD(&m->list_head);
2056
2057 m->hdr.type = cpu_to_le16(type);
2058 m->hdr.front_len = cpu_to_le32(front_len);
2059 m->hdr.middle_len = 0;
2060 m->hdr.data_len = cpu_to_le32(page_len);
2061 m->hdr.data_off = cpu_to_le16(page_off);
2062 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2063 m->footer.front_crc = 0;
2064 m->footer.middle_crc = 0;
2065 m->footer.data_crc = 0;
2066 m->front_max = front_len;
2067 m->front_is_vmalloc = false;
2068 m->more_to_follow = false;
2069 m->pool = NULL;
2070
2071 /* front */
2072 if (front_len) {
2073 if (front_len > PAGE_CACHE_SIZE) {
2074 m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
2075 PAGE_KERNEL);
2076 m->front_is_vmalloc = true;
2077 } else {
2078 m->front.iov_base = kmalloc(front_len, GFP_NOFS);
2079 }
2080 if (m->front.iov_base == NULL) {
2081 pr_err("msg_new can't allocate %d bytes\n",
2082 front_len);
2083 goto out2;
2084 }
2085 } else {
2086 m->front.iov_base = NULL;
2087 }
2088 m->front.iov_len = front_len;
2089
2090 /* middle */
2091 m->middle = NULL;
2092
2093 /* data */
2094 m->nr_pages = calc_pages_for(page_off, page_len);
2095 m->pages = pages;
2096 m->pagelist = NULL;
2097
2098 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
2099 m->nr_pages);
2100 return m;
2101
2102out2:
2103 ceph_msg_put(m);
2104out:
2105 pr_err("msg_new can't create type %d len %d\n", type, front_len);
2106 return ERR_PTR(-ENOMEM);
2107}
2108
2109/*
2110 * Allocate "middle" portion of a message, if it is needed and wasn't
2111 * allocated by alloc_msg. This allows us to read a small fixed-size
2112 * per-type header in the front and then gracefully fail (i.e.,
2113 * propagate the error to the caller based on info in the front) when
2114 * the middle is too large.
2115 */
2116static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2117{
2118 int type = le16_to_cpu(msg->hdr.type);
2119 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2120
2121 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2122 ceph_msg_type_name(type), middle_len);
2123 BUG_ON(!middle_len);
2124 BUG_ON(msg->middle);
2125
2126 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2127 if (!msg->middle)
2128 return -ENOMEM;
2129 return 0;
2130}
2131
2132/*
2133 * Generic message allocator, for incoming messages.
2134 */
2135static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2136 struct ceph_msg_header *hdr,
2137 int *skip)
2138{
2139 int type = le16_to_cpu(hdr->type);
2140 int front_len = le32_to_cpu(hdr->front_len);
2141 int middle_len = le32_to_cpu(hdr->middle_len);
2142 struct ceph_msg *msg = NULL;
2143 int ret;
2144
2145 if (con->ops->alloc_msg) {
2146 mutex_unlock(&con->mutex);
2147 msg = con->ops->alloc_msg(con, hdr, skip);
2148 mutex_lock(&con->mutex);
2149 if (IS_ERR(msg))
2150 return msg;
2151
2152 if (*skip)
2153 return NULL;
2154 }
2155 if (!msg) {
2156 *skip = 0;
2157 msg = ceph_msg_new(type, front_len, 0, 0, NULL);
2158		if (IS_ERR(msg)) {	/* ceph_msg_new returns ERR_PTR, never NULL */
2159			pr_err("unable to allocate msg type %d len %d\n",
2160			       type, front_len);
2161			return msg;
2162		}
2163 }
2164 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2165
2166 if (middle_len) {
2167 ret = ceph_alloc_middle(con, msg);
2168
2169 if (ret < 0) {
2170 ceph_msg_put(msg);
2171			return ERR_PTR(ret);	/* msg was just released */
2172 }
2173 }
2174
2175 return msg;
2176}
2177
2178
2179/*
2180 * Free a generically kmalloc'd message.
2181 */
2182void ceph_msg_kfree(struct ceph_msg *m)
2183{
2184 dout("msg_kfree %p\n", m);
2185 if (m->front_is_vmalloc)
2186 vfree(m->front.iov_base);
2187 else
2188 kfree(m->front.iov_base);
2189 kfree(m);
2190}
2191
2192/*
2193 * Drop a msg ref. Destroy as needed.
2194 */
2195void ceph_msg_last_put(struct kref *kref)
2196{
2197 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2198
2199 dout("ceph_msg_put last one on %p\n", m);
2200 WARN_ON(!list_empty(&m->list_head));
2201
2202 /* drop middle, data, if any */
2203 if (m->middle) {
2204 ceph_buffer_put(m->middle);
2205 m->middle = NULL;
2206 }
2207 m->nr_pages = 0;
2208 m->pages = NULL;
2209
2210 if (m->pagelist) {
2211 ceph_pagelist_release(m->pagelist);
2212 kfree(m->pagelist);
2213 m->pagelist = NULL;
2214 }
2215
2216 if (m->pool)
2217 ceph_msgpool_put(m->pool, m);
2218 else
2219 ceph_msg_kfree(m);
2220}
2221
2222void ceph_msg_dump(struct ceph_msg *msg)
2223{
2224 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2225 msg->front_max, msg->nr_pages);
2226 print_hex_dump(KERN_DEBUG, "header: ",
2227 DUMP_PREFIX_OFFSET, 16, 1,
2228 &msg->hdr, sizeof(msg->hdr), true);
2229 print_hex_dump(KERN_DEBUG, " front: ",
2230 DUMP_PREFIX_OFFSET, 16, 1,
2231 msg->front.iov_base, msg->front.iov_len, true);
2232 if (msg->middle)
2233 print_hex_dump(KERN_DEBUG, "middle: ",
2234 DUMP_PREFIX_OFFSET, 16, 1,
2235 msg->middle->vec.iov_base,
2236 msg->middle->vec.iov_len, true);
2237 print_hex_dump(KERN_DEBUG, "footer: ",
2238 DUMP_PREFIX_OFFSET, 16, 1,
2239 &msg->footer, sizeof(msg->footer), true);
2240}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..a343dae73cdc
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,255 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq;       /* messenger workqueue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43	/* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
56
57struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */
59 struct ceph_entity_addr my_enc_addr;
60 struct page *zero_page; /* used in certain error cases */
61
62 bool nocrc;
63
64 /*
65	 * the global_seq counts connections we (attempt to) initiate
66 * in order to disambiguate certain connect race conditions.
67 */
68 u32 global_seq;
69 spinlock_t global_seq_lock;
70};
71
72/*
73 * a single message. it contains a header (src, dest, message type, etc.),
74 * footer (crc values, mainly), a "front" message body, and possibly a
75 * data payload (stored in some number of pages).
76 */
77struct ceph_msg {
78 struct ceph_msg_header hdr; /* header */
79 struct ceph_msg_footer footer; /* footer */
80 struct kvec front; /* unaligned blobs of message */
81 struct ceph_buffer *middle;
82 struct page **pages; /* data payload. NOT OWNER. */
83 unsigned nr_pages; /* size of page array */
84 struct ceph_pagelist *pagelist; /* instead of pages */
85 struct list_head list_head;
86 struct kref kref;
87 bool front_is_vmalloc;
88 bool more_to_follow;
89 int front_max;
90
91 struct ceph_msgpool *pool;
92};
93
94struct ceph_msg_pos {
95 int page, page_pos; /* which page; offset in page */
96 int data_pos; /* offset in data payload */
97 int did_page_crc; /* true if we've calculated crc for current page */
98};
99
100/* ceph connection fault delay defaults, for exponential backoff */
101#define BASE_DELAY_INTERVAL (HZ/2)
102#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
103
104/*
105 * ceph_connection state bit flags
106 *
107 * QUEUED and BUSY are used together to ensure that only a single
108 * thread is currently opening, reading or writing data to the socket.
109 */
110#define LOSSYTX 0 /* we can close channel or drop messages on errors */
111#define CONNECTING 1
112#define NEGOTIATING 2
113#define KEEPALIVE_PENDING 3
114#define WRITE_PENDING 4 /* we have data ready to send */
115#define QUEUED 5 /* there is work queued on this connection */
116#define BUSY 6 /* work is being done */
117#define STANDBY 8 /* no outgoing messages, socket closed. we keep
118 * the ceph_connection around to maintain shared
119 * state with the peer. */
120#define CLOSED 10 /* we've closed the connection */
121#define SOCK_CLOSED 11 /* socket state changed to closed */
122#define OPENING 13 /* open connection w/ (possibly new) peer */
123#define DEAD 14 /* dead, about to kfree */
124
125/*
126 * A single connection with another host.
127 *
128 * We maintain a queue of outgoing messages, and some session state to
129 * ensure that we can preserve the lossless, ordered delivery of
130 * messages in the case of a TCP disconnect.
131 */
132struct ceph_connection {
133 void *private;
134 atomic_t nref;
135
136 const struct ceph_connection_operations *ops;
137
138 struct ceph_messenger *msgr;
139 struct socket *sock;
140 unsigned long state; /* connection state (see flags above) */
141 const char *error_msg; /* error message, if any */
142
143 struct ceph_entity_addr peer_addr; /* peer address */
144 struct ceph_entity_name peer_name; /* peer name */
145 struct ceph_entity_addr peer_addr_for_me;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection, client */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 u64 out_seq_sent; /* last message sent */
161 bool out_keepalive_pending;
162
163 u64 in_seq, in_seq_acked; /* last message received, acked */
164
165 /* connection negotiation temps */
166 char in_banner[CEPH_BANNER_MAX_LEN];
167 union {
168 struct { /* outgoing connection */
169 struct ceph_msg_connect out_connect;
170 struct ceph_msg_connect_reply in_reply;
171 };
172 struct { /* incoming */
173 struct ceph_msg_connect in_connect;
174 struct ceph_msg_connect_reply out_reply;
175 };
176 };
177 struct ceph_entity_addr actual_peer_addr;
178
179 /* message out temps */
180 struct ceph_msg *out_msg; /* sending message (== tail of
181 out_sent) */
182 bool out_msg_done;
183 struct ceph_msg_pos out_msg_pos;
184
185 struct kvec out_kvec[8], /* sending header/footer data */
186 *out_kvec_cur;
187 int out_kvec_left; /* kvec's left in out_kvec */
188 int out_skip; /* skip this many bytes */
189 int out_kvec_bytes; /* total bytes left */
190 bool out_kvec_is_msg; /* kvec refers to out_msg */
191 int out_more; /* there is more data after the kvecs */
192 __le64 out_temp_ack; /* for writing an ack */
193
194 /* message in temps */
195 struct ceph_msg_header in_hdr;
196 struct ceph_msg *in_msg;
197 struct ceph_msg_pos in_msg_pos;
198 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
199
200 char in_tag; /* protocol control byte */
201 int in_base_pos; /* bytes read */
202 __le64 in_temp_ack; /* for reading an ack */
203
204 struct delayed_work work; /* send|recv work */
205 unsigned long delay; /* current delay interval */
206};
207
208
209extern const char *pr_addr(const struct sockaddr_storage *ss);
210extern int ceph_parse_ips(const char *c, const char *end,
211 struct ceph_entity_addr *addr,
212 int max_count, int *count);
213
214
215extern int ceph_msgr_init(void);
216extern void ceph_msgr_exit(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
227extern void ceph_con_close(struct ceph_connection *con);
228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke_message(struct ceph_connection *con,
231 struct ceph_msg *msg);
232extern void ceph_con_keepalive(struct ceph_connection *con);
233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
234extern void ceph_con_put(struct ceph_connection *con);
235
236extern struct ceph_msg *ceph_msg_new(int type, int front_len,
237 int page_len, int page_off,
238 struct page **pages);
239extern void ceph_msg_kfree(struct ceph_msg *m);
240
241
242static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
243{
244 kref_get(&msg->kref);
245 return msg;
246}
247extern void ceph_msg_last_put(struct kref *kref);
248static inline void ceph_msg_put(struct ceph_msg *msg)
249{
250 kref_put(&msg->kref, ceph_msg_last_put);
251}
252
253extern void ceph_msg_dump(struct ceph_msg *msg);
254
255#endif
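
/*
 * Editorial usage sketch (illustrative, not part of this header): how
 * a client of this API wires the pieces together.  my_addr, peer,
 * my_ops, type and front_len are assumed to be supplied by the
 * caller; locking and error unwinding are abbreviated.
 */
static int example_messenger_send(struct ceph_entity_addr *my_addr,
				  struct ceph_entity_addr *peer,
				  const struct ceph_connection_operations *my_ops,
				  int type, int front_len)
{
	struct ceph_messenger *msgr;
	struct ceph_connection *con;
	struct ceph_msg *msg;

	msgr = ceph_messenger_create(my_addr);	/* my name+address */
	if (IS_ERR(msgr))
		return PTR_ERR(msgr);

	con = kzalloc(sizeof(*con), GFP_KERNEL);
	if (!con)
		return -ENOMEM;
	ceph_con_init(msgr, con);		/* init first, then fill in
						   ops, as mon_client.c does */
	con->ops = my_ops;
	ceph_con_open(con, peer);		/* connects lazily */

	msg = ceph_msg_new(type, front_len, 0, 0, NULL);  /* ref count 1 */
	if (IS_ERR(msg))
		return PTR_ERR(msg);
	ceph_con_send(con, msg);		/* hands our ref to out_queue */
	return 0;
}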
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..8fdc011ca956
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,835 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
18 * A small cluster of Ceph "monitors" is responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
31static const struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
60
61 if (num_mon >= CEPH_MAX_MON)
62 goto bad;
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth);
109}
110
111/*
112 * Close monitor session, if any.
113 */
114static void __close_session(struct ceph_mon_client *monc)
115{
116 if (monc->con) {
117 dout("__close_session closing mon%d\n", monc->cur_mon);
118 ceph_con_revoke(monc->con, monc->m_auth);
119 ceph_con_close(monc->con);
120 monc->cur_mon = -1;
121 monc->pending_auth = 0;
122 ceph_auth_reset(monc->auth);
123 }
124}
125
126/*
127 * Open a session with a (new) monitor.
128 */
129static int __open_session(struct ceph_mon_client *monc)
130{
131	u8 r;			/* unsigned: keep cur_mon non-negative */
132 int ret;
133
134 if (monc->cur_mon < 0) {
135 get_random_bytes(&r, 1);
136 monc->cur_mon = r % monc->monmap->num_mon;
137 dout("open_session num=%d r=%d -> mon%d\n",
138 monc->monmap->num_mon, r, monc->cur_mon);
139 monc->sub_sent = 0;
140 monc->sub_renew_after = jiffies; /* i.e., expired */
141 monc->want_next_osdmap = !!monc->want_next_osdmap;
142
143 dout("open_session mon%d opening\n", monc->cur_mon);
144 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
145 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
146 ceph_con_open(monc->con,
147 &monc->monmap->mon_inst[monc->cur_mon].addr);
148
149		/* initiate authentication handshake */
150 ret = ceph_auth_build_hello(monc->auth,
151 monc->m_auth->front.iov_base,
152 monc->m_auth->front_max);
153 __send_prepared_auth_request(monc, ret);
154 } else {
155 dout("open_session mon%d already open\n", monc->cur_mon);
156 }
157 return 0;
158}
159
160static bool __sub_expired(struct ceph_mon_client *monc)
161{
162 return time_after_eq(jiffies, monc->sub_renew_after);
163}
164
165/*
166 * Reschedule delayed work timer.
167 */
168static void __schedule_delayed(struct ceph_mon_client *monc)
169{
170 unsigned delay;
171
172 if (monc->cur_mon < 0 || __sub_expired(monc))
173 delay = 10 * HZ;
174 else
175 delay = 20 * HZ;
176 dout("__schedule_delayed after %u\n", delay);
177 schedule_delayed_work(&monc->delayed_work, delay);
178}
179
180/*
181 * Send subscribe request for mdsmap and/or osdmap.
182 */
183static void __send_subscribe(struct ceph_mon_client *monc)
184{
185 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
186 (unsigned)monc->sub_sent, __sub_expired(monc),
187 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg;
191 struct ceph_mon_subscribe_item *i;
192 void *p, *end;
193
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195		if (IS_ERR(msg))	/* ceph_msg_new returns ERR_PTR on failure */
196			return;
197
198 p = msg->front.iov_base;
199 end = p + msg->front.iov_len;
200
201 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap);
203 if (monc->want_next_osdmap) {
204 dout("__send_subscribe to 'osdmap' %u\n",
205 (unsigned)monc->have_osdmap);
206 ceph_encode_32(&p, 3);
207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap);
210 i->onetime = 1;
211 p += sizeof(*i);
212 monc->want_next_osdmap = 2; /* requested */
213 } else {
214 ceph_encode_32(&p, 2);
215 }
216 ceph_encode_string(&p, end, "mdsmap", 6);
217 i = p;
218 i->have = cpu_to_le64(monc->have_mdsmap);
219 i->onetime = 0;
220 p += sizeof(*i);
221 ceph_encode_string(&p, end, "monmap", 6);
222 i = p;
223 i->have = 0;
224 i->onetime = 0;
225 p += sizeof(*i);
226
227 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg);
230
231 monc->sub_sent = jiffies | 1; /* never 0 */
232 }
233}
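
/*
 * Editorial sketch of the subscribe payload built above (layout
 * inferred from the encode calls; the actual struct definition lives
 * in the shared ceph_fs headers):
 *
 *   __le32 num_entries;                      2, or 3 when "osdmap" is added
 *   num_entries times:
 *     __le32 len; char name[len];            "osdmap" / "mdsmap" / "monmap"
 *     struct ceph_mon_subscribe_item {
 *         __le64 have;                       version we already have
 *         __u8   onetime;                    1 = one-shot subscription
 *     };
 */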
234
235static void handle_subscribe_ack(struct ceph_mon_client *monc,
236 struct ceph_msg *msg)
237{
238 unsigned seconds;
239 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
240
241 if (msg->front.iov_len < sizeof(*h))
242 goto bad;
243 seconds = le32_to_cpu(h->duration);
244
245 mutex_lock(&monc->mutex);
246 if (monc->hunting) {
247 pr_info("mon%d %s session established\n",
248 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
249 monc->hunting = false;
250 }
251 dout("handle_subscribe_ack after %d seconds\n", seconds);
252 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
253 monc->sub_sent = 0;
254 mutex_unlock(&monc->mutex);
255 return;
256bad:
257 pr_err("got corrupt subscribe-ack msg\n");
258 ceph_msg_dump(msg);
259}
260
261/*
262 * Keep track of which maps we have
263 */
264int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
265{
266 mutex_lock(&monc->mutex);
267 monc->have_mdsmap = got;
268 mutex_unlock(&monc->mutex);
269 return 0;
270}
271
272int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
273{
274 mutex_lock(&monc->mutex);
275 monc->have_osdmap = got;
276 monc->want_next_osdmap = 0;
277 mutex_unlock(&monc->mutex);
278 return 0;
279}
280
281/*
282 * Register interest in the next osdmap
283 */
284void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
285{
286 dout("request_next_osdmap have %u\n", monc->have_osdmap);
287 mutex_lock(&monc->mutex);
288 if (!monc->want_next_osdmap)
289 monc->want_next_osdmap = 1;
290 if (monc->want_next_osdmap < 2)
291 __send_subscribe(monc);
292 mutex_unlock(&monc->mutex);
293}
294
295/*
296 * Open a session with the monitor cluster.
297 */
298int ceph_monc_open_session(struct ceph_mon_client *monc)
299{
300 if (!monc->con) {
301 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
302 if (!monc->con)
303 return -ENOMEM;
304 ceph_con_init(monc->client->msgr, monc->con);
305 monc->con->private = monc;
306 monc->con->ops = &mon_con_ops;
307 }
308
309 mutex_lock(&monc->mutex);
310 __open_session(monc);
311 __schedule_delayed(monc);
312 mutex_unlock(&monc->mutex);
313 return 0;
314}
315
316/*
317 * The monitor responds with a mount ack indicating mount success.  The
318 * included client ticket allows the client to talk to MDSs and OSDs.
319 */
320static void ceph_monc_handle_map(struct ceph_mon_client *monc,
321 struct ceph_msg *msg)
322{
323 struct ceph_client *client = monc->client;
324 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
325 void *p, *end;
326
327 mutex_lock(&monc->mutex);
328
329 dout("handle_monmap\n");
330 p = msg->front.iov_base;
331 end = p + msg->front.iov_len;
332
333 monmap = ceph_monmap_decode(p, end);
334 if (IS_ERR(monmap)) {
335 pr_err("problem decoding monmap, %d\n",
336 (int)PTR_ERR(monmap));
337 goto out;
338 }
339
340 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
341 kfree(monmap);
342 goto out;
343 }
344
345 client->monc.monmap = monmap;
346 kfree(old);
347
348out:
349 mutex_unlock(&monc->mutex);
350 wake_up(&client->auth_wq);
351}
352
353/*
354 * statfs
355 */
356static struct ceph_mon_statfs_request *__lookup_statfs(
357 struct ceph_mon_client *monc, u64 tid)
358{
359 struct ceph_mon_statfs_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node;
361
362 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node);
364 if (tid < req->tid)
365 n = n->rb_left;
366 else if (tid > req->tid)
367 n = n->rb_right;
368 else
369 return req;
370 }
371 return NULL;
372}
373
374static void __insert_statfs(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new)
376{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node;
378 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL;
380
381 while (*p) {
382 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node);
384 if (new->tid < req->tid)
385 p = &(*p)->rb_left;
386 else if (new->tid > req->tid)
387 p = &(*p)->rb_right;
388 else
389 BUG();
390 }
391
392 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree);
394}
395
396static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg)
398{
399 struct ceph_mon_statfs_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid;
402
403 if (msg->front.iov_len != sizeof(*reply))
404 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407
408 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid);
410 if (req) {
411 *req->buf = reply->st;
412 req->result = 0;
413 }
414 mutex_unlock(&monc->mutex);
415 if (req)
416 complete(&req->completion);
417 return;
418
419bad:
420	pr_err("corrupt statfs reply, bad length\n");
421 ceph_msg_dump(msg);
422}
423
424/*
425 * (re)send a statfs request
426 */
427static int send_statfs(struct ceph_mon_client *monc,
428 struct ceph_mon_statfs_request *req)
429{
430 struct ceph_msg *msg;
431 struct ceph_mon_statfs *h;
432
433 dout("send_statfs tid %llu\n", req->tid);
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
435 if (IS_ERR(msg))
436 return PTR_ERR(msg);
437 req->request = msg;
438 msg->hdr.tid = cpu_to_le64(req->tid);
439 h = msg->front.iov_base;
440 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463
464 /* register request */
465 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid;
467 req.last_attempt = jiffies;
468 req.delay = BASE_DELAY_INTERVAL;
469 __insert_statfs(monc, &req);
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex);
472
473 /* send request and wait */
474 err = send_statfs(monc, &req);
475 if (!err)
476 err = wait_for_completion_interruptible(&req.completion);
477
478 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree);
480 monc->num_statfs_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex);
483
484 if (!err)
485 err = req.result;
486 return err;
487}
488
489/*
490 * Resend pending statfs requests.
491 */
492static void __resend_statfs(struct ceph_mon_client *monc)
493{
494 struct ceph_mon_statfs_request *req;
495 struct rb_node *p;
496
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node);
499 send_statfs(monc, req);
500 }
501}
502
503/*
504 * Delayed work. If we haven't mounted yet, retry. Otherwise,
505 * renew/retry subscription as needed (in case it is timing out, or we
506 * got an ENOMEM). And keep the monitor connection alive.
507 */
508static void delayed_work(struct work_struct *work)
509{
510 struct ceph_mon_client *monc =
511 container_of(work, struct ceph_mon_client, delayed_work.work);
512
513 dout("monc delayed_work\n");
514 mutex_lock(&monc->mutex);
515 if (monc->hunting) {
516 __close_session(monc);
517 __open_session(monc); /* continue hunting */
518 } else {
519 ceph_con_keepalive(monc->con);
520
521 __validate_auth(monc);
522
523 if (monc->auth->ops->is_authenticated(monc->auth))
524 __send_subscribe(monc);
525 }
526 __schedule_delayed(monc);
527 mutex_unlock(&monc->mutex);
528}
529
530/*
531 * On startup, we build a temporary monmap populated with the IPs
532 * provided by mount(2).
533 */
534static int build_initial_monmap(struct ceph_mon_client *monc)
535{
536 struct ceph_mount_args *args = monc->client->mount_args;
537 struct ceph_entity_addr *mon_addr = args->mon_addr;
538 int num_mon = args->num_mon;
539 int i;
540
541 /* build initial monmap */
542 monc->monmap = kzalloc(sizeof(*monc->monmap) +
543 num_mon*sizeof(monc->monmap->mon_inst[0]),
544 GFP_KERNEL);
545 if (!monc->monmap)
546 return -ENOMEM;
547 for (i = 0; i < num_mon; i++) {
548 monc->monmap->mon_inst[i].addr = mon_addr[i];
549 monc->monmap->mon_inst[i].addr.nonce = 0;
550 monc->monmap->mon_inst[i].name.type =
551 CEPH_ENTITY_TYPE_MON;
552 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
553 }
554 monc->monmap->num_mon = num_mon;
555 monc->have_fsid = false;
556
557 /* release addr memory */
558 kfree(args->mon_addr);
559 args->mon_addr = NULL;
560 args->num_mon = 0;
561 return 0;
562}
563
564int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
565{
566 int err = 0;
567
568 dout("init\n");
569 memset(monc, 0, sizeof(*monc));
570 monc->client = cl;
571 monc->monmap = NULL;
572 mutex_init(&monc->mutex);
573
574 err = build_initial_monmap(monc);
575 if (err)
576 goto out;
577
578 monc->con = NULL;
579
580 /* authentication */
581 monc->auth = ceph_auth_init(cl->mount_args->name,
582 cl->mount_args->secret);
583	if (IS_ERR(monc->auth)) {
584		err = PTR_ERR(monc->auth);
585		goto out_monmap;	/* don't leak the initial monmap */
586	}
585 monc->auth->want_keys =
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588
589 /* msg pools */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
591 sizeof(struct ceph_mon_subscribe_ack), 1, false);
592 if (err < 0)
593 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
595 sizeof(struct ceph_mon_statfs_reply), 0, false);
596 if (err < 0)
597 goto out_pool1;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
599 if (err < 0)
600 goto out_pool2;
601
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
603 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) {
605 err = PTR_ERR(monc->m_auth);
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609
610 monc->cur_mon = -1;
611 monc->hunting = true;
612 monc->sub_renew_after = jiffies;
613 monc->sub_sent = 0;
614
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0;
618 monc->last_tid = 0;
619
620 monc->have_mdsmap = 0;
621 monc->have_osdmap = 0;
622 monc->want_next_osdmap = 1;
623 return 0;
624
625out_pool3:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
627out_pool2:
628 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
629out_pool1:
630 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
631out_monmap:
632 kfree(monc->monmap);
633out:
634 return err;
635}
636
637void ceph_monc_stop(struct ceph_mon_client *monc)
638{
639 dout("stop\n");
640 cancel_delayed_work_sync(&monc->delayed_work);
641
642 mutex_lock(&monc->mutex);
643 __close_session(monc);
644 if (monc->con) {
645 monc->con->private = NULL;
646 monc->con->ops->put(monc->con);
647 monc->con = NULL;
648 }
649 mutex_unlock(&monc->mutex);
650
651 ceph_auth_destroy(monc->auth);
652
653 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
657
658 kfree(monc->monmap);
659}
660
661static void handle_auth_reply(struct ceph_mon_client *monc,
662 struct ceph_msg *msg)
663{
664 int ret;
665
666 mutex_lock(&monc->mutex);
667 monc->pending_auth = 0;
668 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
669 msg->front.iov_len,
670 monc->m_auth->front.iov_base,
671 monc->m_auth->front_max);
672 if (ret < 0) {
673 monc->client->auth_err = ret;
674 wake_up(&monc->client->auth_wq);
675 } else if (ret > 0) {
676 __send_prepared_auth_request(monc, ret);
677 } else if (monc->auth->ops->is_authenticated(monc->auth)) {
678 dout("authenticated, starting session\n");
679
680 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
681 monc->client->msgr->inst.name.num = monc->auth->global_id;
682
683 __send_subscribe(monc);
684 __resend_statfs(monc);
685 }
686 mutex_unlock(&monc->mutex);
687}
688
689static int __validate_auth(struct ceph_mon_client *monc)
690{
691 int ret;
692
693 if (monc->pending_auth)
694 return 0;
695
696 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
697 monc->m_auth->front_max);
698 if (ret <= 0)
699 return ret; /* either an error, or no need to authenticate */
700 __send_prepared_auth_request(monc, ret);
701 return 0;
702}
703
704int ceph_monc_validate_auth(struct ceph_mon_client *monc)
705{
706 int ret;
707
708 mutex_lock(&monc->mutex);
709 ret = __validate_auth(monc);
710 mutex_unlock(&monc->mutex);
711 return ret;
712}
713
714/*
715 * handle incoming message
716 */
717static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
718{
719 struct ceph_mon_client *monc = con->private;
720 int type = le16_to_cpu(msg->hdr.type);
721
722 if (!monc)
723 return;
724
725 switch (type) {
726 case CEPH_MSG_AUTH_REPLY:
727 handle_auth_reply(monc, msg);
728 break;
729
730 case CEPH_MSG_MON_SUBSCRIBE_ACK:
731 handle_subscribe_ack(monc, msg);
732 break;
733
734 case CEPH_MSG_STATFS_REPLY:
735 handle_statfs_reply(monc, msg);
736 break;
737
738 case CEPH_MSG_MON_MAP:
739 ceph_monc_handle_map(monc, msg);
740 break;
741
742 case CEPH_MSG_MDS_MAP:
743 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
744 break;
745
746 case CEPH_MSG_OSD_MAP:
747 ceph_osdc_handle_map(&monc->client->osdc, msg);
748 break;
749
750 default:
751 pr_err("received unknown message type %d %s\n", type,
752 ceph_msg_type_name(type));
753 }
754 ceph_msg_put(msg);
755}
756
757/*
758 * Allocate memory for incoming message
759 */
760static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
761 struct ceph_msg_header *hdr,
762 int *skip)
763{
764 struct ceph_mon_client *monc = con->private;
765 int type = le16_to_cpu(hdr->type);
766 int front_len = le32_to_cpu(hdr->front_len);
767 struct ceph_msg *m = NULL;
768
769 *skip = 0;
770
771 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
774 break;
775 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
777 break;
778 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
780 break;
781 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL);
785 break;
786 }
787
788 if (!m) {
789 pr_info("alloc_msg unknown type %d\n", type);
790 *skip = 1;
791 }
792 return m;
793}
794
795/*
796 * If the monitor connection resets, pick a new monitor and resubmit
797 * any pending requests.
798 */
799static void mon_fault(struct ceph_connection *con)
800{
801 struct ceph_mon_client *monc = con->private;
802
803 if (!monc)
804 return;
805
806 dout("mon_fault\n");
807 mutex_lock(&monc->mutex);
808 if (!con->private)
809 goto out;
810
811 if (monc->con && !monc->hunting)
812 pr_info("mon%d %s session lost, "
813 "hunting for new mon\n", monc->cur_mon,
814 pr_addr(&monc->con->peer_addr.in_addr));
815
816 __close_session(monc);
817 if (!monc->hunting) {
818 /* start hunting */
819 monc->hunting = true;
820 __open_session(monc);
821 } else {
822 /* already hunting, let's wait a bit */
823 __schedule_delayed(monc);
824 }
825out:
826 mutex_unlock(&monc->mutex);
827}
828
829static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get,
831 .put = ceph_con_put,
832 .dispatch = dispatch,
833 .fault = mon_fault,
834 .alloc_msg = mon_alloc_msg,
835};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..b958ad5afa06
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/rbtree.h>
6
7#include "messenger.h"
8#include "msgpool.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_statfs_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * statfs() is done a bit differently because we need to get data back
44 * to the caller
45 */
46struct ceph_mon_statfs_request {
47 u64 tid;
48 struct rb_node node;
49 int result;
50 struct ceph_statfs *buf;
51 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */
54};
55
56struct ceph_mon_client {
57 struct ceph_client *client;
58 struct ceph_monmap *monmap;
59
60 struct mutex mutex;
61 struct delayed_work delayed_work;
62
63 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth;
65 int pending_auth;
66
67 bool hunting;
68 int cur_mon; /* last monitor i contacted */
69 unsigned long sub_sent, sub_renew_after;
70 struct ceph_connection *con;
71 bool have_fsid;
72
73 /* msg pools */
74 struct ceph_msgpool msgpool_subscribe_ack;
75 struct ceph_msgpool msgpool_statfs_reply;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid;
82
83 /* mds/osd map */
84 int want_next_osdmap; /* 1 = want, 2 = want+asked */
85 u32 have_osdmap, have_mdsmap;
86
87#ifdef CONFIG_DEBUG_FS
88 struct dentry *debugfs_file;
89#endif
90};
91
92extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
93extern int ceph_monmap_contains(struct ceph_monmap *m,
94 struct ceph_entity_addr *addr);
95
96extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
97extern void ceph_monc_stop(struct ceph_mon_client *monc);
98
99/*
100 * The model here is to indicate that we need a new map of at least
101 * epoch @want, and also call in when we receive a map. We will
102 * periodically rerequest the map from the monitor cluster until we
103 * get what we want.
104 */
105extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
106extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
107
108extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
109
110extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
111 struct ceph_statfs *buf);
112
113extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116
117
118
119#endif
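
/*
 * Editorial usage sketch (illustrative, not part of this header): the
 * intended call sequence for a synchronous statfs against the monitor
 * cluster.  client is assumed to be a ceph_client whose mount args
 * carry the monitor addresses; teardown is omitted.
 */
static int example_statfs(struct ceph_client *client,
			  struct ceph_statfs *st)
{
	int err;

	err = ceph_monc_init(&client->monc, client);	/* builds monmap */
	if (err)
		return err;
	err = ceph_monc_open_session(&client->monc);	/* pick mon, auth */
	if (err)
		return err;
	return ceph_monc_do_statfs(&client->monc, st);	/* blocks for reply */
}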
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..ca3b44a89f2d
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10/*
11 * We use msg pools to preallocate memory for messages we expect to
12 * receive over the wire, to avoid getting ourselves into OOM
13 * conditions at unexpected times.  We use a few different
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31
32
33/*
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{
38 struct ceph_msg *msg;
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61}
62
63int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking)
65{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs);
72 pool->num = 0;
73 pool->min = min;
74 pool->blocking = blocking;
75 init_waitqueue_head(&pool->wait);
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81}
82
83void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{
85 dout("msgpool_destroy %p\n", pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90}
91
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
93{
94 int ret;
95
96 spin_lock(&pool->lock);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1);
113
114 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 }
129
130 while (1) {
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163}
164
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{
167 spin_lock(&pool->lock);
168 if (pool->num < pool->min) {
169 /* reset msg front_len; user may have changed it */
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172
173 kref_set(&msg->kref, 1); /* retake a single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include "messenger.h"
5
6/*
7 * we use memory pools for preallocating messages we may receive, to
8 * avoid unexpected OOM conditions.
9 */
10struct ceph_msgpool {
11 spinlock_t lock;
12 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17};
18
19extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20			     int front_len, int min, bool blocking);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
26
27#endif
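
/*
 * Editorial usage sketch (illustrative, not part of this header): a
 * typical pool lifecycle.  A blocking pool's ceph_msgpool_get() never
 * fails (it sleeps until a message is free); a non-blocking pool may
 * return an ERR_PTR when empty and a fresh allocation also fails.
 */
static void example_msgpool(void)
{
	struct ceph_msgpool pool;
	struct ceph_msg *msg;

	ceph_msgpool_init(&pool, 512, 4, true);	/* 4 msgs, 512-byte fronts */
	msg = ceph_msgpool_get(&pool, 0);	/* 0: use pool->front_len */
	/* ... use msg; a pool-owned message's final ceph_msg_put()
	 * funnels it back through ceph_msgpool_put() ... */
	ceph_msg_put(msg);
	ceph_msgpool_destroy(&pool);
}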
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..8aaab414f3f8
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
 20 * tcp connection banner. include a protocol version, and adjust it
 21 * whenever the wire protocol changes. try to keep this string length
 22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
 30 * Comparator returns a negative, zero, or positive value.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
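/*
 * Worked example (annotation, not part of the patch): the signed
 * interpretation keeps ordering correct across 32-bit wraparound for
 * values within 2^31 of each other, e.g.
 *
 *	ceph_seq_cmp(1, 0xffffffff) > 0	  (1 is "after" the wrap)
 *	ceph_seq_cmp(5, 7) < 0		  (ordinary ordering)
 */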
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20
55
56#define CEPH_ENTITY_TYPE_ANY 0xFF
57
58extern const char *ceph_entity_type_name(int type);
59
60/*
61 * entity_addr -- network address
62 */
63struct ceph_entity_addr {
64 __le32 type;
65 __le32 nonce; /* unique id for process (e.g. pid) */
66 struct sockaddr_storage in_addr;
67} __attribute__ ((packed));
68
69struct ceph_entity_inst {
70 struct ceph_entity_name name;
71 struct ceph_entity_addr addr;
72} __attribute__ ((packed));
73
74
75/* used by message exchange protocol */
76#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
77#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
78#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
79 incoming connection */
80#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
81 with higher cseq */
82#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
83 with higher gseq */
84#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
85#define CEPH_MSGR_TAG_MSG 7 /* message */
86#define CEPH_MSGR_TAG_ACK 8 /* message ack */
87#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
88#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
89#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
90#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
91
92
93/*
94 * connection negotiation
95 */
96struct ceph_msg_connect {
97 __le64 features; /* supported feature bits */
98 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
99 __le32 global_seq; /* count connections initiated by this host */
100 __le32 connect_seq; /* count connections initiated in this session */
101 __le32 protocol_version;
102 __le32 authorizer_protocol;
103 __le32 authorizer_len;
104 __u8 flags; /* CEPH_MSG_CONNECT_* */
105} __attribute__ ((packed));
106
107struct ceph_msg_connect_reply {
108 __u8 tag;
109 __le64 features; /* feature bits for this session */
110 __le32 global_seq;
111 __le32 connect_seq;
112 __le32 protocol_version;
113 __le32 authorizer_len;
114 __u8 flags;
115} __attribute__ ((packed));
116
117#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
118
119
120/*
121 * message header
122 */
123struct ceph_msg_header {
124 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */
126 __le16 type; /* message type */
127 __le16 priority; /* priority. higher value == higher priority */
128 __le16 version; /* version of message encoding */
129
130 __le32 front_len; /* bytes in main payload */
131 __le32 middle_len;/* bytes in middle payload */
132 __le32 data_len; /* bytes of data payload */
133 __le16 data_off; /* sender: include full offset;
134 receiver: mask against ~PAGE_MASK */
135
136 struct ceph_entity_inst src, orig_src;
137 __le32 reserved;
138 __le32 crc; /* header crc32c */
139} __attribute__ ((packed));
140
141#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196
144#define CEPH_MSG_PRIO_HIGHEST 255
145
146/*
147 * follows data payload
148 */
149struct ceph_msg_footer {
150 __le32 front_crc, middle_crc, data_crc;
151 __u8 flags;
152} __attribute__ ((packed));
153
154#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
155#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
156
157
158#endif
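Given the header layout above, a receiver can validate integrity before trusting the length fields. A hedged sketch (assuming, per the crc field's comment, that the crc32c covers the header bytes preceding the crc field itself; hdr_crc_ok is hypothetical):

	#include <linux/crc32c.h>

	static bool hdr_crc_ok(const struct ceph_msg_header *hdr)
	{
		/* checksum everything up to, but not including, hdr->crc */
		u32 crc = crc32c(0, hdr, offsetof(struct ceph_msg_header, crc));

		return crc == le32_to_cpu(hdr->crc);
	}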
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..c7b4dedaace6
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1550 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
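/*
 * Worked example (annotation, not part of the patch): with 4 KB pages,
 * an extent of 0x2000 bytes at file offset 0x1800 touches pages 1-3 of
 * the mapping, so calc_pages_for(0x1800, 0x2000) == 3.
 */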
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
168 if (IS_ERR(msg)) {
169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg));
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
182 if (IS_ERR(msg)) {
183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg));
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
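/*
 * Annotation (not part of the patch): __lookup_request_ge() enables a
 * tid-ordered walk that can drop request_mutex between steps, since it
 * re-finds its place by tid rather than holding a tree iterator:
 *
 *	u64 next_tid = 0;
 *	while ((req = __lookup_request_ge(osdc, next_tid)) != NULL) {
 *		next_tid = req->r_tid + 1;
 *		... examine req ...
 *	}
 *
 * ceph_osdc_sync() below uses exactly this pattern.
 */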
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref))
365 kfree(osd);
366}
367
368/*
369 * remove an osd from our map
370 */
371static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
372{
373 dout("__remove_osd %p\n", osd);
374 BUG_ON(!list_empty(&osd->o_requests));
375 rb_erase(&osd->o_node, &osdc->osds);
376 list_del_init(&osd->o_osd_lru);
377 ceph_con_close(&osd->o_con);
378 put_osd(osd);
379}
380
381static void __move_osd_to_lru(struct ceph_osd_client *osdc,
382 struct ceph_osd *osd)
383{
384 dout("__move_osd_to_lru %p\n", osd);
385 BUG_ON(!list_empty(&osd->o_osd_lru));
386 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
387 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
388}
389
390static void __remove_osd_from_lru(struct ceph_osd *osd)
391{
392 dout("__remove_osd_from_lru %p\n", osd);
393 if (!list_empty(&osd->o_osd_lru))
394 list_del_init(&osd->o_osd_lru);
395}
396
397static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
398{
399 struct ceph_osd *osd, *nosd;
400
401	dout("remove_old_osds %p\n", osdc);
402 mutex_lock(&osdc->request_mutex);
403 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
404 if (!remove_all && time_before(jiffies, osd->lru_ttl))
405 break;
406 __remove_osd(osdc, osd);
407 }
408 mutex_unlock(&osdc->request_mutex);
409}
410
411/*
412 * reset osd connect
413 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{
416 struct ceph_osd_request *req;
417 int ret = 0;
418
419 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
420 if (list_empty(&osd->o_requests)) {
421 __remove_osd(osdc, osd);
422 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
423 &osd->o_con.peer_addr,
424 sizeof(osd->o_con.peer_addr)) == 0 &&
425 !ceph_con_opened(&osd->o_con)) {
426 dout(" osd addr hasn't changed and connection never opened,"
427		     " letting msgr retry\n");
428		/* touch each r_stamp for handle_timeout()'s benefit */
429 list_for_each_entry(req, &osd->o_requests, r_osd_item)
430 req->r_stamp = jiffies;
431 ret = -EAGAIN;
432 } else {
433 ceph_con_close(&osd->o_con);
434 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
435 osd->o_incarnation++;
436 }
437 return ret;
438}
439
440static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
441{
442 struct rb_node **p = &osdc->osds.rb_node;
443 struct rb_node *parent = NULL;
444 struct ceph_osd *osd = NULL;
445
446 while (*p) {
447 parent = *p;
448 osd = rb_entry(parent, struct ceph_osd, o_node);
449 if (new->o_osd < osd->o_osd)
450 p = &(*p)->rb_left;
451 else if (new->o_osd > osd->o_osd)
452 p = &(*p)->rb_right;
453 else
454 BUG();
455 }
456
457 rb_link_node(&new->o_node, parent, p);
458 rb_insert_color(&new->o_node, &osdc->osds);
459}
460
461static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
462{
463 struct ceph_osd *osd;
464 struct rb_node *n = osdc->osds.rb_node;
465
466 while (n) {
467 osd = rb_entry(n, struct ceph_osd, o_node);
468 if (o < osd->o_osd)
469 n = n->rb_left;
470 else if (o > osd->o_osd)
471 n = n->rb_right;
472 else
473 return osd;
474 }
475 return NULL;
476}
477
478static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
479{
480 schedule_delayed_work(&osdc->timeout_work,
481 osdc->client->mount_args->osd_keepalive_timeout * HZ);
482}
483
484static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
485{
486 cancel_delayed_work(&osdc->timeout_work);
487}
488
489/*
490 * Register request, assign tid. If this is the first request, set up
491 * the timeout event.
492 */
493static void register_request(struct ceph_osd_client *osdc,
494 struct ceph_osd_request *req)
495{
496 mutex_lock(&osdc->request_mutex);
497 req->r_tid = ++osdc->last_tid;
498 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
499 INIT_LIST_HEAD(&req->r_req_lru_item);
500
501 dout("register_request %p tid %lld\n", req, req->r_tid);
502 __insert_request(osdc, req);
503 ceph_osdc_get_request(req);
504 osdc->num_requests++;
505
506 if (osdc->num_requests == 1) {
507 dout(" first request, scheduling timeout\n");
508 __schedule_osd_timeout(osdc);
509 }
510 mutex_unlock(&osdc->request_mutex);
511}
512
513/*
514 * called under osdc->request_mutex
515 */
516static void __unregister_request(struct ceph_osd_client *osdc,
517 struct ceph_osd_request *req)
518{
519 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
520 rb_erase(&req->r_node, &osdc->requests);
521 osdc->num_requests--;
522
523 if (req->r_osd) {
524 /* make sure the original request isn't in flight. */
525 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
526
527 list_del_init(&req->r_osd_item);
528 if (list_empty(&req->r_osd->o_requests))
529 __move_osd_to_lru(osdc, req->r_osd);
530 req->r_osd = NULL;
531 }
532
533 ceph_osdc_put_request(req);
534
535 list_del_init(&req->r_req_lru_item);
536 if (osdc->num_requests == 0) {
537 dout(" no requests, canceling timeout\n");
538 __cancel_osd_timeout(osdc);
539 }
540}
541
542/*
543 * Cancel a previously queued request message
544 */
545static void __cancel_request(struct ceph_osd_request *req)
546{
547 if (req->r_sent) {
548 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
549 req->r_sent = 0;
550 }
551 list_del_init(&req->r_req_lru_item);
552}
553
554/*
555 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
556 * (as needed), and set the request r_osd appropriately. If there is
557 * no up osd, set r_osd to NULL.
558 *
559 * Return 0 if unchanged, 1 if changed, or negative on error.
560 *
561 * Caller should hold map_sem for read and request_mutex.
562 */
563static int __map_osds(struct ceph_osd_client *osdc,
564 struct ceph_osd_request *req)
565{
566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
567 struct ceph_pg pgid;
568 int o = -1;
569 int err;
570
571 dout("map_osds %p tid %lld\n", req, req->r_tid);
572 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
573 &req->r_file_layout, osdc->osdmap);
574 if (err)
575 return err;
576 pgid = reqhead->layout.ol_pgid;
577 req->r_pgid = pgid;
578
579 o = ceph_calc_pg_primary(osdc->osdmap, pgid);
580
581 if ((req->r_osd && req->r_osd->o_osd == o &&
582 req->r_sent >= req->r_osd->o_incarnation) ||
583 (req->r_osd == NULL && o == -1))
584 return 0; /* no change */
585
586 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
587 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
588 req->r_osd ? req->r_osd->o_osd : -1);
589
590 if (req->r_osd) {
591 __cancel_request(req);
592 list_del_init(&req->r_osd_item);
593 req->r_osd = NULL;
594 }
595
596 req->r_osd = __lookup_osd(osdc, o);
597 if (!req->r_osd && o >= 0) {
598 err = -ENOMEM;
599 req->r_osd = create_osd(osdc);
600 if (!req->r_osd)
601 goto out;
602
603 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
604 req->r_osd->o_osd = o;
605 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
606 __insert_osd(osdc, req->r_osd);
607
608 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
609 }
610
611 if (req->r_osd) {
612 __remove_osd_from_lru(req->r_osd);
613 list_add(&req->r_osd_item, &req->r_osd->o_requests);
614 }
615 err = 1; /* osd changed */
616
617out:
618 return err;
619}
620
621/*
622 * caller should hold map_sem (for read) and request_mutex
623 */
624static int __send_request(struct ceph_osd_client *osdc,
625 struct ceph_osd_request *req)
626{
627 struct ceph_osd_request_head *reqhead;
628 int err;
629
630 err = __map_osds(osdc, req);
631 if (err < 0)
632 return err;
633 if (req->r_osd == NULL) {
634 dout("send_request %p no up osds in pg\n", req);
635 ceph_monc_request_next_osdmap(&osdc->client->monc);
636 return 0;
637 }
638
639 dout("send_request %p tid %llu to osd%d flags %d\n",
640 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
641
642 reqhead = req->r_request->front.iov_base;
643 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
644 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
645 reqhead->reassert_version = req->r_reassert_version;
646
647 req->r_stamp = jiffies;
648	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
649
650 ceph_msg_get(req->r_request); /* send consumes a ref */
651 ceph_con_send(&req->r_osd->o_con, req->r_request);
652 req->r_sent = req->r_osd->o_incarnation;
653 return 0;
654}
655
656/*
657 * Timeout callback, called every N seconds when 1 or more osd
658 * requests have been active for more than N seconds. When this
659 * happens, we ping all OSDs with requests that have timed out, to
660 * ensure any communications channel reset is detected. Reset the
661 * request timeouts another N seconds in the future as we go.
662 * Reschedule the timeout event another N seconds in the future (unless
663 * there are no open requests).
664 */
665static void handle_timeout(struct work_struct *work)
666{
667 struct ceph_osd_client *osdc =
668 container_of(work, struct ceph_osd_client, timeout_work.work);
669 struct ceph_osd_request *req, *last_req = NULL;
670 struct ceph_osd *osd;
671 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
672 unsigned long keepalive =
673 osdc->client->mount_args->osd_keepalive_timeout * HZ;
674 unsigned long last_stamp = 0;
675 struct rb_node *p;
676 struct list_head slow_osds;
677
678 dout("timeout\n");
679 down_read(&osdc->map_sem);
680
681 ceph_monc_request_next_osdmap(&osdc->client->monc);
682
683 mutex_lock(&osdc->request_mutex);
684 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
685 req = rb_entry(p, struct ceph_osd_request, r_node);
686
687 if (req->r_resend) {
688 int err;
689
690 dout("osdc resending prev failed %lld\n", req->r_tid);
691 err = __send_request(osdc, req);
692 if (err)
693 dout("osdc failed again on %lld\n", req->r_tid);
694 else
695 req->r_resend = false;
696 continue;
697 }
698 }
699
700 /*
701 * reset osds that appear to be _really_ unresponsive. this
702 * is a failsafe measure; we really shouldn't be getting to
703 * this point if the system is working properly. the monitors
704 * should mark the osd as failed and we should find out about
705 * it from an updated osd map.
706 */
707 while (!list_empty(&osdc->req_lru)) {
708 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
709 r_req_lru_item);
710
711 if (time_before(jiffies, req->r_stamp + timeout))
712 break;
713
714 BUG_ON(req == last_req && req->r_stamp == last_stamp);
715 last_req = req;
716 last_stamp = req->r_stamp;
717
718 osd = req->r_osd;
719 BUG_ON(!osd);
720 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
721 req->r_tid, osd->o_osd);
722 __kick_requests(osdc, osd);
723 }
724
725 /*
726 * ping osds that are a bit slow. this ensures that if there
727 * is a break in the TCP connection we will notice, and reopen
728 * a connection with that osd (from the fault callback).
729 */
730 INIT_LIST_HEAD(&slow_osds);
731 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
732 if (time_before(jiffies, req->r_stamp + keepalive))
733 break;
734
735 osd = req->r_osd;
736 BUG_ON(!osd);
737 dout(" tid %llu is slow, will send keepalive on osd%d\n",
738 req->r_tid, osd->o_osd);
739 list_move_tail(&osd->o_keepalive_item, &slow_osds);
740 }
741 while (!list_empty(&slow_osds)) {
742 osd = list_entry(slow_osds.next, struct ceph_osd,
743 o_keepalive_item);
744 list_del_init(&osd->o_keepalive_item);
745 ceph_con_keepalive(&osd->o_con);
746 }
747
748 __schedule_osd_timeout(osdc);
749 mutex_unlock(&osdc->request_mutex);
750
751 up_read(&osdc->map_sem);
752}
753
754static void handle_osds_timeout(struct work_struct *work)
755{
756 struct ceph_osd_client *osdc =
757 container_of(work, struct ceph_osd_client,
758 osds_timeout_work.work);
759 unsigned long delay =
760 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
761
762 dout("osds timeout\n");
763 down_read(&osdc->map_sem);
764 remove_old_osds(osdc, 0);
765 up_read(&osdc->map_sem);
766
767 schedule_delayed_work(&osdc->osds_timeout_work,
768 round_jiffies_relative(delay));
769}
770
771/*
772 * handle osd op reply. either call the callback if it is specified,
773 * or do the completion to wake up the waiting thread.
774 */
775static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
776 struct ceph_connection *con)
777{
778 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
779 struct ceph_osd_request *req;
780 u64 tid;
781 int numops, object_len, flags;
782
783 tid = le64_to_cpu(msg->hdr.tid);
784 if (msg->front.iov_len < sizeof(*rhead))
785 goto bad;
786 numops = le32_to_cpu(rhead->num_ops);
787 object_len = le32_to_cpu(rhead->object_len);
788 if (msg->front.iov_len != sizeof(*rhead) + object_len +
789 numops * sizeof(struct ceph_osd_op))
790 goto bad;
791 dout("handle_reply %p tid %llu\n", msg, tid);
792
793 /* lookup */
794 mutex_lock(&osdc->request_mutex);
795 req = __lookup_request(osdc, tid);
796 if (req == NULL) {
797 dout("handle_reply tid %llu dne\n", tid);
798 mutex_unlock(&osdc->request_mutex);
799 return;
800 }
801 ceph_osdc_get_request(req);
802 flags = le32_to_cpu(rhead->flags);
803
804 /*
805 * if this connection filled our message, drop our reference now, to
806 * avoid a (safe but slower) revoke later.
807 */
808 if (req->r_con_filling_msg == con && req->r_reply == msg) {
809 dout(" dropping con_filling_msg ref %p\n", con);
810 req->r_con_filling_msg = NULL;
811 ceph_con_put(con);
812 }
813
814 if (!req->r_got_reply) {
815 unsigned bytes;
816
817 req->r_result = le32_to_cpu(rhead->result);
818 bytes = le32_to_cpu(msg->hdr.data_len);
819 dout("handle_reply result %d bytes %d\n", req->r_result,
820 bytes);
821 if (req->r_result == 0)
822 req->r_result = bytes;
823
824 /* in case this is a write and we need to replay, */
825 req->r_reassert_version = rhead->reassert_version;
826
827 req->r_got_reply = 1;
828 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
829 dout("handle_reply tid %llu dup ack\n", tid);
830 mutex_unlock(&osdc->request_mutex);
831 goto done;
832 }
833
834 dout("handle_reply tid %llu flags %d\n", tid, flags);
835
836 /* either this is a read, or we got the safe response */
837 if ((flags & CEPH_OSD_FLAG_ONDISK) ||
838 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
839 __unregister_request(osdc, req);
840
841 mutex_unlock(&osdc->request_mutex);
842
843 if (req->r_callback)
844 req->r_callback(req, msg);
845 else
846 complete(&req->r_completion);
847
848 if (flags & CEPH_OSD_FLAG_ONDISK) {
849 if (req->r_safe_callback)
850 req->r_safe_callback(req, msg);
851 complete(&req->r_safe_completion); /* fsync waiter */
852 }
853
854done:
855 ceph_osdc_put_request(req);
856 return;
857
858bad:
859 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
860 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
861 (int)sizeof(*rhead));
862 ceph_msg_dump(msg);
863}
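/*
 * Annotation (not part of the patch): a hedged sketch of an async
 * completion callback as invoked by handle_reply() above; the name
 * my_write_done and its body are hypothetical (the real callers live
 * in the writeback code in addr.c):
 *
 *	static void my_write_done(struct ceph_osd_request *req,
 *				  struct ceph_msg *msg)
 *	{
 *		int rc = req->r_result;	    (bytes written, or -errno)
 *		...
 *		ceph_osdc_put_request(req); (drop our kref)
 *	}
 */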
864
865
866static int __kick_requests(struct ceph_osd_client *osdc,
867 struct ceph_osd *kickosd)
868{
869 struct ceph_osd_request *req;
870 struct rb_node *p, *n;
871 int needmap = 0;
872 int err;
873
874 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
875 if (kickosd) {
876 err = __reset_osd(osdc, kickosd);
877 if (err == -EAGAIN)
878 return 1;
879 } else {
880 for (p = rb_first(&osdc->osds); p; p = n) {
881 struct ceph_osd *osd =
882 rb_entry(p, struct ceph_osd, o_node);
883
884 n = rb_next(p);
885 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
886 memcmp(&osd->o_con.peer_addr,
887 ceph_osd_addr(osdc->osdmap,
888 osd->o_osd),
889 sizeof(struct ceph_entity_addr)) != 0)
890 __reset_osd(osdc, osd);
891 }
892 }
893
894 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
895 req = rb_entry(p, struct ceph_osd_request, r_node);
896
897 if (req->r_resend) {
898 dout(" r_resend set on tid %llu\n", req->r_tid);
899 __cancel_request(req);
900 goto kick;
901 }
902 if (req->r_osd && kickosd == req->r_osd) {
903 __cancel_request(req);
904 goto kick;
905 }
906
907 err = __map_osds(osdc, req);
908 if (err == 0)
909 continue; /* no change */
910 if (err < 0) {
911 /*
912 * FIXME: really, we should set the request
913 * error and fail if this isn't a 'nofail'
914 * request, but that's a fair bit more
915 * complicated to do. So retry!
916 */
917 dout(" setting r_resend on %llu\n", req->r_tid);
918 req->r_resend = true;
919 continue;
920 }
921 if (req->r_osd == NULL) {
922 dout("tid %llu maps to no valid osd\n", req->r_tid);
923 needmap++; /* request a newer map */
924 continue;
925 }
926
927kick:
928 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
929 req->r_osd ? req->r_osd->o_osd : -1);
930 req->r_flags |= CEPH_OSD_FLAG_RETRY;
931 err = __send_request(osdc, req);
932 if (err) {
933 dout(" setting r_resend on %llu\n", req->r_tid);
934 req->r_resend = true;
935 }
936 }
937
938 return needmap;
939}
940
941/*
942 * Resubmit osd requests whose osd or osd address has changed. Request
943 * a new osd map if osds are down, or we are otherwise unable to determine
944 * how to direct a request.
945 *
946 * Close connections to down osds.
947 *
948 * If @who is specified, resubmit requests for that specific osd.
949 *
950 * Caller should hold map_sem for read and request_mutex.
951 */
952static void kick_requests(struct ceph_osd_client *osdc,
953 struct ceph_osd *kickosd)
954{
955 int needmap;
956
957 mutex_lock(&osdc->request_mutex);
958 needmap = __kick_requests(osdc, kickosd);
959 mutex_unlock(&osdc->request_mutex);
960
961 if (needmap) {
962 dout("%d requests for down osds, need new map\n", needmap);
963 ceph_monc_request_next_osdmap(&osdc->client->monc);
964 }
965}
966
967/*
968 * Process updated osd map.
969 *
970 * The message contains any number of incremental and full maps, normally
971 * indicating some sort of topology change in the cluster. Kick requests
972 * off to different OSDs as needed.
973 */
974void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
975{
976 void *p, *end, *next;
977 u32 nr_maps, maplen;
978 u32 epoch;
979 struct ceph_osdmap *newmap = NULL, *oldmap;
980 int err;
981 struct ceph_fsid fsid;
982
983 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
984 p = msg->front.iov_base;
985 end = p + msg->front.iov_len;
986
987 /* verify fsid */
988 ceph_decode_need(&p, end, sizeof(fsid), bad);
989 ceph_decode_copy(&p, &fsid, sizeof(fsid));
990 if (ceph_check_fsid(osdc->client, &fsid) < 0)
991 return;
992
993 down_write(&osdc->map_sem);
994
995 /* incremental maps */
996 ceph_decode_32_safe(&p, end, nr_maps, bad);
997 dout(" %d inc maps\n", nr_maps);
998 while (nr_maps > 0) {
999 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1000 epoch = ceph_decode_32(&p);
1001 maplen = ceph_decode_32(&p);
1002 ceph_decode_need(&p, end, maplen, bad);
1003 next = p + maplen;
1004 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1005 dout("applying incremental map %u len %d\n",
1006 epoch, maplen);
1007 newmap = osdmap_apply_incremental(&p, next,
1008 osdc->osdmap,
1009 osdc->client->msgr);
1010 if (IS_ERR(newmap)) {
1011 err = PTR_ERR(newmap);
1012 goto bad;
1013 }
1014 BUG_ON(!newmap);
1015 if (newmap != osdc->osdmap) {
1016 ceph_osdmap_destroy(osdc->osdmap);
1017 osdc->osdmap = newmap;
1018 }
1019 } else {
1020 dout("ignoring incremental map %u len %d\n",
1021 epoch, maplen);
1022 }
1023 p = next;
1024 nr_maps--;
1025 }
1026 if (newmap)
1027 goto done;
1028
1029 /* full maps */
1030 ceph_decode_32_safe(&p, end, nr_maps, bad);
1031 dout(" %d full maps\n", nr_maps);
1032 while (nr_maps) {
1033 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1034 epoch = ceph_decode_32(&p);
1035 maplen = ceph_decode_32(&p);
1036 ceph_decode_need(&p, end, maplen, bad);
1037 if (nr_maps > 1) {
1038 dout("skipping non-latest full map %u len %d\n",
1039 epoch, maplen);
1040 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1041 dout("skipping full map %u len %d, "
1042 "older than our %u\n", epoch, maplen,
1043 osdc->osdmap->epoch);
1044 } else {
1045 dout("taking full map %u len %d\n", epoch, maplen);
1046 newmap = osdmap_decode(&p, p+maplen);
1047 if (IS_ERR(newmap)) {
1048 err = PTR_ERR(newmap);
1049 goto bad;
1050 }
1051 BUG_ON(!newmap);
1052 oldmap = osdc->osdmap;
1053 osdc->osdmap = newmap;
1054 if (oldmap)
1055 ceph_osdmap_destroy(oldmap);
1056 }
1057 p += maplen;
1058 nr_maps--;
1059 }
1060
1061done:
1062 downgrade_write(&osdc->map_sem);
1063 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1064 if (newmap)
1065 kick_requests(osdc, NULL);
1066 up_read(&osdc->map_sem);
1067 return;
1068
1069bad:
1070 pr_err("osdc handle_map corrupt msg\n");
1071 ceph_msg_dump(msg);
1072 up_write(&osdc->map_sem);
1073 return;
1074}
1075
1076
1077/*
1078 * A read request prepares specific pages that data is to be read into.
1079 * When a message is being read off the wire, we call __prepare_pages to
1080 * find those pages.
1081 * Returns 0 on success, -1 on failure.
1082 */
1083static int __prepare_pages(struct ceph_connection *con,
1084 struct ceph_msg_header *hdr,
1085 struct ceph_osd_request *req,
1086 u64 tid,
1087 struct ceph_msg *m)
1088{
1089 struct ceph_osd *osd = con->private;
1090 struct ceph_osd_client *osdc;
1091 int ret = -1;
1092 int data_len = le32_to_cpu(hdr->data_len);
1093 unsigned data_off = le16_to_cpu(hdr->data_off);
1094
1095 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1096
1097 if (!osd)
1098 return -1;
1099
1100 osdc = osd->o_osdc;
1101
1102 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1103 tid, req->r_num_pages, want);
1104 if (unlikely(req->r_num_pages < want))
1105 goto out;
1106 m->pages = req->r_pages;
1107 m->nr_pages = req->r_num_pages;
1108 ret = 0; /* success */
1109out:
1110 BUG_ON(ret < 0 || m->nr_pages < want);
1111
1112 return ret;
1113}
1114
1115/*
1116 * Register request, send initial attempt.
1117 */
1118int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1119 struct ceph_osd_request *req,
1120 bool nofail)
1121{
1122 int rc = 0;
1123
1124 req->r_request->pages = req->r_pages;
1125 req->r_request->nr_pages = req->r_num_pages;
1126
1127 register_request(osdc, req);
1128
1129 down_read(&osdc->map_sem);
1130 mutex_lock(&osdc->request_mutex);
1131 /*
1132 * a racing kick_requests() may have sent the message for us
1133 * while we dropped request_mutex above, so only send now if
1134	 * the request hasn't been touched yet.
1135 */
1136 if (req->r_sent == 0) {
1137 rc = __send_request(osdc, req);
1138 if (rc) {
1139 if (nofail) {
1140 dout("osdc_start_request failed send, "
1141 " marking %lld\n", req->r_tid);
1142 req->r_resend = true;
1143 rc = 0;
1144 } else {
1145 __unregister_request(osdc, req);
1146 }
1147 }
1148 }
1149 mutex_unlock(&osdc->request_mutex);
1150 up_read(&osdc->map_sem);
1151 return rc;
1152}
1153
1154/*
1155 * wait for a request to complete
1156 */
1157int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1158 struct ceph_osd_request *req)
1159{
1160 int rc;
1161
1162 rc = wait_for_completion_interruptible(&req->r_completion);
1163 if (rc < 0) {
1164 mutex_lock(&osdc->request_mutex);
1165 __cancel_request(req);
1166 __unregister_request(osdc, req);
1167 mutex_unlock(&osdc->request_mutex);
1168 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1169 return rc;
1170 }
1171
1172 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1173 return req->r_result;
1174}
1175
1176/*
1177 * sync - wait for all in-flight requests to flush. avoid starvation.
1178 */
1179void ceph_osdc_sync(struct ceph_osd_client *osdc)
1180{
1181 struct ceph_osd_request *req;
1182 u64 last_tid, next_tid = 0;
1183
1184 mutex_lock(&osdc->request_mutex);
1185 last_tid = osdc->last_tid;
1186 while (1) {
1187 req = __lookup_request_ge(osdc, next_tid);
1188 if (!req)
1189 break;
1190 if (req->r_tid > last_tid)
1191 break;
1192
1193 next_tid = req->r_tid + 1;
1194 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1195 continue;
1196
1197 ceph_osdc_get_request(req);
1198 mutex_unlock(&osdc->request_mutex);
1199 dout("sync waiting on tid %llu (last is %llu)\n",
1200 req->r_tid, last_tid);
1201 wait_for_completion(&req->r_safe_completion);
1202 mutex_lock(&osdc->request_mutex);
1203 ceph_osdc_put_request(req);
1204 }
1205 mutex_unlock(&osdc->request_mutex);
1206 dout("sync done (thru tid %llu)\n", last_tid);
1207}
1208
1209/*
1210 * init, shutdown
1211 */
1212int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1213{
1214 int err;
1215
1216 dout("init\n");
1217 osdc->client = client;
1218 osdc->osdmap = NULL;
1219 init_rwsem(&osdc->map_sem);
1220 init_completion(&osdc->map_waiters);
1221 osdc->last_requested_map = 0;
1222 mutex_init(&osdc->request_mutex);
1223 osdc->last_tid = 0;
1224 osdc->osds = RB_ROOT;
1225 INIT_LIST_HEAD(&osdc->osd_lru);
1226 osdc->requests = RB_ROOT;
1227 INIT_LIST_HEAD(&osdc->req_lru);
1228 osdc->num_requests = 0;
1229 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1230 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1231
1232 schedule_delayed_work(&osdc->osds_timeout_work,
1233 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1234
1235 err = -ENOMEM;
1236 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1237 sizeof(struct ceph_osd_request));
1238 if (!osdc->req_mempool)
1239 goto out;
1240
1241 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
1242 if (err < 0)
1243 goto out_mempool;
1244 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1245 OSD_OPREPLY_FRONT_LEN, 10, true);
1246 if (err < 0)
1247 goto out_msgpool;
1248 return 0;
1249
1250out_msgpool:
1251 ceph_msgpool_destroy(&osdc->msgpool_op);
1252out_mempool:
1253 mempool_destroy(osdc->req_mempool);
1254out:
1255 return err;
1256}
1257
1258void ceph_osdc_stop(struct ceph_osd_client *osdc)
1259{
1260 cancel_delayed_work_sync(&osdc->timeout_work);
1261 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1262 if (osdc->osdmap) {
1263 ceph_osdmap_destroy(osdc->osdmap);
1264 osdc->osdmap = NULL;
1265 }
1266 remove_old_osds(osdc, 1);
1267 mempool_destroy(osdc->req_mempool);
1268 ceph_msgpool_destroy(&osdc->msgpool_op);
1269 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1270}
1271
1272/*
1273 * Read some contiguous pages. If we cross a stripe boundary, shorten
1274 * *plen. Return number of bytes read, or error.
1275 */
1276int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1277 struct ceph_vino vino, struct ceph_file_layout *layout,
1278 u64 off, u64 *plen,
1279 u32 truncate_seq, u64 truncate_size,
1280 struct page **pages, int num_pages)
1281{
1282 struct ceph_osd_request *req;
1283 int rc = 0;
1284
1285 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1286 vino.snap, off, *plen);
1287 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1288 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1289 NULL, 0, truncate_seq, truncate_size, NULL,
1290 false, 1);
1291 if (IS_ERR(req))
1292 return PTR_ERR(req);
1293
1294 /* it may be a short read due to an object boundary */
1295 req->r_pages = pages;
1296 num_pages = calc_pages_for(off, *plen);
1297 req->r_num_pages = num_pages;
1298
1299 dout("readpages final extent is %llu~%llu (%d pages)\n",
1300 off, *plen, req->r_num_pages);
1301
1302 rc = ceph_osdc_start_request(osdc, req, false);
1303 if (!rc)
1304 rc = ceph_osdc_wait_request(osdc, req);
1305
1306 ceph_osdc_put_request(req);
1307 dout("readpages result %d\n", rc);
1308 return rc;
1309}
1310
1311/*
1312 * do a synchronous write on N pages
1313 */
1314int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1315 struct ceph_file_layout *layout,
1316 struct ceph_snap_context *snapc,
1317 u64 off, u64 len,
1318 u32 truncate_seq, u64 truncate_size,
1319 struct timespec *mtime,
1320 struct page **pages, int num_pages,
1321 int flags, int do_sync, bool nofail)
1322{
1323 struct ceph_osd_request *req;
1324 int rc = 0;
1325
1326 BUG_ON(vino.snap != CEPH_NOSNAP);
1327 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1328 CEPH_OSD_OP_WRITE,
1329 flags | CEPH_OSD_FLAG_ONDISK |
1330 CEPH_OSD_FLAG_WRITE,
1331 snapc, do_sync,
1332 truncate_seq, truncate_size, mtime,
1333 nofail, 1);
1334 if (IS_ERR(req))
1335 return PTR_ERR(req);
1336
1337 /* it may be a short write due to an object boundary */
1338 req->r_pages = pages;
1339 req->r_num_pages = calc_pages_for(off, len);
1340 dout("writepages %llu~%llu (%d pages)\n", off, len,
1341 req->r_num_pages);
1342
1343 rc = ceph_osdc_start_request(osdc, req, nofail);
1344 if (!rc)
1345 rc = ceph_osdc_wait_request(osdc, req);
1346
1347 ceph_osdc_put_request(req);
1348 if (rc == 0)
1349 rc = len;
1350 dout("writepages result %d\n", rc);
1351 return rc;
1352}
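/*
 * Annotation (not part of the patch): an illustrative synchronous
 * write of a single page at file offset 0; the osdc/inode/snapc
 * variables are hypothetical, mirroring how writeback code would
 * call this:
 *
 *	rc = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout,
 *				  snapc, 0, PAGE_SIZE,
 *				  truncate_seq, truncate_size, &mtime,
 *				  &page, 1, 0, 0, true);
 *
 * On success rc is the byte count written; on failure it is a
 * negative errno.
 */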
1353
1354/*
1355 * handle incoming message
1356 */
1357static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1358{
1359 struct ceph_osd *osd = con->private;
1360 struct ceph_osd_client *osdc;
1361 int type = le16_to_cpu(msg->hdr.type);
1362
1363 if (!osd)
1364 return;
1365 osdc = osd->o_osdc;
1366
1367 switch (type) {
1368 case CEPH_MSG_OSD_MAP:
1369 ceph_osdc_handle_map(osdc, msg);
1370 break;
1371 case CEPH_MSG_OSD_OPREPLY:
1372 handle_reply(osdc, msg, con);
1373 break;
1374
1375 default:
1376 pr_err("received unknown message type %d %s\n", type,
1377 ceph_msg_type_name(type));
1378 }
1379 ceph_msg_put(msg);
1380}
1381
1382/*
1383 * lookup and return message for incoming reply
1384 */
1385static struct ceph_msg *get_reply(struct ceph_connection *con,
1386 struct ceph_msg_header *hdr,
1387 int *skip)
1388{
1389 struct ceph_osd *osd = con->private;
1390 struct ceph_osd_client *osdc = osd->o_osdc;
1391 struct ceph_msg *m;
1392 struct ceph_osd_request *req;
1393 int front = le32_to_cpu(hdr->front_len);
1394 int data_len = le32_to_cpu(hdr->data_len);
1395 u64 tid;
1396 int err;
1397
1398 tid = le64_to_cpu(hdr->tid);
1399 mutex_lock(&osdc->request_mutex);
1400 req = __lookup_request(osdc, tid);
1401 if (!req) {
1402 *skip = 1;
1403 m = NULL;
1404 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1405 osd->o_osd);
1406 goto out;
1407 }
1408
1409 if (req->r_con_filling_msg) {
1410 dout("get_reply revoking msg %p from old con %p\n",
1411 req->r_reply, req->r_con_filling_msg);
1412 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1413 ceph_con_put(req->r_con_filling_msg);
1414 }
1415
1416 if (front > req->r_reply->front.iov_len) {
1417 pr_warning("get_reply front %d > preallocated %d\n",
1418 front, (int)req->r_reply->front.iov_len);
1419 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
1420 if (IS_ERR(m))
1421 goto out;
1422 ceph_msg_put(req->r_reply);
1423 req->r_reply = m;
1424 }
1425 m = ceph_msg_get(req->r_reply);
1426
1427 if (data_len > 0) {
1428 err = __prepare_pages(con, hdr, req, tid, m);
1429 if (err < 0) {
1430 *skip = 1;
1431 ceph_msg_put(m);
1432 m = ERR_PTR(err);
1433 }
1434 }
1435 *skip = 0;
1436 req->r_con_filling_msg = ceph_con_get(con);
1437 dout("get_reply tid %lld %p\n", tid, m);
1438
1439out:
1440 mutex_unlock(&osdc->request_mutex);
1441 return m;
1442
1443}
1444
1445static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1446 struct ceph_msg_header *hdr,
1447 int *skip)
1448{
1449 struct ceph_osd *osd = con->private;
1450 int type = le16_to_cpu(hdr->type);
1451 int front = le32_to_cpu(hdr->front_len);
1452
1453 switch (type) {
1454 case CEPH_MSG_OSD_MAP:
1455 return ceph_msg_new(type, front, 0, 0, NULL);
1456 case CEPH_MSG_OSD_OPREPLY:
1457 return get_reply(con, hdr, skip);
1458 default:
1459 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1460 osd->o_osd);
1461 *skip = 1;
1462 return NULL;
1463 }
1464}
1465
1466/*
1467 * Wrappers to refcount containing ceph_osd struct
1468 */
1469static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1470{
1471 struct ceph_osd *osd = con->private;
1472 if (get_osd(osd))
1473 return con;
1474 return NULL;
1475}
1476
1477static void put_osd_con(struct ceph_connection *con)
1478{
1479 struct ceph_osd *osd = con->private;
1480 put_osd(osd);
1481}
1482
1483/*
1484 * authentication
1485 */
1486static int get_authorizer(struct ceph_connection *con,
1487 void **buf, int *len, int *proto,
1488 void **reply_buf, int *reply_len, int force_new)
1489{
1490 struct ceph_osd *o = con->private;
1491 struct ceph_osd_client *osdc = o->o_osdc;
1492 struct ceph_auth_client *ac = osdc->client->monc.auth;
1493 int ret = 0;
1494
1495 if (force_new && o->o_authorizer) {
1496 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1497 o->o_authorizer = NULL;
1498 }
1499 if (o->o_authorizer == NULL) {
1500 ret = ac->ops->create_authorizer(
1501 ac, CEPH_ENTITY_TYPE_OSD,
1502 &o->o_authorizer,
1503 &o->o_authorizer_buf,
1504 &o->o_authorizer_buf_len,
1505 &o->o_authorizer_reply_buf,
1506 &o->o_authorizer_reply_buf_len);
1507 if (ret)
1508 return ret;
1509 }
1510
1511 *proto = ac->protocol;
1512 *buf = o->o_authorizer_buf;
1513 *len = o->o_authorizer_buf_len;
1514 *reply_buf = o->o_authorizer_reply_buf;
1515 *reply_len = o->o_authorizer_reply_buf_len;
1516 return 0;
1517}
1518
1519
1520static int verify_authorizer_reply(struct ceph_connection *con, int len)
1521{
1522 struct ceph_osd *o = con->private;
1523 struct ceph_osd_client *osdc = o->o_osdc;
1524 struct ceph_auth_client *ac = osdc->client->monc.auth;
1525
1526 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1527}
1528
1529static int invalidate_authorizer(struct ceph_connection *con)
1530{
1531 struct ceph_osd *o = con->private;
1532 struct ceph_osd_client *osdc = o->o_osdc;
1533 struct ceph_auth_client *ac = osdc->client->monc.auth;
1534
1535 if (ac->ops->invalidate_authorizer)
1536 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1537
1538 return ceph_monc_validate_auth(&osdc->client->monc);
1539}
1540
1541static const struct ceph_connection_operations osd_con_ops = {
1542 .get = get_osd_con,
1543 .put = put_osd_con,
1544 .dispatch = dispatch,
1545 .get_authorizer = get_authorizer,
1546 .verify_authorizer_reply = verify_authorizer_reply,
1547 .invalidate_authorizer = invalidate_authorizer,
1548 .alloc_msg = alloc_msg,
1549 .fault = osd_reset,
1550};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..b0759911e7c3
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,166 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51
52 struct ceph_connection *r_con_filling_msg;
53
54 struct ceph_msg *r_request, *r_reply;
55 int r_result;
56 int r_flags; /* any additional flags for the osd */
57 u32 r_sent; /* >0 if r_request is sending/sent */
58 int r_got_reply;
59
60 struct ceph_osd_client *r_osdc;
61 struct kref r_kref;
62 bool r_mempool;
63 struct completion r_completion, r_safe_completion;
64 ceph_osdc_callback_t r_callback, r_safe_callback;
65 struct ceph_eversion r_reassert_version;
66 struct list_head r_unsafe_item;
67
68 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70
71 char r_oid[40]; /* object name */
72 int r_oid_len;
73 unsigned long r_stamp; /* send OR check time */
74 bool r_resend; /* msg send failed, needs retry */
75
76 struct ceph_file_layout r_file_layout;
77 struct ceph_snap_context *r_snapc; /* snap context for writes */
78 unsigned r_num_pages; /* size of page array (follows) */
79 struct page **r_pages; /* pages for data payload */
80 int r_pages_from_pool;
81 int r_own_pages; /* if true, i own page list */
82};
83
84struct ceph_osd_client {
85 struct ceph_client *client;
86
87 struct ceph_osdmap *osdmap; /* current map */
88 struct rw_semaphore map_sem;
89 struct completion map_waiters;
90 u64 last_requested_map;
91
92 struct mutex request_mutex;
93 struct rb_root osds; /* osds */
94 struct list_head osd_lru; /* idle osds */
95 u64 timeout_tid; /* tid of timeout triggering rq */
96 u64 last_tid; /* tid of last request */
97 struct rb_root requests; /* pending requests */
98 struct list_head req_lru; /* pending requests lru */
99 int num_requests;
100 struct delayed_work timeout_work;
101 struct delayed_work osds_timeout_work;
102#ifdef CONFIG_DEBUG_FS
103 struct dentry *debugfs_file;
104#endif
105
106 mempool_t *req_mempool;
107
108 struct ceph_msgpool msgpool_op;
109 struct ceph_msgpool msgpool_op_reply;
110};
111
112extern int ceph_osdc_init(struct ceph_osd_client *osdc,
113 struct ceph_client *client);
114extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
115
116extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
117 struct ceph_msg *msg);
118extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
119 struct ceph_msg *msg);
120
121extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
122 struct ceph_file_layout *layout,
123 struct ceph_vino vino,
124 u64 offset, u64 *len, int op, int flags,
125 struct ceph_snap_context *snapc,
126 int do_sync, u32 truncate_seq,
127 u64 truncate_size,
128 struct timespec *mtime,
129 bool use_mempool, int num_reply);
130
131static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
132{
133 kref_get(&req->r_kref);
134}
135extern void ceph_osdc_release_request(struct kref *kref);
136static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
137{
138 kref_put(&req->r_kref, ceph_osdc_release_request);
139}
140
141extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
142 struct ceph_osd_request *req,
143 bool nofail);
144extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
145 struct ceph_osd_request *req);
146extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
147
148extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
149 struct ceph_vino vino,
150 struct ceph_file_layout *layout,
151 u64 off, u64 *plen,
152 u32 truncate_seq, u64 truncate_size,
153 struct page **pages, int nr_pages);
154
155extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
156 struct ceph_vino vino,
157 struct ceph_file_layout *layout,
158 struct ceph_snap_context *sc,
159 u64 off, u64 len,
160 u32 truncate_seq, u64 truncate_size,
161 struct timespec *mtime,
162 struct page **pages, int nr_pages,
163 int flags, int do_sync, bool nofail);
164
165#endif
166
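Taken together, the declarations above define the request lifecycle a caller drives: build, submit, wait, release. A hedged sketch mirroring ceph_osdc_readpages() in osd_client.c (all variables are the caller's):

	struct ceph_osd_request *req;
	int rc;

	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				    NULL, 0, truncate_seq, truncate_size,
				    NULL, false, 1);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->r_pages = pages;			/* data lands in these pages */
	req->r_num_pages = calc_pages_for(off, len);

	rc = ceph_osdc_start_request(osdc, req, false);
	if (!rc)
		rc = ceph_osdc_wait_request(osdc, req);	/* returns r_result */
	ceph_osdc_put_request(req);		/* drop our kref */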
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..21c6623c4b07
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1024 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
 26		if (state & CEPH_OSD_UP) {
 27			size_t l = strlen(str);
 28			snprintf(str + l, len - l, "%sup", flag ? ", " : "");
 29			flag = 1;
 30		}
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
37
38/* maps */
39
40static int calc_bits_of(unsigned t)
41{
42 int b = 0;
43 while (t) {
44 t = t >> 1;
45 b++;
46 }
47 return b;
48}
49
50/*
 51 * the foo_mask is the smallest value 2^n-1 that is >= foo-1.
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
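/*
 * Worked example (annotation, not part of the patch): pg_num = 6 gives
 * calc_bits_of(6 - 1) == 3, so pg_num_mask == (1 << 3) - 1 == 7, the
 * smallest 2^n-1 covering pg ids 0..5. For an exact power of two,
 * pg_num = 8, the mask is likewise 7 (== pg_num - 1).
 */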
63
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
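/*
 * Note on the pattern used throughout crush_decode(): ceph_decode_need()
 * bounds-checks the requested byte count and jumps to the bad label on a
 * short buffer, each ceph_decode_*() advances *p past what it consumed,
 * and any failure tears the partially built map down in one place via
 * crush_destroy().
 */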
316
317
318/*
319 * osd map
320 */
321void ceph_osdmap_destroy(struct ceph_osdmap *map)
322{
323 dout("osdmap_destroy %p\n", map);
324 if (map->crush)
325 crush_destroy(map->crush);
326 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
327 struct ceph_pg_mapping *pg =
328 rb_entry(rb_first(&map->pg_temp),
329 struct ceph_pg_mapping, node);
330 rb_erase(&pg->node, &map->pg_temp);
331 kfree(pg);
332 }
333 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
334 struct ceph_pg_pool_info *pi =
335 rb_entry(rb_first(&map->pg_pools),
336 struct ceph_pg_pool_info, node);
337 rb_erase(&pi->node, &map->pg_pools);
338 kfree(pi);
339 }
340 kfree(map->osd_state);
341 kfree(map->osd_weight);
342 kfree(map->osd_addr);
343 kfree(map);
344}
345
346/*
347 * adjust max osd value. reallocate arrays.
348 */
349static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
350{
351 u8 *state;
352 struct ceph_entity_addr *addr;
353 u32 *weight;
354
355 state = kcalloc(max, sizeof(*state), GFP_NOFS);
356 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
357 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
358 if (state == NULL || addr == NULL || weight == NULL) {
359 kfree(state);
360 kfree(addr);
361 kfree(weight);
362 return -ENOMEM;
363 }
364
365 /* copy old? */
366 if (map->osd_state) {
367 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
368 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
369 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
370 kfree(map->osd_state);
371 kfree(map->osd_addr);
372 kfree(map->osd_weight);
373 }
374
375 map->osd_state = state;
376 map->osd_weight = weight;
377 map->osd_addr = addr;
378 map->max_osd = max;
379 return 0;
380}
381
382/*
383 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
384 * to a set of osds)
385 */
386static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
387{
388 u64 a = *(u64 *)&l;
389 u64 b = *(u64 *)&r;
390
391 if (a < b)
392 return -1;
393 if (a > b)
394 return 1;
395 return 0;
396}
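/*
 * The casts in pgid_cmp() rely on struct ceph_pg being a packed 8-byte
 * struct (see rados.h), so an entire pgid can be compared as a single
 * u64.  The resulting order is arbitrary but total, which is all the
 * rbtree needs.
 */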
397
398static int __insert_pg_mapping(struct ceph_pg_mapping *new,
399 struct rb_root *root)
400{
401 struct rb_node **p = &root->rb_node;
402 struct rb_node *parent = NULL;
403 struct ceph_pg_mapping *pg = NULL;
404 int c;
405
406 while (*p) {
407 parent = *p;
408 pg = rb_entry(parent, struct ceph_pg_mapping, node);
409 c = pgid_cmp(new->pgid, pg->pgid);
410 if (c < 0)
411 p = &(*p)->rb_left;
412 else if (c > 0)
413 p = &(*p)->rb_right;
414 else
415 return -EEXIST;
416 }
417
418 rb_link_node(&new->node, parent, p);
419 rb_insert_color(&new->node, root);
420 return 0;
421}
422
423static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
424 struct ceph_pg pgid)
425{
426 struct rb_node *n = root->rb_node;
427 struct ceph_pg_mapping *pg;
428 int c;
429
430 while (n) {
431 pg = rb_entry(n, struct ceph_pg_mapping, node);
432 c = pgid_cmp(pgid, pg->pgid);
433 if (c < 0)
434 n = n->rb_left;
435 else if (c > 0)
436 n = n->rb_right;
437 else
438 return pg;
439 }
440 return NULL;
441}
442
443/*
444 * rbtree of pg pool info
445 */
446static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
447{
448 struct rb_node **p = &root->rb_node;
449 struct rb_node *parent = NULL;
450 struct ceph_pg_pool_info *pi = NULL;
451
452 while (*p) {
453 parent = *p;
454 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
455 if (new->id < pi->id)
456 p = &(*p)->rb_left;
457 else if (new->id > pi->id)
458 p = &(*p)->rb_right;
459 else
460 return -EEXIST;
461 }
462
463 rb_link_node(&new->node, parent, p);
464 rb_insert_color(&new->node, root);
465 return 0;
466}
467
468static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
469{
470 struct ceph_pg_pool_info *pi;
471 struct rb_node *n = root->rb_node;
472
473 while (n) {
474 pi = rb_entry(n, struct ceph_pg_pool_info, node);
475 if (id < pi->id)
476 n = n->rb_left;
477 else if (id > pi->id)
478 n = n->rb_right;
479 else
480 return pi;
481 }
482 return NULL;
483}
484
 485static void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
486{
487 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
488 calc_pg_masks(pi);
489 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
490 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
491}
492
493/*
494 * decode a full map.
495 */
496struct ceph_osdmap *osdmap_decode(void **p, void *end)
497{
498 struct ceph_osdmap *map;
499 u16 version;
500 u32 len, max, i;
501 u8 ev;
502 int err = -EINVAL;
503 void *start = *p;
504 struct ceph_pg_pool_info *pi;
505
506 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
507
508 map = kzalloc(sizeof(*map), GFP_NOFS);
509 if (map == NULL)
510 return ERR_PTR(-ENOMEM);
511 map->pg_temp = RB_ROOT;
512
513 ceph_decode_16_safe(p, end, version, bad);
514 if (version > CEPH_OSDMAP_VERSION) {
515 pr_warning("got unknown v %d > %d of osdmap\n", version,
516 CEPH_OSDMAP_VERSION);
517 goto bad;
518 }
519
520 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
521 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
522 map->epoch = ceph_decode_32(p);
523 ceph_decode_copy(p, &map->created, sizeof(map->created));
524 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
525
526 ceph_decode_32_safe(p, end, max, bad);
527 while (max--) {
528 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
529 pi = kmalloc(sizeof(*pi), GFP_NOFS);
530 if (!pi)
531 goto bad;
532 pi->id = ceph_decode_32(p);
533 ev = ceph_decode_8(p); /* encoding version */
534 if (ev > CEPH_PG_POOL_VERSION) {
535 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
536 ev, CEPH_PG_POOL_VERSION);
 537 kfree(pi);
 538 goto bad;
538 }
539 __decode_pool(p, pi);
540 __insert_pg_pool(&map->pg_pools, pi);
541 }
542 ceph_decode_32_safe(p, end, map->pool_max, bad);
543
544 ceph_decode_32_safe(p, end, map->flags, bad);
545
546 max = ceph_decode_32(p);
547
548 /* (re)alloc osd arrays */
549 err = osdmap_set_max_osd(map, max);
550 if (err < 0)
551 goto bad;
552 dout("osdmap_decode max_osd = %d\n", map->max_osd);
553
554 /* osds */
555 err = -EINVAL;
556 ceph_decode_need(p, end, 3*sizeof(u32) +
557 map->max_osd*(1 + sizeof(*map->osd_weight) +
558 sizeof(*map->osd_addr)), bad);
559 *p += 4; /* skip length field (should match max) */
560 ceph_decode_copy(p, map->osd_state, map->max_osd);
561
562 *p += 4; /* skip length field (should match max) */
563 for (i = 0; i < map->max_osd; i++)
564 map->osd_weight[i] = ceph_decode_32(p);
565
566 *p += 4; /* skip length field (should match max) */
567 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
568 for (i = 0; i < map->max_osd; i++)
569 ceph_decode_addr(&map->osd_addr[i]);
570
571 /* pg_temp */
572 ceph_decode_32_safe(p, end, len, bad);
573 for (i = 0; i < len; i++) {
574 int n, j;
575 struct ceph_pg pgid;
576 struct ceph_pg_mapping *pg;
577
578 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
579 ceph_decode_copy(p, &pgid, sizeof(pgid));
580 n = ceph_decode_32(p);
581 ceph_decode_need(p, end, n * sizeof(u32), bad);
582 err = -ENOMEM;
583 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
584 if (!pg)
585 goto bad;
586 pg->pgid = pgid;
587 pg->len = n;
588 for (j = 0; j < n; j++)
589 pg->osds[j] = ceph_decode_32(p);
590
591 err = __insert_pg_mapping(pg, &map->pg_temp);
592 if (err)
593 goto bad;
594 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
595 }
596
597 /* crush */
598 ceph_decode_32_safe(p, end, len, bad);
599 dout("osdmap_decode crush len %d from off 0x%x\n", len,
600 (int)(*p - start));
601 ceph_decode_need(p, end, len, bad);
602 map->crush = crush_decode(*p, end);
603 *p += len;
604 if (IS_ERR(map->crush)) {
605 err = PTR_ERR(map->crush);
606 map->crush = NULL;
607 goto bad;
608 }
609
610 /* ignore the rest of the map */
611 *p = end;
612
613 dout("osdmap_decode done %p %p\n", *p, end);
614 return map;
615
616bad:
617 dout("osdmap_decode fail\n");
618 ceph_osdmap_destroy(map);
619 return ERR_PTR(err);
620}
621
622/*
623 * decode and apply an incremental map update.
624 */
625struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
626 struct ceph_osdmap *map,
627 struct ceph_messenger *msgr)
628{
629 struct crush_map *newcrush = NULL;
630 struct ceph_fsid fsid;
631 u32 epoch = 0;
632 struct ceph_timespec modified;
633 u32 len, pool;
634 __s32 new_pool_max, new_flags, max;
635 void *start = *p;
636 int err = -EINVAL;
637 u16 version;
638 struct rb_node *rbp;
639
640 ceph_decode_16_safe(p, end, version, bad);
641 if (version > CEPH_OSDMAP_INC_VERSION) {
642 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
643 CEPH_OSDMAP_INC_VERSION);
644 goto bad;
645 }
646
647 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
648 bad);
649 ceph_decode_copy(p, &fsid, sizeof(fsid));
650 epoch = ceph_decode_32(p);
651 BUG_ON(epoch != map->epoch+1);
652 ceph_decode_copy(p, &modified, sizeof(modified));
653 new_pool_max = ceph_decode_32(p);
654 new_flags = ceph_decode_32(p);
655
656 /* full map? */
657 ceph_decode_32_safe(p, end, len, bad);
658 if (len > 0) {
659 dout("apply_incremental full map len %d, %p to %p\n",
660 len, *p, end);
661 return osdmap_decode(p, min(*p+len, end));
662 }
663
664 /* new crush? */
665 ceph_decode_32_safe(p, end, len, bad);
666 if (len > 0) {
667 dout("apply_incremental new crush map len %d, %p to %p\n",
668 len, *p, end);
669 newcrush = crush_decode(*p, min(*p+len, end));
670 if (IS_ERR(newcrush))
 671 return ERR_CAST(newcrush);
672 }
673
674 /* new flags? */
675 if (new_flags >= 0)
676 map->flags = new_flags;
677 if (new_pool_max >= 0)
678 map->pool_max = new_pool_max;
679
680 ceph_decode_need(p, end, 5*sizeof(u32), bad);
681
682 /* new max? */
683 max = ceph_decode_32(p);
684 if (max >= 0) {
685 err = osdmap_set_max_osd(map, max);
686 if (err < 0)
687 goto bad;
688 }
689
690 map->epoch++;
 691 map->modified = modified;
692 if (newcrush) {
693 if (map->crush)
694 crush_destroy(map->crush);
695 map->crush = newcrush;
696 newcrush = NULL;
697 }
698
699 /* new_pool */
700 ceph_decode_32_safe(p, end, len, bad);
701 while (len--) {
702 __u8 ev;
703 struct ceph_pg_pool_info *pi;
704
705 ceph_decode_32_safe(p, end, pool, bad);
706 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
707 ev = ceph_decode_8(p); /* encoding version */
708 if (ev > CEPH_PG_POOL_VERSION) {
709 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
710 ev, CEPH_PG_POOL_VERSION);
711 goto bad;
712 }
713 pi = __lookup_pg_pool(&map->pg_pools, pool);
714 if (!pi) {
715 pi = kmalloc(sizeof(*pi), GFP_NOFS);
716 if (!pi) {
717 err = -ENOMEM;
718 goto bad;
719 }
720 pi->id = pool;
721 __insert_pg_pool(&map->pg_pools, pi);
722 }
723 __decode_pool(p, pi);
724 }
725
726 /* old_pool */
727 ceph_decode_32_safe(p, end, len, bad);
728 while (len--) {
729 struct ceph_pg_pool_info *pi;
730
731 ceph_decode_32_safe(p, end, pool, bad);
732 pi = __lookup_pg_pool(&map->pg_pools, pool);
733 if (pi) {
734 rb_erase(&pi->node, &map->pg_pools);
735 kfree(pi);
736 }
737 }
738
739 /* new_up */
740 err = -EINVAL;
741 ceph_decode_32_safe(p, end, len, bad);
742 while (len--) {
743 u32 osd;
744 struct ceph_entity_addr addr;
745 ceph_decode_32_safe(p, end, osd, bad);
746 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
747 ceph_decode_addr(&addr);
748 pr_info("osd%d up\n", osd);
749 BUG_ON(osd >= map->max_osd);
750 map->osd_state[osd] |= CEPH_OSD_UP;
751 map->osd_addr[osd] = addr;
752 }
753
754 /* new_down */
755 ceph_decode_32_safe(p, end, len, bad);
756 while (len--) {
757 u32 osd;
758 ceph_decode_32_safe(p, end, osd, bad);
759 (*p)++; /* clean flag */
760 pr_info("osd%d down\n", osd);
761 if (osd < map->max_osd)
762 map->osd_state[osd] &= ~CEPH_OSD_UP;
763 }
764
765 /* new_weight */
766 ceph_decode_32_safe(p, end, len, bad);
767 while (len--) {
768 u32 osd, off;
769 ceph_decode_need(p, end, sizeof(u32)*2, bad);
770 osd = ceph_decode_32(p);
771 off = ceph_decode_32(p);
772 pr_info("osd%d weight 0x%x %s\n", osd, off,
773 off == CEPH_OSD_IN ? "(in)" :
774 (off == CEPH_OSD_OUT ? "(out)" : ""));
775 if (osd < map->max_osd)
776 map->osd_weight[osd] = off;
777 }
778
779 /* new_pg_temp */
780 rbp = rb_first(&map->pg_temp);
781 ceph_decode_32_safe(p, end, len, bad);
782 while (len--) {
783 struct ceph_pg_mapping *pg;
784 int j;
785 struct ceph_pg pgid;
786 u32 pglen;
787 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
788 ceph_decode_copy(p, &pgid, sizeof(pgid));
789 pglen = ceph_decode_32(p);
790
791 /* remove any? */
792 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
793 node)->pgid, pgid) <= 0) {
794 struct rb_node *cur = rbp;
795 rbp = rb_next(rbp);
796 dout(" removed pg_temp %llx\n",
797 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
798 node)->pgid);
799 rb_erase(cur, &map->pg_temp);
800 }
801
802 if (pglen) {
803 /* insert */
804 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
805 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
806 if (!pg) {
807 err = -ENOMEM;
808 goto bad;
809 }
810 pg->pgid = pgid;
811 pg->len = pglen;
812 for (j = 0; j < pglen; j++)
813 pg->osds[j] = ceph_decode_32(p);
814 err = __insert_pg_mapping(pg, &map->pg_temp);
815 if (err)
816 goto bad;
817 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
818 pglen);
819 }
820 }
821 while (rbp) {
822 struct rb_node *cur = rbp;
823 rbp = rb_next(rbp);
824 dout(" removed pg_temp %llx\n",
825 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
826 node)->pgid);
827 rb_erase(cur, &map->pg_temp);
828 }
829
830 /* ignore the rest */
831 *p = end;
832 return map;
833
834bad:
835 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
836 epoch, (int)(*p - start), *p, start, end);
837 print_hex_dump(KERN_DEBUG, "osdmap: ",
838 DUMP_PREFIX_OFFSET, 16, 1,
839 start, end - start, true);
840 if (newcrush)
841 crush_destroy(newcrush);
842 return ERR_PTR(err);
843}
844
845
846
847
848/*
849 * calculate file layout from given offset, length.
 850 * fill in the correct object number, logical length, and object
 851 * extent offset, length.
852 *
853 * for now, we write only a single su, until we can
854 * pass a stride back to the caller.
855 */
856void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
857 u64 off, u64 *plen,
858 u64 *ono,
859 u64 *oxoff, u64 *oxlen)
860{
861 u32 osize = le32_to_cpu(layout->fl_object_size);
862 u32 su = le32_to_cpu(layout->fl_stripe_unit);
863 u32 sc = le32_to_cpu(layout->fl_stripe_count);
864 u32 bl, stripeno, stripepos, objsetno;
865 u32 su_per_object;
866 u64 t, su_offset;
867
868 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
869 osize, su);
870 su_per_object = osize / su;
871 dout("osize %u / su %u = su_per_object %u\n", osize, su,
872 su_per_object);
873
874 BUG_ON((su & ~PAGE_MASK) != 0);
875 /* bl = *off / su; */
876 t = off;
877 do_div(t, su);
878 bl = t;
879 dout("off %llu / su %u = bl %u\n", off, su, bl);
880
881 stripeno = bl / sc;
882 stripepos = bl % sc;
883 objsetno = stripeno / su_per_object;
884
885 *ono = objsetno * sc + stripepos;
886 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
887
888 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
889 t = off;
890 su_offset = do_div(t, su);
891 *oxoff = su_offset + (stripeno % su_per_object) * su;
892
893 /*
894 * Calculate the length of the extent being written to the selected
895 * object. This is the minimum of the full length requested (plen) or
896 * the remainder of the current stripe being written to.
897 */
898 *oxlen = min_t(u64, *plen, su - su_offset);
899 *plen = *oxlen;
900
901 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
902}
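/*
 * Worked example (hypothetical layout, not taken from a real pool):
 * su = 4 MB, stripe_count sc = 2, object_size = 8 MB, so
 * su_per_object = 2.  A 16 MB write at off = 13 MB gives bl = 3,
 * stripeno = 1, stripepos = 1, objsetno = 0, hence ono = 0*2+1 = 1;
 * su_offset = 1 MB, so oxoff = 1 MB + (1 % 2)*4 MB = 5 MB, and the
 * request is trimmed to oxlen = min(16 MB, 4 MB - 1 MB) = 3 MB.
 */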
903
904/*
905 * calculate an object layout (i.e. pgid) from an oid,
906 * file_layout, and osdmap
907 */
908int ceph_calc_object_layout(struct ceph_object_layout *ol,
909 const char *oid,
910 struct ceph_file_layout *fl,
911 struct ceph_osdmap *osdmap)
912{
913 unsigned num, num_mask;
914 struct ceph_pg pgid;
915 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
916 int poolid = le32_to_cpu(fl->fl_pg_pool);
917 struct ceph_pg_pool_info *pool;
918 unsigned ps;
919
920 BUG_ON(!osdmap);
921
922 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
923 if (!pool)
924 return -EIO;
925 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
926 if (preferred >= 0) {
927 ps += preferred;
928 num = le32_to_cpu(pool->v.lpg_num);
929 num_mask = pool->lpg_num_mask;
930 } else {
931 num = le32_to_cpu(pool->v.pg_num);
932 num_mask = pool->pg_num_mask;
933 }
934
935 pgid.ps = cpu_to_le16(ps);
936 pgid.preferred = cpu_to_le16(preferred);
937 pgid.pool = fl->fl_pg_pool;
938 if (preferred >= 0)
939 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
940 (int)preferred);
941 else
942 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
943
944 ol->ol_pgid = pgid;
945 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
946 return 0;
947}
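/*
 * Illustrative flow (made-up numbers): hashing oid "foo.00000001" with
 * the pool's object_hash might yield ps = 0x2f1e.  With no preferred
 * osd the pgid is then (pool, preferred = -1, ps), and calc_pg_raw()
 * below feeds ceph_stable_mod(ps, pgp_num, pgp_num_mask) + poolid into
 * CRUSH to obtain the osd set.
 */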
948
949/*
950 * Calculate raw osd vector for the given pgid. Return pointer to osd
951 * array, or NULL on failure.
952 */
953static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
954 int *osds, int *num)
955{
956 struct ceph_pg_mapping *pg;
957 struct ceph_pg_pool_info *pool;
958 int ruleno;
959 unsigned poolid, ps, pps;
960 int preferred;
961
962 /* pg_temp? */
963 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
964 if (pg) {
965 *num = pg->len;
966 return pg->osds;
967 }
968
969 /* crush */
970 poolid = le32_to_cpu(pgid.pool);
971 ps = le16_to_cpu(pgid.ps);
972 preferred = (s16)le16_to_cpu(pgid.preferred);
973
974 /* don't forcefeed bad device ids to crush */
975 if (preferred >= osdmap->max_osd ||
976 preferred >= osdmap->crush->max_devices)
977 preferred = -1;
978
979 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
980 if (!pool)
981 return NULL;
982 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
983 pool->v.type, pool->v.size);
984 if (ruleno < 0) {
985 pr_err("no crush rule pool %d type %d size %d\n",
986 poolid, pool->v.type, pool->v.size);
987 return NULL;
988 }
989
990 if (preferred >= 0)
991 pps = ceph_stable_mod(ps,
992 le32_to_cpu(pool->v.lpgp_num),
993 pool->lpgp_num_mask);
994 else
995 pps = ceph_stable_mod(ps,
996 le32_to_cpu(pool->v.pgp_num),
997 pool->pgp_num_mask);
998 pps += poolid;
999 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1000 min_t(int, pool->v.size, *num),
1001 preferred, osdmap->osd_weight);
1002 return osds;
1003}
1004
1005/*
1006 * Return primary osd for given pgid, or -1 if none.
1007 */
1008int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1009{
1010 int rawosds[10], *osds;
1011 int i, num = ARRAY_SIZE(rawosds);
1012
1013 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1014 if (!osds)
1015 return -1;
1016
1017 /* primary is first up osd */
1018 for (i = 0; i < num; i++)
 1019 if (ceph_osd_is_up(osdmap, osds[i]))
 1020 return osds[i];
1023 return -1;
1024}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..1fb55afb2642
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,125 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
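/*
 * For example, a client holding epoch 100 applies incrementals for
 * epochs 101, 102, 103 strictly in order (osdmap_apply_incremental()
 * insists on epoch+1), while a full map at any epoch simply replaces
 * the current one.
 */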
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26};
27
28struct ceph_pg_mapping {
29 struct rb_node node;
30 struct ceph_pg pgid;
31 int len;
32 int osds[];
33};
34
35struct ceph_osdmap {
36 struct ceph_fsid fsid;
37 u32 epoch;
38 u32 mkfs_epoch;
39 struct ceph_timespec created, modified;
40
41 u32 flags; /* CEPH_OSDMAP_* */
42
43 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
44 u8 *osd_state; /* CEPH_OSD_* */
45 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
46 struct ceph_entity_addr *osd_addr;
47
48 struct rb_root pg_temp;
49 struct rb_root pg_pools;
50 u32 pool_max;
51
52 /* the CRUSH map specifies the mapping of placement groups to
53 * the list of osds that store+replicate them. */
54 struct crush_map *crush;
55};
56
57/*
58 * file layout helpers
59 */
60#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
61#define ceph_file_layout_stripe_count(l) \
62 ((__s32)le32_to_cpu((l).fl_stripe_count))
63#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
64#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
65#define ceph_file_layout_object_su(l) \
66 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
67#define ceph_file_layout_pg_preferred(l) \
68 ((__s32)le32_to_cpu((l).fl_pg_preferred))
69#define ceph_file_layout_pg_pool(l) \
70 ((__s32)le32_to_cpu((l).fl_pg_pool))
71
72static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
73{
74 return le32_to_cpu(l->fl_stripe_unit) *
75 le32_to_cpu(l->fl_stripe_count);
76}
77
78/* "period" == bytes before i start on a new set of objects */
79static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
80{
81 return le32_to_cpu(l->fl_object_size) *
82 le32_to_cpu(l->fl_stripe_count);
83}
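/*
 * Example (hypothetical layout): stripe_unit = 1 MB, stripe_count = 4
 * and object_size = 4 MB give a stripe width of 4 MB (one su on each
 * of 4 objects) and a period of 16 MB (4 objects x 4 MB) before the
 * file moves on to a new object set.
 */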
84
85
86static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
87{
88 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
89}
90
91static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
92{
93 return map && (map->flags & flag);
94}
95
96extern char *ceph_osdmap_state_str(char *str, int len, int state);
97
98static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
99 int osd)
100{
101 if (osd >= map->max_osd)
102 return NULL;
103 return &map->osd_addr[osd];
104}
105
106extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
107extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
108 struct ceph_osdmap *map,
109 struct ceph_messenger *msgr);
110extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
111
112/* calculate mapping of a file extent to an object */
113extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
114 u64 off, u64 *plen,
115 u64 *bno, u64 *oxoff, u64 *oxlen);
116
117/* calculate mapping of object to a placement group */
118extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
119 const char *oid,
120 struct ceph_file_layout *fl,
121 struct ceph_osdmap *osdmap);
122extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
123 struct ceph_pg pgid);
124
125#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..5f8dbf7c745a
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,55 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8int ceph_pagelist_release(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail);
12 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page,
14 lru);
15 list_del(&page->lru);
16 __free_page(page);
17 }
18 return 0;
19}
20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{
23 struct page *page = alloc_page(GFP_NOFS);
24 if (!page)
25 return -ENOMEM;
26 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail);
30 pl->mapped_tail = kmap(page);
31 return 0;
32}
33
34int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
35{
36 while (pl->room < len) {
37 size_t bit = pl->room;
38 int ret;
39
40 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
41 buf, bit);
42 pl->length += bit;
43 pl->room -= bit;
44 buf += bit;
45 len -= bit;
46 ret = ceph_pagelist_addpage(pl);
47 if (ret)
48 return ret;
49 }
50
51 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
52 pl->length += len;
53 pl->room -= len;
54 return 0;
55}
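/*
 * Illustrative trace of the append loop above (hypothetical sizes):
 * appending 100 bytes to a pagelist with room = 40 copies 40 bytes
 * into the mapped tail page, maps a fresh page (room += PAGE_SIZE),
 * then falls out of the loop to copy the remaining 60 bytes.
 */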
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
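/*
 * Usage sketch (illustrative only; real callers check each return value
 * and the encoded values come from the protocol, not these made-up
 * ones):
 *
 *	struct ceph_pagelist pl;
 *
 *	ceph_pagelist_init(&pl);
 *	if (ceph_pagelist_encode_32(&pl, 1) ||
 *	    ceph_pagelist_encode_string(&pl, name, strlen(name)))
 *		goto fail;
 *	... hand pl.head / pl.length to the messenger ...
 *	ceph_pagelist_release(&pl);
 */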
53
54#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..26ac8b89a676
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,374 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 4
15#define CEPH_OSDMAP_VERSION 4
16
17/*
18 * fs id
19 */
20struct ceph_fsid {
21 unsigned char fsid[16];
22};
23
24static inline int ceph_fsid_compare(const struct ceph_fsid *a,
25 const struct ceph_fsid *b)
26{
27 return memcmp(a, b, sizeof(*a));
28}
29
30/*
31 * ino, object, etc.
32 */
33typedef __le64 ceph_snapid_t;
34#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
35#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
36#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
37
38struct ceph_timespec {
39 __le32 tv_sec;
40 __le32 tv_nsec;
41} __attribute__ ((packed));
42
43
44/*
45 * object layout - how objects are mapped into PGs
46 */
47#define CEPH_OBJECT_LAYOUT_HASH 1
48#define CEPH_OBJECT_LAYOUT_LINEAR 2
49#define CEPH_OBJECT_LAYOUT_HASHINO 3
50
51/*
52 * pg layout -- how PGs are mapped onto (sets of) OSDs
53 */
54#define CEPH_PG_LAYOUT_CRUSH 0
55#define CEPH_PG_LAYOUT_HASH 1
56#define CEPH_PG_LAYOUT_LINEAR 2
57#define CEPH_PG_LAYOUT_HYBRID 3
58
59
60/*
61 * placement group.
62 * we encode this into one __le64.
63 */
64struct ceph_pg {
65 __le16 preferred; /* preferred primary osd */
66 __le16 ps; /* placement seed */
67 __le32 pool; /* object pool */
68} __attribute__ ((packed));
69
70/*
71 * pg_pool is a set of pgs storing a pool of objects
72 *
73 * pg_num -- base number of pseudorandomly placed pgs
74 *
75 * pgp_num -- effective number when calculating pg placement. this
76 * is used for pg_num increases. new pgs result in data being "split"
 77 * into new pgs. for this to proceed smoothly, new pgs are initially
78 * colocated with their parents; that is, pgp_num doesn't increase
79 * until the new pgs have successfully split. only _then_ are the new
80 * pgs placed independently.
81 *
82 * lpg_num -- localized pg count (per device). replicas are randomly
83 * selected.
84 *
85 * lpgp_num -- as above.
86 */
87#define CEPH_PG_TYPE_REP 1
88#define CEPH_PG_TYPE_RAID4 2
89#define CEPH_PG_POOL_VERSION 2
90struct ceph_pg_pool {
91 __u8 type; /* CEPH_PG_TYPE_* */
92 __u8 size; /* number of osds in each pg */
93 __u8 crush_ruleset; /* crush placement rule */
94 __u8 object_hash; /* hash mapping object name to ps */
95 __le32 pg_num, pgp_num; /* number of pg's */
96 __le32 lpg_num, lpgp_num; /* number of localized pg's */
97 __le32 last_change; /* most recent epoch changed */
98 __le64 snap_seq; /* seq for per-pool snapshot */
99 __le32 snap_epoch; /* epoch of last snap */
100 __le32 num_snaps;
101 __le32 num_removed_snap_intervals;
102 __le64 uid;
103} __attribute__ ((packed));
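/*
 * Example of the pg_num/pgp_num dance described above: growing pg_num
 * from 8 to 16 splits each pg's objects across two pgs, but while
 * pgp_num is still 8 both halves keep mapping to the parent's osds;
 * raising pgp_num to 16 afterwards moves the new halves to independent
 * placements.
 */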
104
105/*
106 * stable_mod func is used to control number of placement groups.
107 * similar to straight-up modulo, but produces a stable mapping as b
108 * increases over time. b is the number of bins, and bmask is the
109 * containing power of 2 minus 1.
110 *
 111 * b <= bmask+1 and bmask=(2**n)-1
112 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
113 */
114static inline int ceph_stable_mod(int x, int b, int bmask)
115{
116 if ((x & bmask) < b)
117 return x & bmask;
118 else
119 return x & (bmask >> 1);
120}
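/*
 * Worked example: b = 12 bins, bmask = 15.  x = 5: 5 & 15 = 5 < 12, so
 * bin 5.  x = 13: 13 & 15 = 13 >= 12, so fall back to 13 & 7 = 5.  When
 * b later grows from 12 to 13, only inputs with (x & 15) == 12 change
 * bins; a plain x % b would reshuffle nearly everything.
 */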
121
122/*
123 * object layout - how a given object should be stored.
124 */
125struct ceph_object_layout {
126 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
127 __le32 ol_stripe_unit; /* for per-object parity, if any */
128} __attribute__ ((packed));
129
130/*
131 * compound epoch+version, used by storage layer to serialize mutations
132 */
133struct ceph_eversion {
134 __le32 epoch;
135 __le64 version;
136} __attribute__ ((packed));
137
138/*
139 * osd map bits
140 */
141
142/* status bits */
143#define CEPH_OSD_EXISTS 1
144#define CEPH_OSD_UP 2
145
146/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
147#define CEPH_OSD_IN 0x10000
148#define CEPH_OSD_OUT 0
149
150
151/*
152 * osd map flag bits
153 */
154#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
155#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
156#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
157#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
158#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
159
160/*
161 * osd ops
162 */
163#define CEPH_OSD_OP_MODE 0xf000
164#define CEPH_OSD_OP_MODE_RD 0x1000
165#define CEPH_OSD_OP_MODE_WR 0x2000
166#define CEPH_OSD_OP_MODE_RMW 0x3000
167#define CEPH_OSD_OP_MODE_SUB 0x4000
168
169#define CEPH_OSD_OP_TYPE 0x0f00
170#define CEPH_OSD_OP_TYPE_LOCK 0x0100
171#define CEPH_OSD_OP_TYPE_DATA 0x0200
172#define CEPH_OSD_OP_TYPE_ATTR 0x0300
173#define CEPH_OSD_OP_TYPE_EXEC 0x0400
174#define CEPH_OSD_OP_TYPE_PG 0x0500
175
176enum {
177 /** data **/
178 /* read */
179 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
180 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
181
182 /* fancy read */
183 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
184
185 /* write */
186 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
187 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
188 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
189 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
190 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
191
192 /* fancy write */
193 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
194 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
195 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
196 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
197
198 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
199 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
200 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
201
202 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
203
204 /** attrs **/
205 /* read */
206 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
207 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
208
209 /* write */
210 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
211 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
213 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
214
215 /** subop **/
216 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
217 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
218 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
219 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
220 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
221
222 /** lock **/
223 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
224 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
225 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
226 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
227 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
228 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
229
230 /** exec **/
231 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
232
233 /** pg **/
234 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
235};
236
237static inline int ceph_osd_op_type_lock(int op)
238{
239 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
240}
241static inline int ceph_osd_op_type_data(int op)
242{
243 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
244}
245static inline int ceph_osd_op_type_attr(int op)
246{
247 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
248}
249static inline int ceph_osd_op_type_exec(int op)
250{
251 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
252}
253static inline int ceph_osd_op_type_pg(int op)
254{
255 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
256}
257
258static inline int ceph_osd_op_mode_subop(int op)
259{
260 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
261}
262static inline int ceph_osd_op_mode_read(int op)
263{
264 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
265}
266static inline int ceph_osd_op_mode_modify(int op)
267{
268 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
269}
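/*
 * Example decomposition of the op encoding above:
 * CEPH_OSD_OP_READ = 0x1000 | 0x0200 | 1 = 0x1201, so both
 * ceph_osd_op_mode_read() and ceph_osd_op_type_data() are true for it;
 * CEPH_OSD_OP_SETXATTR = 0x2000 | 0x0300 | 1 = 0x2301 is a modify op
 * of the attr type.
 */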
270
271#define CEPH_OSD_TMAP_HDR 'h'
272#define CEPH_OSD_TMAP_SET 's'
273#define CEPH_OSD_TMAP_RM 'r'
274
275extern const char *ceph_osd_op_name(int op);
276
277
278/*
279 * osd op flags
280 *
281 * An op may be READ, WRITE, or READ|WRITE.
282 */
283enum {
284 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
285 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
286 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
287 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
288 CEPH_OSD_FLAG_READ = 16, /* op may read */
289 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
 290 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAPC if snapc is out of order */
291 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
292 CEPH_OSD_FLAG_BALANCE_READS = 256,
293 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
294 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
295 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
296};
297
298enum {
299 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
300};
301
 302#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc */
303#define EBLACKLISTED ESHUTDOWN /* blacklisted */
304
305/*
306 * an individual object operation. each may be accompanied by some data
307 * payload
308 */
309struct ceph_osd_op {
310 __le16 op; /* CEPH_OSD_OP_* */
311 __le32 flags; /* CEPH_OSD_FLAG_* */
312 union {
313 struct {
314 __le64 offset, length;
315 __le64 truncate_size;
316 __le32 truncate_seq;
317 } __attribute__ ((packed)) extent;
318 struct {
319 __le32 name_len;
320 __le32 value_len;
321 } __attribute__ ((packed)) xattr;
322 struct {
323 __u8 class_len;
324 __u8 method_len;
325 __u8 argc;
326 __le32 indata_len;
327 } __attribute__ ((packed)) cls;
328 struct {
329 __le64 cookie, count;
330 } __attribute__ ((packed)) pgls;
331 };
332 __le32 payload_len;
333} __attribute__ ((packed));
334
335/*
336 * osd request message header. each request may include multiple
337 * ceph_osd_op object operations.
338 */
339struct ceph_osd_request_head {
340 __le32 client_inc; /* client incarnation */
341 struct ceph_object_layout layout; /* pgid */
342 __le32 osdmap_epoch; /* client's osdmap epoch */
343
344 __le32 flags;
345
346 struct ceph_timespec mtime; /* for mutations only */
347 struct ceph_eversion reassert_version; /* if we are replaying op */
348
349 __le32 object_len; /* length of object name */
350
351 __le64 snapid; /* snapid to read */
352 __le64 snap_seq; /* writer's snap context */
353 __le32 num_snaps;
354
355 __le16 num_ops;
356 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
357} __attribute__ ((packed));
358
359struct ceph_osd_reply_head {
360 __le32 client_inc; /* client incarnation */
361 __le32 flags;
362 struct ceph_object_layout layout;
363 __le32 osdmap_epoch;
364 struct ceph_eversion reassert_version; /* for replaying uncommitted */
365
366 __le32 result; /* result code */
367
368 __le32 object_len; /* length of object name */
369 __le32 num_ops;
 370 struct ceph_osd_op ops[]; /* ops[], object */
371} __attribute__ ((packed));
372
373
374#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..e6f9bc57d472
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,907 @@
1#include "ceph_debug.h"
2
3#include <linux/sort.h>
4#include <linux/slab.h>
5
6#include "super.h"
7#include "decode.h"
8
9/*
10 * Snapshots in ceph are driven in large part by cooperation from the
11 * client. In contrast to local file systems or file servers that
12 * implement snapshots at a single point in the system, ceph's
13 * distributed access to storage requires clients to help decide
14 * whether a write logically occurs before or after a recently created
15 * snapshot.
16 *
 17 * This provides a perfect instantaneous client-wide snapshot. Between
18 * clients, however, snapshots may appear to be applied at slightly
19 * different points in time, depending on delays in delivering the
20 * snapshot notification.
21 *
22 * Snapshots are _not_ file system-wide. Instead, each snapshot
 23 * applies to the subtree nested beneath some directory. This
24 * effectively divides the hierarchy into multiple "realms," where all
25 * of the files contained by each realm share the same set of
26 * snapshots. An individual realm's snap set contains snapshots
27 * explicitly created on that realm, as well as any snaps in its
 28 * parent's snap set _after_ the point at which the parent became its
 29 * parent (due to, say, a rename). Similarly, snaps from prior parents
 30 * are included for the intervals during which they were the parent.
31 *
 32 * The client is spared most of this detail, fortunately... it need only
 33 * maintain a hierarchy of realms reflecting the current parent/child
34 * realm relationship, and for each realm has an explicit list of snaps
35 * inherited from prior parents.
36 *
37 * A snap_realm struct is maintained for realms containing every inode
38 * with an open cap in the system. (The needed snap realm information is
39 * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
40 * version number is used to ensure that as realm parameters change (new
41 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
42 *
43 * The realm hierarchy drives the generation of a 'snap context' for each
44 * realm, which simply lists the resulting set of snaps for the realm. This
45 * is attached to any writes sent to OSDs.
46 */
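/*
 * Concrete example of the rules above (hypothetical hierarchy): suppose
 * realm /a has snaps {1, 4} and directory /b is renamed to /a/b while
 * /a's snap seq is 3.  /a/b's realm then contains its own snaps, any of
 * /a's snaps taken at or after the rename (here snap 4), plus the snaps
 * it had already inherited from its previous parent.
 */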
47/*
48 * Unfortunately error handling is a bit mixed here. If we get a snap
49 * update, but don't have enough memory to update our realm hierarchy,
50 * it's not clear what we can do about it (besides complaining to the
51 * console).
52 */
53
54
55/*
56 * increase ref count for the realm
57 *
58 * caller must hold snap_rwsem for write.
59 */
60void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
61 struct ceph_snap_realm *realm)
62{
63 dout("get_realm %p %d -> %d\n", realm,
64 atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
65 /*
66 * since we _only_ increment realm refs or empty the empty
67 * list with snap_rwsem held, adjusting the empty list here is
68 * safe. we do need to protect against concurrent empty list
69 * additions, however.
70 */
71 if (atomic_read(&realm->nref) == 0) {
72 spin_lock(&mdsc->snap_empty_lock);
73 list_del_init(&realm->empty_item);
74 spin_unlock(&mdsc->snap_empty_lock);
75 }
76
77 atomic_inc(&realm->nref);
78}
79
80static void __insert_snap_realm(struct rb_root *root,
81 struct ceph_snap_realm *new)
82{
83 struct rb_node **p = &root->rb_node;
84 struct rb_node *parent = NULL;
85 struct ceph_snap_realm *r = NULL;
86
87 while (*p) {
88 parent = *p;
89 r = rb_entry(parent, struct ceph_snap_realm, node);
90 if (new->ino < r->ino)
91 p = &(*p)->rb_left;
92 else if (new->ino > r->ino)
93 p = &(*p)->rb_right;
94 else
95 BUG();
96 }
97
98 rb_link_node(&new->node, parent, p);
99 rb_insert_color(&new->node, root);
100}
101
102/*
103 * create and get the realm rooted at @ino and bump its ref count.
104 *
105 * caller must hold snap_rwsem for write.
106 */
107static struct ceph_snap_realm *ceph_create_snap_realm(
108 struct ceph_mds_client *mdsc,
109 u64 ino)
110{
111 struct ceph_snap_realm *realm;
112
113 realm = kzalloc(sizeof(*realm), GFP_NOFS);
114 if (!realm)
115 return ERR_PTR(-ENOMEM);
116
117 atomic_set(&realm->nref, 0); /* tree does not take a ref */
118 realm->ino = ino;
119 INIT_LIST_HEAD(&realm->children);
120 INIT_LIST_HEAD(&realm->child_item);
121 INIT_LIST_HEAD(&realm->empty_item);
122 INIT_LIST_HEAD(&realm->inodes_with_caps);
123 spin_lock_init(&realm->inodes_with_caps_lock);
124 __insert_snap_realm(&mdsc->snap_realms, realm);
125 dout("create_snap_realm %llx %p\n", realm->ino, realm);
126 return realm;
127}
128
129/*
130 * lookup the realm rooted at @ino.
131 *
132 * caller must hold snap_rwsem for write.
133 */
134struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
135 u64 ino)
136{
137 struct rb_node *n = mdsc->snap_realms.rb_node;
138 struct ceph_snap_realm *r;
139
140 while (n) {
141 r = rb_entry(n, struct ceph_snap_realm, node);
142 if (ino < r->ino)
143 n = n->rb_left;
144 else if (ino > r->ino)
145 n = n->rb_right;
146 else {
147 dout("lookup_snap_realm %llx %p\n", r->ino, r);
148 return r;
149 }
150 }
151 return NULL;
152}
153
154static void __put_snap_realm(struct ceph_mds_client *mdsc,
155 struct ceph_snap_realm *realm);
156
157/*
158 * called with snap_rwsem (write)
159 */
160static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
161 struct ceph_snap_realm *realm)
162{
163 dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
164
165 rb_erase(&realm->node, &mdsc->snap_realms);
166
167 if (realm->parent) {
168 list_del_init(&realm->child_item);
169 __put_snap_realm(mdsc, realm->parent);
170 }
171
172 kfree(realm->prior_parent_snaps);
173 kfree(realm->snaps);
174 ceph_put_snap_context(realm->cached_context);
175 kfree(realm);
176}
177
178/*
179 * caller holds snap_rwsem (write)
180 */
181static void __put_snap_realm(struct ceph_mds_client *mdsc,
182 struct ceph_snap_realm *realm)
183{
184 dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
185 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
186 if (atomic_dec_and_test(&realm->nref))
187 __destroy_snap_realm(mdsc, realm);
188}
189
190/*
191 * caller needn't hold any locks
192 */
193void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
194 struct ceph_snap_realm *realm)
195{
196 dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
197 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
198 if (!atomic_dec_and_test(&realm->nref))
199 return;
200
201 if (down_write_trylock(&mdsc->snap_rwsem)) {
202 __destroy_snap_realm(mdsc, realm);
203 up_write(&mdsc->snap_rwsem);
204 } else {
205 spin_lock(&mdsc->snap_empty_lock);
206 list_add(&mdsc->snap_empty, &realm->empty_item);
207 spin_unlock(&mdsc->snap_empty_lock);
208 }
209}
210
211/*
212 * Clean up any realms whose ref counts have dropped to zero. Note
 213 * that this does not include realms that were created but not yet
214 * used.
215 *
216 * Called under snap_rwsem (write)
217 */
218static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
219{
220 struct ceph_snap_realm *realm;
221
222 spin_lock(&mdsc->snap_empty_lock);
223 while (!list_empty(&mdsc->snap_empty)) {
224 realm = list_first_entry(&mdsc->snap_empty,
225 struct ceph_snap_realm, empty_item);
226 list_del(&realm->empty_item);
227 spin_unlock(&mdsc->snap_empty_lock);
228 __destroy_snap_realm(mdsc, realm);
229 spin_lock(&mdsc->snap_empty_lock);
230 }
231 spin_unlock(&mdsc->snap_empty_lock);
232}
233
234void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
235{
236 down_write(&mdsc->snap_rwsem);
237 __cleanup_empty_realms(mdsc);
238 up_write(&mdsc->snap_rwsem);
239}
240
241/*
242 * adjust the parent realm of a given @realm. adjust child list, and parent
243 * pointers, and ref counts appropriately.
244 *
 245 * return 1 if the parent was changed, 0 if unchanged, <0 on error.
246 *
247 * caller must hold snap_rwsem for write.
248 */
249static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
250 struct ceph_snap_realm *realm,
251 u64 parentino)
252{
253 struct ceph_snap_realm *parent;
254
255 if (realm->parent_ino == parentino)
256 return 0;
257
258 parent = ceph_lookup_snap_realm(mdsc, parentino);
259 if (!parent) {
260 parent = ceph_create_snap_realm(mdsc, parentino);
261 if (IS_ERR(parent))
262 return PTR_ERR(parent);
263 }
264 dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
265 realm->ino, realm, realm->parent_ino, realm->parent,
266 parentino, parent);
267 if (realm->parent) {
268 list_del_init(&realm->child_item);
269 ceph_put_snap_realm(mdsc, realm->parent);
270 }
271 realm->parent_ino = parentino;
272 realm->parent = parent;
273 ceph_get_snap_realm(mdsc, parent);
274 list_add(&realm->child_item, &parent->children);
275 return 1;
276}
277
278
279static int cmpu64_rev(const void *a, const void *b)
280{
281 if (*(u64 *)a < *(u64 *)b)
282 return 1;
283 if (*(u64 *)a > *(u64 *)b)
284 return -1;
285 return 0;
286}
287
288/*
289 * build the snap context for a given realm.
290 */
291static int build_snap_context(struct ceph_snap_realm *realm)
292{
293 struct ceph_snap_realm *parent = realm->parent;
294 struct ceph_snap_context *snapc;
295 int err = 0;
296 int i;
297 int num = realm->num_prior_parent_snaps + realm->num_snaps;
298
299 /*
300 * build parent context, if it hasn't been built.
301 * conservatively estimate that all parent snaps might be
302 * included by us.
303 */
304 if (parent) {
305 if (!parent->cached_context) {
306 err = build_snap_context(parent);
307 if (err)
308 goto fail;
309 }
310 num += parent->cached_context->num_snaps;
311 }
312
313 /* do i actually need to update? not if my context seq
 314 matches the realm seq, and my parent's does too. (this works
 315 because rebuild_snap_realms() works _downward_ in the
 316 hierarchy after each update.) */
317 if (realm->cached_context &&
318 realm->cached_context->seq == realm->seq &&
319 (!parent ||
320 realm->cached_context->seq >= parent->cached_context->seq)) {
321 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
322 " (unchanged)\n",
323 realm->ino, realm, realm->cached_context,
324 realm->cached_context->seq,
325 realm->cached_context->num_snaps);
326 return 0;
327 }
328
329 /* alloc new snap context */
330 err = -ENOMEM;
331 if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
332 goto fail;
333 snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
334 if (!snapc)
335 goto fail;
336 atomic_set(&snapc->nref, 1);
337
338 /* build (reverse sorted) snap vector */
339 num = 0;
340 snapc->seq = realm->seq;
341 if (parent) {
 342 /* include any of parent's snaps occurring _after_ my
343 parent became my parent */
344 for (i = 0; i < parent->cached_context->num_snaps; i++)
345 if (parent->cached_context->snaps[i] >=
346 realm->parent_since)
347 snapc->snaps[num++] =
348 parent->cached_context->snaps[i];
349 if (parent->cached_context->seq > snapc->seq)
350 snapc->seq = parent->cached_context->seq;
351 }
352 memcpy(snapc->snaps + num, realm->snaps,
353 sizeof(u64)*realm->num_snaps);
354 num += realm->num_snaps;
355 memcpy(snapc->snaps + num, realm->prior_parent_snaps,
356 sizeof(u64)*realm->num_prior_parent_snaps);
357 num += realm->num_prior_parent_snaps;
358
359 sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
360 snapc->num_snaps = num;
361 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
362 realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
363
364 if (realm->cached_context)
365 ceph_put_snap_context(realm->cached_context);
366 realm->cached_context = snapc;
367 return 0;
368
369fail:
370 /*
371 * if we fail, clear old (incorrect) cached_context... hopefully
372 * we'll have better luck building it later
373 */
374 if (realm->cached_context) {
375 ceph_put_snap_context(realm->cached_context);
376 realm->cached_context = NULL;
377 }
378 pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
379 realm, err);
380 return err;
381}
382
383/*
384 * rebuild snap context for the given realm and all of its children.
385 */
386static void rebuild_snap_realms(struct ceph_snap_realm *realm)
387{
388 struct ceph_snap_realm *child;
389
390 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
391 build_snap_context(realm);
392
393 list_for_each_entry(child, &realm->children, child_item)
394 rebuild_snap_realms(child);
395}
396
397
398/*
399 * helper to allocate and decode an array of snapids. free prior
400 * instance, if any.
401 */
402static int dup_array(u64 **dst, __le64 *src, int num)
403{
404 int i;
405
406 kfree(*dst);
407 if (num) {
408 *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
409 if (!*dst)
410 return -ENOMEM;
411 for (i = 0; i < num; i++)
412 (*dst)[i] = get_unaligned_le64(src + i);
413 } else {
414 *dst = NULL;
415 }
416 return 0;
417}
418
419
420/*
421 * When a snapshot is applied, the size/mtime inode metadata is queued
422 * in a ceph_cap_snap (one for each snapshot) until writeback
423 * completes and the metadata can be flushed back to the MDS.
424 *
425 * However, if a (sync) write is currently in-progress when we apply
426 * the snapshot, we have to wait until the write succeeds or fails
 427 * (and a final size/mtime is known). In this case we set
 428 * cap_snap->writing = 1, and the cap_snap is said to be "pending." When
 429 * the write finishes, we call __ceph_finish_cap_snap().
430 *
431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
432 * change).
433 */
434void ceph_queue_cap_snap(struct ceph_inode_info *ci,
435 struct ceph_snap_context *snapc)
436{
437 struct inode *inode = &ci->vfs_inode;
438 struct ceph_cap_snap *capsnap;
439 int used;
440
441 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
442 if (!capsnap) {
443 pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
444 return;
445 }
446
447 spin_lock(&inode->i_lock);
448 used = __ceph_caps_used(ci);
449 if (__ceph_have_pending_cap_snap(ci)) {
450 /* there is no point in queuing multiple "pending" cap_snaps,
451 as no new writes are allowed to start when pending, so any
452 writes in progress now were started before the previous
453 cap_snap. lucky us. */
454 dout("queue_cap_snap %p snapc %p seq %llu used %d"
455 " already pending\n", inode, snapc, snapc->seq, used);
456 kfree(capsnap);
457 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
458 igrab(inode);
459
460 atomic_set(&capsnap->nref, 1);
461 capsnap->ci = ci;
462 INIT_LIST_HEAD(&capsnap->ci_item);
463 INIT_LIST_HEAD(&capsnap->flushing_item);
464
465 capsnap->follows = snapc->seq - 1;
466 capsnap->context = ceph_get_snap_context(snapc);
467 capsnap->issued = __ceph_caps_issued(ci, NULL);
468 capsnap->dirty = __ceph_caps_dirty(ci);
469
470 capsnap->mode = inode->i_mode;
471 capsnap->uid = inode->i_uid;
472 capsnap->gid = inode->i_gid;
473
474 /* fixme? */
475 capsnap->xattr_blob = NULL;
476 capsnap->xattr_len = 0;
477
478 /* dirty page count moved from _head to this cap_snap;
479 all subsequent page dirties occur _after_ this
480 snapshot. */
481 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
482 ci->i_wrbuffer_ref_head = 0;
483 ceph_put_snap_context(ci->i_head_snapc);
484 ci->i_head_snapc = NULL;
485 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
486
487 if (used & CEPH_CAP_FILE_WR) {
488 dout("queue_cap_snap %p cap_snap %p snapc %p"
489 " seq %llu used WR, now pending\n", inode,
490 capsnap, snapc, snapc->seq);
491 capsnap->writing = 1;
492 } else {
493 /* note mtime, size NOW. */
494 __ceph_finish_cap_snap(ci, capsnap);
495 }
496 } else {
497 dout("queue_cap_snap %p nothing dirty|writing\n", inode);
498 kfree(capsnap);
499 }
500
501 spin_unlock(&inode->i_lock);
502}
503
504/*
505 * Finalize the size and mtime for a cap_snap; that is, settle on final values
506 * to be used for the snapshot, to be flushed back to the mds.
507 *
508 * If capsnap can now be flushed, add to snap_flush list, and return 1.
509 *
510 * Caller must hold i_lock.
511 */
512int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
513 struct ceph_cap_snap *capsnap)
514{
515 struct inode *inode = &ci->vfs_inode;
516 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
517
518 BUG_ON(capsnap->writing);
519 capsnap->size = inode->i_size;
520 capsnap->mtime = inode->i_mtime;
521 capsnap->atime = inode->i_atime;
522 capsnap->ctime = inode->i_ctime;
523 capsnap->time_warp_seq = ci->i_time_warp_seq;
524 if (capsnap->dirty_pages) {
525 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
526 "still has %d dirty pages\n", inode, capsnap,
527 capsnap->context, capsnap->context->seq,
528 capsnap->size, capsnap->dirty_pages);
529 return 0;
530 }
531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
532 inode, capsnap, capsnap->context,
533 capsnap->context->seq, capsnap->size);
534
535 spin_lock(&mdsc->snap_flush_lock);
536 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
537 spin_unlock(&mdsc->snap_flush_lock);
538 return 1; /* caller may want to ceph_flush_snaps */
539}
540
541
542/*
543 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
544 * the snap realm parameters for a given realm and all of its ancestors,
545 * up to the root.
546 *
547 * Caller must hold snap_rwsem for write.
548 */
549int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
550 void *p, void *e, bool deletion)
551{
552 struct ceph_mds_snap_realm *ri; /* encoded */
553 __le64 *snaps; /* encoded */
554 __le64 *prior_parent_snaps; /* encoded */
555 struct ceph_snap_realm *realm;
556 int invalidate = 0;
557 int err = -ENOMEM;
558
559 dout("update_snap_trace deletion=%d\n", deletion);
560more:
561 ceph_decode_need(&p, e, sizeof(*ri), bad);
562 ri = p;
563 p += sizeof(*ri);
564 ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
565 le32_to_cpu(ri->num_prior_parent_snaps)), bad);
566 snaps = p;
567 p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
568 prior_parent_snaps = p;
569 p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
570
571 realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
572 if (!realm) {
573 realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
574 if (IS_ERR(realm)) {
575 err = PTR_ERR(realm);
576 goto fail;
577 }
578 }
579
580 if (le64_to_cpu(ri->seq) > realm->seq) {
581 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
582 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
583 /*
584 * if the realm seq has changed, queue a cap_snap for every
585 * inode with open caps. we do this _before_ we update
586 * the realm info so that we prepare for writeback under the
587 * _previous_ snap context.
588 *
589 * ...unless it's a snap deletion!
590 */
591 if (!deletion) {
592 struct ceph_inode_info *ci;
593 struct inode *lastinode = NULL;
594
595 spin_lock(&realm->inodes_with_caps_lock);
596 list_for_each_entry(ci, &realm->inodes_with_caps,
597 i_snap_realm_item) {
598 struct inode *inode = igrab(&ci->vfs_inode);
599 if (!inode)
600 continue;
601 spin_unlock(&realm->inodes_with_caps_lock);
602 if (lastinode)
603 iput(lastinode);
604 lastinode = inode;
605 ceph_queue_cap_snap(ci, realm->cached_context);
606 spin_lock(&realm->inodes_with_caps_lock);
607 }
608 spin_unlock(&realm->inodes_with_caps_lock);
609 if (lastinode)
610 iput(lastinode);
611 dout("update_snap_trace cap_snaps queued\n");
612 }
613
614 } else {
615 dout("update_snap_trace %llx %p seq %lld unchanged\n",
616 realm->ino, realm, realm->seq);
617 }
618
619 /* ensure the parent is correct */
620 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
621 if (err < 0)
622 goto fail;
623 invalidate += err;
624
625 if (le64_to_cpu(ri->seq) > realm->seq) {
626 /* update realm parameters, snap lists */
627 realm->seq = le64_to_cpu(ri->seq);
628 realm->created = le64_to_cpu(ri->created);
629 realm->parent_since = le64_to_cpu(ri->parent_since);
630
631 realm->num_snaps = le32_to_cpu(ri->num_snaps);
632 err = dup_array(&realm->snaps, snaps, realm->num_snaps);
633 if (err < 0)
634 goto fail;
635
636 realm->num_prior_parent_snaps =
637 le32_to_cpu(ri->num_prior_parent_snaps);
638 err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
639 realm->num_prior_parent_snaps);
640 if (err < 0)
641 goto fail;
642
643 invalidate = 1;
644 } else if (!realm->cached_context) {
645 invalidate = 1;
646 }
647
648 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
649 realm, invalidate, p, e);
650
651 if (p < e)
652 goto more;
653
654 /* invalidate when we reach the _end_ (root) of the trace */
655 if (invalidate)
656 rebuild_snap_realms(realm);
657
658 __cleanup_empty_realms(mdsc);
659 return 0;
660
661bad:
662 err = -EINVAL;
663fail:
664 pr_err("update_snap_trace error %d\n", err);
665 return err;
666}
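
The decode loop above follows a defensive pattern worth noting: every
read from the [p, e) buffer is preceded by a ceph_decode_need() length
check, and the loop repeats until the cursor reaches the end of the
trace, at which point the accumulated invalidate flag triggers a single
downward rebuild. A userspace sketch of that bounds-checked walk
follows; the record layout here (a u32 length plus payload) is made up
for illustration, whereas the real records are a struct
ceph_mds_snap_realm followed by two snapid arrays.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* walk [p, e), validating each record's length before consuming it */
static int decode_records(const unsigned char *p, const unsigned char *e)
{
	while (p < e) {
		uint32_t len;

		if (e - p < 4)			/* truncated header: "bad" */
			return -1;
		memcpy(&len, p, 4);		/* host-endian, sketch only */
		p += 4;
		if ((uint32_t)(e - p) < len)	/* truncated payload */
			return -1;
		printf("record of %u bytes\n", len);
		p += len;
	}
	return 0;				/* consumed exactly [p, e) */
}

int main(void)
{
	unsigned char buf[11];
	uint32_t len = 3;

	memcpy(buf, &len, 4);			/* a 3-byte record */
	memcpy(buf + 4, "abc", 3);
	len = 0;
	memcpy(buf + 7, &len, 4);		/* a second, empty record */
	return decode_records(buf, buf + sizeof(buf)) ? 1 : 0;
}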
667
668
669/*
670 * Send any cap_snaps that are queued for flush. Try to carry
671 * s_mutex across multiple snap flushes to avoid locking overhead.
672 *
673 * Caller holds no locks.
674 */
675static void flush_snaps(struct ceph_mds_client *mdsc)
676{
677 struct ceph_inode_info *ci;
678 struct inode *inode;
679 struct ceph_mds_session *session = NULL;
680
681 dout("flush_snaps\n");
682 spin_lock(&mdsc->snap_flush_lock);
683 while (!list_empty(&mdsc->snap_flush_list)) {
684 ci = list_first_entry(&mdsc->snap_flush_list,
685 struct ceph_inode_info, i_snap_flush_item);
686 inode = &ci->vfs_inode;
687 igrab(inode);
688 spin_unlock(&mdsc->snap_flush_lock);
689 spin_lock(&inode->i_lock);
690 __ceph_flush_snaps(ci, &session);
691 spin_unlock(&inode->i_lock);
692 iput(inode);
693 spin_lock(&mdsc->snap_flush_lock);
694 }
695 spin_unlock(&mdsc->snap_flush_lock);
696
697 if (session) {
698 mutex_unlock(&session->s_mutex);
699 ceph_put_mds_session(session);
700 }
701 dout("flush_snaps done\n");
702}
703
704
705/*
706 * Handle a snap notification from the MDS.
707 *
708 * This can take two basic forms: the simplest is just a snap creation
709 * or deletion notification on an existing realm. This should update the
710 * realm and its children.
711 *
712 * The more difficult case is realm creation, due to snap creation at a
713 * new point in the file hierarchy, or due to a rename that moves a file or
714 * directory into another realm.
715 */
716void ceph_handle_snap(struct ceph_mds_client *mdsc,
717 struct ceph_mds_session *session,
718 struct ceph_msg *msg)
719{
720 struct super_block *sb = mdsc->client->sb;
721 int mds = session->s_mds;
722 u64 split;
723 int op;
724 int trace_len;
725 struct ceph_snap_realm *realm = NULL;
726 void *p = msg->front.iov_base;
727 void *e = p + msg->front.iov_len;
728 struct ceph_mds_snap_head *h;
729 int num_split_inos, num_split_realms;
730 __le64 *split_inos = NULL, *split_realms = NULL;
731 int i;
732 int locked_rwsem = 0;
733
734 /* decode */
735 if (msg->front.iov_len < sizeof(*h))
736 goto bad;
737 h = p;
738 op = le32_to_cpu(h->op);
739 split = le64_to_cpu(h->split); /* non-zero if we are splitting an
740 * existing realm */
741 num_split_inos = le32_to_cpu(h->num_split_inos);
742 num_split_realms = le32_to_cpu(h->num_split_realms);
743 trace_len = le32_to_cpu(h->trace_len);
744 p += sizeof(*h);
745
746 dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
747 ceph_snap_op_name(op), split, trace_len);
748
749 mutex_lock(&session->s_mutex);
750 session->s_seq++;
751 mutex_unlock(&session->s_mutex);
752
753 down_write(&mdsc->snap_rwsem);
754 locked_rwsem = 1;
755
756 if (op == CEPH_SNAP_OP_SPLIT) {
757 struct ceph_mds_snap_realm *ri;
758
759 /*
760 * A "split" breaks part of an existing realm off into
761 * a new realm. The MDS provides a list of inodes
762 * (with caps) and child realms that belong to the new
763 * child.
764 */
765 split_inos = p;
766 p += sizeof(u64) * num_split_inos;
767 split_realms = p;
768 p += sizeof(u64) * num_split_realms;
769 ceph_decode_need(&p, e, sizeof(*ri), bad);
770 /* we will peek at realm info here, but will _not_
771 * advance p, as the realm update will occur below in
772 * ceph_update_snap_trace. */
773 ri = p;
774
775 realm = ceph_lookup_snap_realm(mdsc, split);
776 if (!realm) {
777 realm = ceph_create_snap_realm(mdsc, split);
778 if (IS_ERR(realm))
779 goto out;
780 }
781 ceph_get_snap_realm(mdsc, realm);
782
783 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
784 for (i = 0; i < num_split_inos; i++) {
785 struct ceph_vino vino = {
786 .ino = le64_to_cpu(split_inos[i]),
787 .snap = CEPH_NOSNAP,
788 };
789 struct inode *inode = ceph_find_inode(sb, vino);
790 struct ceph_inode_info *ci;
791
792 if (!inode)
793 continue;
794 ci = ceph_inode(inode);
795
796 spin_lock(&inode->i_lock);
797 if (!ci->i_snap_realm)
798 goto skip_inode;
799 /*
800 * If this inode belongs to a realm that was
801 * created after our new realm, we experienced
802 * a race (due to another split notification
803 * arriving from a different MDS). So skip
804 * this inode.
805 */
806 if (ci->i_snap_realm->created >
807 le64_to_cpu(ri->created)) {
808 dout(" leaving %p in newer realm %llx %p\n",
809 inode, ci->i_snap_realm->ino,
810 ci->i_snap_realm);
811 goto skip_inode;
812 }
813 dout(" will move %p to split realm %llx %p\n",
814 inode, realm->ino, realm);
815 /*
816 * Remove the inode from the realm's inode
817 * list, but don't add it to the new realm
818 * yet. We don't want the cap_snap to be
819 * queued (again) by ceph_update_snap_trace()
820 * below. Queue it _now_, under the old context.
821 */
822 spin_lock(&realm->inodes_with_caps_lock);
823 list_del_init(&ci->i_snap_realm_item);
824 spin_unlock(&realm->inodes_with_caps_lock);
825 spin_unlock(&inode->i_lock);
826
827 ceph_queue_cap_snap(ci,
828 ci->i_snap_realm->cached_context);
829
830 iput(inode);
831 continue;
832
833skip_inode:
834 spin_unlock(&inode->i_lock);
835 iput(inode);
836 }
837
838 /* we may have taken some of the old realm's children. */
839 for (i = 0; i < num_split_realms; i++) {
840 struct ceph_snap_realm *child =
841 ceph_lookup_snap_realm(mdsc,
842 le64_to_cpu(split_realms[i]));
843 if (!child)
844 continue;
845 adjust_snap_realm_parent(mdsc, child, realm->ino);
846 }
847 }
848
849 /*
850 * update using the provided snap trace. if we are deleting a
851 * snap, we can avoid queueing cap_snaps.
852 */
853 ceph_update_snap_trace(mdsc, p, e,
854 op == CEPH_SNAP_OP_DESTROY);
855
856 if (op == CEPH_SNAP_OP_SPLIT) {
857 /*
858 * ok, _now_ add the inodes into the new realm.
859 */
860 for (i = 0; i < num_split_inos; i++) {
861 struct ceph_vino vino = {
862 .ino = le64_to_cpu(split_inos[i]),
863 .snap = CEPH_NOSNAP,
864 };
865 struct inode *inode = ceph_find_inode(sb, vino);
866 struct ceph_inode_info *ci;
867
868 if (!inode)
869 continue;
870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock);
872 if (!ci->i_snap_realm)
873 goto split_skip_inode;
874 ceph_put_snap_realm(mdsc, ci->i_snap_realm);
875 spin_lock(&realm->inodes_with_caps_lock);
876 list_add(&ci->i_snap_realm_item,
877 &realm->inodes_with_caps);
878 ci->i_snap_realm = realm;
879 spin_unlock(&realm->inodes_with_caps_lock);
880 ceph_get_snap_realm(mdsc, realm);
881split_skip_inode:
882 spin_unlock(&inode->i_lock);
883 iput(inode);
884 }
885
886 /* we took a reference when we created the realm, above */
887 ceph_put_snap_realm(mdsc, realm);
888 }
889
890 __cleanup_empty_realms(mdsc);
891
892 up_write(&mdsc->snap_rwsem);
893
894 flush_snaps(mdsc);
895 return;
896
897bad:
898 pr_err("corrupt snap message from mds%d\n", mds);
899 ceph_msg_dump(msg);
900out:
901 if (locked_rwsem)
902 up_write(&mdsc->snap_rwsem);
903 return;
904}
905
906
907
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..75d02eaa1279
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1031 @@
1
2#include "ceph_debug.h"
3
4#include <linux/backing-dev.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/seq_file.h>
14#include <linux/slab.h>
15#include <linux/statfs.h>
16#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19
20#include "decode.h"
21#include "super.h"
22#include "mon_client.h"
23#include "auth.h"
24
25/*
26 * Ceph superblock operations
27 *
28 * Handle the basics of mounting, unmounting.
29 */
30
31
32/*
33 * find filename portion of a path (/foo/bar/baz -> baz)
34 */
35const char *ceph_file_part(const char *s, int len)
36{
37 const char *e = s + len;
38
39 while (e != s && *(e-1) != '/')
40 e--;
41 return e;
42}
43
44
45/*
46 * super ops
47 */
48static void ceph_put_super(struct super_block *s)
49{
50 struct ceph_client *cl = ceph_client(s);
51
52 dout("put_super\n");
53 ceph_mdsc_close_sessions(&cl->mdsc);
54 return;
55}
56
57static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
58{
59 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
60 struct ceph_monmap *monmap = client->monc.monmap;
61 struct ceph_statfs st;
62 u64 fsid;
63 int err;
64
65 dout("statfs\n");
66 err = ceph_monc_do_statfs(&client->monc, &st);
67 if (err < 0)
68 return err;
69
70 /* fill in kstatfs */
71 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
72
73 /*
74 * express utilization in terms of large blocks to avoid
75 * overflow on 32-bit machines.
76 */
77 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
78 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
79 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
80 (CEPH_BLOCK_SHIFT-10);
81 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
82
83 buf->f_files = le64_to_cpu(st.num_objects);
84 buf->f_ffree = -1;
85 buf->f_namelen = PATH_MAX;
86 buf->f_frsize = PAGE_CACHE_SIZE;
87
88 /* leave fsid little-endian, regardless of host endianness */
89 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
90 buf->f_fsid.val[0] = fsid & 0xffffffff;
91 buf->f_fsid.val[1] = fsid >> 32;
92
93 return 0;
94}
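
The shift arithmetic above converts the monitor's KB counts into
CEPH_BLOCK_SHIFT-sized (1 MB) blocks so that f_blocks stays small even
for very large volumes; since 1 MB = 1 KB << 10, dividing by
2^(CEPH_BLOCK_SHIFT - 10) performs the KB-to-block conversion. A quick
sketch of the arithmetic (the 5 TB figure is only an example;
CEPH_BLOCK_SHIFT is defined in super.h later in this patch):

#include <stdint.h>
#include <stdio.h>

#define CEPH_BLOCK_SHIFT 20	/* 1 MB blocks, as in super.h */

int main(void)
{
	uint64_t kb = 5ULL << 30;	/* 5 * 2^30 KB == 5 TB */
	uint64_t blocks = kb >> (CEPH_BLOCK_SHIFT - 10);

	/* 5 TB in 1 MB blocks is ~5.2M, comfortably below 2^32 */
	printf("%llu MB-sized blocks\n", (unsigned long long)blocks);
	return 0;
}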
95
96
97static int ceph_syncfs(struct super_block *sb, int wait)
98{
99 dout("sync_fs %d\n", wait);
100 ceph_osdc_sync(&ceph_client(sb)->osdc);
101 ceph_mdsc_sync(&ceph_client(sb)->mdsc);
102 dout("sync_fs %d done\n", wait);
103 return 0;
104}
105
106
107/**
108 * ceph_show_options - Show mount options in /proc/mounts
109 * @m: seq_file to write to
110 * @mnt: mount descriptor
111 */
112static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
113{
114 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
115 struct ceph_mount_args *args = client->mount_args;
116
117 if (args->flags & CEPH_OPT_FSID)
118 seq_printf(m, ",fsidmajor=%llu,fsidminor=%llu",
119 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
120 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
121 if (args->flags & CEPH_OPT_NOSHARE)
122 seq_puts(m, ",noshare");
123 if (args->flags & CEPH_OPT_DIRSTAT)
124 seq_puts(m, ",dirstat");
125 if ((args->flags & CEPH_OPT_RBYTES) == 0)
126 seq_puts(m, ",norbytes");
127 if (args->flags & CEPH_OPT_NOCRC)
128 seq_puts(m, ",nocrc");
129 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
130 seq_puts(m, ",noasyncreaddir");
131 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
132 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
133 if (args->name)
134 seq_printf(m, ",name=%s", args->name);
135 if (args->secret)
136 seq_puts(m, ",secret=<hidden>");
137 return 0;
138}
139
140/*
141 * caches
142 */
143struct kmem_cache *ceph_inode_cachep;
144struct kmem_cache *ceph_cap_cachep;
145struct kmem_cache *ceph_dentry_cachep;
146struct kmem_cache *ceph_file_cachep;
147
148static void ceph_inode_init_once(void *foo)
149{
150 struct ceph_inode_info *ci = foo;
151 inode_init_once(&ci->vfs_inode);
152}
153
154static int default_congestion_kb(void)
155{
156 int congestion_kb;
157
158 /*
159 * Copied from NFS
160 *
161 * congestion size, scale with available memory.
162 *
163 * 64MB: 8192k
164 * 128MB: 11585k
165 * 256MB: 16384k
166 * 512MB: 23170k
167 * 1GB: 32768k
168 * 2GB: 46340k
169 * 4GB: 65536k
170 * 8GB: 92681k
171 * 16GB: 131072k
172 *
173 * This allows larger machines to have larger/more transfers.
174 * Limit the default to 256M
175 */
176 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
177 if (congestion_kb > 256*1024)
178 congestion_kb = 256*1024;
179
180 return congestion_kb;
181}
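
As a sanity check of the table above, the sketch below reproduces the
scaling in userspace; a naive integer square root stands in for the
kernel's int_sqrt(), and 4 KB pages (PAGE_SHIFT == 12) are assumed.
Off-by-one differences against the table (11584k vs 11585k, etc.) are
just integer-sqrt rounding.

#include <stdio.h>

/* naive integer square root, standing in for the kernel's int_sqrt() */
static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long mb;

	for (mb = 64; mb <= 16384; mb *= 2) {
		unsigned long pages = mb << (20 - 12);	/* 4 KB pages */
		unsigned long kb = (16 * isqrt(pages)) << (12 - 10);

		if (kb > 256 * 1024)
			kb = 256 * 1024;	/* cap the default at 256 MB */
		printf("%6luMB: %luk\n", mb, kb);
	}
	return 0;
}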
182
183static int __init init_caches(void)
184{
185 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
186 sizeof(struct ceph_inode_info),
187 __alignof__(struct ceph_inode_info),
188 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
189 ceph_inode_init_once);
190 if (ceph_inode_cachep == NULL)
191 return -ENOMEM;
192
193 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
194 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
195 if (ceph_cap_cachep == NULL)
196 goto bad_cap;
197
198 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
199 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
200 if (ceph_dentry_cachep == NULL)
201 goto bad_dentry;
202
203 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
204 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
205 if (ceph_file_cachep == NULL)
206 goto bad_file;
207
208 return 0;
209
210bad_file:
211 kmem_cache_destroy(ceph_dentry_cachep);
212bad_dentry:
213 kmem_cache_destroy(ceph_cap_cachep);
214bad_cap:
215 kmem_cache_destroy(ceph_inode_cachep);
216 return -ENOMEM;
217}
218
219static void destroy_caches(void)
220{
221 kmem_cache_destroy(ceph_inode_cachep);
222 kmem_cache_destroy(ceph_cap_cachep);
223 kmem_cache_destroy(ceph_dentry_cachep);
224 kmem_cache_destroy(ceph_file_cachep);
225}
226
227
228/*
229 * ceph_umount_begin - initiate forced umount. Tear down the
230 * mount, skipping steps that may hang while waiting for server(s).
231 */
232static void ceph_umount_begin(struct super_block *sb)
233{
234 struct ceph_client *client = ceph_sb_to_client(sb);
235
236 dout("ceph_umount_begin - starting forced umount\n");
237 if (!client)
238 return;
239 client->mount_state = CEPH_MOUNT_SHUTDOWN;
240 return;
241}
242
243static const struct super_operations ceph_super_ops = {
244 .alloc_inode = ceph_alloc_inode,
245 .destroy_inode = ceph_destroy_inode,
246 .write_inode = ceph_write_inode,
247 .sync_fs = ceph_syncfs,
248 .put_super = ceph_put_super,
249 .show_options = ceph_show_options,
250 .statfs = ceph_statfs,
251 .umount_begin = ceph_umount_begin,
252};
253
254
255const char *ceph_msg_type_name(int type)
256{
257 switch (type) {
258 case CEPH_MSG_SHUTDOWN: return "shutdown";
259 case CEPH_MSG_PING: return "ping";
260 case CEPH_MSG_AUTH: return "auth";
261 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
262 case CEPH_MSG_MON_MAP: return "mon_map";
263 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
264 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
265 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
266 case CEPH_MSG_STATFS: return "statfs";
267 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
268 case CEPH_MSG_MDS_MAP: return "mds_map";
269 case CEPH_MSG_CLIENT_SESSION: return "client_session";
270 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
271 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
272 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
273 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
274 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
275 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
276 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
277 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
278 case CEPH_MSG_OSD_MAP: return "osd_map";
279 case CEPH_MSG_OSD_OP: return "osd_op";
280 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
281 default: return "unknown";
282 }
283}
284
285
286/*
287 * mount options
288 */
289enum {
290 Opt_fsidmajor,
291 Opt_fsidminor,
292 Opt_monport,
293 Opt_wsize,
294 Opt_rsize,
295 Opt_osdtimeout,
296 Opt_osdkeepalivetimeout,
297 Opt_mount_timeout,
298 Opt_osd_idle_ttl,
299 Opt_caps_wanted_delay_min,
300 Opt_caps_wanted_delay_max,
301 Opt_readdir_max_entries,
302 Opt_congestion_kb,
303 Opt_last_int,
304 /* int args above */
305 Opt_snapdirname,
306 Opt_name,
307 Opt_secret,
308 Opt_last_string,
309 /* string args above */
310 Opt_ip,
311 Opt_noshare,
312 Opt_dirstat,
313 Opt_nodirstat,
314 Opt_rbytes,
315 Opt_norbytes,
316 Opt_nocrc,
317 Opt_noasyncreaddir,
318};
319
320static match_table_t arg_tokens = {
321 {Opt_fsidmajor, "fsidmajor=%ld"},
322 {Opt_fsidminor, "fsidminor=%ld"},
323 {Opt_monport, "monport=%d"},
324 {Opt_wsize, "wsize=%d"},
325 {Opt_rsize, "rsize=%d"},
326 {Opt_osdtimeout, "osdtimeout=%d"},
327 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
328 {Opt_mount_timeout, "mount_timeout=%d"},
329 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
330 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
331 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
332 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
333 {Opt_congestion_kb, "write_congestion_kb=%d"},
334 /* int args above */
335 {Opt_snapdirname, "snapdirname=%s"},
336 {Opt_name, "name=%s"},
337 {Opt_secret, "secret=%s"},
338 /* string args above */
339 {Opt_ip, "ip=%s"},
340 {Opt_noshare, "noshare"},
341 {Opt_dirstat, "dirstat"},
342 {Opt_nodirstat, "nodirstat"},
343 {Opt_rbytes, "rbytes"},
344 {Opt_norbytes, "norbytes"},
345 {Opt_nocrc, "nocrc"},
346 {Opt_noasyncreaddir, "noasyncreaddir"},
347 {-1, NULL}
348};
349
350
351static struct ceph_mount_args *parse_mount_args(int flags, char *options,
352 const char *dev_name,
353 const char **path)
354{
355 struct ceph_mount_args *args;
356 const char *c;
357 int err = -ENOMEM;
358 substring_t argstr[MAX_OPT_ARGS];
359
360 args = kzalloc(sizeof(*args), GFP_KERNEL);
361 if (!args)
362 return ERR_PTR(-ENOMEM);
363 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
364 GFP_KERNEL);
365 if (!args->mon_addr)
366 goto out;
367
368 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
369
370 /* start with defaults */
371 args->sb_flags = flags;
372 args->flags = CEPH_OPT_DEFAULT;
373 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
374 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
375 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
376 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
377 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
378 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
379 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
380 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
381 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
382 args->max_readdir = 1024;
383 args->congestion_kb = default_congestion_kb();
384
385 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
386 err = -EINVAL;
387 if (!dev_name)
388 goto out;
389 *path = strstr(dev_name, ":/");
390 if (*path == NULL) {
391 pr_err("device name is missing path (no :/ in %s)\n",
392 dev_name);
393 goto out;
394 }
395
396 /* get mon ip(s) */
397 err = ceph_parse_ips(dev_name, *path, args->mon_addr,
398 CEPH_MAX_MON, &args->num_mon);
399 if (err < 0)
400 goto out;
401
402 /* path on server */
403 *path += 2;
404 dout("server path '%s'\n", *path);
405
406 /* parse mount options */
407 while ((c = strsep(&options, ",")) != NULL) {
408 int token, intval, ret;
409 if (!*c)
410 continue;
411 err = -EINVAL;
412 token = match_token((char *)c, arg_tokens, argstr);
413 if (token < 0) {
414 pr_err("bad mount option at '%s'\n", c);
415 goto out;
416 }
417 if (token < Opt_last_int) {
418 ret = match_int(&argstr[0], &intval);
419 if (ret < 0) {
420 pr_err("bad mount option arg (not int) "
421 "at '%s'\n", c);
422 continue;
423 }
424 dout("got int token %d val %d\n", token, intval);
425 } else if (token > Opt_last_int && token < Opt_last_string) {
426 dout("got string token %d val %s\n", token,
427 argstr[0].from);
428 } else {
429 dout("got token %d\n", token);
430 }
431 switch (token) {
432 case Opt_fsidmajor:
433 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
434 break;
435 case Opt_fsidminor:
436 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
437 break;
438 case Opt_ip:
439 err = ceph_parse_ips(argstr[0].from,
440 argstr[0].to,
441 &args->my_addr,
442 1, NULL);
443 if (err < 0)
444 goto out;
445 args->flags |= CEPH_OPT_MYIP;
446 break;
447
448 case Opt_snapdirname:
449 kfree(args->snapdir_name);
450 args->snapdir_name = kstrndup(argstr[0].from,
451 argstr[0].to-argstr[0].from,
452 GFP_KERNEL);
453 break;
454 case Opt_name:
455 args->name = kstrndup(argstr[0].from,
456 argstr[0].to-argstr[0].from,
457 GFP_KERNEL);
458 break;
459 case Opt_secret:
460 args->secret = kstrndup(argstr[0].from,
461 argstr[0].to-argstr[0].from,
462 GFP_KERNEL);
463 break;
464
465 /* misc */
466 case Opt_wsize:
467 args->wsize = intval;
468 break;
469 case Opt_rsize:
470 args->rsize = intval;
471 break;
472 case Opt_osdtimeout:
473 args->osd_timeout = intval;
474 break;
475 case Opt_osdkeepalivetimeout:
476 args->osd_keepalive_timeout = intval;
477 break;
478 case Opt_mount_timeout:
479 args->mount_timeout = intval;
480 break;
481 case Opt_caps_wanted_delay_min:
482 args->caps_wanted_delay_min = intval;
483 break;
484 case Opt_caps_wanted_delay_max:
485 args->caps_wanted_delay_max = intval;
486 break;
487 case Opt_readdir_max_entries:
488 args->max_readdir = intval;
489 break;
490 case Opt_congestion_kb:
491 args->congestion_kb = intval;
492 break;
493
494 case Opt_noshare:
495 args->flags |= CEPH_OPT_NOSHARE;
496 break;
497
498 case Opt_dirstat:
499 args->flags |= CEPH_OPT_DIRSTAT;
500 break;
501 case Opt_nodirstat:
502 args->flags &= ~CEPH_OPT_DIRSTAT;
503 break;
504 case Opt_rbytes:
505 args->flags |= CEPH_OPT_RBYTES;
506 break;
507 case Opt_norbytes:
508 args->flags &= ~CEPH_OPT_RBYTES;
509 break;
510 case Opt_nocrc:
511 args->flags |= CEPH_OPT_NOCRC;
512 break;
513 case Opt_noasyncreaddir:
514 args->flags |= CEPH_OPT_NOASYNCREADDIR;
515 break;
516
517 default:
518 BUG_ON(token);
519 }
520 }
521 return args;
522
523out:
524 kfree(args->mon_addr);
525 kfree(args);
526 return ERR_PTR(err);
527}
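
To make the control flow above concrete: the device string supplies the
monitor addresses and server path, while the option string is split on
commas with strsep() and classified against arg_tokens, integer tokens
(below Opt_last_int) carrying a match_int() argument and string tokens a
substring_t. A hedged userspace sketch of that strsep() walk follows,
with sscanf() approximating match_token()/match_int(); the option names
follow arg_tokens, but this shows only the parsing shape, not the mount
API.

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* sample option string, as would arrive via mount -o */
	char options[] = "rsize=524288,norbytes,snapdirname=.snapshots";
	char *rest = options, *c;

	while ((c = strsep(&rest, ",")) != NULL) {
		int intval;
		char strval[64];

		if (!*c)
			continue;	/* skip empty ",," tokens */
		if (sscanf(c, "rsize=%d", &intval) == 1)
			printf("int option rsize = %d\n", intval);
		else if (sscanf(c, "snapdirname=%63s", strval) == 1)
			printf("string option snapdirname = %s\n", strval);
		else
			printf("flag option '%s'\n", c);	/* e.g. norbytes */
	}
	return 0;
}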
528
529static void destroy_mount_args(struct ceph_mount_args *args)
530{
531 dout("destroy_mount_args %p\n", args);
532 kfree(args->snapdir_name);
533 args->snapdir_name = NULL;
534 kfree(args->name);
535 args->name = NULL;
536 kfree(args->secret);
537 args->secret = NULL;
538 kfree(args);
539}
540
541/*
542 * create a fresh client instance
543 */
544static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
545{
546 struct ceph_client *client;
547 int err = -ENOMEM;
548
549 client = kzalloc(sizeof(*client), GFP_KERNEL);
550 if (client == NULL)
551 return ERR_PTR(-ENOMEM);
552
553 mutex_init(&client->mount_mutex);
554
555 init_waitqueue_head(&client->auth_wq);
556
557 client->sb = NULL;
558 client->mount_state = CEPH_MOUNT_MOUNTING;
559 client->mount_args = args;
560
561 client->msgr = NULL;
562
563 client->auth_err = 0;
564 atomic_long_set(&client->writeback_count, 0);
565
566 err = bdi_init(&client->backing_dev_info);
567 if (err < 0)
568 goto fail;
569
570 err = -ENOMEM;
571 client->wb_wq = create_workqueue("ceph-writeback");
572 if (client->wb_wq == NULL)
573 goto fail_bdi;
574 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
575 if (client->pg_inv_wq == NULL)
576 goto fail_wb_wq;
577 client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
578 if (client->trunc_wq == NULL)
579 goto fail_pg_inv_wq;
580
581 /* set up mempools */
582 err = -ENOMEM;
583 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
584 client->mount_args->wsize >> PAGE_CACHE_SHIFT);
585 if (!client->wb_pagevec_pool)
586 goto fail_trunc_wq;
587
588 /* caps */
589 client->min_caps = args->max_readdir;
590 ceph_adjust_min_caps(client->min_caps);
591
592 /* subsystems */
593 err = ceph_monc_init(&client->monc, client);
594 if (err < 0)
595 goto fail_mempool;
596 err = ceph_osdc_init(&client->osdc, client);
597 if (err < 0)
598 goto fail_monc;
599 err = ceph_mdsc_init(&client->mdsc, client);
600 if (err < 0)
601 goto fail_osdc;
602 return client;
603
604fail_osdc:
605 ceph_osdc_stop(&client->osdc);
606fail_monc:
607 ceph_monc_stop(&client->monc);
608fail_mempool:
609 mempool_destroy(client->wb_pagevec_pool);
610fail_trunc_wq:
611 destroy_workqueue(client->trunc_wq);
612fail_pg_inv_wq:
613 destroy_workqueue(client->pg_inv_wq);
614fail_wb_wq:
615 destroy_workqueue(client->wb_wq);
616fail_bdi:
617 bdi_destroy(&client->backing_dev_info);
618fail:
619 kfree(client);
620 return ERR_PTR(err);
621}
622
623static void ceph_destroy_client(struct ceph_client *client)
624{
625 dout("destroy_client %p\n", client);
626
627 /* unmount */
628 ceph_mdsc_stop(&client->mdsc);
629 ceph_monc_stop(&client->monc);
630 ceph_osdc_stop(&client->osdc);
631
632 ceph_adjust_min_caps(-client->min_caps);
633
634 ceph_debugfs_client_cleanup(client);
635 destroy_workqueue(client->wb_wq);
636 destroy_workqueue(client->pg_inv_wq);
637 destroy_workqueue(client->trunc_wq);
638
639 if (client->msgr)
640 ceph_messenger_destroy(client->msgr);
641 mempool_destroy(client->wb_pagevec_pool);
642
643 destroy_mount_args(client->mount_args);
644
645 kfree(client);
646 dout("destroy_client %p done\n", client);
647}
648
649/*
650 * Initially learn our fsid, or verify an fsid matches.
651 */
652int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
653{
654 if (client->have_fsid) {
655 if (ceph_fsid_compare(&client->fsid, fsid)) {
656 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
657 PR_FSID(&client->fsid), PR_FSID(fsid));
658 return -1;
659 }
660 } else {
661 pr_info("client%lld fsid " FSID_FORMAT "\n",
662 client->monc.auth->global_id, PR_FSID(fsid));
663 memcpy(&client->fsid, fsid, sizeof(*fsid));
664 ceph_debugfs_client_init(client);
665 client->have_fsid = true;
666 }
667 return 0;
668}
669
670/*
671 * true if we have the mon map (and have thus joined the cluster)
672 */
673static int have_mon_map(struct ceph_client *client)
674{
675 return client->monc.monmap && client->monc.monmap->epoch;
676}
677
678/*
679 * Bootstrap mount by opening the root directory. Note the mount
680 * @started time from caller, and time out if this takes too long.
681 */
682static struct dentry *open_root_dentry(struct ceph_client *client,
683 const char *path,
684 unsigned long started)
685{
686 struct ceph_mds_client *mdsc = &client->mdsc;
687 struct ceph_mds_request *req = NULL;
688 int err;
689 struct dentry *root;
690
691 /* open dir */
692 dout("open_root_inode opening '%s'\n", path);
693 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
694 if (IS_ERR(req))
695 return ERR_CAST(req);
696 req->r_path1 = kstrdup(path, GFP_NOFS);
697 req->r_ino1.ino = CEPH_INO_ROOT;
698 req->r_ino1.snap = CEPH_NOSNAP;
699 req->r_started = started;
700 req->r_timeout = client->mount_args->mount_timeout * HZ;
701 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
702 req->r_num_caps = 2;
703 err = ceph_mdsc_do_request(mdsc, NULL, req);
704 if (err == 0) {
705 dout("open_root_inode success\n");
706 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
707 client->sb->s_root == NULL)
708 root = d_alloc_root(req->r_target_inode);
709 else
710 root = d_obtain_alias(req->r_target_inode);
711 req->r_target_inode = NULL;
712 dout("open_root_inode success, root dentry is %p\n", root);
713 } else {
714 root = ERR_PTR(err);
715 }
716 ceph_mdsc_put_request(req);
717 return root;
718}
719
720/*
721 * mount: join the ceph cluster, and open root directory.
722 */
723static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
724 const char *path)
725{
726 struct ceph_entity_addr *myaddr = NULL;
727 int err;
728 unsigned long timeout = client->mount_args->mount_timeout * HZ;
729 unsigned long started = jiffies; /* note the start time */
730 struct dentry *root;
731
732 dout("mount start\n");
733 mutex_lock(&client->mount_mutex);
734
735 /* initialize the messenger */
736 if (client->msgr == NULL) {
737 if (ceph_test_opt(client, MYIP))
738 myaddr = &client->mount_args->my_addr;
739 client->msgr = ceph_messenger_create(myaddr);
740 if (IS_ERR(client->msgr)) {
741 err = PTR_ERR(client->msgr);
742 client->msgr = NULL;
743 goto out;
744 }
745 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
746 }
747
748 /* open session, and wait for mon, mds, and osd maps */
749 err = ceph_monc_open_session(&client->monc);
750 if (err < 0)
751 goto out;
752
753 while (!have_mon_map(client)) {
754 err = -EIO;
755 if (timeout && time_after_eq(jiffies, started + timeout))
756 goto out;
757
758 /* wait */
759 dout("mount waiting for mon_map\n");
760 err = wait_event_interruptible_timeout(client->auth_wq,
761 have_mon_map(client) || (client->auth_err < 0),
762 timeout);
763 if (err == -EINTR || err == -ERESTARTSYS)
764 goto out;
765 if (client->auth_err < 0) {
766 err = client->auth_err;
767 goto out;
768 }
769 }
770
771 dout("mount opening root\n");
772 root = open_root_dentry(client, "", started);
773 if (IS_ERR(root)) {
774 err = PTR_ERR(root);
775 goto out;
776 }
777 if (client->sb->s_root)
778 dput(root);
779 else
780 client->sb->s_root = root;
781
782 if (path[0] == 0) {
783 dget(root);
784 } else {
785 dout("mount opening base mountpoint\n");
786 root = open_root_dentry(client, path, started);
787 if (IS_ERR(root)) {
788 err = PTR_ERR(root);
789 dput(client->sb->s_root);
790 client->sb->s_root = NULL;
791 goto out;
792 }
793 }
794
795 mnt->mnt_root = root;
796 mnt->mnt_sb = client->sb;
797
798 client->mount_state = CEPH_MOUNT_MOUNTED;
799 dout("mount success\n");
800 err = 0;
801
802out:
803 mutex_unlock(&client->mount_mutex);
804 return err;
805}
806
807static int ceph_set_super(struct super_block *s, void *data)
808{
809 struct ceph_client *client = data;
810 int ret;
811
812 dout("set_super %p data %p\n", s, data);
813
814 s->s_flags = client->mount_args->sb_flags;
815 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
816
817 s->s_fs_info = client;
818 client->sb = s;
819
820 s->s_op = &ceph_super_ops;
821 s->s_export_op = &ceph_export_ops;
822
823 s->s_time_gran = 1000; /* 1000 ns == 1 us */
824
825 ret = set_anon_super(s, NULL); /* what is that second arg for? */
826 if (ret != 0)
827 goto fail;
828
829 return ret;
830
831fail:
832 s->s_fs_info = NULL;
833 client->sb = NULL;
834 return ret;
835}
836
837/*
838 * share superblock if same fs AND options
839 */
840static int ceph_compare_super(struct super_block *sb, void *data)
841{
842 struct ceph_client *new = data;
843 struct ceph_mount_args *args = new->mount_args;
844 struct ceph_client *other = ceph_sb_to_client(sb);
845 int i;
846
847 dout("ceph_compare_super %p\n", sb);
848 if (args->flags & CEPH_OPT_FSID) {
849 if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
850 dout("fsid doesn't match\n");
851 return 0;
852 }
853 } else {
854 /* do we share (a) monitor? */
855 for (i = 0; i < new->monc.monmap->num_mon; i++)
856 if (ceph_monmap_contains(other->monc.monmap,
857 &new->monc.monmap->mon_inst[i].addr))
858 break;
859 if (i == new->monc.monmap->num_mon) {
860 dout("mon ip not part of monmap\n");
861 return 0;
862 }
863 dout("mon ip matches existing sb %p\n", sb);
864 }
865 if (args->sb_flags != other->mount_args->sb_flags) {
866 dout("flags differ\n");
867 return 0;
868 }
869 return 1;
870}
871
872/*
873 * construct our own bdi so we can control readahead, etc.
874 */
875static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
876{
877 int err;
878
879 sb->s_bdi = &client->backing_dev_info;
880
881 /* set ra_pages based on rsize mount option? */
882 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
883 client->backing_dev_info.ra_pages =
884 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
885 >> PAGE_SHIFT;
886 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
887 return err;
888}
889
890static int ceph_get_sb(struct file_system_type *fs_type,
891 int flags, const char *dev_name, void *data,
892 struct vfsmount *mnt)
893{
894 struct super_block *sb;
895 struct ceph_client *client;
896 int err;
897 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
898 const char *path = NULL;
899 struct ceph_mount_args *args;
900
901 dout("ceph_get_sb\n");
902 args = parse_mount_args(flags, data, dev_name, &path);
903 if (IS_ERR(args)) {
904 err = PTR_ERR(args);
905 goto out_final;
906 }
907
908 /* create client (which we may/may not use) */
909 client = ceph_create_client(args);
910 if (IS_ERR(client)) {
911 err = PTR_ERR(client);
912 goto out_final;
913 }
914
915 if (client->mount_args->flags & CEPH_OPT_NOSHARE)
916 compare_super = NULL;
917 sb = sget(fs_type, compare_super, ceph_set_super, client);
918 if (IS_ERR(sb)) {
919 err = PTR_ERR(sb);
920 goto out;
921 }
922
923 if (ceph_client(sb) != client) {
924 ceph_destroy_client(client);
925 client = ceph_client(sb);
926 dout("get_sb got existing client %p\n", client);
927 } else {
928 dout("get_sb using new client %p\n", client);
929 err = ceph_register_bdi(sb, client);
930 if (err < 0)
931 goto out_splat;
932 }
933
934 err = ceph_mount(client, mnt, path);
935 if (err < 0)
936 goto out_splat;
937 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
938 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
939 return 0;
940
941out_splat:
942 ceph_mdsc_close_sessions(&client->mdsc);
943 up_write(&sb->s_umount);
944 deactivate_super(sb);
945 goto out_final;
946
947out:
948 ceph_destroy_client(client);
949out_final:
950 dout("ceph_get_sb fail %d\n", err);
951 return err;
952}
953
954static void ceph_kill_sb(struct super_block *s)
955{
956 struct ceph_client *client = ceph_sb_to_client(s);
957 dout("kill_sb %p\n", s);
958 ceph_mdsc_pre_umount(&client->mdsc);
959 kill_anon_super(s); /* will call put_super after sb is r/o */
960 if (s->s_bdi == &client->backing_dev_info)
961 bdi_unregister(&client->backing_dev_info);
962 bdi_destroy(&client->backing_dev_info);
963 ceph_destroy_client(client);
964}
965
966static struct file_system_type ceph_fs_type = {
967 .owner = THIS_MODULE,
968 .name = "ceph",
969 .get_sb = ceph_get_sb,
970 .kill_sb = ceph_kill_sb,
971 .fs_flags = FS_RENAME_DOES_D_MOVE,
972};
973
974#define _STRINGIFY(x) #x
975#define STRINGIFY(x) _STRINGIFY(x)
976
977static int __init init_ceph(void)
978{
979 int ret = 0;
980
981 ret = ceph_debugfs_init();
982 if (ret < 0)
983 goto out;
984
985 ret = ceph_msgr_init();
986 if (ret < 0)
987 goto out_debugfs;
988
989 ret = init_caches();
990 if (ret)
991 goto out_msgr;
992
993 ceph_caps_init();
994
995 ret = register_filesystem(&ceph_fs_type);
996 if (ret)
997 goto out_icache;
998
999 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
1000 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
1001 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
1002 return 0;
1003
1004out_icache:
1005 destroy_caches();
1006out_msgr:
1007 ceph_msgr_exit();
1008out_debugfs:
1009 ceph_debugfs_cleanup();
1010out:
1011 return ret;
1012}
1013
1014static void __exit exit_ceph(void)
1015{
1016 dout("exit_ceph\n");
1017 unregister_filesystem(&ceph_fs_type);
1018 ceph_caps_finalize();
1019 destroy_caches();
1020 ceph_msgr_exit();
1021 ceph_debugfs_cleanup();
1022}
1023
1024module_init(init_ceph);
1025module_exit(exit_ceph);
1026
1027MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1028MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1029MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1030MODULE_DESCRIPTION("Ceph filesystem for Linux");
1031MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..ca702c67bc66
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,902 @@
1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15#include <linux/slab.h>
16
17#include "types.h"
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24
25/* f_type in struct statfs */
26#define CEPH_SUPER_MAGIC 0x00c36400
27
28/* large granularity for statfs utilization stats to facilitate
29 * large volume sizes on 32-bit machines. */
30#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32
33/*
34 * mount options
35 */
36#define CEPH_OPT_FSID (1<<0)
37#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
38#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
39#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
40#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
41#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
42#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
43
44#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
45
46#define ceph_set_opt(client, opt) \
47 ((client)->mount_args->flags |= CEPH_OPT_##opt)
48#define ceph_test_opt(client, opt) \
49 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
50
51
52struct ceph_mount_args {
53 int sb_flags;
54 int num_mon;
55 struct ceph_entity_addr *mon_addr;
56 int flags;
57 int mount_timeout;
58 int osd_idle_ttl;
59 int caps_wanted_delay_min, caps_wanted_delay_max;
60 struct ceph_fsid fsid;
61 struct ceph_entity_addr my_addr;
62 int wsize;
63 int rsize; /* max readahead */
64 int max_readdir; /* max readdir size */
65 int congestion_kb; /* max writeback congestion (kb) */
66 int osd_timeout;
67 int osd_keepalive_timeout;
68 char *snapdir_name; /* default ".snap" */
69 char *name;
70 char *secret;
71 int cap_release_safety;
72};
73
74/*
75 * defaults
76 */
77#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
78#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
79#define CEPH_OSD_KEEPALIVE_DEFAULT 5
80#define CEPH_OSD_IDLE_TTL_DEFAULT 60
81#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
82
83#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
84#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
85
86#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
87#define CEPH_AUTH_NAME_DEFAULT "guest"
88
89/*
90 * Delay telling the MDS we no longer want caps, in case we reopen
91 * the file. Delay a minimum amount of time, even if we send a cap
92 * message for some other reason. Otherwise, take the opportunity to
93 * update the mds to avoid sending another message later.
94 */
95#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
96#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
97
98
99/* mount state */
100enum {
101 CEPH_MOUNT_MOUNTING,
102 CEPH_MOUNT_MOUNTED,
103 CEPH_MOUNT_UNMOUNTING,
104 CEPH_MOUNT_UNMOUNTED,
105 CEPH_MOUNT_SHUTDOWN,
106};
107
108/*
109 * subtract jiffies
110 */
111static inline unsigned long time_sub(unsigned long a, unsigned long b)
112{
113 BUG_ON(time_after(b, a));
114 return (long)a - (long)b;
115}
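
time_sub() subtracts as signed longs and guards with the wrap-safe
time_after(); the signed subtraction yields the correct interval even
after the jiffies counter wraps, provided the real interval fits in a
long. A tiny sketch (the counter values are made up):

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long before = ULONG_MAX - 5;	/* just before the wrap */
	unsigned long after = 10;		/* 16 ticks later, wrapped */

	/* unsigned wraparound cancels in the signed subtraction */
	printf("elapsed = %ld ticks\n", (long)after - (long)before);
	return 0;
}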
116
117/*
118 * per-filesystem client state
119 *
120 * possibly shared by multiple mount points, if they are
121 * mounting the same ceph filesystem/cluster.
122 */
123struct ceph_client {
124 struct ceph_fsid fsid;
125 bool have_fsid;
126
127 struct mutex mount_mutex; /* serialize mount attempts */
128 struct ceph_mount_args *mount_args;
129
130 struct super_block *sb;
131
132 unsigned long mount_state;
133 wait_queue_head_t auth_wq;
134
135 int auth_err;
136
137 int min_caps; /* min caps i added */
138
139 struct ceph_messenger *msgr; /* messenger instance */
140 struct ceph_mon_client monc;
141 struct ceph_mds_client mdsc;
142 struct ceph_osd_client osdc;
143
144 /* writeback */
145 mempool_t *wb_pagevec_pool;
146 struct workqueue_struct *wb_wq;
147 struct workqueue_struct *pg_inv_wq;
148 struct workqueue_struct *trunc_wq;
149 atomic_long_t writeback_count;
150
151 struct backing_dev_info backing_dev_info;
152
153#ifdef CONFIG_DEBUG_FS
154 struct dentry *debugfs_monmap;
155 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
156 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
157 struct dentry *debugfs_congestion_kb;
158 struct dentry *debugfs_bdi;
159#endif
160};
161
162static inline struct ceph_client *ceph_client(struct super_block *sb)
163{
164 return sb->s_fs_info;
165}
166
167
168/*
169 * File i/o capability. This tracks shared state with the metadata
170 * server that allows us to cache or writeback attributes or to read
171 * and write data. For any given inode, we should have one or more
172 * capabilities, one issued by each metadata server, and our
173 * cumulative access is the OR of all issued capabilities.
174 *
175 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
176 * session capability lists.
177 */
178struct ceph_cap {
179 struct ceph_inode_info *ci;
180 struct rb_node ci_node; /* per-ci cap tree */
181 struct ceph_mds_session *session;
182 struct list_head session_caps; /* per-session caplist */
183 int mds;
184 u64 cap_id; /* unique cap id (mds provided) */
185 int issued; /* latest, from the mds */
186 int implemented; /* implemented superset of issued (for revocation) */
187 int mds_wanted;
188 u32 seq, issue_seq, mseq;
189 u32 cap_gen; /* active/stale cycle */
190 unsigned long last_used;
191 struct list_head caps_item;
192};
193
194#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
195#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
196#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
197
198/*
199 * Snapped cap state that is pending flush to mds. When a snapshot occurs,
200 * we first complete any in-process sync writes and writeback any dirty
201 * data before flushing the snapped state (tracked here) back to the MDS.
202 */
203struct ceph_cap_snap {
204 atomic_t nref;
205 struct ceph_inode_info *ci;
206 struct list_head ci_item, flushing_item;
207
208 u64 follows, flush_tid;
209 int issued, dirty;
210 struct ceph_snap_context *context;
211
212 mode_t mode;
213 uid_t uid;
214 gid_t gid;
215
216 void *xattr_blob;
217 int xattr_len;
218 u64 xattr_version;
219
220 u64 size;
221 struct timespec mtime, atime, ctime;
222 u64 time_warp_seq;
223 int writing; /* a sync write is still in progress */
224 int dirty_pages; /* dirty pages awaiting writeback */
225};
226
227static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
228{
229 if (atomic_dec_and_test(&capsnap->nref))
230 kfree(capsnap);
231}
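
ceph_put_cap_snap() is the release half of the nref refcount set up in
ceph_queue_cap_snap(): the holder that drops the count to zero frees the
structure. A userspace sketch of the same pattern with C11 atomics,
where atomic_fetch_sub() returning the old value plays the role of
atomic_dec_and_test():

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int nref;
};

static void obj_put(struct obj *o)
{
	/* fetch_sub returns the old value; old == 1 means we were last */
	if (atomic_fetch_sub(&o->nref, 1) == 1) {
		printf("freeing %p\n", (void *)o);
		free(o);
	}
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return 1;
	atomic_init(&o->nref, 2);	/* two holders */
	obj_put(o);			/* 2 -> 1, still live */
	obj_put(o);			/* 1 -> 0, freed */
	return 0;
}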
232
233/*
234 * The frag tree describes how a directory is fragmented, potentially across
235 * multiple metadata servers. It is also used to indicate points where
236 * metadata authority is delegated, and whether/where metadata is replicated.
237 *
238 * A _leaf_ frag will be present in the i_fragtree IFF there is
239 * delegation info. That is, if mds >= 0 || ndist > 0.
240 */
241#define CEPH_MAX_DIRFRAG_REP 4
242
243struct ceph_inode_frag {
244 struct rb_node node;
245
246 /* fragtree state */
247 u32 frag;
248 int split_by; /* i.e. 2^(split_by) children */
249
250 /* delegation and replication info */
251 int mds; /* -1 if same authority as parent */
252 int ndist; /* >0 if replicated */
253 int dist[CEPH_MAX_DIRFRAG_REP];
254};
255
256/*
257 * We cache inode xattrs as an encoded blob until they are first used,
258 * at which point we parse them into an rbtree.
259 */
260struct ceph_inode_xattr {
261 struct rb_node node;
262
263 const char *name;
264 int name_len;
265 const char *val;
266 int val_len;
267 int dirty;
268
269 int should_free_name;
270 int should_free_val;
271};
272
273struct ceph_inode_xattrs_info {
274 /*
275 * (still encoded) xattr blob. we avoid the overhead of parsing
276 * this until someone actually calls getxattr, etc.
277 *
278 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
279 * NULL means we don't know.
280 */
281 struct ceph_buffer *blob, *prealloc_blob;
282
283 struct rb_root index;
284 bool dirty;
285 int count;
286 int names_size;
287 int vals_size;
288 u64 version, index_version;
289};
290
291/*
292 * Ceph inode.
293 */
294#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
295#define CEPH_I_NODELAY 4 /* do not delay cap release */
296#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
297#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
298
299struct ceph_inode_info {
300 struct ceph_vino i_vino; /* ceph ino + snap */
301
302 u64 i_version;
303 u32 i_time_warp_seq;
304
305 unsigned i_ceph_flags;
306 unsigned long i_release_count;
307
308 struct ceph_file_layout i_layout;
309 char *i_symlink;
310
311 /* for dirs */
312 struct timespec i_rctime;
313 u64 i_rbytes, i_rfiles, i_rsubdirs;
314 u64 i_files, i_subdirs;
315 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
316
317 struct rb_root i_fragtree;
318 struct mutex i_fragtree_mutex;
319
320 struct ceph_inode_xattrs_info i_xattrs;
321
322 /* capabilities. protected _both_ by i_lock and cap->session's
323 * s_mutex. */
324 struct rb_root i_caps; /* cap list */
325 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
326 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
327 struct list_head i_dirty_item, i_flushing_item;
328 u64 i_cap_flush_seq;
329 /* we need to track cap writeback on a per-cap-bit basis, to allow
330 * overlapping, pipelined cap flushes to the mds. we can probably
331 * reduce the tid to 8 bits if we're concerned about inode size. */
332 u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
333 wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
334 unsigned long i_hold_caps_min; /* jiffies */
335 unsigned long i_hold_caps_max; /* jiffies */
336 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
337 int i_cap_exporting_mds; /* to handle cap migration between */
338 unsigned i_cap_exporting_mseq; /* mds's. */
339 unsigned i_cap_exporting_issued;
340 struct ceph_cap_reservation i_cap_migration_resv;
341 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
342 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
343 unsigned i_snap_caps; /* cap bits for snapped files */
344
345 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
346
347 u32 i_truncate_seq; /* last truncate to smaller size */
348 u64 i_truncate_size; /* and the size we last truncated down to */
349 int i_truncate_pending; /* still need to call vmtruncate */
350
351 u64 i_max_size; /* max file size authorized by mds */
352 u64 i_reported_size; /* (max_)size reported to or requested of mds */
353 u64 i_wanted_max_size; /* offset we'd like to write to */
354 u64 i_requested_max_size; /* max_size we've requested */
355
356 /* held references to caps */
357 int i_pin_ref;
358 int i_rd_ref, i_rdcache_ref, i_wr_ref;
359 int i_wrbuffer_ref, i_wrbuffer_ref_head;
360 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
361 u32 i_rdcache_gen; /* we increment this each time we get
362 FILE_CACHE. If it's non-zero, we
363 _may_ have cached pages. */
364 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
365
366 struct list_head i_unsafe_writes; /* uncommitted sync writes */
367 struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
368 spinlock_t i_unsafe_lock;
369
370 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
371 int i_snap_realm_counter; /* snap realm (if caps) */
372 struct list_head i_snap_realm_item;
373 struct list_head i_snap_flush_item;
374
375 struct work_struct i_wb_work; /* writeback work */
376 struct work_struct i_pg_inv_work; /* page invalidation work */
377
378 struct work_struct i_vmtruncate_work;
379
380 struct inode vfs_inode; /* at end */
381};
382
383static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
384{
385 return container_of(inode, struct ceph_inode_info, vfs_inode);
386}
387
388static inline void ceph_i_clear(struct inode *inode, unsigned mask)
389{
390 struct ceph_inode_info *ci = ceph_inode(inode);
391
392 spin_lock(&inode->i_lock);
393 ci->i_ceph_flags &= ~mask;
394 spin_unlock(&inode->i_lock);
395}
396
397static inline void ceph_i_set(struct inode *inode, unsigned mask)
398{
399 struct ceph_inode_info *ci = ceph_inode(inode);
400
401 spin_lock(&inode->i_lock);
402 ci->i_ceph_flags |= mask;
403 spin_unlock(&inode->i_lock);
404}
405
406static inline bool ceph_i_test(struct inode *inode, unsigned mask)
407{
408 struct ceph_inode_info *ci = ceph_inode(inode);
409 bool r;
410
411 smp_mb();
412 r = (ci->i_ceph_flags & mask) == mask;
413 return r;
414}
415
416
417/* find a specific frag @f */
418extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
419 u32 f);
420
421/*
422 * choose fragment for value @v. copy frag content to pfrag, if leaf
423 * exists
424 */
425extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
426 struct ceph_inode_frag *pfrag,
427 int *found);
428
429/*
430 * Ceph dentry state
431 */
432struct ceph_dentry_info {
433 struct ceph_mds_session *lease_session;
434 u32 lease_gen, lease_shared_gen;
435 u32 lease_seq;
436 unsigned long lease_renew_after, lease_renew_from;
437 struct list_head lru;
438 struct dentry *dentry;
439 u64 time;
440 u64 offset;
441};
442
443static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
444{
445 return (struct ceph_dentry_info *)dentry->d_fsdata;
446}
447
448static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
449{
450 return ((loff_t)frag << 32) | (loff_t)off;
451}
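
ceph_make_fpos() packs the 32-bit directory fragment into the high half
of the readdir position and the within-fragment offset into the low
half; the inverse is a shift and a mask. A small sketch (the unpack
expressions are illustrative; uint64_t stands in for the kernel's signed
loff_t):

#include <stdint.h>
#include <stdio.h>

/* frag in the high 32 bits, offset in the low 32 */
static uint64_t make_fpos(uint32_t frag, uint32_t off)
{
	return ((uint64_t)frag << 32) | off;
}

int main(void)
{
	uint64_t pos = make_fpos(0xdeadbeef, 42);

	/* recover both halves: shift for frag, mask for offset */
	printf("frag %x off %u\n",
	       (unsigned)(pos >> 32), (unsigned)(pos & 0xffffffffu));
	return 0;
}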
452
453/*
454 * ino_t is <64 bits on many architectures, blech.
455 *
456 * don't include snap in ino hash, at least for now.
457 */
458static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
459{
460 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
461#if BITS_PER_LONG == 32
462 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
463 if (!ino)
464 ino = 1;
465#endif
466 return ino;
467}
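
On 32-bit machines the fold above XORs the high half of the 64-bit ceph
ino into the low half, so distinct large inos still tend to get distinct
ino_t values, and a result of 0 is remapped since ino 0 is reserved. A
sketch with uint32_t standing in for a 32-bit ino_t:

#include <stdint.h>
#include <stdio.h>

static uint32_t vino_to_ino32(uint64_t vino)
{
	uint32_t ino = (uint32_t)vino;

	ino ^= (uint32_t)(vino >> 32);	/* fold in the high half */
	if (!ino)
		ino = 1;		/* ino 0 is reserved */
	return ino;
}

int main(void)
{
	/* high and low halves equal: XOR folds to 0, remapped to 1 */
	printf("%x\n", vino_to_ino32(0x100000001ULL));
	return 0;
}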
468
469static inline int ceph_set_ino_cb(struct inode *inode, void *data)
470{
471 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
472 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
473 return 0;
474}
475
476static inline struct ceph_vino ceph_vino(struct inode *inode)
477{
478 return ceph_inode(inode)->i_vino;
479}
480
481/* for printf-style formatting */
482#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
483
484static inline u64 ceph_ino(struct inode *inode)
485{
486 return ceph_inode(inode)->i_vino.ino;
487}
488static inline u64 ceph_snap(struct inode *inode)
489{
490 return ceph_inode(inode)->i_vino.snap;
491}
492
493static inline int ceph_ino_compare(struct inode *inode, void *data)
494{
495 struct ceph_vino *pvino = (struct ceph_vino *)data;
496 struct ceph_inode_info *ci = ceph_inode(inode);
497 return ci->i_vino.ino == pvino->ino &&
498 ci->i_vino.snap == pvino->snap;
499}
500
501static inline struct inode *ceph_find_inode(struct super_block *sb,
502 struct ceph_vino vino)
503{
504 ino_t t = ceph_vino_to_ino(vino);
505 return ilookup5(sb, t, ceph_ino_compare, &vino);
506}
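/*
 * Editorial note: since ceph_vino_to_ino() ignores the snapid, every
 * snapshot of a file hashes to the same bucket; ilookup5() then uses
 * ceph_ino_compare(), which matches on both ino and snap, to pick the
 * right inode.  A hypothetical lookup of snapshot 2 of ino 0x1000:
 *
 *	struct ceph_vino v = { .ino = 0x1000, .snap = 2 };
 *	struct inode *inode = ceph_find_inode(sb, v);
 */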
507
508
509/*
510 * caps helpers
511 */
512static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
513{
514 return !RB_EMPTY_ROOT(&ci->i_caps);
515}
516
517extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
518extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch);
519extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
520 struct ceph_cap *cap);
521
522static inline int ceph_caps_issued(struct ceph_inode_info *ci)
523{
524 int issued;
525 spin_lock(&ci->vfs_inode.i_lock);
526 issued = __ceph_caps_issued(ci, NULL);
527 spin_unlock(&ci->vfs_inode.i_lock);
528 return issued;
529}
530
531static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
532 int touch)
533{
534 int r;
535 spin_lock(&ci->vfs_inode.i_lock);
536 r = __ceph_caps_issued_mask(ci, mask, touch);
537 spin_unlock(&ci->vfs_inode.i_lock);
538 return r;
539}
540
541static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
542{
543 return ci->i_dirty_caps | ci->i_flushing_caps;
544}
545extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
546
547extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
548extern int __ceph_caps_used(struct ceph_inode_info *ci);
549
550extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
551
552/*
553 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
554 */
555static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
556{
557 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
558 if (w & CEPH_CAP_FILE_BUFFER)
559 w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
560 return w;
561}
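/*
 * Example (illustrative): a file held open O_RDWR wants FILE_RD and
 * FILE_WR by virtue of its open mode; if dirty pagecache pages also
 * hold a FILE_BUFFER ref, FILE_EXCL is added so the dirty data can
 * stay buffered until it is written back.
 */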
562
563/* what the mds thinks we want */
564extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
565
566extern void ceph_caps_init(void);
567extern void ceph_caps_finalize(void);
568extern void ceph_adjust_min_caps(int delta);
569extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
570extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
571extern void ceph_reservation_status(struct ceph_client *client,
572 int *total, int *avail, int *used,
573 int *reserved, int *min);
574
575static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
576{
577 return (struct ceph_client *)inode->i_sb->s_fs_info;
578}
579
580static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
581{
582 return (struct ceph_client *)sb->s_fs_info;
583}
584
585
586/*
587 * we keep buffered readdir results attached to file->private_data
588 */
589struct ceph_file_info {
590 int fmode; /* initialized on open */
591
592 /* readdir: position within the dir */
593 u32 frag;
594 struct ceph_mds_request *last_readdir;
595 int at_end;
596
597 /* readdir: position within a frag */
598 unsigned offset; /* offset of last chunk, adjusted for . and .. */
599 u64 next_offset; /* offset of next chunk (last_name's + 1) */
600 char *last_name; /* last entry in previous chunk */
601 struct dentry *dentry; /* next dentry (for dcache readdir) */
602 unsigned long dir_release_count;
603
604 /* used to satisfy read() on a directory mounted with -o dirstat */
605 char *dir_info;
606 int dir_info_len;
607};
608
609
610
611/*
612 * snapshots
613 */
614
615/*
616 * A "snap context" is the set of existing snapshots when we
617 * write data. It is used by the OSD to guide its COW behavior.
618 *
619 * The ceph_snap_context is refcounted and attached to each dirty
620 * page, indicating which context the dirty data belonged to when it
621 * was dirtied.
622 */
623struct ceph_snap_context {
624 atomic_t nref;
625 u64 seq;
626 int num_snaps;
627 u64 snaps[];
628};
629
630static inline struct ceph_snap_context *
631ceph_get_snap_context(struct ceph_snap_context *sc)
632{
633 /*
634 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
635 atomic_read(&sc->nref)+1);
636 */
637 if (sc)
638 atomic_inc(&sc->nref);
639 return sc;
640}
641
642static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
643{
644 if (!sc)
645 return;
646 /*
647 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
648 atomic_read(&sc->nref)-1);
649 */
650 if (atomic_dec_and_test(&sc->nref)) {
651 /*printk(" deleting snap_context %p\n", sc);*/
652 kfree(sc);
653 }
654}
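/*
 * Usage sketch (illustrative; the surrounding code is hypothetical):
 * a writer pins the context it dirtied a page under and drops the
 * ref once the page is clean:
 *
 *	struct ceph_snap_context *snapc =
 *		ceph_get_snap_context(ci->i_snap_realm->cached_context);
 *	... attach snapc to the dirty page and write it out ...
 *	ceph_put_snap_context(snapc);
 */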
655
656/*
657 * A "snap realm" describes a subset of the file hierarchy sharing
658 * the same set of snapshots that apply to it. The realms themselves
659 * are organized into a hierarchy, such that children inherit (some of)
660 * the snapshots of their parents.
661 *
662 * All inodes within the realm that have capabilities are linked into a
663 * per-realm list.
664 */
665struct ceph_snap_realm {
666 u64 ino;
667 atomic_t nref;
668 struct rb_node node;
669
670 u64 created, seq;
671 u64 parent_ino;
672 u64 parent_since; /* snapid when our current parent became so */
673
674 u64 *prior_parent_snaps; /* snaps inherited from any parents we */
675 int num_prior_parent_snaps; /* had prior to parent_since */
676 u64 *snaps; /* snaps specific to this realm */
677 int num_snaps;
678
679 struct ceph_snap_realm *parent;
680 struct list_head children; /* list of child realms */
681 struct list_head child_item;
682
683 struct list_head empty_item; /* link on the empty list if nref == 0 */
684
685 /* the current set of snaps for this realm */
686 struct ceph_snap_context *cached_context;
687
688 struct list_head inodes_with_caps;
689 spinlock_t inodes_with_caps_lock;
690};
691
692
693
694/*
695 * calculate the number of pages a given length and offset map onto,
696 * if we align the data.
697 */
698static inline int calc_pages_for(u64 off, u64 len)
699{
700 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
701 (off >> PAGE_CACHE_SHIFT);
702}
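/*
 * Worked example (editorial, illustrative): with 4 KB pages, off=1000
 * and len=5000 cover bytes 1000..5999, which touch pages 0 and 1, so
 * calc_pages_for(1000, 5000) == (10095 >> 12) - (1000 >> 12)
 * == 2 - 0 == 2.
 */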
703
704
705
706/* snap.c */
707struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
708 u64 ino);
709extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
710 struct ceph_snap_realm *realm);
711extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
712 struct ceph_snap_realm *realm);
713extern int ceph_update_snap_trace(struct ceph_mds_client *m,
714 void *p, void *e, bool deletion);
715extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
716 struct ceph_mds_session *session,
717 struct ceph_msg *msg);
718extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
719 struct ceph_snap_context *snapc);
720extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
721 struct ceph_cap_snap *capsnap);
722extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
723
724/*
725 * a cap_snap is "pending" if it is still awaiting an in-progress
726 * sync write (that may/may not still update size, mtime, etc.).
727 */
728static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
729{
730 return !list_empty(&ci->i_cap_snaps) &&
731 list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
732 ci_item)->writing;
733}
734
735
736/* super.c */
737extern struct kmem_cache *ceph_inode_cachep;
738extern struct kmem_cache *ceph_cap_cachep;
739extern struct kmem_cache *ceph_dentry_cachep;
740extern struct kmem_cache *ceph_file_cachep;
741
742extern const char *ceph_msg_type_name(int type);
743extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
744
745#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
746 "%02x%02x%02x%02x%02x%02x"
747#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
748 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
749 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
750 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
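/*
 * Usage sketch (illustrative; the caller is hypothetical): the two
 * macros pair up to print an fsid as a UUID-style string:
 *
 *	pr_info("ceph fsid " FSID_FORMAT "\n", PR_FSID(&client->fsid));
 */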
751
752/* inode.c */
753extern const struct inode_operations ceph_file_iops;
754
755extern struct inode *ceph_alloc_inode(struct super_block *sb);
756extern void ceph_destroy_inode(struct inode *inode);
757
758extern struct inode *ceph_get_inode(struct super_block *sb,
759 struct ceph_vino vino);
760extern struct inode *ceph_get_snapdir(struct inode *parent);
761extern int ceph_fill_file_size(struct inode *inode, int issued,
762 u32 truncate_seq, u64 truncate_size, u64 size);
763extern void ceph_fill_file_time(struct inode *inode, int issued,
764 u64 time_warp_seq, struct timespec *ctime,
765 struct timespec *mtime, struct timespec *atime);
766extern int ceph_fill_trace(struct super_block *sb,
767 struct ceph_mds_request *req,
768 struct ceph_mds_session *session);
769extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
770 struct ceph_mds_session *session);
771
772extern int ceph_inode_holds_cap(struct inode *inode, int mask);
773
774extern int ceph_inode_set_size(struct inode *inode, loff_t size);
775extern void __ceph_do_pending_vmtruncate(struct inode *inode);
776extern void ceph_queue_vmtruncate(struct inode *inode);
777
778extern void ceph_queue_invalidate(struct inode *inode);
779extern void ceph_queue_writeback(struct inode *inode);
780
781extern int ceph_do_getattr(struct inode *inode, int mask);
782extern int ceph_permission(struct inode *inode, int mask);
783extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
784extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
785 struct kstat *stat);
786
787/* xattr.c */
788extern int ceph_setxattr(struct dentry *, const char *, const void *,
789 size_t, int);
790extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
791extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
792extern int ceph_removexattr(struct dentry *, const char *);
793extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
794extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
795
796/* caps.c */
797extern const char *ceph_cap_string(int c);
798extern void ceph_handle_caps(struct ceph_mds_session *session,
799 struct ceph_msg *msg);
800extern int ceph_add_cap(struct inode *inode,
801 struct ceph_mds_session *session, u64 cap_id,
802 int fmode, unsigned issued, unsigned wanted,
803 unsigned cap, unsigned seq, u64 realmino, int flags,
804 struct ceph_cap_reservation *caps_reservation);
805extern void __ceph_remove_cap(struct ceph_cap *cap);
806static inline void ceph_remove_cap(struct ceph_cap *cap)
807{
808 struct inode *inode = &cap->ci->vfs_inode;
809 spin_lock(&inode->i_lock);
810 __ceph_remove_cap(cap);
811 spin_unlock(&inode->i_lock);
812}
813extern void ceph_put_cap(struct ceph_cap *cap);
814
815extern void ceph_queue_caps_release(struct inode *inode);
816extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
817extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
818extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
819 struct ceph_mds_session *session);
820extern int ceph_get_cap_mds(struct inode *inode);
821extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
822extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
823extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
824 struct ceph_snap_context *snapc);
825extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
826 struct ceph_mds_session **psession);
827extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
828 struct ceph_mds_session *session);
829extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
830extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
831
832extern int ceph_encode_inode_release(void **p, struct inode *inode,
833 int mds, int drop, int unless, int force);
834extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
835 int mds, int drop, int unless);
836
837extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
838 int *got, loff_t endoff);
839
840/* for counting open files by mode */
841static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
842{
843 ci->i_nr_by_mode[mode]++;
844}
845extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
846
847/* addr.c */
848extern const struct address_space_operations ceph_aops;
849extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
850
851/* file.c */
852extern const struct file_operations ceph_file_fops;
854extern int ceph_open(struct inode *inode, struct file *file);
855extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
856 struct nameidata *nd, int mode,
857 int locked_dir);
858extern int ceph_release(struct inode *inode, struct file *filp);
859extern void ceph_release_page_vector(struct page **pages, int num_pages);
860
861/* dir.c */
862extern const struct file_operations ceph_dir_fops;
863extern const struct inode_operations ceph_dir_iops;
864extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
865 ceph_snapdir_dentry_ops;
866
867extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
868extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
869 struct dentry *dentry, int err);
870
871extern void ceph_dentry_lru_add(struct dentry *dn);
872extern void ceph_dentry_lru_touch(struct dentry *dn);
873extern void ceph_dentry_lru_del(struct dentry *dn);
874
875/*
876 * our d_ops vary depending on whether the inode is live,
877 * snapshotted (read-only), or a virtual ".snap" directory.
878 */
879int ceph_init_dentry(struct dentry *dentry);
880
881
882/* ioctl.c */
883extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
884
885/* export.c */
886extern const struct export_operations ceph_export_ops;
887
888/* debugfs.c */
889extern int ceph_debugfs_init(void);
890extern void ceph_debugfs_cleanup(void);
891extern int ceph_debugfs_client_init(struct ceph_client *client);
892extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
893
894static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
895{
896 if (dentry && dentry->d_parent)
897 return dentry->d_parent->d_inode;
898
899 return NULL;
900}
901
902#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..2845422907fc
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,845 @@
1#include "ceph_debug.h"
2#include "super.h"
3#include "decode.h"
4
5#include <linux/xattr.h>
6#include <linux/slab.h>
7
8static bool ceph_is_valid_xattr(const char *name)
9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
14}
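/*
 * Examples (editorial): "user.mydata", "trusted.foo" and
 * "security.selinux" pass this check; "system.posix_acl_access" does
 * not, and is rejected below (-EOPNOTSUPP on set/remove, -ENODATA on
 * get).
 */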
15
16/*
17 * These define virtual xattrs exposing the recursive directory
18 * statistics and layout metadata.
19 */
20struct ceph_vxattr_cb {
21 bool readonly;
22 char *name;
23 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
24 size_t size);
25};
26
27/* directories */
28
29static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
30 size_t size)
31{
32 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
33}
34
35static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
36 size_t size)
37{
38 return snprintf(val, size, "%lld", ci->i_files);
39}
40
41static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
42 size_t size)
43{
44 return snprintf(val, size, "%lld", ci->i_subdirs);
45}
46
47static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
48 size_t size)
49{
50 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
51}
52
53static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
54 size_t size)
55{
56 return snprintf(val, size, "%lld", ci->i_rfiles);
57}
58
59static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
60 size_t size)
61{
62 return snprintf(val, size, "%lld", ci->i_rsubdirs);
63}
64
65static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
66 size_t size)
67{
68 return snprintf(val, size, "%lld", ci->i_rbytes);
69}
70
71static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
72 size_t size)
73{
74 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
75 (long)ci->i_rctime.tv_nsec);
76}
77
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL }
88};
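/*
 * Usage sketch (illustrative; the path and buffer are hypothetical):
 * these names are visible through the ordinary xattr syscalls, e.g.
 * from userspace:
 *
 *	char buf[64];
 *	ssize_t n = getxattr("/mnt/ceph/some/dir",
 *			     "user.ceph.dir.rbytes", buf, sizeof(buf));
 *
 * which returns the "%lld"-formatted recursive byte count.
 */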
89
90/* files */
91
92static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
93 size_t size)
94{
95 int ret;
96
97 ret = snprintf(val, size,
98 "chunk_bytes=%llu\nstripe_count=%llu\nobject_size=%llu\n",
99 (unsigned long long)ceph_file_layout_su(ci->i_layout),
100 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
101 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
102 if (ceph_file_layout_pg_preferred(ci->i_layout))
103 ret += snprintf(val + ret, size - ret, "preferred_osd=%llu\n",
104 (unsigned long long)ceph_file_layout_pg_preferred(
105 ci->i_layout));
106 return ret;
107}
108
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout},
111 { true, NULL, NULL }
112};
113
114static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
115{
116 if (S_ISDIR(inode->i_mode))
117 return ceph_dir_vxattrs;
118 else if (S_ISREG(inode->i_mode))
119 return ceph_file_vxattrs;
120 return NULL;
121}
122
123static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
124 const char *name)
125{
126 do {
127 if (strcmp(vxattr->name, name) == 0)
128 return vxattr;
129 vxattr++;
130 } while (vxattr->name);
131 return NULL;
132}
133
134static int __set_xattr(struct ceph_inode_info *ci,
135 const char *name, int name_len,
136 const char *val, int val_len,
137 int dirty,
138 int should_free_name, int should_free_val,
139 struct ceph_inode_xattr **newxattr)
140{
141 struct rb_node **p;
142 struct rb_node *parent = NULL;
143 struct ceph_inode_xattr *xattr = NULL;
144 int c;
145 int new = 0;
146
147 p = &ci->i_xattrs.index.rb_node;
148 while (*p) {
149 parent = *p;
150 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
151 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
152 if (c < 0)
153 p = &(*p)->rb_left;
154 else if (c > 0)
155 p = &(*p)->rb_right;
156 else {
157 if (name_len == xattr->name_len)
158 break;
159 else if (name_len < xattr->name_len)
160 p = &(*p)->rb_left;
161 else
162 p = &(*p)->rb_right;
163 }
164 xattr = NULL;
165 }
166
167 if (!xattr) {
168 new = 1;
169 xattr = *newxattr;
170 if (!xattr) {
171 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
172 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
173 val);
174 return -ENOMEM;
175 }
176 xattr->name = name;
177 xattr->name_len = name_len;
178 xattr->should_free_name = should_free_name;
179
180 ci->i_xattrs.count++;
181 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
182 } else {
183 kfree(*newxattr);
184 *newxattr = NULL;
185 if (xattr->should_free_val)
186 kfree((void *)xattr->val);
187
188 if (should_free_name) {
189 kfree((void *)name);
190 name = xattr->name;
191 }
192 ci->i_xattrs.names_size -= xattr->name_len;
193 ci->i_xattrs.vals_size -= xattr->val_len;
194 }
195 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len;
197 if (val)
198 xattr->val = val;
199 else
200 xattr->val = "";
201
202 xattr->val_len = val_len;
203 xattr->dirty = dirty;
204 xattr->should_free_val = (val && should_free_val);
205
206 if (new) {
207 rb_link_node(&xattr->node, parent, p);
208 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
209 dout("__set_xattr_val p=%p\n", p);
210 }
211
212 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
213 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
214
215 return 0;
216}
217
218static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
219 const char *name)
220{
221 struct rb_node **p;
222 struct rb_node *parent = NULL;
223 struct ceph_inode_xattr *xattr = NULL;
224 int name_len = strlen(name);
225 int c;
226
227 p = &ci->i_xattrs.index.rb_node;
228 while (*p) {
229 parent = *p;
230 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
231 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
232 if (c == 0) {
233 /* match the ordering used by __set_xattr: shorter
234 names sort before longer ones */
235 if (name_len < xattr->name_len)
236 c = -1;
237 else if (name_len > xattr->name_len)
238 c = 1;
239 }
240 if (c < 0)
241 p = &(*p)->rb_left;
242 else if (c > 0)
243 p = &(*p)->rb_right;
244 else {
245 dout("__get_xattr %s: found %.*s\n", name,
246 xattr->val_len, xattr->val);
247 return xattr;
248 }
249 }
250
251 dout("__get_xattr %s: not found\n", name);
252
253 return NULL;
254}
246
247static void __free_xattr(struct ceph_inode_xattr *xattr)
248{
249 BUG_ON(!xattr);
250
251 if (xattr->should_free_name)
252 kfree((void *)xattr->name);
253 if (xattr->should_free_val)
254 kfree((void *)xattr->val);
255
256 kfree(xattr);
257}
258
259static int __remove_xattr(struct ceph_inode_info *ci,
260 struct ceph_inode_xattr *xattr)
261{
262 if (!xattr)
263 return -EOPNOTSUPP;
264
265 rb_erase(&xattr->node, &ci->i_xattrs.index);
266
267 if (xattr->should_free_name)
268 kfree((void *)xattr->name);
269 if (xattr->should_free_val)
270 kfree((void *)xattr->val);
271
272 ci->i_xattrs.names_size -= xattr->name_len;
273 ci->i_xattrs.vals_size -= xattr->val_len;
274 ci->i_xattrs.count--;
275 kfree(xattr);
276
277 return 0;
278}
279
280static int __remove_xattr_by_name(struct ceph_inode_info *ci,
281 const char *name)
282{
283 struct ceph_inode_xattr *xattr;
284 int err;
285
286 xattr = __get_xattr(ci, name);
287 err = __remove_xattr(ci, xattr);
288 return err;
289}
292
293static char *__copy_xattr_names(struct ceph_inode_info *ci,
294 char *dest)
295{
296 struct rb_node *p;
297 struct ceph_inode_xattr *xattr = NULL;
298
299 p = rb_first(&ci->i_xattrs.index);
300 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
301
302 while (p) {
303 xattr = rb_entry(p, struct ceph_inode_xattr, node);
304 memcpy(dest, xattr->name, xattr->name_len);
305 dest[xattr->name_len] = '\0';
306
307 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
308 xattr->name_len, ci->i_xattrs.names_size);
309
310 dest += xattr->name_len + 1;
311 p = rb_next(p);
312 }
313
314 return dest;
315}
316
317void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
318{
319 struct rb_node *p, *tmp;
320 struct ceph_inode_xattr *xattr = NULL;
321
322 p = rb_first(&ci->i_xattrs.index);
323
324 dout("__ceph_destroy_xattrs p=%p\n", p);
325
326 while (p) {
327 xattr = rb_entry(p, struct ceph_inode_xattr, node);
328 tmp = p;
329 p = rb_next(tmp);
330 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
331 xattr->name_len, xattr->name);
332 rb_erase(tmp, &ci->i_xattrs.index);
333
334 __free_xattr(xattr);
335 }
336
337 ci->i_xattrs.names_size = 0;
338 ci->i_xattrs.vals_size = 0;
339 ci->i_xattrs.index_version = 0;
340 ci->i_xattrs.count = 0;
341 ci->i_xattrs.index = RB_ROOT;
342}
343
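/*
 * Decode the xattr blob into the in-memory rb tree.  The caller must
 * hold inode->i_lock; the lock is dropped and retaken around the
 * allocations below, and i_xattrs.version is rechecked afterwards to
 * catch a racing update.
 */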
344static int __build_xattrs(struct inode *inode)
345{
346 u32 namelen;
347 u32 numattr = 0;
348 void *p, *end;
349 u32 len;
350 const char *name, *val;
351 struct ceph_inode_info *ci = ceph_inode(inode);
352 u64 xattr_version;
353 struct ceph_inode_xattr **xattrs = NULL;
354 int err = 0;
355 int i;
356
357 dout("__build_xattrs() len=%d\n",
358 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
359
360 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
361 return 0; /* already built */
362
363 __ceph_destroy_xattrs(ci);
364
365start:
366 /* update the internal xattr rb tree */
367 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
368 p = ci->i_xattrs.blob->vec.iov_base;
369 end = p + ci->i_xattrs.blob->vec.iov_len;
370 ceph_decode_32_safe(&p, end, numattr, bad);
371 xattr_version = ci->i_xattrs.version;
372 spin_unlock(&inode->i_lock);
373
374 xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
375 GFP_NOFS);
376 err = -ENOMEM;
377 if (!xattrs)
378 goto bad_lock;
379 /* kcalloc has already zeroed the array */
380 for (i = 0; i < numattr; i++) {
381 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
382 GFP_NOFS);
383 if (!xattrs[i])
384 goto bad_lock;
385 }
386
387 spin_lock(&inode->i_lock);
388 if (ci->i_xattrs.version != xattr_version) {
389 /* lost a race, retry */
390 for (i = 0; i < numattr; i++)
391 kfree(xattrs[i]);
392 kfree(xattrs);
393 goto start;
394 }
395 err = -EIO;
396 while (numattr--) {
397 ceph_decode_32_safe(&p, end, len, bad);
398 namelen = len;
399 name = p;
400 p += len;
401 ceph_decode_32_safe(&p, end, len, bad);
402 val = p;
403 p += len;
404
405 err = __set_xattr(ci, name, namelen, val, len,
406 0, 0, 0, &xattrs[numattr]);
407
408 if (err < 0)
409 goto bad;
410 }
411 kfree(xattrs);
412 }
413 ci->i_xattrs.index_version = ci->i_xattrs.version;
414 ci->i_xattrs.dirty = false;
415
416 return err;
417bad_lock:
418 spin_lock(&inode->i_lock);
419bad:
420 if (xattrs) {
421 for (i = 0; i < numattr; i++)
422 kfree(xattrs[i]);
423 kfree(xattrs);
424 }
425 ci->i_xattrs.names_size = 0;
426 return err;
427}
428
429static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
430 int val_size)
431{
432 /*
433 * 4 bytes for the attribute count, plus a 4-byte length prefix for
434 * each xattr name and each xattr value
435 */
436 int size = 4 + ci->i_xattrs.count*(4 + 4) +
437 ci->i_xattrs.names_size +
438 ci->i_xattrs.vals_size;
439 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
440 ci->i_xattrs.count, ci->i_xattrs.names_size,
441 ci->i_xattrs.vals_size);
442
443 if (name_size)
444 size += 4 + 4 + name_size + val_size;
445
446 return size;
447}
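/*
 * Worked example (editorial, illustrative): with two xattrs
 * "user.a"="x" (name 6, val 1) and "user.bb"="yy" (name 7, val 2),
 * the blob needs 4 + 2*(4+4) + (6+7) + (1+2) = 36 bytes; a pending
 * "user.ccc"="zzz" (name 8, val 3) adds 4 + 4 + 8 + 3 = 19 for a
 * total of 55.
 */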
448
449/*
450 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
451 * and swap into place.
452 */
453void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
454{
455 struct rb_node *p;
456 struct ceph_inode_xattr *xattr = NULL;
457 void *dest;
458
459 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
460 if (ci->i_xattrs.dirty) {
461 int need = __get_required_blob_size(ci, 0, 0);
462
463 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
464
465 p = rb_first(&ci->i_xattrs.index);
466 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
467
468 ceph_encode_32(&dest, ci->i_xattrs.count);
469 while (p) {
470 xattr = rb_entry(p, struct ceph_inode_xattr, node);
471
472 ceph_encode_32(&dest, xattr->name_len);
473 memcpy(dest, xattr->name, xattr->name_len);
474 dest += xattr->name_len;
475 ceph_encode_32(&dest, xattr->val_len);
476 memcpy(dest, xattr->val, xattr->val_len);
477 dest += xattr->val_len;
478
479 p = rb_next(p);
480 }
481
482 /* adjust buffer len; it may be larger than we need */
483 ci->i_xattrs.prealloc_blob->vec.iov_len =
484 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
485
486 if (ci->i_xattrs.blob)
487 ceph_buffer_put(ci->i_xattrs.blob);
488 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
489 ci->i_xattrs.prealloc_blob = NULL;
490 ci->i_xattrs.dirty = false;
491 }
492}
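/*
 * Resulting wire layout (editorial sketch of the encoding above):
 *
 *	u32 count
 *	count times:
 *		u32 name_len, then name bytes (no NUL)
 *		u32 val_len, then val bytes
 *
 * which is exactly what __build_xattrs() decodes back into the rb
 * tree.
 */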
493
494ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
495 size_t size)
496{
497 struct inode *inode = dentry->d_inode;
498 struct ceph_inode_info *ci = ceph_inode(inode);
499 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
500 int err;
501 struct ceph_inode_xattr *xattr;
502 struct ceph_vxattr_cb *vxattr = NULL;
503
504 if (!ceph_is_valid_xattr(name))
505 return -ENODATA;
506
507 /* let's see if a virtual xattr was requested */
508 if (vxattrs)
509 vxattr = ceph_match_vxattr(vxattrs, name);
510
511 spin_lock(&inode->i_lock);
512 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
513 ci->i_xattrs.version, ci->i_xattrs.index_version);
514
515 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
516 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
517 goto get_xattr;
518 } else {
519 spin_unlock(&inode->i_lock);
520 /* get xattrs from mds (if we don't already have them) */
521 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
522 if (err)
523 return err;
524 }
525
526 spin_lock(&inode->i_lock);
527
528 if (vxattr && vxattr->readonly) {
529 err = vxattr->getxattr_cb(ci, value, size);
530 goto out;
531 }
532
533 err = __build_xattrs(inode);
534 if (err < 0)
535 goto out;
536
537get_xattr:
538 err = -ENODATA; /* == ENOATTR */
539 xattr = __get_xattr(ci, name);
540 if (!xattr) {
541 if (vxattr)
542 err = vxattr->getxattr_cb(ci, value, size);
543 goto out;
544 }
545
546 err = -ERANGE;
547 if (size && size < xattr->val_len)
548 goto out;
549
550 err = xattr->val_len;
551 if (size == 0)
552 goto out;
553
554 memcpy(value, xattr->val, xattr->val_len);
555
556out:
557 spin_unlock(&inode->i_lock);
558 return err;
559}
560
561ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
562{
563 struct inode *inode = dentry->d_inode;
564 struct ceph_inode_info *ci = ceph_inode(inode);
565 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
566 u32 vir_namelen = 0;
567 u32 namelen;
568 int err;
569 u32 len;
570 int i;
571
572 spin_lock(&inode->i_lock);
573 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
574 ci->i_xattrs.version, ci->i_xattrs.index_version);
575
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr;
579 } else {
580 spin_unlock(&inode->i_lock);
581 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
582 if (err)
583 return err;
584 }
585
586 spin_lock(&inode->i_lock);
587
588 err = __build_xattrs(inode);
589 if (err < 0)
590 goto out;
591
592list_xattr:
593 vir_namelen = 0;
594 /* include virtual dir xattrs */
595 if (vxattrs)
596 for (i = 0; vxattrs[i].name; i++)
597 vir_namelen += strlen(vxattrs[i].name) + 1;
598 /* plus a trailing NUL byte for each stored xattr name */
599 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
600 err = -ERANGE;
601 if (size && namelen > size)
602 goto out;
603
604 err = namelen;
605 if (size == 0)
606 goto out;
607
608 names = __copy_xattr_names(ci, names);
609
610 /* virtual xattr names, too */
611 if (vxattrs)
612 for (i = 0; vxattrs[i].name; i++) {
613 len = sprintf(names, "%s", vxattrs[i].name);
614 names += len + 1;
615 }
616
617out:
618 spin_unlock(&inode->i_lock);
619 return err;
620}
621
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags)
624{
625 struct ceph_client *client = ceph_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode;
629 struct ceph_mds_request *req;
630 struct ceph_mds_client *mdsc = &client->mdsc;
631 int err;
632 int i, nr_pages;
633 struct page **pages = NULL;
634 void *kaddr;
635
636 /* copy value into some pages */
637 nr_pages = calc_pages_for(0, size);
638 if (nr_pages) {
639 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
640 if (!pages)
641 return -ENOMEM;
642 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS);
645 if (!pages[i]) {
646 nr_pages = i;
647 goto out;
648 }
649 kaddr = kmap(pages[i]);
650 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
651 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
652 kunmap(pages[i]);
653 }
653 }
654
655 dout("setxattr value=%.*s\n", (int)size, value);
656
657 /* do request */
658 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
659 USE_AUTH_MDS);
660 if (IS_ERR(req)) {
661 err = PTR_ERR(req);
662 goto out;
663 }
664 req->r_inode = igrab(inode);
665 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
666 req->r_num_caps = 1;
667 req->r_args.setxattr.flags = cpu_to_le32(flags);
668 req->r_path2 = kstrdup(name, GFP_NOFS);
669
670 req->r_pages = pages;
671 req->r_num_pages = nr_pages;
672 req->r_data_len = size;
673
674 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
675 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
676 ceph_mdsc_put_request(req);
677 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
678
679out:
680 if (pages) {
681 for (i = 0; i < nr_pages; i++)
682 __free_page(pages[i]);
683 kfree(pages);
684 }
685 return err;
686}
687
688int ceph_setxattr(struct dentry *dentry, const char *name,
689 const void *value, size_t size, int flags)
690{
691 struct inode *inode = dentry->d_inode;
692 struct ceph_inode_info *ci = ceph_inode(inode);
693 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
694 int err;
695 int name_len = strlen(name);
696 int val_len = size;
697 char *newname = NULL;
698 char *newval = NULL;
699 struct ceph_inode_xattr *xattr = NULL;
700 int issued;
701 int required_blob_size;
702
703 if (ceph_snap(inode) != CEPH_NOSNAP)
704 return -EROFS;
705
706 if (!ceph_is_valid_xattr(name))
707 return -EOPNOTSUPP;
708
709 if (vxattrs) {
710 struct ceph_vxattr_cb *vxattr =
711 ceph_match_vxattr(vxattrs, name);
712 if (vxattr && vxattr->readonly)
713 return -EOPNOTSUPP;
714 }
715
716 /* preallocate memory for xattr name, value, index node */
717 err = -ENOMEM;
718 newname = kmalloc(name_len + 1, GFP_NOFS);
719 if (!newname)
720 goto out;
721 memcpy(newname, name, name_len + 1);
722
723 if (val_len) {
724 newval = kmalloc(val_len + 1, GFP_NOFS);
725 if (!newval)
726 goto out;
727 memcpy(newval, value, val_len);
728 newval[val_len] = '\0';
729 }
730
731 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
732 if (!xattr)
733 goto out;
734
735 spin_lock(&inode->i_lock);
736retry:
737 issued = __ceph_caps_issued(ci, NULL);
738 if (!(issued & CEPH_CAP_XATTR_EXCL))
739 goto do_sync;
740 __build_xattrs(inode);
741
742 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
743
744 if (!ci->i_xattrs.prealloc_blob ||
745 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
746 struct ceph_buffer *blob = NULL;
747
748 spin_unlock(&inode->i_lock);
749 dout(" preaallocating new blob size=%d\n", required_blob_size);
750 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
751 if (!blob)
752 goto out;
753 spin_lock(&inode->i_lock);
754 if (ci->i_xattrs.prealloc_blob)
755 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
756 ci->i_xattrs.prealloc_blob = blob;
757 goto retry;
758 }
759
760 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
761 err = __set_xattr(ci, newname, name_len, newval,
762 val_len, 1, 1, 1, &xattr);
763 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
764 ci->i_xattrs.dirty = true;
765 inode->i_ctime = CURRENT_TIME;
766 spin_unlock(&inode->i_lock);
767
768 return err;
769
770do_sync:
771 spin_unlock(&inode->i_lock);
772 err = ceph_sync_setxattr(dentry, name, value, size, flags);
773out:
774 kfree(newname);
775 kfree(newval);
776 kfree(xattr);
777 return err;
778}
779
780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{
782 struct ceph_client *client = ceph_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode;
786 struct ceph_mds_request *req;
787 int err;
788
789 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
790 USE_AUTH_MDS);
791 if (IS_ERR(req))
792 return PTR_ERR(req);
793 req->r_inode = igrab(inode);
794 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
795 req->r_num_caps = 1;
796 req->r_path2 = kstrdup(name, GFP_NOFS);
797
798 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
799 ceph_mdsc_put_request(req);
800 return err;
801}
802
803int ceph_removexattr(struct dentry *dentry, const char *name)
804{
805 struct inode *inode = dentry->d_inode;
806 struct ceph_inode_info *ci = ceph_inode(inode);
807 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
808 int issued;
809 int err;
810
811 if (ceph_snap(inode) != CEPH_NOSNAP)
812 return -EROFS;
813
814 if (!ceph_is_valid_xattr(name))
815 return -EOPNOTSUPP;
816
817 if (vxattrs) {
818 struct ceph_vxattr_cb *vxattr =
819 ceph_match_vxattr(vxattrs, name);
820 if (vxattr && vxattr->readonly)
821 return -EOPNOTSUPP;
822 }
823
824 spin_lock(&inode->i_lock);
825 __build_xattrs(inode);
826 issued = __ceph_caps_issued(ci, NULL);
827 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
828
829 if (!(issued & CEPH_CAP_XATTR_EXCL))
830 goto do_sync;
831
832 err = __remove_xattr_by_name(ceph_inode(inode), name);
833 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
834 ci->i_xattrs.dirty = true;
835 inode->i_ctime = CURRENT_TIME;
836
837 spin_unlock(&inode->i_lock);
838
839 return err;
840do_sync:
841 spin_unlock(&inode->i_lock);
842 err = ceph_send_removexattr(dentry, name);
843 return err;
844}
845
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b1d61d0bdfc7..78e4d2a3a68b 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -15,6 +15,7 @@
15#include <linux/dcache.h> 15#include <linux/dcache.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/vfs.h> 19#include <linux/vfs.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include "cifsglob.h" 21#include "cifsglob.h"
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 8ec7736ce954..310d12f69a92 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h>
23#include <linux/string.h> 24#include <linux/string.h>
24#include <keys/user-type.h> 25#include <keys/user-type.h>
25#include <linux/key-type.h> 26#include <linux/key-type.h>
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 714a542cbafc..d07676bd76d2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -19,6 +19,7 @@
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/slab.h>
22#include "cifs_unicode.h" 23#include "cifs_unicode.h"
23#include "cifs_uniupr.h" 24#include "cifs_uniupr.h"
24#include "cifspdu.h" 25#include "cifspdu.h"
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7dfe0842a6f6..9b716d044bbd 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/slab.h>
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
27#include "cifsacl.h" 28#include "cifsacl.h"
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7efe1745494d..fbe986430d0c 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/slab.h>
23#include "cifspdu.h" 24#include "cifspdu.h"
24#include "cifsglob.h" 25#include "cifsglob.h"
25#include "cifs_debug.h" 26#include "cifs_debug.h"
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c6a03627176..5183bc2a1916 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -312,6 +312,7 @@ cifs_alloc_inode(struct super_block *sb)
312 cifs_inode->clientCanCacheRead = false; 312 cifs_inode->clientCanCacheRead = false;
313 cifs_inode->clientCanCacheAll = false; 313 cifs_inode->clientCanCacheAll = false;
314 cifs_inode->delete_pending = false; 314 cifs_inode->delete_pending = false;
315 cifs_inode->invalid_mapping = false;
315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 316 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
316 cifs_inode->server_eof = 0; 317 cifs_inode->server_eof = 0;
317 318
@@ -638,7 +639,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
638 setting the revalidate time to zero */ 639 setting the revalidate time to zero */
639 CIFS_I(file->f_path.dentry->d_inode)->time = 0; 640 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
640 641
641 retval = cifs_revalidate(file->f_path.dentry); 642 retval = cifs_revalidate_file(file);
642 if (retval < 0) 643 if (retval < 0)
643 return (loff_t)retval; 644 return (loff_t)retval;
644 } 645 }
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 78c1b86d55f6..7aa57ecdc437 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,8 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
61extern int cifs_rmdir(struct inode *, struct dentry *); 61extern int cifs_rmdir(struct inode *, struct dentry *);
62extern int cifs_rename(struct inode *, struct dentry *, struct inode *, 62extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
63 struct dentry *); 63 struct dentry *);
64extern int cifs_revalidate(struct dentry *); 64extern int cifs_revalidate_file(struct file *filp);
65extern int cifs_revalidate_dentry(struct dentry *);
65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 66extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
66extern int cifs_setattr(struct dentry *, struct iattr *); 67extern int cifs_setattr(struct dentry *, struct iattr *);
67 68
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a1c817eb291a..ecf0ffbe2b64 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
18 */ 18 */
19#include <linux/in.h> 19#include <linux/in.h>
20#include <linux/in6.h> 20#include <linux/in6.h>
21#include <linux/slab.h>
21#include <linux/slow-work.h> 22#include <linux/slow-work.h>
22#include "cifs_fs_sb.h" 23#include "cifs_fs_sb.h"
23#include "cifsacl.h" 24#include "cifsacl.h"
@@ -389,6 +390,7 @@ struct cifsInodeInfo {
389 bool clientCanCacheRead:1; /* read oplock */ 390 bool clientCanCacheRead:1; /* read oplock */
390 bool clientCanCacheAll:1; /* read and writebehind oplock */ 391 bool clientCanCacheAll:1; /* read and writebehind oplock */
391 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 392 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
393 bool invalid_mapping:1; /* pagecache is invalid */
392 u64 server_eof; /* current file size on server */ 394 u64 server_eof; /* current file size on server */
393 u64 uniqueid; /* server inode number */ 395 u64 uniqueid; /* server inode number */
394 struct inode vfs_inode; 396 struct inode vfs_inode;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 88e2bc44ac58..39e47f46dea5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -104,10 +104,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
104extern struct inode *cifs_iget(struct super_block *sb, 104extern struct inode *cifs_iget(struct super_block *sb,
105 struct cifs_fattr *fattr); 105 struct cifs_fattr *fattr);
106 106
107extern int cifs_get_file_info(struct file *filp);
107extern int cifs_get_inode_info(struct inode **pinode, 108extern int cifs_get_inode_info(struct inode **pinode,
108 const unsigned char *search_path, 109 const unsigned char *search_path,
109 FILE_ALL_INFO *pfile_info, 110 FILE_ALL_INFO *pfile_info,
110 struct super_block *sb, int xid, const __u16 *pfid); 111 struct super_block *sb, int xid, const __u16 *pfid);
112extern int cifs_get_file_info_unix(struct file *filp);
111extern int cifs_get_inode_info_unix(struct inode **pinode, 113extern int cifs_get_inode_info_unix(struct inode **pinode,
112 const unsigned char *search_path, 114 const unsigned char *search_path,
113 struct super_block *sb, int xid); 115 struct super_block *sb, int xid);
@@ -142,6 +144,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
142extern int CIFSFindClose(const int, struct cifsTconInfo *tcon, 144extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
143 const __u16 search_handle); 145 const __u16 search_handle);
144 146
147extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
148 u16 netfid, FILE_ALL_INFO *pFindData);
145extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 149extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
146 const unsigned char *searchName, 150 const unsigned char *searchName,
147 FILE_ALL_INFO *findData, 151 FILE_ALL_INFO *findData,
@@ -152,6 +156,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
152 FILE_ALL_INFO *findData, 156 FILE_ALL_INFO *findData,
153 const struct nls_table *nls_codepage, int remap); 157 const struct nls_table *nls_codepage, int remap);
154 158
159extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
160 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
155extern int CIFSSMBUnixQPathInfo(const int xid, 161extern int CIFSSMBUnixQPathInfo(const int xid,
156 struct cifsTconInfo *tcon, 162 struct cifsTconInfo *tcon,
157 const unsigned char *searchName, 163 const unsigned char *searchName,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 611835899844..3f4fbd670507 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -30,6 +30,7 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/vfs.h> 32#include <linux/vfs.h>
33#include <linux/slab.h>
33#include <linux/posix_acl_xattr.h> 34#include <linux/posix_acl_xattr.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include "cifspdu.h" 36#include "cifspdu.h"
@@ -500,7 +501,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
500 } else if (pSMBr->hdr.WordCount == 13) { 501 } else if (pSMBr->hdr.WordCount == 13) {
501 cERROR(1, ("mount failed, cifs module not built " 502 cERROR(1, ("mount failed, cifs module not built "
502 "with CIFS_WEAK_PW_HASH support")); 503 "with CIFS_WEAK_PW_HASH support"));
503 rc = -EOPNOTSUPP; 504 rc = -EOPNOTSUPP;
504#endif /* WEAK_PW_HASH */ 505#endif /* WEAK_PW_HASH */
505 goto neg_err_exit; 506 goto neg_err_exit;
506 } else if (pSMBr->hdr.WordCount != 17) { 507 } else if (pSMBr->hdr.WordCount != 17) {
@@ -3230,8 +3231,72 @@ QInfRetry:
3230 return rc; 3231 return rc;
3231} 3232}
3232 3233
3234int
3235CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
3236 u16 netfid, FILE_ALL_INFO *pFindData)
3237{
3238 struct smb_t2_qfi_req *pSMB = NULL;
3239 struct smb_t2_qfi_rsp *pSMBr = NULL;
3240 int rc = 0;
3241 int bytes_returned;
3242 __u16 params, byte_count;
3233 3243
3244QFileInfoRetry:
3245 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3246 (void **) &pSMBr);
3247 if (rc)
3248 return rc;
3234 3249
3250 params = 2 /* level */ + 2 /* fid */;
3251 pSMB->t2.TotalDataCount = 0;
3252 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3253 /* BB find exact max data count below from sess structure BB */
3254 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3255 pSMB->t2.MaxSetupCount = 0;
3256 pSMB->t2.Reserved = 0;
3257 pSMB->t2.Flags = 0;
3258 pSMB->t2.Timeout = 0;
3259 pSMB->t2.Reserved2 = 0;
3260 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3261 Fid) - 4);
3262 pSMB->t2.DataCount = 0;
3263 pSMB->t2.DataOffset = 0;
3264 pSMB->t2.SetupCount = 1;
3265 pSMB->t2.Reserved3 = 0;
3266 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3267 byte_count = params + 1 /* pad */ ;
3268 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3269 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3270 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
3271 pSMB->Pad = 0;
3272 pSMB->Fid = netfid;
3273 pSMB->hdr.smb_buf_length += byte_count;
3274
3275 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3276 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3277 if (rc) {
3278 cFYI(1, ("Send error in QPathInfo = %d", rc));
3279 } else { /* decode response */
3280 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3281
3282 if (rc) /* BB add auto retry on EOPNOTSUPP? */
3283 rc = -EIO;
3284 else if (pSMBr->ByteCount < 40)
3285 rc = -EIO; /* bad smb */
3286 else if (pFindData) {
3287 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3288 memcpy((char *) pFindData,
3289 (char *) &pSMBr->hdr.Protocol +
3290 data_offset, sizeof(FILE_ALL_INFO));
3291 } else
3292 rc = -ENOMEM;
3293 }
3294 cifs_buf_release(pSMB);
3295 if (rc == -EAGAIN)
3296 goto QFileInfoRetry;
3297
3298 return rc;
3299}
3235 3300
3236int 3301int
3237CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 3302CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
@@ -3335,6 +3400,75 @@ QPathInfoRetry:
3335} 3400}
3336 3401
3337int 3402int
3403CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
3404 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
3405{
3406 struct smb_t2_qfi_req *pSMB = NULL;
3407 struct smb_t2_qfi_rsp *pSMBr = NULL;
3408 int rc = 0;
3409 int bytes_returned;
3410 __u16 params, byte_count;
3411
3412UnixQFileInfoRetry:
3413 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3414 (void **) &pSMBr);
3415 if (rc)
3416 return rc;
3417
3418 params = 2 /* level */ + 2 /* fid */;
3419 pSMB->t2.TotalDataCount = 0;
3420 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3421 /* BB find exact max data count below from sess structure BB */
3422 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3423 pSMB->t2.MaxSetupCount = 0;
3424 pSMB->t2.Reserved = 0;
3425 pSMB->t2.Flags = 0;
3426 pSMB->t2.Timeout = 0;
3427 pSMB->t2.Reserved2 = 0;
3428 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3429 Fid) - 4);
3430 pSMB->t2.DataCount = 0;
3431 pSMB->t2.DataOffset = 0;
3432 pSMB->t2.SetupCount = 1;
3433 pSMB->t2.Reserved3 = 0;
3434 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3435 byte_count = params + 1 /* pad */ ;
3436 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3437 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3438 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
3439 pSMB->Pad = 0;
3440 pSMB->Fid = netfid;
3441 pSMB->hdr.smb_buf_length += byte_count;
3442
3443 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3444 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3445 if (rc) {
3446 cFYI(1, ("Send error in QPathInfo = %d", rc));
3447 } else { /* decode response */
3448 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3449
3450 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3451 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
3452 "Unix Extensions can be disabled on mount "
3453 "by specifying the nosfu mount option."));
3454 rc = -EIO; /* bad smb */
3455 } else {
3456 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3457 memcpy((char *) pFindData,
3458 (char *) &pSMBr->hdr.Protocol +
3459 data_offset,
3460 sizeof(FILE_UNIX_BASIC_INFO));
3461 }
3462 }
3463
3464 cifs_buf_release(pSMB);
3465 if (rc == -EAGAIN)
3466 goto UnixQFileInfoRetry;
3467
3468 return rc;
3469}
3470
3471int
3338CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon, 3472CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
3339 const unsigned char *searchName, 3473 const unsigned char *searchName,
3340 FILE_UNIX_BASIC_INFO *pFindData, 3474 FILE_UNIX_BASIC_INFO *pFindData,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 45eb6cba793f..d9566bf8f917 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,6 +23,7 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/wait.h> 25#include <linux/wait.h>
26#include <linux/slab.h>
26#include <linux/pagemap.h> 27#include <linux/pagemap.h>
27#include <linux/ctype.h> 28#include <linux/ctype.h>
28#include <linux/utsname.h> 29#include <linux/utsname.h>
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6ccf7262d1b7..e9f7ecc2714b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -739,7 +739,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
739 int isValid = 1; 739 int isValid = 1;
740 740
741 if (direntry->d_inode) { 741 if (direntry->d_inode) {
742 if (cifs_revalidate(direntry)) 742 if (cifs_revalidate_dentry(direntry))
743 return 0; 743 return 0;
744 } else { 744 } else {
745 cFYI(1, ("neg dentry 0x%p name = %s", 745 cFYI(1, ("neg dentry 0x%p name = %s",
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 87948147d7ec..6f8a0e3fb25b 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -23,6 +23,7 @@
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */ 24 */
25 25
26#include <linux/slab.h>
26#include <keys/user-type.h> 27#include <keys/user-type.h>
27#include "dns_resolve.h" 28#include "dns_resolve.h"
28#include "cifsglob.h" 29#include "cifsglob.h"
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3d8f8a96f5a3..058b390d3da8 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -31,6 +31,7 @@
31#include <linux/task_io_accounting_ops.h> 31#include <linux/task_io_accounting_ops.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/mount.h> 33#include <linux/mount.h>
34#include <linux/slab.h>
34#include <asm/div64.h> 35#include <asm/div64.h>
35#include "cifsfs.h" 36#include "cifsfs.h"
36#include "cifspdu.h" 37#include "cifspdu.h"
@@ -219,8 +220,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
219 cFYI(1, ("inode unchanged on server")); 220 cFYI(1, ("inode unchanged on server"));
220 } else { 221 } else {
221 if (file->f_path.dentry->d_inode->i_mapping) { 222 if (file->f_path.dentry->d_inode->i_mapping) {
222 /* BB no need to lock inode until after invalidate 223 /* BB no need to lock inode until after invalidate
223 since namei code should already have it locked? */ 224 since namei code should already have it locked? */
224 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 225 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
225 if (rc != 0) 226 if (rc != 0)
226 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 227 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
@@ -1890,11 +1891,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1890 1891
1891int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) 1892int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1892{ 1893{
1893 struct dentry *dentry = file->f_path.dentry;
1894 int rc, xid; 1894 int rc, xid;
1895 1895
1896 xid = GetXid(); 1896 xid = GetXid();
1897 rc = cifs_revalidate(dentry); 1897 rc = cifs_revalidate_file(file);
1898 if (rc) { 1898 if (rc) {
1899 cFYI(1, ("Validation prior to mmap failed, error=%d", rc)); 1899 cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
1900 FreeXid(xid); 1900 FreeXid(xid);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8bdbc818164c..35ec11716213 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -20,6 +20,7 @@
  */
 #include <linux/fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
@@ -77,6 +78,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 	}
 }
 
+/* check inode attributes against fattr. If they don't match, tag the
+ * inode for cache invalidation
+ */
+static void
+cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
+{
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid));
+
+	if (inode->i_state & I_NEW) {
+		cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid));
+		return;
+	}
+
+	/* don't bother with revalidation if we have an oplock */
+	if (cifs_i->clientCanCacheRead) {
+		cFYI(1, ("%s: inode %llu is oplocked", __func__,
+			 cifs_i->uniqueid));
+		return;
+	}
+
+	/* revalidate if mtime or size have changed */
+	if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+	    cifs_i->server_eof == fattr->cf_eof) {
+		cFYI(1, ("%s: inode %llu is unchanged", __func__,
+			 cifs_i->uniqueid));
+		return;
+	}
+
+	cFYI(1, ("%s: invalidating inode %llu mapping", __func__,
+		 cifs_i->uniqueid));
+	cifs_i->invalid_mapping = true;
+}
+
 /* populate an inode with info from a cifs_fattr struct */
 void
 cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
@@ -85,6 +121,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	unsigned long oldtime = cifs_i->time;
 
+	cifs_revalidate_cache(inode, fattr);
+
 	inode->i_atime = fattr->cf_atime;
 	inode->i_mtime = fattr->cf_mtime;
 	inode->i_ctime = fattr->cf_ctime;
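Note: the new cifs_revalidate_cache() helper flags the mapping invalid only when the server's mtime or end-of-file differs from what the client cached, and skips the check entirely under a read oplock. A minimal standalone model of that decision, with invented field names standing in for cifsInodeInfo state:

    /* Standalone sketch of the staleness test above; the struct and
     * names are invented for illustration, not the cifs ones. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    struct cached_attrs {
        struct timespec mtime;   /* mtime we last saw from the server */
        uint64_t eof;            /* size we last saw from the server */
        bool oplocked;           /* server promised our cache is good */
        bool invalid_mapping;    /* set when pagecache must be zapped */
    };

    static bool ts_equal(const struct timespec *a, const struct timespec *b)
    {
        return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
    }

    /* mirrors the hunk's logic: skip if oplocked, invalidate only
     * when mtime or size moved under us */
    static void revalidate_cache(struct cached_attrs *c,
                                 const struct timespec *srv_mtime,
                                 uint64_t srv_eof)
    {
        if (c->oplocked)
            return;
        if (ts_equal(&c->mtime, srv_mtime) && c->eof == srv_eof)
            return;
        c->invalid_mapping = true;
    }

    int main(void)
    {
        struct cached_attrs c = { .eof = 100 };
        struct timespec srv = { .tv_sec = 1 };  /* server mtime moved */

        revalidate_cache(&c, &srv, 100);
        printf("invalid_mapping=%d\n", c.invalid_mapping);  /* 1 */
        return 0;
    }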
@@ -231,6 +269,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 	fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
 }
 
+int cifs_get_file_info_unix(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_UNIX_BASIC_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (!rc) {
+		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	}
+
+	cifs_fattr_to_inode(inode, &fattr);
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info_unix(struct inode **pinode,
 			     const unsigned char *full_path,
 			     struct super_block *sb, int xid)
@@ -432,6 +495,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_gid = cifs_sb->mnt_gid;
 }
 
+int cifs_get_file_info(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_ALL_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (rc == -EOPNOTSUPP || rc == -EINVAL) {
+		/*
+		 * FIXME: legacy server -- fall back to path-based call?
+		 * for now, just skip revalidating and mark inode for
+		 * immediate reval.
+		 */
+		rc = 0;
+		CIFS_I(inode)->time = 0;
+		goto cgfi_exit;
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	} else if (rc)
+		goto cgfi_exit;
+
+	/*
+	 * don't bother with SFU junk here -- just mark inode as needing
+	 * revalidation.
+	 */
+	cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
+	fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
+	fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
+	cifs_fattr_to_inode(inode, &fattr);
+cgfi_exit:
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info(struct inode **pinode,
 	const unsigned char *full_path, FILE_ALL_INFO *pfindData,
 	struct super_block *sb, int xid, const __u16 *pfid)
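Note: cifs_get_file_info() treats -EOPNOTSUPP or -EINVAL from the handle-based query as "legacy server": it reports success but zeroes the attribute timestamp so the next lookup falls back to a path-based revalidation. A compact model of that error-mapping policy, with a hypothetical query_by_handle() standing in for the SMB call:

    /* Sketch of the error-mapping policy in cifs_get_file_info();
     * query_by_handle() is a stand-in, not a real cifs call. */
    #include <errno.h>
    #include <stdio.h>

    static long attr_cache_time = 12345;  /* models cifsInodeInfo->time */

    static int query_by_handle(int simulate_rc)
    {
        return simulate_rc;
    }

    static int get_file_info(int simulate_rc)
    {
        int rc = query_by_handle(simulate_rc);

        if (rc == -EOPNOTSUPP || rc == -EINVAL) {
            /* legacy server: succeed now, force a path-based
             * revalidation on the next lookup */
            attr_cache_time = 0;
            return 0;
        }
        if (rc == -EREMOTE)   /* DFS referral: fake up attrs, succeed */
            return 0;
        return rc;            /* any other error is passed through */
    }

    int main(void)
    {
        int rc = get_file_info(-EOPNOTSUPP);

        printf("rc=%d time=%ld\n", rc, attr_cache_time);  /* rc=0 time=0 */
        return 0;
    }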
@@ -1389,135 +1493,103 @@ cifs_rename_exit:
 	return rc;
 }
 
-int cifs_revalidate(struct dentry *direntry)
+static bool
+cifs_inode_needs_reval(struct inode *inode)
 {
-	int xid;
-	int rc = 0, wbrc = 0;
-	char *full_path;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsInodeInfo *cifsInode;
-	loff_t local_size;
-	struct timespec local_mtime;
-	bool invalidate_inode = false;
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 
-	if (direntry->d_inode == NULL)
-		return -ENOENT;
+	if (cifs_i->clientCanCacheRead)
+		return false;
 
-	cifsInode = CIFS_I(direntry->d_inode);
+	if (!lookupCacheEnabled)
+		return true;
 
-	if (cifsInode == NULL)
-		return -ENOENT;
+	if (cifs_i->time == 0)
+		return true;
 
-	/* no sense revalidating inode info on file that no one can write */
-	if (CIFS_I(direntry->d_inode)->clientCanCacheRead)
-		return rc;
+	/* FIXME: the actimeo should be tunable */
+	if (time_after_eq(jiffies, cifs_i->time + HZ))
+		return true;
+
+	return false;
+}
+
+/* check invalid_mapping flag and zap the cache if it's set */
+static void
+cifs_invalidate_mapping(struct inode *inode)
+{
+	int rc;
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cifs_i->invalid_mapping = false;
+
+	/* write back any cached data */
+	if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
+		rc = filemap_write_and_wait(inode->i_mapping);
+		if (rc)
+			cifs_i->write_behind_rc = rc;
+	}
+	invalidate_remote_inode(inode);
+}
+
+int cifs_revalidate_file(struct file *filp)
+{
+	int rc = 0;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+	if (!cifs_inode_needs_reval(inode))
+		goto check_inval;
+
+	if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
+		rc = cifs_get_file_info_unix(filp);
+	else
+		rc = cifs_get_file_info(filp);
+
+check_inval:
+	if (CIFS_I(inode)->invalid_mapping)
+		cifs_invalidate_mapping(inode);
+
+	return rc;
+}
+
+/* revalidate a dentry's inode attributes */
+int cifs_revalidate_dentry(struct dentry *dentry)
+{
+	int xid;
+	int rc = 0;
+	char *full_path = NULL;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dentry->d_sb;
+
+	if (inode == NULL)
+		return -ENOENT;
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(direntry->d_sb);
+	if (!cifs_inode_needs_reval(inode))
+		goto check_inval;
 
 	/* can not safely grab the rename sem here if rename calls revalidate
 	   since that would deadlock */
-	full_path = build_path_from_dentry(direntry);
+	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
-	}
-	cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
-		 "jiffies %ld", full_path, direntry->d_inode,
-		 direntry->d_inode->i_count.counter, direntry,
-		 direntry->d_time, jiffies));
-
-	if (cifsInode->time == 0) {
-		/* was set to zero previously to force revalidate */
-	} else if (time_before(jiffies, cifsInode->time + HZ) &&
-		   lookupCacheEnabled) {
-		if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
-		    (direntry->d_inode->i_nlink == 1)) {
-			kfree(full_path);
-			FreeXid(xid);
-			return rc;
-		} else {
-			cFYI(1, ("Have to revalidate file due to hardlinks"));
-		}
-	}
-
-	/* save mtime and size */
-	local_mtime = direntry->d_inode->i_mtime;
-	local_size = direntry->d_inode->i_size;
-
-	if (cifs_sb->tcon->unix_ext) {
-		rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
-					      direntry->d_sb, xid);
-		if (rc) {
-			cFYI(1, ("error on getting revalidate info %d", rc));
-/*			if (rc != -ENOENT)
-				rc = 0; */	/* BB should we cache info on
-						   certain errors? */
-		}
-	} else {
-		rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
-					 direntry->d_sb, xid, NULL);
-		if (rc) {
-			cFYI(1, ("error on getting revalidate info %d", rc));
-/*			if (rc != -ENOENT)
-				rc = 0; */	/* BB should we cache info on
-						   certain errors? */
-		}
+		goto check_inval;
 	}
-	/* should we remap certain errors, access denied?, to zero */
 
-	/* if not oplocked, we invalidate inode pages if mtime or file size
-	   had changed on server */
+	cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
+		 "jiffies %ld", full_path, inode, inode->i_count.counter,
+		 dentry, dentry->d_time, jiffies));
 
-	if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) &&
-	    (local_size == direntry->d_inode->i_size)) {
-		cFYI(1, ("cifs_revalidate - inode unchanged"));
-	} else {
-		/* file may have changed on server */
-		if (cifsInode->clientCanCacheRead) {
-			/* no need to invalidate inode pages since we were the
-			   only ones who could have modified the file and the
-			   server copy is staler than ours */
-		} else {
-			invalidate_inode = true;
-		}
-	}
+	if (CIFS_SB(sb)->tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
+	else
+		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
+					 xid, NULL);
 
-	/* can not grab this sem since kernel filesys locking documentation
-	   indicates i_mutex may be taken by the kernel on lookup and rename
-	   which could deadlock if we grab the i_mutex here as well */
-/*	mutex_lock(&direntry->d_inode->i_mutex);*/
-	/* need to write out dirty pages here */
-	if (direntry->d_inode->i_mapping) {
-		/* do we need to lock inode until after invalidate completes
-		   below? */
-		wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
-		if (wbrc)
-			CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
-	}
-	if (invalidate_inode) {
-		/* shrink_dcache not necessary now that cifs dentry ops
-		   are exported for negative dentries */
-/*		if (S_ISDIR(direntry->d_inode->i_mode))
-			shrink_dcache_parent(direntry); */
-		if (S_ISREG(direntry->d_inode->i_mode)) {
-			if (direntry->d_inode->i_mapping) {
-				wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
-				if (wbrc)
-					CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
-			}
-			/* may eventually have to do this for open files too */
-			if (list_empty(&(cifsInode->openFileList))) {
-				/* changed on server - flush read ahead pages */
-				cFYI(1, ("Invalidating read ahead data on "
-					 "closed file"));
-				invalidate_remote_inode(direntry->d_inode);
-			}
-		}
-	}
-/*	mutex_unlock(&direntry->d_inode->i_mutex); */
+check_inval:
+	if (CIFS_I(inode)->invalid_mapping)
+		cifs_invalidate_mapping(inode);
 
 	kfree(full_path);
 	FreeXid(xid);
@@ -1527,7 +1599,7 @@ int cifs_revalidate(struct dentry *direntry)
 int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		 struct kstat *stat)
 {
-	int err = cifs_revalidate(dentry);
+	int err = cifs_revalidate_dentry(dentry);
 	if (!err) {
 		generic_fillattr(dentry->d_inode, stat);
 		stat->blksize = CIFS_MAX_MSGSIZE;
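Note: the rewrite funnels both entry points through cifs_inode_needs_reval(), which is essentially a one-second attribute-cache timeout (the FIXME in the hunk notes the interval should become tunable). A userspace sketch of that timeout test, modelling jiffies with a plain counter and using the kernel's wraparound-safe comparison idiom:

    /* Sketch of the attribute-cache timeout in cifs_inode_needs_reval().
     * HZ/jiffies are modelled with a plain counter; the wraparound-safe
     * comparison matches the kernel's time_after_eq() idiom. */
    #include <stdbool.h>
    #include <stdio.h>

    #define HZ 100UL

    /* true if a is at or after b, robust against counter wraparound */
    static bool time_after_eq(unsigned long a, unsigned long b)
    {
        return (long)(a - b) >= 0;
    }

    static bool needs_reval(unsigned long now, unsigned long stamp,
                            bool oplocked, bool lookup_cache)
    {
        if (oplocked)          /* server vouches for our copy */
            return false;
        if (!lookup_cache)     /* caching disabled: always ask the server */
            return true;
        if (stamp == 0)        /* explicitly forced revalidation */
            return true;
        return time_after_eq(now, stamp + HZ);  /* 1s attribute timeout */
    }

    int main(void)
    {
        printf("%d\n", needs_reval(250, 100, false, true));  /* 1: stale */
        printf("%d\n", needs_reval(150, 100, false, true));  /* 0: fresh */
        return 0;
    }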
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fc1e0487eaee..c1a9d4236a8c 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -20,6 +20,7 @@
  */
 #include <linux/fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c343b14ba2d3..18e0bc1fb593 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -22,6 +22,7 @@
  */
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index aaa9c1c5a5bd..7c3fd7463f44 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,6 +29,7 @@
 #include "ntlmssp.h"
 #include "nterr.h"
 #include <linux/utsname.h>
+#include <linux/slab.h>
 #include "cifs_spnego.h"
 
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 93fb09a99c69..192ea51af20f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -24,6 +24,7 @@
 */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 07b8e71544ee..ad081fe7eb18 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/list.h>
+#include <linux/gfp.h>
 #include <linux/wait.h>
 #include <linux/net.h>
 #include <linux/delay.h>
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 3e2ef0de1209..f555ce077d4f 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -21,6 +21,7 @@
 
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4bb9d0a5decc..ccd98b0f2b0b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/time.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ffd42815fda1..4c813f2cdc52 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 #include <linux/coda.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 830f51abb971..a1695dcadd99 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -18,6 +18,7 @@
 #include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/vfs.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c274d949179d..f09c5ed76f6c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -26,6 +26,7 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
diff --git a/fs/compat.c b/fs/compat.c
index 030602d453b7..4b6ed03cc478 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -49,6 +49,7 @@
 #include <linux/mm.h>
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6d55b61bfa79..c32a1b6a856b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,6 @@
 #include <linux/ioctl.h>
 #include <linux/if.h>
 #include <linux/if_bridge.h>
-#include <linux/slab.h>
 #include <linux/raid/md_u.h>
 #include <linux/kd.h>
 #include <linux/route.h>
@@ -60,6 +59,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
 #include <linux/atalk.h>
+#include <linux/gfp.h>
 
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci.h>
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a2f746066c5d..c8af2d91174b 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -34,6 +34,7 @@
 #include <linux/capability.h>
 #include <linux/sched.h>
 #include <linux/lockdep.h>
+#include <linux/slab.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8421cea7d8c7..8c8d64230c2d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -29,6 +29,7 @@
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 32a5f46b1157..0f3eb41d9201 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 049d6c36da09..30a87b3dbcac 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,6 +27,7 @@
 #include <linux/fsnotify.h>
 #include <linux/string.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8882ecc0f1bf..0120247b41c0 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/tty.h>
 #include <linux/mutex.h>
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0df243850818..b54bca03d92f 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
+#include <linux/slab.h>
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <net/ipv6.h>
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 29d6139c35fc..c6cf25158746 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
+#include <linux/slab.h>
 
 #include "dlm_internal.h"
 #include "lock.h"
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 46ffd3eeaaf7..17903b491298 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -56,6 +56,7 @@
  L: receive_xxxx_reply() <- R: send_xxxx_reply()
 */
 #include <linux/types.h>
+#include <linux/slab.h>
 #include "dlm_internal.h"
 #include <linux/dlm_device.h>
 #include "memory.h"
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 52cab160893c..c0d35c620526 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -51,6 +51,7 @@
 #include <linux/file.h>
 #include <linux/mutex.h>
 #include <linux/sctp.h>
+#include <linux/slab.h>
 #include <net/sctp/user.h>
 #include <net/ipv6.h>
 
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 052095cd592f..2c6ad518100d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -9,6 +9,7 @@
 #include <net/genetlink.h>
 #include <linux/dlm.h>
 #include <linux/dlm_netlink.h>
+#include <linux/gfp.h>
 
 #include "dlm_internal.h"
 
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index b5f89aef3b29..d45c02db6943 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -11,6 +11,7 @@
 #include <linux/poll.h>
 #include <linux/dlm.h>
 #include <linux/dlm_plock.h>
+#include <linux/slab.h>
 
 #include "dlm_internal.h"
 #include "lockspace.h"
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index a4bfd31ac45b..8b6e73c47435 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
+#include <linux/slab.h>
 
 #include "dlm_internal.h"
 #include "lockspace.h"
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 7cb0a59f4b9d..efb2b9400391 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
 #include <linux/crypto.h>
 #include <linux/file.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8f006a0d6076..906e803f7f79 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -26,6 +26,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/fs_stack.h>
+#include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
 /**
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 678172b61be2..e7440a6f5ebf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -25,6 +25,7 @@
 
 #include <linux/file.h>
 #include <linux/poll.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/security.h>
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4a430ab4115c..d3362faf3852 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/mount.h>
 #include <linux/crypto.h>
 #include <linux/fs_stack.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index a0a7847567e9..89c5476506ef 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -32,6 +32,7 @@
 #include <linux/random.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
 /**
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e14cf7e588db..d8c3a373aafa 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -22,6 +22,7 @@
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/mount.h>
 #include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index ea2f92101dfe..af1a8f01ebac 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
 #include <linux/key.h>
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
+#include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
 /**
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index f1c17e87c5fb..2d8dbce9d485 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -20,6 +20,7 @@
  * 02111-1307, USA.
  */
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/user_namespace.h>
 #include <linux/nsproxy.h>
 #include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 4ec8f61ccf5a..3745f612bcd4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -24,6 +24,7 @@
 #include <linux/random.h>
 #include <linux/miscdevice.h>
 #include <linux/poll.h>
+#include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/module.h>
 #include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index df4ce99d0597..d491237c98e7 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..fcef41c1d2cf 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -26,6 +26,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/key.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/smp_lock.h>
 #include <linux/file.h>
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 7758cc382ef0..6bd3f76fdf88 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -11,6 +11,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a17e4b733e35..76d2a79ef93e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -31,6 +31,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
 #include <scsi/scsi_device.h>
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5293bc411d17..4337cad7777b 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -22,6 +22,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <linux/slab.h>
 #include <scsi/scsi_device.h>
 #include <asm/div64.h>
 
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 6cf5e4e84d61..18e57ea1e5b4 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -37,6 +37,7 @@
 #include <linux/vfs.h>
 #include <linux/random.h>
 #include <linux/exportfs.h>
+#include <linux/slab.h>
 
 #include "exofs.h"
 
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 1d081f0cfec2..3cf038c055d7 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -13,6 +13,7 @@
 
 #include "ext2.h"
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c8155845ac05..b118c6383c6d 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/ext2_fs.h>
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 161da2d3f890..a177122a1b25 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -14,6 +14,7 @@
 #include <linux/time.h>
 #include <linux/capability.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/jbd.h>
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index ef9008b885b5..0d0e97ed3ff6 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -582,7 +582,9 @@ got:
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
 
-	ei->i_state = EXT3_STATE_NEW;
+	ei->i_state_flags = 0;
+	ext3_set_inode_state(inode, EXT3_STATE_NEW);
+
 	ei->i_extra_isize =
 		(EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
 		sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
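Note: instead of assigning EXT3_STATE_NEW directly, the state word is now cleared and the bit set through an accessor, matching the rename of i_state to i_state_flags seen in the fs/ext3/inode.c hunk below. A toy version of that set/test accessor pattern (the names mimic ext3's, but this is not the kernel implementation, which uses atomic set_bit/test_bit):

    /* Toy model of the i_state_flags + set/test accessor pattern. */
    #include <stdio.h>

    enum { STATE_NEW, STATE_XATTR };  /* bit numbers, like EXT3_STATE_* */

    struct toy_inode_info {
        unsigned long i_state_flags;
    };

    static void set_inode_state(struct toy_inode_info *ei, int bit)
    {
        ei->i_state_flags |= 1UL << bit;  /* kernel uses set_bit() */
    }

    static int test_inode_state(struct toy_inode_info *ei, int bit)
    {
        return !!(ei->i_state_flags & (1UL << bit));
    }

    int main(void)
    {
        struct toy_inode_info ei = { .i_state_flags = 0 };

        set_inode_state(&ei, STATE_NEW);
        printf("new=%d xattr=%d\n",
               test_inode_state(&ei, STATE_NEW),
               test_inode_state(&ei, STATE_XATTR));
        return 0;
    }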
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 7f920b7263a4..ea33bdf0a300 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2811,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
 	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
 
-	ei->i_state = 0;
+	ei->i_state_flags = 0;
 	ei->i_dir_start_lookup = 0;
 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
 	/* We now have enough fields to check if the inode was active or not.
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 474348788dd9..3af91f476dff 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/ext3_jbd.h>
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 983f0e127493..538c48655084 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -18,6 +18,7 @@
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include "ext4.h"
 
 struct ext4_system_zone {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 361c0b9962a8..57f6eef6ccd6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -263,7 +263,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 			ext4_group_t f;
 
 			f = ext4_flex_group(sbi, block_group);
-			atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+			atomic_dec(&sbi->s_flex_groups[f].used_dirs);
 		}
 
 	}
@@ -773,7 +773,7 @@ static int ext4_claim_inode(struct super_block *sb,
 		if (sbi->s_log_groups_per_flex) {
 			ext4_group_t f = ext4_flex_group(sbi, group);
 
-			atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
 		}
 	}
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
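Note: both hunks correct a copy-and-paste slip in the directory branches of inode free/claim: the flex group's used_dirs counter is the one that must move there, while free_inodes is adjusted elsewhere. A minimal single-threaded sketch of keeping the two per-group counters distinct (a simplified stand-in for the kernel's atomic_t fields, not the ext4 code):

    /* Sketch of per-flex-group accounting with the two counters the
     * hunks above distinguish. */
    #include <stdio.h>

    struct flex_group {
        long free_inodes;  /* inodes still available in the group */
        long used_dirs;    /* directories currently allocated */
    };

    static void claim_inode(struct flex_group *fg, int is_dir)
    {
        fg->free_inodes--;     /* every allocation consumes an inode */
        if (is_dir)
            fg->used_dirs++;   /* the fixed line: count dirs here */
    }

    static void free_inode(struct flex_group *fg, int is_dir)
    {
        fg->free_inodes++;
        if (is_dir)
            fg->used_dirs--;
    }

    int main(void)
    {
        struct flex_group fg = { .free_inodes = 8192, .used_dirs = 0 };

        claim_inode(&fg, 1);
        free_inode(&fg, 1);
        printf("free=%ld dirs=%ld\n", fg.free_inodes, fg.used_dirs);
        return 0;
    }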
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986120f30066..5381802d6052 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,6 +39,7 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -1035,7 +1036,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 					      sector_t lblock)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
+	sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
 	int blk_bits;
 
 	if (lblock < EXT4_NDIR_BLOCKS)
@@ -1050,7 +1051,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 	}
 	ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
 	ei->i_da_metadata_calc_len = 1;
-	blk_bits = roundup_pow_of_two(lblock + 1);
+	blk_bits = order_base_2(lblock);
 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
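Note: the two fixes above go together. The mask must be the inverted low bits, held in a type wide enough for sector_t, so that "lblock & dind_mask" rounds the logical block down to an indirect-block boundary; and blk_bits wants the base-2 order of lblock, not the rounded-up power of two itself. A sketch of both, with a stand-in ADDR_PER_BLOCK value (assumed here, not taken from a superblock):

    /* Sketch of the mask and order_base_2 fixes; ADDR_PER_BLOCK is a
     * hypothetical constant standing in for EXT4_ADDR_PER_BLOCK(sb). */
    #include <stdint.h>
    #include <stdio.h>

    #define ADDR_PER_BLOCK 1024ULL  /* block pointers per block, e.g. 4096/4 */

    static unsigned order_base_2(uint64_t n)  /* smallest b with 2^b >= n */
    {
        unsigned b = 0;

        while ((1ULL << b) < n)
            b++;
        return b;
    }

    int main(void)
    {
        uint64_t lblock = 5000000;
        uint64_t dind_mask = ~(ADDR_PER_BLOCK - 1);  /* inverted low bits */

        /* rounds DOWN to a boundary; the un-inverted mask would have
         * kept only the low bits instead */
        printf("boundary=%llu order=%u\n",
               (unsigned long long)(lblock & dind_mask),
               order_base_2(lblock));
        return 0;
    }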
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 54df209d2eed..bde9d0b170c2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,6 +23,7 @@
 
 #include "mballoc.h"
 #include <linux/debugfs.h>
+#include <linux/slab.h>
 #include <trace/events/ext4.h>
 
 /*
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 8b87bd0eac95..34dcfc52ef44 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index aa5fe28d180f..d1fc662cc311 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -15,6 +15,7 @@
 
 #include <linux/fs.h>
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 #include "ext4.h"
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ba191dae8730..e14d22c170d5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
 static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
+static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data, struct vfsmount *mnt);
 
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext3",
+	.get_sb		= ext4_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#else
+#define IS_EXT3_SB(sb) (0)
+#endif
 
 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 			       struct ext4_group_desc *bg)
@@ -2539,7 +2553,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * enable delayed allocation by default
 	 * Use -o nodelalloc to turn it off
 	 */
-	set_opt(sbi->s_mount_opt, DELALLOC);
+	if (!IS_EXT3_SB(sb))
+		set_opt(sbi->s_mount_opt, DELALLOC);
 
 	if (!parse_options((char *) data, sb, &journal_devnum,
 			   &journal_ioprio, NULL, 0))
@@ -4068,7 +4083,7 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
-#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext2",
@@ -4095,15 +4110,7 @@ static inline void register_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
 #endif
 
-#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext3_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ext3",
-	.get_sb		= ext4_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static inline void register_as_ext3(void)
 {
 	int err = register_filesystem(&ext3_fs_type);
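Note: moving ext3_fs_type above ext4_fill_super() lets the new IS_EXT3_SB() macro detect that a superblock was mounted through the ext3 compatibility alias, by comparing the block device's holder pointer against the filesystem type's address, so delayed allocation stays off for those mounts (the CONTIG_* spellings were also typos for CONFIG_*). A shape-only sketch of the tag-and-branch idea, with invented stand-in types:

    /* Shape-only sketch of the IS_EXT3_SB() idea: tag a mounted
     * superblock with the type it came in through, branch on it later. */
    #include <stdio.h>

    struct fs_type { const char *name; };

    static struct fs_type ext3_alias = { "ext3" };

    struct superblock { const struct fs_type *holder; };

    #define IS_EXT3_SB(sb) ((sb)->holder == &ext3_alias)

    int main(void)
    {
        struct superblock sb = { .holder = &ext3_alias };

        /* mirrors the fill_super hunk: default delalloc only for
         * native ext4 mounts, never for the ext3 compatibility alias */
        if (!IS_EXT3_SB(&sb))
            printf("delalloc on\n");
        else
            printf("delalloc off (ext3 alias)\n");
        return 0;
    }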
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 983c253999a7..8b145e98df07 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -7,6 +7,7 @@
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/security.h>
+#include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "xattr.h"
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 923990e4f16e..113f0a1e565d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include "fat.h"
 
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c1ef50154868..6fcc7e71fbaa 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 {
 	struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
 	wchar_t *ip, *ext_start, *end, *name_start;
-	unsigned char base[9], ext[4], buf[8], *p;
+	unsigned char base[9], ext[4], buf[5], *p;
 	unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
 	int chl, chi;
 	int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 		return 0;
 	}
 
-	i = jiffies & 0xffff;
+	i = jiffies;
 	sz = (jiffies >> 16) & 0x7;
 	if (baselen > 2) {
 		baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 	name_res[baselen + 4] = '~';
 	name_res[baselen + 5] = '1' + sz;
 	while (1) {
-		sprintf(buf, "%04X", i);
+		snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
 		memcpy(&name_res[baselen], buf, 4);
 		if (vfat_find_form(dir, name_res) < 0)
 			break;
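Note: "%04X" pads to four hex digits but never truncates, so a full int can emit up to eight digits plus the NUL. The fix shrinks buf to exactly 4 digits + NUL, keeps the counter as a plain jiffies copy, and bounds the output by masking at print time through snprintf(). The overflow and the fix in miniature:

    /* Demo of the bound the hunk enforces: without the mask,
     * sprintf(buf, "%04X", i) would write 8 digits plus a NUL here,
     * a 9-byte store into a 5-byte buffer. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int i = 0xDEADBEEF;  /* stand-in for a jiffies copy */
        char buf[5];                  /* 4 hex digits + NUL, as in the fix */

        snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
        printf("%s\n", buf);          /* BEEF: bounded by the mask */
        return 0;
    }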
diff --git a/fs/fifo.c b/fs/fifo.c
index f8f97b8b6d44..5d6606ffc2d2 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/pipe_fs_i.h>
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a24c58e181db..68ba492d8eef 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -10,10 +10,10 @@
 #include <linux/fs.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 /*
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index ed8f0b0dd880..1429f3ae1e86 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -33,7 +33,6 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 
 #include "vxfs_extern.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76fc4d594acb..781a322ccb45 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 3221a0c7944e..1e1f286dd70e 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -12,6 +12,7 @@
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/key.h>
 #include <keys/user-type.h>
 #include "internal.h"
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8e..0b589a9b4ffc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
 static void fscache_object_slow_work_put_ref(struct slow_work *);
 static int fscache_object_slow_work_get_ref(struct slow_work *);
 static void fscache_object_slow_work_execute(struct slow_work *);
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
 #endif
 static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
 	.get_ref = fscache_object_slow_work_get_ref,
 	.put_ref = fscache_object_slow_work_put_ref,
 	.execute = fscache_object_slow_work_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	.desc = fscache_object_slow_work_desc,
 #endif
 };
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
 /*
  * describe an object for slow-work debugging
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 static void fscache_object_slow_work_desc(struct slow_work *work,
 					  struct seq_file *m)
 {
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a14266..f17cecafae44 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -14,6 +14,7 @@
 #define FSCACHE_DEBUG_LEVEL OPERATION
 #include <linux/module.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 atomic_t fscache_op_debug_id;
@@ -500,7 +501,7 @@ static void fscache_op_execute(struct slow_work *work)
 /*
  * describe an operation for slow-work debugging
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
 {
 	struct fscache_operation *op =
@@ -517,7 +518,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
 	.get_ref = fscache_op_get_ref,
 	.put_ref = fscache_op_put_ref,
 	.execute = fscache_op_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	.desc = fscache_op_desc,
 #endif
 };
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..47aefd376e54 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -14,6 +14,7 @@
 #include <linux/fscache-cache.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 /*
@@ -881,6 +882,7 @@ submit_failed:
 	goto nobufs;
 
 nobufs_unlock_obj:
+	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 nobufs:
 	spin_unlock(&cookie->lock);
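Note: the one-line fscache/page.c fix adds the release of cookie->stores_lock that the nobufs_unlock_obj error path was missing, so the label now drops every lock still held at the jump site, innermost first. A minimal model of that goto-unwind idiom, with pthread mutexes standing in for the kernel spinlocks:

    /* Minimal model of the goto-unwind idiom the fix restores. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;

    static int try_store(int fail_inner)
    {
        int rc = 0;

        pthread_mutex_lock(&outer);
        pthread_mutex_lock(&inner);
        if (fail_inner) {
            rc = -1;
            goto unlock_both;   /* inner is held: must release it too */
        }
        /* ... work under both locks ... */
    unlock_both:
        pthread_mutex_unlock(&inner);  /* the analogue of the added line */
        pthread_mutex_unlock(&outer);
        return rc;
    }

    int main(void)
    {
        printf("rc=%d\n", try_store(1));
        return 0;
    }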
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de792dcf3274..e1f8171278bd 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -44,6 +44,7 @@
 #include <linux/magic.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/stat.h>
 
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 55458031e501..fe5df5457656 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/generic_acl.h>
 #include <linux/posix_acl.h>
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 583e823307ae..5e411d5f4697 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 91beddadd388..bb7907bde3d8 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d15876e9aa26..c22c21174833 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 38e3749d476c..49f97d3bb690 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 569b46240f61..0e0470ed34c2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -9,6 +9,7 @@
 
 #include <linux/fs.h>
 #include <linux/dlm.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/gfs2_ondisk.h>
 
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4106ddaaa98..f07119d89557 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -10,6 +10,8 @@
 #ifndef __RGRP_DOT_H__
 #define __RGRP_DOT_H__
 
+#include <linux/slab.h>
+
 struct gfs2_rgrpd;
 struct gfs2_sbd;
 struct gfs2_holder;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 419042f7f0b6..54fd98425991 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -8,7 +8,6 @@
  */
 
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 226f2bfbf16a..53511291fe36 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 0d200068d0af..cdb41a1f6a64 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 052f214ea6f0..38a0a9917d7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/log2.h> 13#include <linux/log2.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8bbe03c3f6d5..86428f5ac991 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -11,6 +11,7 @@
 #include <linux/cdrom.h>
 #include <linux/genhd.h>
 #include <linux/nls.h>
+#include <linux/slab.h>
 
 #include "hfs_fs.h"
 #include "btree.h"
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 5ed7252b7b23..0a81eb7111f3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
 #include <linux/nls.h>
 #include <linux/parser.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/vfs.h>
 
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 3fcbb0e1f6fc..572628b4b07d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -15,6 +15,7 @@
 #include <linux/nls.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include "hfsplus_fs.h"
 
 enum {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 032604e5ef2c..3a029d8f4cf1 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -11,6 +11,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/statfs.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include "hostfs.h"
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index b6fca543544c..eac5f96323e3 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -6,6 +6,7 @@
  * general buffer i/o
  */
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "hpfs_fn.h"
 
 void hpfs_lock_creation(struct super_block *s)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 26e3964a4b8c..2338130cceba 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "hpfs_fn.h"
 
 static int hpfs_dir_release(struct inode *inode, struct file *filp)
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index ff90affb94e1..1042a9bc97f3 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "hpfs_fn.h"
 
 void hpfs_init_inode(struct inode *i)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index cadc4ce48656..aa53842c599c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
 #include <linux/bitmap.h>
+#include <linux/slab.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c7c0b28d7d21..748cfb92dcc6 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -19,6 +19,7 @@
  * See also Documentation/block/ioprio.txt
  *
  */
+#include <linux/gfp.h>
 #include <linux/kernel.h>
 #include <linux/ioprio.h>
 #include <linux/blkdev.h>
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 8ba5441063be..b9ab69b3a482 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -11,6 +11,7 @@
  * isofs directory handling functions
  */
 #include <linux/smp_lock.h>
+#include <linux/gfp.h>
 #include "isofs.h"
 
 int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index eaa831311c9c..ab438beb867c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/gfp.h>
 #include "isofs.h"
 
 /*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 2c90e3ef625f..ecb44c94ba8d 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -17,7 +17,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index cb1a49ae605e..54c9bc9e1b17 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,7 +20,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #endif
 
 /*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 73063285b13f..049281b7cb89 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -20,7 +20,6 @@
 #include <linux/fs.h>
 #include <linux/jbd2.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/crc32.h>
 #endif
 
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 90cb60d09787..cd02acafde8a 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -11,7 +11,6 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
 #include <linux/lzo.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index cfd301a5edfc..b46661a42758 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -14,7 +14,6 @@
 #endif
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/zlib.h>
 #include <linux/zutil.h>
 #include "nodelist.h"
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 5544d31c066b..ec3538413926 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -15,6 +15,7 @@
 #include <linux/crc32.h>
 #include <linux/jffs2.h>
 #include <linux/mtd/mtd.h>
+#include <linux/slab.h>
 #include "nodelist.h"
 #include "debug.h"
 
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index b7b74e299142..e7291c161a19 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/time.h>
 #include <linux/pagemap.h>
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 87c6f555e1a0..af02bd138469 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -15,7 +15,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/rbtree.h>
 #include <linux/crc32.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include "nodelist.h"
 
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 21a052915aa9..191359dde4e1 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/mtd/mtd.h>
 #include <linux/compiler.h>
 #include <linux/sched.h> /* For cond_resched() */
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 4ec11e8bda8c..b955626071c2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include "nodelist.h"
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index ca29440e9435..c819eb0e982d 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/crc32.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/mtd/mtd.h>
 #include "nodelist.h"
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 213169780b6c..1057a4998e4e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
 #include "jfs_incore.h"
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d9b031cf69f5..6c4dfcbf3f55 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_dmap.h"
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0e4623be70ce..9197a1b0d02d 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -102,6 +102,7 @@
 
 #include <linux/fs.h>
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_filsys.h"
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0fc30407f039..f8332dc8eeb2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -45,6 +45,7 @@
 #include <linux/buffer_head.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 
 #include "jfs_incore.h"
 #include "jfs_inode.h"
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 335c4de6552d..c51af2a14516 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -70,6 +70,7 @@
 #include <linux/delay.h>
 #include <linux/mutex.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
 #include "jfs_metapage.h"
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 07b6c5dfb4b6..48b44bd8267b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/buffer_head.h>
 #include <linux/mempool.h>
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 3fbb3a225590..8f0f02cb6ca6 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -19,6 +19,7 @@
 #ifndef _H_JFS_UNICODE
 #define _H_JFS_UNICODE
 
+#include <linux/slab.h>
 #include <asm/byteorder.h>
 #include "jfs_types.h"
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 266699deb1c6..157382fa6256 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -30,6 +30,7 @@
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 #include <linux/smp_lock.h>
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 1f594ab21895..fa96bbb26343 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
 #include <linux/quotaops.h>
 #include <linux/security.h>
 #include "jfs_incore.h"
diff --git a/fs/libfs.c b/fs/libfs.c
index 9e50bcf55857..ea9a6cc9b35c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -5,6 +5,7 @@
 
 #include <linux/module.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
 #include <linux/mutex.h>
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index fc9032dc8862..64fd427c993c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/types.h>
+#include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/nfs_fs.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c81249fef11f..7932c399fab4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index fefa4df3f005..e3015464fbab 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
 #include <linux/utsname.h>
 #include <linux/kernel.h>
 #include <linux/ktime.h>
+#include <linux/slab.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/xprtsock.h>
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 7d150517ddf0..f1bacf1a0391 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -21,7 +21,6 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/uio.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/mutex.h>
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a7966eed3c17..031c6569a134 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
 
 #include <linux/types.h>
 #include <linux/time.h>
-#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d1001790fa9a..84055d31bfc5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 56c9519d900a..0f2ab741ae7c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
 
 #include <linux/types.h>
 #include <linux/time.h>
-#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index ad478da7ca63..d0ef94cfb3da 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -10,6 +10,7 @@
 #include <linux/string.h>
 #include <linux/time.h>
 #include <linux/in.h>
+#include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9718c22f186d..243c00071f76 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -9,6 +9,7 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 
 #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
 
@@ -80,6 +81,7 @@ static void writeseg_end_io(struct bio *bio, int err)
                 prefetchw(&bvec->bv_page->flags);
 
                 end_page_writeback(page);
+                page_cache_release(page);
         } while (bvec >= bio->bi_io_vec);
         bio_put(bio);
         if (atomic_dec_and_test(&super->s_pending_writes))
@@ -97,8 +99,10 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
         unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
         int i;
 
+        if (max_pages > BIO_MAX_PAGES)
+                max_pages = BIO_MAX_PAGES;
         bio = bio_alloc(GFP_NOFS, max_pages);
-        BUG_ON(!bio); /* FIXME: handle this */
+        BUG_ON(!bio);
 
         for (i = 0; i < nr_pages; i++) {
                 if (i >= max_pages) {
@@ -191,8 +195,10 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
         unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
         int i;
 
+        if (max_pages > BIO_MAX_PAGES)
+                max_pages = BIO_MAX_PAGES;
         bio = bio_alloc(GFP_NOFS, max_pages);
-        BUG_ON(!bio); /* FIXME: handle this */
+        BUG_ON(!bio);
 
         for (i = 0; i < nr_pages; i++) {
                 if (i >= max_pages) {
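A note on the two clamping hunks above: bio_alloc() cannot return a bio that
holds more than BIO_MAX_PAGES bvecs, so a page count derived from
queue_max_hw_sectors() must be capped before the allocation. A minimal sketch
of the pattern in isolation (alloc_capped_bio is an illustrative name, not
part of the patch):

        #include <linux/bio.h>
        #include <linux/blkdev.h>

        static struct bio *alloc_capped_bio(struct request_queue *q, gfp_t gfp)
        {
                /* Size the bio from the device limits... */
                unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);

                /* ...but never ask bio_alloc() for more than it can hold. */
                if (max_pages > BIO_MAX_PAGES)
                        max_pages = BIO_MAX_PAGES;
                return bio_alloc(gfp, max_pages);
        }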
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 56a8bfbb0120..2396a85c0f55 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -6,7 +6,7 @@
  * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
  */
 #include "logfs.h"
-
+#include <linux/slab.h>
 
 /*
  * Atomic dir operations
@@ -303,12 +303,12 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
                                 (filler_t *)logfs_readpage, NULL);
                 if (IS_ERR(page))
                         return PTR_ERR(page);
-                dd = kmap_atomic(page, KM_USER0);
+                dd = kmap(page);
                 BUG_ON(dd->namelen == 0);
 
                 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
                                 pos, be64_to_cpu(dd->ino), dd->type);
-                kunmap_atomic(dd, KM_USER0);
+                kunmap(page);
                 page_cache_release(page);
                 if (full)
                         break;
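The __logfs_readdir() hunk above trades kmap_atomic() for plain kmap(). The
likely reason (inferred from the code, not stated in the patch): filldir()
copies the dirent to user space and may fault and sleep, which is forbidden
while an atomic kmap is held. With the sleepable mapping the sequence is safe:

        dd = kmap(page);                /* may sleep, unlike kmap_atomic() */
        full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
                        pos, be64_to_cpu(dd->ino), dd->type);
        kunmap(page);                   /* note: takes the page, not dd */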
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index 92949f95a901..84e36f52fe95 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -7,6 +7,7 @@
  */
 #include "logfs.h"
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 /*
  * Wear leveling needs to kick in when the difference between low erase
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 33ec1aeaeec4..14ed27274da2 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -6,6 +6,7 @@
  * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
  */
 #include "logfs.h"
+#include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 6ad30a4c9052..33bd260b8309 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -6,6 +6,7 @@
  * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
  */
 #include "logfs.h"
+#include <linux/slab.h>
 
 static void logfs_calc_free(struct super_block *sb)
 {
@@ -800,6 +801,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
 {
         struct logfs_super *super = logfs_super(sb);
         struct logfs_area *area = super->s_journal_area;
+        struct btree_head32 *head = &super->s_reserved_segments;
         u32 segno, ec;
         int i, err;
 
@@ -807,6 +809,7 @@
         /* Drop old segments */
         journal_for_each(i)
                 if (super->s_journal_seg[i]) {
+                        btree_remove32(head, super->s_journal_seg[i]);
                         logfs_set_segment_unreserved(sb,
                                         super->s_journal_seg[i],
                                         super->s_journal_ec[i]);
@@ -819,8 +822,13 @@
                 super->s_journal_seg[i] = segno;
                 super->s_journal_ec[i] = ec;
                 logfs_set_segment_reserved(sb, segno);
+                err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
+                BUG_ON(err); /* mempool should prevent this */
+                err = logfs_erase_segment(sb, segno, 1);
+                BUG_ON(err); /* FIXME: remount-ro would be nicer */
         }
         /* Manually move journal_area */
+        freeseg(sb, area->a_segno);
         area->a_segno = super->s_journal_seg[0];
         area->a_is_open = 0;
         area->a_used_bytes = 0;
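The do_logfs_journal_wl_pass() hunks above keep every live journal segment in
the s_reserved_segments btree: a segment is removed from the tree when it is
retired and inserted when it is newly reserved, so the allocator never hands a
journal segment out twice. The BUG_ON() after btree_insert32() leans on the
patch's own comment that the btree is mempool-backed, so the GFP_KERNEL insert
is not expected to fail. The bookkeeping reduced to its core (old_segno and
new_segno are illustrative names):

        struct btree_head32 *head = &super->s_reserved_segments;

        btree_remove32(head, old_segno);        /* segment leaves the journal */
        err = btree_insert32(head, new_segno, (void *)1, GFP_KERNEL);
        BUG_ON(err);                            /* mempool should prevent this */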
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 129779431373..b84b0eec6024 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -587,6 +587,7 @@ void move_page_to_btree(struct page *page);
 int logfs_init_mapping(struct super_block *sb);
 void logfs_sync_area(struct logfs_area *area);
 void logfs_sync_segments(struct super_block *sb);
+void freeseg(struct super_block *sb, u32 segno);
 
 /* area handling */
 int logfs_init_areas(struct super_block *sb);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 7a23b3e7c0a7..bff40253dfb2 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -18,6 +18,7 @@
  */
 #include "logfs.h"
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 static u64 adjust_bix(u64 bix, level_t level)
 {
@@ -1594,7 +1595,6 @@
         return ret;
 }
 
-/* Rewrite cannot mark the inode dirty but has to write it immediatly. */
 int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
                 gc_level_t gc_level, long flags)
 {
@@ -1611,6 +1611,18 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
                 if (level != 0)
                         alloc_indirect_block(inode, page, 0);
                 err = logfs_write_buf(inode, page, flags);
+                if (!err && shrink_level(gc_level) == 0) {
+                        /* Rewrite cannot mark the inode dirty but has to
+                         * write it immediatly.
+                         * Q: Can't we just create an alias for the inode
+                         * instead?  And if not, why not?
+                         */
+                        if (inode->i_ino == LOGFS_INO_MASTER)
+                                logfs_write_anchor(inode->i_sb);
+                        else {
+                                err = __logfs_write_inode(inode, flags);
+                        }
+                }
         }
         logfs_put_write_page(page);
         return err;
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 1a14f9910d55..801a3a141625 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -10,6 +10,7 @@
  * three kinds of objects: inodes, dentries and blocks, both data and indirect.
  */
 #include "logfs.h"
+#include <linux/slab.h>
 
 static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
 {
@@ -93,50 +94,58 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
         } while (len);
 }
 
-/*
- * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
- */
-static void pad_wbuf(struct logfs_area *area, int final)
+static void pad_partial_page(struct logfs_area *area)
 {
         struct super_block *sb = area->a_sb;
-        struct logfs_super *super = logfs_super(sb);
         struct page *page;
         u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
         pgoff_t index = ofs >> PAGE_SHIFT;
         long offset = ofs & (PAGE_SIZE-1);
         u32 len = PAGE_SIZE - offset;
 
-        if (len == PAGE_SIZE) {
-                /* The math in this function can surely use some love */
-                len = 0;
-        }
-        if (len) {
-                BUG_ON(area->a_used_bytes >= super->s_segsize);
-
-                page = get_mapping_page(area->a_sb, index, 0);
+        if (len % PAGE_SIZE) {
+                page = get_mapping_page(sb, index, 0);
                 BUG_ON(!page); /* FIXME: reserve a pool */
                 memset(page_address(page) + offset, 0xff, len);
                 SetPagePrivate(page);
                 page_cache_release(page);
         }
+}
 
-        if (!final)
-                return;
+static void pad_full_pages(struct logfs_area *area)
+{
+        struct super_block *sb = area->a_sb;
+        struct logfs_super *super = logfs_super(sb);
+        u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+        u32 len = super->s_segsize - area->a_used_bytes;
+        pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
+        pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
+        struct page *page;
 
-        area->a_used_bytes += len;
-        for ( ; area->a_used_bytes < super->s_segsize;
-                        area->a_used_bytes += PAGE_SIZE) {
-                /* Memset another page */
-                index++;
-                page = get_mapping_page(area->a_sb, index, 0);
+        while (no_indizes) {
+                page = get_mapping_page(sb, index, 0);
                 BUG_ON(!page); /* FIXME: reserve a pool */
-                memset(page_address(page), 0xff, PAGE_SIZE);
+                SetPageUptodate(page);
+                memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
                 SetPagePrivate(page);
                 page_cache_release(page);
+                index++;
+                no_indizes--;
         }
 }
 
 /*
+ * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
+ * Also make sure we allocate (and memset) all pages for final writeout.
+ */
+static void pad_wbuf(struct logfs_area *area, int final)
+{
+        pad_partial_page(area);
+        if (final)
+                pad_full_pages(area);
+}
+
+/*
  * We have to be careful with the alias tree. Since lookup is done by bix,
  * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
  * indirect blocks. So always use it through accessor functions.
@@ -683,7 +692,7 @@ int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
         return 0;
 }
 
-static void freeseg(struct super_block *sb, u32 segno)
+void freeseg(struct super_block *sb, u32 segno)
 {
         struct logfs_super *super = logfs_super(sb);
         struct address_space *mapping = super->s_mapping_inode->i_mapping;
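The large fs/logfs/segment.c hunk above splits the old pad_wbuf() into two
helpers with one job each: pad_partial_page() memsets the tail of a partly
filled page (the len % PAGE_SIZE test replaces the old special-casing of a
page-aligned write position), while pad_full_pages() allocates and 0xff-fills
whole pages up to the segment end, which is only needed for the final
writeout. The recombined entry point is now trivial to read:

        static void pad_wbuf(struct logfs_area *area, int final)
        {
                pad_partial_page(area);         /* always: no stale tail bytes */
                if (final)
                        pad_full_pages(area);   /* only when closing the segment */
        }

The last hunk also un-statics freeseg(), matching the declaration added to
logfs.h above, so that journal.c can call it.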
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index c66beab78dee..b60bfac3263c 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -11,6 +11,7 @@
  */
 #include "logfs.h"
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/mtd/mtd.h>
 #include <linux/statfs.h>
 #include <linux/buffer_head.h>
@@ -277,7 +278,7 @@ static int logfs_recover_sb(struct super_block *sb)
         }
         if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
                 printk(KERN_INFO"Superblocks don't match - fixing.\n");
-                return write_one_sb(sb, super->s_devops->find_last_sb);
+                return logfs_write_sb(sb);
         }
         /* If neither is valid now, something's wrong. Didn't we properly
          * check them before?!? */
@@ -289,6 +290,10 @@ static int logfs_make_writeable(struct super_block *sb)
 {
         int err;
 
+        err = logfs_open_segfile(sb);
+        if (err)
+                return err;
+
         /* Repair any broken superblock copies */
         err = logfs_recover_sb(sb);
         if (err)
@@ -299,10 +304,6 @@
         if (err)
                 return err;
 
-        err = logfs_open_segfile(sb);
-        if (err)
-                return err;
-
         /* Do one GC pass before any data gets dirtied */
         logfs_gc_pass(sb);
 
@@ -328,7 +329,7 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
 
         sb->s_root = d_alloc_root(rootdir);
         if (!sb->s_root)
-                goto fail;
+                goto fail2;
 
         super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
         if (!super->s_erase_page)
@@ -572,8 +573,7 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
         return 0;
 
 err1:
-        up_write(&sb->s_umount);
-        deactivate_super(sb);
+        deactivate_locked_super(sb);
         return err;
 err0:
         kfree(super);
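Two observations on the fs/logfs/super.c hunks above. logfs_make_writeable()
now calls logfs_open_segfile() before logfs_recover_sb() rather than after;
the patch does not say why, but presumably the recovery path (which now goes
through logfs_write_sb()) needs the segfile open. And the error path in
logfs_get_sb_device() swaps an open-coded pair for the helper that performs
the same steps:

        err1:
                /* sb->s_umount is still held here; deactivate_locked_super()
                 * drops the lock and the active reference in one call. */
                deactivate_locked_super(sb);
                return err;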
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 82d6554b02fe..282e15ad8cd8 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -1,4 +1,5 @@
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include "minix.h"
 
 enum {DEPTH = 3, DIRECT = 7};	/* Only double indirect */
diff --git a/fs/mpage.c b/fs/mpage.c
index 598d54e200eb..fd56ca2ea556 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/kdev_t.h>
+#include <linux/gfp.h>
 #include <linux/bio.h>
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/namei.c b/fs/namei.c
index 1c0fca6e899e..a7dce91a7e42 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1610,8 +1610,7 @@ exit:
 
 static struct file *do_last(struct nameidata *nd, struct path *path,
                             int open_flag, int acc_mode,
-                            int mode, const char *pathname,
-                            int *want_dir)
+                            int mode, const char *pathname)
 {
         struct dentry *dir = nd->path.dentry;
         struct file *filp;
@@ -1642,7 +1641,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
         if (nd->last.name[nd->last.len]) {
                 if (open_flag & O_CREAT)
                         goto exit;
-                *want_dir = 1;
+                nd->flags |= LOOKUP_DIRECTORY;
         }
 
         /* just plain open? */
@@ -1656,8 +1655,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                 if (path->dentry->d_inode->i_op->follow_link)
                         return NULL;
                 error = -ENOTDIR;
-                if (*want_dir && !path->dentry->d_inode->i_op->lookup)
-                        goto exit_dput;
+                if (nd->flags & LOOKUP_DIRECTORY) {
+                        if (!path->dentry->d_inode->i_op->lookup)
+                                goto exit_dput;
+                }
                 path_to_nameidata(path, nd);
                 audit_inode(pathname, nd->path.dentry);
                 goto ok;
@@ -1766,7 +1767,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
         int count = 0;
         int flag = open_to_namei_flags(open_flag);
         int force_reval = 0;
-        int want_dir = open_flag & O_DIRECTORY;
 
         if (!(open_flag & O_CREAT))
                 mode = 0;
@@ -1828,7 +1828,9 @@ reval:
                 if (open_flag & O_EXCL)
                         nd.flags |= LOOKUP_EXCL;
         }
-        filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
+        if (open_flag & O_DIRECTORY)
+                nd.flags |= LOOKUP_DIRECTORY;
+        filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
         while (unlikely(!filp)) { /* trailing symlink */
                 struct path holder;
                 struct inode *inode = path.dentry->d_inode;
@@ -1866,7 +1868,7 @@ reval:
                 }
                 holder = path;
                 nd.flags &= ~LOOKUP_PARENT;
-                filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
+                filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
                 if (inode->i_op->put_link)
                         inode->i_op->put_link(holder.dentry, &nd, cookie);
                 path_put(&holder);
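The fs/namei.c hunks above retire the want_dir out-parameter in favour of the
LOOKUP_DIRECTORY bit in nd.flags. The practical difference: the "result must
be a directory" requirement, whether it comes from O_DIRECTORY or from a
trailing slash, now travels with the nameidata itself, so the trailing-symlink
retry loop, which re-enters do_last() with the same nd, sees it too. The two
places the flag is set:

        /* In do_filp_open(): the caller asked for a directory. */
        if (open_flag & O_DIRECTORY)
                nd.flags |= LOOKUP_DIRECTORY;

        /* In do_last(): a trailing slash demands one as well. */
        if (nd->last.name[nd->last.len])
                nd->flags |= LOOKUP_DIRECTORY;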
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b8b5b30d53f0..7edfcd4d5e52 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -15,7 +15,6 @@
 #include <linux/errno.h>
 #include <linux/stat.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <asm/uaccess.h>
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6a7d901f1936..1daabb90e0a5 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -15,7 +15,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index ec8f45f12e05..60a5e2864ea8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <linux/highuid.h>
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 15458decdb8a..56f5b3a0e1ee 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -9,12 +9,12 @@
 #include <linux/stat.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/shm.h>
 #include <linux/errno.h>
 #include <linux/mman.h>
 #include <linux/string.h>
-#include <linux/slab.h>
 #include <linux/fcntl.h>
 #include <linux/ncp_fs.h>
 
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index e37df8d5fe70..c7ff6c700a6e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/netdevice.h>
 #include <linux/signal.h>
+#include <linux/slab.h>
 #include <net/scm.h>
 #include <net/sock.h>
 #include <linux/ipx.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e3d26c1bd105..c634fd17b337 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/ncp_fs.h>
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include "ncplib_kernel.h"
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index b4ffd0146ea6..84690319e625 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -10,6 +10,7 @@
 #include <linux/moduleparam.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/sunrpc/cache.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 84761b5bb8e2..a08770a7e857 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -7,6 +7,7 @@
  */
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include "nfs4_fs.h"
 #include "callback.h"
 #include "delegation.h"
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a2b8b4df125d..05af212f0edf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include "nfs4_fs.h"
 #include "callback.h"
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2274f1737336..2a3d352c0bff 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -35,6 +35,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 #include <linux/sunrpc/bc_xprt.h>
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 2563bebc4c67..15671245c6ee 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0d289823e856..ad4cd31d6050 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -44,6 +44,7 @@
 #include <linux/file.h>
 #include <linux/pagemap.h>
 #include <linux/kref.h>
+#include <linux/slab.h>
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 3f0cd4dfddaf..76fd235d0024 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -9,6 +9,7 @@
 #include <linux/hash.h>
 #include <linux/string.h>
 #include <linux/kmod.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/socket.h>
 #include <linux/seq_file.h>
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ae8d02294e46..8d965bddb87e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -24,9 +24,9 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/aio.h>
+#include <linux/gfp.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -491,7 +491,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 {
         dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
-        if (gfp & __GFP_WAIT)
+        /* Only do I/O if gfp is a superset of GFP_KERNEL */
+        if ((gfp & GFP_KERNEL) == GFP_KERNEL)
                 nfs_wb_page(page->mapping->host, page);
         /* If PagePrivate() is set, then the page is not freeable */
         if (PagePrivate(page))
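The nfs_release_page() hunk above tightens the "may we start I/O?" test. In
this kernel GFP_KERNEL is (__GFP_WAIT | __GFP_IO | __GFP_FS), so requiring gfp
to be a superset of GFP_KERNEL demands all three: the caller may sleep, may do
block I/O, and may re-enter the filesystem. Testing __GFP_WAIT alone would let
allocation contexts that forbid fs recursion (GFP_NOFS, GFP_NOIO) kick off
writeback and risk deadlock:

        /* Only do I/O if gfp is a superset of GFP_KERNEL */
        if ((gfp & GFP_KERNEL) == GFP_KERNEL)
                nfs_wb_page(page->mapping->host, page);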
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 237874f1af23..a6b16ed93229 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -17,6 +17,7 @@
 #include <linux/nfs_fs_sb.h>
 #include <linux/in6.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 
 #include "internal.h"
 #include "iostat.h"
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e358df75a6ad..737128f777f3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/nfs_xdr.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 40c766782891..7888cf36022d 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/dcache.h>
+#include <linux/gfp.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 7bc2da8efd4a..81cf14257916 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -12,7 +12,6 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index bac60515a4b3..d150ae0c5ecd 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -1,4 +1,5 @@
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/nfs.h>
 #include <linux/nfs3.h>
 #include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 24992f0a29f2..e701002694e5 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -10,6 +10,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/slab.h>
 #include <linux/nfs.h>
 #include <linux/nfs3.h>
 #include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 5fe5492fbd29..56a86f6ac8b5 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -9,7 +9,6 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index fa3408f20112..f071d12c613b 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/vfs.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f9254fb0c9d0..d79a7b37e56c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -39,6 +39,7 @@
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs.h>
 #include <linux/nfs4.h>
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4d338be492cb..38f3b582e7c2 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -38,7 +38,6 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
@@ -5552,6 +5551,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
         if (status != 0)
                 goto out;
         status = decode_delegreturn(&xdr);
+        if (status != 0)
+                goto out;
         decode_getfattr(&xdr, res->fattr, res->server,
                         !RPC_IS_ASYNC(rqstp->rq_task));
 out:
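The nfs4_xdr_dec_delegreturn() hunk above adds a missing status check. An
NFSv4 server stops processing a compound at the first failing operation, so
when DELEGRETURN fails there are no GETATTR results in the reply and decoding
them would walk off the real data. The guard pattern, as applied:

        status = decode_delegreturn(&xdr);
        if (status != 0)
                goto out;       /* compound stopped here; no GETATTR follows */
        decode_getfattr(&xdr, res->fattr, res->server,
                        !RPC_IS_ASYNC(rqstp->rq_task));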
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c752d944fe9e..0288be80444f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -29,7 +29,6 @@
 
 #include <linux/types.h>
 #include <linux/param.h>
-#include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 6baf9a393466..e01637240eeb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -48,6 +48,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <net/ipv6.h>
 #include <linux/netdevice.h>
 #include <linux/nfs_xdr.h>
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 2ea9e5c27e55..05c9e02f4153 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -19,7 +19,6 @@
 #include <linux/pagemap.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/namei.h>
 
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 04133aacb1e5..fc1c52571c03 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -22,6 +22,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/nfsacl.h>
 #include <linux/nfs3.h>
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a0c4016413f1..872a5ef550c7 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -12,6 +12,7 @@
  * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/module.h>
 #include <linux/exportfs.h>
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index f20589d2ae27..6aa5590c3679 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -7,6 +7,7 @@
 #include "nfsd.h"
 /* FIXME: nfsacl.h is a broken header */
 #include <linux/nfsacl.h>
+#include <linux/gfp.h>
 #include "cache.h"
 #include "xdr3.h"
 #include "vfs.h"
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index e0c4846bad92..a596e9d987e4 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -7,6 +7,7 @@
 #include "nfsd.h"
 /* FIXME: nfsacl.h is a broken header */
 #include <linux/nfsacl.h>
+#include <linux/gfp.h>
 #include "cache.h"
 #include "xdr3.h"
 #include "vfs.h"
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 88150685df34..e48052615159 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -34,6 +34,7 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <linux/slab.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs4_acl.h>
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4bc22c763de7..7e32bd394e86 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,7 @@
  */
 
 #include <linux/sunrpc/clnt.h>
+#include <linux/slab.h>
 #include "nfsd.h"
 #include "state.h"
 
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6e2983b27f3c..c78dbf493424 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
 #include <linux/nfsd_idmap.h>
 #include <linux/seq_file.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 /*
  * Cache entry
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 37514c469846..2ab9e8501bfe 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -33,6 +33,7 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 #include <linux/file.h>
+#include <linux/slab.h>
 
 #include "cache.h"
 #include "xdr4.h"
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 98fb98e330b4..7a9ae3254a4b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,6 +32,7 @@
 */
 
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/crypto.h>
 #include <linux/sched.h>
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c97fddbd17db..6a8fedaa4f55 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -34,6 +34,7 @@
 
 #include <linux/file.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/swap.h>
 #include <linux/sunrpc/svcauth_gss.h>
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c47b4d7bafa7..e1703175ee28 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -40,6 +40,7 @@
  * at the end of nfs4svc_decode_compoundargs.
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/statfs.h>
 #include <linux/utsname.h>
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index da08560c4818..4666a209678a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -8,6 +8,8 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
+
 #include "nfsd.h"
 #include "cache.h"
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 0f0e77f2012f..e3591073098f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -4,6 +4,7 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/ctype.h>
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a11b0e8678ee..6dd5f1970e01 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -25,6 +25,7 @@
 #include <linux/xattr.h>
 #include <linux/jhash.h>
 #include <linux/ima.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 3f959f1879d8..8d6356a804f3 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -26,6 +26,7 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include "mdt.h"
 #include "alloc.h"
 
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 471e269536ae..447ce47a3306 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -27,6 +27,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mm.h>
 #include <linux/backing-dev.h>
+#include <linux/gfp.h>
 #include "nilfs.h"
 #include "mdt.h"
 #include "dat.h"
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 8880a9e281e7..145f03cd7d3e 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -45,6 +45,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/hash.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include "nilfs.h"
 #include "page.h"
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7868cc122ac7..0957b58f909d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/writeback.h>
 #include <linux/uio.h>
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 313d0a21da48..c2ff1b306012 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -23,6 +23,7 @@
 #include <linux/fs.h>
 #include <linux/wait.h>
 #include <linux/smp_lock.h>	/* lock_kernel(), unlock_kernel() */
+#include <linux/slab.h>
 #include <linux/capability.h>	/* capable() */
 #include <linux/uaccess.h>	/* copy_from_user(), copy_to_user() */
 #include <linux/vmalloc.h>
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 06713ffcc7f2..024be8c35bb6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -26,6 +26,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
+#include <linux/slab.h>
 #include "nilfs.h"
 #include "segment.h"
 #include "page.h"
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index fc246dba112a..8de3e1e48130 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -29,6 +29,7 @@
 #include <linux/list.h>
 #include <linux/highmem.h>
 #include <linux/pagevec.h>
+#include <linux/gfp.h>
 #include "nilfs.h"
 #include "page.h"
 #include "mdt.h"
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 017bedc761a0..ba43146f3c30 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -23,6 +23,7 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/swap.h>
+#include <linux/slab.h>
 #include <linux/crc32.h>
 #include "nilfs.h"
 #include "segment.h"
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 636eaafd6ea2..17851f77f739 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -25,6 +25,7 @@
 #include <linux/writeback.h>
 #include <linux/crc32.h>
 #include <linux/backing-dev.h>
+#include <linux/slab.h>
 #include "page.h"
 #include "segbuf.h"
 
@@ -323,14 +324,14 @@ int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
 int nilfs_wait_on_logs(struct list_head *logs)
 {
 	struct nilfs_segment_buffer *segbuf;
-	int err;
+	int err, ret = 0;
 
 	list_for_each_entry(segbuf, logs, sb_list) {
 		err = nilfs_segbuf_wait(segbuf);
-		if (err)
-			return err;
+		if (err && !ret)
+			ret = err;
 	}
-	return 0;
+	return ret;
 }
 
 /*
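
The hunk above changes nilfs_wait_on_logs() to wait on every segment buffer and only remember the first error, instead of returning at the first failure and leaving later buffers unwaited. A user-space sketch of the same shape, where job_t and wait_job() are hypothetical stand-ins for the segment buffers and nilfs_segbuf_wait():

/*
 * Sketch only: wait on all pending jobs, keep the first error,
 * never bail out early. wait_job() is assumed to return 0 or -errno.
 */
#include <stddef.h>

typedef struct job { int id; } job_t;

extern int wait_job(job_t *job);	/* hypothetical stand-in */

int wait_all(job_t *jobs, size_t n)
{
	int err, ret = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		err = wait_job(&jobs[i]);
		if (err && !ret)
			ret = err;	/* record first error, keep waiting */
	}
	return ret;
}
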
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 69576a95e13f..6a7dbd8451db 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -32,6 +32,7 @@
 #include <linux/kthread.h>
 #include <linux/crc32.h>
 #include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "nilfs.h"
 #include "btnode.h"
 #include "page.h"
@@ -1510,6 +1511,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 		if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
 			break;
 
+		nilfs_clear_logs(&sci->sc_segbufs);
+
+		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
 		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
 			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
 							sci->sc_freesegs,
@@ -1517,12 +1524,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 							NULL);
 			WARN_ON(err); /* do not happen */
 		}
-		nilfs_clear_logs(&sci->sc_segbufs);
-
-		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
-		if (unlikely(err))
-			return err;
-
 		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
 		sci->sc_stage = prev_stage;
 	}
@@ -1897,8 +1898,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 
 	list_splice_tail_init(&sci->sc_write_logs, &logs);
 	ret = nilfs_wait_on_logs(&logs);
-	if (ret)
-		nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
+	nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err);
 
 	list_splice_tail_init(&sci->sc_segbufs, &logs);
 	nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
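
The `ret ? : err` in the hunk above is the GNU C conditional with an omitted middle operand: `x ? : y` means `x ? x : y`, with `x` evaluated only once, so the abort path passes the wait error when there is one and falls back to the earlier error otherwise. A minimal demonstration (assumes gcc or clang, which both accept the extension):

#include <stdio.h>

int main(void)
{
	int ret = 0, err = -5;

	printf("%d\n", ret ? : err);	/* prints -5: ret is 0, falls back */
	ret = -2;
	printf("%d\n", ret ? : err);	/* prints -2: nonzero ret wins */
	return 0;
}
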
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index e9795f1724d7..1ab974533697 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
+#include <linux/slab.h>
 #include "sb.h"
 
 /* the_nilfs struct */
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 037e878e03fc..fcc2f064af83 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -18,6 +18,7 @@
 
 #include <linux/dcache.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/srcu.h>
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3165d85aada2..0399bcbe09c8 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -87,7 +87,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/writeback.h> /* for inode_lock */
 
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfce53cb65d7..c3c2c7ac9020 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -23,6 +23,7 @@
 
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 50d3b0c258e3..f5094ee224c1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -22,6 +22,7 @@
 
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
 
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 08f7530e9341..6551c7cbad92 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,7 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/vmalloc.h>
+#include <linux/slab.h>
 
 #include "attrib.h"
 #include "inode.h"
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 9173e82a45d1..fe44d3feee4a 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 
 #include "dir.h"
 #include "aops.h"
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index b681c71d7069..8804f093ba75 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -20,6 +20,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
 #include <linux/sched.h>
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 2194eff49743..096c135691ae 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -19,6 +19,8 @@
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/slab.h>
+
 #include "aops.h"
 #include "collate.h"
 #include "debug.h"
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 1caa0ef0b2bb..b572b6727181 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 
 #include "attrib.h"
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2ca00153b6ec..358273e59ade 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -23,6 +23,7 @@
 #include <linux/dcache.h>
 #include <linux/exportfs.h>
 #include <linux/security.h>
+#include <linux/slab.h>
 
 #include "attrib.h"
 #include "debug.h"
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0501974bedd0..e13fc9e8fcdc 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,6 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
@@ -30,6 +31,8 @@
 #include "alloc.h"
 #include "dlmglue.h"
 #include "file.h"
+#include "inode.h"
+#include "journal.h"
 #include "ocfs2_fs.h"
 
 #include "xattr.h"
@@ -166,6 +169,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
 }
 
 /*
+ * Helper function to set i_mode in memory and disk. Some call paths
+ * will not have di_bh or a journal handle to pass, in which case it
+ * will create its own.
+ */
+static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
+			      handle_t *handle, umode_t new_mode)
+{
+	int ret, commit_handle = 0;
+	struct ocfs2_dinode *di;
+
+	if (di_bh == NULL) {
+		ret = ocfs2_read_inode_block(inode, &di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else
+		get_bh(di_bh);
+
+	if (handle == NULL) {
+		handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+					   OCFS2_INODE_UPDATE_CREDITS);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			goto out_brelse;
+		}
+
+		commit_handle = 1;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	inode->i_mode = new_mode;
+	di->i_mode = cpu_to_le16(inode->i_mode);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	if (commit_handle)
+		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out_brelse:
+	brelse(di_bh);
+out:
+	return ret;
+}
+
+/*
  * Set the access or default ACL of an inode.
  */
 static int ocfs2_set_acl(handle_t *handle,
@@ -193,9 +250,14 @@ static int ocfs2_set_acl(handle_t *handle,
 		if (ret < 0)
 			return ret;
 		else {
-			inode->i_mode = mode;
 			if (ret == 0)
 				acl = NULL;
+
+			ret = ocfs2_acl_set_mode(inode, di_bh,
+						 handle, mode);
+			if (ret)
+				return ret;
+
 		}
 	}
 	break;
@@ -283,6 +345,7 @@ int ocfs2_init_acl(handle_t *handle,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct posix_acl *acl = NULL;
 	int ret = 0;
+	mode_t mode;
 
 	if (!S_ISLNK(inode->i_mode)) {
 		if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -291,12 +354,17 @@ int ocfs2_init_acl(handle_t *handle,
 			if (IS_ERR(acl))
 				return PTR_ERR(acl);
 		}
-		if (!acl)
-			inode->i_mode &= ~current_umask();
+		if (!acl) {
+			mode = inode->i_mode & ~current_umask();
+			ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+			if (ret) {
+				mlog_errno(ret);
+				goto cleanup;
+			}
+		}
 	}
 	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
 		struct posix_acl *clone;
-		mode_t mode;
 
 		if (S_ISDIR(inode->i_mode)) {
 			ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -313,7 +381,7 @@ int ocfs2_init_acl(handle_t *handle,
 		mode = inode->i_mode;
 		ret = posix_acl_create_masq(clone, &mode);
 		if (ret >= 0) {
-			inode->i_mode = mode;
+			ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
 			if (ret > 0) {
 				ret = ocfs2_set_acl(handle, inode,
 						    di_bh, ACL_TYPE_ACCESS,
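
The new ocfs2_acl_set_mode() helper follows a common kernel shape: accept an optional buffer and transaction handle, and create (and later tear down) private ones only when the caller passes NULL, tracked by a flag. A user-space sketch of the same pattern, using FILE * as a stand-in resource (log_line() and app.log are hypothetical):

#include <stdio.h>

int log_line(FILE *out, const char *msg)
{
	int opened_here = 0;
	int ret = 0;

	if (out == NULL) {		/* caller had no handle: make our own */
		out = fopen("app.log", "a");
		if (out == NULL)
			return -1;
		opened_here = 1;
	}

	if (fprintf(out, "%s\n", msg) < 0)
		ret = -1;

	if (opened_here)		/* only tear down what we created */
		fclose(out);
	return ret;
}
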
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 21c808f752d8..ecebb2276790 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 
 #include <cluster/masklog.h>
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 5c9890006708..41d5f1f92d56 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,7 @@
 #include <linux/crc32.h>
 #include <linux/time.h>
 #include <linux/debugfs.h>
+#include <linux/slab.h>
 
 #include "heartbeat.h"
 #include "tcp.h"
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index c81142e3ef84..ed0c9f367fed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 639024033fce..cf3e16696216 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
  * and if they're the last, they fire off the decision.
  */
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/reboot.h>
 
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index dccc439fa087..a795eb91f4ea 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index f283bce776b4..90803b47cd8c 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a659606dcb95..9289b4357d27 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1875,7 +1875,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
 ok:
 		spin_unlock(&res->spinlock);
 	}
-	spin_unlock(&dlm->spinlock);
 
 	// mlog(0, "woo! got an assert_master from node %u!\n",
 	//	     assert->node_idx);
@@ -1926,7 +1925,6 @@ ok:
 		/* master is known, detach if not already detached.
 		 * ensures that only one assert_master call will happen
 		 * on this mle. */
-		spin_lock(&dlm->spinlock);
 		spin_lock(&dlm->master_lock);
 
 		rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1957,6 @@ ok:
 			__dlm_put_mle(mle);
 		}
 		spin_unlock(&dlm->master_lock);
-		spin_unlock(&dlm->spinlock);
 	} else if (res) {
 		if (res->owner != assert->node_idx) {
 			mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1964,7 @@ ok:
 			     res->owner, namelen, name);
 		}
 	}
+	spin_unlock(&dlm->spinlock);
 
 done:
 	ret = 0;
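
Taken together, the dlmmaster.c hunks stop dropping dlm->spinlock partway through dlm_assert_master_handler() and re-taking it around master_lock; instead the outer lock is now held across the whole sequence, so the shared state cannot change in the unlocked window. A user-space analogy of the fixed shape, with pthread mutexes standing in for the kernel spinlocks (an illustration, not the kernel API):

#include <pthread.h>

pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;
int shared_state;

void update(void)
{
	pthread_mutex_lock(&outer);
	shared_state++;			/* step 1, under the outer lock */

	pthread_mutex_lock(&inner);	/* inner always nests inside outer */
	shared_state++;			/* step 2, under both locks */
	pthread_mutex_unlock(&inner);

	pthread_mutex_unlock(&outer);	/* released only after both steps */
}
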
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..11a6d1fd1d35 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 49e29ecd0201..b47c1b92b82b 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c562a7581cf9..09e3fdfa6d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,6 +24,7 @@
 
 #include <linux/fs.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/fiemap.h>
 
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db868..1aa863dd901f 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,7 +26,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 278a223aae14..07cc8bb68b6d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
@@ -891,6 +890,21 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 	/* Do some basic inode verification... */
 	di = (struct ocfs2_dinode *) di_bh->b_data;
 	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+		/*
+		 * Inodes in the orphan dir must have ORPHANED_FL. The only
+		 * inodes that come back out of the orphan dir are reflink
+		 * targets. A reflink target may be moved out of the orphan
+		 * dir between the time we scan the directory and the time we
+		 * process it. This would lead to HAS_REFCOUNT_FL being set
+		 * but ORPHANED_FL not.
+		 */
+		if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
+			mlog(0, "Reflinked inode %llu is no longer orphaned; "
+			     "it shouldn't be deleted\n",
+			     (unsigned long long)oi->ip_blkno);
+			goto bail;
+		}
+
 		/* for lack of a better error? */
 		status = -EEXIST;
 		mlog(ML_ERROR,
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ca992d91f511..c983715d8d8c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -872,8 +872,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 			     (unsigned long long)la_start_blk,
 			     (unsigned long long)blkno);
 
-		status = ocfs2_free_clusters(handle, main_bm_inode,
-					     main_bm_bh, blkno, count);
+		status = ocfs2_release_clusters(handle,
+						main_bm_inode,
+						main_bm_bh, blkno,
+						count);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -984,8 +986,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 	}
 
 retry_enospc:
-	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
-
+	(*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
 	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
 	if (status == -ENOSPC) {
 		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1061,6 +1062,7 @@ retry_enospc:
 		    OCFS2_LA_DISABLED)
 			goto bail;
 
+		ac->ac_bits_wanted = osb->local_alloc_default_bits;
 		status = ocfs2_claim_clusters(osb, handle, ac,
 					      osb->local_alloc_bits,
 					      &cluster_off,
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
-	if (__mandatory_lock(inode))
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
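
The locks.c hunk narrows the refusal: lock requests on mandatory-locking inodes are still rejected, but an F_UNLCK request is now allowed through, so a holder can always release what it already owns. The same check in miniature, with stub types standing in for struct file_lock and __mandatory_lock() (a sketch, not the kernel interface):

#include <fcntl.h>
#include <errno.h>

struct lockreq { int type; };	/* hypothetical stand-in for file_lock */

static int is_mandatory;	/* stand-in for __mandatory_lock(inode) */

int check_lock(struct lockreq *fl)
{
	if (is_mandatory && fl->type != F_UNLCK)
		return -ENOLCK;	/* new lock attempts are refused */
	return 0;		/* unlocks always proceed */
}
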
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424a..7898bd3a99f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/uio.h>
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d9cd4e373a53..b1eb50ae4097 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
-			    struct ocfs2_dinode *fe,
+			    struct buffer_head *fe_bh,
 			    char *name,
 			    struct ocfs2_dir_lookup_result *lookup,
 			    struct inode *orphan_dir_inode);
@@ -879,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir,
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 	if (inode_is_unlinkable(inode)) {
-		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
+		status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
 					  &orphan_insert, orphan_dir);
 		if (status < 0) {
 			mlog_errno(status);
@@ -1300,7 +1300,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		if (S_ISDIR(new_inode->i_mode) ||
 		    (ocfs2_read_links_count(newfe) == 1)) {
 			status = ocfs2_orphan_add(osb, handle, new_inode,
-						  newfe, orphan_name,
+						  newfe_bh, orphan_name,
 						  &orphan_insert, orphan_dir);
 			if (status < 0) {
 				mlog_errno(status);
@@ -1911,7 +1911,7 @@ leave:
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
-			    struct ocfs2_dinode *fe,
+			    struct buffer_head *fe_bh,
 			    char *name,
 			    struct ocfs2_dir_lookup_result *lookup,
 			    struct inode *orphan_dir_inode)
@@ -1919,6 +1919,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	struct buffer_head *orphan_dir_bh = NULL;
 	int status = 0;
 	struct ocfs2_dinode *orphan_fe;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 
@@ -1959,6 +1960,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 		goto leave;
 	}
 
+	/*
+	 * We're going to journal the change of i_flags and i_orphaned_slot.
+	 * It's safe anyway, though some callers may duplicate the journaling.
+	 * Journaling within the function just makes the logic look more
+	 * straightforward.
+	 */
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(inode),
+					 fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
 
 	/* Record which orphan dir our inode now resides
@@ -1966,6 +1982,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	 * dir to lock. */
 	fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
 
+	ocfs2_journal_dirty(handle, fe_bh);
+
 	mlog(0, "Inode %llu orphaned in slot %d\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
 
@@ -2123,7 +2141,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
 	}
 
 	di = (struct ocfs2_dinode *)new_di_bh->b_data;
-	status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name,
+	status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
 				  &orphan_insert, orphan_dir);
 	if (status < 0) {
 		mlog_errno(status);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1238b491db90..adf5e2ebc2c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -763,8 +763,18 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
 	return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
 }
 
-#define ocfs2_set_bit ext2_set_bit
-#define ocfs2_clear_bit ext2_clear_bit
+static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
+{
+	ext2_set_bit(bit, bitmap);
+}
+#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
+
+static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
+{
+	ext2_clear_bit(bit, bitmap);
+}
+#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
+
 #define ocfs2_test_bit ext2_test_bit
 #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
 #define ocfs2_find_next_bit ext2_find_next_bit
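
The ocfs2.h hunk turns two plain macro aliases into static inline functions for a reason visible later in this series: a macro has no address, but the suballoc.c refactor below passes the bit operation around as a function pointer (undo_fn). A minimal user-space demonstration of the difference, with a hand-rolled bitmap macro standing in for ext2_set_bit:

#include <stdio.h>

#define raw_set_bit(nr, map)	((map)[(nr) / 8] |= 1u << ((nr) % 8))

static inline void set_bit_fn(unsigned int nr, unsigned char *map)
{
	raw_set_bit(nr, map);	/* the wrapper gives the macro an address */
}

static void apply(void (*op)(unsigned int, unsigned char *),
		  unsigned int nr, unsigned char *map)
{
	op(nr, map);		/* would not compile with raw_set_bit here */
}

int main(void)
{
	unsigned char map[2] = { 0 };

	apply(set_bit_fn, 9, map);
	printf("%#x %#x\n", (unsigned)map[0], (unsigned)map[1]);	/* 0 0x2 */
	return 0;
}
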
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 355f41d1d520..ab42a74c7539 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,6 +3,7 @@
  */
 #include <linux/spinlock.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/dqblk_qtree.h>
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index a6467f3d262e..9ad49305f450 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,6 +3,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/module.h>
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9e96921dffda..bd96f6c7877e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -37,7 +37,6 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
@@ -4075,6 +4074,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
 	OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
 	spin_unlock(&OCFS2_I(t_inode)->ip_lock);
 	i_size_write(t_inode, size);
+	t_inode->i_blocks = s_inode->i_blocks;
 
 	di->i_xattr_inline_size = s_di->i_xattr_inline_size;
 	di->i_clusters = s_di->i_clusters;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 7020e1253ffa..0d3049f696c5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
 
 #include <linux/kernel.h>
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 
 /* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 5ae8812b2864..2dc57bca0688 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/reboot.h>
 #include <asm/uaccess.h>
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c3c60bc3e072..19ba00f28547 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -95,13 +95,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 					     struct buffer_head *group_bh,
 					     unsigned int bit_off,
 					     unsigned int num_bits);
-static inline int ocfs2_block_group_clear_bits(handle_t *handle,
-					       struct inode *alloc_inode,
-					       struct ocfs2_group_desc *bg,
-					       struct buffer_head *group_bh,
-					       unsigned int bit_off,
-					       unsigned int num_bits);
-
 static int ocfs2_relink_block_group(handle_t *handle,
 				    struct inode *alloc_inode,
 				    struct buffer_head *fe_bh,
@@ -152,7 +145,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 
 #define do_error(fmt, ...)						\
 	do{								\
-		if (clean_error)					\
+		if (resize)						\
 			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
 		else							\
 			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
@@ -160,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 
 static int ocfs2_validate_gd_self(struct super_block *sb,
 				  struct buffer_head *bh,
-				  int clean_error)
+				  int resize)
 {
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 
@@ -211,7 +204,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
 static int ocfs2_validate_gd_parent(struct super_block *sb,
 				    struct ocfs2_dinode *di,
 				    struct buffer_head *bh,
-				    int clean_error)
+				    int resize)
 {
 	unsigned int max_bits;
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +226,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
 		return -EINVAL;
 	}
 
-	if (le16_to_cpu(gd->bg_chain) >=
-	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
+	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
+	if ((le16_to_cpu(gd->bg_chain) >
+	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
+	    ((le16_to_cpu(gd->bg_chain) ==
+	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
 		do_error("Group descriptor #%llu has bad chain %u",
 			 (unsigned long long)bh->b_blocknr,
 			 le16_to_cpu(gd->bg_chain));
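
The hunk above relaxes a boundary check: normally bg_chain must be strictly below cl_next_free_rec, but during a resize the two may legally be equal. The predicate in isolation, as a small sketch:

#include <stdbool.h>

/* Bad chain: strictly above is always bad; equal is bad unless resizing. */
static bool chain_is_bad(unsigned chain, unsigned next_free, bool resize)
{
	return chain > next_free || (chain == next_free && !resize);
}
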
@@ -1975,18 +1971,18 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
 					  bits_wanted, cluster_start, num_clusters);
 }
 
-static inline int ocfs2_block_group_clear_bits(handle_t *handle,
-					       struct inode *alloc_inode,
-					       struct ocfs2_group_desc *bg,
-					       struct buffer_head *group_bh,
-					       unsigned int bit_off,
-					       unsigned int num_bits)
+static int ocfs2_block_group_clear_bits(handle_t *handle,
+					struct inode *alloc_inode,
+					struct ocfs2_group_desc *bg,
+					struct buffer_head *group_bh,
+					unsigned int bit_off,
+					unsigned int num_bits,
+					void (*undo_fn)(unsigned int bit,
+							unsigned long *bmap))
 {
 	int status;
 	unsigned int tmp;
-	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
 	struct ocfs2_group_desc *undo_bg = NULL;
-	int cluster_bitmap = 0;
 
 	mlog_entry_void();
 
@@ -1996,20 +1992,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 
 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
 
-	if (ocfs2_is_cluster_bitmap(alloc_inode))
-		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-
+	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
-					 group_bh, journal_type);
+					 group_bh,
+					 undo_fn ?
+					 OCFS2_JOURNAL_ACCESS_UNDO :
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	if (ocfs2_is_cluster_bitmap(alloc_inode))
-		cluster_bitmap = 1;
-
-	if (cluster_bitmap) {
+	if (undo_fn) {
 		jbd_lock_bh_state(group_bh);
 		undo_bg = (struct ocfs2_group_desc *)
 					bh2jh(group_bh)->b_committed_data;
@@ -2020,13 +2014,13 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 	while(tmp--) {
 		ocfs2_clear_bit((bit_off + tmp),
 				(unsigned long *) bg->bg_bitmap);
-		if (cluster_bitmap)
-			ocfs2_set_bit(bit_off + tmp,
-				      (unsigned long *) undo_bg->bg_bitmap);
+		if (undo_fn)
+			undo_fn(bit_off + tmp,
+				(unsigned long *) undo_bg->bg_bitmap);
 	}
 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
 
-	if (cluster_bitmap)
+	if (undo_fn)
 		jbd_unlock_bh_state(group_bh);
 
 	status = ocfs2_journal_dirty(handle, group_bh);
@@ -2039,12 +2033,14 @@ bail:
 /*
  * expects the suballoc inode to already be locked.
  */
-int ocfs2_free_suballoc_bits(handle_t *handle,
-			     struct inode *alloc_inode,
-			     struct buffer_head *alloc_bh,
-			     unsigned int start_bit,
-			     u64 bg_blkno,
-			     unsigned int count)
+static int _ocfs2_free_suballoc_bits(handle_t *handle,
+				     struct inode *alloc_inode,
+				     struct buffer_head *alloc_bh,
+				     unsigned int start_bit,
+				     u64 bg_blkno,
+				     unsigned int count,
+				     void (*undo_fn)(unsigned int bit,
+						     unsigned long *bitmap))
 {
 	int status = 0;
 	u32 tmp_used;
@@ -2079,7 +2075,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 
 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
 					      group, group_bh,
-					      start_bit, count);
+					      start_bit, count, undo_fn);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -2110,6 +2106,17 @@ bail:
 	return status;
 }
 
+int ocfs2_free_suballoc_bits(handle_t *handle,
+			     struct inode *alloc_inode,
+			     struct buffer_head *alloc_bh,
+			     unsigned int start_bit,
+			     u64 bg_blkno,
+			     unsigned int count)
+{
+	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
+					 start_bit, bg_blkno, count, NULL);
+}
+
 int ocfs2_free_dinode(handle_t *handle,
 		      struct inode *inode_alloc_inode,
 		      struct buffer_head *inode_alloc_bh,
@@ -2123,11 +2130,13 @@ int ocfs2_free_dinode(handle_t *handle,
 			     inode_alloc_bh, bit, bg_blkno, 1);
 }
 
-int ocfs2_free_clusters(handle_t *handle,
-			struct inode *bitmap_inode,
-			struct buffer_head *bitmap_bh,
-			u64 start_blk,
-			unsigned int num_clusters)
+static int _ocfs2_free_clusters(handle_t *handle,
+				struct inode *bitmap_inode,
+				struct buffer_head *bitmap_bh,
+				u64 start_blk,
+				unsigned int num_clusters,
+				void (*undo_fn)(unsigned int bit,
+						unsigned long *bitmap))
 {
 	int status;
 	u16 bg_start_bit;
@@ -2154,9 +2163,9 @@ int ocfs2_free_clusters(handle_t *handle,
 	mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
 	     (unsigned long long)bg_blkno, bg_start_bit);
 
-	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
-					  bg_start_bit, bg_blkno,
-					  num_clusters);
+	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
+					   bg_start_bit, bg_blkno,
+					   num_clusters, undo_fn);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out;
@@ -2170,6 +2179,32 @@ out:
 	return status;
 }
 
+int ocfs2_free_clusters(handle_t *handle,
+			struct inode *bitmap_inode,
+			struct buffer_head *bitmap_bh,
+			u64 start_blk,
+			unsigned int num_clusters)
+{
+	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+				    start_blk, num_clusters,
+				    _ocfs2_set_bit);
+}
+
+/*
+ * Give never-used clusters back to the global bitmap. We don't need
+ * to protect these bits in the undo buffer.
+ */
+int ocfs2_release_clusters(handle_t *handle,
+			   struct inode *bitmap_inode,
+			   struct buffer_head *bitmap_bh,
+			   u64 start_blk,
+			   unsigned int num_clusters)
+{
+	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+				    start_blk, num_clusters,
+				    _ocfs2_clear_bit);
+}
+
 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
 {
 	printk("Block Group:\n");
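
The suballoc.c refactor threads a single behavioral difference through a callback: the shared helper always clears bits in the live bitmap, and undo_fn decides what happens to the journal's undo copy. ocfs2_free_clusters() passes _ocfs2_set_bit so clusters that once held data stay marked allocated in the committed copy until the transaction lands; ocfs2_release_clusters() passes _ocfs2_clear_bit because never-used clusters need no such protection. A compact user-space sketch of this strategy-callback shape (all names here are illustrative):

#include <stdio.h>

static void set_bit8(unsigned int nr, unsigned char *map)
{
	map[nr / 8] |= 1u << (nr % 8);
}

static void clear_bit8(unsigned int nr, unsigned char *map)
{
	map[nr / 8] &= ~(1u << (nr % 8));
}

/* Shared helper: always clears 'live'; undo_fn, if set, edits 'undo'. */
static void free_bits(unsigned char *live, unsigned char *undo,
		      unsigned int nr,
		      void (*undo_fn)(unsigned int, unsigned char *))
{
	clear_bit8(nr, live);
	if (undo_fn)
		undo_fn(nr, undo);
}

int main(void)
{
	unsigned char live = 0xff, undo = 0xff;

	free_bits(&live, &undo, 3, set_bit8);	/* "free": undo keeps the bit */
	free_bits(&live, &undo, 4, clear_bit8);	/* "release": undo drops it */
	printf("live=%#x undo=%#x\n", (unsigned)live, (unsigned)undo);
	return 0;				/* prints live=0xe7 undo=0xef */
}
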
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index fa60723c43e8..e0f46df357e6 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -127,6 +127,11 @@ int ocfs2_free_clusters(handle_t *handle,
 			struct buffer_head *bitmap_bh,
 			u64 start_blk,
 			unsigned int num_clusters);
+int ocfs2_release_clusters(handle_t *handle,
+			   struct inode *bitmap_inode,
+			   struct buffer_head *bitmap_bh,
+			   u64 start_blk,
+			   unsigned int num_clusters);
 
 static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
 {
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948c..bfe7190cdbf1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1b0d386f6d1..3e7773089b96 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1622,7 +1622,7 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
 	/* Now tell xh->xh_entries about it */
 	for (i = 0; i < count; i++) {
 		offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
-		if (offset < namevalue_offset)
+		if (offset <= namevalue_offset)
 			le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
 				     namevalue_size);
 	}
@@ -6528,13 +6528,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
 				     int indexed)
 {
 	int ret;
-	struct ocfs2_alloc_context *meta_ac;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_xattr_set_ctxt ctxt = {
-		.meta_ac = meta_ac,
-	};
+	struct ocfs2_xattr_set_ctxt ctxt;
 
-	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+	memset(&ctxt, 0, sizeof(ctxt));
+	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -6556,7 +6554,7 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
 
 	ocfs2_commit_trans(osb, ctxt.handle);
 out:
-	ocfs2_free_alloc_context(meta_ac);
+	ocfs2_free_alloc_context(ctxt.meta_ac);
 	return ret;
 }
 
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 75d9b5ba1d45..c82af6acc2e7 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -6,6 +6,7 @@
 #include <linux/version.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/parser.h>
diff --git a/fs/open.c b/fs/open.c
index e17f54454b50..74e5cd9f718e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -10,7 +10,6 @@
 #include <linux/fdtable.h>
 #include <linux/fsnotify.h>
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/tty.h>
 #include <linux/namei.h>
 #include <linux/backing-dev.h>
@@ -20,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/vfs.h>
 #include <linux/fcntl.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/fs.h>
 #include <linux/personality.h>
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e8865c11777f..e238ab23a9e7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 49cfd5f54238..91babdae7587 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -95,6 +95,7 @@
  ************************************************************/
 #include <linux/crc32.h>
 #include <linux/math64.h>
+#include <linux/slab.h>
 #include "check.h"
 #include "efi.h"
 
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..90be97f1f5a8 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
  */
 #include <asm/unaligned.h>
 
-#define SYS_IND(p)	(get_unaligned(&p->sys_ind))
-#define NR_SECTS(p)	({ __le32 __a = get_unaligned(&p->nr_sects);	\
-				le32_to_cpu(__a); \
-			})
+#define SYS_IND(p)	get_unaligned(&p->sys_ind)
 
-#define START_SECT(p)	({ __le32 __a = get_unaligned(&p->start_sect);	\
-				le32_to_cpu(__a); \
-			})
+static inline sector_t nr_sects(struct partition *p)
+{
+	return (sector_t)get_unaligned_le32(&p->nr_sects);
+}
+
+static inline sector_t start_sect(struct partition *p)
+{
+	return (sector_t)get_unaligned_le32(&p->start_sect);
+}
 
 static inline int is_extended_partition(struct partition *p)
 {
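
The msdos.c hunk above is an overflow fix as much as a cleanup: the old macros returned u32 sector counts, and the offset/size arithmetic later in the file was done in 32 bits, which wraps on disks past the 2 TiB mark. The inline functions widen to sector_t before any multiplication. The failure mode in isolation:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t nr_sects = 0xF0000000u;	/* ~4 billion 512-byte sectors */

	uint32_t bad  = nr_sects * 512u;		/* wraps modulo 2^32 */
	uint64_t good = (uint64_t)nr_sects * 512u;	/* widened first */

	printf("bad  = %u\n", bad);
	printf("good = %llu\n", (unsigned long long)good);
	return 0;
}
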
@@ -104,13 +107,13 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
 
 static void
 parse_extended(struct parsed_partitions *state, struct block_device *bdev,
-			u32 first_sector, u32 first_size)
+			sector_t first_sector, sector_t first_size)
 {
 	struct partition *p;
 	Sector sect;
 	unsigned char *data;
-	u32 this_sector, this_size;
-	int sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t this_sector, this_size;
+	sector_t sector_size = bdev_logical_block_size(bdev) / 512;
 	int loopct = 0;		/* number of links followed
 				   without finding a data partition */
 	int i;
@@ -145,14 +148,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 	 * First process the data partition(s)
 	 */
 	for (i=0; i<4; i++, p++) {
-		u32 offs, size, next;
-		if (!NR_SECTS(p) || is_extended_partition(p))
+		sector_t offs, size, next;
+		if (!nr_sects(p) || is_extended_partition(p))
 			continue;
 
 		/* Check the 3rd and 4th entries -
 		   these sometimes contain random garbage */
-		offs = START_SECT(p)*sector_size;
-		size = NR_SECTS(p)*sector_size;
+		offs = start_sect(p)*sector_size;
+		size = nr_sects(p)*sector_size;
 		next = this_sector + offs;
 		if (i >= 2) {
 			if (offs + size > this_size)
@@ -179,13 +182,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 	 */
 	p -= 4;
 	for (i=0; i<4; i++, p++)
-		if (NR_SECTS(p) && is_extended_partition(p))
+		if (nr_sects(p) && is_extended_partition(p))
 			break;
 	if (i == 4)
 		goto done;	 /* nothing left to do */
 
-	this_sector = first_sector + START_SECT(p) * sector_size;
-	this_size = NR_SECTS(p) * sector_size;
+	this_sector = first_sector + start_sect(p) * sector_size;
+	this_size = nr_sects(p) * sector_size;
 	put_dev_sector(sect);
 	}
 done:
@@ -197,7 +200,7 @@ done:
 
 static void
 parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
-		  u32 offset, u32 size, int origin)
+		  sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_SOLARIS_X86_PARTITION
 	Sector sect;
@@ -244,7 +247,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
 */
 static void
 parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin, char *flavour,
+		sector_t offset, sector_t size, int origin, char *flavour,
 		int max_partitions)
 {
 	Sector sect;
@@ -263,7 +266,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 	if (le16_to_cpu(l->d_npartitions) < max_partitions)
 		max_partitions = le16_to_cpu(l->d_npartitions);
 	for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
-		u32 bsd_start, bsd_size;
+		sector_t bsd_start, bsd_size;
 
 		if (state->next == state->limit)
 			break;
@@ -290,7 +293,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+		sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -300,7 +303,7 @@ parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+		sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -310,7 +313,7 @@ parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+		sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -324,7 +327,7 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
 */
 static void
 parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
-	       u32 offset, u32 size, int origin)
+	       sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_UNIXWARE_DISKLABEL
 	Sector sect;
@@ -348,7 +351,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
 
 		if (p->s_label != UNIXWARE_FS_UNUSED)
 			put_partition(state, state->next++,
-				      START_SECT(p), NR_SECTS(p));
+				      le32_to_cpu(p->start_sect),
+				      le32_to_cpu(p->nr_sects));
 		p++;
 	}
 	put_dev_sector(sect);
@@ -363,7 +367,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_minix(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+		sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_MINIX_SUBPARTITION
 	Sector sect;
@@ -390,7 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 			/* add each partition in use */
 			if (SYS_IND(p) == MINIX_PARTITION)
 				put_partition(state, state->next++,
-					      START_SECT(p), NR_SECTS(p));
+					      start_sect(p), nr_sects(p));
 		}
 		printk(" >\n");
 	}
@@ -401,7 +405,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 static struct {
 	unsigned char id;
 	void (*parse)(struct parsed_partitions *, struct block_device *,
-			u32, u32, int);
+			sector_t, sector_t, int);
 } subtypes[] = {
 	{FREEBSD_PARTITION, parse_freebsd},
 	{NETBSD_PARTITION, parse_netbsd},
@@ -415,7 +419,7 @@ static struct {
 
 int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
-	int sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t sector_size = bdev_logical_block_size(bdev) / 512;
 	Sector sect;
 	unsigned char *data;
 	struct partition *p;
@@ -483,14 +487,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 	state->next = 5;
 	for (slot = 1 ; slot <= 4 ; slot++, p++) {
-		u32 start = START_SECT(p)*sector_size;
-		u32 size = NR_SECTS(p)*sector_size;
+		sector_t start = start_sect(p)*sector_size;
+		sector_t size = nr_sects(p)*sector_size;
 		if (!size)
 			continue;
 		if (is_extended_partition(p)) {
-			/* prevent someone doing mkfs or mkswap on an
-			   extended partition, but leave room for LILO */
-			put_partition(state, slot, start, size == 1 ? 1 : 2);
+			/*
+			 * prevent someone doing mkfs or mkswap on an
+			 * extended partition, but leave room for LILO
+			 * FIXME: this uses one logical sector for > 512b
+			 * sector, although it may not be enough/proper.
+			 */
+			sector_t n = 2;
+			n = min(size, max(sector_size, n));
+			put_partition(state, slot, start, n);
+
 			printk(" <");
 			parse_extended(state, bdev, start, size);
 			printk(" >");
@@ -513,7 +524,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 		unsigned char id = SYS_IND(p);
 		int n;
 
-		if (!NR_SECTS(p))
+		if (!nr_sects(p))
 			continue;
 
 		for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +532,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 		if (!subtypes[n].parse)
 			continue;
-		subtypes[n].parse(state, bdev, START_SECT(p)*sector_size,
-				  NR_SECTS(p)*sector_size, slot);
+		subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
+				  nr_sects(p)*sector_size, slot);
 	}
 	put_dev_sector(sect);
 	return 1;
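
The msdos.c change above swaps the old statement-expression macros for typed inline helpers and widens every sector count to sector_t, so offset arithmetic like start_sect(p)*sector_size can no longer truncate at 32 bits on large or 4KiB-logical-sector devices. A minimal out-of-tree sketch of the same accessor pattern follows; the struct layout and the memcpy-based get_unaligned_le32() stand-in are illustrative only, not the kernel's definitions:

    #include <stdint.h>
    #include <string.h>

    typedef uint64_t sector_t;		/* 64-bit, as with CONFIG_LBDAF */

    /* Illustrative only: the real MBR entry has more packed fields. */
    struct partition {
    	uint32_t start_sect;		/* little-endian, may be misaligned */
    	uint32_t nr_sects;
    } __attribute__((packed));

    /* memcpy stand-in for the kernel's get_unaligned_le32(); assumes an LE host. */
    static inline uint32_t get_unaligned_le32(const void *p)
    {
    	uint32_t v;
    	memcpy(&v, p, sizeof(v));
    	return v;
    }

    /* Read the raw 32-bit field once, then widen to sector_t *before* any
     * multiplication by sector_size, so the product cannot wrap at 32 bits. */
    static inline sector_t start_sect(const struct partition *p)
    {
    	return (sector_t)get_unaligned_le32(&p->start_sect);
    }

    static inline sector_t nr_sects(const struct partition *p)
    {
    	return (sector_t)get_unaligned_le32(&p->nr_sects);
    }

Unlike the ({ ... }) macros, the inline functions also give the compiler a real prototype to type-check the argument against.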
diff --git a/fs/proc/array.c b/fs/proc/array.c
index aa8637b81028..e51f2ec2c5e5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -68,7 +68,6 @@
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/signal.h>
 #include <linux/highmem.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a7310841c831..7621db800a74 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,6 +81,7 @@
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 /* NOTE:
@@ -442,12 +443,13 @@ static const struct file_operations proc_lstats_operations = {
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
-	unsigned long points;
+	unsigned long points = 0;
 	struct timespec uptime;
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
-	points = badness(task->group_leader, uptime.tv_sec);
+	if (pid_alive(task))
+		points = badness(task, uptime.tv_sec);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
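
The oom_score hunk above fixes two things at once: points now has a safe default, so the sprintf() cannot emit an uninitialized value when the task is already exiting, and badness() is only called when pid_alive(task) still holds under tasklist_lock (and the task itself is scored, not task->group_leader). A compilable toy version of this guard-with-default shape, with stand-ins for the kernel symbols:

    #include <stdio.h>

    /* Stand-ins for the kernel's task and scoring machinery. */
    struct task { int alive; };

    static int pid_alive(const struct task *t) { return t->alive; }
    static unsigned long badness(const struct task *t) { (void)t; return 42; }

    static int oom_score(const struct task *task, char *buf)
    {
    	unsigned long points = 0;	/* safe default for a dying task */

    	/* the lock that pins the task list would be taken here */
    	if (pid_alive(task))		/* skip tasks already torn down */
    		points = badness(task);
    	/* ...and released here */
    	return sprintf(buf, "%lu\n", points);
    }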
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 08f4d71dacd7..43c127490606 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -13,6 +13,7 @@
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/idr.h>
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 445a02bcaab3..d35b23238fb1 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/smp_lock.h>
 #include <linux/sysctl.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..19979a2ce272 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -19,6 +19,7 @@
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <linux/list.h>
@@ -490,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 	}
 	read_unlock(&kclist_lock);
 
-	if (m == NULL) {
+	if (&m->list == &kclist_head) {
 		if (clear_user(buffer, tsz))
 			return -EFAULT;
 	} else if (is_vmalloc_or_module_addr((void *)start)) {
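
The read_kcore() fix above is about how list_for_each_entry()-style loops terminate: on exhaustion the cursor is never NULL; it ends up as the (bogus) container of the list head itself, so the old m == NULL test could never be true. Comparing &m->list against &kclist_head is the correct "ran off the end" test. A userspace illustration of the idiom, with the list plumbing re-declared just for the example:

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
    	((type *)((char *)(ptr) - offsetof(type, member)))

    struct kcore_ent { unsigned long addr, size; struct list_head list; };

    static struct list_head kclist_head = { &kclist_head, &kclist_head };

    static struct kcore_ent *find(unsigned long start)
    {
    	struct kcore_ent *m;

    	/* hand expansion of a list_for_each_entry()-style loop */
    	for (m = container_of(kclist_head.next, struct kcore_ent, list);
    	     &m->list != &kclist_head;
    	     m = container_of(m->list.next, struct kcore_ent, list))
    		if (start >= m->addr && start < m->addr + m->size)
    			return m;
    	/*
    	 * Falling out of the loop leaves m pointing at the container of
    	 * the head node -- never NULL.  Callers must therefore test
    	 * &m->list == &kclist_head, exactly as the patch does.
    	 */
    	return m;
    }

    int main(void)
    {
    	struct kcore_ent *m = find(0x1000);
    	printf("matched: %d\n", &m->list != &kclist_head);
    	return 0;
    }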
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 9fe7d7ebe115..b1822dde55c2 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
 #include <linux/mmzone.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index f8650dce74fb..ce94801f48ca 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -12,6 +12,7 @@
 #include <linux/string.h>
 #include <linux/of.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <asm/prom.h>
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270f1c38..9020ac15baaa 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -14,6 +14,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/module.h>
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b9b7aad2003d..bf31b03fc275 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,6 +1,5 @@
 #include <linux/cpumask.h>
 #include <linux/fs.h>
-#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 183f8ff5f400..a05a669510a4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/swap.h>
@@ -406,6 +407,7 @@ static int show_smap(struct seq_file *m, void *v)
 
 	memset(&mss, 0, sizeof mss);
 	mss.vma = vma;
+	/* mmap_sem is held in m_start */
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 
@@ -552,7 +554,8 @@ const struct file_operations proc_clear_refs_operations = {
 };
 
 struct pagemapread {
-	u64 __user *out, *end;
+	int pos, len;
+	u64 *buffer;
 };
 
 #define PM_ENTRY_BYTES      sizeof(u64)
@@ -575,10 +578,8 @@ struct pagemapread {
 static int add_to_pagemap(unsigned long addr, u64 pfn,
 			  struct pagemapread *pm)
 {
-	if (put_user(pfn, pm->out))
-		return -EFAULT;
-	pm->out++;
-	if (pm->out >= pm->end)
+	pm->buffer[pm->pos++] = pfn;
+	if (pm->pos >= pm->len)
 		return PM_END_OF_BUFFER;
 	return 0;
 }
@@ -720,21 +721,20 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
  * determine which areas of memory are actually mapped and llseek to
  * skip over unmapped regions.
  */
+#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
 static ssize_t pagemap_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-	struct page **pages, *page;
-	unsigned long uaddr, uend;
 	struct mm_struct *mm;
 	struct pagemapread pm;
-	int pagecount;
 	int ret = -ESRCH;
 	struct mm_walk pagemap_walk = {};
 	unsigned long src;
 	unsigned long svpfn;
 	unsigned long start_vaddr;
 	unsigned long end_vaddr;
+	int copied = 0;
 
 	if (!task)
 		goto out;
@@ -757,35 +757,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!mm)
 		goto out_task;
 
-
-	uaddr = (unsigned long)buf & PAGE_MASK;
-	uend = (unsigned long)(buf + count);
-	pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
-	ret = 0;
-	if (pagecount == 0)
-		goto out_mm;
-	pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+	pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
 	ret = -ENOMEM;
-	if (!pages)
+	if (!pm.buffer)
 		goto out_mm;
 
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(current, current->mm, uaddr, pagecount,
-			     1, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-
-	if (ret < 0)
-		goto out_free;
-
-	if (ret != pagecount) {
-		pagecount = ret;
-		ret = -EFAULT;
-		goto out_pages;
-	}
-
-	pm.out = (u64 __user *)buf;
-	pm.end = (u64 __user *)(buf + count);
-
 	pagemap_walk.pmd_entry = pagemap_pte_range;
 	pagemap_walk.pte_hole = pagemap_pte_hole;
 	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -807,23 +784,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	 * user buffer is tracked in "pm", and the walk
 	 * will stop when we hit the end of the buffer.
 	 */
-	ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk);
-	if (ret == PM_END_OF_BUFFER)
-		ret = 0;
-	/* don't need mmap_sem for these, but this looks cleaner */
-	*ppos += (char __user *)pm.out - buf;
-	if (!ret)
-		ret = (char __user *)pm.out - buf;
-
-out_pages:
-	for (; pagecount; pagecount--) {
-		page = pages[pagecount-1];
-		if (!PageReserved(page))
-			SetPageDirty(page);
-		page_cache_release(page);
+	ret = 0;
+	while (count && (start_vaddr < end_vaddr)) {
+		int len;
+		unsigned long end;
+
+		pm.pos = 0;
+		end = start_vaddr + PAGEMAP_WALK_SIZE;
+		/* overflow ? */
+		if (end < start_vaddr || end > end_vaddr)
+			end = end_vaddr;
+		down_read(&mm->mmap_sem);
+		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
+		up_read(&mm->mmap_sem);
+		start_vaddr = end;
+
+		len = min(count, PM_ENTRY_BYTES * pm.pos);
+		if (copy_to_user(buf, pm.buffer, len)) {
+			ret = -EFAULT;
+			goto out_free;
+		}
+		copied += len;
+		buf += len;
+		count -= len;
 	}
+	*ppos += copied;
+	if (!ret || ret == PM_END_OF_BUFFER)
+		ret = copied;
+
 out_free:
-	kfree(pages);
+	kfree(pm.buffer);
 out_mm:
 	mmput(mm);
 out_task:
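
The pagemap_read() rewrite above drops the get_user_pages() pinning of the caller's buffer in favor of a bounce buffer: each iteration walks at most PAGEMAP_WALK_SIZE of address space into a kmalloc'd array while holding mmap_sem, then releases the lock and copy_to_user()s only the entries produced. The skeleton below is a hedged userspace rendering of that loop; fill() stands in for walk_page_range() and memcpy() for copy_to_user():

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/types.h>

    #define ENTRY_BYTES	sizeof(uint64_t)
    #define CHUNK	512	/* entries per pass; plays the PAGEMAP_WALK_SIZE role */

    /* Stand-in for walk_page_range(): emit one fake entry per "page". */
    static size_t fill(uint64_t *buf, size_t max, uint64_t start, uint64_t end)
    {
    	size_t n = 0;

    	while (n < max && start < end)
    		buf[n++] = start++;
    	return n;
    }

    static ssize_t read_entries(char *dst, size_t count, uint64_t start, uint64_t end)
    {
    	uint64_t *buffer = malloc(ENTRY_BYTES * CHUNK);	/* the bounce buffer */
    	size_t copied = 0;

    	if (!buffer)
    		return -1;
    	while (count && start < end) {
    		uint64_t stop = start + CHUNK;
    		size_t pos, len;

    		if (stop < start || stop > end)	/* overflow or tail clamp */
    			stop = end;
    		/* kernel: down_read(mmap_sem); walk_page_range(); up_read() */
    		pos = fill(buffer, CHUNK, start, stop);
    		start = stop;

    		len = count < ENTRY_BYTES * pos ? count : ENTRY_BYTES * pos;
    		memcpy(dst + copied, buffer, len);	/* copy_to_user() upstream */
    		copied += len;
    		count -= len;
    	}
    	free(buffer);
    	return (ssize_t)copied;
    }

The chunking bounds kernel memory use regardless of how large a read userspace requests, and taking mmap_sem per chunk rather than for the whole read keeps the lock hold times short.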
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d9fd64ef81a..46d4b5d72bd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -5,6 +5,7 @@
 #include <linux/fs_struct.h>
 #include <linux/mount.h>
 #include <linux/ptrace.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include "internal.h"
 
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0872afa58d39..9fbc99ec799a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
 #include <linux/user.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 2663ed90fb03..d67908b407d9 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/quotaops.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/netlink.h>
 #include <net/genetlink.h>
 
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 1739a4aba25f..5ea4ad81a429 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -21,6 +21,7 @@
 #include <linux/pagevec.h>
 #include <linux/mman.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a6090aa1a7c1..c94853473ca9 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -35,6 +35,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
diff --git a/fs/read_write.c b/fs/read_write.c
index b7f4a1f94d48..113386d6fd2d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -258,6 +258,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
 
 	for (;;) {
 		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +314,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
 
 	for (;;) {
 		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index c094f58c7448..f8a6075abf50 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -8,6 +8,7 @@
 #include <linux/reiserfs_fs.h>
 #include <linux/stat.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 extern const struct reiserfs_key MIN_KEY;
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6591cb21edf6..1e4250bc3a6f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -35,6 +35,7 @@
 **/
 
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d1da94b82d8f..dc2c65e04853 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,6 +11,7 @@
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ba98546fabbd..19fbc810e8e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -50,6 +50,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 
@@ -2217,6 +2218,15 @@ static int journal_read_transaction(struct super_block *sb,
 		brelse(d_bh);
 		return 1;
 	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
+		brelse(c_bh);
+		brelse(d_bh);
+		return -EROFS;
+	}
+
 	trans_id = get_desc_trans_id(desc);
 	/* now we know we've got a good transaction, and it was inside the valid time ranges */
 	log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2459,12 +2469,6 @@ static int journal_read(struct super_block *sb)
 		goto start_log_replay;
 	}
 
-	if (continue_replay && bdev_read_only(sb->s_bdev)) {
-		reiserfs_warning(sb, "clm-2076",
-				 "device is readonly, unable to replay log");
-		return -1;
-	}
-
 	/* ok, there are transactions that need to be replayed. start with the first log block, find
 	** all the valid transactions, and pick out the oldest.
 	*/
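
The two reiserfs journal hunks above are one logical change: the read-only check moves from journal_read() down into journal_read_transaction(), i.e. to the point where a replayable transaction has actually been found and its commit/descriptor buffers are held, and the failure path now releases both buffer heads and returns a proper -EROFS instead of a bare -1. A toy model of that release-everything-then-fail ordering, with stand-ins for the buffer_head machinery:

    #include <errno.h>
    #include <stdio.h>

    /* Scaffolding stand-ins for the kernel's buffer_head machinery. */
    struct buffer_head { int refs; };

    static void brelse(struct buffer_head *bh) { if (bh) bh->refs--; }
    static int device_is_read_only(void) { return 1; }	/* force the error path */

    /* On a read-only device, drop every reference taken so far, then fail. */
    static int read_transaction(struct buffer_head *c_bh, struct buffer_head *d_bh)
    {
    	if (device_is_read_only()) {
    		brelse(c_bh);		/* both refs were taken by the caller ... */
    		brelse(d_bh);		/* ... so both must be dropped here */
    		return -EROFS;		/* a real errno instead of a bare -1 */
    	}
    	return 0;
    }

    int main(void)
    {
    	struct buffer_head c = { 1 }, d = { 1 };
    	printf("ret=%d c.refs=%d d.refs=%d\n", read_transaction(&c, &d), c.refs, d.refs);
    	return 0;
    }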
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 96e4cbbfaa18..d0c43cb99ffc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -13,6 +13,7 @@
 
 #include <linux/time.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 04bf5d791bda..59125fb36d42 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -12,6 +12,7 @@
 */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
 #include <asm/uaccess.h>
@@ -1618,10 +1619,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	save_mount_options(s, data);
 
 	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		errval = -ENOMEM;
-		goto error_alloc;
-	}
+	if (!sbi)
+		return -ENOMEM;
 	s->s_fs_info = sbi;
 	/* Set default values for options: non-aggressive tails, RO on errors */
 	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
@@ -1878,12 +1877,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	return (0);
 
 error:
-	reiserfs_write_unlock(s);
-error_alloc:
 	if (jinit_done) {	/* kill the commit thread, free journal ram */
 		journal_release_error(NULL, s);
 	}
 
+	reiserfs_write_unlock(s);
+
 	reiserfs_free_bitmap_cache(s);
 	if (SB_BUFFER_WITH_SB(s))
 		brelse(SB_BUFFER_WITH_SB(s));
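
The reiserfs_fill_super() hunks above clean up the error paths in two ways: the sbi allocation failure happens before the write lock is taken, so it can simply return -ENOMEM and the extra error_alloc label disappears; and on the main error path the unlock moves below journal_release_error(), keeping the journal teardown under the lock. The toy shape, with a pthread mutex standing in for the reiserfs write lock:

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t fs_lock = PTHREAD_MUTEX_INITIALIZER;

    static void journal_teardown(void)
    {
    	/* expects fs_lock to be held, like journal_release_error() */
    }

    static int fill_super(int fail_setup)
    {
    	void *sbi = malloc(64);

    	if (!sbi)
    		return -1;		/* lock not taken yet: nothing to unwind */

    	pthread_mutex_lock(&fs_lock);
    	if (fail_setup) {
    		journal_teardown();		/* teardown first, under the lock */
    		pthread_mutex_unlock(&fs_lock);	/* then drop the lock */
    		free(sbi);
    		return -1;
    	}
    	pthread_mutex_unlock(&fs_lock);
    	free(sbi);
    	return 0;
    }

    int main(void)
    {
    	return fill_super(1) ? 0 : 1;
    }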
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 37d034ca7d99..4f9586bb7631 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -38,6 +38,7 @@
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/errno.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index dd20a7883f0f..9cdb759645a9 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -5,6 +5,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/reiserfs_acl.h>
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index d8b5bfcbdd30..7271a477c041 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/security.h>
 #include <asm/uaccess.h>
@@ -76,7 +77,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 		return error;
 	}
 
-	if (sec->length) {
+	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
 		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
 			 reiserfs_xattr_nblocks(inode, sec->length);
 		/* We don't want to count the directories twice if we have
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1dabe4ee02fe..f329849ce3c0 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/list.h>
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 92d5e8ffb639..dbf6548bbf06 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -13,7 +13,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/net.h>
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 6bd9b691a463..0e39a924f10a 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -12,7 +12,6 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 00b2909bd469..54350b59046b 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,6 +15,7 @@
 #include <linux/pagemap.h>
 #include <linux/net.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 39208663aaf1..9313b6124a2e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -30,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 #include <linux/security.h>
+#include <linux/gfp.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index e80be2022a7f..32b911f4ee39 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -33,7 +33,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
 
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 4dd70e04333b..15a03d0fb9f3 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -24,6 +24,7 @@
 
 #include <linux/mutex.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <linux/zlib.h>
 
 #include "squashfs_fs.h"
diff --git a/fs/sync.c b/fs/sync.c
index f557d71cb097..fc5c3d75cf3c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 082daaecac1b..a4a0a9419711 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/security.h>
 #include "sysfs.h"
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 0cb10884a2fc..776137828dca 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -18,6 +18,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 
 #include "sysfs.h"
 
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1b9a3a1e8a17..b93ec51fa7ac 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,6 +11,7 @@
 */
 
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/kobject.h>
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 1bfc95ad5f71..98158de91d24 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/time.h>
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 4775af401167..37fa7ed062d8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -45,6 +45,7 @@
 
 #include <linux/freezer.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 90492327b383..c2a68baa782f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,6 +34,7 @@
 #include <linux/moduleparam.h>
 #include <linux/debugfs.h>
 #include <linux/math64.h>
+#include <linux/slab.h>
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e26c02ab6cd5..5692cf72b807 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -52,6 +52,7 @@
52#include "ubifs.h" 52#include "ubifs.h"
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/slab.h>
55 56
56static int read_block(struct inode *inode, void *addr, unsigned int block, 57static int read_block(struct inode *inode, void *addr, unsigned int block,
57 struct ubifs_data_node *dn) 58 struct ubifs_data_node *dn)
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index e5a3d8e96bb7..918d1582ca05 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -53,6 +53,7 @@
  * good, and GC takes extra care when moving them.
  */
 
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/list_sort.h>
 #include "ubifs.h"
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e589fedaf1ef..77d5cf4a7547 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -51,6 +51,7 @@
 */
 
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index b2792e84d245..ad7f67b827ea 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -46,6 +46,7 @@
46#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h> 47#include <linux/crc16.h>
48#include <linux/math64.h> 48#include <linux/math64.h>
49#include <linux/slab.h>
49 50
50/** 51/**
51 * do_calc_lpt_geom - calculate sizes for the LPT area. 52 * do_calc_lpt_geom - calculate sizes for the LPT area.
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 8cbfb8248025..13cb7a4237bf 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -26,6 +26,7 @@
 */
 
 #include <linux/crc16.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 868a55ee080f..109c6ea03bb5 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -31,6 +31,7 @@
 */
 
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 57085e43320f..96cb62c8a9dd 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -27,6 +27,7 @@
 */
 
 #include "ubifs.h"
+#include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/math64.h>
 
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e5b1a7d00fa0..2194915220e5 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -31,6 +31,7 @@
 */
 
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2d976366a46..bd2542dad014 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -28,6 +28,7 @@
 #include <linux/fs.h>
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 195830f47569..c74400f88fe0 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
 */
 
 #include "ubifs.h"
+#include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
 
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4b540ee632d5..745eb209be0c 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
 
 #include <linux/fs.h>
 #include <linux/string.h>
-#include <linux/slab.h>
 #include <linux/buffer_head.h>
 
 uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 852e91845688..16064787d2b7 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -26,7 +26,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index cefa8c8913e6..d03a90b6ad69 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -24,6 +24,7 @@
 #include <linux/string.h>	/* for memset */
 #include <linux/nls.h>
 #include <linux/crc-itu-t.h>
+#include <linux/slab.h>
 
 #include "udf_sb.h"
 
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 05ac0fe9c4d3..8d5a506c82eb 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -6,9 +6,9 @@
 */
 
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/gfp.h>
 
 
 /*
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index bc7405585def..666c9db48eb6 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -17,6 +17,7 @@
 */
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index bf85bbe4a9ae..a7bc925c4d60 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -22,6 +22,7 @@
22#include "xfs_inode.h" 22#include "xfs_inode.h"
23#include "xfs_vnodeops.h" 23#include "xfs_vnodeops.h"
24#include "xfs_trace.h" 24#include "xfs_trace.h"
25#include <linux/slab.h>
25#include <linux/xattr.h> 26#include <linux/xattr.h>
26#include <linux/posix_acl_xattr.h> 27#include <linux/posix_acl_xattr.h>
27 28
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 99628508cb11..0f8b9968a803 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -40,6 +40,7 @@
40#include "xfs_vnodeops.h" 40#include "xfs_vnodeops.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include <linux/gfp.h>
43#include <linux/mpage.h> 44#include <linux/mpage.h>
44#include <linux/pagevec.h> 45#include <linux/pagevec.h>
45#include <linux/writeback.h> 46#include <linux/writeback.h>
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index bd111b7e1daa..44c2b0ef9a41 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -18,7 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/slab.h> 21#include <linux/gfp.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4ea1ee18aded..7b26cc2fd284 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -58,6 +58,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/exportfs.h>
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0bf6d61f0528..593c05b4df8d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -18,6 +18,7 @@
 #include <linux/compat.h>
 #include <linux/ioctl.h>
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include "xfs.h"
 #include "xfs_fs.h"
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 61a99608731e..e65a7937f3a4 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -56,6 +56,7 @@
 #include <linux/security.h>
 #include <linux/falloc.h>
 #include <linux/fiemap.h>
+#include <linux/slab.h>
 
 /*
  * Bring the timestamps in the XFS inode uptodate.
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 71345a370d9f..52e06b487ced 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -61,6 +61,7 @@
 
 #include <linux/namei.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/mempool.h>
 #include <linux/writeback.h>