author    Jiri Kosina <jkosina@suse.cz>  2010-04-22 20:08:44 -0400
committer Jiri Kosina <jkosina@suse.cz>  2010-04-22 20:08:44 -0400
commit    6c9468e9eb1252eaefd94ce7f06e1be9b0b641b1 (patch)
tree      797676a336b050bfa1ef879377c07e541b9075d6 /fs
parent    4cb3ca7cd7e2cae8d1daf5345ec99a1e8502cf3f (diff)
parent    c81eddb0e3728661d1585fbc564449c94165cc36 (diff)
Merge branch 'master' into for-next
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/cache.c  1
-rw-r--r--  fs/9p/fid.c  13
-rw-r--r--  fs/9p/v9fs.c  22
-rw-r--r--  fs/9p/v9fs.h  7
-rw-r--r--  fs/9p/vfs_dentry.c  1
-rw-r--r--  fs/9p/vfs_dir.c  14
-rw-r--r--  fs/9p/vfs_file.c  4
-rw-r--r--  fs/9p/vfs_inode.c  10
-rw-r--r--  fs/9p/vfs_super.c  4
-rw-r--r--  fs/Kconfig  1
-rw-r--r--  fs/Makefile  1
-rw-r--r--  fs/adfs/super.c  1
-rw-r--r--  fs/affs/bitmap.c  1
-rw-r--r--  fs/affs/inode.c  1
-rw-r--r--  fs/affs/super.c  1
-rw-r--r--  fs/afs/cache.c  1
-rw-r--r--  fs/afs/cmservice.c  1
-rw-r--r--  fs/afs/dir.c  1
-rw-r--r--  fs/afs/file.c  2
-rw-r--r--  fs/afs/fsclient.c  1
-rw-r--r--  fs/afs/inode.c  1
-rw-r--r--  fs/afs/mntpt.c  26
-rw-r--r--  fs/afs/rxrpc.c  1
-rw-r--r--  fs/afs/security.c  5
-rw-r--r--  fs/afs/vlclient.c  1
-rw-r--r--  fs/afs/vlocation.c  1
-rw-r--r--  fs/afs/vnode.c  1
-rw-r--r--  fs/anon_inodes.c  1
-rw-r--r--  fs/autofs/root.c  1
-rw-r--r--  fs/autofs4/dev-ioctl.c  1
-rw-r--r--  fs/autofs4/root.c  1
-rw-r--r--  fs/befs/datastream.c  1
-rw-r--r--  fs/binfmt_aout.c  16
-rw-r--r--  fs/binfmt_elf_fdpic.c  2
-rw-r--r--  fs/binfmt_em86.c  1
-rw-r--r--  fs/binfmt_flat.c  2
-rw-r--r--  fs/binfmt_script.c  1
-rw-r--r--  fs/bio-integrity.c  1
-rw-r--r--  fs/bio.c  4
-rw-r--r--  fs/block_dev.c  5
-rw-r--r--  fs/btrfs/acl.c  1
-rw-r--r--  fs/btrfs/async-thread.c  1
-rw-r--r--  fs/btrfs/btrfs_inode.h  5
-rw-r--r--  fs/btrfs/compression.c  23
-rw-r--r--  fs/btrfs/ctree.c  5
-rw-r--r--  fs/btrfs/ctree.h  15
-rw-r--r--  fs/btrfs/delayed-ref.c  1
-rw-r--r--  fs/btrfs/disk-io.c  28
-rw-r--r--  fs/btrfs/export.c  4
-rw-r--r--  fs/btrfs/extent-tree.c  55
-rw-r--r--  fs/btrfs/extent_io.c  95
-rw-r--r--  fs/btrfs/extent_io.h  10
-rw-r--r--  fs/btrfs/extent_map.c  1
-rw-r--r--  fs/btrfs/file-item.c  1
-rw-r--r--  fs/btrfs/file.c  24
-rw-r--r--  fs/btrfs/free-space-cache.c  1
-rw-r--r--  fs/btrfs/inode.c  199
-rw-r--r--  fs/btrfs/ioctl.c  710
-rw-r--r--  fs/btrfs/ioctl.h  111
-rw-r--r--  fs/btrfs/locking.c  1
-rw-r--r--  fs/btrfs/ordered-data.c  48
-rw-r--r--  fs/btrfs/ordered-data.h  7
-rw-r--r--  fs/btrfs/ref-cache.c  1
-rw-r--r--  fs/btrfs/relocation.c  5
-rw-r--r--  fs/btrfs/super.c  254
-rw-r--r--  fs/btrfs/transaction.c  118
-rw-r--r--  fs/btrfs/tree-log.c  3
-rw-r--r--  fs/btrfs/volumes.c  56
-rw-r--r--  fs/cachefiles/interface.c  1
-rw-r--r--  fs/cachefiles/namei.c  1
-rw-r--r--  fs/cachefiles/rdwr.c  1
-rw-r--r--  fs/cachefiles/xattr.c  1
-rw-r--r--  fs/ceph/Kconfig  27
-rw-r--r--  fs/ceph/Makefile  39
-rw-r--r--  fs/ceph/README  20
-rw-r--r--  fs/ceph/addr.c  1193
-rw-r--r--  fs/ceph/armor.c  99
-rw-r--r--  fs/ceph/auth.c  258
-rw-r--r--  fs/ceph/auth.h  84
-rw-r--r--  fs/ceph/auth_none.c  122
-rw-r--r--  fs/ceph/auth_none.h  28
-rw-r--r--  fs/ceph/auth_x.c  680
-rw-r--r--  fs/ceph/auth_x.h  49
-rw-r--r--  fs/ceph/auth_x_protocol.h  90
-rw-r--r--  fs/ceph/buffer.c  81
-rw-r--r--  fs/ceph/buffer.h  39
-rw-r--r--  fs/ceph/caps.c  2955
-rw-r--r--  fs/ceph/ceph_debug.h  37
-rw-r--r--  fs/ceph/ceph_frag.c  21
-rw-r--r--  fs/ceph/ceph_frag.h  109
-rw-r--r--  fs/ceph/ceph_fs.c  74
-rw-r--r--  fs/ceph/ceph_fs.h  650
-rw-r--r--  fs/ceph/ceph_hash.c  118
-rw-r--r--  fs/ceph/ceph_hash.h  13
-rw-r--r--  fs/ceph/ceph_strings.c  176
-rw-r--r--  fs/ceph/crush/crush.c  151
-rw-r--r--  fs/ceph/crush/crush.h  180
-rw-r--r--  fs/ceph/crush/hash.c  149
-rw-r--r--  fs/ceph/crush/hash.h  17
-rw-r--r--  fs/ceph/crush/mapper.c  596
-rw-r--r--  fs/ceph/crush/mapper.h  20
-rw-r--r--  fs/ceph/crypto.c  409
-rw-r--r--  fs/ceph/crypto.h  48
-rw-r--r--  fs/ceph/debugfs.c  484
-rw-r--r--  fs/ceph/decode.h  194
-rw-r--r--  fs/ceph/dir.c  1224
-rw-r--r--  fs/ceph/export.c  224
-rw-r--r--  fs/ceph/file.c  938
-rw-r--r--  fs/ceph/inode.c  1774
-rw-r--r--  fs/ceph/ioctl.c  160
-rw-r--r--  fs/ceph/ioctl.h  40
-rw-r--r--  fs/ceph/mds_client.c  3043
-rw-r--r--  fs/ceph/mds_client.h  335
-rw-r--r--  fs/ceph/mdsmap.c  174
-rw-r--r--  fs/ceph/mdsmap.h  54
-rw-r--r--  fs/ceph/messenger.c  2249
-rw-r--r--  fs/ceph/messenger.h  255
-rw-r--r--  fs/ceph/mon_client.c  835
-rw-r--r--  fs/ceph/mon_client.h  119
-rw-r--r--  fs/ceph/msgpool.c  186
-rw-r--r--  fs/ceph/msgpool.h  27
-rw-r--r--  fs/ceph/msgr.h  158
-rw-r--r--  fs/ceph/osd_client.c  1550
-rw-r--r--  fs/ceph/osd_client.h  166
-rw-r--r--  fs/ceph/osdmap.c  1062
-rw-r--r--  fs/ceph/osdmap.h  126
-rw-r--r--  fs/ceph/pagelist.c  55
-rw-r--r--  fs/ceph/pagelist.h  54
-rw-r--r--  fs/ceph/rados.h  376
-rw-r--r--  fs/ceph/snap.c  907
-rw-r--r--  fs/ceph/super.c  1031
-rw-r--r--  fs/ceph/super.h  901
-rw-r--r--  fs/ceph/types.h  29
-rw-r--r--  fs/ceph/xattr.c  845
-rw-r--r--  fs/cifs/cifs_dfs_ref.c  1
-rw-r--r--  fs/cifs/cifs_spnego.c  1
-rw-r--r--  fs/cifs/cifs_unicode.c  1
-rw-r--r--  fs/cifs/cifsacl.c  1
-rw-r--r--  fs/cifs/cifsencrypt.c  1
-rw-r--r--  fs/cifs/cifsfs.c  4
-rw-r--r--  fs/cifs/cifsfs.h  3
-rw-r--r--  fs/cifs/cifsglob.h  2
-rw-r--r--  fs/cifs/cifsproto.h  6
-rw-r--r--  fs/cifs/cifssmb.c  170
-rw-r--r--  fs/cifs/connect.c  1
-rw-r--r--  fs/cifs/dir.c  2
-rw-r--r--  fs/cifs/dns_resolve.c  1
-rw-r--r--  fs/cifs/file.c  36
-rw-r--r--  fs/cifs/inode.c  298
-rw-r--r--  fs/cifs/link.c  1
-rw-r--r--  fs/cifs/readdir.c  1
-rw-r--r--  fs/cifs/sess.c  1
-rw-r--r--  fs/cifs/smbencrypt.c  1
-rw-r--r--  fs/cifs/transport.c  1
-rw-r--r--  fs/cifs/xattr.c  1
-rw-r--r--  fs/coda/dir.c  1
-rw-r--r--  fs/coda/file.c  1
-rw-r--r--  fs/coda/inode.c  1
-rw-r--r--  fs/coda/upcall.c  1
-rw-r--r--  fs/compat.c  1
-rw-r--r--  fs/compat_ioctl.c  2
-rw-r--r--  fs/configfs/inode.c  1
-rw-r--r--  fs/configfs/mount.c  1
-rw-r--r--  fs/configfs/symlink.c  1
-rw-r--r--  fs/debugfs/inode.c  1
-rw-r--r--  fs/devpts/inode.c  1
-rw-r--r--  fs/dlm/config.c  1
-rw-r--r--  fs/dlm/debug_fs.c  1
-rw-r--r--  fs/dlm/lock.c  1
-rw-r--r--  fs/dlm/lowcomms.c  1
-rw-r--r--  fs/dlm/netlink.c  1
-rw-r--r--  fs/dlm/plock.c  1
-rw-r--r--  fs/dlm/user.c  1
-rw-r--r--  fs/ecryptfs/crypto.c  38
-rw-r--r--  fs/ecryptfs/dentry.c  1
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h  13
-rw-r--r--  fs/ecryptfs/file.c  1
-rw-r--r--  fs/ecryptfs/inode.c  130
-rw-r--r--  fs/ecryptfs/keystore.c  1
-rw-r--r--  fs/ecryptfs/kthread.c  1
-rw-r--r--  fs/ecryptfs/main.c  1
-rw-r--r--  fs/ecryptfs/messaging.c  1
-rw-r--r--  fs/ecryptfs/miscdev.c  1
-rw-r--r--  fs/ecryptfs/mmap.c  39
-rw-r--r--  fs/ecryptfs/super.c  2
-rw-r--r--  fs/eventfd.c  1
-rw-r--r--  fs/exofs/inode.c  1
-rw-r--r--  fs/exofs/ios.c  1
-rw-r--r--  fs/exofs/super.c  1
-rw-r--r--  fs/ext2/balloc.c  1
-rw-r--r--  fs/ext2/symlink.c  2
-rw-r--r--  fs/ext2/xattr_security.c  1
-rw-r--r--  fs/ext3/balloc.c  1
-rw-r--r--  fs/ext3/ialloc.c  4
-rw-r--r--  fs/ext3/inode.c  2
-rw-r--r--  fs/ext3/symlink.c  2
-rw-r--r--  fs/ext3/xattr_security.c  1
-rw-r--r--  fs/ext4/block_validity.c  1
-rw-r--r--  fs/ext4/ialloc.c  4
-rw-r--r--  fs/ext4/inode.c  5
-rw-r--r--  fs/ext4/mballoc.c  1
-rw-r--r--  fs/ext4/migrate.c  1
-rw-r--r--  fs/ext4/move_extent.c  1
-rw-r--r--  fs/ext4/super.c  29
-rw-r--r--  fs/ext4/xattr_security.c  1
-rw-r--r--  fs/fat/cache.c  1
-rw-r--r--  fs/fat/namei_vfat.c  6
-rw-r--r--  fs/fifo.c  1
-rw-r--r--  fs/filesystems.c  2
-rw-r--r--  fs/freevxfs/vxfs_subr.c  1
-rw-r--r--  fs/fs-writeback.c  134
-rw-r--r--  fs/fscache/object-list.c  1
-rw-r--r--  fs/fscache/object.c  6
-rw-r--r--  fs/fscache/operation.c  5
-rw-r--r--  fs/fscache/page.c  2
-rw-r--r--  fs/fscache/stats.c  4
-rw-r--r--  fs/fuse/cuse.c  1
-rw-r--r--  fs/generic_acl.c  1
-rw-r--r--  fs/gfs2/Kconfig  1
-rw-r--r--  fs/gfs2/bmap.c  1
-rw-r--r--  fs/gfs2/dentry.c  1
-rw-r--r--  fs/gfs2/export.c  1
-rw-r--r--  fs/gfs2/file.c  2
-rw-r--r--  fs/gfs2/glops.c  1
-rw-r--r--  fs/gfs2/incore.h  2
-rw-r--r--  fs/gfs2/lock_dlm.c  1
-rw-r--r--  fs/gfs2/log.c  3
-rw-r--r--  fs/gfs2/rgrp.h  2
-rw-r--r--  fs/gfs2/sys.c  1
-rw-r--r--  fs/gfs2/util.c  1
-rw-r--r--  fs/hfs/bnode.c  1
-rw-r--r--  fs/hfs/btree.c  1
-rw-r--r--  fs/hfs/mdb.c  1
-rw-r--r--  fs/hfs/super.c  1
-rw-r--r--  fs/hfsplus/options.c  1
-rw-r--r--  fs/hostfs/hostfs_kern.c  1
-rw-r--r--  fs/hpfs/buffer.c  1
-rw-r--r--  fs/hpfs/dir.c  1
-rw-r--r--  fs/hpfs/inode.c  1
-rw-r--r--  fs/hpfs/super.c  1
-rw-r--r--  fs/ioprio.c  1
-rw-r--r--  fs/isofs/dir.c  1
-rw-r--r--  fs/isofs/namei.c  1
-rw-r--r--  fs/jbd/commit.c  1
-rw-r--r--  fs/jbd/recovery.c  1
-rw-r--r--  fs/jbd2/recovery.c  1
-rw-r--r--  fs/jffs2/compr_lzo.c  1
-rw-r--r--  fs/jffs2/compr_zlib.c  1
-rw-r--r--  fs/jffs2/debug.c  1
-rw-r--r--  fs/jffs2/file.c  1
-rw-r--r--  fs/jffs2/nodelist.c  1
-rw-r--r--  fs/jffs2/nodemgmt.c  1
-rw-r--r--  fs/jffs2/readinode.c  2
-rw-r--r--  fs/jffs2/symlink.c  1
-rw-r--r--  fs/jffs2/write.c  1
-rw-r--r--  fs/jfs/acl.c  1
-rw-r--r--  fs/jfs/inode.c  2
-rw-r--r--  fs/jfs/jfs_dmap.c  17
-rw-r--r--  fs/jfs/jfs_dmap.h  6
-rw-r--r--  fs/jfs/jfs_dtree.c  1
-rw-r--r--  fs/jfs/jfs_imap.c  1
-rw-r--r--  fs/jfs/jfs_inode.h  1
-rw-r--r--  fs/jfs/jfs_logmgr.c  1
-rw-r--r--  fs/jfs/jfs_metapage.c  1
-rw-r--r--  fs/jfs/jfs_unicode.h  1
-rw-r--r--  fs/jfs/namei.c  4
-rw-r--r--  fs/jfs/resize.c  6
-rw-r--r--  fs/jfs/super.c  1
-rw-r--r--  fs/jfs/symlink.c  14
-rw-r--r--  fs/jfs/xattr.c  1
-rw-r--r--  fs/libfs.c  1
-rw-r--r--  fs/lockd/clntlock.c  1
-rw-r--r--  fs/lockd/clntproc.c  1
-rw-r--r--  fs/lockd/mon.c  1
-rw-r--r--  fs/lockd/svc.c  1
-rw-r--r--  fs/lockd/svc4proc.c  1
-rw-r--r--  fs/lockd/svclock.c  1
-rw-r--r--  fs/lockd/svcproc.c  1
-rw-r--r--  fs/lockd/svcsubs.c  1
-rw-r--r--  fs/logfs/dev_bdev.c  10
-rw-r--r--  fs/logfs/dir.c  6
-rw-r--r--  fs/logfs/gc.c  9
-rw-r--r--  fs/logfs/inode.c  1
-rw-r--r--  fs/logfs/journal.c  37
-rw-r--r--  fs/logfs/logfs.h  16
-rw-r--r--  fs/logfs/readwrite.c  89
-rw-r--r--  fs/logfs/segment.c  63
-rw-r--r--  fs/logfs/super.c  27
-rw-r--r--  fs/minix/itree_v1.c  1
-rw-r--r--  fs/mpage.c  1
-rw-r--r--  fs/namei.c  18
-rw-r--r--  fs/ncpfs/dir.c  1
-rw-r--r--  fs/ncpfs/file.c  1
-rw-r--r--  fs/ncpfs/ioctl.c  1
-rw-r--r--  fs/ncpfs/mmap.c  2
-rw-r--r--  fs/ncpfs/sock.c  1
-rw-r--r--  fs/ncpfs/symlink.c  1
-rw-r--r--  fs/nfs/cache_lib.c  1
-rw-r--r--  fs/nfs/callback_proc.c  1
-rw-r--r--  fs/nfs/callback_xdr.c  2
-rw-r--r--  fs/nfs/client.c  4
-rw-r--r--  fs/nfs/delegation.c  1
-rw-r--r--  fs/nfs/delegation.h  6
-rw-r--r--  fs/nfs/dir.c  4
-rw-r--r--  fs/nfs/direct.c  1
-rw-r--r--  fs/nfs/dns_resolve.c  1
-rw-r--r--  fs/nfs/file.c  5
-rw-r--r--  fs/nfs/fscache.c  1
-rw-r--r--  fs/nfs/inode.c  11
-rw-r--r--  fs/nfs/namespace.c  1
-rw-r--r--  fs/nfs/nfs2xdr.c  1
-rw-r--r--  fs/nfs/nfs3acl.c  1
-rw-r--r--  fs/nfs/nfs3proc.c  1
-rw-r--r--  fs/nfs/nfs3xdr.c  1
-rw-r--r--  fs/nfs/nfs4namespace.c  1
-rw-r--r--  fs/nfs/nfs4proc.c  9
-rw-r--r--  fs/nfs/nfs4xdr.c  3
-rw-r--r--  fs/nfs/pagelist.c  23
-rw-r--r--  fs/nfs/proc.c  1
-rw-r--r--  fs/nfs/super.c  26
-rw-r--r--  fs/nfs/symlink.c  1
-rw-r--r--  fs/nfs/write.c  44
-rw-r--r--  fs/nfs_common/nfsacl.c  1
-rw-r--r--  fs/nfsd/export.c  1
-rw-r--r--  fs/nfsd/nfs2acl.c  1
-rw-r--r--  fs/nfsd/nfs3acl.c  1
-rw-r--r--  fs/nfsd/nfs4acl.c  1
-rw-r--r--  fs/nfsd/nfs4callback.c  1
-rw-r--r--  fs/nfsd/nfs4idmap.c  1
-rw-r--r--  fs/nfsd/nfs4proc.c  1
-rw-r--r--  fs/nfsd/nfs4recover.c  1
-rw-r--r--  fs/nfsd/nfs4state.c  1
-rw-r--r--  fs/nfsd/nfs4xdr.c  1
-rw-r--r--  fs/nfsd/nfscache.c  2
-rw-r--r--  fs/nfsd/nfsctl.c  1
-rw-r--r--  fs/nfsd/vfs.c  1
-rw-r--r--  fs/nilfs2/alloc.c  3
-rw-r--r--  fs/nilfs2/alloc.h  2
-rw-r--r--  fs/nilfs2/btnode.c  1
-rw-r--r--  fs/nilfs2/btree.c  2
-rw-r--r--  fs/nilfs2/dat.c  2
-rw-r--r--  fs/nilfs2/dir.c  2
-rw-r--r--  fs/nilfs2/gcinode.c  5
-rw-r--r--  fs/nilfs2/inode.c  1
-rw-r--r--  fs/nilfs2/ioctl.c  3
-rw-r--r--  fs/nilfs2/mdt.c  1
-rw-r--r--  fs/nilfs2/page.c  5
-rw-r--r--  fs/nilfs2/recovery.c  1
-rw-r--r--  fs/nilfs2/segbuf.c  19
-rw-r--r--  fs/nilfs2/segment.c  24
-rw-r--r--  fs/nilfs2/segment.h  4
-rw-r--r--  fs/nilfs2/sufile.c  2
-rw-r--r--  fs/nilfs2/super.c  4
-rw-r--r--  fs/nilfs2/the_nilfs.c  2
-rw-r--r--  fs/nilfs2/the_nilfs.h  1
-rw-r--r--  fs/notify/fsnotify.c  1
-rw-r--r--  fs/notify/inode_mark.c  1
-rw-r--r--  fs/ntfs/aops.c  1
-rw-r--r--  fs/ntfs/attrib.c  1
-rw-r--r--  fs/ntfs/compress.c  1
-rw-r--r--  fs/ntfs/dir.c  1
-rw-r--r--  fs/ntfs/file.c  1
-rw-r--r--  fs/ntfs/index.c  2
-rw-r--r--  fs/ntfs/mft.c  1
-rw-r--r--  fs/ntfs/namei.c  1
-rw-r--r--  fs/ntfs/super.c  25
-rw-r--r--  fs/ocfs2/acl.c  78
-rw-r--r--  fs/ocfs2/buffer_head_io.c  1
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c  1
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c  1
-rw-r--r--  fs/ocfs2/cluster/quorum.c  1
-rw-r--r--  fs/ocfs2/dlm/dlmast.c  1
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c  1
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c  4
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c  1
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c  1
-rw-r--r--  fs/ocfs2/extent_map.c  1
-rw-r--r--  fs/ocfs2/heartbeat.c  1
-rw-r--r--  fs/ocfs2/inode.c  16
-rw-r--r--  fs/ocfs2/localalloc.c  10
-rw-r--r--  fs/ocfs2/locks.c  2
-rw-r--r--  fs/ocfs2/mmap.c  1
-rw-r--r--  fs/ocfs2/namei.c  28
-rw-r--r--  fs/ocfs2/ocfs2.h  14
-rw-r--r--  fs/ocfs2/quota_global.c  1
-rw-r--r--  fs/ocfs2/quota_local.c  1
-rw-r--r--  fs/ocfs2/refcounttree.c  2
-rw-r--r--  fs/ocfs2/stack_o2cb.c  1
-rw-r--r--  fs/ocfs2/stack_user.c  1
-rw-r--r--  fs/ocfs2/suballoc.c  129
-rw-r--r--  fs/ocfs2/suballoc.h  5
-rw-r--r--  fs/ocfs2/sysfile.c  1
-rw-r--r--  fs/ocfs2/xattr.c  12
-rw-r--r--  fs/omfs/inode.c  1
-rw-r--r--  fs/open.c  2
-rw-r--r--  fs/partitions/check.c  1
-rw-r--r--  fs/partitions/efi.c  1
-rw-r--r--  fs/partitions/msdos.c  85
-rw-r--r--  fs/proc/array.c  1
-rw-r--r--  fs/proc/base.c  6
-rw-r--r--  fs/proc/generic.c  1
-rw-r--r--  fs/proc/inode.c  1
-rw-r--r--  fs/proc/kcore.c  3
-rw-r--r--  fs/proc/nommu.c  1
-rw-r--r--  fs/proc/proc_devtree.c  1
-rw-r--r--  fs/proc/proc_net.c  1
-rw-r--r--  fs/proc/stat.c  1
-rw-r--r--  fs/proc/task_mmu.c  115
-rw-r--r--  fs/proc/task_nommu.c  1
-rw-r--r--  fs/proc/vmcore.c  1
-rw-r--r--  fs/quota/Kconfig  8
-rw-r--r--  fs/quota/dquot.c  28
-rw-r--r--  fs/quota/netlink.c  1
-rw-r--r--  fs/ramfs/file-nommu.c  1
-rw-r--r--  fs/ramfs/inode.c  1
-rw-r--r--  fs/read_write.c  2
-rw-r--r--  fs/reiserfs/dir.c  1
-rw-r--r--  fs/reiserfs/fix_node.c  1
-rw-r--r--  fs/reiserfs/inode.c  1
-rw-r--r--  fs/reiserfs/journal.c  16
-rw-r--r--  fs/reiserfs/namei.c  1
-rw-r--r--  fs/reiserfs/super.c  11
-rw-r--r--  fs/reiserfs/xattr.c  1
-rw-r--r--  fs/reiserfs/xattr_acl.c  1
-rw-r--r--  fs/reiserfs/xattr_security.c  3
-rw-r--r--  fs/signalfd.c  1
-rw-r--r--  fs/smbfs/file.c  1
-rw-r--r--  fs/smbfs/smbiod.c  1
-rw-r--r--  fs/smbfs/symlink.c  1
-rw-r--r--  fs/splice.c  1
-rw-r--r--  fs/squashfs/symlink.c  1
-rw-r--r--  fs/squashfs/zlib_wrapper.c  1
-rw-r--r--  fs/sync.c  1
-rw-r--r--  fs/sysfs/inode.c  1
-rw-r--r--  fs/sysfs/mount.c  1
-rw-r--r--  fs/sysfs/symlink.c  1
-rw-r--r--  fs/timerfd.c  1
-rw-r--r--  fs/ubifs/commit.c  1
-rw-r--r--  fs/ubifs/debug.c  1
-rw-r--r--  fs/ubifs/file.c  1
-rw-r--r--  fs/ubifs/gc.c  1
-rw-r--r--  fs/ubifs/io.c  1
-rw-r--r--  fs/ubifs/lpt.c  1
-rw-r--r--  fs/ubifs/lpt_commit.c  1
-rw-r--r--  fs/ubifs/recovery.c  1
-rw-r--r--  fs/ubifs/sb.c  1
-rw-r--r--  fs/ubifs/tnc.c  1
-rw-r--r--  fs/ubifs/ubifs.h  1
-rw-r--r--  fs/ubifs/xattr.c  1
-rw-r--r--  fs/udf/balloc.c  10
-rw-r--r--  fs/udf/file.c  2
-rw-r--r--  fs/udf/inode.c  2
-rw-r--r--  fs/udf/namei.c  9
-rw-r--r--  fs/udf/partition.c  1
-rw-r--r--  fs/udf/symlink.c  1
-rw-r--r--  fs/udf/udfdecl.h  3
-rw-r--r--  fs/udf/unicode.c  1
-rw-r--r--  fs/xattr_acl.c  2
-rw-r--r--  fs/xfs/linux-2.6/kmem.c  1
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c  1
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c  14
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c  83
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c  1
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c  1
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c  1
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c  1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c  4
-rw-r--r--  fs/xfs/xfs_log.c  38
468 files changed, 31182 insertions, 1378 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index e777961939f3..0dbe0d139ac2 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -22,6 +22,7 @@
 
 #include <linux/jiffies.h>
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 08b2eb157048..7317b39b2815 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -110,7 +111,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
 	int i, n, l, clone, any, access;
 	u32 uid;
-	struct p9_fid *fid;
+	struct p9_fid *fid, *old_fid = NULL;
 	struct dentry *d, *ds;
 	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
@@ -183,10 +184,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 		l = min(n - i, P9_MAXWELEM);
 		fid = p9_client_walk(fid, l, &wnames[i], clone);
 		if (IS_ERR(fid)) {
+			if (old_fid) {
+				/*
+				 * If we fail, clunk fid which are mapping
+				 * to path component and not the last component
+				 * of the path.
+				 */
+				p9_client_clunk(old_fid);
+			}
 			kfree(wnames);
 			return fid;
 		}
-
+		old_fid = fid;
 		i += l;
 		clone = 0;
 	}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6c7f6a251115..5c5bc8480070 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
@@ -241,7 +242,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER;
+	v9ses->flags = V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
@@ -262,8 +263,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		goto error;
 	}
 
-	if (!p9_is_proto_dotu(v9ses->clnt))
-		v9ses->flags &= ~V9FS_PROTO_2000U;
+	if (p9_is_proto_dotl(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000L;
+	else if (p9_is_proto_dotu(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000U;
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
@@ -340,6 +343,19 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 	p9_client_disconnect(v9ses->clnt);
 }
 
+/**
+ * v9fs_session_begin_cancel - Begin terminate of a session
+ * @v9ses: session to terminate
+ *
+ * After this call we don't allow any request other than clunk.
+ */
+
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
+{
+	P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+	p9_client_begin_disconnect(v9ses->clnt);
+}
+
 extern int v9fs_error_init(void);
 
 static struct kobject *v9fs_kobj;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 79000bf62491..a0a8d3dd1361 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -24,7 +24,7 @@
 /**
  * enum p9_session_flags - option flags for each 9P session
  * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
- * @V9FS_PROTO_2010L: whether or not to use 9P2010.l extensions
+ * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
  * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
  * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
  * @V9FS_ACCESS_ANY: use a single attach for all users
@@ -34,7 +34,7 @@
  */
 enum p9_session_flags {
 	V9FS_PROTO_2000U	= 0x01,
-	V9FS_PROTO_2010L	= 0x02,
+	V9FS_PROTO_2000L	= 0x02,
 	V9FS_ACCESS_SINGLE	= 0x04,
 	V9FS_ACCESS_USER	= 0x08,
 	V9FS_ACCESS_ANY		= 0x0C,
@@ -108,6 +108,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 				  char *);
 void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
 
 #define V9FS_MAGIC 0x01021997
 
@@ -130,5 +131,5 @@ static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
 
 static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 {
-	return v9ses->flags & V9FS_PROTO_2010L;
+	return v9ses->flags & V9FS_PROTO_2000L;
 }
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d74325295b1e..cbf4e50f3933 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 6580aa449541..0adfd64dfcee 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -32,6 +32,7 @@
 #include <linux/sched.h>
 #include <linux/inet.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -76,6 +77,15 @@ static inline int dt_type(struct p9_wstat *mistat)
 	return rettype;
 }
 
+static void p9stat_init(struct p9_wstat *stbuf)
+{
+	stbuf->name = NULL;
+	stbuf->uid = NULL;
+	stbuf->gid = NULL;
+	stbuf->muid = NULL;
+	stbuf->extension = NULL;
+}
+
 /**
  * v9fs_dir_readdir - read a directory
  * @filp: opened file structure
@@ -121,6 +131,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	rdir = (struct p9_rdir *) fid->rdir;
 
 	err = mutex_lock_interruptible(&rdir->mutex);
+	if (err)
+		return err;
 	while (err == 0) {
 		if (rdir->tail == rdir->head) {
 			err = v9fs_file_readn(filp, rdir->buf, NULL,
@@ -131,8 +143,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			rdir->head = 0;
 			rdir->tail = err;
 		}
-
 		while (rdir->head < rdir->tail) {
+			p9stat_init(&st);
 			err = p9stat_read(rdir->buf + rdir->head,
 					buflen - rdir->head, &st,
 					fid->clnt->proto_version);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 36122683fae8..df52d488d2a6 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -114,7 +114,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
 	/* No mandatory locks */
-	if (__mandatory_lock(inode))
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
@@ -215,7 +215,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	struct p9_fid *fid;
 	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	int origin = *offset;
+	loff_t origin = *offset;
 	unsigned long pg_start, pg_end;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5fe45d692c9f..f2434fc9d2c4 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -431,6 +432,7 @@ error:
 
 static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
+	int retval;
 	struct inode *file_inode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *v9fid;
@@ -444,7 +446,10 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 	if (IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
 
-	return p9_client_remove(v9fid);
+	retval = p9_client_remove(v9fid);
+	if (!retval)
+		drop_nlink(file_inode);
+	return retval;
 }
 
 static int
@@ -656,6 +661,9 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
 		dir, dentry->d_name.name, dentry, nameidata);
 
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
 	dfid = v9fs_fid_lookup(dentry->d_parent);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 69357c0d9899..491108bd6e0d 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,7 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -193,6 +194,7 @@ static void v9fs_kill_super(struct super_block *s)
 
 	kill_anon_super(s);
 
+	v9fs_session_cancel(v9ses);
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
 	s->s_fs_info = NULL;
@@ -205,7 +207,7 @@ v9fs_umount_begin(struct super_block *sb)
 	struct v9fs_session_info *v9ses;
 
 	v9ses = sb->s_fs_info;
-	v9fs_session_cancel(v9ses);
+	v9fs_session_begin_cancel(v9ses);
 }
 
 static const struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index 7405f071be67..5f85b5947613 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config NFS_COMMON
 
 source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index c3633aa46911..97f340f14ba2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
 obj-$(CONFIG_EXOFS_FS)		+= exofs/
+obj-$(CONFIG_CEPH_FS)		+= ceph/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 6910a98bd73c..4a3af7075c1d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/parser.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/statfs.h>
 #include "adfs.h"
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index 8306d53307ed..3e262711ae06 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -7,6 +7,7 @@
  * block allocation, deallocation, calculation of free space.
  */
 
+#include <linux/slab.h>
 #include "affs.h"
 
 /* This is, of course, shamelessly stolen from fs/minix */
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index c9744d771d98..f4b2a4ee4f91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -10,6 +10,7 @@
  * (C) 1991  Linus Torvalds - minix filesystem
  */
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include "affs.h"
 
 extern const struct inode_operations affs_symlink_inode_operations;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d41e9673cd97..16a3e4765f68 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index e2b1d3f16519..0fb315dd4d2a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/slab.h>
 #include <linux/sched.h>
 #include "internal.h"
 
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index eb765489164f..a3bcec75c54a 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/ip.h>
 #include "internal.h"
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 88067f36e5e7..adc1cb771b57 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 39b301662f22..0df9bc2b724d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -12,10 +12,10 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 023b95b0d9d7..4bd0218473a9 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/circ_buf.h>
 #include "internal.h"
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c048f0658751..d00b312e3110 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -16,7 +16,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/sched.h>
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5ffb570cd3a8..b3feddc4f7d6 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -12,11 +12,11 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 
@@ -138,9 +138,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 {
 	struct afs_super_info *super;
 	struct vfsmount *mnt;
-	struct page *page = NULL;
+	struct page *page;
 	size_t size;
-	char *buf, *devname = NULL, *options = NULL;
+	char *buf, *devname, *options;
 	int ret;
 
 	_enter("{%s}", mntpt->d_name.name);
@@ -150,22 +150,22 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	ret = -EINVAL;
 	size = mntpt->d_inode->i_size;
 	if (size > PAGE_SIZE - 1)
-		goto error;
+		goto error_no_devname;
 
 	ret = -ENOMEM;
 	devname = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!devname)
-		goto error;
+		goto error_no_devname;
 
 	options = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!options)
-		goto error;
+		goto error_no_options;
 
 	/* read the contents of the AFS special symlink */
 	page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
-		goto error;
+		goto error_no_page;
 	}
 
 	ret = -EIO;
@@ -196,12 +196,12 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	return mnt;
 
 error:
-	if (page)
-		page_cache_release(page);
-	if (devname)
-		free_page((unsigned long) devname);
-	if (options)
-		free_page((unsigned long) options);
+	page_cache_release(page);
+error_no_page:
+	free_page((unsigned long) options);
+error_no_options:
+	free_page((unsigned long) devname);
+error_no_devname:
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bde3f19c0995..67cf810e0fd6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/slab.h>
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include <rxrpc/packet.h>
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
 	if (!permits)
 		goto out_unlock;
 
-	memcpy(permits->permits, xpermits->permits,
-	       count * sizeof(struct afs_permit));
+	if (xpermits)
+		memcpy(permits->permits, xpermits->permits,
+		       count * sizeof(struct afs_permit));
 
 	_debug("key %x access %x",
 	       key_serial(key), vnode->status.caller_access);
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 36c1306e09e0..340afd0cd182 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 6e689208def2..9ac260d1361d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 2f05c4fc2a70..25cf4c3f4ff7 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 2de009565d8e..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -12,7 +12,6 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 4a1401cea0a1..8713c7cfbc79 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -13,6 +13,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/smp_lock.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index c8a80dffb455..d29b7f6df862 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -22,6 +22,7 @@
 #include <linux/magic.h>
 #include <linux/dcache.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include "autofs_i.h"
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index a015b49891df..109a6c606d92 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -15,6 +15,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include "autofs_i.h"
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index e3287d0d1a58..59096b5e0fc7 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -11,7 +11,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/string.h>
 
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 15d80bb35d6f..f96eff04e11a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -20,11 +20,11 @@
 #include <linux/fcntl.h>
 #include <linux/ptrace.h>
 #include <linux/user.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/personality.h>
 #include <linux/init.h>
 #include <linux/coredump.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -75,14 +75,16 @@ static int aout_core_dump(struct coredump_params *cprm)
 	struct file *file = cprm->file;
 	mm_segment_t fs;
 	int has_dumped = 0;
-	unsigned long dump_start, dump_size;
+	void __user *dump_start;
+	int dump_size;
 	struct user dump;
 #ifdef __alpha__
-#       define START_DATA(u)	(u.start_data)
+#       define START_DATA(u)	((void __user *)u.start_data)
 #else
-#	define START_DATA(u)	((u.u_tsize << PAGE_SHIFT) + u.start_code)
+#	define START_DATA(u)	((void __user *)((u.u_tsize << PAGE_SHIFT) + \
+				u.start_code))
 #endif
-# define START_STACK(u)   (u.start_stack)
+# define START_STACK(u)   ((void __user *)u.start_stack)
 
 	fs = get_fs();
 	set_fs(KERNEL_DS);
@@ -104,9 +106,9 @@ static int aout_core_dump(struct coredump_params *cprm)
 
 /* make sure we actually have a data and stack area to dump */
 	set_fs(USER_DS);
-	if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
 		dump.u_dsize = 0;
-	if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
 		dump.u_ssize = 0;
 
 	set_fs(KERNEL_DS);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c32d00a6690..7ab23e006e4c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1590,7 +1590,7 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
 	struct vm_area_struct *vma;
 	size_t size = 0;
 
-	for (vma = current->mm->mmap; vma; vma->vm_next)
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next)
 		if (maydump(vma, mm_flags))
 			size += vma->vm_end - vma->vm_start;
 	return size;
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 32fb00b52cd0..b8e8b0acf9bd 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -11,7 +11,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/elf.h>
 #include <linux/init.h>
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e0e769bdca59..49566c1687d8 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -355,7 +355,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
 
 	if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
 		printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
-		       (int) r,(int)(start_brk-start_code),(int)text_len);
+		       (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
 		goto failed;
 	}
 
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 08343505e184..aca9d55afb22 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -8,7 +8,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/init.h>
 #include <linux/file.h>
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index a16f29e888cd..612a5c38d3c1 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -24,6 +24,7 @@
 #include <linux/mempool.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
+#include <linux/slab.h>
 
 struct integrity_slab {
 	struct kmem_cache *slab;
diff --git a/fs/bio.c b/fs/bio.c
index e1f922184b45..e7bf6ca64dcf 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -554,7 +554,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 			.bi_rw = bio->bi_rw,
 		};
 
-		if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+		if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
 			prev->bv_len -= len;
 			return 0;
 		}
@@ -607,7 +607,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 * merge_bvec_fn() returns number of bytes it can accept
 		 * at this offset
 		 */
-		if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
+		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
 			bvec->bv_page = NULL;
 			bvec->bv_len = 0;
 			bvec->bv_offset = 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d11d0289f3d2..2a6d0193f139 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -404,7 +404,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
  * NULL first argument is nfsd_sync_dir() and that's not a directory.
  */
 
-static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
 {
 	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
 	int error;
@@ -418,6 +418,7 @@ static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
 		error = 0;
 	return error;
 }
+EXPORT_SYMBOL(blkdev_fsync);
 
 /*
  * pseudo-fs
@@ -1481,7 +1482,7 @@ const struct file_operations def_blk_fops = {
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= blkdev_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= block_fsync,
+	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6df6d6ed74fd..6ef7b26724ec 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include "ctree.h"
 #include "btrfs_inode.h"
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..462859a30141 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3f1f50d9d916..7a4dee199832 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -153,6 +153,11 @@ struct btrfs_inode {
 	unsigned ordered_data_close:1;
 	unsigned dummy_inode:1;
 
+	/*
+	 * always compress this one file
+	 */
+	unsigned force_compress:1;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a11a32058b50..396039b3a8a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,7 +31,7 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
-#include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -445,7 +445,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	unsigned long nr_pages = 0;
 	struct extent_map *em;
 	struct address_space *mapping = inode->i_mapping;
-	struct pagevec pvec;
 	struct extent_map_tree *em_tree;
 	struct extent_io_tree *tree;
 	u64 end;
@@ -461,7 +460,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 
 	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
 
-	pagevec_init(&pvec, 0);
 	while (last_offset < compressed_end) {
 		page_index = last_offset >> PAGE_CACHE_SHIFT;
 
@@ -478,26 +476,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			goto next;
 		}
 
-		page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+		page = __page_cache_alloc(mapping_gfp_mask(mapping) &
+								~__GFP_FS);
 		if (!page)
 			break;
 
-		page->index = page_index;
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (add_to_page_cache(page, mapping,
-				      page->index, GFP_NOFS)) {
+		if (add_to_page_cache_lru(page, mapping, page_index,
+					  GFP_NOFS)) {
 			page_cache_release(page);
 			goto next;
 		}
 
-		/* open coding of lru_cache_add, also not exported */
-		page_cache_get(page);
-		if (!pagevec_add(&pvec, page))
-			__pagevec_lru_add_file(&pvec);
-
 		end = last_offset + PAGE_CACHE_SIZE - 1;
 		/*
 		 * at this point, we have a locked page in the page cache
@@ -551,8 +540,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 next:
 		last_offset += PAGE_CACHE_SIZE;
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	return 0;
 }
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..6795a713b205 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -3040,6 +3041,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 	if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
 		goto err;
 
+	/* the leaf has changed, it now has room.  return now */
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
+		goto err;
+
 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8b5cfdd4bfc1..746a7248678e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
@@ -373,11 +374,13 @@ struct btrfs_super_block {
  * ones specified below then we will fail to mount
  */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(2ULL << 0)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP		\
-	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
+	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |	\
+	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -832,7 +835,6 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
 	unsigned long mount_opt;
-	u64 max_extent;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
@@ -1182,7 +1184,6 @@ struct btrfs_root {
 #define BTRFS_INODE_NOATIME		(1 << 9)
 #define BTRFS_INODE_DIRSYNC		(1 << 10)
 
-
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -1842,7 +1843,7 @@ BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
 			 compat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
-			 compat_flags, 64);
+			 compat_ro_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
 			 incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
@@ -2310,7 +2311,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2310 u32 min_type); 2311 u32 min_type);
2311 2312
2312int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2313int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2313int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); 2314int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2315 struct extent_state **cached_state);
2314int btrfs_writepages(struct address_space *mapping, 2316int btrfs_writepages(struct address_space *mapping,
2315 struct writeback_control *wbc); 2317 struct writeback_control *wbc);
2316int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 2318int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2335,7 +2337,7 @@ int btrfs_init_cachep(void);
2335void btrfs_destroy_cachep(void); 2337void btrfs_destroy_cachep(void);
2336long btrfs_ioctl_trans_end(struct file *file); 2338long btrfs_ioctl_trans_end(struct file *file);
2337struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 2339struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2338 struct btrfs_root *root); 2340 struct btrfs_root *root, int *was_new);
2339int btrfs_commit_write(struct file *file, struct page *page, 2341int btrfs_commit_write(struct file *file, struct page *page,
2340 unsigned from, unsigned to); 2342 unsigned from, unsigned to);
2341struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 2343struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2386,7 +2388,6 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2386ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 2388ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2387 2389
2388/* super.c */ 2390/* super.c */
2389u64 btrfs_parse_size(char *str);
2390int btrfs_parse_options(struct btrfs_root *root, char *options); 2391int btrfs_parse_options(struct btrfs_root *root, char *options);
2391int btrfs_sync_fs(struct super_block *sb, int wait); 2392int btrfs_sync_fs(struct super_block *sb, int wait);
2392 2393
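
As an illustrative aside, not part of the patch: the point of widening BTRFS_FEATURE_INCOMPAT_SUPP above is that mount can mask off every bit this kernel understands and refuse anything left over. A minimal sketch of that check, where the function name and the error message are invented for illustration (btrfs_super_incompat_flags is the accessor generated by the BTRFS_SETGET_STACK_FUNCS line in the hunk):

static int check_incompat_flags(struct btrfs_super_block *disk_super)
{
	/* keep only the bits this kernel does not understand */
	u64 unsupported = btrfs_super_incompat_flags(disk_super) &
			  ~BTRFS_FEATURE_INCOMPAT_SUPP;

	if (unsupported) {
		printk(KERN_ERR "btrfs: unsupported incompat flags %llx\n",
		       (unsigned long long)unsupported);
		return -EINVAL;	/* refuse the mount */
	}
	return 0;
}
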
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..902ce507c4e3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/sort.h>
 #include "ctree.h"
 #include "delayed-ref.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0427183e3e05..e7b8f2c89ccb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/crc32c.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -263,13 +264,15 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 static int verify_parent_transid(struct extent_io_tree *io_tree,
 				 struct extent_buffer *eb, u64 parent_transid)
 {
+	struct extent_state *cached_state = NULL;
 	int ret;
 
 	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
 		return 0;
 
-	lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
-	if (extent_buffer_uptodate(io_tree, eb) &&
+	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
+			 0, &cached_state, GFP_NOFS);
+	if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
 	    btrfs_header_generation(eb) == parent_transid) {
 		ret = 0;
 		goto out;
@@ -282,10 +285,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		       (unsigned long long)btrfs_header_generation(eb));
 	}
 	ret = 1;
-	clear_extent_buffer_uptodate(io_tree, eb);
+	clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
 out:
-	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
@@ -1632,7 +1635,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	fs_info->sb = sb;
-	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
 
@@ -1920,7 +1922,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	csum_root->track_dirty = 1;
 
-	btrfs_read_block_groups(extent_root);
+	ret = btrfs_read_block_groups(extent_root);
+	if (ret) {
+		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
+		goto fail_block_groups;
+	}
 
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
@@ -1930,7 +1936,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (IS_ERR(fs_info->cleaner_kthread))
-		goto fail_csum_root;
+		goto fail_block_groups;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
@@ -2018,7 +2024,8 @@ fail_cleaner:
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
-fail_csum_root:
+fail_block_groups:
+	btrfs_free_block_groups(fs_info);
 	free_extent_buffer(csum_root->node);
 	free_extent_buffer(csum_root->commit_root);
 fail_dev_root:
@@ -2497,7 +2504,8 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 	int ret;
 	struct inode *btree_inode = buf->first_page->mapping->host;
 
-	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
+				     NULL);
 	if (!ret)
 		return ret;
 
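
As an illustrative aside, not part of the patch: most hunks in this series convert a lock_extent()/unlock_extent() pair into the cached-state variant, so the extent_state found while locking is handed back at unlock instead of being searched for again in the io tree. A minimal sketch of the caller pattern, using the signatures declared in extent_io.h later in this diff; the function and variable names here are placeholders:

static void locked_range_example(struct extent_io_tree *tree, u64 start,
				 u64 end)
{
	struct extent_state *cached_state = NULL;

	/* lock and remember the extent_state that covers [start, end] */
	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);

	/* ... operate on the locked range ... */

	/* unlock reuses cached_state and drops the reference */
	unlock_extent_cached(tree, start, end, &cached_state, GFP_NOFS);
}
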
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ba5c3fd5ab8c..951ef09b82f4 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -95,7 +95,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	inode = btrfs_iget(sb, &key, root);
+	inode = btrfs_iget(sb, &key, root, NULL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto fail;
@@ -223,7 +223,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
 	if (!IS_ERR(dentry))
 		dentry->d_op = &btrfs_dentry_operations;
 	return dentry;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 559f72489b3b..b34d32fdaaec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -2676,6 +2677,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 	INIT_LIST_HEAD(&found->block_groups);
 	init_rwsem(&found->groups_sem);
+	init_waitqueue_head(&found->flush_wait);
+	init_waitqueue_head(&found->allocate_wait);
 	spin_lock_init(&found->lock);
 	found->flags = flags;
 	found->total_bytes = total_bytes;
@@ -2846,7 +2849,7 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
 	}
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
-	BTRFS_I(inode)->reserved_extents--;
+	BTRFS_I(inode)->reserved_extents -= num_items;
 	BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
 
 	if (meta_sinfo->bytes_delalloc < num_bytes) {
@@ -2944,12 +2947,10 @@ static void flush_delalloc(struct btrfs_root *root,
 
 	spin_lock(&info->lock);
 
-	if (!info->flushing) {
+	if (!info->flushing)
 		info->flushing = 1;
-		init_waitqueue_head(&info->flush_wait);
-	} else {
+	else
 		wait = true;
-	}
 
 	spin_unlock(&info->lock);
 
@@ -3011,7 +3012,6 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
 	if (!info->allocating_chunk) {
 		info->force_alloc = 1;
 		info->allocating_chunk = 1;
-		init_waitqueue_head(&info->allocate_wait);
 	} else {
 		wait = true;
 	}
@@ -3111,7 +3111,7 @@ again:
 		return -ENOSPC;
 	}
 
-	BTRFS_I(inode)->reserved_extents++;
+	BTRFS_I(inode)->reserved_extents += num_items;
 	check_force_delalloc(meta_sinfo);
 	spin_unlock(&meta_sinfo->lock);
 
@@ -3235,7 +3235,8 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
			       u64 bytes)
 {
 	struct btrfs_space_info *data_sinfo;
-	int ret = 0, committed = 0;
+	u64 used;
+	int ret = 0, committed = 0, flushed = 0;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3247,12 +3248,21 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 again:
 	/* make sure we have enough space to handle the data first */
 	spin_lock(&data_sinfo->lock);
-	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
-	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
-	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
-	    data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
+	used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
+		data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
+		data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
+		data_sinfo->bytes_super;
+
+	if (used + bytes > data_sinfo->total_bytes) {
 		struct btrfs_trans_handle *trans;
 
+		if (!flushed) {
+			spin_unlock(&data_sinfo->lock);
+			flush_delalloc(root, data_sinfo);
+			flushed = 1;
+			goto again;
+		}
+
 		/*
 		 * if we don't have enough free bytes in this space then we need
 		 * to alloc a new chunk.
@@ -4170,6 +4180,10 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	ins->offset = 0;
 
 	space_info = __find_space_info(root->fs_info, data);
+	if (!space_info) {
+		printk(KERN_ERR "No space info for %d\n", data);
+		return -ENOSPC;
+	}
 
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
@@ -5205,6 +5219,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	next = btrfs_find_tree_block(root, bytenr, blocksize);
 	if (!next) {
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+		if (!next)
+			return -ENOMEM;
 		reada = 1;
 	}
 	btrfs_tree_lock(next);
@@ -5417,7 +5433,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (ret > 0) {
 			path->slots[level]++;
 			continue;
-		}
+		} else if (ret < 0)
+			return ret;
 		level = wc->level;
 	}
 	return 0;
@@ -6561,6 +6578,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 	struct btrfs_key key;
 	struct inode *inode = NULL;
 	struct btrfs_file_extent_item *fi;
+	struct extent_state *cached_state = NULL;
 	u64 num_bytes;
 	u64 skip_objectid = 0;
 	u32 nritems;
@@ -6589,12 +6607,14 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 		}
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
 
-		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			    key.offset + num_bytes - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
+				 key.offset + num_bytes - 1, 0, &cached_state,
+				 GFP_NOFS);
 		btrfs_drop_extent_cache(inode, key.offset,
					key.offset + num_bytes - 1, 1);
-		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			      key.offset + num_bytes - 1, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
+				     key.offset + num_bytes - 1, &cached_state,
+				     GFP_NOFS);
 		cond_resched();
 	}
 	iput(inode);
@@ -7366,7 +7386,6 @@ static int find_first_block_group(struct btrfs_root *root,
 		}
 		path->slots[0]++;
 	}
-	ret = -ENOENT;
 out:
 	return ret;
 }
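
As an illustrative aside, not part of the patch: the btrfs_check_data_free_space() hunk above replaces a long chain of u64 subtractions with a single sum. Unsigned subtraction can wrap past zero when one counter is transiently larger than total_bytes, while "used + bytes > total_bytes" stays well defined. A sketch of just that comparison, with the helper name invented and the field names taken from the hunk:

static int data_space_exhausted(struct btrfs_space_info *sinfo, u64 bytes)
{
	u64 used = sinfo->bytes_used + sinfo->bytes_delalloc +
		   sinfo->bytes_reserved + sinfo->bytes_pinned +
		   sinfo->bytes_readonly + sinfo->bytes_may_use +
		   sinfo->bytes_super;

	/* overflow-safe form of "free space < bytes" */
	return used + bytes > sinfo->total_bytes;
}
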
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7073cbb1b2d4..d2d03684fab2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
 #include <linux/slab.h>
 #include <linux/bio.h>
 #include <linux/mm.h>
-#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/page-flags.h>
 #include <linux/module.h>
@@ -513,7 +512,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	u64 last_end;
 	int err;
 	int set = 0;
+	int clear = 0;
 
+	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+		clear = 1;
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
 		prealloc = alloc_extent_state(mask);
@@ -524,14 +526,20 @@ again:
 	spin_lock(&tree->lock);
 	if (cached_state) {
 		cached = *cached_state;
-		*cached_state = NULL;
-		cached_state = NULL;
+
+		if (clear) {
+			*cached_state = NULL;
+			cached_state = NULL;
+		}
+
 		if (cached && cached->tree && cached->start == start) {
-			atomic_dec(&cached->refs);
+			if (clear)
+				atomic_dec(&cached->refs);
 			state = cached;
 			goto hit_next;
 		}
-		free_extent_state(cached);
+		if (clear)
+			free_extent_state(cached);
 	}
 	/*
 	 * this search will find the extents that end after
@@ -946,11 +954,11 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-			gfp_t mask)
+			struct extent_state **cached_state, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
 			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
-			      0, NULL, NULL, mask);
+			      0, NULL, cached_state, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -984,10 +992,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
-				 u64 end, gfp_t mask)
+				 u64 end, struct extent_state **cached_state,
+				 gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
-				NULL, mask);
+				cached_state, mask);
 }
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1171,7 +1180,8 @@ out:
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
-					u64 *start, u64 *end, u64 max_bytes)
+					u64 *start, u64 *end, u64 max_bytes,
+					struct extent_state **cached_state)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1203,8 +1213,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 				*end = state->end;
 			goto out;
 		}
-		if (!found)
+		if (!found) {
 			*start = state->start;
+			*cached_state = state;
+			atomic_inc(&state->refs);
+		}
 		found++;
 		*end = state->end;
 		cur_start = state->end + 1;
@@ -1336,10 +1349,11 @@ again:
 	delalloc_start = *start;
 	delalloc_end = 0;
 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
-				    max_bytes);
+				    max_bytes, &cached_state);
 	if (!found || delalloc_end <= *start) {
 		*start = delalloc_start;
 		*end = delalloc_end;
+		free_extent_state(cached_state);
 		return found;
 	}
 
@@ -1722,7 +1736,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 	}
 
 	if (!uptodate) {
-		clear_extent_uptodate(tree, start, end, GFP_NOFS);
+		clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
 		ClearPageUptodate(page);
 		SetPageError(page);
 	}
@@ -1750,7 +1764,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 static void end_bio_extent_readpage(struct bio *bio, int err)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
 	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
@@ -1773,7 +1788,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		else
 			whole_page = 0;
 
-		if (--bvec >= bio->bi_io_vec)
+		if (++bvec <= bvec_end)
 			prefetchw(&bvec->bv_page->flags);
 
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
@@ -1818,7 +1833,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			}
 			check_page_locked(tree, page);
 		}
-	} while (bvec >= bio->bi_io_vec);
+	} while (bvec <= bvec_end);
 
 	bio_put(bio);
 }
@@ -2663,33 +2678,20 @@ int extent_readpages(struct extent_io_tree *tree,
 {
 	struct bio *bio = NULL;
 	unsigned page_idx;
-	struct pagevec pvec;
 	unsigned long bio_flags = 0;
 
-	pagevec_init(&pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_entry(pages->prev, struct page, lru);
 
 		prefetchw(&page->flags);
 		list_del(&page->lru);
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (!add_to_page_cache(page, mapping,
+		if (!add_to_page_cache_lru(page, mapping,
					page->index, GFP_KERNEL)) {
-
-			/* open coding of lru_cache_add, also not exported */
-			page_cache_get(page);
-			if (!pagevec_add(&pvec, page))
-				__pagevec_lru_add_file(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
 		submit_one_bio(READ, bio, 0, bio_flags);
@@ -2704,6 +2706,7 @@ int extent_readpages(struct extent_io_tree *tree,
 int extent_invalidatepage(struct extent_io_tree *tree,
			  struct page *page, unsigned long offset)
 {
+	struct extent_state *cached_state = NULL;
 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
@@ -2712,12 +2715,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	if (start > end)
 		return 0;
 
-	lock_extent(tree, start, end, GFP_NOFS);
+	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
 	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
			 EXTENT_DO_ACCOUNTING,
-			 1, 1, NULL, GFP_NOFS);
+			 1, 1, &cached_state, GFP_NOFS);
 	return 0;
 }
 
@@ -2920,16 +2923,17 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
		     get_extent_t *get_extent)
 {
 	struct inode *inode = mapping->host;
+	struct extent_state *cached_state = NULL;
 	u64 start = iblock << inode->i_blkbits;
 	sector_t sector = 0;
 	size_t blksize = (1 << inode->i_blkbits);
 	struct extent_map *em;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		    GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+			 0, &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, start, blksize, 0);
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
+			     start + blksize - 1, &cached_state, GFP_NOFS);
 	if (!em || IS_ERR(em))
 		return 0;
 
@@ -2951,6 +2955,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	u32 flags = 0;
 	u64 disko = 0;
 	struct extent_map *em = NULL;
+	struct extent_state *cached_state = NULL;
 	int end = 0;
 	u64 em_start = 0, em_len = 0;
 	unsigned long emflags;
@@ -2959,8 +2964,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (len == 0)
 		return -EINVAL;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-		    GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
+			 &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, off, max - off, 0);
 	if (!em)
 		goto out;
@@ -3023,8 +3028,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 out_free:
 	free_extent_map(em);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
@@ -3264,7 +3269,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 }
 
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				 struct extent_buffer *eb)
+				 struct extent_buffer *eb,
+				 struct extent_state **cached_state)
 {
 	unsigned long i;
 	struct page *page;
@@ -3274,7 +3280,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			      GFP_NOFS);
+			      cached_state, GFP_NOFS);
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (page)
@@ -3334,7 +3340,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 }
 
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb)
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state)
 {
 	int ret = 0;
 	unsigned long num_pages;
@@ -3346,7 +3353,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			     EXTENT_UPTODATE, 1, NULL);
+			     EXTENT_UPTODATE, 1, cached_state);
 	if (ret)
 		return ret;
 
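
As an illustrative aside, not part of the patch: end_bio_extent_readpage() above now walks the bio's vector array forward, from bi_io_vec up to bvec_end, instead of backward from the last vector, so pages complete in ascending file-offset order and the next page's flags can be prefetched. A skeleton of that loop with the per-page completion work elided and the wrapper function invented for illustration:

static void readpage_completion_order(struct bio *bio)
{
	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct bio_vec *bvec = bio->bi_io_vec;

	do {
		struct page *page = bvec->bv_page;

		/* prefetch the next page's flags while finishing this one */
		if (++bvec <= bvec_end)
			prefetchw(&bvec->bv_page->flags);

		/* ... per-page read-completion handling on "page" ... */
	} while (bvec <= bvec_end);
}
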
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 36de250a7b2b..bbab4813646f 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -163,6 +163,8 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, struct extent_state **cached, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+			 struct extent_state **cached, gfp_t mask);
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
@@ -196,7 +198,7 @@ int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
				  u64 end, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-			gfp_t mask);
+			struct extent_state **cached_state, gfp_t mask);
 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -281,9 +283,11 @@ int test_extent_buffer_dirty(struct extent_io_tree *tree,
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
			       struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				 struct extent_buffer *eb);
+				 struct extent_buffer *eb,
+				 struct extent_state **cached_state);
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb);
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state);
 int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
		      unsigned long min_len, char **token, char **map,
		      unsigned long *map_start,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 28d87ba60ce8..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
 #include <linux/err.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..54a255065aa3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include "ctree.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6ed434ac037f..29ff749ff4ca 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -123,7 +124,8 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	end_of_last_block = start_pos + num_bytes - 1;
-	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+					NULL);
 	if (err)
 		return err;
 
@@ -753,6 +755,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
			 loff_t pos, unsigned long first_index,
			 unsigned long last_index, size_t write_bytes)
 {
+	struct extent_state *cached_state = NULL;
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = fdentry(file)->d_inode;
@@ -781,16 +784,18 @@ again:
 	}
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(inode)->io_tree,
-			    start_pos, last_pos - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree,
+				 start_pos, last_pos - 1, 0, &cached_state,
+				 GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
							    last_pos - 1);
 		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset < last_pos) {
			btrfs_put_ordered_extent(ordered);
-			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      start_pos, last_pos - 1, GFP_NOFS);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     start_pos, last_pos - 1,
+					     &cached_state, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
@@ -802,12 +807,13 @@ again:
		if (ordered)
			btrfs_put_ordered_extent(ordered);
 
-		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
-				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-				  EXTENT_DO_ACCOUNTING,
-				  GFP_NOFS);
-		unlock_extent(&BTRFS_I(inode)->io_tree,
-			      start_pos, last_pos - 1, GFP_NOFS);
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
+				 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+				 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
+				 GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+				     start_pos, last_pos - 1, &cached_state,
+				     GFP_NOFS);
 	}
 	for (i = 0; i < num_pages; i++) {
 		clear_page_dirty_for_io(pages[i]);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index dd831ed31eea..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
 
 #include <linux/pagemap.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/math64.h>
 #include "ctree.h"
 #include "free-space-cache.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c41db6d45ab6..2bfdc641d4e3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/falloc.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -379,7 +380,8 @@ again:
	 * change at any time if we discover bad compression ratios.
	 */
 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
-	    btrfs_test_opt(root, COMPRESS)) {
+	    (btrfs_test_opt(root, COMPRESS) ||
+	     (BTRFS_I(inode)->force_compress))) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
@@ -483,8 +485,10 @@ again:
 		nr_pages_ret = 0;
 
 		/* flag the file so we don't compress in the future */
-		if (!btrfs_test_opt(root, FORCE_COMPRESS))
+		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+		    !(BTRFS_I(inode)->force_compress)) {
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+		}
 	}
 	if (will_compress) {
 		*num_added += 1;
@@ -570,8 +574,8 @@ retry:
 		unsigned long nr_written = 0;
 
 		lock_extent(io_tree, async_extent->start,
-			    async_extent->start +
-			    async_extent->ram_size - 1, GFP_NOFS);
+			     async_extent->start +
+			     async_extent->ram_size - 1, GFP_NOFS);
 
 		/* allocate blocks */
 		ret = cow_file_range(inode, async_cow->locked_page,
@@ -793,7 +797,7 @@ static noinline int cow_file_range(struct inode *inode,
 	while (disk_num_bytes > 0) {
 		unsigned long op;
 
-		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
+		cur_alloc_size = disk_num_bytes;
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,
					   (u64)-1, &ins, 1);
@@ -1211,7 +1215,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
-	else if (!btrfs_test_opt(root, COMPRESS))
+	else if (!btrfs_test_opt(root, COMPRESS) &&
+		 !(BTRFS_I(inode)->force_compress))
 		ret = cow_file_range(inode, locked_page, start, end,
				     page_started, nr_written, 1);
 	else
@@ -1223,30 +1228,9 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 static int btrfs_split_extent_hook(struct inode *inode,
				   struct extent_state *orig, u64 split)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 size;
-
 	if (!(orig->state & EXTENT_DELALLOC))
 		return 0;
 
-	size = orig->end - orig->start + 1;
-	if (size > root->fs_info->max_extent) {
-		u64 num_extents;
-		u64 new_size;
-
-		new_size = orig->end - split + 1;
-		num_extents = div64_u64(size + root->fs_info->max_extent - 1,
-					root->fs_info->max_extent);
-
-		/*
-		 * if we break a large extent up then leave oustanding_extents
-		 * be, since we've already accounted for the large extent.
-		 */
-		if (div64_u64(new_size + root->fs_info->max_extent - 1,
-			      root->fs_info->max_extent) < num_extents)
-			return 0;
-	}
-
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	BTRFS_I(inode)->outstanding_extents++;
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1264,38 +1248,10 @@ static int btrfs_merge_extent_hook(struct inode *inode,
				   struct extent_state *new,
				   struct extent_state *other)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 new_size, old_size;
-	u64 num_extents;
-
 	/* not delalloc, ignore it */
 	if (!(other->state & EXTENT_DELALLOC))
 		return 0;
 
-	old_size = other->end - other->start + 1;
-	if (new->start < other->start)
-		new_size = other->end - new->start + 1;
-	else
-		new_size = new->end - other->start + 1;
-
-	/* we're not bigger than the max, unreserve the space and go */
-	if (new_size <= root->fs_info->max_extent) {
-		spin_lock(&BTRFS_I(inode)->accounting_lock);
-		BTRFS_I(inode)->outstanding_extents--;
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		return 0;
-	}
-
-	/*
-	 * If we grew by another max_extent, just return, we want to keep that
-	 * reserved amount.
-	 */
-	num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
-				root->fs_info->max_extent);
-	if (div64_u64(new_size + root->fs_info->max_extent - 1,
-		      root->fs_info->max_extent) > num_extents)
-		return 0;
-
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	BTRFS_I(inode)->outstanding_extents--;
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1324,6 +1280,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		BTRFS_I(inode)->outstanding_extents++;
 		spin_unlock(&BTRFS_I(inode)->accounting_lock);
 		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
@@ -1352,6 +1309,7 @@ static int btrfs_clear_bit_hook(struct inode *inode,
 
 	if (bits & EXTENT_DO_ACCOUNTING) {
 		spin_lock(&BTRFS_I(inode)->accounting_lock);
+		WARN_ON(!BTRFS_I(inode)->outstanding_extents);
 		BTRFS_I(inode)->outstanding_extents--;
 		spin_unlock(&BTRFS_I(inode)->accounting_lock);
 		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -1508,12 +1466,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state)
 {
 	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
 		WARN_ON(1);
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
-				   GFP_NOFS);
+				   cached_state, GFP_NOFS);
 }
 
 /* see btrfs_writepage_start_hook for details on why this is required */
@@ -1526,6 +1485,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 {
 	struct btrfs_writepage_fixup *fixup;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	struct page *page;
 	struct inode *inode;
 	u64 page_start;
@@ -1544,7 +1504,8 @@ again:
 	page_start = page_offset(page);
 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+			 &cached_state, GFP_NOFS);
 
 	/* already ordered? We're done */
 	if (PagePrivate2(page))
@@ -1552,17 +1513,18 @@ again:
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
-			      page_end, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
+				     page_end, &cached_state, GFP_NOFS);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		goto again;
 	}
 
-	btrfs_set_extent_delalloc(inode, page_start, page_end);
+	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
 	ClearPageChecked(page);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			     &cached_state, GFP_NOFS);
 out_page:
 	unlock_page(page);
 	page_cache_release(page);
@@ -1691,14 +1653,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_state *cached_state = NULL;
 	int compressed = 0;
 	int ret;
 
-	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+					     end - start + 1);
 	if (!ret)
 		return 0;
-
-	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
 	BUG_ON(!ordered_extent);
 
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
@@ -1713,9 +1675,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		goto out;
 	}
 
-	lock_extent(io_tree, ordered_extent->file_offset,
-		    ordered_extent->file_offset + ordered_extent->len - 1,
-		    GFP_NOFS);
+	lock_extent_bits(io_tree, ordered_extent->file_offset,
+			 ordered_extent->file_offset + ordered_extent->len - 1,
+			 0, &cached_state, GFP_NOFS);
 
 	trans = btrfs_join_transaction(root, 1);
 
@@ -1742,9 +1704,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
						ordered_extent->len);
 		BUG_ON(ret);
 	}
-	unlock_extent(io_tree, ordered_extent->file_offset,
-		      ordered_extent->file_offset + ordered_extent->len - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(io_tree, ordered_extent->file_offset,
+			     ordered_extent->file_offset +
+			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
			  &ordered_extent->list);
 
@@ -2153,7 +2116,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.objectid = found_key.offset;
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
-		inode = btrfs_iget(root->fs_info->sb, &found_key, root);
+		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
 		if (IS_ERR(inode))
			break;
 
@@ -3081,6 +3044,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	char *kaddr;
 	u32 blocksize = root->sectorsize;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -3127,12 +3091,14 @@ again:
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	set_page_extent_mapped(page);
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
@@ -3140,13 +3106,15 @@ again:
 		goto again;
 	}
 
-	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
-			  GFP_NOFS);
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			 0, 0, &cached_state, GFP_NOFS);
 
-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+					&cached_state);
 	if (ret) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		goto out_unlock;
 	}
 
@@ -3159,7 +3127,8 @@ again:
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
+			     GFP_NOFS);
 
out_unlock:
 	if (ret)
@@ -3177,6 +3146,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em;
+	struct extent_state *cached_state = NULL;
 	u64 mask = root->sectorsize - 1;
 	u64 hole_start = (inode->i_size + mask) & ~mask;
 	u64 block_end = (size + mask) & ~mask;
@@ -3192,11 +3162,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		struct btrfs_ordered_extent *ordered;
 		btrfs_wait_ordered_range(inode, hole_start,
					 block_end - hole_start);
-		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+				 &cached_state, GFP_NOFS);
 		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
 		if (!ordered)
			break;
-		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		unlock_extent_cached(io_tree, hole_start, block_end - 1,
+				     &cached_state, GFP_NOFS);
 		btrfs_put_ordered_extent(ordered);
 	}
 
@@ -3241,7 +3213,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		break;
 	}
 
-	unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
+			     GFP_NOFS);
 	return err;
 }
 
@@ -3639,6 +3612,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->index_cnt = (u64)-1;
 	bi->last_unlink_trans = 0;
 	bi->ordered_data_close = 0;
+	bi->force_compress = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
			    inode->i_mapping, GFP_NOFS);
@@ -3687,7 +3661,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
  * Returns in *is_new if the inode was read from disk
  */
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root)
+			 struct btrfs_root *root, int *new)
 {
 	struct inode *inode;
 
@@ -3702,6 +3676,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 
 		inode_tree_add(inode);
 		unlock_new_inode(inode);
+		if (new)
+			*new = 1;
 	}
 
 	return inode;
@@ -3754,7 +3730,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		return NULL;
 
 	if (location.type == BTRFS_INODE_ITEM_KEY) {
-		inode = btrfs_iget(dir->i_sb, &location, root);
+		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
 		return inode;
 	}
 
@@ -3769,7 +3745,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		else
			inode = new_simple_dir(dir->i_sb, &location, sub_root);
 	} else {
-		inode = btrfs_iget(dir->i_sb, &location, sub_root);
+		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
 	}
 	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
 
@@ -4501,7 +4477,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 	if (err) {
 		err = -ENOSPC;
-		goto out_unlock;
+		goto out_fail;
 	}
 
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
@@ -4979,6 +4955,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4979{ 4955{
4980 struct extent_io_tree *tree; 4956 struct extent_io_tree *tree;
4981 struct btrfs_ordered_extent *ordered; 4957 struct btrfs_ordered_extent *ordered;
4958 struct extent_state *cached_state = NULL;
4982 u64 page_start = page_offset(page); 4959 u64 page_start = page_offset(page);
4983 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 4960 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4984 4961
@@ -4997,7 +4974,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4997 btrfs_releasepage(page, GFP_NOFS); 4974 btrfs_releasepage(page, GFP_NOFS);
4998 return; 4975 return;
4999 } 4976 }
5000 lock_extent(tree, page_start, page_end, GFP_NOFS); 4977 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
4978 GFP_NOFS);
5001 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 4979 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
5002 page_offset(page)); 4980 page_offset(page));
5003 if (ordered) { 4981 if (ordered) {
@@ -5008,7 +4986,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5008 clear_extent_bit(tree, page_start, page_end, 4986 clear_extent_bit(tree, page_start, page_end,
5009 EXTENT_DIRTY | EXTENT_DELALLOC | 4987 EXTENT_DIRTY | EXTENT_DELALLOC |
5010 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 4988 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
5011 NULL, GFP_NOFS); 4989 &cached_state, GFP_NOFS);
5012 /* 4990 /*
5013 * whoever cleared the private bit is responsible 4991 * whoever cleared the private bit is responsible
5014 * for the finish_ordered_io 4992 * for the finish_ordered_io
@@ -5018,11 +4996,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5018 page_start, page_end); 4996 page_start, page_end);
5019 } 4997 }
5020 btrfs_put_ordered_extent(ordered); 4998 btrfs_put_ordered_extent(ordered);
5021 lock_extent(tree, page_start, page_end, GFP_NOFS); 4999 cached_state = NULL;
5000 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
5001 GFP_NOFS);
5022 } 5002 }
5023 clear_extent_bit(tree, page_start, page_end, 5003 clear_extent_bit(tree, page_start, page_end,
5024 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 5004 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
5025 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); 5005 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
5026 __btrfs_releasepage(page, GFP_NOFS); 5006 __btrfs_releasepage(page, GFP_NOFS);
5027 5007
5028 ClearPageChecked(page); 5008 ClearPageChecked(page);
@@ -5055,6 +5035,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5055 struct btrfs_root *root = BTRFS_I(inode)->root; 5035 struct btrfs_root *root = BTRFS_I(inode)->root;
5056 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5036 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5057 struct btrfs_ordered_extent *ordered; 5037 struct btrfs_ordered_extent *ordered;
5038 struct extent_state *cached_state = NULL;
5058 char *kaddr; 5039 char *kaddr;
5059 unsigned long zero_start; 5040 unsigned long zero_start;
5060 loff_t size; 5041 loff_t size;
@@ -5093,7 +5074,8 @@ again:
5093 } 5074 }
5094 wait_on_page_writeback(page); 5075 wait_on_page_writeback(page);
5095 5076
5096 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 5077 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
5078 GFP_NOFS);
5097 set_page_extent_mapped(page); 5079 set_page_extent_mapped(page);
5098 5080
5099 /* 5081 /*
@@ -5102,7 +5084,8 @@ again:
5102 */ 5084 */
5103 ordered = btrfs_lookup_ordered_extent(inode, page_start); 5085 ordered = btrfs_lookup_ordered_extent(inode, page_start);
5104 if (ordered) { 5086 if (ordered) {
5105 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5087 unlock_extent_cached(io_tree, page_start, page_end,
5088 &cached_state, GFP_NOFS);
5106 unlock_page(page); 5089 unlock_page(page);
5107 btrfs_start_ordered_extent(inode, ordered, 1); 5090 btrfs_start_ordered_extent(inode, ordered, 1);
5108 btrfs_put_ordered_extent(ordered); 5091 btrfs_put_ordered_extent(ordered);
@@ -5116,13 +5099,15 @@ again:
5116 * is probably a better way to do this, but for now keep consistent with 5099 * is probably a better way to do this, but for now keep consistent with
5117 * prepare_pages in the normal write path. 5100 * prepare_pages in the normal write path.
5118 */ 5101 */
5119 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 5102 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
5120 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 5103 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
5121 GFP_NOFS); 5104 0, 0, &cached_state, GFP_NOFS);
5122 5105
5123 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 5106 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
5107 &cached_state);
5124 if (ret) { 5108 if (ret) {
5125 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5109 unlock_extent_cached(io_tree, page_start, page_end,
5110 &cached_state, GFP_NOFS);
5126 ret = VM_FAULT_SIGBUS; 5111 ret = VM_FAULT_SIGBUS;
5127 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 5112 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5128 goto out_unlock; 5113 goto out_unlock;
@@ -5148,7 +5133,7 @@ again:
5148 BTRFS_I(inode)->last_trans = root->fs_info->generation; 5133 BTRFS_I(inode)->last_trans = root->fs_info->generation;
5149 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 5134 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
5150 5135
5151 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5136 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5152 5137
5153out_unlock: 5138out_unlock:
5154 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 5139 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -5353,7 +5338,6 @@ free:
5353void btrfs_drop_inode(struct inode *inode) 5338void btrfs_drop_inode(struct inode *inode)
5354{ 5339{
5355 struct btrfs_root *root = BTRFS_I(inode)->root; 5340 struct btrfs_root *root = BTRFS_I(inode)->root;
5356
5357 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) 5341 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
5358 generic_delete_inode(inode); 5342 generic_delete_inode(inode);
5359 else 5343 else
@@ -5757,18 +5741,15 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5757 struct btrfs_trans_handle *trans; 5741 struct btrfs_trans_handle *trans;
5758 struct btrfs_root *root = BTRFS_I(inode)->root; 5742 struct btrfs_root *root = BTRFS_I(inode)->root;
5759 struct btrfs_key ins; 5743 struct btrfs_key ins;
5760 u64 alloc_size;
5761 u64 cur_offset = start; 5744 u64 cur_offset = start;
5762 u64 num_bytes = end - start; 5745 u64 num_bytes = end - start;
5763 int ret = 0; 5746 int ret = 0;
5764 u64 i_size; 5747 u64 i_size;
5765 5748
5766 while (num_bytes > 0) { 5749 while (num_bytes > 0) {
5767 alloc_size = min(num_bytes, root->fs_info->max_extent);
5768
5769 trans = btrfs_start_transaction(root, 1); 5750 trans = btrfs_start_transaction(root, 1);
5770 5751
5771 ret = btrfs_reserve_extent(trans, root, alloc_size, 5752 ret = btrfs_reserve_extent(trans, root, num_bytes,
5772 root->sectorsize, 0, alloc_hint, 5753 root->sectorsize, 0, alloc_hint,
5773 (u64)-1, &ins, 1); 5754 (u64)-1, &ins, 1);
5774 if (ret) { 5755 if (ret) {
@@ -5827,6 +5808,7 @@ stop_trans:
5827static long btrfs_fallocate(struct inode *inode, int mode, 5808static long btrfs_fallocate(struct inode *inode, int mode,
5828 loff_t offset, loff_t len) 5809 loff_t offset, loff_t len)
5829{ 5810{
5811 struct extent_state *cached_state = NULL;
5830 u64 cur_offset; 5812 u64 cur_offset;
5831 u64 last_byte; 5813 u64 last_byte;
5832 u64 alloc_start; 5814 u64 alloc_start;
@@ -5865,16 +5847,17 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5865 /* the extent lock is ordered inside the running 5847 /* the extent lock is ordered inside the running
5866 * transaction 5848 * transaction
5867 */ 5849 */
5868 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5850 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
5869 GFP_NOFS); 5851 locked_end, 0, &cached_state, GFP_NOFS);
5870 ordered = btrfs_lookup_first_ordered_extent(inode, 5852 ordered = btrfs_lookup_first_ordered_extent(inode,
5871 alloc_end - 1); 5853 alloc_end - 1);
5872 if (ordered && 5854 if (ordered &&
5873 ordered->file_offset + ordered->len > alloc_start && 5855 ordered->file_offset + ordered->len > alloc_start &&
5874 ordered->file_offset < alloc_end) { 5856 ordered->file_offset < alloc_end) {
5875 btrfs_put_ordered_extent(ordered); 5857 btrfs_put_ordered_extent(ordered);
5876 unlock_extent(&BTRFS_I(inode)->io_tree, 5858 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
5877 alloc_start, locked_end, GFP_NOFS); 5859 alloc_start, locked_end,
5860 &cached_state, GFP_NOFS);
5878 /* 5861 /*
5879 * we can't wait on the range with the transaction 5862 * we can't wait on the range with the transaction
5880 * running or with the extent lock held 5863 * running or with the extent lock held
@@ -5916,8 +5899,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5916 break; 5899 break;
5917 } 5900 }
5918 } 5901 }
5919 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5902 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5920 GFP_NOFS); 5903 &cached_state, GFP_NOFS);
5921 5904
5922 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 5905 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
5923 alloc_end - alloc_start); 5906 alloc_end - alloc_start);
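The common thread in the fs/btrfs/inode.c hunks above is the move from plain lock_extent()/unlock_extent() to lock_extent_bits()/unlock_extent_cached(), which thread a struct extent_state through the lock, clear and unlock calls so the extent-state node found at lock time is reused instead of searched for again. A condensed sketch of the pattern, assembled from the btrfs_page_mkwrite() hunks above (error paths and surrounding locals trimmed):

	struct extent_state *cached_state = NULL;
	struct btrfs_ordered_extent *ordered;
again:
	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
			 GFP_NOFS);

	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		/* drop the lock via the cached state, wait, then retry */
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state, GFP_NOFS);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
			     GFP_NOFS);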
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 645a17927a8f..e84ef60ffe35 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
39#include <linux/security.h> 39#include <linux/security.h>
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h>
42#include "compat.h" 43#include "compat.h"
43#include "ctree.h" 44#include "ctree.h"
44#include "disk-io.h" 45#include "disk-io.h"
@@ -474,7 +475,79 @@ out_unlock:
474 return error; 475 return error;
475} 476}
476 477
477static int btrfs_defrag_file(struct file *file) 478static int should_defrag_range(struct inode *inode, u64 start, u64 len,
479 int thresh, u64 *last_len, u64 *skip,
480 u64 *defrag_end)
481{
482 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
483 struct extent_map *em = NULL;
484 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
485 int ret = 1;
486
487
488 if (thresh == 0)
489 thresh = 256 * 1024;
490
491 /*
 492 * make sure that once we start defragging an extent, we keep on
493 * defragging it
494 */
495 if (start < *defrag_end)
496 return 1;
497
498 *skip = 0;
499
500 /*
501 * hopefully we have this extent in the tree already, try without
502 * the full extent lock
503 */
504 read_lock(&em_tree->lock);
505 em = lookup_extent_mapping(em_tree, start, len);
506 read_unlock(&em_tree->lock);
507
508 if (!em) {
509 /* get the big lock and read metadata off disk */
510 lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
511 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
512 unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
513
514 if (IS_ERR(em))
515 return 0;
516 }
517
518 /* this will cover holes, and inline extents */
519 if (em->block_start >= EXTENT_MAP_LAST_BYTE)
520 ret = 0;
521
522 /*
523 * we hit a real extent, if it is big don't bother defragging it again
524 */
525 if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
526 ret = 0;
527
528 /*
529 * last_len ends up being a counter of how many bytes we've defragged.
530 * every time we choose not to defrag an extent, we reset *last_len
531 * so that the next tiny extent will force a defrag.
532 *
533 * The end result of this is that tiny extents before a single big
534 * extent will force at least part of that big extent to be defragged.
535 */
536 if (ret) {
537 *last_len += len;
538 *defrag_end = extent_map_end(em);
539 } else {
540 *last_len = 0;
541 *skip = extent_map_end(em);
542 *defrag_end = 0;
543 }
544
545 free_extent_map(em);
546 return ret;
547}
548
549static int btrfs_defrag_file(struct file *file,
550 struct btrfs_ioctl_defrag_range_args *range)
478{ 551{
479 struct inode *inode = fdentry(file)->d_inode; 552 struct inode *inode = fdentry(file)->d_inode;
480 struct btrfs_root *root = BTRFS_I(inode)->root; 553 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -486,37 +559,96 @@ static int btrfs_defrag_file(struct file *file)
486 unsigned long total_read = 0; 559 unsigned long total_read = 0;
487 u64 page_start; 560 u64 page_start;
488 u64 page_end; 561 u64 page_end;
562 u64 last_len = 0;
563 u64 skip = 0;
564 u64 defrag_end = 0;
489 unsigned long i; 565 unsigned long i;
490 int ret; 566 int ret;
491 567
492 ret = btrfs_check_data_free_space(root, inode, inode->i_size); 568 if (inode->i_size == 0)
493 if (ret) 569 return 0;
494 return -ENOSPC; 570
571 if (range->start + range->len > range->start) {
572 last_index = min_t(u64, inode->i_size - 1,
573 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
574 } else {
575 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
576 }
577
578 i = range->start >> PAGE_CACHE_SHIFT;
579 while (i <= last_index) {
580 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
581 PAGE_CACHE_SIZE,
582 range->extent_thresh,
583 &last_len, &skip,
584 &defrag_end)) {
585 unsigned long next;
586 /*
 587 * should_defrag_range() tells us how much to skip;
 588 * bump our counter by the suggested amount
589 */
590 next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
591 i = max(i + 1, next);
592 continue;
593 }
495 594
496 mutex_lock(&inode->i_mutex);
497 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
498 for (i = 0; i <= last_index; i++) {
499 if (total_read % ra_pages == 0) { 595 if (total_read % ra_pages == 0) {
500 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, 596 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
501 min(last_index, i + ra_pages - 1)); 597 min(last_index, i + ra_pages - 1));
502 } 598 }
503 total_read++; 599 total_read++;
600 mutex_lock(&inode->i_mutex);
601 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
602 BTRFS_I(inode)->force_compress = 1;
603
604 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
605 if (ret) {
606 ret = -ENOSPC;
607 break;
608 }
609
610 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
611 if (ret) {
612 btrfs_free_reserved_data_space(root, inode,
613 PAGE_CACHE_SIZE);
614 ret = -ENOSPC;
615 break;
616 }
504again: 617again:
618 if (inode->i_size == 0 ||
619 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
620 ret = 0;
621 goto err_reservations;
622 }
623
505 page = grab_cache_page(inode->i_mapping, i); 624 page = grab_cache_page(inode->i_mapping, i);
506 if (!page) 625 if (!page)
507 goto out_unlock; 626 goto err_reservations;
627
508 if (!PageUptodate(page)) { 628 if (!PageUptodate(page)) {
509 btrfs_readpage(NULL, page); 629 btrfs_readpage(NULL, page);
510 lock_page(page); 630 lock_page(page);
511 if (!PageUptodate(page)) { 631 if (!PageUptodate(page)) {
512 unlock_page(page); 632 unlock_page(page);
513 page_cache_release(page); 633 page_cache_release(page);
514 goto out_unlock; 634 goto err_reservations;
515 } 635 }
516 } 636 }
517 637
638 if (page->mapping != inode->i_mapping) {
639 unlock_page(page);
640 page_cache_release(page);
641 goto again;
642 }
643
518 wait_on_page_writeback(page); 644 wait_on_page_writeback(page);
519 645
646 if (PageDirty(page)) {
647 btrfs_free_reserved_data_space(root, inode,
648 PAGE_CACHE_SIZE);
649 goto loop_unlock;
650 }
651
520 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 652 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
521 page_end = page_start + PAGE_CACHE_SIZE - 1; 653 page_end = page_start + PAGE_CACHE_SIZE - 1;
522 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 654 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -537,18 +669,54 @@ again:
537 * page if it is dirtied again later 669 * page if it is dirtied again later
538 */ 670 */
539 clear_page_dirty_for_io(page); 671 clear_page_dirty_for_io(page);
672 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
673 page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
674 EXTENT_DO_ACCOUNTING, GFP_NOFS);
540 675
541 btrfs_set_extent_delalloc(inode, page_start, page_end); 676 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
677 ClearPageChecked(page);
542 set_page_dirty(page); 678 set_page_dirty(page);
543 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 679 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
680
681loop_unlock:
544 unlock_page(page); 682 unlock_page(page);
545 page_cache_release(page); 683 page_cache_release(page);
684 mutex_unlock(&inode->i_mutex);
685
686 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
546 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 687 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
688 i++;
689 }
690
691 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
692 filemap_flush(inode->i_mapping);
693
694 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
695 /* the filemap_flush will queue IO into the worker threads, but
696 * we have to make sure the IO is actually started and that
697 * ordered extents get created before we return
698 */
699 atomic_inc(&root->fs_info->async_submit_draining);
700 while (atomic_read(&root->fs_info->nr_async_submits) ||
701 atomic_read(&root->fs_info->async_delalloc_pages)) {
702 wait_event(root->fs_info->async_submit_wait,
703 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
704 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
705 }
706 atomic_dec(&root->fs_info->async_submit_draining);
707
708 mutex_lock(&inode->i_mutex);
709 BTRFS_I(inode)->force_compress = 0;
710 mutex_unlock(&inode->i_mutex);
547 } 711 }
548 712
549out_unlock:
550 mutex_unlock(&inode->i_mutex);
551 return 0; 713 return 0;
714
715err_reservations:
716 mutex_unlock(&inode->i_mutex);
717 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719 return ret;
552} 720}
553 721
554static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 722static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
@@ -608,7 +776,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
608 mod = 1; 776 mod = 1;
609 sizestr++; 777 sizestr++;
610 } 778 }
611 new_size = btrfs_parse_size(sizestr); 779 new_size = memparse(sizestr, NULL);
612 if (new_size == 0) { 780 if (new_size == 0) {
613 ret = -EINVAL; 781 ret = -EINVAL;
614 goto out_unlock; 782 goto out_unlock;
@@ -743,6 +911,330 @@ out:
743 return ret; 911 return ret;
744} 912}
745 913
914static noinline int key_in_sk(struct btrfs_key *key,
915 struct btrfs_ioctl_search_key *sk)
916{
917 struct btrfs_key test;
918 int ret;
919
920 test.objectid = sk->min_objectid;
921 test.type = sk->min_type;
922 test.offset = sk->min_offset;
923
924 ret = btrfs_comp_cpu_keys(key, &test);
925 if (ret < 0)
926 return 0;
927
928 test.objectid = sk->max_objectid;
929 test.type = sk->max_type;
930 test.offset = sk->max_offset;
931
932 ret = btrfs_comp_cpu_keys(key, &test);
933 if (ret > 0)
934 return 0;
935 return 1;
936}
937
938static noinline int copy_to_sk(struct btrfs_root *root,
939 struct btrfs_path *path,
940 struct btrfs_key *key,
941 struct btrfs_ioctl_search_key *sk,
942 char *buf,
943 unsigned long *sk_offset,
944 int *num_found)
945{
946 u64 found_transid;
947 struct extent_buffer *leaf;
948 struct btrfs_ioctl_search_header sh;
949 unsigned long item_off;
950 unsigned long item_len;
951 int nritems;
952 int i;
953 int slot;
954 int found = 0;
955 int ret = 0;
956
957 leaf = path->nodes[0];
958 slot = path->slots[0];
959 nritems = btrfs_header_nritems(leaf);
960
961 if (btrfs_header_generation(leaf) > sk->max_transid) {
962 i = nritems;
963 goto advance_key;
964 }
965 found_transid = btrfs_header_generation(leaf);
966
967 for (i = slot; i < nritems; i++) {
968 item_off = btrfs_item_ptr_offset(leaf, i);
969 item_len = btrfs_item_size_nr(leaf, i);
970
971 if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
972 item_len = 0;
973
974 if (sizeof(sh) + item_len + *sk_offset >
975 BTRFS_SEARCH_ARGS_BUFSIZE) {
976 ret = 1;
977 goto overflow;
978 }
979
980 btrfs_item_key_to_cpu(leaf, key, i);
981 if (!key_in_sk(key, sk))
982 continue;
983
984 sh.objectid = key->objectid;
985 sh.offset = key->offset;
986 sh.type = key->type;
987 sh.len = item_len;
988 sh.transid = found_transid;
989
990 /* copy search result header */
991 memcpy(buf + *sk_offset, &sh, sizeof(sh));
992 *sk_offset += sizeof(sh);
993
994 if (item_len) {
995 char *p = buf + *sk_offset;
996 /* copy the item */
997 read_extent_buffer(leaf, p,
998 item_off, item_len);
999 *sk_offset += item_len;
1000 }
1001 found++;
1002
1003 if (*num_found >= sk->nr_items)
1004 break;
1005 }
1006advance_key:
1007 ret = 0;
1008 if (key->offset < (u64)-1 && key->offset < sk->max_offset)
1009 key->offset++;
1010 else if (key->type < (u8)-1 && key->type < sk->max_type) {
1011 key->offset = 0;
1012 key->type++;
1013 } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
1014 key->offset = 0;
1015 key->type = 0;
1016 key->objectid++;
1017 } else
1018 ret = 1;
1019overflow:
1020 *num_found += found;
1021 return ret;
1022}
1023
1024static noinline int search_ioctl(struct inode *inode,
1025 struct btrfs_ioctl_search_args *args)
1026{
1027 struct btrfs_root *root;
1028 struct btrfs_key key;
1029 struct btrfs_key max_key;
1030 struct btrfs_path *path;
1031 struct btrfs_ioctl_search_key *sk = &args->key;
1032 struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
1033 int ret;
1034 int num_found = 0;
1035 unsigned long sk_offset = 0;
1036
1037 path = btrfs_alloc_path();
1038 if (!path)
1039 return -ENOMEM;
1040
1041 if (sk->tree_id == 0) {
1042 /* search the root of the inode that was passed */
1043 root = BTRFS_I(inode)->root;
1044 } else {
1045 key.objectid = sk->tree_id;
1046 key.type = BTRFS_ROOT_ITEM_KEY;
1047 key.offset = (u64)-1;
1048 root = btrfs_read_fs_root_no_name(info, &key);
1049 if (IS_ERR(root)) {
1050 printk(KERN_ERR "could not find root %llu\n",
1051 sk->tree_id);
1052 btrfs_free_path(path);
1053 return -ENOENT;
1054 }
1055 }
1056
1057 key.objectid = sk->min_objectid;
1058 key.type = sk->min_type;
1059 key.offset = sk->min_offset;
1060
1061 max_key.objectid = sk->max_objectid;
1062 max_key.type = sk->max_type;
1063 max_key.offset = sk->max_offset;
1064
1065 path->keep_locks = 1;
1066
1067 while (1) {
1068 ret = btrfs_search_forward(root, &key, &max_key, path, 0,
1069 sk->min_transid);
1070 if (ret != 0) {
1071 if (ret > 0)
1072 ret = 0;
1073 goto err;
1074 }
1075 ret = copy_to_sk(root, path, &key, sk, args->buf,
1076 &sk_offset, &num_found);
1077 btrfs_release_path(root, path);
1078 if (ret || num_found >= sk->nr_items)
1079 break;
1080
1081 }
1082 ret = 0;
1083err:
1084 sk->nr_items = num_found;
1085 btrfs_free_path(path);
1086 return ret;
1087}
1088
1089static noinline int btrfs_ioctl_tree_search(struct file *file,
1090 void __user *argp)
1091{
1092 struct btrfs_ioctl_search_args *args;
1093 struct inode *inode;
1094 int ret;
1095
1096 if (!capable(CAP_SYS_ADMIN))
1097 return -EPERM;
1098
1099 args = kmalloc(sizeof(*args), GFP_KERNEL);
1100 if (!args)
1101 return -ENOMEM;
1102
1103 if (copy_from_user(args, argp, sizeof(*args))) {
1104 kfree(args);
1105 return -EFAULT;
1106 }
1107 inode = fdentry(file)->d_inode;
1108 ret = search_ioctl(inode, args);
1109 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1110 ret = -EFAULT;
1111 kfree(args);
1112 return ret;
1113}
1114
1115/*
1116 * Search INODE_REFs to identify the path name of the 'dirid' directory
1117 * in a 'tree_id' tree, and set the path name in 'name'.
1118 */
1119static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1120 u64 tree_id, u64 dirid, char *name)
1121{
1122 struct btrfs_root *root;
1123 struct btrfs_key key;
1124 char *ptr;
1125 int ret = -1;
1126 int slot;
1127 int len;
1128 int total_len = 0;
1129 struct btrfs_inode_ref *iref;
1130 struct extent_buffer *l;
1131 struct btrfs_path *path;
1132
1133 if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
1134 name[0] = '\0';
1135 return 0;
1136 }
1137
1138 path = btrfs_alloc_path();
1139 if (!path)
1140 return -ENOMEM;
1141
1142 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
1143
1144 key.objectid = tree_id;
1145 key.type = BTRFS_ROOT_ITEM_KEY;
1146 key.offset = (u64)-1;
1147 root = btrfs_read_fs_root_no_name(info, &key);
1148 if (IS_ERR(root)) {
1149 printk(KERN_ERR "could not find root %llu\n", tree_id);
1150 ret = -ENOENT;
1151 goto out;
1152 }
1153
1154 key.objectid = dirid;
1155 key.type = BTRFS_INODE_REF_KEY;
1156 key.offset = (u64)-1;
1157
1158 while (1) {
1159 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1160 if (ret < 0)
1161 goto out;
1162
1163 l = path->nodes[0];
1164 slot = path->slots[0];
1165 if (ret > 0 && slot > 0)
1166 slot--;
1167 btrfs_item_key_to_cpu(l, &key, slot);
1168
1169 if (ret > 0 && (key.objectid != dirid ||
1170 key.type != BTRFS_INODE_REF_KEY)) {
1171 ret = -ENOENT;
1172 goto out;
1173 }
1174
1175 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
1176 len = btrfs_inode_ref_name_len(l, iref);
1177 ptr -= len + 1;
1178 total_len += len + 1;
1179 if (ptr < name)
1180 goto out;
1181
1182 *(ptr + len) = '/';
1183 read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
1184
1185 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1186 break;
1187
1188 btrfs_release_path(root, path);
1189 key.objectid = key.offset;
1190 key.offset = (u64)-1;
1191 dirid = key.objectid;
1192
1193 }
1194 if (ptr < name)
1195 goto out;
1196 memcpy(name, ptr, total_len);
1197 name[total_len] = '\0';
1198 ret = 0;
1199out:
1200 btrfs_free_path(path);
1201 return ret;
1202}
1203
1204static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1205 void __user *argp)
1206{
1207 struct btrfs_ioctl_ino_lookup_args *args;
1208 struct inode *inode;
1209 int ret;
1210
1211 if (!capable(CAP_SYS_ADMIN))
1212 return -EPERM;
1213
1214 args = kmalloc(sizeof(*args), GFP_KERNEL);
1215 if (!args)
1216 return -ENOMEM;
1217
1218 if (copy_from_user(args, argp, sizeof(*args))) {
1219 kfree(args);
1220 return -EFAULT;
1221 }
1222 inode = fdentry(file)->d_inode;
1223
1224 if (args->treeid == 0)
1225 args->treeid = BTRFS_I(inode)->root->root_key.objectid;
1226
1227 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
1228 args->treeid, args->objectid,
1229 args->name);
1230
1231 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1232 ret = -EFAULT;
1233
1234 kfree(args);
1235 return ret;
1236}
1237
746static noinline int btrfs_ioctl_snap_destroy(struct file *file, 1238static noinline int btrfs_ioctl_snap_destroy(struct file *file,
747 void __user *arg) 1239 void __user *arg)
748{ 1240{
@@ -849,10 +1341,11 @@ out:
849 return err; 1341 return err;
850} 1342}
851 1343
852static int btrfs_ioctl_defrag(struct file *file) 1344static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
853{ 1345{
854 struct inode *inode = fdentry(file)->d_inode; 1346 struct inode *inode = fdentry(file)->d_inode;
855 struct btrfs_root *root = BTRFS_I(inode)->root; 1347 struct btrfs_root *root = BTRFS_I(inode)->root;
1348 struct btrfs_ioctl_defrag_range_args *range;
856 int ret; 1349 int ret;
857 1350
858 ret = mnt_want_write(file->f_path.mnt); 1351 ret = mnt_want_write(file->f_path.mnt);
@@ -873,7 +1366,31 @@ static int btrfs_ioctl_defrag(struct file *file)
873 ret = -EINVAL; 1366 ret = -EINVAL;
874 goto out; 1367 goto out;
875 } 1368 }
876 btrfs_defrag_file(file); 1369
1370 range = kzalloc(sizeof(*range), GFP_KERNEL);
1371 if (!range) {
1372 ret = -ENOMEM;
1373 goto out;
1374 }
1375
1376 if (argp) {
1377 if (copy_from_user(range, argp,
1378 sizeof(*range))) {
1379 ret = -EFAULT;
1380 kfree(range);
1381 goto out;
1382 }
1383 /* compression requires us to start the IO */
1384 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1385 range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
1386 range->extent_thresh = (u32)-1;
1387 }
1388 } else {
1389 /* the rest are all set to zero by kzalloc */
1390 range->len = (u64)-1;
1391 }
1392 btrfs_defrag_file(file, range);
1393 kfree(range);
877 break; 1394 break;
878 } 1395 }
879out: 1396out:
@@ -1274,6 +1791,157 @@ out:
1274 return ret; 1791 return ret;
1275} 1792}
1276 1793
1794static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1795{
1796 struct inode *inode = fdentry(file)->d_inode;
1797 struct btrfs_root *root = BTRFS_I(inode)->root;
1798 struct btrfs_root *new_root;
1799 struct btrfs_dir_item *di;
1800 struct btrfs_trans_handle *trans;
1801 struct btrfs_path *path;
1802 struct btrfs_key location;
1803 struct btrfs_disk_key disk_key;
1804 struct btrfs_super_block *disk_super;
1805 u64 features;
1806 u64 objectid = 0;
1807 u64 dir_id;
1808
1809 if (!capable(CAP_SYS_ADMIN))
1810 return -EPERM;
1811
1812 if (copy_from_user(&objectid, argp, sizeof(objectid)))
1813 return -EFAULT;
1814
1815 if (!objectid)
1816 objectid = root->root_key.objectid;
1817
1818 location.objectid = objectid;
1819 location.type = BTRFS_ROOT_ITEM_KEY;
1820 location.offset = (u64)-1;
1821
1822 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
1823 if (IS_ERR(new_root))
1824 return PTR_ERR(new_root);
1825
1826 if (btrfs_root_refs(&new_root->root_item) == 0)
1827 return -ENOENT;
1828
1829 path = btrfs_alloc_path();
1830 if (!path)
1831 return -ENOMEM;
1832 path->leave_spinning = 1;
1833
1834 trans = btrfs_start_transaction(root, 1);
1835 if (!trans) {
1836 btrfs_free_path(path);
1837 return -ENOMEM;
1838 }
1839
1840 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1841 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1842 dir_id, "default", 7, 1);
1843 if (!di) {
1844 btrfs_free_path(path);
1845 btrfs_end_transaction(trans, root);
1846 printk(KERN_ERR "Umm, you don't have the default dir item, "
1847 "this isn't going to work\n");
1848 return -ENOENT;
1849 }
1850
1851 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
1852 btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
1853 btrfs_mark_buffer_dirty(path->nodes[0]);
1854 btrfs_free_path(path);
1855
1856 disk_super = &root->fs_info->super_copy;
1857 features = btrfs_super_incompat_flags(disk_super);
1858 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
1859 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
1860 btrfs_set_super_incompat_flags(disk_super, features);
1861 }
1862 btrfs_end_transaction(trans, root);
1863
1864 return 0;
1865}
1866
1867long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1868{
1869 struct btrfs_ioctl_space_args space_args;
1870 struct btrfs_ioctl_space_info space;
1871 struct btrfs_ioctl_space_info *dest;
1872 struct btrfs_ioctl_space_info *dest_orig;
1873 struct btrfs_ioctl_space_info *user_dest;
1874 struct btrfs_space_info *info;
1875 int alloc_size;
1876 int ret = 0;
1877 int slot_count = 0;
1878
1879 if (copy_from_user(&space_args,
1880 (struct btrfs_ioctl_space_args __user *)arg,
1881 sizeof(space_args)))
1882 return -EFAULT;
1883
1884 /* first we count slots */
1885 rcu_read_lock();
1886 list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
1887 slot_count++;
1888 rcu_read_unlock();
1889
1890 /* space_slots == 0 means they are asking for a count */
1891 if (space_args.space_slots == 0) {
1892 space_args.total_spaces = slot_count;
1893 goto out;
1894 }
1895 alloc_size = sizeof(*dest) * slot_count;
1896 /* we generally have at most 6 or so space infos, one for each raid
1897 * level. So, a whole page should be more than enough for everyone
1898 */
1899 if (alloc_size > PAGE_CACHE_SIZE)
1900 return -ENOMEM;
1901
1902 space_args.total_spaces = 0;
1903 dest = kmalloc(alloc_size, GFP_NOFS);
1904 if (!dest)
1905 return -ENOMEM;
1906 dest_orig = dest;
1907
1908 /* now we have a buffer to copy into */
1909 rcu_read_lock();
1910 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
1911 /* make sure we don't copy more than we allocated
1912 * in our buffer
1913 */
1914 if (slot_count == 0)
1915 break;
1916 slot_count--;
1917
1918 /* make sure userland has enough room in their buffer */
1919 if (space_args.total_spaces >= space_args.space_slots)
1920 break;
1921
1922 space.flags = info->flags;
1923 space.total_bytes = info->total_bytes;
1924 space.used_bytes = info->bytes_used;
1925 memcpy(dest, &space, sizeof(space));
1926 dest++;
1927 space_args.total_spaces++;
1928 }
1929 rcu_read_unlock();
1930
1931 user_dest = (struct btrfs_ioctl_space_info *)
1932 (arg + sizeof(struct btrfs_ioctl_space_args));
1933
1934 if (copy_to_user(user_dest, dest_orig, alloc_size))
1935 ret = -EFAULT;
1936
1937 kfree(dest_orig);
1938out:
1939 if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
1940 ret = -EFAULT;
1941
1942 return ret;
1943}
1944
1277/* 1945/*
1278 * there are many ways the trans_start and trans_end ioctls can lead 1946 * there are many ways the trans_start and trans_end ioctls can lead
1279 * to deadlocks. They should only be used by applications that 1947 * to deadlocks. They should only be used by applications that
@@ -1320,8 +1988,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1320 return btrfs_ioctl_snap_create(file, argp, 1); 1988 return btrfs_ioctl_snap_create(file, argp, 1);
1321 case BTRFS_IOC_SNAP_DESTROY: 1989 case BTRFS_IOC_SNAP_DESTROY:
1322 return btrfs_ioctl_snap_destroy(file, argp); 1990 return btrfs_ioctl_snap_destroy(file, argp);
1991 case BTRFS_IOC_DEFAULT_SUBVOL:
1992 return btrfs_ioctl_default_subvol(file, argp);
1323 case BTRFS_IOC_DEFRAG: 1993 case BTRFS_IOC_DEFRAG:
1324 return btrfs_ioctl_defrag(file); 1994 return btrfs_ioctl_defrag(file, NULL);
1995 case BTRFS_IOC_DEFRAG_RANGE:
1996 return btrfs_ioctl_defrag(file, argp);
1325 case BTRFS_IOC_RESIZE: 1997 case BTRFS_IOC_RESIZE:
1326 return btrfs_ioctl_resize(root, argp); 1998 return btrfs_ioctl_resize(root, argp);
1327 case BTRFS_IOC_ADD_DEV: 1999 case BTRFS_IOC_ADD_DEV:
@@ -1338,6 +2010,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1338 return btrfs_ioctl_trans_start(file); 2010 return btrfs_ioctl_trans_start(file);
1339 case BTRFS_IOC_TRANS_END: 2011 case BTRFS_IOC_TRANS_END:
1340 return btrfs_ioctl_trans_end(file); 2012 return btrfs_ioctl_trans_end(file);
2013 case BTRFS_IOC_TREE_SEARCH:
2014 return btrfs_ioctl_tree_search(file, argp);
2015 case BTRFS_IOC_INO_LOOKUP:
2016 return btrfs_ioctl_ino_lookup(file, argp);
2017 case BTRFS_IOC_SPACE_INFO:
2018 return btrfs_ioctl_space_info(root, argp);
1341 case BTRFS_IOC_SYNC: 2019 case BTRFS_IOC_SYNC:
1342 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2020 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1343 return 0; 2021 return 0;
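For reference, copy_to_sk() above packs each hit into args->buf as a btrfs_ioctl_search_header followed immediately by the raw item bytes, and stores a header with len == 0 when the item itself is too large for the 4k buffer. A hypothetical userspace consumer of that layout might walk it as below (assuming a userspace copy of the ioctl.h that follows; error handling omitted):

	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/types.h>
	#include "ioctl.h"

	static void dump_keys(int fd)
	{
		struct btrfs_ioctl_search_args args;
		struct btrfs_ioctl_search_header *sh;
		unsigned long off = 0;
		__u32 i;

		memset(&args, 0, sizeof(args));
		/* widest key range; tree_id == 0 means the fd's own tree */
		args.key.max_objectid = (__u64)-1;
		args.key.max_offset = (__u64)-1;
		args.key.max_transid = (__u64)-1;
		args.key.max_type = (__u32)-1;
		args.key.nr_items = 4096;

		if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0)
			return;

		/* on return, nr_items holds the number of packed headers */
		for (i = 0; i < args.key.nr_items; i++) {
			sh = (struct btrfs_ioctl_search_header *)(args.buf + off);
			printf("key (%llu %u %llu) itemlen %u\n",
			       (unsigned long long)sh->objectid, sh->type,
			       (unsigned long long)sh->offset, sh->len);
			/* len == 0 marks an item too big to be copied */
			off += sizeof(*sh) + sh->len;
		}
	}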
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914475eb..424694aa517f 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,12 +30,114 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_INO_LOOKUP_PATH_MAX 4080
34struct btrfs_ioctl_ino_lookup_args {
35 __u64 treeid;
36 __u64 objectid;
37 char name[BTRFS_INO_LOOKUP_PATH_MAX];
38};
39
40struct btrfs_ioctl_search_key {
41 /* which root are we searching. 0 is the tree of tree roots */
42 __u64 tree_id;
43
44 /* keys returned will be >= min and <= max */
45 __u64 min_objectid;
46 __u64 max_objectid;
47
48 /* keys returned will be >= min and <= max */
49 __u64 min_offset;
50 __u64 max_offset;
51
52 /* max and min transids to search for */
53 __u64 min_transid;
54 __u64 max_transid;
55
56 /* keys returned will be >= min and <= max */
57 __u32 min_type;
58 __u32 max_type;
59
60 /*
61 * how many items did userland ask for, and how many are we
62 * returning
63 */
64 __u32 nr_items;
65
66 /* align to 64 bits */
67 __u32 unused;
68
69 /* some extra for later */
70 __u64 unused1;
71 __u64 unused2;
72 __u64 unused3;
73 __u64 unused4;
74};
75
76struct btrfs_ioctl_search_header {
77 __u64 transid;
78 __u64 objectid;
79 __u64 offset;
80 __u32 type;
81 __u32 len;
82};
83
84#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
85/*
86 * the buf is an array of search headers where
87 * each header is followed by the actual item
88 * the type field is expanded to 32 bits for alignment
89 */
90struct btrfs_ioctl_search_args {
91 struct btrfs_ioctl_search_key key;
92 char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
93};
94
33struct btrfs_ioctl_clone_range_args { 95struct btrfs_ioctl_clone_range_args {
34 __s64 src_fd; 96 __s64 src_fd;
35 __u64 src_offset, src_length; 97 __u64 src_offset, src_length;
36 __u64 dest_offset; 98 __u64 dest_offset;
37}; 99};
38 100
101/* flags for the defrag range ioctl */
102#define BTRFS_DEFRAG_RANGE_COMPRESS 1
103#define BTRFS_DEFRAG_RANGE_START_IO 2
104
105struct btrfs_ioctl_defrag_range_args {
106 /* start of the defrag operation */
107 __u64 start;
108
109 /* number of bytes to defrag, use (u64)-1 to say all */
110 __u64 len;
111
112 /*
113 * flags for the operation, which can include turning
114 * on compression for this one defrag
115 */
116 __u64 flags;
117
118 /*
119 * any extent bigger than this will be considered
 120 * already defragged. Use 0 to take the kernel default;
 121 * use 1 to say every single extent must be rewritten
122 */
123 __u32 extent_thresh;
124
125 /* spare for later */
126 __u32 unused[5];
127};
128
129struct btrfs_ioctl_space_info {
130 __u64 flags;
131 __u64 total_bytes;
132 __u64 used_bytes;
133};
134
135struct btrfs_ioctl_space_args {
136 __u64 space_slots;
137 __u64 total_spaces;
138 struct btrfs_ioctl_space_info spaces[0];
139};
140
39#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 141#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
40 struct btrfs_ioctl_vol_args) 142 struct btrfs_ioctl_vol_args)
41#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 143#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -67,4 +169,13 @@ struct btrfs_ioctl_clone_range_args {
67 struct btrfs_ioctl_vol_args) 169 struct btrfs_ioctl_vol_args)
68#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ 170#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
69 struct btrfs_ioctl_vol_args) 171 struct btrfs_ioctl_vol_args)
172#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
173 struct btrfs_ioctl_defrag_range_args)
174#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
175 struct btrfs_ioctl_search_args)
176#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
177 struct btrfs_ioctl_ino_lookup_args)
178#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
179#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
180 struct btrfs_ioctl_space_args)
70#endif 181#endif
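Taken together, the additions above define a ranged defrag with optional one-shot compression. A hypothetical userspace caller (fd open on a btrfs file; recall from the ioctl.c hunks that setting COMPRESS makes the kernel force START_IO and an extent_thresh of (u32)-1):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/types.h>
	#include "ioctl.h"

	static int defrag_and_compress(int fd)
	{
		struct btrfs_ioctl_defrag_range_args range;

		memset(&range, 0, sizeof(range));
		range.start = 0;
		range.len = (__u64)-1;			/* whole file */
		range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
		range.extent_thresh = 0;		/* 0 = kernel default */

		return ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
	}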
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/spinlock.h> 20#include <linux/spinlock.h>
22#include <linux/page-flags.h> 21#include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c2a9e78a949..a127c0ebb2dc 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/gfp.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
@@ -174,7 +173,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 if (!entry) 173 if (!entry)
175 return -ENOMEM; 174 return -ENOMEM;
176 175
177 mutex_lock(&tree->mutex);
178 entry->file_offset = file_offset; 176 entry->file_offset = file_offset;
179 entry->start = start; 177 entry->start = start;
180 entry->len = len; 178 entry->len = len;
@@ -190,16 +188,17 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
190 INIT_LIST_HEAD(&entry->list); 188 INIT_LIST_HEAD(&entry->list);
191 INIT_LIST_HEAD(&entry->root_extent_list); 189 INIT_LIST_HEAD(&entry->root_extent_list);
192 190
191 spin_lock(&tree->lock);
193 node = tree_insert(&tree->tree, file_offset, 192 node = tree_insert(&tree->tree, file_offset,
194 &entry->rb_node); 193 &entry->rb_node);
195 BUG_ON(node); 194 BUG_ON(node);
195 spin_unlock(&tree->lock);
196 196
197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
198 list_add_tail(&entry->root_extent_list, 198 list_add_tail(&entry->root_extent_list,
199 &BTRFS_I(inode)->root->fs_info->ordered_extents); 199 &BTRFS_I(inode)->root->fs_info->ordered_extents);
200 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 200 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
201 201
202 mutex_unlock(&tree->mutex);
203 BUG_ON(node); 202 BUG_ON(node);
204 return 0; 203 return 0;
205} 204}
@@ -216,9 +215,9 @@ int btrfs_add_ordered_sum(struct inode *inode,
216 struct btrfs_ordered_inode_tree *tree; 215 struct btrfs_ordered_inode_tree *tree;
217 216
218 tree = &BTRFS_I(inode)->ordered_tree; 217 tree = &BTRFS_I(inode)->ordered_tree;
219 mutex_lock(&tree->mutex); 218 spin_lock(&tree->lock);
220 list_add_tail(&sum->list, &entry->list); 219 list_add_tail(&sum->list, &entry->list);
221 mutex_unlock(&tree->mutex); 220 spin_unlock(&tree->lock);
222 return 0; 221 return 0;
223} 222}
224 223
@@ -232,15 +231,16 @@ int btrfs_add_ordered_sum(struct inode *inode,
232 * to make sure this function only returns 1 once for a given ordered extent. 231 * to make sure this function only returns 1 once for a given ordered extent.
233 */ 232 */
234int btrfs_dec_test_ordered_pending(struct inode *inode, 233int btrfs_dec_test_ordered_pending(struct inode *inode,
234 struct btrfs_ordered_extent **cached,
235 u64 file_offset, u64 io_size) 235 u64 file_offset, u64 io_size)
236{ 236{
237 struct btrfs_ordered_inode_tree *tree; 237 struct btrfs_ordered_inode_tree *tree;
238 struct rb_node *node; 238 struct rb_node *node;
239 struct btrfs_ordered_extent *entry; 239 struct btrfs_ordered_extent *entry = NULL;
240 int ret; 240 int ret;
241 241
242 tree = &BTRFS_I(inode)->ordered_tree; 242 tree = &BTRFS_I(inode)->ordered_tree;
243 mutex_lock(&tree->mutex); 243 spin_lock(&tree->lock);
244 node = tree_search(tree, file_offset); 244 node = tree_search(tree, file_offset);
245 if (!node) { 245 if (!node) {
246 ret = 1; 246 ret = 1;
@@ -264,7 +264,11 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
264 else 264 else
265 ret = 1; 265 ret = 1;
266out: 266out:
267 mutex_unlock(&tree->mutex); 267 if (!ret && cached && entry) {
268 *cached = entry;
269 atomic_inc(&entry->refs);
270 }
271 spin_unlock(&tree->lock);
268 return ret == 0; 272 return ret == 0;
269} 273}
270 274
@@ -291,13 +295,14 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
291 295
292/* 296/*
293 * remove an ordered extent from the tree. No references are dropped 297 * remove an ordered extent from the tree. No references are dropped
294 * and you must wake_up entry->wait. You must hold the tree mutex 298 * and you must wake_up entry->wait. You must hold the tree lock
295 * while you call this function. 299 * while you call this function.
296 */ 300 */
297static int __btrfs_remove_ordered_extent(struct inode *inode, 301static int __btrfs_remove_ordered_extent(struct inode *inode,
298 struct btrfs_ordered_extent *entry) 302 struct btrfs_ordered_extent *entry)
299{ 303{
300 struct btrfs_ordered_inode_tree *tree; 304 struct btrfs_ordered_inode_tree *tree;
305 struct btrfs_root *root = BTRFS_I(inode)->root;
301 struct rb_node *node; 306 struct rb_node *node;
302 307
303 tree = &BTRFS_I(inode)->ordered_tree; 308 tree = &BTRFS_I(inode)->ordered_tree;
@@ -307,12 +312,13 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
307 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
308 313
309 spin_lock(&BTRFS_I(inode)->accounting_lock); 314 spin_lock(&BTRFS_I(inode)->accounting_lock);
315 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
310 BTRFS_I(inode)->outstanding_extents--; 316 BTRFS_I(inode)->outstanding_extents--;
311 spin_unlock(&BTRFS_I(inode)->accounting_lock); 317 spin_unlock(&BTRFS_I(inode)->accounting_lock);
312 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, 318 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
313 inode, 1); 319 inode, 1);
314 320
315 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 321 spin_lock(&root->fs_info->ordered_extent_lock);
316 list_del_init(&entry->root_extent_list); 322 list_del_init(&entry->root_extent_list);
317 323
318 /* 324 /*
@@ -324,7 +330,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
324 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 330 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
325 list_del_init(&BTRFS_I(inode)->ordered_operations); 331 list_del_init(&BTRFS_I(inode)->ordered_operations);
326 } 332 }
327 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 333 spin_unlock(&root->fs_info->ordered_extent_lock);
328 334
329 return 0; 335 return 0;
330} 336}
@@ -340,9 +346,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
340 int ret; 346 int ret;
341 347
342 tree = &BTRFS_I(inode)->ordered_tree; 348 tree = &BTRFS_I(inode)->ordered_tree;
343 mutex_lock(&tree->mutex); 349 spin_lock(&tree->lock);
344 ret = __btrfs_remove_ordered_extent(inode, entry); 350 ret = __btrfs_remove_ordered_extent(inode, entry);
345 mutex_unlock(&tree->mutex); 351 spin_unlock(&tree->lock);
346 wake_up(&entry->wait); 352 wake_up(&entry->wait);
347 353
348 return ret; 354 return ret;
@@ -567,7 +573,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
567 struct btrfs_ordered_extent *entry = NULL; 573 struct btrfs_ordered_extent *entry = NULL;
568 574
569 tree = &BTRFS_I(inode)->ordered_tree; 575 tree = &BTRFS_I(inode)->ordered_tree;
570 mutex_lock(&tree->mutex); 576 spin_lock(&tree->lock);
571 node = tree_search(tree, file_offset); 577 node = tree_search(tree, file_offset);
572 if (!node) 578 if (!node)
573 goto out; 579 goto out;
@@ -578,7 +584,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
578 if (entry) 584 if (entry)
579 atomic_inc(&entry->refs); 585 atomic_inc(&entry->refs);
580out: 586out:
581 mutex_unlock(&tree->mutex); 587 spin_unlock(&tree->lock);
582 return entry; 588 return entry;
583} 589}
584 590
@@ -594,7 +600,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
594 struct btrfs_ordered_extent *entry = NULL; 600 struct btrfs_ordered_extent *entry = NULL;
595 601
596 tree = &BTRFS_I(inode)->ordered_tree; 602 tree = &BTRFS_I(inode)->ordered_tree;
597 mutex_lock(&tree->mutex); 603 spin_lock(&tree->lock);
598 node = tree_search(tree, file_offset); 604 node = tree_search(tree, file_offset);
599 if (!node) 605 if (!node)
600 goto out; 606 goto out;
@@ -602,7 +608,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
602 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 608 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
603 atomic_inc(&entry->refs); 609 atomic_inc(&entry->refs);
604out: 610out:
605 mutex_unlock(&tree->mutex); 611 spin_unlock(&tree->lock);
606 return entry; 612 return entry;
607} 613}
608 614
@@ -629,7 +635,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
629 else 635 else
630 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); 636 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
631 637
632 mutex_lock(&tree->mutex); 638 spin_lock(&tree->lock);
633 disk_i_size = BTRFS_I(inode)->disk_i_size; 639 disk_i_size = BTRFS_I(inode)->disk_i_size;
634 640
635 /* truncate file */ 641 /* truncate file */
@@ -735,7 +741,7 @@ out:
735 */ 741 */
736 if (ordered) 742 if (ordered)
737 __btrfs_remove_ordered_extent(inode, ordered); 743 __btrfs_remove_ordered_extent(inode, ordered);
738 mutex_unlock(&tree->mutex); 744 spin_unlock(&tree->lock);
739 if (ordered) 745 if (ordered)
740 wake_up(&ordered->wait); 746 wake_up(&ordered->wait);
741 return ret; 747 return ret;
@@ -762,7 +768,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
762 if (!ordered) 768 if (!ordered)
763 return 1; 769 return 1;
764 770
765 mutex_lock(&tree->mutex); 771 spin_lock(&tree->lock);
766 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 772 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
767 if (disk_bytenr >= ordered_sum->bytenr) { 773 if (disk_bytenr >= ordered_sum->bytenr) {
768 num_sectors = ordered_sum->len / sectorsize; 774 num_sectors = ordered_sum->len / sectorsize;
@@ -777,7 +783,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
777 } 783 }
778 } 784 }
779out: 785out:
780 mutex_unlock(&tree->mutex); 786 spin_unlock(&tree->lock);
781 btrfs_put_ordered_extent(ordered); 787 btrfs_put_ordered_extent(ordered);
782 return ret; 788 return ret;
783} 789}
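The new cached argument to btrfs_dec_test_ordered_pending() hands the caller the ordered extent that just completed, with a reference already held, so a completion path no longer needs a second tree lookup. A sketch of the intended calling pattern (the wrapper function name here is illustrative, not from the kernel):

	static void on_write_finished(struct inode *inode, u64 start, u64 len)
	{
		struct btrfs_ordered_extent *ordered = NULL;

		if (!btrfs_dec_test_ordered_pending(inode, &ordered, start, len))
			return;	/* part of the ordered extent is still in flight */

		/* 'ordered' came back referenced: finish bookkeeping on it
		 * directly instead of calling btrfs_lookup_ordered_extent() */
		btrfs_remove_ordered_extent(inode, ordered);
		btrfs_put_ordered_extent(ordered);
	}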
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9116c6d0c5a9..c82f76a9f040 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -21,7 +21,7 @@
21 21
22/* one of these per inode */ 22/* one of these per inode */
23struct btrfs_ordered_inode_tree { 23struct btrfs_ordered_inode_tree {
24 struct mutex mutex; 24 spinlock_t lock;
25 struct rb_root tree; 25 struct rb_root tree;
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
@@ -128,7 +128,7 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
128static inline void 128static inline void
129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) 129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
130{ 130{
131 mutex_init(&t->mutex); 131 spin_lock_init(&t->lock);
132 t->tree = RB_ROOT; 132 t->tree = RB_ROOT;
133 t->last = NULL; 133 t->last = NULL;
134} 134}
@@ -137,7 +137,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
137int btrfs_remove_ordered_extent(struct inode *inode, 137int btrfs_remove_ordered_extent(struct inode *inode,
138 struct btrfs_ordered_extent *entry); 138 struct btrfs_ordered_extent *entry);
139int btrfs_dec_test_ordered_pending(struct inode *inode, 139int btrfs_dec_test_ordered_pending(struct inode *inode,
140 u64 file_offset, u64 io_size); 140 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size);
141int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
142 u64 start, u64 len, u64 disk_len, int type); 143int btrfs_add_ordered_sum(struct inode *inode,
143int btrfs_add_ordered_sum(struct inode *inode, 144int btrfs_add_ordered_sum(struct inode *inode,
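Turning the per-inode tree mutex into a spinlock lets the ordered tree be probed from contexts that cannot sleep, at the cost of a stricter rule: nothing that may block can run under tree->lock. A minimal sketch of the constraint the ordered-data.c conversion above observes:

	struct btrfs_ordered_inode_tree tree;

	btrfs_ordered_inode_tree_init(&tree);	/* now spin_lock_init() */

	spin_lock(&tree.lock);
	/* rb-tree search/insert only: no GFP_NOFS allocation, no
	 * wait_event() and no mutex_lock() while this is held */
	spin_unlock(&tree.lock);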
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "ref-cache.h" 23#include "ref-cache.h"
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0109e5606bad..e558dd941ded 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "disk-io.h" 26#include "disk-io.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -2659,7 +2660,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2659 EXTENT_BOUNDARY, GFP_NOFS); 2660 EXTENT_BOUNDARY, GFP_NOFS);
2660 nr++; 2661 nr++;
2661 } 2662 }
2662 btrfs_set_extent_delalloc(inode, page_start, page_end); 2663 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2663 2664
2664 set_page_dirty(page); 2665 set_page_dirty(page);
2665 dirty_page++; 2666 dirty_page++;
@@ -3487,7 +3488,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3487 key.objectid = objectid; 3488 key.objectid = objectid;
3488 key.type = BTRFS_INODE_ITEM_KEY; 3489 key.type = BTRFS_INODE_ITEM_KEY;
3489 key.offset = 0; 3490 key.offset = 0;
3490 inode = btrfs_iget(root->fs_info->sb, &key, root); 3491 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
3491 BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); 3492 BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3492 BTRFS_I(inode)->index_cnt = group->key.objectid; 3493 BTRFS_I(inode)->index_cnt = group->key.objectid;
3493 3494
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f8b4521de907..1866dff0538e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -63,22 +64,21 @@ static void btrfs_put_super(struct super_block *sb)
63} 64}
64 65
65enum { 66enum {
66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 67 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, 69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
69 Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, 70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
70 Opt_flushoncommit,
71 Opt_discard, Opt_err, 71 Opt_discard, Opt_err,
72}; 72};
73 73
74static match_table_t tokens = { 74static match_table_t tokens = {
75 {Opt_degraded, "degraded"}, 75 {Opt_degraded, "degraded"},
76 {Opt_subvol, "subvol=%s"}, 76 {Opt_subvol, "subvol=%s"},
77 {Opt_subvolid, "subvolid=%d"},
77 {Opt_device, "device=%s"}, 78 {Opt_device, "device=%s"},
78 {Opt_nodatasum, "nodatasum"}, 79 {Opt_nodatasum, "nodatasum"},
79 {Opt_nodatacow, "nodatacow"}, 80 {Opt_nodatacow, "nodatacow"},
80 {Opt_nobarrier, "nobarrier"}, 81 {Opt_nobarrier, "nobarrier"},
81 {Opt_max_extent, "max_extent=%s"},
82 {Opt_max_inline, "max_inline=%s"}, 82 {Opt_max_inline, "max_inline=%s"},
83 {Opt_alloc_start, "alloc_start=%s"}, 83 {Opt_alloc_start, "alloc_start=%s"},
84 {Opt_thread_pool, "thread_pool=%d"}, 84 {Opt_thread_pool, "thread_pool=%d"},
@@ -95,31 +95,6 @@ static match_table_t tokens = {
95 {Opt_err, NULL}, 95 {Opt_err, NULL},
96}; 96};
97 97
98u64 btrfs_parse_size(char *str)
99{
100 u64 res;
101 int mult = 1;
102 char *end;
103 char last;
104
105 res = simple_strtoul(str, &end, 10);
106
107 last = end[0];
108 if (isalpha(last)) {
109 last = tolower(last);
110 switch (last) {
111 case 'g':
112 mult *= 1024;
113 case 'm':
114 mult *= 1024;
115 case 'k':
116 mult *= 1024;
117 }
118 res = res * mult;
119 }
120 return res;
121}
122
123/* 98/*
124 * Regular mount options parser. Everything that is needed only when 99 * Regular mount options parser. Everything that is needed only when
125 * reading in a new superblock is parsed here. 100 * reading in a new superblock is parsed here.
@@ -157,6 +132,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
157 btrfs_set_opt(info->mount_opt, DEGRADED); 132 btrfs_set_opt(info->mount_opt, DEGRADED);
158 break; 133 break;
159 case Opt_subvol: 134 case Opt_subvol:
135 case Opt_subvolid:
160 case Opt_device: 136 case Opt_device:
161 /* 137 /*
162 * These are parsed by btrfs_parse_early_options 138 * These are parsed by btrfs_parse_early_options
@@ -211,22 +187,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
211 info->thread_pool_size); 187 info->thread_pool_size);
212 } 188 }
213 break; 189 break;
214 case Opt_max_extent:
215 num = match_strdup(&args[0]);
216 if (num) {
217 info->max_extent = btrfs_parse_size(num);
218 kfree(num);
219
220 info->max_extent = max_t(u64,
221 info->max_extent, root->sectorsize);
222 printk(KERN_INFO "btrfs: max_extent at %llu\n",
223 (unsigned long long)info->max_extent);
224 }
225 break;
226 case Opt_max_inline: 190 case Opt_max_inline:
227 num = match_strdup(&args[0]); 191 num = match_strdup(&args[0]);
228 if (num) { 192 if (num) {
229 info->max_inline = btrfs_parse_size(num); 193 info->max_inline = memparse(num, NULL);
230 kfree(num); 194 kfree(num);
231 195
232 if (info->max_inline) { 196 if (info->max_inline) {
@@ -241,7 +205,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
241 case Opt_alloc_start: 205 case Opt_alloc_start:
242 num = match_strdup(&args[0]); 206 num = match_strdup(&args[0]);
243 if (num) { 207 if (num) {
244 info->alloc_start = btrfs_parse_size(num); 208 info->alloc_start = memparse(num, NULL);
245 kfree(num); 209 kfree(num);
246 printk(KERN_INFO 210 printk(KERN_INFO
247 "btrfs: allocations start at %llu\n", 211 "btrfs: allocations start at %llu\n",
@@ -292,12 +256,13 @@ out:
292 * only when we need to allocate a new super block. 256 * only when we need to allocate a new super block.
293 */ 257 */
294static int btrfs_parse_early_options(const char *options, fmode_t flags, 258static int btrfs_parse_early_options(const char *options, fmode_t flags,
295 void *holder, char **subvol_name, 259 void *holder, char **subvol_name, u64 *subvol_objectid,
296 struct btrfs_fs_devices **fs_devices) 260 struct btrfs_fs_devices **fs_devices)
297{ 261{
298 substring_t args[MAX_OPT_ARGS]; 262 substring_t args[MAX_OPT_ARGS];
299 char *opts, *p; 263 char *opts, *p;
300 int error = 0; 264 int error = 0;
265 int intarg;
301 266
302 if (!options) 267 if (!options)
303 goto out; 268 goto out;
@@ -320,6 +285,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
320 case Opt_subvol: 285 case Opt_subvol:
321 *subvol_name = match_strdup(&args[0]); 286 *subvol_name = match_strdup(&args[0]);
322 break; 287 break;
288 case Opt_subvolid:
289 intarg = 0;
290 error = match_int(&args[0], &intarg);
291 if (!error) {
292 /* we want the original fs_tree */
293 if (!intarg)
294 *subvol_objectid =
295 BTRFS_FS_TREE_OBJECTID;
296 else
297 *subvol_objectid = intarg;
298 }
299 break;
323 case Opt_device: 300 case Opt_device:
324 error = btrfs_scan_one_device(match_strdup(&args[0]), 301 error = btrfs_scan_one_device(match_strdup(&args[0]),
325 flags, holder, fs_devices); 302 flags, holder, fs_devices);
@@ -347,6 +324,110 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
347 return error; 324 return error;
348} 325}
349 326
327static struct dentry *get_default_root(struct super_block *sb,
328 u64 subvol_objectid)
329{
330 struct btrfs_root *root = sb->s_fs_info;
331 struct btrfs_root *new_root;
332 struct btrfs_dir_item *di;
333 struct btrfs_path *path;
334 struct btrfs_key location;
335 struct inode *inode;
336 struct dentry *dentry;
337 u64 dir_id;
338 int new = 0;
339
340 /*
341 * We have a specific subvol we want to mount, just setup location and
342 * go look up the root.
343 */
344 if (subvol_objectid) {
345 location.objectid = subvol_objectid;
346 location.type = BTRFS_ROOT_ITEM_KEY;
347 location.offset = (u64)-1;
348 goto find_root;
349 }
350
351 path = btrfs_alloc_path();
352 if (!path)
353 return ERR_PTR(-ENOMEM);
354 path->leave_spinning = 1;
355
356 /*
357 * Find the "default" dir item which points to the root item that we
358 * will mount by default if we haven't been given a specific subvolume
359 * to mount.
360 */
361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (!di) {
364 /*
365 * Ok the default dir item isn't there. This is weird since
366	 * it's always been there, but don't freak out, just try to 367	 * mount the root-most subvolume.
368 */
369 btrfs_free_path(path);
370 dir_id = BTRFS_FIRST_FREE_OBJECTID;
371 new_root = root->fs_info->fs_root;
372 goto setup_root;
373 }
374
375 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
376 btrfs_free_path(path);
377
378find_root:
379 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
380 if (IS_ERR(new_root))
381 return ERR_PTR(PTR_ERR(new_root));
382
383 if (btrfs_root_refs(&new_root->root_item) == 0)
384 return ERR_PTR(-ENOENT);
385
386 dir_id = btrfs_root_dirid(&new_root->root_item);
387setup_root:
388 location.objectid = dir_id;
389 location.type = BTRFS_INODE_ITEM_KEY;
390 location.offset = 0;
391
392 inode = btrfs_iget(sb, &location, new_root, &new);
393 if (!inode)
394 return ERR_PTR(-ENOMEM);
395
396 /*
397	 * If we're just mounting the root-most subvolume, put the inode and return
398 * a reference to the dentry. We will have already gotten a reference
399 * to the inode in btrfs_fill_super so we're good to go.
400 */
401 if (!new && sb->s_root->d_inode == inode) {
402 iput(inode);
403 return dget(sb->s_root);
404 }
405
406 if (new) {
407 const struct qstr name = { .name = "/", .len = 1 };
408
409 /*
410 * New inode, we need to make the dentry a sibling of s_root so
411 * everything gets cleaned up properly on unmount.
412 */
413 dentry = d_alloc(sb->s_root, &name);
414 if (!dentry) {
415 iput(inode);
416 return ERR_PTR(-ENOMEM);
417 }
418 d_splice_alias(inode, dentry);
419 } else {
420 /*
421 * We found the inode in cache, just find a dentry for it and
422 * put the reference to the inode we just got.
423 */
424 dentry = d_find_alias(inode);
425 iput(inode);
426 }
427
428 return dentry;
429}
430
350static int btrfs_fill_super(struct super_block *sb, 431static int btrfs_fill_super(struct super_block *sb,
351 struct btrfs_fs_devices *fs_devices, 432 struct btrfs_fs_devices *fs_devices,
352 void *data, int silent) 433 void *data, int silent)
@@ -380,7 +461,7 @@ static int btrfs_fill_super(struct super_block *sb,
380 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 461 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
381 key.type = BTRFS_INODE_ITEM_KEY; 462 key.type = BTRFS_INODE_ITEM_KEY;
382 key.offset = 0; 463 key.offset = 0;
383 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); 464 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
384 if (IS_ERR(inode)) { 465 if (IS_ERR(inode)) {
385 err = PTR_ERR(inode); 466 err = PTR_ERR(inode);
386 goto fail_close; 467 goto fail_close;
@@ -392,12 +473,6 @@ static int btrfs_fill_super(struct super_block *sb,
392 err = -ENOMEM; 473 err = -ENOMEM;
393 goto fail_close; 474 goto fail_close;
394 } 475 }
395#if 0
396 /* this does the super kobj at the same time */
397 err = btrfs_sysfs_add_super(tree_root->fs_info);
398 if (err)
399 goto fail_close;
400#endif
401 476
402 sb->s_root = root_dentry; 477 sb->s_root = root_dentry;
403 478
@@ -441,9 +516,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
441 seq_puts(seq, ",nodatacow"); 516 seq_puts(seq, ",nodatacow");
442 if (btrfs_test_opt(root, NOBARRIER)) 517 if (btrfs_test_opt(root, NOBARRIER))
443 seq_puts(seq, ",nobarrier"); 518 seq_puts(seq, ",nobarrier");
444 if (info->max_extent != (u64)-1)
445 seq_printf(seq, ",max_extent=%llu",
446 (unsigned long long)info->max_extent);
447 if (info->max_inline != 8192 * 1024) 519 if (info->max_inline != 8192 * 1024)
448 seq_printf(seq, ",max_inline=%llu", 520 seq_printf(seq, ",max_inline=%llu",
449 (unsigned long long)info->max_inline); 521 (unsigned long long)info->max_inline);
@@ -489,19 +561,22 @@ static int btrfs_test_super(struct super_block *s, void *data)
489static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 561static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
490 const char *dev_name, void *data, struct vfsmount *mnt) 562 const char *dev_name, void *data, struct vfsmount *mnt)
491{ 563{
492 char *subvol_name = NULL;
493 struct block_device *bdev = NULL; 564 struct block_device *bdev = NULL;
494 struct super_block *s; 565 struct super_block *s;
495 struct dentry *root; 566 struct dentry *root;
496 struct btrfs_fs_devices *fs_devices = NULL; 567 struct btrfs_fs_devices *fs_devices = NULL;
497 fmode_t mode = FMODE_READ; 568 fmode_t mode = FMODE_READ;
569 char *subvol_name = NULL;
570 u64 subvol_objectid = 0;
498 int error = 0; 571 int error = 0;
572 int found = 0;
499 573
500 if (!(flags & MS_RDONLY)) 574 if (!(flags & MS_RDONLY))
501 mode |= FMODE_WRITE; 575 mode |= FMODE_WRITE;
502 576
503 error = btrfs_parse_early_options(data, mode, fs_type, 577 error = btrfs_parse_early_options(data, mode, fs_type,
504 &subvol_name, &fs_devices); 578 &subvol_name, &subvol_objectid,
579 &fs_devices);
505 if (error) 580 if (error)
506 return error; 581 return error;
507 582
@@ -530,6 +605,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
530 goto error_close_devices; 605 goto error_close_devices;
531 } 606 }
532 607
608 found = 1;
533 btrfs_close_devices(fs_devices); 609 btrfs_close_devices(fs_devices);
534 } else { 610 } else {
535 char b[BDEVNAME_SIZE]; 611 char b[BDEVNAME_SIZE];
@@ -547,25 +623,35 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
547 s->s_flags |= MS_ACTIVE; 623 s->s_flags |= MS_ACTIVE;
548 } 624 }
549 625
550 if (!strcmp(subvol_name, ".")) 626 root = get_default_root(s, subvol_objectid);
551 root = dget(s->s_root); 627 if (IS_ERR(root)) {
552 else { 628 error = PTR_ERR(root);
553 mutex_lock(&s->s_root->d_inode->i_mutex); 629 deactivate_locked_super(s);
554 root = lookup_one_len(subvol_name, s->s_root, 630 goto error;
631 }
632 /* if they gave us a subvolume name bind mount into that */
633 if (strcmp(subvol_name, ".")) {
634 struct dentry *new_root;
635 mutex_lock(&root->d_inode->i_mutex);
636 new_root = lookup_one_len(subvol_name, root,
555 strlen(subvol_name)); 637 strlen(subvol_name));
556 mutex_unlock(&s->s_root->d_inode->i_mutex); 638 mutex_unlock(&root->d_inode->i_mutex);
557 639
558 if (IS_ERR(root)) { 640 if (IS_ERR(new_root)) {
559 deactivate_locked_super(s); 641 deactivate_locked_super(s);
560 error = PTR_ERR(root); 642 error = PTR_ERR(new_root);
561 goto error_free_subvol_name; 643 dput(root);
644 goto error_close_devices;
562 } 645 }
563 if (!root->d_inode) { 646 if (!new_root->d_inode) {
564 dput(root); 647 dput(root);
648 dput(new_root);
565 deactivate_locked_super(s); 649 deactivate_locked_super(s);
566 error = -ENXIO; 650 error = -ENXIO;
567 goto error_free_subvol_name; 651 goto error_close_devices;
568 } 652 }
653 dput(root);
654 root = new_root;
569 } 655 }
570 656
571 mnt->mnt_sb = s; 657 mnt->mnt_sb = s;
@@ -580,6 +666,7 @@ error_close_devices:
580 btrfs_close_devices(fs_devices); 666 btrfs_close_devices(fs_devices);
581error_free_subvol_name: 667error_free_subvol_name:
582 kfree(subvol_name); 668 kfree(subvol_name);
669error:
583 return error; 670 return error;
584} 671}
585 672
@@ -624,14 +711,37 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
624{ 711{
625 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 712 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
626 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 713 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
714 struct list_head *head = &root->fs_info->space_info;
715 struct btrfs_space_info *found;
716 u64 total_used = 0;
717 u64 data_used = 0;
627 int bits = dentry->d_sb->s_blocksize_bits; 718 int bits = dentry->d_sb->s_blocksize_bits;
628 __be32 *fsid = (__be32 *)root->fs_info->fsid; 719 __be32 *fsid = (__be32 *)root->fs_info->fsid;
629 720
721 rcu_read_lock();
722 list_for_each_entry_rcu(found, head, list) {
723 if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
724 BTRFS_BLOCK_GROUP_RAID10|
725 BTRFS_BLOCK_GROUP_RAID1)) {
726 total_used += found->bytes_used;
727 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
728 data_used += found->bytes_used;
729 else
730 data_used += found->total_bytes;
731 }
732
733 total_used += found->bytes_used;
734 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
735 data_used += found->bytes_used;
736 else
737 data_used += found->total_bytes;
738 }
739 rcu_read_unlock();
740
630 buf->f_namelen = BTRFS_NAME_LEN; 741 buf->f_namelen = BTRFS_NAME_LEN;
631 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 742 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
632 buf->f_bfree = buf->f_blocks - 743 buf->f_bfree = buf->f_blocks - (total_used >> bits);
633 (btrfs_super_bytes_used(disk_super) >> bits); 744 buf->f_bavail = buf->f_blocks - (data_used >> bits);
634 buf->f_bavail = buf->f_bfree;
635 buf->f_bsize = dentry->d_sb->s_blocksize; 745 buf->f_bsize = dentry->d_sb->s_blocksize;
636 buf->f_type = BTRFS_SUPER_MAGIC; 746 buf->f_type = BTRFS_SUPER_MAGIC;
637 747
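
One note on the super.c hunks above: the hand-rolled btrfs_parse_size() is dropped in favor of the kernel's generic memparse() helper, and both rely on the same trick of a fall-through switch to stack up the K/M/G multipliers. A minimal userspace sketch of that shared suffix logic (the in-kernel memparse() also accepts T, P and E, and can return an end pointer through its second argument):

    #include <ctype.h>
    #include <stdlib.h>

    static unsigned long long parse_size(const char *s)
    {
            char *end;
            unsigned long long v = strtoull(s, &end, 0);

            switch (tolower((unsigned char)*end)) {
            case 'g':
                    v <<= 10;       /* deliberate fall through */
            case 'm':
                    v <<= 10;       /* deliberate fall through */
            case 'k':
                    v <<= 10;
            }
            return v;
    }

With this, a mount option such as max_inline=8m parses to 8388608 bytes while the filesystem sheds roughly twenty-five lines of duplicated parsing code.
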
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2a36e236a492..2cb116099b90 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
@@ -147,18 +148,13 @@ static void wait_current_trans(struct btrfs_root *root)
147 while (1) { 148 while (1) {
148 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 149 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
149 TASK_UNINTERRUPTIBLE); 150 TASK_UNINTERRUPTIBLE);
150 if (cur_trans->blocked) { 151 if (!cur_trans->blocked)
151 mutex_unlock(&root->fs_info->trans_mutex);
152 schedule();
153 mutex_lock(&root->fs_info->trans_mutex);
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 } else {
157 finish_wait(&root->fs_info->transaction_wait,
158 &wait);
159 break; 152 break;
160 } 153 mutex_unlock(&root->fs_info->trans_mutex);
154 schedule();
155 mutex_lock(&root->fs_info->trans_mutex);
161 } 156 }
157 finish_wait(&root->fs_info->transaction_wait, &wait);
162 put_transaction(cur_trans); 158 put_transaction(cur_trans);
163 } 159 }
164} 160}
@@ -760,10 +756,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
760 struct btrfs_root_item *new_root_item; 756 struct btrfs_root_item *new_root_item;
761 struct btrfs_root *tree_root = fs_info->tree_root; 757 struct btrfs_root *tree_root = fs_info->tree_root;
762 struct btrfs_root *root = pending->root; 758 struct btrfs_root *root = pending->root;
759 struct btrfs_root *parent_root;
760 struct inode *parent_inode;
763 struct extent_buffer *tmp; 761 struct extent_buffer *tmp;
764 struct extent_buffer *old; 762 struct extent_buffer *old;
765 int ret; 763 int ret;
766 u64 objectid; 764 u64 objectid;
765 int namelen;
766 u64 index = 0;
767
768 parent_inode = pending->dentry->d_parent->d_inode;
769 parent_root = BTRFS_I(parent_inode)->root;
767 770
768 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 771 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
769 if (!new_root_item) { 772 if (!new_root_item) {
@@ -774,79 +777,59 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
774 if (ret) 777 if (ret)
775 goto fail; 778 goto fail;
776 779
777 record_root_in_trans(trans, root);
778 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
779 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
780
781 key.objectid = objectid; 780 key.objectid = objectid;
782 /* record when the snapshot was created in key.offset */ 781 /* record when the snapshot was created in key.offset */
783 key.offset = trans->transid; 782 key.offset = trans->transid;
784 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 783 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
785 784
786 old = btrfs_lock_root_node(root);
787 btrfs_cow_block(trans, root, old, NULL, 0, &old);
788 btrfs_set_lock_blocking(old);
789
790 btrfs_copy_root(trans, root, old, &tmp, objectid);
791 btrfs_tree_unlock(old);
792 free_extent_buffer(old);
793
794 btrfs_set_root_node(new_root_item, tmp);
795 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
796 new_root_item);
797 btrfs_tree_unlock(tmp);
798 free_extent_buffer(tmp);
799 if (ret)
800 goto fail;
801
802 key.offset = (u64)-1;
803 memcpy(&pending->root_key, &key, sizeof(key)); 785 memcpy(&pending->root_key, &key, sizeof(key));
804fail: 786 pending->root_key.offset = (u64)-1;
805 kfree(new_root_item);
806 return ret;
807}
808
809static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
810 struct btrfs_pending_snapshot *pending)
811{
812 int ret;
813 int namelen;
814 u64 index = 0;
815 struct btrfs_trans_handle *trans;
816 struct inode *parent_inode;
817 struct btrfs_root *parent_root;
818
819 parent_inode = pending->dentry->d_parent->d_inode;
820 parent_root = BTRFS_I(parent_inode)->root;
821 trans = btrfs_join_transaction(parent_root, 1);
822 787
788 record_root_in_trans(trans, parent_root);
823 /* 789 /*
824 * insert the directory item 790 * insert the directory item
825 */ 791 */
826 namelen = strlen(pending->name); 792 namelen = strlen(pending->name);
827 ret = btrfs_set_inode_index(parent_inode, &index); 793 ret = btrfs_set_inode_index(parent_inode, &index);
794 BUG_ON(ret);
828 ret = btrfs_insert_dir_item(trans, parent_root, 795 ret = btrfs_insert_dir_item(trans, parent_root,
829 pending->name, namelen, 796 pending->name, namelen,
830 parent_inode->i_ino, 797 parent_inode->i_ino,
831 &pending->root_key, BTRFS_FT_DIR, index); 798 &pending->root_key, BTRFS_FT_DIR, index);
832 799 BUG_ON(ret);
833 if (ret)
834 goto fail;
835 800
836 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); 801 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
837 ret = btrfs_update_inode(trans, parent_root, parent_inode); 802 ret = btrfs_update_inode(trans, parent_root, parent_inode);
838 BUG_ON(ret); 803 BUG_ON(ret);
839 804
805 record_root_in_trans(trans, root);
806 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
807 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
808
809 old = btrfs_lock_root_node(root);
810 btrfs_cow_block(trans, root, old, NULL, 0, &old);
811 btrfs_set_lock_blocking(old);
812
813 btrfs_copy_root(trans, root, old, &tmp, objectid);
814 btrfs_tree_unlock(old);
815 free_extent_buffer(old);
816
817 btrfs_set_root_node(new_root_item, tmp);
818 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
819 new_root_item);
820 BUG_ON(ret);
821 btrfs_tree_unlock(tmp);
822 free_extent_buffer(tmp);
823
840 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 824 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
841 pending->root_key.objectid, 825 pending->root_key.objectid,
842 parent_root->root_key.objectid, 826 parent_root->root_key.objectid,
843 parent_inode->i_ino, index, pending->name, 827 parent_inode->i_ino, index, pending->name,
844 namelen); 828 namelen);
845
846 BUG_ON(ret); 829 BUG_ON(ret);
847 830
848fail: 831fail:
849 btrfs_end_transaction(trans, fs_info->fs_root); 832 kfree(new_root_item);
850 return ret; 833 return ret;
851} 834}
852 835
@@ -867,25 +850,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
867 return 0; 850 return 0;
868} 851}
869 852
870static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
871 struct btrfs_fs_info *fs_info)
872{
873 struct btrfs_pending_snapshot *pending;
874 struct list_head *head = &trans->transaction->pending_snapshots;
875 int ret;
876
877 while (!list_empty(head)) {
878 pending = list_entry(head->next,
879 struct btrfs_pending_snapshot, list);
880 ret = finish_pending_snapshot(fs_info, pending);
881 BUG_ON(ret);
882 list_del(&pending->list);
883 kfree(pending->name);
884 kfree(pending);
885 }
886 return 0;
887}
888
889static void update_super_roots(struct btrfs_root *root) 853static void update_super_roots(struct btrfs_root *root)
890{ 854{
891 struct btrfs_root_item *root_item; 855 struct btrfs_root_item *root_item;
@@ -997,13 +961,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
997 961
998 mutex_unlock(&root->fs_info->trans_mutex); 962 mutex_unlock(&root->fs_info->trans_mutex);
999 963
1000 if (flush_on_commit) { 964 if (flush_on_commit || snap_pending) {
1001 btrfs_start_delalloc_inodes(root, 1); 965 btrfs_start_delalloc_inodes(root, 1);
1002 ret = btrfs_wait_ordered_extents(root, 0, 1); 966 ret = btrfs_wait_ordered_extents(root, 0, 1);
1003 BUG_ON(ret); 967 BUG_ON(ret);
1004 } else if (snap_pending) {
1005 ret = btrfs_wait_ordered_extents(root, 0, 1);
1006 BUG_ON(ret);
1007 } 968 }
1008 969
1009 /* 970 /*
@@ -1100,9 +1061,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1100 1061
1101 btrfs_finish_extent_commit(trans, root); 1062 btrfs_finish_extent_commit(trans, root);
1102 1063
1103 /* do the directory inserts of any pending snapshot creations */
1104 finish_pending_snapshots(trans, root->fs_info);
1105
1106 mutex_lock(&root->fs_info->trans_mutex); 1064 mutex_lock(&root->fs_info->trans_mutex);
1107 1065
1108 cur_trans->commit_done = 1; 1066 cur_trans->commit_done = 1;
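
The wait_current_trans() hunk above collapses the two duplicated finish_wait() calls into a single one after the loop; what it converges on is the standard prepare_to_wait() sleep idiom. A hedged sketch with placeholder names ('wq', 'lock' and 'blocked' are not btrfs symbols):

    #include <linux/mutex.h>
    #include <linux/sched.h>
    #include <linux/wait.h>

    static void wait_until_unblocked(wait_queue_head_t *wq,
                                     struct mutex *lock, int *blocked)
    {
            DEFINE_WAIT(wait);

            while (1) {
                    /* queue ourselves *before* testing the condition, so
                     * a wake-up between the test and the sleep is never
                     * lost */
                    prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
                    if (!*blocked)
                            break;
                    mutex_unlock(lock);     /* never sleep holding the mutex */
                    schedule();
                    mutex_lock(lock);
            }
            finish_wait(wq, &wait);         /* exactly once, on every exit */
    }
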
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a9434b622ec..af57dd2b43d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "disk-io.h" 23#include "disk-io.h"
@@ -445,7 +446,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
445 key.objectid = objectid; 446 key.objectid = objectid;
446 key.type = BTRFS_INODE_ITEM_KEY; 447 key.type = BTRFS_INODE_ITEM_KEY;
447 key.offset = 0; 448 key.offset = 0;
448 inode = btrfs_iget(root->fs_info->sb, &key, root); 449 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
449 if (IS_ERR(inode)) { 450 if (IS_ERR(inode)) {
450 inode = NULL; 451 inode = NULL;
451 } else if (is_bad_inode(inode)) { 452 } else if (is_bad_inode(inode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 41ecbb2347f2..8db7b14bbae8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/random.h> 23#include <linux/random.h>
@@ -256,13 +257,13 @@ loop_lock:
256 wake_up(&fs_info->async_submit_wait); 257 wake_up(&fs_info->async_submit_wait);
257 258
258 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 259 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
259 submit_bio(cur->bi_rw, cur);
260 num_run++;
261 batch_run++;
262 260
263 if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) 261 if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
264 num_sync_run++; 262 num_sync_run++;
265 263
264 submit_bio(cur->bi_rw, cur);
265 num_run++;
266 batch_run++;
266 if (need_resched()) { 267 if (need_resched()) {
267 if (num_sync_run) { 268 if (num_sync_run) {
268 blk_run_backing_dev(bdi, NULL); 269 blk_run_backing_dev(bdi, NULL);
@@ -325,16 +326,6 @@ loop_lock:
325 num_sync_run = 0; 326 num_sync_run = 0;
326 blk_run_backing_dev(bdi, NULL); 327 blk_run_backing_dev(bdi, NULL);
327 } 328 }
328
329 cond_resched();
330 if (again)
331 goto loop;
332
333 spin_lock(&device->io_lock);
334 if (device->pending_bios.head || device->pending_sync_bios.head)
335 goto loop_lock;
336 spin_unlock(&device->io_lock);
337
338 /* 329 /*
339 * IO has already been through a long path to get here. Checksumming, 330 * IO has already been through a long path to get here. Checksumming,
340 * async helper threads, perhaps compression. We've done a pretty 331 * async helper threads, perhaps compression. We've done a pretty
@@ -346,6 +337,16 @@ loop_lock:
346 * cared about found its way down here. 337 * cared about found its way down here.
347 */ 338 */
348 blk_run_backing_dev(bdi, NULL); 339 blk_run_backing_dev(bdi, NULL);
340
341 cond_resched();
342 if (again)
343 goto loop;
344
345 spin_lock(&device->io_lock);
346 if (device->pending_bios.head || device->pending_sync_bios.head)
347 goto loop_lock;
348 spin_unlock(&device->io_lock);
349
349done: 350done:
350 return 0; 351 return 0;
351} 352}
@@ -365,6 +366,7 @@ static noinline int device_list_add(const char *path,
365 struct btrfs_device *device; 366 struct btrfs_device *device;
366 struct btrfs_fs_devices *fs_devices; 367 struct btrfs_fs_devices *fs_devices;
367 u64 found_transid = btrfs_super_generation(disk_super); 368 u64 found_transid = btrfs_super_generation(disk_super);
369 char *name;
368 370
369 fs_devices = find_fsid(disk_super->fsid); 371 fs_devices = find_fsid(disk_super->fsid);
370 if (!fs_devices) { 372 if (!fs_devices) {
@@ -411,6 +413,12 @@ static noinline int device_list_add(const char *path,
411 413
412 device->fs_devices = fs_devices; 414 device->fs_devices = fs_devices;
413 fs_devices->num_devices++; 415 fs_devices->num_devices++;
416 } else if (strcmp(device->name, path)) {
417 name = kstrdup(path, GFP_NOFS);
418 if (!name)
419 return -ENOMEM;
420 kfree(device->name);
421 device->name = name;
414 } 422 }
415 423
416 if (found_transid > fs_devices->latest_trans) { 424 if (found_transid > fs_devices->latest_trans) {
@@ -592,7 +600,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
592 goto error_close; 600 goto error_close;
593 601
594 disk_super = (struct btrfs_super_block *)bh->b_data; 602 disk_super = (struct btrfs_super_block *)bh->b_data;
595 devid = le64_to_cpu(disk_super->dev_item.devid); 603 devid = btrfs_stack_device_id(&disk_super->dev_item);
596 if (devid != device->devid) 604 if (devid != device->devid)
597 goto error_brelse; 605 goto error_brelse;
598 606
@@ -694,7 +702,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
694 goto error_close; 702 goto error_close;
695 } 703 }
696 disk_super = (struct btrfs_super_block *)bh->b_data; 704 disk_super = (struct btrfs_super_block *)bh->b_data;
697 devid = le64_to_cpu(disk_super->dev_item.devid); 705 devid = btrfs_stack_device_id(&disk_super->dev_item);
698 transid = btrfs_super_generation(disk_super); 706 transid = btrfs_super_generation(disk_super);
699 if (disk_super->label[0]) 707 if (disk_super->label[0])
700 printk(KERN_INFO "device label %s ", disk_super->label); 708 printk(KERN_INFO "device label %s ", disk_super->label);
@@ -1187,7 +1195,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1187 goto error_close; 1195 goto error_close;
1188 } 1196 }
1189 disk_super = (struct btrfs_super_block *)bh->b_data; 1197 disk_super = (struct btrfs_super_block *)bh->b_data;
1190 devid = le64_to_cpu(disk_super->dev_item.devid); 1198 devid = btrfs_stack_device_id(&disk_super->dev_item);
1191 dev_uuid = disk_super->dev_item.uuid; 1199 dev_uuid = disk_super->dev_item.uuid;
1192 device = btrfs_find_device(root, devid, dev_uuid, 1200 device = btrfs_find_device(root, devid, dev_uuid,
1193 disk_super->fsid); 1201 disk_super->fsid);
@@ -2191,9 +2199,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2191 min_stripes = 2; 2199 min_stripes = 2;
2192 } 2200 }
2193 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2201 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2194 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2202 if (fs_devices->rw_devices < 2)
2195 if (num_stripes < 2)
2196 return -ENOSPC; 2203 return -ENOSPC;
2204 num_stripes = 2;
2197 min_stripes = 2; 2205 min_stripes = 2;
2198 } 2206 }
2199 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2207 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -2237,8 +2245,16 @@ again:
2237 do_div(calc_size, stripe_len); 2245 do_div(calc_size, stripe_len);
2238 calc_size *= stripe_len; 2246 calc_size *= stripe_len;
2239 } 2247 }
2248
2240 /* we don't want tiny stripes */ 2249 /* we don't want tiny stripes */
2241 calc_size = max_t(u64, min_stripe_size, calc_size); 2250 if (!looped)
2251 calc_size = max_t(u64, min_stripe_size, calc_size);
2252
2253 /*
2254 * we're about to do_div by the stripe_len so lets make sure
2255 * we end up with something bigger than a stripe
2256 */
2257 calc_size = max_t(u64, calc_size, stripe_len * 4);
2242 2258
2243 do_div(calc_size, stripe_len); 2259 do_div(calc_size, stripe_len);
2244 calc_size *= stripe_len; 2260 calc_size *= stripe_len;
@@ -3382,6 +3398,8 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3382 key.type = 0; 3398 key.type = 0;
3383again: 3399again:
3384 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3400 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3401 if (ret < 0)
3402 goto error;
3385 while (1) { 3403 while (1) {
3386 leaf = path->nodes[0]; 3404 leaf = path->nodes[0];
3387 slot = path->slots[0]; 3405 slot = path->slots[0];
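
In the __btrfs_alloc_chunk() hunk above, calc_size is clamped to at least four stripes before the round-down, so a small allocation can no longer be rounded to zero stripes. The rounding itself is the usual divide-in-place pattern for 64-bit values (a plain '/' on a u64 is not usable on 32-bit kernels); a hedged sketch with a placeholder name:

    #include <asm/div64.h>          /* do_div() */
    #include <linux/kernel.h>       /* max_t() */

    static u64 round_to_stripes(u64 size, u32 stripe_len)
    {
            size = max_t(u64, size, (u64)stripe_len * 4);
            do_div(size, stripe_len);       /* size /= stripe_len, in place */
            return size * stripe_len;       /* whole stripes only */
    }
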
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 27089311fbea..37fe101a4e0d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <linux/mount.h> 13#include <linux/mount.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
14#include "internal.h" 15#include "internal.h"
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index eeb4986ea7db..d5db84a1ee0d 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -19,6 +19,7 @@
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/namei.h> 20#include <linux/namei.h>
21#include <linux/security.h> 21#include <linux/security.h>
22#include <linux/slab.h>
22#include "internal.h" 23#include "internal.h"
23 24
24#define CACHEFILES_KEYBUF_SIZE 512 25#define CACHEFILES_KEYBUF_SIZE 512
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 1d8332563863..0f0d41fbb03f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/mount.h> 12#include <linux/mount.h>
13#include <linux/slab.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include "internal.h" 15#include "internal.h"
15 16
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index f3e7a0bf068b..e18b183b47e1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -16,6 +16,7 @@
16#include <linux/fsnotify.h> 16#include <linux/fsnotify.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/xattr.h> 18#include <linux/xattr.h>
19#include <linux/slab.h>
19#include "internal.h" 20#include "internal.h"
20 21
21static const char cachefiles_xattr_cache[] = 22static const char cachefiles_xattr_cache[] =
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..04b8280582a9
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
5	select CRYPTO_AES
6 help
7 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely
9 scalable file system designed to provide high performance,
10 reliable access to petabytes of storage.
11
12 More information at http://ceph.newdream.net/.
13
14 If unsure, say N.
15
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
22	  line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o
8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
22# Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..412593703d1e
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1193 @@
1#include "ceph_debug.h"
2
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/slab.h>
9#include <linux/pagevec.h>
10#include <linux/task_io_accounting_ops.h>
11
12#include "super.h"
13#include "osd_client.h"
14
15/*
16 * Ceph address space ops.
17 *
18 * There are a few funny things going on here.
19 *
20 * The page->private field is used to reference a struct
21 * ceph_snap_context for _every_ dirty page. This indicates which
22 * snapshot the page was logically dirtied in, and thus which snap
23 * context needs to be associated with the osd write during writeback.
24 *
25 * Similarly, struct ceph_inode_info maintains a set of counters to
26 * count dirty pages on the inode. In the absence of snapshots,
27 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
28 *
29 * When a snapshot is taken (that is, when the client receives
30 * notification that a snapshot was taken), each inode with caps and
31 * with dirty pages (dirty pages implies there is a cap) gets a new
32 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
33 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
34 * moved to capsnap->dirty. (Unless a sync write is currently in
35 * progress. In that case, the capsnap is said to be "pending", new
36 * writes cannot start, and the capsnap isn't "finalized" until the
37 * write completes (or fails) and a final size/mtime for the inode for
38 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
39 *
40 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
41 * we look for the first capsnap in i_cap_snaps and write out pages in
42 * that snap context _only_. Then we move on to the next capsnap,
43 * eventually reaching the "live" or "head" context (i.e., pages that
44 * are not yet snapped) and write out the most recently dirtied
45 * pages.
46 *
47 * Invalidate and so forth must take care to ensure the dirty page
48 * accounting is preserved.
49 */
50
51#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
52#define CONGESTION_OFF_THRESH(congestion_kb) \
53 (CONGESTION_ON_THRESH(congestion_kb) - \
54 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
55
56
57
58/*
59 * Dirty a page. Optimistically adjust accounting, on the assumption
60 * that we won't race with invalidate. If we do, readjust.
61 */
62static int ceph_set_page_dirty(struct page *page)
63{
64 struct address_space *mapping = page->mapping;
65 struct inode *inode;
66 struct ceph_inode_info *ci;
67 int undo = 0;
68 struct ceph_snap_context *snapc;
69
70 if (unlikely(!mapping))
71 return !TestSetPageDirty(page);
72
73 if (TestSetPageDirty(page)) {
74 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
75 mapping->host, page, page->index);
76 return 0;
77 }
78
79 inode = mapping->host;
80 ci = ceph_inode(inode);
81
82 /*
83 * Note that we're grabbing a snapc ref here without holding
84 * any locks!
85 */
86 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
87
88 /* dirty the head */
89 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0)
91 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0)
94 igrab(inode);
95 ++ci->i_wrbuffer_ref;
96 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
97 "snapc %p seq %lld (%d snaps)\n",
98 mapping->host, page, page->index,
99 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
100 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
101 snapc, snapc->seq, snapc->num_snaps);
102 spin_unlock(&inode->i_lock);
103
104 /* now adjust page */
105 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page));
108
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY);
117
118 /*
119 * Reference snap context in page->private. Also set
120 * PagePrivate so that we get invalidatepage callback.
121 */
122 page->private = (unsigned long)snapc;
123 SetPagePrivate(page);
124 } else {
125 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
126 undo = 1;
127 }
128
129 spin_unlock_irq(&mapping->tree_lock);
130
131 if (undo)
132 /* whoops, we failed to dirty the page */
133 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
134
135 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
136
137 BUG_ON(!PageDirty(page));
138 return 1;
139}
140
141/*
142 * If we are truncating the full page (i.e. offset == 0), adjust the
143 * dirty page counters appropriately. Only called if there is private
144 * data on the page.
145 */
146static void ceph_invalidatepage(struct page *page, unsigned long offset)
147{
148 struct inode *inode;
149 struct ceph_inode_info *ci;
150 struct ceph_snap_context *snapc = (void *)page->private;
151
152 BUG_ON(!PageLocked(page));
153 BUG_ON(!page->private);
154 BUG_ON(!PagePrivate(page));
155 BUG_ON(!page->mapping);
156
157 inode = page->mapping->host;
158
159 /*
160 * We can get non-dirty pages here due to races between
161 * set_page_dirty and truncate_complete_page; just spit out a
162 * warning, in case we end up with accounting problems later.
163 */
164 if (!PageDirty(page))
165 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
166
167 if (offset == 0)
168 ClearPageChecked(page);
169
170 ci = ceph_inode(inode);
171 if (offset == 0) {
172 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
173 inode, page, page->index, offset);
174 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
175 ceph_put_snap_context(snapc);
176 page->private = 0;
177 ClearPagePrivate(page);
178 } else {
179 dout("%p invalidatepage %p idx %lu partial dirty page\n",
180 inode, page, page->index);
181 }
182}
183
184/* just a sanity check */
185static int ceph_releasepage(struct page *page, gfp_t g)
186{
187 struct inode *inode = page->mapping ? page->mapping->host : NULL;
188 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
189 WARN_ON(PageDirty(page));
190 WARN_ON(page->private);
191 WARN_ON(PagePrivate(page));
192 return 0;
193}
194
195/*
196 * read a single page, without unlocking it.
197 */
198static int readpage_nounlock(struct file *filp, struct page *page)
199{
200 struct inode *inode = filp->f_dentry->d_inode;
201 struct ceph_inode_info *ci = ceph_inode(inode);
202 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
203 int err = 0;
204 u64 len = PAGE_CACHE_SIZE;
205
206 dout("readpage inode %p file %p page %p index %lu\n",
207 inode, filp, page, page->index);
208 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
209 page->index << PAGE_CACHE_SHIFT, &len,
210 ci->i_truncate_seq, ci->i_truncate_size,
211 &page, 1);
212 if (err == -ENOENT)
213 err = 0;
214 if (err < 0) {
215 SetPageError(page);
216 goto out;
217 } else if (err < PAGE_CACHE_SIZE) {
218 /* zero fill remainder of page */
219 zero_user_segment(page, err, PAGE_CACHE_SIZE);
220 }
221 SetPageUptodate(page);
222
223out:
224 return err < 0 ? err : 0;
225}
226
227static int ceph_readpage(struct file *filp, struct page *page)
228{
229 int r = readpage_nounlock(filp, page);
230 unlock_page(page);
231 return r;
232}
233
234/*
235 * Build a vector of contiguous pages from the provided page list.
236 */
237static struct page **page_vector_from_list(struct list_head *page_list,
238 unsigned *nr_pages)
239{
240 struct page **pages;
241 struct page *page;
242 int next_index, contig_pages = 0;
243
244 /* build page vector */
245 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
246 if (!pages)
247 return ERR_PTR(-ENOMEM);
248
249 BUG_ON(list_empty(page_list));
250 next_index = list_entry(page_list->prev, struct page, lru)->index;
251 list_for_each_entry_reverse(page, page_list, lru) {
252 if (page->index == next_index) {
253 dout("readpages page %d %p\n", contig_pages, page);
254 pages[contig_pages] = page;
255 contig_pages++;
256 next_index++;
257 } else {
258 break;
259 }
260 }
261 *nr_pages = contig_pages;
262 return pages;
263}
264
265/*
266 * Read multiple pages. Leave pages we don't read + unlock in page_list;
267 * the caller (VM) cleans them up.
268 */
269static int ceph_readpages(struct file *file, struct address_space *mapping,
270 struct list_head *page_list, unsigned nr_pages)
271{
272 struct inode *inode = file->f_dentry->d_inode;
273 struct ceph_inode_info *ci = ceph_inode(inode);
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0;
276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset;
279 u64 len;
280
281 dout("readpages %p file %p nr_pages %d\n",
282 inode, file, nr_pages);
283
284 pages = page_vector_from_list(page_list, &nr_pages);
285 if (IS_ERR(pages))
286 return PTR_ERR(pages);
287
288 /* guess read extent */
289 offset = pages[0]->index << PAGE_CACHE_SHIFT;
290 len = nr_pages << PAGE_CACHE_SHIFT;
291 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
292 offset, &len,
293 ci->i_truncate_seq, ci->i_truncate_size,
294 pages, nr_pages);
295 if (rc == -ENOENT)
296 rc = 0;
297 if (rc < 0)
298 goto out;
299
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page =
305 list_entry(page_list->prev, struct page, lru);
306
307 list_del(&page->lru);
308
309 if (rc < (int)PAGE_CACHE_SIZE) {
310 /* zero (remainder of) page */
311 int s = rc < 0 ? 0 : rc;
312 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 }
314
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page);
319 continue;
320 }
321 dout("readpages %p adding %p idx %lu\n", inode, page,
322 page->index);
323 flush_dcache_page(page);
324 SetPageUptodate(page);
325 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0)
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0;
331
332out:
333 kfree(pages);
334 return rc;
335}
336
337/*
338 * Get ref for the oldest snapc for an inode with dirty data... that is, the
339 * only snap context we are allowed to write back.
340 */
341static struct ceph_snap_context *get_oldest_context(struct inode *inode,
342 u64 *snap_size)
343{
344 struct ceph_inode_info *ci = ceph_inode(inode);
345 struct ceph_snap_context *snapc = NULL;
346 struct ceph_cap_snap *capsnap = NULL;
347
348 spin_lock(&inode->i_lock);
349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
351 capsnap->context, capsnap->dirty_pages);
352 if (capsnap->dirty_pages) {
353 snapc = ceph_get_snap_context(capsnap->context);
354 if (snap_size)
355 *snap_size = capsnap->size;
356 break;
357 }
358 }
359 if (!snapc && ci->i_head_snapc) {
360 snapc = ceph_get_snap_context(ci->i_head_snapc);
361 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head);
363 }
364 spin_unlock(&inode->i_lock);
365 return snapc;
366}
367
368/*
369 * Write a single page, but leave the page locked.
370 *
371 * If we get a write error, set the page error bit, but still adjust the
372 * dirty page accounting (i.e., page is no longer dirty).
373 */
374static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
375{
376 struct inode *inode;
377 struct ceph_inode_info *ci;
378 struct ceph_client *client;
379 struct ceph_osd_client *osdc;
380 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
381 int len = PAGE_CACHE_SIZE;
382 loff_t i_size;
383 int err = 0;
384 struct ceph_snap_context *snapc, *oldest;
385 u64 snap_size = 0;
386 long writeback_stat;
387
388 dout("writepage %p idx %lu\n", page, page->index);
389
390 if (!page->mapping || !page->mapping->host) {
391 dout("writepage %p - no mapping\n", page);
392 return -EFAULT;
393 }
394 inode = page->mapping->host;
395 ci = ceph_inode(inode);
396 client = ceph_inode_to_client(inode);
397 osdc = &client->osdc;
398
399 /* verify this is a writeable snap context */
400 snapc = (void *)page->private;
401 if (snapc == NULL) {
402 dout("writepage %p page %p not dirty?\n", inode, page);
403 goto out;
404 }
405 oldest = get_oldest_context(inode, &snap_size);
406 if (snapc->seq > oldest->seq) {
407 dout("writepage %p page %p snapc %p not writeable - noop\n",
408 inode, page, (void *)page->private);
409 /* we should only noop if called by kswapd */
410 WARN_ON((current->flags & PF_MEMALLOC) == 0);
411 ceph_put_snap_context(oldest);
412 goto out;
413 }
414 ceph_put_snap_context(oldest);
415
416 /* is this a partial page at end of file? */
417 if (snap_size)
418 i_size = snap_size;
419 else
420 i_size = i_size_read(inode);
421 if (i_size < page_off + len)
422 len = i_size - page_off;
423
424 dout("writepage %p page %p index %lu on %llu~%u\n",
425 inode, page, page->index, page_off, len);
426
427 writeback_stat = atomic_long_inc_return(&client->writeback_count);
428 if (writeback_stat >
429 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
430 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
431
432 set_page_writeback(page);
433 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
434 &ci->i_layout, snapc,
435 page_off, len,
436 ci->i_truncate_seq, ci->i_truncate_size,
437 &inode->i_mtime,
438 &page, 1, 0, 0, true);
439 if (err < 0) {
440 dout("writepage setting page/mapping error %d %p\n", err, page);
441 SetPageError(page);
442 mapping_set_error(&inode->i_data, err);
443 if (wbc)
444 wbc->pages_skipped++;
445 } else {
446 dout("writepage cleaned page %p\n", page);
447 err = 0; /* vfs expects us to return 0 */
448 }
449 page->private = 0;
450 ClearPagePrivate(page);
451 end_page_writeback(page);
452 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
453 ceph_put_snap_context(snapc); /* page's reference */
454out:
455 return err;
456}
457
458static int ceph_writepage(struct page *page, struct writeback_control *wbc)
459{
460 int err;
461 struct inode *inode = page->mapping->host;
462 BUG_ON(!inode);
463 igrab(inode);
464 err = writepage_nounlock(page, wbc);
465 unlock_page(page);
466 iput(inode);
467 return err;
468}
469
470
471/*
472 * lame release_pages helper. release_pages() isn't exported to
473 * modules.
474 */
475static void ceph_release_pages(struct page **pages, int num)
476{
477 struct pagevec pvec;
478 int i;
479
480 pagevec_init(&pvec, 0);
481 for (i = 0; i < num; i++) {
482 if (pagevec_add(&pvec, pages[i]) == 0)
483 pagevec_release(&pvec);
484 }
485 pagevec_release(&pvec);
486}
487
488
489/*
490 * async writeback completion handler.
491 *
492 * If we get an error, set the mapping error bit, but not the individual
493 * page error bits.
494 */
495static void writepages_finish(struct ceph_osd_request *req,
496 struct ceph_msg *msg)
497{
498 struct inode *inode = req->r_inode;
499 struct ceph_osd_reply_head *replyhead;
500 struct ceph_osd_op *op;
501 struct ceph_inode_info *ci = ceph_inode(inode);
502 unsigned wrote;
503 struct page *page;
504 int i;
505 struct ceph_snap_context *snapc = req->r_snapc;
506 struct address_space *mapping = inode->i_mapping;
507 struct writeback_control *wbc = req->r_wbc;
508 __s32 rc = -EIO;
509 u64 bytes = 0;
510 struct ceph_client *client = ceph_inode_to_client(inode);
511 long writeback_stat;
512 unsigned issued = __ceph_caps_issued(ci, NULL);
513
514 /* parse reply */
515 replyhead = msg->front.iov_base;
516 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
517 op = (void *)(replyhead + 1);
518 rc = le32_to_cpu(replyhead->result);
519 bytes = le64_to_cpu(op->extent.length);
520
521 if (rc >= 0) {
522 /*
523 * Assume we wrote the pages we originally sent. The
524 * osd might reply with fewer pages if our writeback
525 * raced with a truncation and was adjusted at the osd,
526 * so don't believe the reply.
527 */
528 wrote = req->r_num_pages;
529 } else {
530 wrote = 0;
531 mapping_set_error(mapping, rc);
532 }
533 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
534 inode, rc, bytes, wrote);
535
536 /* clean all pages */
537 for (i = 0; i < req->r_num_pages; i++) {
538 page = req->r_pages[i];
539 BUG_ON(!page);
540 WARN_ON(!PageUptodate(page));
541
542 writeback_stat =
543 atomic_long_dec_return(&client->writeback_count);
544 if (writeback_stat <
545 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
546 clear_bdi_congested(&client->backing_dev_info,
547 BLK_RW_ASYNC);
548
549 if (i >= wrote) {
550 dout("inode %p skipping page %p\n", inode, page);
551 wbc->pages_skipped++;
552 }
553 ceph_put_snap_context((void *)page->private);
554 page->private = 0;
555 ClearPagePrivate(page);
556 dout("unlocking %d %p\n", i, page);
557 end_page_writeback(page);
558
559 /*
560 * We lost the cache cap, need to truncate the page before
561 * it is unlocked, otherwise we'd truncate it later in the
562 * page truncation thread, possibly losing some data that
563 * raced its way in
564 */
565 if ((issued & CEPH_CAP_FILE_CACHE) == 0)
566 generic_error_remove_page(inode->i_mapping, page);
567
568 unlock_page(page);
569 }
570 dout("%p wrote+cleaned %d pages\n", inode, wrote);
571 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
572
573 ceph_release_pages(req->r_pages, req->r_num_pages);
574 if (req->r_pages_from_pool)
575 mempool_free(req->r_pages,
576 ceph_client(inode->i_sb)->wb_pagevec_pool);
577 else
578 kfree(req->r_pages);
579 ceph_osdc_put_request(req);
580}
581
582/*
583 * allocate a page vec, either directly, or if necessary, via the
584 * mempool. we avoid the mempool if we can because req->r_num_pages
585 * may be less than the maximum write size.
586 */
587static void alloc_page_vec(struct ceph_client *client,
588 struct ceph_osd_request *req)
589{
590 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
591 GFP_NOFS);
592 if (!req->r_pages) {
593 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
594 req->r_pages_from_pool = 1;
595 WARN_ON(!req->r_pages);
596 }
597}
598
599/*
600 * initiate async writeback
601 */
602static int ceph_writepages_start(struct address_space *mapping,
603 struct writeback_control *wbc)
604{
605 struct inode *inode = mapping->host;
606 struct backing_dev_info *bdi = mapping->backing_dev_info;
607 struct ceph_inode_info *ci = ceph_inode(inode);
608 struct ceph_client *client;
609 pgoff_t index, start, end;
610 int range_whole = 0;
611 int should_loop = 1;
612 pgoff_t max_pages = 0, max_pages_ever = 0;
613 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
614 struct pagevec pvec;
615 int done = 0;
616 int rc = 0;
617 unsigned wsize = 1 << inode->i_blkbits;
618 struct ceph_osd_request *req = NULL;
619 int do_sync;
620 u64 snap_size = 0;
621
622 /*
623 * Include a 'sync' in the OSD request if this is a data
624 * integrity write (e.g., O_SYNC write or fsync()), or if our
625 * cap is being revoked.
626 */
627 do_sync = wbc->sync_mode == WB_SYNC_ALL;
628 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
629 do_sync = 1;
630 dout("writepages_start %p dosync=%d (mode=%s)\n",
631 inode, do_sync,
632 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
633 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
634
635 client = ceph_inode_to_client(inode);
636 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
637 pr_warning("writepage_start %p on forced umount\n", inode);
638 return -EIO; /* we're in a forced umount, don't write! */
639 }
640 if (client->mount_args->wsize && client->mount_args->wsize < wsize)
641 wsize = client->mount_args->wsize;
642 if (wsize < PAGE_CACHE_SIZE)
643 wsize = PAGE_CACHE_SIZE;
644 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
645
646 pagevec_init(&pvec, 0);
647
648 /* ?? */
649 if (wbc->nonblocking && bdi_write_congested(bdi)) {
650 dout(" writepages congested\n");
651 wbc->encountered_congestion = 1;
652 goto out_final;
653 }
654
655 /* where to start/end? */
656 if (wbc->range_cyclic) {
657 start = mapping->writeback_index; /* Start from prev offset */
658 end = -1;
659 dout(" cyclic, start at %lu\n", start);
660 } else {
661 start = wbc->range_start >> PAGE_CACHE_SHIFT;
662 end = wbc->range_end >> PAGE_CACHE_SHIFT;
663 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
664 range_whole = 1;
665 should_loop = 0;
666 dout(" not cyclic, %lu to %lu\n", start, end);
667 }
668 index = start;
669
670retry:
671 /* find oldest snap context with dirty data */
672 ceph_put_snap_context(snapc);
673 snapc = get_oldest_context(inode, &snap_size);
674 if (!snapc) {
675 /* hmm, why does writepages get called when there
676 is no dirty data? */
677 dout(" no snap context with dirty data?\n");
678 goto out;
679 }
680 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
681 snapc, snapc->seq, snapc->num_snaps);
682 if (last_snapc && snapc != last_snapc) {
683 /* if we switched to a newer snapc, restart our scan at the
684 * start of the original file range. */
685 dout(" snapc differs from last pass, restarting at %lu\n",
686 index);
687 index = start;
688 }
689 last_snapc = snapc;
690
691 while (!done && index <= end) {
692 unsigned i;
693 int first;
694 pgoff_t next;
695 int pvec_pages, locked_pages;
696 struct page *page;
697 int want;
698 u64 offset, len;
699 struct ceph_osd_request_head *reqhead;
700 struct ceph_osd_op *op;
701 long writeback_stat;
702
703 next = 0;
704 locked_pages = 0;
705 max_pages = max_pages_ever;
706
707get_more_pages:
708 first = -1;
709 want = min(end - index,
710 min((pgoff_t)PAGEVEC_SIZE,
711 max_pages - (pgoff_t)locked_pages) - 1)
712 + 1;
713 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
714 PAGECACHE_TAG_DIRTY,
715 want);
716 dout("pagevec_lookup_tag got %d\n", pvec_pages);
717 if (!pvec_pages && !locked_pages)
718 break;
719 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
720 page = pvec.pages[i];
721 dout("? %p idx %lu\n", page, page->index);
722 if (locked_pages == 0)
723 lock_page(page); /* first page */
724 else if (!trylock_page(page))
725 break;
726
727 /* only dirty pages, or our accounting breaks */
728 if (unlikely(!PageDirty(page)) ||
729 unlikely(page->mapping != mapping)) {
730 dout("!dirty or !mapping %p\n", page);
731 unlock_page(page);
732 break;
733 }
734 if (!wbc->range_cyclic && page->index > end) {
735 dout("end of range %p\n", page);
736 done = 1;
737 unlock_page(page);
738 break;
739 }
740 if (next && (page->index != next)) {
741 dout("not consecutive %p\n", page);
742 unlock_page(page);
743 break;
744 }
745 if (wbc->sync_mode != WB_SYNC_NONE) {
746 dout("waiting on writeback %p\n", page);
747 wait_on_page_writeback(page);
748 }
749 if ((snap_size && page_offset(page) > snap_size) ||
750 (!snap_size &&
751 page_offset(page) > i_size_read(inode))) {
752 dout("%p page eof %llu\n", page, snap_size ?
753 snap_size : i_size_read(inode));
754 done = 1;
755 unlock_page(page);
756 break;
757 }
758 if (PageWriteback(page)) {
759 dout("%p under writeback\n", page);
760 unlock_page(page);
761 break;
762 }
763
764 /* only if matching snap context */
765 pgsnapc = (void *)page->private;
766 if (pgsnapc->seq > snapc->seq) {
767 dout("page snapc %p %lld > oldest %p %lld\n",
768 pgsnapc, pgsnapc->seq, snapc, snapc->seq);
769 unlock_page(page);
770 if (!locked_pages)
771 continue; /* keep looking for snap */
772 break;
773 }
774
775 if (!clear_page_dirty_for_io(page)) {
776 dout("%p !clear_page_dirty_for_io\n", page);
777 unlock_page(page);
778 break;
779 }
780
781 /* ok */
782 if (locked_pages == 0) {
783 /* prepare async write request */
784 offset = page->index << PAGE_CACHE_SHIFT;
785 len = wsize;
786 req = ceph_osdc_new_request(&client->osdc,
787 &ci->i_layout,
788 ceph_vino(inode),
789 offset, &len,
790 CEPH_OSD_OP_WRITE,
791 CEPH_OSD_FLAG_WRITE |
792 CEPH_OSD_FLAG_ONDISK,
793 snapc, do_sync,
794 ci->i_truncate_seq,
795 ci->i_truncate_size,
796 &inode->i_mtime, true, 1);
797 max_pages = req->r_num_pages;
798
799 alloc_page_vec(client, req);
800 req->r_callback = writepages_finish;
801 req->r_inode = inode;
802 req->r_wbc = wbc;
803 }
804
805 /* note position of first page in pvec */
806 if (first < 0)
807 first = i;
808 dout("%p will write page %p idx %lu\n",
809 inode, page, page->index);
810
811 writeback_stat = atomic_long_inc_return(&client->writeback_count);
812 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
813 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
814 }
815
816 set_page_writeback(page);
817 req->r_pages[locked_pages] = page;
818 locked_pages++;
819 next = page->index + 1;
820 }
821
822 /* did we get anything? */
823 if (!locked_pages)
824 goto release_pvec_pages;
825 if (i) {
826 int j;
827 BUG_ON(!locked_pages || first < 0);
828
829 if (pvec_pages && i == pvec_pages &&
830 locked_pages < max_pages) {
831 dout("reached end pvec, trying for more\n");
832 pagevec_reinit(&pvec);
833 goto get_more_pages;
834 }
835
836 /* shift unused pages over in the pvec... we
837 * will need to release them below. */
838 for (j = i; j < pvec_pages; j++) {
839 dout(" pvec leftover page %p\n",
840 pvec.pages[j]);
841 pvec.pages[j-i+first] = pvec.pages[j];
842 }
843 pvec.nr -= i-first;
844 }
845
846 /* submit the write */
847 offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
848 len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
849 (u64)locked_pages << PAGE_CACHE_SHIFT);
850 dout("writepages got %d pages at %llu~%llu\n",
851 locked_pages, offset, len);
852
853 /* revise final length, page count */
854 req->r_num_pages = locked_pages;
855 reqhead = req->r_request->front.iov_base;
856 op = (void *)(reqhead + 1);
857 op->extent.length = cpu_to_le64(len);
858 op->payload_len = cpu_to_le32(len);
859 req->r_request->hdr.data_len = cpu_to_le32(len);
860
861 ceph_osdc_start_request(&client->osdc, req, true);
862 req = NULL;
863
864 /* continue? */
865 index = next;
866 wbc->nr_to_write -= locked_pages;
867 if (wbc->nr_to_write <= 0)
868 done = 1;
869
870release_pvec_pages:
871 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
872 pvec.nr ? pvec.pages[0] : NULL);
873 pagevec_release(&pvec);
874
875 if (locked_pages && !done)
876 goto retry;
877 }
878
879 if (should_loop && !done) {
880 /* more to do; loop back to beginning of file */
881 dout("writepages looping back to beginning of file\n");
882 should_loop = 0;
883 index = 0;
884 goto retry;
885 }
886
887 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
888 mapping->writeback_index = index;
889
890out:
891 if (req)
892 ceph_osdc_put_request(req);
893 if (rc > 0)
894 rc = 0; /* vfs expects us to return 0 */
895 ceph_put_snap_context(snapc);
896 dout("writepages done, rc = %d\n", rc);
897out_final:
898 return rc;
899}
900
901
902
903/*
904 * See if a given @snapc is either writeable, or already written.
905 */
906static int context_is_writeable_or_written(struct inode *inode,
907 struct ceph_snap_context *snapc)
908{
909 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
910 int ret = !oldest || snapc->seq <= oldest->seq;
911
912 ceph_put_snap_context(oldest);
913 return ret;
914}
915
916/*
917 * We are only allowed to write into/dirty the page if the page is
918 * clean, or already dirty within the same snap context.
919 *
920 * called with page locked.
921 * return success with page locked,
922 * or any failure (incl -EAGAIN) with page unlocked.
923 */
924static int ceph_update_writeable_page(struct file *file,
925 loff_t pos, unsigned len,
926 struct page *page)
927{
928 struct inode *inode = file->f_dentry->d_inode;
929 struct ceph_inode_info *ci = ceph_inode(inode);
930 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
931 loff_t page_off = pos & PAGE_CACHE_MASK;
932 int pos_in_page = pos & ~PAGE_CACHE_MASK;
933 int end_in_page = pos_in_page + len;
934 loff_t i_size;
935 int r;
936 struct ceph_snap_context *snapc, *oldest;
937
938retry_locked:
939 /* writepages currently holds the page lock; revisit this wait if that changes */
940 wait_on_page_writeback(page);
941
942 /* check snap context */
943 BUG_ON(!ci->i_snap_realm);
944 down_read(&mdsc->snap_rwsem);
945 BUG_ON(!ci->i_snap_realm->cached_context);
946 snapc = (void *)page->private;
947 if (snapc && snapc != ci->i_head_snapc) {
948 /*
949 * this page is already dirty in another (older) snap
950 * context! is it writeable now?
951 */
952 oldest = get_oldest_context(inode, NULL);
953 up_read(&mdsc->snap_rwsem);
954
955 if (snapc->seq > oldest->seq) {
956 ceph_put_snap_context(oldest);
957 dout(" page %p snapc %p not current or oldest\n",
958 page, snapc);
959 /*
960 * queue for writeback, and wait for snapc to
961 * be writeable or written
962 */
963 snapc = ceph_get_snap_context(snapc);
964 unlock_page(page);
965 ceph_queue_writeback(inode);
966 r = wait_event_interruptible(ci->i_cap_wq,
967 context_is_writeable_or_written(inode, snapc));
968 ceph_put_snap_context(snapc);
969 if (r == -ERESTARTSYS)
970 return r;
971 return -EAGAIN;
972 }
973 ceph_put_snap_context(oldest);
974
975 /* yay, writeable, do it now (without dropping page lock) */
976 dout(" page %p snapc %p not current, but oldest\n",
977 page, snapc);
978 if (!clear_page_dirty_for_io(page))
979 goto retry_locked;
980 r = writepage_nounlock(page, NULL);
981 if (r < 0)
982 goto fail_nosnap;
983 goto retry_locked;
984 }
985
986 if (PageUptodate(page)) {
987 dout(" page %p already uptodate\n", page);
988 return 0;
989 }
990
991 /* full page? */
992 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
993 return 0;
994
995 /* past end of file? */
996 i_size = inode->i_size; /* caller holds i_mutex */
997
998 if (i_size + len > inode->i_sb->s_maxbytes) {
999 /* file is too big */
1000 r = -EINVAL;
1001 goto fail;
1002 }
1003
1004 if (page_off >= i_size ||
1005 (pos_in_page == 0 && (pos+len) >= i_size &&
1006 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1007 dout(" zeroing %p 0 - %d and %d - %d\n",
1008 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1009 zero_user_segments(page,
1010 0, pos_in_page,
1011 end_in_page, PAGE_CACHE_SIZE);
1012 return 0;
1013 }
1014
1015 /* we need to read it. */
1016 up_read(&mdsc->snap_rwsem);
1017 r = readpage_nounlock(file, page);
1018 if (r < 0)
1019 goto fail_nosnap;
1020 goto retry_locked;
1021
1022fail:
1023 up_read(&mdsc->snap_rwsem);
1024fail_nosnap:
1025 unlock_page(page);
1026 return r;
1027}
1028
1029/*
1030 * We are only allowed to write into/dirty the page if the page is
1031 * clean, or already dirty within the same snap context.
1032 */
1033static int ceph_write_begin(struct file *file, struct address_space *mapping,
1034 loff_t pos, unsigned len, unsigned flags,
1035 struct page **pagep, void **fsdata)
1036{
1037 struct inode *inode = file->f_dentry->d_inode;
1038 struct page *page;
1039 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1040 int r;
1041
1042 do {
1043 /* get a page */
1044 page = grab_cache_page_write_begin(mapping, index, 0);
1045 if (!page)
1046 return -ENOMEM;
1047 *pagep = page;
1048
1049 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1050 inode, page, (int)pos, (int)len);
1051
1052 r = ceph_update_writeable_page(file, pos, len, page);
1053 } while (r == -EAGAIN);
1054
1055 return r;
1056}
1057
1058/*
1059 * we don't do anything in here that simple_write_end doesn't do
1060 * except adjust dirty page accounting and drop the read lock on
1061 * mdsc->snap_rwsem.
1062 */
1063static int ceph_write_end(struct file *file, struct address_space *mapping,
1064 loff_t pos, unsigned len, unsigned copied,
1065 struct page *page, void *fsdata)
1066{
1067 struct inode *inode = file->f_dentry->d_inode;
1068 struct ceph_client *client = ceph_inode_to_client(inode);
1069 struct ceph_mds_client *mdsc = &client->mdsc;
1070 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1071 int check_cap = 0;
1072
1073 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1074 inode, page, (int)pos, (int)copied, (int)len);
1075
1076 /* zero the stale part of the page if we did a short copy */
1077 if (copied < len)
1078 zero_user_segment(page, from+copied, len);
1079
1080 /* did file size increase? */
1081 /* no need for i_size_read(); the caller holds i_mutex */
1082 if (pos+copied > inode->i_size)
1083 check_cap = ceph_inode_set_size(inode, pos+copied);
1084
1085 if (!PageUptodate(page))
1086 SetPageUptodate(page);
1087
1088 set_page_dirty(page);
1089
1090 unlock_page(page);
1091 up_read(&mdsc->snap_rwsem);
1092 page_cache_release(page);
1093
1094 if (check_cap)
1095 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1096
1097 return copied;
1098}
1099
1100/*
1101 * we set .direct_IO to indicate direct io is supported, but since we
1102 * intercept O_DIRECT reads and writes early, this function should
1103 * never get called.
1104 */
1105static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1106 const struct iovec *iov,
1107 loff_t pos, unsigned long nr_segs)
1108{
1109 WARN_ON(1);
1110 return -EINVAL;
1111}
1112
1113const struct address_space_operations ceph_aops = {
1114 .readpage = ceph_readpage,
1115 .readpages = ceph_readpages,
1116 .writepage = ceph_writepage,
1117 .writepages = ceph_writepages_start,
1118 .write_begin = ceph_write_begin,
1119 .write_end = ceph_write_end,
1120 .set_page_dirty = ceph_set_page_dirty,
1121 .invalidatepage = ceph_invalidatepage,
1122 .releasepage = ceph_releasepage,
1123 .direct_IO = ceph_direct_io,
1124};
1125
1126
1127/*
1128 * vm ops
1129 */
1130
1131/*
1132 * Reuse write_begin here for simplicity.
1133 */
1134static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1135{
1136 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1137 struct page *page = vmf->page;
1138 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1139 loff_t off = page->index << PAGE_CACHE_SHIFT;
1140 loff_t size, len;
1141 int ret;
1142
1143 size = i_size_read(inode);
1144 if (off + PAGE_CACHE_SIZE <= size)
1145 len = PAGE_CACHE_SIZE;
1146 else
1147 len = size & ~PAGE_CACHE_MASK;
1148
1149 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
1150 off, len, page, page->index);
1151
1152 lock_page(page);
1153
1154 ret = VM_FAULT_NOPAGE;
1155 if ((off > size) ||
1156 (page->mapping != inode->i_mapping))
1157 goto out;
1158
1159 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1160 if (ret == 0) {
1161 /* success. we'll keep the page locked. */
1162 set_page_dirty(page);
1163 up_read(&mdsc->snap_rwsem);
1164 ret = VM_FAULT_LOCKED;
1165 } else {
1166 if (ret == -ENOMEM)
1167 ret = VM_FAULT_OOM;
1168 else
1169 ret = VM_FAULT_SIGBUS;
1170 }
1171out:
1172 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
1173 if (ret != VM_FAULT_LOCKED)
1174 unlock_page(page);
1175 return ret;
1176}
1177
1178static struct vm_operations_struct ceph_vmops = {
1179 .fault = filemap_fault,
1180 .page_mkwrite = ceph_page_mkwrite,
1181};
1182
1183int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1184{
1185 struct address_space *mapping = file->f_mapping;
1186
1187 if (!mapping->a_ops->readpage)
1188 return -ENOEXEC;
1189 file_accessed(file);
1190 vma->vm_ops = &ceph_vmops;
1191 vma->vm_flags |= VM_CAN_NONLINEAR;
1192 return 0;
1193}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
1
2#include <linux/errno.h>
3
4/*
5 * base64 encode/decode.
6 */
7
8const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9
10static int encode_bits(int c)
11{
12 return pem_key[c];
13}
14
15static int decode_bits(char c)
16{
17 if (c >= 'A' && c <= 'Z')
18 return c - 'A';
19 if (c >= 'a' && c <= 'z')
20 return c - 'a' + 26;
21 if (c >= '0' && c <= '9')
22 return c - '0' + 52;
23 if (c == '+')
24 return 62;
25 if (c == '/')
26 return 63;
27 if (c == '=')
28 return 0; /* just non-negative, please */
29 return -EINVAL;
30}
31
32int ceph_armor(char *dst, const char *src, const char *end)
33{
34 int olen = 0;
35 int line = 0;
36
37 while (src < end) {
38 unsigned char a, b, c;
39
40 a = *src++;
41 *dst++ = encode_bits(a >> 2);
42 if (src < end) {
43 b = *src++;
44 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
45 if (src < end) {
46 c = *src++;
47 *dst++ = encode_bits(((b & 15) << 2) |
48 (c >> 6));
49 *dst++ = encode_bits(c & 63);
50 } else {
51 *dst++ = encode_bits((b & 15) << 2);
52 *dst++ = '=';
53 }
54 } else {
55 *dst++ = encode_bits(((a & 3) << 4));
56 *dst++ = '=';
57 *dst++ = '=';
58 }
59 olen += 4;
60 line += 4;
61 if (line == 64) {
62 line = 0;
63 *(dst++) = '\n';
64 olen++;
65 }
66 }
67 return olen;
68}
69
70int ceph_unarmor(char *dst, const char *src, const char *end)
71{
72 int olen = 0;
73
74 while (src < end) {
75 int a, b, c, d;
76
77 if (src < end && src[0] == '\n')
78 src++;
79 if (src + 4 > end)
80 return -EINVAL;
81 a = decode_bits(src[0]);
82 b = decode_bits(src[1]);
83 c = decode_bits(src[2]);
84 d = decode_bits(src[3]);
85 if (a < 0 || b < 0 || c < 0 || d < 0)
86 return -EINVAL;
87
88 *dst++ = (a << 2) | (b >> 4);
89 if (src[2] == '=')
90 return olen + 1;
91 *dst++ = ((b & 15) << 4) | (c >> 2);
92 if (src[3] == '=')
93 return olen + 2;
94 *dst++ = ((c & 3) << 6) | d;
95 olen += 3;
96 src += 4;
97 }
98 return olen;
99}
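
For reference, a minimal round-trip sketch of the two helpers above (not part of this patch; the function name is hypothetical, and the buffer sizing follows the 4-chars-per-3-bytes expansion plus a newline every 64 output characters described by the encoder):

/* not part of this patch: illustrative round-trip of ceph_armor/ceph_unarmor */
static int armor_roundtrip_example(void)
{
	const char src[] = "0123456789abcdef";      /* 16 payload bytes */
	char armored[32];                           /* 4 * ceil(16/3) = 24 chars */
	char plain[16];
	int alen, plen;

	alen = ceph_armor(armored, src, src + 16);  /* 24; under 64, so no '\n' */
	plen = ceph_unarmor(plain, armored, armored + alen);
	if (plen < 0)
		return plen;                        /* -EINVAL on malformed input */
	/* on success plen == 16 and plain[] matches src[] */
	return 0;
}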
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..f6394b94b866
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,258 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h>
6
7#include "types.h"
8#include "auth_none.h"
9#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12
13#include "messenger.h"
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
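/*
 * (Annotation, not in the original patch.)  Transcribing the code below,
 * the hello message is laid out as:
 *
 *   struct ceph_mon_request_header
 *   __le32 0                   - no protocol selected yet
 *   __le32 payload length
 *   __u8   1
 *   __le32 count, then one __le32 per supported protocol
 *   entity name: __le32 type, __le32 len, name bytes
 *   __le64 global_id
 */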
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
135
136int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building request\n", ret);
153 return ret;
154 }
155 dout(" built request %d bytes\n", ret);
156 ceph_encode_32(&p, ret);
157 return p + ret - msg_buf;
158}
159
160/*
161 * Handle auth message from monitor.
162 */
163int ceph_handle_auth_reply(struct ceph_auth_client *ac,
164 void *buf, size_t len,
165 void *reply_buf, size_t reply_len)
166{
167 void *p = buf;
168 void *end = buf + len;
169 int protocol;
170 s32 result;
171 u64 global_id;
172 void *payload, *payload_end;
173 int payload_len;
174 char *result_msg;
175 int result_msg_len;
176 int ret = -EINVAL;
177
178 dout("handle_auth_reply %p %p\n", p, end);
179 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
180 protocol = ceph_decode_32(&p);
181 result = ceph_decode_32(&p);
182 global_id = ceph_decode_64(&p);
183 payload_len = ceph_decode_32(&p);
184 payload = p;
185 p += payload_len;
186 ceph_decode_need(&p, end, sizeof(u32), bad);
187 result_msg_len = ceph_decode_32(&p);
188 result_msg = p;
189 p += result_msg_len;
190 if (p != end)
191 goto bad;
192
193 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
194 result_msg, global_id, payload_len);
195
196 payload_end = payload + payload_len;
197
198 if (global_id && ac->global_id != global_id) {
199 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
200 ac->global_id = global_id;
201 }
202
203 if (ac->negotiating) {
204 /* server does not support our protocols? */
205 if (!protocol && result < 0) {
206 ret = result;
207 goto out;
208 }
209 /* set up (new) protocol handler? */
210 if (ac->protocol && ac->protocol != protocol) {
211 ac->ops->destroy(ac);
212 ac->protocol = 0;
213 ac->ops = NULL;
214 }
215 if (ac->protocol != protocol) {
216 ret = ceph_auth_init_protocol(ac, protocol);
217 if (ret) {
218 pr_err("error %d on auth protocol %d init\n",
219 ret, protocol);
220 goto out;
221 }
222 }
223
224 ac->negotiating = false;
225 }
226
227 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
228 if (ret == -EAGAIN) {
229 return ceph_build_auth_request(ac, reply_buf, reply_len);
230 } else if (ret) {
231 pr_err("authentication error %d\n", ret);
232 return ret;
233 }
234 return 0;
235
236bad:
237 pr_err("failed to decode auth msg\n");
238out:
239 return ret;
240}
241
242int ceph_build_auth(struct ceph_auth_client *ac,
243 void *msg_buf, size_t msg_len)
244{
245 if (!ac->protocol)
246 return ceph_auth_build_hello(ac, msg_buf, msg_len);
247 BUG_ON(!ac->ops);
248 if (!ac->ops->is_authenticated(ac))
249 return ceph_build_auth_request(ac, msg_buf, msg_len);
250 return 0;
251}
252
253int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
254{
255 if (!ac->ops)
256 return 0;
257 return ac->ops->is_authenticated(ac);
258}
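
To see how these entry points compose, here is a hedged sketch (not part of this patch) of the negotiation loop a monitor client could run; send_to_mon() and recv_from_mon() are hypothetical transport stand-ins, while the ceph_* calls are the ones defined in this file:

static int auth_negotiate_sketch(struct ceph_auth_client *ac,
				 void *req, size_t req_len,
				 void *reply, size_t reply_len)
{
	int len = ceph_auth_build_hello(ac, req, req_len);

	while (len > 0) {
		send_to_mon(req, len);                  /* hypothetical */
		len = recv_from_mon(reply, reply_len);  /* hypothetical */
		if (len < 0)
			return len;
		/* 0 = done; >0 = follow-up request was placed in req */
		len = ceph_handle_auth_reply(ac, reply, len, req, req_len);
	}
	return len;
}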
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..ca4f57cfb267
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 /*
19 * true if we are authenticated and can connect to
20 * services.
21 */
22 int (*is_authenticated)(struct ceph_auth_client *ac);
23
24 /*
25 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request.
28 */
29 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
30 int (*handle_reply)(struct ceph_auth_client *ac, int result,
31 void *buf, void *end);
32
33 /*
34 * Create authorizer for connecting to a service, and verify
35 * the response to authenticate the service.
36 */
37 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
38 struct ceph_authorizer **a,
39 void **buf, size_t *len,
40 void **reply_buf, size_t *reply_len);
41 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
42 struct ceph_authorizer *a, size_t len);
43 void (*destroy_authorizer)(struct ceph_auth_client *ac,
44 struct ceph_authorizer *a);
45 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
46 int peer_type);
47
48 /* reset when we (re)connect to a monitor */
49 void (*reset)(struct ceph_auth_client *ac);
50
51 void (*destroy)(struct ceph_auth_client *ac);
52};
53
54struct ceph_auth_client {
55 u32 protocol; /* CEPH_AUTH_* */
56 void *private; /* for use by protocol implementation */
57 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
58
59 bool negotiating; /* true if negotiating protocol */
60 const char *name; /* entity name */
61 u64 global_id; /* our unique id in system */
62 const char *secret; /* our secret key */
63 unsigned want_keys; /* which services we want */
64};
65
66extern struct ceph_auth_client *ceph_auth_init(const char *name,
67 const char *secret);
68extern void ceph_auth_destroy(struct ceph_auth_client *ac);
69
70extern void ceph_auth_reset(struct ceph_auth_client *ac);
71
72extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
73 void *buf, size_t len);
74extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
75 void *buf, size_t len,
76 void *reply_buf, size_t reply_len);
77extern int ceph_entity_name_encode(const char *name, void **p, void *end);
78
79extern int ceph_build_auth(struct ceph_auth_client *ac,
80 void *msg_buf, size_t msg_len);
81
82extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
83
84#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..8cd9e3af07f7
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,122 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34/*
35 * the generic auth code decodes the global_id, and we carry no actual
36 * authentication state, so nothing happens here.
37 */
38static int handle_reply(struct ceph_auth_client *ac, int result,
39 void *buf, void *end)
40{
41 struct ceph_auth_none_info *xi = ac->private;
42
43 xi->starting = false;
44 return result;
45}
46
47/*
48 * build an 'authorizer' with our entity_name and global_id. we can
49 * reuse a single static copy since it is identical for all services
50 * we connect to.
51 */
52static int ceph_auth_none_create_authorizer(
53 struct ceph_auth_client *ac, int peer_type,
54 struct ceph_authorizer **a,
55 void **buf, size_t *len,
56 void **reply_buf, size_t *reply_len)
57{
58 struct ceph_auth_none_info *ai = ac->private;
59 struct ceph_none_authorizer *au = &ai->au;
60 void *p, *end;
61 int ret;
62
63 if (!ai->built_authorizer) {
64 p = au->buf;
65 end = p + sizeof(au->buf);
66 ceph_encode_8(&p, 1);
67 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
68 if (ret < 0)
69 goto bad;
70 ceph_decode_need(&p, end, sizeof(u64), bad2);
71 ceph_encode_64(&p, ac->global_id);
72 au->buf_len = p - (void *)au->buf;
73 ai->built_authorizer = true;
74 dout("built authorizer len %d\n", au->buf_len);
75 }
76
77 *a = (struct ceph_authorizer *)au;
78 *buf = au->buf;
79 *len = au->buf_len;
80 *reply_buf = au->reply_buf;
81 *reply_len = sizeof(au->reply_buf);
82 return 0;
83
84bad2:
85 ret = -ERANGE;
86bad:
87 return ret;
88}
89
90static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
91 struct ceph_authorizer *a)
92{
93 /* nothing to do */
94}
95
96static const struct ceph_auth_client_ops ceph_auth_none_ops = {
97 .reset = reset,
98 .destroy = destroy,
99 .is_authenticated = is_authenticated,
100 .handle_reply = handle_reply,
101 .create_authorizer = ceph_auth_none_create_authorizer,
102 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
103};
104
105int ceph_auth_none_init(struct ceph_auth_client *ac)
106{
107 struct ceph_auth_none_info *xi;
108
109 dout("ceph_auth_none_init %p\n", ac);
110 xi = kzalloc(sizeof(*xi), GFP_NOFS);
111 if (!xi)
112 return -ENOMEM;
113
114 xi->starting = true;
115 xi->built_authorizer = false;
116
117 ac->protocol = CEPH_AUTH_NONE;
118 ac->private = xi;
119 ac->ops = &ceph_auth_none_ops;
120 return 0;
121}
122
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..56c05533a31c
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include "auth.h"
5
6/*
7 * null security mode.
8 *
9 * we use a single static authorizer that simply encodes our entity name
10 * and global id.
11 */
12
13struct ceph_none_authorizer {
14 char buf[128];
15 int buf_len;
16 char reply_buf[0];
17};
18
19struct ceph_auth_none_info {
20 bool starting;
21 bool built_authorizer;
22 struct ceph_none_authorizer au; /* we only need one; it's static */
23};
24
25extern int ceph_auth_none_init(struct ceph_auth_client *ac);
26
27#endif
28
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..d9001a4dc8cc
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,680 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15struct kmem_cache *ceph_x_ticketbuf_cachep;
16
17#define TEMP_TICKET_BUF_LEN 256
18
19static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
20
21static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
22{
23 struct ceph_x_info *xi = ac->private;
24 int need;
25
26 ceph_x_validate_tickets(ac, &need);
27 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
28 ac->want_keys, need, xi->have_keys);
29 return (ac->want_keys & xi->have_keys) == ac->want_keys;
30}
31
32static int ceph_x_encrypt_buflen(int ilen)
33{
34 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
35 sizeof(u32);
36}
37
38static int ceph_x_encrypt(struct ceph_crypto_key *secret,
39 void *ibuf, int ilen, void *obuf, size_t olen)
40{
41 struct ceph_x_encrypt_header head = {
42 .struct_v = 1,
43 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
44 };
45 size_t len = olen - sizeof(u32);
46 int ret;
47
48 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
49 &head, sizeof(head), ibuf, ilen);
50 if (ret)
51 return ret;
52 ceph_encode_32(&obuf, len);
53 return len + sizeof(u32);
54}
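/*
 * (Annotation, not in the original patch.)  The blob produced above is
 * framed as:
 *     __le32 len
 *     ciphertext of (struct ceph_x_encrypt_header + payload)
 * ceph_x_decrypt() below strips exactly this framing and verifies the
 * header's struct_v and magic.
 */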
55
56static int ceph_x_decrypt(struct ceph_crypto_key *secret,
57 void **p, void *end, void *obuf, size_t olen)
58{
59 struct ceph_x_encrypt_header head;
60 size_t head_len = sizeof(head);
61 int len, ret;
62
63 len = ceph_decode_32(p);
64 if (*p + len > end)
65 return -EINVAL;
66
67 dout("ceph_x_decrypt len %d\n", len);
68 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
69 *p, len);
70 if (ret)
71 return ret;
72 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
73 return -EPERM;
74 *p += len;
75 return olen;
76}
77
78/*
79 * get existing (or insert new) ticket handler
80 */
81struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
82 int service)
83{
84 struct ceph_x_ticket_handler *th;
85 struct ceph_x_info *xi = ac->private;
86 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
87
88 while (*p) {
89 parent = *p;
90 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
91 if (service < th->service)
92 p = &(*p)->rb_left;
93 else if (service > th->service)
94 p = &(*p)->rb_right;
95 else
96 return th;
97 }
98
99 /* add it */
100 th = kzalloc(sizeof(*th), GFP_NOFS);
101 if (!th)
102 return ERR_PTR(-ENOMEM);
103 th->service = service;
104 rb_link_node(&th->node, parent, p);
105 rb_insert_color(&th->node, &xi->ticket_handlers);
106 return th;
107}
108
109static void remove_ticket_handler(struct ceph_auth_client *ac,
110 struct ceph_x_ticket_handler *th)
111{
112 struct ceph_x_info *xi = ac->private;
113
114 dout("remove_ticket_handler %p %d\n", th, th->service);
115 rb_erase(&th->node, &xi->ticket_handlers);
116 ceph_crypto_key_destroy(&th->session_key);
117 if (th->ticket_blob)
118 ceph_buffer_put(th->ticket_blob);
119 kfree(th);
120}
121
122static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
123 struct ceph_crypto_key *secret,
124 void *buf, void *end)
125{
126 struct ceph_x_info *xi = ac->private;
127 int num;
128 void *p = buf;
129 int ret;
130 char *dbuf;
131 char *ticket_buf;
132 u8 struct_v;
133
134 dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
135 if (!dbuf)
136 return -ENOMEM;
137
138 ret = -ENOMEM;
139 ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
140 GFP_NOFS | GFP_ATOMIC);
141 if (!ticket_buf)
142 goto out_dbuf;
143
144 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
145 struct_v = ceph_decode_8(&p);
146 if (struct_v != 1)
147 goto bad;
148 num = ceph_decode_32(&p);
149 dout("%d tickets\n", num);
150 while (num--) {
151 int type;
152 u8 struct_v;
153 struct ceph_x_ticket_handler *th;
154 void *dp, *dend;
155 int dlen;
156 char is_enc;
157 struct timespec validity;
158 struct ceph_crypto_key old_key;
159 void *tp, *tpend;
160 struct ceph_timespec new_validity;
161 struct ceph_crypto_key new_session_key;
162 struct ceph_buffer *new_ticket_blob;
163 unsigned long new_expires, new_renew_after;
164 u64 new_secret_id;
165
166 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
167
168 type = ceph_decode_32(&p);
169 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
170
171 struct_v = ceph_decode_8(&p);
172 if (struct_v != 1)
173 goto bad;
174
175 th = get_ticket_handler(ac, type);
176 if (IS_ERR(th)) {
177 ret = PTR_ERR(th);
178 goto out;
179 }
180
181 /* blob for me */
182 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
183 TEMP_TICKET_BUF_LEN);
184 if (dlen <= 0) {
185 ret = dlen;
186 goto out;
187 }
188 dout(" decrypted %d bytes\n", dlen);
189 dend = dbuf + dlen;
190 dp = dbuf;
191
192 struct_v = ceph_decode_8(&dp);
193 if (struct_v != 1)
194 goto bad;
195
196 memcpy(&old_key, &th->session_key, sizeof(old_key));
197 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
198 if (ret)
199 goto out;
200
201 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
202 ceph_decode_timespec(&validity, &new_validity);
203 new_expires = get_seconds() + validity.tv_sec;
204 new_renew_after = new_expires - (validity.tv_sec / 4);
205 dout(" expires=%lu renew_after=%lu\n", new_expires,
206 new_renew_after);
207
208 /* ticket blob for service */
209 ceph_decode_8_safe(&p, end, is_enc, bad);
210 tp = ticket_buf;
211 if (is_enc) {
212 /* encrypted */
213 dout(" encrypted ticket\n");
214 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
215 TEMP_TICKET_BUF_LEN);
216 if (dlen < 0) {
217 ret = dlen;
218 goto out;
219 }
220 dlen = ceph_decode_32(&tp);
221 } else {
222 /* unencrypted */
223 ceph_decode_32_safe(&p, end, dlen, bad);
224 ceph_decode_need(&p, end, dlen, bad);
225 ceph_decode_copy(&p, ticket_buf, dlen);
226 }
227 tpend = tp + dlen;
228 dout(" ticket blob is %d bytes\n", dlen);
229 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
230 struct_v = ceph_decode_8(&tp);
231 new_secret_id = ceph_decode_64(&tp);
232 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
233 if (ret)
234 goto out;
235
236 /* all is well, update our ticket */
237 ceph_crypto_key_destroy(&th->session_key);
238 if (th->ticket_blob)
239 ceph_buffer_put(th->ticket_blob);
240 th->session_key = new_session_key;
241 th->ticket_blob = new_ticket_blob;
242 th->validity = new_validity;
243 th->secret_id = new_secret_id;
244 th->expires = new_expires;
245 th->renew_after = new_renew_after;
246 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
247 type, ceph_entity_type_name(type), th->secret_id,
248 (int)th->ticket_blob->vec.iov_len);
249 xi->have_keys |= th->service;
250 }
251
252 ret = 0;
253out:
254 kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
255out_dbuf:
256 kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
257 return ret;
258
259bad:
260 ret = -EINVAL;
261 goto out;
262}
263
264static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
265 struct ceph_x_ticket_handler *th,
266 struct ceph_x_authorizer *au)
267{
268 int maxlen;
269 struct ceph_x_authorize_a *msg_a;
270 struct ceph_x_authorize_b msg_b;
271 void *p, *end;
272 int ret;
273 int ticket_blob_len =
274 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
275
276 dout("build_authorizer for %s %p\n",
277 ceph_entity_type_name(th->service), au);
278
279 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
280 ceph_x_encrypt_buflen(ticket_blob_len);
281 dout(" need len %d\n", maxlen);
282 if (au->buf && au->buf->alloc_len < maxlen) {
283 ceph_buffer_put(au->buf);
284 au->buf = NULL;
285 }
286 if (!au->buf) {
287 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
288 if (!au->buf)
289 return -ENOMEM;
290 }
291 au->service = th->service;
292
293 msg_a = au->buf->vec.iov_base;
294 msg_a->struct_v = 1;
295 msg_a->global_id = cpu_to_le64(ac->global_id);
296 msg_a->service_id = cpu_to_le32(th->service);
297 msg_a->ticket_blob.struct_v = 1;
298 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
299 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
300 if (ticket_blob_len) {
301 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
302 th->ticket_blob->vec.iov_len);
303 }
304 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
305 le64_to_cpu(msg_a->ticket_blob.secret_id));
306
307 p = msg_a + 1;
308 p += ticket_blob_len;
309 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
310
311 get_random_bytes(&au->nonce, sizeof(au->nonce));
312 msg_b.struct_v = 1;
313 msg_b.nonce = cpu_to_le64(au->nonce);
314 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
315 p, end - p);
316 if (ret < 0)
317 goto out_buf;
318 p += ret;
319 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
320 dout(" built authorizer nonce %llx len %d\n", au->nonce,
321 (int)au->buf->vec.iov_len);
322 BUG_ON(au->buf->vec.iov_len > maxlen);
323 return 0;
324
325out_buf:
326 ceph_buffer_put(au->buf);
327 au->buf = NULL;
328 return ret;
329}
330
331static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
332 void **p, void *end)
333{
334 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
335 ceph_encode_8(p, 1);
336 ceph_encode_64(p, th->secret_id);
337 if (th->ticket_blob) {
338 const char *buf = th->ticket_blob->vec.iov_base;
339 u32 len = th->ticket_blob->vec.iov_len;
340
341 ceph_encode_32_safe(p, end, len, bad);
342 ceph_encode_copy_safe(p, end, buf, len, bad);
343 } else {
344 ceph_encode_32_safe(p, end, 0, bad);
345 }
346
347 return 0;
348bad:
349 return -ERANGE;
350}
351
352static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
353{
354 int want = ac->want_keys;
355 struct ceph_x_info *xi = ac->private;
356 int service;
357
358 *pneed = ac->want_keys & ~(xi->have_keys);
359
360 for (service = 1; service <= want; service <<= 1) {
361 struct ceph_x_ticket_handler *th;
362
363 if (!(ac->want_keys & service))
364 continue;
365
366 if (*pneed & service)
367 continue;
368
369 th = get_ticket_handler(ac, service);
370
371 if (!th) {
372 *pneed |= service;
373 continue;
374 }
375
376 if (get_seconds() >= th->renew_after)
377 *pneed |= service;
378 if (get_seconds() >= th->expires)
379 xi->have_keys &= ~service;
380 }
381}
382
383
384static int ceph_x_build_request(struct ceph_auth_client *ac,
385 void *buf, void *end)
386{
387 struct ceph_x_info *xi = ac->private;
388 int need;
389 struct ceph_x_request_header *head = buf;
390 int ret;
391 struct ceph_x_ticket_handler *th =
392 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
393
394 ceph_x_validate_tickets(ac, &need);
395
396 dout("build_request want %x have %x need %x\n",
397 ac->want_keys, xi->have_keys, need);
398
399 if (need & CEPH_ENTITY_TYPE_AUTH) {
400 struct ceph_x_authenticate *auth = (void *)(head + 1);
401 void *p = auth + 1;
402 struct ceph_x_challenge_blob tmp;
403 char tmp_enc[40];
404 u64 *u;
405
406 if (p > end)
407 return -ERANGE;
408
409 dout(" get_auth_session_key\n");
410 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
411
412 /* encrypt and hash */
413 get_random_bytes(&auth->client_challenge, sizeof(u64));
414 tmp.client_challenge = auth->client_challenge;
415 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
416 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
417 tmp_enc, sizeof(tmp_enc));
418 if (ret < 0)
419 return ret;
420
421 auth->struct_v = 1;
422 auth->key = 0;
423 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
424 auth->key ^= *u;
425 dout(" server_challenge %llx client_challenge %llx key %llx\n",
426 xi->server_challenge, le64_to_cpu(auth->client_challenge),
427 le64_to_cpu(auth->key));
428
429 /* now encode the old ticket, if it exists */
430 ret = ceph_x_encode_ticket(th, &p, end);
431 if (ret < 0)
432 return ret;
433
434 return p - buf;
435 }
436
437 if (need) {
438 void *p = head + 1;
439 struct ceph_x_service_ticket_request *req;
440
441 if (p > end)
442 return -ERANGE;
443 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
444
445 BUG_ON(!th);
446 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
447 if (ret)
448 return ret;
449 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
450 xi->auth_authorizer.buf->vec.iov_len);
451
452 req = p;
453 req->keys = cpu_to_le32(need);
454 p += sizeof(*req);
455 return p - buf;
456 }
457
458 return 0;
459}
460
461static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
462 void *buf, void *end)
463{
464 struct ceph_x_info *xi = ac->private;
465 struct ceph_x_reply_header *head = buf;
466 struct ceph_x_ticket_handler *th;
467 int len = end - buf;
468 int op;
469 int ret;
470
471 if (result)
472 return result; /* XXX hmm? */
473
474 if (xi->starting) {
475 /* it's a hello */
476 struct ceph_x_server_challenge *sc = buf;
477
478 if (len != sizeof(*sc))
479 return -EINVAL;
480 xi->server_challenge = le64_to_cpu(sc->server_challenge);
481 dout("handle_reply got server challenge %llx\n",
482 xi->server_challenge);
483 xi->starting = false;
484 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
485 return -EAGAIN;
486 }
487
488 op = le32_to_cpu(head->op);
489 result = le32_to_cpu(head->result);
490 dout("handle_reply op %d result %d\n", op, result);
491 switch (op) {
492 case CEPHX_GET_AUTH_SESSION_KEY:
493 /* verify auth key */
494 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
495 buf + sizeof(*head), end);
496 break;
497
498 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
499 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
500 BUG_ON(!th);
501 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
502 buf + sizeof(*head), end);
503 break;
504
505 default:
506 return -EINVAL;
507 }
508 if (ret)
509 return ret;
510 if (ac->want_keys == xi->have_keys)
511 return 0;
512 return -EAGAIN;
513}
514
515static int ceph_x_create_authorizer(
516 struct ceph_auth_client *ac, int peer_type,
517 struct ceph_authorizer **a,
518 void **buf, size_t *len,
519 void **reply_buf, size_t *reply_len)
520{
521 struct ceph_x_authorizer *au;
522 struct ceph_x_ticket_handler *th;
523 int ret;
524
525 th = get_ticket_handler(ac, peer_type);
526 if (IS_ERR(th))
527 return PTR_ERR(th);
528
529 au = kzalloc(sizeof(*au), GFP_NOFS);
530 if (!au)
531 return -ENOMEM;
532
533 ret = ceph_x_build_authorizer(ac, th, au);
534 if (ret) {
535 kfree(au);
536 return ret;
537 }
538
539 *a = (struct ceph_authorizer *)au;
540 *buf = au->buf->vec.iov_base;
541 *len = au->buf->vec.iov_len;
542 *reply_buf = au->reply_buf;
543 *reply_len = sizeof(au->reply_buf);
544 return 0;
545}
546
547static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
548 struct ceph_authorizer *a, size_t len)
549{
550 struct ceph_x_authorizer *au = (void *)a;
551 struct ceph_x_ticket_handler *th;
552 int ret = 0;
553 struct ceph_x_authorize_reply reply;
554 void *p = au->reply_buf;
555 void *end = p + sizeof(au->reply_buf);
556
557 th = get_ticket_handler(ac, au->service);
558 if (!th)
559 return -EIO; /* hrm! */
560 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
561 if (ret < 0)
562 return ret;
563 if (ret != sizeof(reply))
564 return -EPERM;
565
566 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
567 ret = -EPERM;
568 else
569 ret = 0;
570 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
571 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
572 return ret;
573}
574
575static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
576 struct ceph_authorizer *a)
577{
578 struct ceph_x_authorizer *au = (void *)a;
579
580 ceph_buffer_put(au->buf);
581 kfree(au);
582}
583
584
585static void ceph_x_reset(struct ceph_auth_client *ac)
586{
587 struct ceph_x_info *xi = ac->private;
588
589 dout("reset\n");
590 xi->starting = true;
591 xi->server_challenge = 0;
592}
593
594static void ceph_x_destroy(struct ceph_auth_client *ac)
595{
596 struct ceph_x_info *xi = ac->private;
597 struct rb_node *p;
598
599 dout("ceph_x_destroy %p\n", ac);
600 ceph_crypto_key_destroy(&xi->secret);
601
602 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
603 struct ceph_x_ticket_handler *th =
604 rb_entry(p, struct ceph_x_ticket_handler, node);
605 remove_ticket_handler(ac, th);
606 }
607
608 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
609
610 kfree(ac->private);
611 ac->private = NULL;
612}
613
614static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
615 int peer_type)
616{
617 struct ceph_x_ticket_handler *th;
618
619 th = get_ticket_handler(ac, peer_type);
620 if (th && !IS_ERR(th))
621 remove_ticket_handler(ac, th);
622}
623
624
625static const struct ceph_auth_client_ops ceph_x_ops = {
626 .is_authenticated = ceph_x_is_authenticated,
627 .build_request = ceph_x_build_request,
628 .handle_reply = ceph_x_handle_reply,
629 .create_authorizer = ceph_x_create_authorizer,
630 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
631 .destroy_authorizer = ceph_x_destroy_authorizer,
632 .invalidate_authorizer = ceph_x_invalidate_authorizer,
633 .reset = ceph_x_reset,
634 .destroy = ceph_x_destroy,
635};
636
637
638int ceph_x_init(struct ceph_auth_client *ac)
639{
640 struct ceph_x_info *xi;
641 int ret;
642
643 dout("ceph_x_init %p\n", ac);
644 xi = kzalloc(sizeof(*xi), GFP_NOFS);
645 if (!xi)
646 return -ENOMEM;
647
648 ret = -ENOMEM;
649 ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
650 TEMP_TICKET_BUF_LEN, 8,
651 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
652 NULL);
653 if (!ceph_x_ticketbuf_cachep)
654 goto done_nomem;
655 ret = -EINVAL;
656 if (!ac->secret) {
657 pr_err("no secret set (for auth_x protocol)\n");
658 goto done_nomem;
659 }
660
661 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
662 if (ret)
663 goto done_nomem;
664
665 xi->starting = true;
666 xi->ticket_handlers = RB_ROOT;
667
668 ac->protocol = CEPH_AUTH_CEPHX;
669 ac->private = xi;
670 ac->ops = &ceph_x_ops;
671 return 0;
672
673done_nomem:
674 kfree(xi);
675 if (ceph_x_ticketbuf_cachep)
676 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
677 return ret;
678}
679
680
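For context, a hedged sketch (not part of this patch) of how a service connection might drive the authorizer ops implemented above; the function name is hypothetical and the surrounding handshake I/O is elided:

static int authorize_connection_sketch(struct ceph_auth_client *ac,
				       int peer_type)
{
	struct ceph_authorizer *a;
	void *buf, *reply;
	size_t len, reply_len;
	int ret;

	ret = ac->ops->create_authorizer(ac, peer_type, &a,
					 &buf, &len, &reply, &reply_len);
	if (ret)
		return ret;
	/* ... send buf[0..len) in the connect handshake and read the
	 * peer's response into reply[0..reply_len) ... */
	ret = 0;
	if (ac->ops->verify_authorizer_reply)   /* NULL for auth_none */
		ret = ac->ops->verify_authorizer_reply(ac, a, reply_len);
	if (ret)
		ac->ops->invalidate_authorizer(ac, peer_type);
	ac->ops->destroy_authorizer(ac, a);
	return ret;
}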
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..c67535d70aa6
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,81 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{
68 size_t len;
69
70 ceph_decode_need(p, end, sizeof(u32), bad);
71 len = ceph_decode_32(p);
72 dout("decode_buffer len %d\n", (int)len);
73 ceph_decode_need(p, end, len, bad);
74 *b = ceph_buffer_new(len, GFP_NOFS);
75 if (!*b)
76 return -ENOMEM;
77 ceph_decode_copy(p, (*b)->vec.iov_base, len);
78 return 0;
79bad:
80 return -EINVAL;
81}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
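
A short lifecycle sketch of the refcounted buffer (not part of this patch; payload/payload_len are hypothetical caller data):

static struct ceph_buffer *buffer_example(const void *payload,
					  size_t payload_len)
{
	struct ceph_buffer *b = ceph_buffer_new(payload_len, GFP_NOFS);

	if (!b)
		return NULL;        /* kmalloc and vmalloc both failed */
	memcpy(b->vec.iov_base, payload, payload_len);
	ceph_buffer_get(b);         /* a second owner takes a reference */
	ceph_buffer_put(b);         /* ...and drops it when finished */
	/* the caller holds the remaining reference; the final
	 * ceph_buffer_put() frees via kfree() or vfree() as appropriate */
	return b;
}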
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..aa2239fa9a3b
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2955 @@
1#include "ceph_debug.h"
2
3#include <linux/fs.h>
4#include <linux/kernel.h>
5#include <linux/sched.h>
6#include <linux/slab.h>
7#include <linux/vmalloc.h>
8#include <linux/wait.h>
9#include <linux/writeback.h>
10
11#include "super.h"
12#include "decode.h"
13#include "messenger.h"
14
15/*
16 * Capability management
17 *
18 * The Ceph metadata servers control client access to inode metadata
19 * and file data by issuing capabilities, granting clients permission
20 * to read and/or write both inode fields and file data to OSDs
21 * (storage nodes). Each capability consists of a set of bits
22 * indicating which operations are allowed.
23 *
24 * If the client holds a *_SHARED cap, the client has a coherent value
25 * that can be safely read from the cached inode.
26 *
27 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
28 * client is allowed to change inode attributes (e.g., file size,
29 * mtime), note its dirty state in the ceph_cap, and asynchronously
30 * flush that metadata change to the MDS.
31 *
32 * In the event of a conflicting operation (perhaps by another
33 * client), the MDS will revoke the conflicting client capabilities.
34 *
35 * In order for a client to cache an inode, it must hold a capability
36 * with at least one MDS server. When inodes are released, release
37 * notifications are batched and periodically sent en masse to the MDS
38 * cluster to release server state.
39 */
40
41
42/*
43 * Generate readable cap strings for debugging output.
44 */
45#define MAX_CAP_STR 20
46static char cap_str[MAX_CAP_STR][40];
47static DEFINE_SPINLOCK(cap_str_lock);
48static int last_cap_str;
49
50static char *gcap_string(char *s, int c)
51{
52 if (c & CEPH_CAP_GSHARED)
53 *s++ = 's';
54 if (c & CEPH_CAP_GEXCL)
55 *s++ = 'x';
56 if (c & CEPH_CAP_GCACHE)
57 *s++ = 'c';
58 if (c & CEPH_CAP_GRD)
59 *s++ = 'r';
60 if (c & CEPH_CAP_GWR)
61 *s++ = 'w';
62 if (c & CEPH_CAP_GBUFFER)
63 *s++ = 'b';
64 if (c & CEPH_CAP_GLAZYIO)
65 *s++ = 'l';
66 return s;
67}
68
69const char *ceph_cap_string(int caps)
70{
71 int i;
72 char *s;
73 int c;
74
75 spin_lock(&cap_str_lock);
76 i = last_cap_str++;
77 if (last_cap_str == MAX_CAP_STR)
78 last_cap_str = 0;
79 spin_unlock(&cap_str_lock);
80
81 s = cap_str[i];
82
83 if (caps & CEPH_CAP_PIN)
84 *s++ = 'p';
85
86 c = (caps >> CEPH_CAP_SAUTH) & 3;
87 if (c) {
88 *s++ = 'A';
89 s = gcap_string(s, c);
90 }
91
92 c = (caps >> CEPH_CAP_SLINK) & 3;
93 if (c) {
94 *s++ = 'L';
95 s = gcap_string(s, c);
96 }
97
98 c = (caps >> CEPH_CAP_SXATTR) & 3;
99 if (c) {
100 *s++ = 'X';
101 s = gcap_string(s, c);
102 }
103
104 c = caps >> CEPH_CAP_SFILE;
105 if (c) {
106 *s++ = 'F';
107 s = gcap_string(s, c);
108 }
109
110 if (s == cap_str[i])
111 *s++ = '-';
112 *s = 0;
113 return cap_str[i];
114}
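
For a concrete sense of the encoding: CEPH_CAP_PIN renders as a bare 'p', while each wider group gets a capital prefix (A, L, X, F) followed by lower-case letters for its generic bits. The standalone sketch below replays gcap_string() for the FILE group and prints "Fscr" for shared|cache|rd; the bit values are invented for the sketch (the real ones live in ceph_fs.h):

    #include <stdio.h>

    /* Hypothetical per-group bits mirroring CEPH_CAP_GSHARED and
     * friends; the real values are defined in ceph_fs.h. */
    #define G_SHARED  1
    #define G_EXCL    2
    #define G_CACHE   4
    #define G_RD      8
    #define G_WR     16
    #define G_BUFFER 32
    #define G_LAZYIO 64

    static char *gcap_str(char *s, int c)
    {
            if (c & G_SHARED) *s++ = 's';
            if (c & G_EXCL)   *s++ = 'x';
            if (c & G_CACHE)  *s++ = 'c';
            if (c & G_RD)     *s++ = 'r';
            if (c & G_WR)     *s++ = 'w';
            if (c & G_BUFFER) *s++ = 'b';
            if (c & G_LAZYIO) *s++ = 'l';
            return s;
    }

    int main(void)
    {
            char buf[16], *s = buf;

            *s++ = 'F';                     /* FILE group prefix */
            s = gcap_str(s, G_SHARED | G_CACHE | G_RD);
            *s = '\0';
            printf("%s\n", buf);            /* prints "Fscr" */
            return 0;
    }
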
115
116/*
117 * Cap reservations
118 *
 119 * Maintain a global pool of preallocated struct ceph_cap objects, referenced
 120 * by struct ceph_cap_reservation contexts. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{
137 INIT_LIST_HEAD(&caps_list);
138 spin_lock_init(&caps_list_lock);
139}
140
141void ceph_caps_finalize(void)
142{
143 struct ceph_cap *cap;
144
145 spin_lock(&caps_list_lock);
146 while (!list_empty(&caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
148 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap);
150 }
151 caps_total_count = 0;
152 caps_avail_count = 0;
153 caps_use_count = 0;
154 caps_reserve_count = 0;
155 caps_min_count = 0;
156 spin_unlock(&caps_list_lock);
157}
158
159void ceph_adjust_min_caps(int delta)
160{
161 spin_lock(&caps_list_lock);
162 caps_min_count += delta;
163 BUG_ON(caps_min_count < 0);
164 spin_unlock(&caps_list_lock);
165}
166
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
168{
169 int i;
170 struct ceph_cap *cap;
171 int have;
172 int alloc = 0;
173 LIST_HEAD(newcaps);
174 int ret = 0;
175
176 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177
178 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock);
180 if (caps_avail_count >= need)
181 have = need;
182 else
183 have = caps_avail_count;
184 caps_avail_count -= have;
185 caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
187 caps_avail_count);
188 spin_unlock(&caps_list_lock);
189
190 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
192 if (!cap) {
193 ret = -ENOMEM;
194 goto out_alloc_count;
195 }
196 list_add(&cap->caps_item, &newcaps);
197 alloc++;
198 }
199 BUG_ON(have + alloc != need);
200
201 spin_lock(&caps_list_lock);
202 caps_total_count += alloc;
203 caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list);
205
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
207 caps_avail_count);
208 spin_unlock(&caps_list_lock);
209
210 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count,
213 caps_avail_count);
214 return 0;
215
216out_alloc_count:
217 /* we didn't manage to reserve as much as we needed */
218 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
219 ctx, need, have);
220 return ret;
221}
222
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
224{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) {
227 spin_lock(&caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count;
231 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count,
234 caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
236 caps_avail_count);
237 spin_unlock(&caps_list_lock);
238 }
239 return 0;
240}
241
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
243{
244 struct ceph_cap *cap = NULL;
245
246 /* temporary, until we do something about cap import/export */
247 if (!ctx)
248 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
249
250 spin_lock(&caps_list_lock);
251 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
252 ctx, ctx->count, caps_total_count, caps_use_count,
253 caps_reserve_count, caps_avail_count);
254 BUG_ON(!ctx->count);
255 BUG_ON(ctx->count > caps_reserve_count);
256 BUG_ON(list_empty(&caps_list));
257
258 ctx->count--;
259 caps_reserve_count--;
260 caps_use_count++;
261
262 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
263 list_del(&cap->caps_item);
264
265 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
266 caps_avail_count);
267 spin_unlock(&caps_list_lock);
268 return cap;
269}
270
271void ceph_put_cap(struct ceph_cap *cap)
272{
273 spin_lock(&caps_list_lock);
274 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
275 cap, caps_total_count, caps_use_count,
276 caps_reserve_count, caps_avail_count);
277 caps_use_count--;
278 /*
279 * Keep some preallocated caps around (ceph_min_count), to
280 * avoid lots of free/alloc churn.
281 */
282 if (caps_avail_count >= caps_reserve_count + caps_min_count) {
283 caps_total_count--;
284 kmem_cache_free(ceph_cap_cachep, cap);
285 } else {
286 caps_avail_count++;
287 list_add(&cap->caps_item, &caps_list);
288 }
289
290 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
291 caps_avail_count);
292 spin_unlock(&caps_list_lock);
293}
294
295void ceph_reservation_status(struct ceph_client *client,
296 int *total, int *avail, int *used, int *reserved,
297 int *min)
298{
299 if (total)
300 *total = caps_total_count;
301 if (avail)
302 *avail = caps_avail_count;
303 if (used)
304 *used = caps_use_count;
305 if (reserved)
306 *reserved = caps_reserve_count;
307 if (min)
308 *min = caps_min_count;
309}
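
The BUG_ONs sprinkled through the reservation code all assert a single invariant: caps_total_count == caps_use_count + caps_reserve_count + caps_avail_count. A minimal single-threaded model of the reserve/get/put flow (locking elided, names invented):

    #include <assert.h>

    static int total, used, reserved, avail;

    static void check(void)
    {
            assert(total == used + reserved + avail);
    }

    /* ceph_reserve_caps(): take from the avail pool first, then
     * account for freshly allocated caps. */
    static void reserve(int need)
    {
            int have = need < avail ? need : avail;

            avail -= have;
            reserved += have;
            total += need - have;           /* newly allocated */
            reserved += need - have;
            check();
    }

    static void use_one(void)               /* get_cap() */
    {
            assert(reserved > 0);
            reserved--;
            used++;
            check();
    }

    static void put_one(void)               /* ceph_put_cap(), kept in pool */
    {
            assert(used > 0);
            used--;
            avail++;
            check();
    }

    int main(void)
    {
            reserve(3);
            use_one();
            put_one();
            return 0;
    }
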
310
311/*
312 * Find ceph_cap for given mds, if any.
313 *
314 * Called with i_lock held.
315 */
316static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
317{
318 struct ceph_cap *cap;
319 struct rb_node *n = ci->i_caps.rb_node;
320
321 while (n) {
322 cap = rb_entry(n, struct ceph_cap, ci_node);
323 if (mds < cap->mds)
324 n = n->rb_left;
325 else if (mds > cap->mds)
326 n = n->rb_right;
327 else
328 return cap;
329 }
330 return NULL;
331}
332
333/*
334 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
335 * -1.
336 */
337static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
338{
339 struct ceph_cap *cap;
340 int mds = -1;
341 struct rb_node *p;
342
343 /* prefer mds with WR|WRBUFFER|EXCL caps */
344 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
345 cap = rb_entry(p, struct ceph_cap, ci_node);
346 mds = cap->mds;
347 if (mseq)
348 *mseq = cap->mseq;
349 if (cap->issued & (CEPH_CAP_FILE_WR |
350 CEPH_CAP_FILE_BUFFER |
351 CEPH_CAP_FILE_EXCL))
352 break;
353 }
354 return mds;
355}
356
357int ceph_get_cap_mds(struct inode *inode)
358{
359 int mds;
360 spin_lock(&inode->i_lock);
361 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
362 spin_unlock(&inode->i_lock);
363 return mds;
364}
365
366/*
367 * Called under i_lock.
368 */
369static void __insert_cap_node(struct ceph_inode_info *ci,
370 struct ceph_cap *new)
371{
372 struct rb_node **p = &ci->i_caps.rb_node;
373 struct rb_node *parent = NULL;
374 struct ceph_cap *cap = NULL;
375
376 while (*p) {
377 parent = *p;
378 cap = rb_entry(parent, struct ceph_cap, ci_node);
379 if (new->mds < cap->mds)
380 p = &(*p)->rb_left;
381 else if (new->mds > cap->mds)
382 p = &(*p)->rb_right;
383 else
384 BUG();
385 }
386
387 rb_link_node(&new->ci_node, parent, p);
388 rb_insert_color(&new->ci_node, &ci->i_caps);
389}
390
391/*
392 * (re)set cap hold timeouts, which control the delayed release
393 * of unused caps back to the MDS. Should be called on cap use.
394 */
395static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
396 struct ceph_inode_info *ci)
397{
398 struct ceph_mount_args *ma = mdsc->client->mount_args;
399
400 ci->i_hold_caps_min = round_jiffies(jiffies +
401 ma->caps_wanted_delay_min * HZ);
402 ci->i_hold_caps_max = round_jiffies(jiffies +
403 ma->caps_wanted_delay_max * HZ);
404 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
405 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
406}
407
408/*
409 * (Re)queue cap at the end of the delayed cap release list.
410 *
411 * If I_FLUSH is set, leave the inode at the front of the list.
412 *
413 * Caller holds i_lock
414 * -> we take mdsc->cap_delay_lock
415 */
416static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
417 struct ceph_inode_info *ci)
418{
419 __cap_set_timeouts(mdsc, ci);
420 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
421 ci->i_ceph_flags, ci->i_hold_caps_max);
422 if (!mdsc->stopping) {
423 spin_lock(&mdsc->cap_delay_lock);
424 if (!list_empty(&ci->i_cap_delay_list)) {
425 if (ci->i_ceph_flags & CEPH_I_FLUSH)
426 goto no_change;
427 list_del_init(&ci->i_cap_delay_list);
428 }
429 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
430no_change:
431 spin_unlock(&mdsc->cap_delay_lock);
432 }
433}
434
435/*
436 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
437 * indicating we should send a cap message to flush dirty metadata
438 * asap, and move to the front of the delayed cap list.
439 */
440static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
441 struct ceph_inode_info *ci)
442{
443 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
444 spin_lock(&mdsc->cap_delay_lock);
445 ci->i_ceph_flags |= CEPH_I_FLUSH;
446 if (!list_empty(&ci->i_cap_delay_list))
447 list_del_init(&ci->i_cap_delay_list);
448 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
449 spin_unlock(&mdsc->cap_delay_lock);
450}
451
452/*
453 * Cancel delayed work on cap.
454 *
455 * Caller must hold i_lock.
456 */
457static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
458 struct ceph_inode_info *ci)
459{
460 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
461 if (list_empty(&ci->i_cap_delay_list))
462 return;
463 spin_lock(&mdsc->cap_delay_lock);
464 list_del_init(&ci->i_cap_delay_list);
465 spin_unlock(&mdsc->cap_delay_lock);
466}
467
468/*
469 * Common issue checks for add_cap, handle_cap_grant.
470 */
471static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
472 unsigned issued)
473{
474 unsigned had = __ceph_caps_issued(ci, NULL);
475
476 /*
477 * Each time we receive FILE_CACHE anew, we increment
478 * i_rdcache_gen.
479 */
480 if ((issued & CEPH_CAP_FILE_CACHE) &&
481 (had & CEPH_CAP_FILE_CACHE) == 0)
482 ci->i_rdcache_gen++;
483
484 /*
485 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
486 * don't know what happened to this directory while we didn't
487 * have the cap.
488 */
489 if ((issued & CEPH_CAP_FILE_SHARED) &&
490 (had & CEPH_CAP_FILE_SHARED) == 0) {
491 ci->i_shared_gen++;
492 if (S_ISDIR(ci->vfs_inode.i_mode)) {
493 dout(" marking %p NOT complete\n", &ci->vfs_inode);
494 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
495 }
496 }
497}
498
499/*
500 * Add a capability under the given MDS session.
501 *
502 * Caller should hold session snap_rwsem (read) and s_mutex.
503 *
504 * @fmode is the open file mode, if we are opening a file, otherwise
505 * it is < 0. (This is so we can atomically add the cap and add an
506 * open file reference to it.)
507 */
508int ceph_add_cap(struct inode *inode,
509 struct ceph_mds_session *session, u64 cap_id,
510 int fmode, unsigned issued, unsigned wanted,
511 unsigned seq, unsigned mseq, u64 realmino, int flags,
512 struct ceph_cap_reservation *caps_reservation)
513{
514 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
515 struct ceph_inode_info *ci = ceph_inode(inode);
516 struct ceph_cap *new_cap = NULL;
517 struct ceph_cap *cap;
518 int mds = session->s_mds;
519 int actual_wanted;
520
521 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
522 session->s_mds, cap_id, ceph_cap_string(issued), seq);
523
524 /*
525 * If we are opening the file, include file mode wanted bits
526 * in wanted.
527 */
528 if (fmode >= 0)
529 wanted |= ceph_caps_for_mode(fmode);
530
531retry:
532 spin_lock(&inode->i_lock);
533 cap = __get_cap_for_mds(ci, mds);
534 if (!cap) {
535 if (new_cap) {
536 cap = new_cap;
537 new_cap = NULL;
538 } else {
539 spin_unlock(&inode->i_lock);
540 new_cap = get_cap(caps_reservation);
541 if (new_cap == NULL)
542 return -ENOMEM;
543 goto retry;
544 }
545
546 cap->issued = 0;
547 cap->implemented = 0;
548 cap->mds = mds;
549 cap->mds_wanted = 0;
550
551 cap->ci = ci;
552 __insert_cap_node(ci, cap);
553
554 /* clear out old exporting info? (i.e. on cap import) */
555 if (ci->i_cap_exporting_mds == mds) {
556 ci->i_cap_exporting_issued = 0;
557 ci->i_cap_exporting_mseq = 0;
558 ci->i_cap_exporting_mds = -1;
559 }
560
561 /* add to session cap list */
562 cap->session = session;
563 spin_lock(&session->s_cap_lock);
564 list_add_tail(&cap->session_caps, &session->s_caps);
565 session->s_nr_caps++;
566 spin_unlock(&session->s_cap_lock);
567 }
568
569 if (!ci->i_snap_realm) {
570 /*
571 * add this inode to the appropriate snap realm
572 */
573 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
574 realmino);
575 if (realm) {
576 ceph_get_snap_realm(mdsc, realm);
577 spin_lock(&realm->inodes_with_caps_lock);
578 ci->i_snap_realm = realm;
579 list_add(&ci->i_snap_realm_item,
580 &realm->inodes_with_caps);
581 spin_unlock(&realm->inodes_with_caps_lock);
582 } else {
583 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
584 realmino);
585 }
586 }
587
588 __check_cap_issue(ci, cap, issued);
589
590 /*
591 * If we are issued caps we don't want, or the mds' wanted
592 * value appears to be off, queue a check so we'll release
593 * later and/or update the mds wanted value.
594 */
595 actual_wanted = __ceph_caps_wanted(ci);
596 if ((wanted & ~actual_wanted) ||
597 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
598 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
599 ceph_cap_string(issued), ceph_cap_string(wanted),
600 ceph_cap_string(actual_wanted));
601 __cap_delay_requeue(mdsc, ci);
602 }
603
604 if (flags & CEPH_CAP_FLAG_AUTH)
605 ci->i_auth_cap = cap;
606 else if (ci->i_auth_cap == cap)
607 ci->i_auth_cap = NULL;
608
609 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
610 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
611 ceph_cap_string(issued|cap->issued), seq, mds);
612 cap->cap_id = cap_id;
613 cap->issued = issued;
614 cap->implemented |= issued;
615 cap->mds_wanted |= wanted;
616 cap->seq = seq;
617 cap->issue_seq = seq;
618 cap->mseq = mseq;
619 cap->cap_gen = session->s_cap_gen;
620
621 if (fmode >= 0)
622 __ceph_get_fmode(ci, fmode);
623 spin_unlock(&inode->i_lock);
624 wake_up(&ci->i_cap_wq);
625 return 0;
626}
627
628/*
629 * Return true if cap has not timed out and belongs to the current
630 * generation of the MDS session (i.e. has not gone 'stale' due to
631 * us losing touch with the mds).
632 */
633static int __cap_is_valid(struct ceph_cap *cap)
634{
635 unsigned long ttl;
636 u32 gen;
637
638 spin_lock(&cap->session->s_cap_lock);
639 gen = cap->session->s_cap_gen;
640 ttl = cap->session->s_cap_ttl;
641 spin_unlock(&cap->session->s_cap_lock);
642
643 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
644 dout("__cap_is_valid %p cap %p issued %s "
645 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
646 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
647 return 0;
648 }
649
650 return 1;
651}
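
The staleness test combines two conditions: the cap's generation must still match the session's, and the session lease must not have timed out. A hedged userspace model, with wall-clock time standing in for jiffies and all names invented:

    #include <stdbool.h>
    #include <time.h>

    /* Model of __cap_is_valid(): a cap is stale if the session
     * generation has moved past the one the cap was issued under, or
     * if the session lease TTL has expired. */
    struct session { unsigned gen; time_t ttl; };
    struct cap     { unsigned cap_gen; struct session *s; };

    static bool cap_is_valid(const struct cap *c)
    {
            return c->cap_gen >= c->s->gen && time(NULL) < c->s->ttl;
    }
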
652
653/*
654 * Return set of valid cap bits issued to us. Note that caps time
655 * out, and may be invalidated in bulk if the client session times out
656 * and session->s_cap_gen is bumped.
657 */
658int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
659{
660 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
661 struct ceph_cap *cap;
662 struct rb_node *p;
663
664 if (implemented)
665 *implemented = 0;
666 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
667 cap = rb_entry(p, struct ceph_cap, ci_node);
668 if (!__cap_is_valid(cap))
669 continue;
670 dout("__ceph_caps_issued %p cap %p issued %s\n",
671 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
672 have |= cap->issued;
673 if (implemented)
674 *implemented |= cap->implemented;
675 }
676 return have;
677}
678
679/*
680 * Get cap bits issued by caps other than @ocap
681 */
682int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
683{
684 int have = ci->i_snap_caps;
685 struct ceph_cap *cap;
686 struct rb_node *p;
687
688 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
689 cap = rb_entry(p, struct ceph_cap, ci_node);
690 if (cap == ocap)
691 continue;
692 if (!__cap_is_valid(cap))
693 continue;
694 have |= cap->issued;
695 }
696 return have;
697}
698
699/*
700 * Move a cap to the end of the LRU (oldest caps at list head, newest
701 * at list tail).
702 */
703static void __touch_cap(struct ceph_cap *cap)
704{
705 struct ceph_mds_session *s = cap->session;
706
707 spin_lock(&s->s_cap_lock);
708 if (s->s_cap_iterator == NULL) {
709 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
710 s->s_mds);
711 list_move_tail(&cap->session_caps, &s->s_caps);
712 } else {
713 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
714 &cap->ci->vfs_inode, cap, s->s_mds);
715 }
716 spin_unlock(&s->s_cap_lock);
717}
718
719/*
720 * Check if we hold the given mask. If so, move the cap(s) to the
721 * front of their respective LRUs. (This is the preferred way for
722 * callers to check for caps they want.)
723 */
724int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
725{
726 struct ceph_cap *cap;
727 struct rb_node *p;
728 int have = ci->i_snap_caps;
729
730 if ((have & mask) == mask) {
731 dout("__ceph_caps_issued_mask %p snap issued %s"
732 " (mask %s)\n", &ci->vfs_inode,
733 ceph_cap_string(have),
734 ceph_cap_string(mask));
735 return 1;
736 }
737
738 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
739 cap = rb_entry(p, struct ceph_cap, ci_node);
740 if (!__cap_is_valid(cap))
741 continue;
742 if ((cap->issued & mask) == mask) {
743 dout("__ceph_caps_issued_mask %p cap %p issued %s"
744 " (mask %s)\n", &ci->vfs_inode, cap,
745 ceph_cap_string(cap->issued),
746 ceph_cap_string(mask));
747 if (touch)
748 __touch_cap(cap);
749 return 1;
750 }
751
752 /* does a combination of caps satisfy mask? */
753 have |= cap->issued;
754 if ((have & mask) == mask) {
755 dout("__ceph_caps_issued_mask %p combo issued %s"
756 " (mask %s)\n", &ci->vfs_inode,
757 ceph_cap_string(cap->issued),
758 ceph_cap_string(mask));
759 if (touch) {
760 struct rb_node *q;
761
 762 /* touch this + preceding caps */
763 __touch_cap(cap);
764 for (q = rb_first(&ci->i_caps); q != p;
765 q = rb_next(q)) {
766 cap = rb_entry(q, struct ceph_cap,
767 ci_node);
768 if (!__cap_is_valid(cap))
769 continue;
770 __touch_cap(cap);
771 }
772 }
773 return 1;
774 }
775 }
776
777 return 0;
778}
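
The loop above accepts either a single cap that covers the whole mask or a union of several caps that does. The same logic in self-contained form (illustrative only: a flat array replaces the i_caps rbtree, and the snap-caps seed and LRU touching are omitted):

    /* Model of the mask check: a single cap may satisfy @mask
     * outright, or several caps may satisfy it in combination. */
    static int caps_issued_mask(const int *issued, int n, int mask)
    {
            int have = 0;
            int i;

            for (i = 0; i < n; i++) {
                    if ((issued[i] & mask) == mask)
                            return 1;       /* one cap suffices */
                    have |= issued[i];
                    if ((have & mask) == mask)
                            return 1;       /* a combination suffices */
            }
            return 0;
    }
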
779
780/*
781 * Return true if mask caps are currently being revoked by an MDS.
782 */
783int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
784{
785 struct inode *inode = &ci->vfs_inode;
786 struct ceph_cap *cap;
787 struct rb_node *p;
788 int ret = 0;
789
790 spin_lock(&inode->i_lock);
791 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
792 cap = rb_entry(p, struct ceph_cap, ci_node);
793 if (__cap_is_valid(cap) &&
794 (cap->implemented & ~cap->issued & mask)) {
795 ret = 1;
796 break;
797 }
798 }
799 spin_unlock(&inode->i_lock);
800 dout("ceph_caps_revoking %p %s = %d\n", inode,
801 ceph_cap_string(mask), ret);
802 return ret;
803}
804
805int __ceph_caps_used(struct ceph_inode_info *ci)
806{
807 int used = 0;
808 if (ci->i_pin_ref)
809 used |= CEPH_CAP_PIN;
810 if (ci->i_rd_ref)
811 used |= CEPH_CAP_FILE_RD;
812 if (ci->i_rdcache_ref || ci->i_rdcache_gen)
813 used |= CEPH_CAP_FILE_CACHE;
814 if (ci->i_wr_ref)
815 used |= CEPH_CAP_FILE_WR;
816 if (ci->i_wrbuffer_ref)
817 used |= CEPH_CAP_FILE_BUFFER;
818 return used;
819}
820
821/*
822 * wanted, by virtue of open file modes
823 */
824int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
825{
826 int want = 0;
827 int mode;
828 for (mode = 0; mode < 4; mode++)
829 if (ci->i_nr_by_mode[mode])
830 want |= ceph_caps_for_mode(mode);
831 return want;
832}
833
834/*
835 * Return caps we have registered with the MDS(s) as 'wanted'.
836 */
837int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
838{
839 struct ceph_cap *cap;
840 struct rb_node *p;
841 int mds_wanted = 0;
842
843 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
844 cap = rb_entry(p, struct ceph_cap, ci_node);
845 if (!__cap_is_valid(cap))
846 continue;
847 mds_wanted |= cap->mds_wanted;
848 }
849 return mds_wanted;
850}
851
852/*
853 * called under i_lock
854 */
855static int __ceph_is_any_caps(struct ceph_inode_info *ci)
856{
857 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
858}
859
860/*
861 * caller should hold i_lock.
862 * caller will not hold session s_mutex if called from destroy_inode.
863 */
864void __ceph_remove_cap(struct ceph_cap *cap)
865{
866 struct ceph_mds_session *session = cap->session;
867 struct ceph_inode_info *ci = cap->ci;
868 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
869
870 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
871
872 /* remove from inode list */
873 rb_erase(&cap->ci_node, &ci->i_caps);
874 cap->ci = NULL;
875 if (ci->i_auth_cap == cap)
876 ci->i_auth_cap = NULL;
877
878 /* remove from session list */
879 spin_lock(&session->s_cap_lock);
880 if (session->s_cap_iterator == cap) {
881 /* not yet, we are iterating over this very cap */
882 dout("__ceph_remove_cap delaying %p removal from session %p\n",
883 cap, cap->session);
884 } else {
885 list_del_init(&cap->session_caps);
886 session->s_nr_caps--;
887 cap->session = NULL;
888 }
889 spin_unlock(&session->s_cap_lock);
890
891 if (cap->session == NULL)
892 ceph_put_cap(cap);
893
894 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
895 struct ceph_snap_realm *realm = ci->i_snap_realm;
896 spin_lock(&realm->inodes_with_caps_lock);
897 list_del_init(&ci->i_snap_realm_item);
898 ci->i_snap_realm_counter++;
899 ci->i_snap_realm = NULL;
900 spin_unlock(&realm->inodes_with_caps_lock);
901 ceph_put_snap_realm(mdsc, realm);
902 }
903 if (!__ceph_is_any_real_caps(ci))
904 __cap_delay_cancel(mdsc, ci);
905}
906
907/*
908 * Build and send a cap message to the given MDS.
909 *
910 * Caller should be holding s_mutex.
911 */
912static int send_cap_msg(struct ceph_mds_session *session,
913 u64 ino, u64 cid, int op,
914 int caps, int wanted, int dirty,
915 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
916 u64 size, u64 max_size,
917 struct timespec *mtime, struct timespec *atime,
918 u64 time_warp_seq,
919 uid_t uid, gid_t gid, mode_t mode,
920 u64 xattr_version,
921 struct ceph_buffer *xattrs_buf,
922 u64 follows)
923{
924 struct ceph_mds_caps *fc;
925 struct ceph_msg *msg;
926
927 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
928 " seq %u/%u mseq %u follows %lld size %llu/%llu"
929 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
930 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
931 ceph_cap_string(dirty),
932 seq, issue_seq, mseq, follows, size, max_size,
933 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
934
935 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
936 if (IS_ERR(msg))
937 return PTR_ERR(msg);
938
939 msg->hdr.tid = cpu_to_le64(flush_tid);
940
941 fc = msg->front.iov_base;
942 memset(fc, 0, sizeof(*fc));
943
944 fc->cap_id = cpu_to_le64(cid);
945 fc->op = cpu_to_le32(op);
946 fc->seq = cpu_to_le32(seq);
947 fc->issue_seq = cpu_to_le32(issue_seq);
948 fc->migrate_seq = cpu_to_le32(mseq);
949 fc->caps = cpu_to_le32(caps);
950 fc->wanted = cpu_to_le32(wanted);
951 fc->dirty = cpu_to_le32(dirty);
952 fc->ino = cpu_to_le64(ino);
953 fc->snap_follows = cpu_to_le64(follows);
954
955 fc->size = cpu_to_le64(size);
956 fc->max_size = cpu_to_le64(max_size);
957 if (mtime)
958 ceph_encode_timespec(&fc->mtime, mtime);
959 if (atime)
960 ceph_encode_timespec(&fc->atime, atime);
961 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
962
963 fc->uid = cpu_to_le32(uid);
964 fc->gid = cpu_to_le32(gid);
965 fc->mode = cpu_to_le32(mode);
966
967 fc->xattr_version = cpu_to_le64(xattr_version);
968 if (xattrs_buf) {
969 msg->middle = ceph_buffer_get(xattrs_buf);
970 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
971 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
972 }
973
974 ceph_con_send(&session->s_con, msg);
975 return 0;
976}
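
All integer fields are converted to little-endian wire order with the cpu_to_le helpers before the message is sent. For reference, a portable userspace equivalent that makes no assumption about host byte order:

    #include <stdint.h>

    /* Userspace analogue of cpu_to_le32() into a byte buffer: store
     * little-endian regardless of host endianness. */
    static void put_le32(uint8_t *p, uint32_t v)
    {
            p[0] = v & 0xff;
            p[1] = (v >> 8) & 0xff;
            p[2] = (v >> 16) & 0xff;
            p[3] = (v >> 24) & 0xff;
    }
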
977
978/*
979 * Queue cap releases when an inode is dropped from our cache. Since
 980 * the inode is about to be destroyed, there is no need for i_lock.
981 */
982void ceph_queue_caps_release(struct inode *inode)
983{
984 struct ceph_inode_info *ci = ceph_inode(inode);
985 struct rb_node *p;
986
987 p = rb_first(&ci->i_caps);
988 while (p) {
989 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
990 struct ceph_mds_session *session = cap->session;
991 struct ceph_msg *msg;
992 struct ceph_mds_cap_release *head;
993 struct ceph_mds_cap_item *item;
994
995 spin_lock(&session->s_cap_lock);
996 BUG_ON(!session->s_num_cap_releases);
997 msg = list_first_entry(&session->s_cap_releases,
998 struct ceph_msg, list_head);
999
1000 dout(" adding %p release to mds%d msg %p (%d left)\n",
1001 inode, session->s_mds, msg, session->s_num_cap_releases);
1002
1003 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1004 head = msg->front.iov_base;
1005 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1006 item = msg->front.iov_base + msg->front.iov_len;
1007 item->ino = cpu_to_le64(ceph_ino(inode));
1008 item->cap_id = cpu_to_le64(cap->cap_id);
1009 item->migrate_seq = cpu_to_le32(cap->mseq);
1010 item->seq = cpu_to_le32(cap->issue_seq);
1011
1012 session->s_num_cap_releases--;
1013
1014 msg->front.iov_len += sizeof(*item);
1015 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1016 dout(" release msg %p full\n", msg);
1017 list_move_tail(&msg->list_head,
1018 &session->s_cap_releases_done);
1019 } else {
1020 dout(" release msg %p at %d/%d (%d)\n", msg,
1021 (int)le32_to_cpu(head->num),
1022 (int)CEPH_CAPS_PER_RELEASE,
1023 (int)msg->front.iov_len);
1024 }
1025 spin_unlock(&session->s_cap_lock);
1026 p = rb_next(p);
1027 __ceph_remove_cap(cap);
1028 }
1029}
1030
1031/*
1032 * Send a cap msg on the given inode. Update our caps state, then
1033 * drop i_lock and send the message.
1034 *
1035 * Make note of max_size reported/requested from mds, revoked caps
1036 * that have now been implemented.
1037 *
1038 * Make a half-hearted attempt to invalidate the page cache if we are
1039 * dropping RDCACHE. Note that this will leave behind locked pages
1040 * that we'll then need to deal with elsewhere.
1041 *
1042 * Return non-zero if delayed release, or we experienced an error
1043 * such that the caller should requeue + retry later.
1044 *
1045 * called with i_lock, then drops it.
1046 * caller should hold snap_rwsem (read), s_mutex.
1047 */
1048static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1049 int op, int used, int want, int retain, int flushing,
1050 unsigned *pflush_tid)
1051 __releases(cap->ci->vfs_inode->i_lock)
1052{
1053 struct ceph_inode_info *ci = cap->ci;
1054 struct inode *inode = &ci->vfs_inode;
1055 u64 cap_id = cap->cap_id;
1056 int held, revoking, dropping, keep;
1057 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1058 u64 size, max_size;
1059 struct timespec mtime, atime;
1060 int wake = 0;
1061 mode_t mode;
1062 uid_t uid;
1063 gid_t gid;
1064 struct ceph_mds_session *session;
1065 u64 xattr_version = 0;
1066 int delayed = 0;
1067 u64 flush_tid = 0;
1068 int i;
1069 int ret;
1070
1071 held = cap->issued | cap->implemented;
1072 revoking = cap->implemented & ~cap->issued;
1073 retain &= ~revoking;
1074 dropping = cap->issued & ~retain;
1075
1076 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1077 inode, cap, cap->session,
1078 ceph_cap_string(held), ceph_cap_string(held & retain),
1079 ceph_cap_string(revoking));
1080 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1081
1082 session = cap->session;
1083
1084 /* don't release wanted unless we've waited a bit. */
1085 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1086 time_before(jiffies, ci->i_hold_caps_min)) {
1087 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1088 ceph_cap_string(cap->issued),
1089 ceph_cap_string(cap->issued & retain),
1090 ceph_cap_string(cap->mds_wanted),
1091 ceph_cap_string(want));
1092 want |= cap->mds_wanted;
1093 retain |= cap->issued;
1094 delayed = 1;
1095 }
1096 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1097
1098 cap->issued &= retain; /* drop bits we don't want */
1099 if (cap->implemented & ~cap->issued) {
1100 /*
1101 * Wake up any waiters on wanted -> needed transition.
1102 * This is due to the weird transition from buffered
1103 * to sync IO... we need to flush dirty pages _before_
1104 * allowing sync writes to avoid reordering.
1105 */
1106 wake = 1;
1107 }
1108 cap->implemented &= cap->issued | used;
1109 cap->mds_wanted = want;
1110
1111 if (flushing) {
1112 /*
1113 * assign a tid for flush operations so we can avoid
1114 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
1115 * clean type races. track latest tid for every bit
1116 * so we can handle flush AxFw, flush Fw, and have the
1117 * first ack clean Ax.
1118 */
1119 flush_tid = ++ci->i_cap_flush_last_tid;
1120 if (pflush_tid)
1121 *pflush_tid = flush_tid;
1122 dout(" cap_flush_tid %d\n", (int)flush_tid);
1123 for (i = 0; i < CEPH_CAP_BITS; i++)
1124 if (flushing & (1 << i))
1125 ci->i_cap_flush_tid[i] = flush_tid;
1126 }
1127
1128 keep = cap->implemented;
1129 seq = cap->seq;
1130 issue_seq = cap->issue_seq;
1131 mseq = cap->mseq;
1132 size = inode->i_size;
1133 ci->i_reported_size = size;
1134 max_size = ci->i_wanted_max_size;
1135 ci->i_requested_max_size = max_size;
1136 mtime = inode->i_mtime;
1137 atime = inode->i_atime;
1138 time_warp_seq = ci->i_time_warp_seq;
1139 follows = ci->i_snap_realm->cached_context->seq;
1140 uid = inode->i_uid;
1141 gid = inode->i_gid;
1142 mode = inode->i_mode;
1143
1144 if (dropping & CEPH_CAP_XATTR_EXCL) {
1145 __ceph_build_xattrs_blob(ci);
1146 xattr_version = ci->i_xattrs.version + 1;
1147 }
1148
1149 spin_unlock(&inode->i_lock);
1150
1151 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1152 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1153 size, max_size, &mtime, &atime, time_warp_seq,
1154 uid, gid, mode,
1155 xattr_version,
1156 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1157 follows);
1158 if (ret < 0) {
1159 dout("error sending cap msg, must requeue %p\n", inode);
1160 delayed = 1;
1161 }
1162
1163 if (wake)
1164 wake_up(&ci->i_cap_wq);
1165
1166 return delayed;
1167}
1168
1169/*
1170 * When a snapshot is taken, clients accumulate dirty metadata on
1171 * inodes with capabilities in ceph_cap_snaps to describe the file
1172 * state at the time the snapshot was taken. This must be flushed
1173 * asynchronously back to the MDS once sync writes complete and dirty
1174 * data is written out.
1175 *
1176 * Called under i_lock. Takes s_mutex as needed.
1177 */
1178void __ceph_flush_snaps(struct ceph_inode_info *ci,
1179 struct ceph_mds_session **psession)
1180{
1181 struct inode *inode = &ci->vfs_inode;
1182 int mds;
1183 struct ceph_cap_snap *capsnap;
1184 u32 mseq;
1185 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1186 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1187 session->s_mutex */
1188 u64 next_follows = 0; /* keep track of how far we've gotten through the
1189 i_cap_snaps list, and skip these entries next time
1190 around to avoid an infinite loop */
1191
1192 if (psession)
1193 session = *psession;
1194
1195 dout("__flush_snaps %p\n", inode);
1196retry:
1197 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1198 /* avoid an infinite loop after retry */
1199 if (capsnap->follows < next_follows)
1200 continue;
1201 /*
1202 * we need to wait for sync writes to complete and for dirty
1203 * pages to be written out.
1204 */
1205 if (capsnap->dirty_pages || capsnap->writing)
1206 continue;
1207
1208 /*
1209 * if cap writeback already occurred, we should have dropped
1210 * the capsnap in ceph_put_wrbuffer_cap_refs.
1211 */
1212 BUG_ON(capsnap->dirty == 0);
1213
1214 /* pick mds, take s_mutex */
1215 mds = __ceph_get_cap_mds(ci, &mseq);
1216 if (session && session->s_mds != mds) {
1217 dout("oops, wrong session %p mutex\n", session);
1218 mutex_unlock(&session->s_mutex);
1219 ceph_put_mds_session(session);
1220 session = NULL;
1221 }
1222 if (!session) {
1223 spin_unlock(&inode->i_lock);
1224 mutex_lock(&mdsc->mutex);
1225 session = __ceph_lookup_mds_session(mdsc, mds);
1226 mutex_unlock(&mdsc->mutex);
1227 if (session) {
1228 dout("inverting session/ino locks on %p\n",
1229 session);
1230 mutex_lock(&session->s_mutex);
1231 }
1232 /*
1233 * if session == NULL, we raced against a cap
1234 * deletion. retry, and we'll get a better
1235 * @mds value next time.
1236 */
1237 spin_lock(&inode->i_lock);
1238 goto retry;
1239 }
1240
1241 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1242 atomic_inc(&capsnap->nref);
1243 if (!list_empty(&capsnap->flushing_item))
1244 list_del_init(&capsnap->flushing_item);
1245 list_add_tail(&capsnap->flushing_item,
1246 &session->s_cap_snaps_flushing);
1247 spin_unlock(&inode->i_lock);
1248
1249 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
1250 inode, capsnap, next_follows, capsnap->size);
1251 send_cap_msg(session, ceph_vino(inode).ino, 0,
1252 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1253 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1254 capsnap->size, 0,
1255 &capsnap->mtime, &capsnap->atime,
1256 capsnap->time_warp_seq,
1257 capsnap->uid, capsnap->gid, capsnap->mode,
1258 0, NULL,
1259 capsnap->follows);
1260
1261 next_follows = capsnap->follows + 1;
1262 ceph_put_cap_snap(capsnap);
1263
1264 spin_lock(&inode->i_lock);
1265 goto retry;
1266 }
1267
1268 /* we flushed them all; remove this inode from the queue */
1269 spin_lock(&mdsc->snap_flush_lock);
1270 list_del_init(&ci->i_snap_flush_item);
1271 spin_unlock(&mdsc->snap_flush_lock);
1272
1273 if (psession)
1274 *psession = session;
1275 else if (session) {
1276 mutex_unlock(&session->s_mutex);
1277 ceph_put_mds_session(session);
1278 }
1279}
1280
1281static void ceph_flush_snaps(struct ceph_inode_info *ci)
1282{
1283 struct inode *inode = &ci->vfs_inode;
1284
1285 spin_lock(&inode->i_lock);
1286 __ceph_flush_snaps(ci, NULL);
1287 spin_unlock(&inode->i_lock);
1288}
1289
1290/*
1291 * Mark caps dirty. If inode is newly dirty, add to the global dirty
1292 * list.
1293 */
1294void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1295{
1296 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
1297 struct inode *inode = &ci->vfs_inode;
1298 int was = ci->i_dirty_caps;
1299 int dirty = 0;
1300
1301 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1302 ceph_cap_string(mask), ceph_cap_string(was),
1303 ceph_cap_string(was | mask));
1304 ci->i_dirty_caps |= mask;
1305 if (was == 0) {
1306 dout(" inode %p now dirty\n", &ci->vfs_inode);
1307 BUG_ON(!list_empty(&ci->i_dirty_item));
1308 spin_lock(&mdsc->cap_dirty_lock);
1309 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1310 spin_unlock(&mdsc->cap_dirty_lock);
1311 if (ci->i_flushing_caps == 0) {
1312 igrab(inode);
1313 dirty |= I_DIRTY_SYNC;
1314 }
1315 }
1316 BUG_ON(list_empty(&ci->i_dirty_item));
1317 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1318 (mask & CEPH_CAP_FILE_BUFFER))
1319 dirty |= I_DIRTY_DATASYNC;
1320 if (dirty)
1321 __mark_inode_dirty(inode, dirty);
1322 __cap_delay_requeue(mdsc, ci);
1323}
1324
1325/*
1326 * Add dirty inode to the flushing list. Assign a seq number so we
1327 * can wait for caps to flush without starving.
1328 *
1329 * Called under i_lock.
1330 */
1331static int __mark_caps_flushing(struct inode *inode,
1332 struct ceph_mds_session *session)
1333{
1334 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1335 struct ceph_inode_info *ci = ceph_inode(inode);
1336 int flushing;
1337
1338 BUG_ON(ci->i_dirty_caps == 0);
1339 BUG_ON(list_empty(&ci->i_dirty_item));
1340
1341 flushing = ci->i_dirty_caps;
1342 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1343 ceph_cap_string(flushing),
1344 ceph_cap_string(ci->i_flushing_caps),
1345 ceph_cap_string(ci->i_flushing_caps | flushing));
1346 ci->i_flushing_caps |= flushing;
1347 ci->i_dirty_caps = 0;
1348 dout(" inode %p now !dirty\n", inode);
1349
1350 spin_lock(&mdsc->cap_dirty_lock);
1351 list_del_init(&ci->i_dirty_item);
1352
1353 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1354 if (list_empty(&ci->i_flushing_item)) {
1355 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1356 mdsc->num_cap_flushing++;
1357 dout(" inode %p now flushing seq %lld\n", inode,
1358 ci->i_cap_flush_seq);
1359 } else {
1360 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1361 dout(" inode %p now flushing (more) seq %lld\n", inode,
1362 ci->i_cap_flush_seq);
1363 }
1364 spin_unlock(&mdsc->cap_dirty_lock);
1365
1366 return flushing;
1367}
1368
1369/*
1370 * try to invalidate mapping pages without blocking.
1371 */
1372static int mapping_is_empty(struct address_space *mapping)
1373{
1374 struct page *page = find_get_page(mapping, 0);
1375
1376 if (!page)
1377 return 1;
1378
1379 put_page(page);
1380 return 0;
1381}
1382
1383static int try_nonblocking_invalidate(struct inode *inode)
1384{
1385 struct ceph_inode_info *ci = ceph_inode(inode);
1386 u32 invalidating_gen = ci->i_rdcache_gen;
1387
1388 spin_unlock(&inode->i_lock);
1389 invalidate_mapping_pages(&inode->i_data, 0, -1);
1390 spin_lock(&inode->i_lock);
1391
1392 if (mapping_is_empty(&inode->i_data) &&
1393 invalidating_gen == ci->i_rdcache_gen) {
1394 /* success. */
1395 dout("try_nonblocking_invalidate %p success\n", inode);
1396 ci->i_rdcache_gen = 0;
1397 ci->i_rdcache_revoking = 0;
1398 return 0;
1399 }
1400 dout("try_nonblocking_invalidate %p failed\n", inode);
1401 return -1;
1402}
1403
1404/*
1405 * Swiss army knife function to examine currently used and wanted
1406 * versus held caps. Release, flush, ack revoked caps to mds as
1407 * appropriate.
1408 *
1409 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1410 * cap release further.
1411 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1412 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1413 * further delay.
1414 */
1415void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1416 struct ceph_mds_session *session)
1417 __releases(session->s_mutex)
1418{
1419 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1420 struct ceph_mds_client *mdsc = &client->mdsc;
1421 struct inode *inode = &ci->vfs_inode;
1422 struct ceph_cap *cap;
1423 int file_wanted, used;
1424 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1425 int issued, implemented, want, retain, revoking, flushing = 0;
1426 int mds = -1; /* keep track of how far we've gone through i_caps list
1427 to avoid an infinite loop on retry */
1428 struct rb_node *p;
1429 int tried_invalidate = 0;
1430 int delayed = 0, sent = 0, force_requeue = 0, num;
1431 int queue_invalidate = 0;
1432 int is_delayed = flags & CHECK_CAPS_NODELAY;
1433
1434 /* if we are unmounting, flush any unused caps immediately. */
1435 if (mdsc->stopping)
1436 is_delayed = 1;
1437
1438 spin_lock(&inode->i_lock);
1439
1440 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1441 flags |= CHECK_CAPS_FLUSH;
1442
1443 /* flush snaps first time around only */
1444 if (!list_empty(&ci->i_cap_snaps))
1445 __ceph_flush_snaps(ci, &session);
1446 goto retry_locked;
1447retry:
1448 spin_lock(&inode->i_lock);
1449retry_locked:
1450 file_wanted = __ceph_caps_file_wanted(ci);
1451 used = __ceph_caps_used(ci);
1452 want = file_wanted | used;
1453 issued = __ceph_caps_issued(ci, &implemented);
1454 revoking = implemented & ~issued;
1455
1456 retain = want | CEPH_CAP_PIN;
1457 if (!mdsc->stopping && inode->i_nlink > 0) {
1458 if (want) {
1459 retain |= CEPH_CAP_ANY; /* be greedy */
1460 } else {
1461 retain |= CEPH_CAP_ANY_SHARED;
1462 /*
1463 * keep RD only if we didn't have the file open RW,
1464 * because then the mds would revoke it anyway to
1465 * journal max_size=0.
1466 */
1467 if (ci->i_max_size == 0)
1468 retain |= CEPH_CAP_ANY_RD;
1469 }
1470 }
1471
1472 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1473 " issued %s revoking %s retain %s %s%s%s\n", inode,
1474 ceph_cap_string(file_wanted),
1475 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1476 ceph_cap_string(ci->i_flushing_caps),
1477 ceph_cap_string(issued), ceph_cap_string(revoking),
1478 ceph_cap_string(retain),
1479 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1480 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1481 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1482
1483 /*
1484 * If we no longer need to hold onto our old caps, and we may
1485 * have cached pages, but don't want them, then try to invalidate.
1486 * If we fail, it's because pages are locked.... try again later.
1487 */
1488 if ((!is_delayed || mdsc->stopping) &&
1489 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1490 ci->i_rdcache_gen && /* may have cached pages */
1491 (file_wanted == 0 || /* no open files */
1492 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
1493 !tried_invalidate) {
1494 dout("check_caps trying to invalidate on %p\n", inode);
1495 if (try_nonblocking_invalidate(inode) < 0) {
1496 if (revoking & CEPH_CAP_FILE_CACHE) {
1497 dout("check_caps queuing invalidate\n");
1498 queue_invalidate = 1;
1499 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1500 } else {
1501 dout("check_caps failed to invalidate pages\n");
1502 /* we failed to invalidate pages. check these
1503 caps again later. */
1504 force_requeue = 1;
1505 __cap_set_timeouts(mdsc, ci);
1506 }
1507 }
1508 tried_invalidate = 1;
1509 goto retry_locked;
1510 }
1511
1512 num = 0;
1513 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1514 cap = rb_entry(p, struct ceph_cap, ci_node);
1515 num++;
1516
1517 /* avoid looping forever */
1518 if (mds >= cap->mds ||
1519 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1520 continue;
1521
1522 /* NOTE: no side-effects allowed, until we take s_mutex */
1523
1524 revoking = cap->implemented & ~cap->issued;
1525 if (revoking)
1526 dout(" mds%d revoking %s\n", cap->mds,
1527 ceph_cap_string(revoking));
1528
1529 if (cap == ci->i_auth_cap &&
1530 (cap->issued & CEPH_CAP_FILE_WR)) {
1531 /* request larger max_size from MDS? */
1532 if (ci->i_wanted_max_size > ci->i_max_size &&
1533 ci->i_wanted_max_size > ci->i_requested_max_size) {
1534 dout("requesting new max_size\n");
1535 goto ack;
1536 }
1537
1538 /* approaching file_max? */
1539 if ((inode->i_size << 1) >= ci->i_max_size &&
1540 (ci->i_reported_size << 1) < ci->i_max_size) {
1541 dout("i_size approaching max_size\n");
1542 goto ack;
1543 }
1544 }
1545 /* flush anything dirty? */
1546 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1547 ci->i_dirty_caps) {
1548 dout("flushing dirty caps\n");
1549 goto ack;
1550 }
1551
1552 /* completed revocation? going down and there are no caps? */
1553 if (revoking && (revoking & used) == 0) {
1554 dout("completed revocation of %s\n",
1555 ceph_cap_string(cap->implemented & ~cap->issued));
1556 goto ack;
1557 }
1558
1559 /* want more caps from mds? */
1560 if (want & ~(cap->mds_wanted | cap->issued))
1561 goto ack;
1562
1563 /* things we might delay */
1564 if ((cap->issued & ~retain) == 0 &&
1565 cap->mds_wanted == want)
1566 continue; /* nope, all good */
1567
1568 if (is_delayed)
1569 goto ack;
1570
1571 /* delay? */
1572 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1573 time_before(jiffies, ci->i_hold_caps_max)) {
1574 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1575 ceph_cap_string(cap->issued),
1576 ceph_cap_string(cap->issued & retain),
1577 ceph_cap_string(cap->mds_wanted),
1578 ceph_cap_string(want));
1579 delayed++;
1580 continue;
1581 }
1582
1583ack:
1584 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1585 dout(" skipping %p I_NOFLUSH set\n", inode);
1586 continue;
1587 }
1588
1589 if (session && session != cap->session) {
1590 dout("oops, wrong session %p mutex\n", session);
1591 mutex_unlock(&session->s_mutex);
1592 session = NULL;
1593 }
1594 if (!session) {
1595 session = cap->session;
1596 if (mutex_trylock(&session->s_mutex) == 0) {
1597 dout("inverting session/ino locks on %p\n",
1598 session);
1599 spin_unlock(&inode->i_lock);
1600 if (took_snap_rwsem) {
1601 up_read(&mdsc->snap_rwsem);
1602 took_snap_rwsem = 0;
1603 }
1604 mutex_lock(&session->s_mutex);
1605 goto retry;
1606 }
1607 }
1608 /* take snap_rwsem after session mutex */
1609 if (!took_snap_rwsem) {
1610 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1611 dout("inverting snap/in locks on %p\n",
1612 inode);
1613 spin_unlock(&inode->i_lock);
1614 down_read(&mdsc->snap_rwsem);
1615 took_snap_rwsem = 1;
1616 goto retry;
1617 }
1618 took_snap_rwsem = 1;
1619 }
1620
1621 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1622 flushing = __mark_caps_flushing(inode, session);
1623
1624 mds = cap->mds; /* remember mds, so we don't repeat */
1625 sent++;
1626
1627 /* __send_cap drops i_lock */
1628 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1629 retain, flushing, NULL);
1630 goto retry; /* retake i_lock and restart our cap scan. */
1631 }
1632
1633 /*
1634 * Reschedule delayed caps release if we delayed anything,
1635 * otherwise cancel.
1636 */
1637 if (delayed && is_delayed)
1638 force_requeue = 1; /* __send_cap delayed release; requeue */
1639 if (!delayed && !is_delayed)
1640 __cap_delay_cancel(mdsc, ci);
1641 else if (!is_delayed || force_requeue)
1642 __cap_delay_requeue(mdsc, ci);
1643
1644 spin_unlock(&inode->i_lock);
1645
1646 if (queue_invalidate)
1647 ceph_queue_invalidate(inode);
1648
1649 if (session)
1650 mutex_unlock(&session->s_mutex);
1651 if (took_snap_rwsem)
1652 up_read(&mdsc->snap_rwsem);
1653}
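
Both here and in __ceph_flush_snaps(), taking s_mutex while holding i_lock uses the classic lock-inversion retry idiom: try the mutex opportunistically, and on contention back all the way out, take the mutex blocking, then retake the spinlock and rescan. A hedged pthread model of just the locking skeleton (the kernel code must additionally revalidate everything it saw before dropping the lock, hence the goto retry labels):

    #include <pthread.h>

    static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER; /* i_lock  */
    static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* s_mutex */

    /* Acquire @outer while already holding @inner without ever
     * blocking on @outer under @inner.  (In the kernel @inner is a
     * spinlock, so blocking under it is forbidden outright.) */
    static void lock_both(void)
    {
            pthread_mutex_lock(&inner);
            if (pthread_mutex_trylock(&outer) != 0) {
                    pthread_mutex_unlock(&inner);   /* drop i_lock      */
                    pthread_mutex_lock(&outer);     /* sleep on s_mutex */
                    pthread_mutex_lock(&inner);     /* retake i_lock    */
            }
            /* both locks held here; caller must recheck its state */
    }
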
1654
1655/*
1656 * Try to flush dirty caps back to the auth mds.
1657 */
1658static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1659 unsigned *flush_tid)
1660{
1661 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1662 struct ceph_inode_info *ci = ceph_inode(inode);
1663 int unlock_session = session ? 0 : 1;
1664 int flushing = 0;
1665
1666retry:
1667 spin_lock(&inode->i_lock);
1668 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1669 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1670 goto out;
1671 }
1672 if (ci->i_dirty_caps && ci->i_auth_cap) {
1673 struct ceph_cap *cap = ci->i_auth_cap;
1674 int used = __ceph_caps_used(ci);
1675 int want = __ceph_caps_wanted(ci);
1676 int delayed;
1677
1678 if (!session) {
1679 spin_unlock(&inode->i_lock);
1680 session = cap->session;
1681 mutex_lock(&session->s_mutex);
1682 goto retry;
1683 }
1684 BUG_ON(session != cap->session);
1685 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1686 goto out;
1687
1688 flushing = __mark_caps_flushing(inode, session);
1689
1690 /* __send_cap drops i_lock */
1691 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1692 cap->issued | cap->implemented, flushing,
1693 flush_tid);
1694 if (!delayed)
1695 goto out_unlocked;
1696
1697 spin_lock(&inode->i_lock);
1698 __cap_delay_requeue(mdsc, ci);
1699 }
1700out:
1701 spin_unlock(&inode->i_lock);
1702out_unlocked:
1703 if (session && unlock_session)
1704 mutex_unlock(&session->s_mutex);
1705 return flushing;
1706}
1707
1708/*
1709 * Return true if we've flushed caps through the given flush_tid.
1710 */
1711static int caps_are_flushed(struct inode *inode, unsigned tid)
1712{
1713 struct ceph_inode_info *ci = ceph_inode(inode);
1714 int dirty, i, ret = 1;
1715
1716 spin_lock(&inode->i_lock);
1717 dirty = __ceph_caps_dirty(ci);
1718 for (i = 0; i < CEPH_CAP_BITS; i++)
1719 if ((ci->i_flushing_caps & (1 << i)) &&
1720 ci->i_cap_flush_tid[i] <= tid) {
1721 /* still flushing this bit */
1722 ret = 0;
1723 break;
1724 }
1725 spin_unlock(&inode->i_lock);
1726 return ret;
1727}
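
Together, the per-bit tid stamping in __send_cap() and the check here implement the "flush AxFw, flush Fw, first ack clean Ax" scheme described earlier: each bit remembers the tid of its most recent flush, and a waiter is done once no in-flight bit carries a tid at or below the one it waited for. A standalone model (the names and the 16-bit width are invented; the kernel uses CEPH_CAP_BITS):

    #include <stdbool.h>

    #define CAP_BITS 16

    static unsigned last_tid;
    static unsigned bit_tid[CAP_BITS];      /* tid of latest flush per bit */
    static int flushing;                    /* bits with a flush in flight */

    /* __send_cap(): stamp every bit in @mask with a fresh tid. */
    static unsigned start_flush(int mask)
    {
            unsigned tid = ++last_tid;
            int i;

            for (i = 0; i < CAP_BITS; i++)
                    if (mask & (1 << i))
                            bit_tid[i] = tid;
            flushing |= mask;
            return tid;
    }

    /* MDS ack for a flush at @tid: retire only the bits whose most
     * recent flush this was, so a later re-flush keeps them pending. */
    static void finish_flush(int mask, unsigned tid)
    {
            int i;

            for (i = 0; i < CAP_BITS; i++)
                    if ((mask & (1 << i)) && bit_tid[i] == tid)
                            flushing &= ~(1 << i);
    }

    /* caps_are_flushed(): done once no in-flight bit belongs to a
     * flush at or before @tid. */
    static bool flushed_through(unsigned tid)
    {
            int i;

            for (i = 0; i < CAP_BITS; i++)
                    if ((flushing & (1 << i)) && bit_tid[i] <= tid)
                            return false;
            return true;
    }
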
1728
1729/*
1730 * Wait on any unsafe replies for the given inode. First wait on the
1731 * newest request, and make that the upper bound. Then, if there are
1732 * more requests, keep waiting on the oldest as long as it is still older
1733 * than the original request.
1734 */
1735static void sync_write_wait(struct inode *inode)
1736{
1737 struct ceph_inode_info *ci = ceph_inode(inode);
1738 struct list_head *head = &ci->i_unsafe_writes;
1739 struct ceph_osd_request *req;
1740 u64 last_tid;
1741
1742 spin_lock(&ci->i_unsafe_lock);
1743 if (list_empty(head))
1744 goto out;
1745
1746 /* set upper bound as _last_ entry in chain */
1747 req = list_entry(head->prev, struct ceph_osd_request,
1748 r_unsafe_item);
1749 last_tid = req->r_tid;
1750
1751 do {
1752 ceph_osdc_get_request(req);
1753 spin_unlock(&ci->i_unsafe_lock);
1754 dout("sync_write_wait on tid %llu (until %llu)\n",
1755 req->r_tid, last_tid);
1756 wait_for_completion(&req->r_safe_completion);
1757 spin_lock(&ci->i_unsafe_lock);
1758 ceph_osdc_put_request(req);
1759
1760 /*
1761 * from here on look at first entry in chain, since we
1762 * only want to wait for anything older than last_tid
1763 */
1764 if (list_empty(head))
1765 break;
1766 req = list_entry(head->next, struct ceph_osd_request,
1767 r_unsafe_item);
1768 } while (req->r_tid < last_tid);
1769out:
1770 spin_unlock(&ci->i_unsafe_lock);
1771}
1772
1773int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
1774{
1775 struct inode *inode = dentry->d_inode;
1776 struct ceph_inode_info *ci = ceph_inode(inode);
1777 unsigned flush_tid;
1778 int ret;
1779 int dirty;
1780
1781 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1782 sync_write_wait(inode);
1783
1784 ret = filemap_write_and_wait(inode->i_mapping);
1785 if (ret < 0)
1786 return ret;
1787
1788 dirty = try_flush_caps(inode, NULL, &flush_tid);
1789 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1790
1791 /*
1792 * only wait on non-file metadata writeback (the mds
1793 * can recover size and mtime, so we don't need to
1794 * wait for that)
1795 */
1796 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1797 dout("fsync waiting for flush_tid %u\n", flush_tid);
1798 ret = wait_event_interruptible(ci->i_cap_wq,
1799 caps_are_flushed(inode, flush_tid));
1800 }
1801
1802 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1803 return ret;
1804}
1805
1806/*
1807 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1808 * queue inode for flush but don't do so immediately, because we can
1809 * get by with fewer MDS messages if we wait for data writeback to
1810 * complete first.
1811 */
1812int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1813{
1814 struct ceph_inode_info *ci = ceph_inode(inode);
1815 unsigned flush_tid;
1816 int err = 0;
1817 int dirty;
1818 int wait = wbc->sync_mode == WB_SYNC_ALL;
1819
1820 dout("write_inode %p wait=%d\n", inode, wait);
1821 if (wait) {
1822 dirty = try_flush_caps(inode, NULL, &flush_tid);
1823 if (dirty)
1824 err = wait_event_interruptible(ci->i_cap_wq,
1825 caps_are_flushed(inode, flush_tid));
1826 } else {
1827 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1828
1829 spin_lock(&inode->i_lock);
1830 if (__ceph_caps_dirty(ci))
1831 __cap_delay_requeue_front(mdsc, ci);
1832 spin_unlock(&inode->i_lock);
1833 }
1834 return err;
1835}
1836
1837/*
1838 * After a recovering MDS goes active, we need to resend any caps
1839 * we were flushing.
1840 *
1841 * Caller holds session->s_mutex.
1842 */
1843static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1844 struct ceph_mds_session *session)
1845{
1846 struct ceph_cap_snap *capsnap;
1847
1848 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1849 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1850 flushing_item) {
1851 struct ceph_inode_info *ci = capsnap->ci;
1852 struct inode *inode = &ci->vfs_inode;
1853 struct ceph_cap *cap;
1854
1855 spin_lock(&inode->i_lock);
1856 cap = ci->i_auth_cap;
1857 if (cap && cap->session == session) {
1858 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1859 cap, capsnap);
1860 __ceph_flush_snaps(ci, &session);
1861 } else {
1862 pr_err("%p auth cap %p not mds%d ???\n", inode,
1863 cap, session->s_mds);
1864 spin_unlock(&inode->i_lock);
1865 }
1866 }
1867}
1868
1869void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1870 struct ceph_mds_session *session)
1871{
1872 struct ceph_inode_info *ci;
1873
1874 kick_flushing_capsnaps(mdsc, session);
1875
1876 dout("kick_flushing_caps mds%d\n", session->s_mds);
1877 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1878 struct inode *inode = &ci->vfs_inode;
1879 struct ceph_cap *cap;
1880 int delayed = 0;
1881
1882 spin_lock(&inode->i_lock);
1883 cap = ci->i_auth_cap;
1884 if (cap && cap->session == session) {
1885 dout("kick_flushing_caps %p cap %p %s\n", inode,
1886 cap, ceph_cap_string(ci->i_flushing_caps));
1887 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1888 __ceph_caps_used(ci),
1889 __ceph_caps_wanted(ci),
1890 cap->issued | cap->implemented,
1891 ci->i_flushing_caps, NULL);
1892 if (delayed) {
1893 spin_lock(&inode->i_lock);
1894 __cap_delay_requeue(mdsc, ci);
1895 spin_unlock(&inode->i_lock);
1896 }
1897 } else {
1898 pr_err("%p auth cap %p not mds%d ???\n", inode,
1899 cap, session->s_mds);
1900 spin_unlock(&inode->i_lock);
1901 }
1902 }
1903}
1904
1905
1906/*
1907 * Take references to capabilities we hold, so that we don't release
1908 * them to the MDS prematurely.
1909 *
1910 * Protected by i_lock.
1911 */
1912static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1913{
1914 if (got & CEPH_CAP_PIN)
1915 ci->i_pin_ref++;
1916 if (got & CEPH_CAP_FILE_RD)
1917 ci->i_rd_ref++;
1918 if (got & CEPH_CAP_FILE_CACHE)
1919 ci->i_rdcache_ref++;
1920 if (got & CEPH_CAP_FILE_WR)
1921 ci->i_wr_ref++;
1922 if (got & CEPH_CAP_FILE_BUFFER) {
1923 if (ci->i_wrbuffer_ref == 0)
1924 igrab(&ci->vfs_inode);
1925 ci->i_wrbuffer_ref++;
1926 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1927 &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1928 }
1929}
1930
1931/*
1932 * Try to grab cap references. Specify those refs we @want, and the
1933 * minimal set we @need. Also include the larger offset we are writing
1934 * to (when applicable), and check against max_size here as well.
1935 * Note that caller is responsible for ensuring max_size increases are
1936 * requested from the MDS.
1937 */
1938static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
1939 int *got, loff_t endoff, int *check_max, int *err)
1940{
1941 struct inode *inode = &ci->vfs_inode;
1942 int ret = 0;
1943 int have, implemented;
1944 int file_wanted;
1945
1946 dout("get_cap_refs %p need %s want %s\n", inode,
1947 ceph_cap_string(need), ceph_cap_string(want));
1948 spin_lock(&inode->i_lock);
1949
1950 /* make sure file is actually open */
1951 file_wanted = __ceph_caps_file_wanted(ci);
1952 if ((file_wanted & need) == 0) {
1953 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
1954 ceph_cap_string(need), ceph_cap_string(file_wanted));
1955 *err = -EBADF;
1956 ret = 1;
1957 goto out;
1958 }
1959
1960 if (need & CEPH_CAP_FILE_WR) {
1961 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
1962 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
1963 inode, endoff, ci->i_max_size);
1964 if (endoff > ci->i_wanted_max_size) {
1965 *check_max = 1;
1966 ret = 1;
1967 }
1968 goto out;
1969 }
1970 /*
1971 * If a sync write is in progress, we must wait, so that we
1972 * can get a final snapshot value for size+mtime.
1973 */
1974 if (__ceph_have_pending_cap_snap(ci)) {
1975 dout("get_cap_refs %p cap_snap_pending\n", inode);
1976 goto out;
1977 }
1978 }
1979 have = __ceph_caps_issued(ci, &implemented);
1980
1981 /*
1982 * disallow writes while a truncate is pending
1983 */
1984 if (ci->i_truncate_pending)
1985 have &= ~CEPH_CAP_FILE_WR;
1986
1987 if ((have & need) == need) {
1988 /*
1989 * Look at (implemented & ~have & not) so that we keep waiting
1990 * on transition from wanted -> needed caps. This is needed
1991		 * for WRBUFFER|WR -> WR to keep a new WR sync write from
1992		 * starting before a prior buffered writeback happens.
1993 */
1994 int not = want & ~(have & need);
1995 int revoking = implemented & ~have;
1996 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
1997 inode, ceph_cap_string(have), ceph_cap_string(not),
1998 ceph_cap_string(revoking));
1999 if ((revoking & not) == 0) {
2000 *got = need | (have & want);
2001 __take_cap_refs(ci, *got);
2002 ret = 1;
2003 }
2004 } else {
2005 dout("get_cap_refs %p have %s needed %s\n", inode,
2006 ceph_cap_string(have), ceph_cap_string(need));
2007 }
2008out:
2009 spin_unlock(&inode->i_lock);
2010 dout("get_cap_refs %p ret %d got %s\n", inode,
2011 ret, ceph_cap_string(*got));
2012 return ret;
2013}
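
The "not" and "revoking" masks above are what make a caller keep waiting across a cap transition. A standalone user-space sketch of just that bit logic, with the FILE cap constants duplicated from ceph_fs.h and an illustrative WRBUFFER|WR -> WR scenario:

#include <stdio.h>

#define CEPH_CAP_SFILE       8
#define CEPH_CAP_FILE_WR     (16 << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_BUFFER (32 << CEPH_CAP_SFILE)

int main(void)
{
	/* MDS is revoking BUFFER but the revoke is not yet acked */
	int need = CEPH_CAP_FILE_WR;
	int want = CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER;
	int have = CEPH_CAP_FILE_WR;
	int implemented = have | CEPH_CAP_FILE_BUFFER;

	int not = want & ~(have & need);
	int revoking = implemented & ~have;

	/* nonzero: keep waiting so a new WR sync write cannot
	 * overtake the buffered writeback still in flight */
	printf("wait: %s\n", (revoking & not) ? "yes" : "no");
	return 0;
}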
2014
2015/*
2016 * Check the offset we are writing up to against our current
2017 * max_size. If necessary, tell the MDS we want to write to
2018 * a larger offset.
2019 */
2020static void check_max_size(struct inode *inode, loff_t endoff)
2021{
2022 struct ceph_inode_info *ci = ceph_inode(inode);
2023 int check = 0;
2024
2025 /* do we need to explicitly request a larger max_size? */
2026 spin_lock(&inode->i_lock);
2027 if ((endoff >= ci->i_max_size ||
2028 endoff > (inode->i_size << 1)) &&
2029 endoff > ci->i_wanted_max_size) {
2030 dout("write %p at large endoff %llu, req max_size\n",
2031 inode, endoff);
2032 ci->i_wanted_max_size = endoff;
2033 check = 1;
2034 }
2035 spin_unlock(&inode->i_lock);
2036 if (check)
2037 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2038}
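
A compilable restatement of the heuristic above (the sizes are illustrative): a larger max_size is requested once the write endpoint reaches the granted max_size or doubles i_size, and only if it exceeds what was already asked for:

#include <stdio.h>

static int need_bigger_max(long long endoff, long long max_size,
			   long long i_size, long long wanted)
{
	return (endoff >= max_size || endoff > (i_size << 1)) &&
		endoff > wanted;
}

int main(void)
{
	/* 4 MB granted, file is 1 MB, appending past the grant */
	printf("%d\n", need_bigger_max(5 << 20, 4 << 20, 1 << 20, 0));
	return 0;
}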
2039
2040/*
2041 * Wait for caps, and take cap references. If we can't get a WR cap
2042 * due to a small max_size, make sure we check_max_size (and possibly
2043 * ask the mds) so we don't get hung up indefinitely.
2044 */
2045int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2046 loff_t endoff)
2047{
2048 int check_max, ret, err;
2049
2050retry:
2051 if (endoff > 0)
2052 check_max_size(&ci->vfs_inode, endoff);
2053 check_max = 0;
2054 err = 0;
2055 ret = wait_event_interruptible(ci->i_cap_wq,
2056 try_get_cap_refs(ci, need, want,
2057 got, endoff,
2058 &check_max, &err));
2059 if (err)
2060 ret = err;
2061 if (check_max)
2062 goto retry;
2063 return ret;
2064}
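
A hedged sketch of a caller, loosely modeled on the client write path (simplified, not the actual fs/ceph/file.c code; example_write is a made-up name): take WR as the minimum, BUFFER opportunistically, before dirtying data, and drop the refs afterwards:

static ssize_t example_write(struct ceph_inode_info *ci,
			     loff_t pos, size_t len)
{
	int got = 0;
	int err;

	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR,	/* need */
			    CEPH_CAP_FILE_BUFFER,	/* want */
			    &got, pos + len);
	if (err < 0)
		return err;

	/* ... dirty pages / do the write while holding refs in got ... */

	ceph_put_cap_refs(ci, got);
	return len;
}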
2065
2066/*
2067 * Take cap refs. Caller must already know we hold at least one ref
2068 * on the caps in question or we don't know this is safe.
2069 */
2070void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2071{
2072 spin_lock(&ci->vfs_inode.i_lock);
2073 __take_cap_refs(ci, caps);
2074 spin_unlock(&ci->vfs_inode.i_lock);
2075}
2076
2077/*
2078 * Release cap refs.
2079 *
2080 * If we released the last ref on any given cap, call ceph_check_caps
2081 * to release (or schedule a release).
2082 *
2083 * If we are releasing a WR cap (from a sync write), finalize any affected
2084 * cap_snap, and wake up any waiters.
2085 */
2086void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2087{
2088 struct inode *inode = &ci->vfs_inode;
2089 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2090 struct ceph_cap_snap *capsnap;
2091
2092 spin_lock(&inode->i_lock);
2093 if (had & CEPH_CAP_PIN)
2094 --ci->i_pin_ref;
2095 if (had & CEPH_CAP_FILE_RD)
2096 if (--ci->i_rd_ref == 0)
2097 last++;
2098 if (had & CEPH_CAP_FILE_CACHE)
2099 if (--ci->i_rdcache_ref == 0)
2100 last++;
2101 if (had & CEPH_CAP_FILE_BUFFER) {
2102 if (--ci->i_wrbuffer_ref == 0) {
2103 last++;
2104 put++;
2105 }
2106 dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
2107 inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
2108 }
2109 if (had & CEPH_CAP_FILE_WR)
2110 if (--ci->i_wr_ref == 0) {
2111 last++;
2112 if (!list_empty(&ci->i_cap_snaps)) {
2113 capsnap = list_first_entry(&ci->i_cap_snaps,
2114 struct ceph_cap_snap,
2115 ci_item);
2116 if (capsnap->writing) {
2117 capsnap->writing = 0;
2118 flushsnaps =
2119 __ceph_finish_cap_snap(ci,
2120 capsnap);
2121 wake = 1;
2122 }
2123 }
2124 }
2125 spin_unlock(&inode->i_lock);
2126
2127 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
2128 last ? " last" : "", put ? " put" : "");
2129
2130 if (last && !flushsnaps)
2131 ceph_check_caps(ci, 0, NULL);
2132 else if (flushsnaps)
2133 ceph_flush_snaps(ci);
2134 if (wake)
2135 wake_up(&ci->i_cap_wq);
2136 if (put)
2137 iput(inode);
2138}
2139
2140/*
2141 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2142 * context. Adjust per-snap dirty page accounting as appropriate.
2143 * Once all dirty data for a cap_snap is flushed, flush snapped file
2144 * metadata back to the MDS. If we dropped the last ref, call
2145 * ceph_check_caps.
2146 */
2147void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2148 struct ceph_snap_context *snapc)
2149{
2150 struct inode *inode = &ci->vfs_inode;
2151 int last = 0;
2152 int complete_capsnap = 0;
2153 int drop_capsnap = 0;
2154 int found = 0;
2155 struct ceph_cap_snap *capsnap = NULL;
2156
2157 spin_lock(&inode->i_lock);
2158 ci->i_wrbuffer_ref -= nr;
2159 last = !ci->i_wrbuffer_ref;
2160
2161 if (ci->i_head_snapc == snapc) {
2162 ci->i_wrbuffer_ref_head -= nr;
2163 if (!ci->i_wrbuffer_ref_head) {
2164 ceph_put_snap_context(ci->i_head_snapc);
2165 ci->i_head_snapc = NULL;
2166 }
2167 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2168 inode,
2169 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2170 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2171 last ? " LAST" : "");
2172 } else {
2173 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2174 if (capsnap->context == snapc) {
2175 found = 1;
2176 break;
2177 }
2178 }
2179 BUG_ON(!found);
2180 capsnap->dirty_pages -= nr;
2181 if (capsnap->dirty_pages == 0) {
2182 complete_capsnap = 1;
2183 if (capsnap->dirty == 0)
2184 /* cap writeback completed before we created
2185 * the cap_snap; no FLUSHSNAP is needed */
2186 drop_capsnap = 1;
2187 }
2188 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2189 " snap %lld %d/%d -> %d/%d %s%s%s\n",
2190 inode, capsnap, capsnap->context->seq,
2191 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2192 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2193 last ? " (wrbuffer last)" : "",
2194 complete_capsnap ? " (complete capsnap)" : "",
2195 drop_capsnap ? " (drop capsnap)" : "");
2196 if (drop_capsnap) {
2197 ceph_put_snap_context(capsnap->context);
2198 list_del(&capsnap->ci_item);
2199 list_del(&capsnap->flushing_item);
2200 ceph_put_cap_snap(capsnap);
2201 }
2202 }
2203
2204 spin_unlock(&inode->i_lock);
2205
2206 if (last) {
2207 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2208 iput(inode);
2209 } else if (complete_capsnap) {
2210 ceph_flush_snaps(ci);
2211 wake_up(&ci->i_cap_wq);
2212 }
2213 if (drop_capsnap)
2214 iput(inode);
2215}
2216
2217/*
2218 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2219 * actually be a revocation if it specifies a smaller cap set.)
2220 *
2221 * caller holds s_mutex and i_lock, we drop both.
2222 *
2223 * check_caps value (the function itself returns void):
2224 * 0 - ok
2225 * 1 - check_caps on auth cap only (writeback)
2226 * 2 - check_caps (ack revoke)
2227 */
2228static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2229 struct ceph_mds_session *session,
2230 struct ceph_cap *cap,
2231 struct ceph_buffer *xattr_buf)
2232 __releases(inode->i_lock)
2233 __releases(session->s_mutex)
2234{
2235 struct ceph_inode_info *ci = ceph_inode(inode);
2236 int mds = session->s_mds;
2237 int seq = le32_to_cpu(grant->seq);
2238 int newcaps = le32_to_cpu(grant->caps);
2239 int issued, implemented, used, wanted, dirty;
2240 u64 size = le64_to_cpu(grant->size);
2241 u64 max_size = le64_to_cpu(grant->max_size);
2242 struct timespec mtime, atime, ctime;
2243 int check_caps = 0;
2244 int wake = 0;
2245 int writeback = 0;
2246 int revoked_rdcache = 0;
2247 int queue_invalidate = 0;
2248
2249 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2250 inode, cap, mds, seq, ceph_cap_string(newcaps));
2251 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2252 inode->i_size);
2253
2254 /*
2255 * If CACHE is being revoked, and we have no dirty buffers,
2256 * try to invalidate (once). (If there are dirty buffers, we
2257 * will invalidate _after_ writeback.)
2258 */
2259 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2260 !ci->i_wrbuffer_ref) {
2261 if (try_nonblocking_invalidate(inode) == 0) {
2262 revoked_rdcache = 1;
2263 } else {
2264 /* there were locked pages.. invalidate later
2265 in a separate thread. */
2266 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2267 queue_invalidate = 1;
2268 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2269 }
2270 }
2271 }
2272
2273 /* side effects now are allowed */
2274
2275 issued = __ceph_caps_issued(ci, &implemented);
2276 issued |= implemented | __ceph_caps_dirty(ci);
2277
2278 cap->cap_gen = session->s_cap_gen;
2279
2280 __check_cap_issue(ci, cap, newcaps);
2281
2282 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2283 inode->i_mode = le32_to_cpu(grant->mode);
2284 inode->i_uid = le32_to_cpu(grant->uid);
2285 inode->i_gid = le32_to_cpu(grant->gid);
2286 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2287 inode->i_uid, inode->i_gid);
2288 }
2289
2290 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2291 inode->i_nlink = le32_to_cpu(grant->nlink);
2292
2293 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2294 int len = le32_to_cpu(grant->xattr_len);
2295 u64 version = le64_to_cpu(grant->xattr_version);
2296
2297 if (version > ci->i_xattrs.version) {
2298 dout(" got new xattrs v%llu on %p len %d\n",
2299 version, inode, len);
2300 if (ci->i_xattrs.blob)
2301 ceph_buffer_put(ci->i_xattrs.blob);
2302 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2303 ci->i_xattrs.version = version;
2304 }
2305 }
2306
2307 /* size/ctime/mtime/atime? */
2308 ceph_fill_file_size(inode, issued,
2309 le32_to_cpu(grant->truncate_seq),
2310 le64_to_cpu(grant->truncate_size), size);
2311 ceph_decode_timespec(&mtime, &grant->mtime);
2312 ceph_decode_timespec(&atime, &grant->atime);
2313 ceph_decode_timespec(&ctime, &grant->ctime);
2314 ceph_fill_file_time(inode, issued,
2315 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2316 &atime);
2317
2318 /* max size increase? */
2319 if (max_size != ci->i_max_size) {
2320 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2321 ci->i_max_size = max_size;
2322 if (max_size >= ci->i_wanted_max_size) {
2323 ci->i_wanted_max_size = 0; /* reset */
2324 ci->i_requested_max_size = 0;
2325 }
2326 wake = 1;
2327 }
2328
2329 /* check cap bits */
2330 wanted = __ceph_caps_wanted(ci);
2331 used = __ceph_caps_used(ci);
2332 dirty = __ceph_caps_dirty(ci);
2333 dout(" my wanted = %s, used = %s, dirty %s\n",
2334 ceph_cap_string(wanted),
2335 ceph_cap_string(used),
2336 ceph_cap_string(dirty));
2337 if (wanted != le32_to_cpu(grant->wanted)) {
2338 dout("mds wanted %s -> %s\n",
2339 ceph_cap_string(le32_to_cpu(grant->wanted)),
2340 ceph_cap_string(wanted));
2341 grant->wanted = cpu_to_le32(wanted);
2342 }
2343
2344 cap->seq = seq;
2345
2346 /* file layout may have changed */
2347 ci->i_layout = grant->layout;
2348
2349 /* revocation, grant, or no-op? */
2350 if (cap->issued & ~newcaps) {
2351 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
2352 ceph_cap_string(newcaps));
2353 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
2354 writeback = 1; /* will delay ack */
2355 else if (dirty & ~newcaps)
2356 check_caps = 1; /* initiate writeback in check_caps */
2357 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
2358 revoked_rdcache)
2359 check_caps = 2; /* send revoke ack in check_caps */
2360 cap->issued = newcaps;
2361 cap->implemented |= newcaps;
2362 } else if (cap->issued == newcaps) {
2363 dout("caps unchanged: %s -> %s\n",
2364 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2365 } else {
2366 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2367 ceph_cap_string(newcaps));
2368 cap->issued = newcaps;
2369 cap->implemented |= newcaps; /* add bits only, to
2370 * avoid stepping on a
2371 * pending revocation */
2372 wake = 1;
2373 }
2374 BUG_ON(cap->issued & ~cap->implemented);
2375
2376 spin_unlock(&inode->i_lock);
2377 if (writeback)
2378 /*
2379 * queue inode for writeback: we can't actually call
2380 * filemap_write_and_wait, etc. from message handler
2381 * context.
2382 */
2383 ceph_queue_writeback(inode);
2384 if (queue_invalidate)
2385 ceph_queue_invalidate(inode);
2386 if (wake)
2387 wake_up(&ci->i_cap_wq);
2388
2389 if (check_caps == 1)
2390 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2391 session);
2392 else if (check_caps == 2)
2393 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
2394 else
2395 mutex_unlock(&session->s_mutex);
2396}
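
The revocation/no-op/grant split near the end of the function hinges entirely on two bit tests. A user-space sketch with illustrative cap masks:

#include <stdio.h>

static const char *classify(int issued, int newcaps)
{
	if (issued & ~newcaps)
		return "revocation";	/* losing at least one bit */
	if (issued == newcaps)
		return "no-op";
	return "grant";			/* strictly gaining bits */
}

int main(void)
{
	printf("%s\n", classify(0x30, 0x10));	/* revocation */
	printf("%s\n", classify(0x10, 0x30));	/* grant */
	printf("%s\n", classify(0x10, 0x10));	/* no-op */
	return 0;
}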
2397
2398/*
2399 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2400 * MDS has been safely committed.
2401 */
2402static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2403 struct ceph_mds_caps *m,
2404 struct ceph_mds_session *session,
2405 struct ceph_cap *cap)
2406 __releases(inode->i_lock)
2407{
2408 struct ceph_inode_info *ci = ceph_inode(inode);
2409 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
2410 unsigned seq = le32_to_cpu(m->seq);
2411 int dirty = le32_to_cpu(m->dirty);
2412 int cleaned = 0;
2413 int drop = 0;
2414 int i;
2415
2416 for (i = 0; i < CEPH_CAP_BITS; i++)
2417 if ((dirty & (1 << i)) &&
2418 flush_tid == ci->i_cap_flush_tid[i])
2419 cleaned |= 1 << i;
2420
2421 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2422 " flushing %s -> %s\n",
2423 inode, session->s_mds, seq, ceph_cap_string(dirty),
2424 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2425 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2426
2427 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2428 goto out;
2429
2430 ci->i_flushing_caps &= ~cleaned;
2431
2432 spin_lock(&mdsc->cap_dirty_lock);
2433 if (ci->i_flushing_caps == 0) {
2434 list_del_init(&ci->i_flushing_item);
2435 if (!list_empty(&session->s_cap_flushing))
2436 dout(" mds%d still flushing cap on %p\n",
2437 session->s_mds,
2438 &list_entry(session->s_cap_flushing.next,
2439 struct ceph_inode_info,
2440 i_flushing_item)->vfs_inode);
2441 mdsc->num_cap_flushing--;
2442 wake_up(&mdsc->cap_flushing_wq);
2443 dout(" inode %p now !flushing\n", inode);
2444
2445 if (ci->i_dirty_caps == 0) {
2446 dout(" inode %p now clean\n", inode);
2447 BUG_ON(!list_empty(&ci->i_dirty_item));
2448 drop = 1;
2449 } else {
2450 BUG_ON(list_empty(&ci->i_dirty_item));
2451 }
2452 }
2453 spin_unlock(&mdsc->cap_dirty_lock);
2454 wake_up(&ci->i_cap_wq);
2455
2456out:
2457 spin_unlock(&inode->i_lock);
2458 if (drop)
2459 iput(inode);
2460}
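
Each dirty cap bit records the tid of the flush that carried it, so an ack only cleans the bits whose recorded tid matches; a bit re-dirtied and re-flushed under a newer tid stays dirty. A standalone sketch of that loop, with illustrative tids:

#include <stdio.h>

#define CAP_BITS 16

int main(void)
{
	unsigned long long flush_tid = 42;	/* tid in this ack */
	unsigned long long cap_flush_tid[CAP_BITS] = {0};
	int dirty = (1 << 2) | (1 << 5);
	int cleaned = 0, i;

	cap_flush_tid[2] = 42;	/* flushed by tid 42 */
	cap_flush_tid[5] = 43;	/* newer flush still in flight */

	for (i = 0; i < CAP_BITS; i++)
		if ((dirty & (1 << i)) && flush_tid == cap_flush_tid[i])
			cleaned |= 1 << i;

	printf("cleaned 0x%x\n", cleaned);	/* 0x4: bit 5 stays dirty */
	return 0;
}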
2461
2462/*
2463 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2464 * throw away our cap_snap.
2465 *
2466 * Caller holds s_mutex.
2467 */
2468static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2469 struct ceph_mds_caps *m,
2470 struct ceph_mds_session *session)
2471{
2472 struct ceph_inode_info *ci = ceph_inode(inode);
2473 u64 follows = le64_to_cpu(m->snap_follows);
2474 struct ceph_cap_snap *capsnap;
2475 int drop = 0;
2476
2477 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2478 inode, ci, session->s_mds, follows);
2479
2480 spin_lock(&inode->i_lock);
2481 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2482 if (capsnap->follows == follows) {
2483 if (capsnap->flush_tid != flush_tid) {
2484 dout(" cap_snap %p follows %lld tid %lld !="
2485 " %lld\n", capsnap, follows,
2486 flush_tid, capsnap->flush_tid);
2487 break;
2488 }
2489 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2490 dout(" removing %p cap_snap %p follows %lld\n",
2491 inode, capsnap, follows);
2492 ceph_put_snap_context(capsnap->context);
2493 list_del(&capsnap->ci_item);
2494 list_del(&capsnap->flushing_item);
2495 ceph_put_cap_snap(capsnap);
2496 drop = 1;
2497 break;
2498 } else {
2499 dout(" skipping cap_snap %p follows %lld\n",
2500 capsnap, capsnap->follows);
2501 }
2502 }
2503 spin_unlock(&inode->i_lock);
2504 if (drop)
2505 iput(inode);
2506}
2507
2508/*
2509 * Handle TRUNC from MDS, indicating file truncation.
2510 *
2511 * caller holds s_mutex.
2512 */
2513static void handle_cap_trunc(struct inode *inode,
2514 struct ceph_mds_caps *trunc,
2515 struct ceph_mds_session *session)
2516 __releases(inode->i_lock)
2517{
2518 struct ceph_inode_info *ci = ceph_inode(inode);
2519 int mds = session->s_mds;
2520 int seq = le32_to_cpu(trunc->seq);
2521 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2522 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2523 u64 size = le64_to_cpu(trunc->size);
2524 int implemented = 0;
2525 int dirty = __ceph_caps_dirty(ci);
2526	int issued = __ceph_caps_issued(ci, &implemented);
2527 int queue_trunc = 0;
2528
2529 issued |= implemented | dirty;
2530
2531 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2532 inode, mds, seq, truncate_size, truncate_seq);
2533 queue_trunc = ceph_fill_file_size(inode, issued,
2534 truncate_seq, truncate_size, size);
2535 spin_unlock(&inode->i_lock);
2536
2537 if (queue_trunc)
2538 ceph_queue_vmtruncate(inode);
2539}
2540
2541/*
2542 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2543 * different one. If we are the most recent migration we've seen (as
2544 * indicated by mseq), make note of the migrating cap bits for the
2545 * duration (until we see the corresponding IMPORT).
2546 *
2547 * caller holds s_mutex
2548 */
2549static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2550 struct ceph_mds_session *session)
2551{
2552 struct ceph_inode_info *ci = ceph_inode(inode);
2553 int mds = session->s_mds;
2554 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2555 struct ceph_cap *cap = NULL, *t;
2556 struct rb_node *p;
2557 int remember = 1;
2558
2559 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2560 inode, ci, mds, mseq);
2561
2562 spin_lock(&inode->i_lock);
2563
2564 /* make sure we haven't seen a higher mseq */
2565 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2566 t = rb_entry(p, struct ceph_cap, ci_node);
2567 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2568 dout(" higher mseq on cap from mds%d\n",
2569 t->session->s_mds);
2570 remember = 0;
2571 }
2572 if (t->session->s_mds == mds)
2573 cap = t;
2574 }
2575
2576 if (cap) {
2577 if (remember) {
2578 /* make note */
2579 ci->i_cap_exporting_mds = mds;
2580 ci->i_cap_exporting_mseq = mseq;
2581 ci->i_cap_exporting_issued = cap->issued;
2582 }
2583 __ceph_remove_cap(cap);
2584 }
2585 /* else, we already released it */
2586
2587 spin_unlock(&inode->i_lock);
2588}
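
The mseq ordering above relies on wrapping sequence comparison. A sketch of the usual serial-number idiom (an assumption here; see ceph_seq_cmp in the ceph headers for the real definition):

#include <stdio.h>

static int seq_cmp(unsigned a, unsigned b)
{
	return (int)(a - b);	/* sign gives order despite wraparound */
}

int main(void)
{
	/* 1 is "after" 0xffffffff even though it is numerically smaller */
	printf("%d\n", seq_cmp(1, 0xffffffffu) > 0);	/* prints 1 */
	return 0;
}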
2589
2590/*
2591 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2592 * clean them up.
2593 *
2594 * caller holds s_mutex.
2595 */
2596static void handle_cap_import(struct ceph_mds_client *mdsc,
2597 struct inode *inode, struct ceph_mds_caps *im,
2598 struct ceph_mds_session *session,
2599 void *snaptrace, int snaptrace_len)
2600{
2601 struct ceph_inode_info *ci = ceph_inode(inode);
2602 int mds = session->s_mds;
2603 unsigned issued = le32_to_cpu(im->caps);
2604 unsigned wanted = le32_to_cpu(im->wanted);
2605 unsigned seq = le32_to_cpu(im->seq);
2606 unsigned mseq = le32_to_cpu(im->migrate_seq);
2607 u64 realmino = le64_to_cpu(im->realm);
2608 u64 cap_id = le64_to_cpu(im->cap_id);
2609
2610 if (ci->i_cap_exporting_mds >= 0 &&
2611 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2612 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2613 " - cleared exporting from mds%d\n",
2614 inode, ci, mds, mseq,
2615 ci->i_cap_exporting_mds);
2616 ci->i_cap_exporting_issued = 0;
2617 ci->i_cap_exporting_mseq = 0;
2618 ci->i_cap_exporting_mds = -1;
2619 } else {
2620 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2621 inode, ci, mds, mseq);
2622 }
2623
2624 down_write(&mdsc->snap_rwsem);
2625 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2626 false);
2627 downgrade_write(&mdsc->snap_rwsem);
2628 ceph_add_cap(inode, session, cap_id, -1,
2629 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2630 NULL /* no caps context */);
2631 try_flush_caps(inode, session, NULL);
2632 up_read(&mdsc->snap_rwsem);
2633}
2634
2635/*
2636 * Handle a caps message from the MDS.
2637 *
2638 * Identify the appropriate session, inode, and call the right handler
2639 * based on the cap op.
2640 */
2641void ceph_handle_caps(struct ceph_mds_session *session,
2642 struct ceph_msg *msg)
2643{
2644 struct ceph_mds_client *mdsc = session->s_mdsc;
2645 struct super_block *sb = mdsc->client->sb;
2646 struct inode *inode;
2647 struct ceph_cap *cap;
2648 struct ceph_mds_caps *h;
2649 int mds = session->s_mds;
2650 int op;
2651 u32 seq;
2652 struct ceph_vino vino;
2653 u64 cap_id;
2654 u64 size, max_size;
2655 u64 tid;
2656 void *snaptrace;
2657
2658 dout("handle_caps from mds%d\n", mds);
2659
2660 /* decode */
2661 tid = le64_to_cpu(msg->hdr.tid);
2662 if (msg->front.iov_len < sizeof(*h))
2663 goto bad;
2664 h = msg->front.iov_base;
2665 snaptrace = h + 1;
2666 op = le32_to_cpu(h->op);
2667 vino.ino = le64_to_cpu(h->ino);
2668 vino.snap = CEPH_NOSNAP;
2669 cap_id = le64_to_cpu(h->cap_id);
2670 seq = le32_to_cpu(h->seq);
2671 size = le64_to_cpu(h->size);
2672 max_size = le64_to_cpu(h->max_size);
2673
2674 mutex_lock(&session->s_mutex);
2675 session->s_seq++;
2676 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2677 (unsigned)seq);
2678
2679 /* lookup ino */
2680 inode = ceph_find_inode(sb, vino);
2681 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2682 vino.snap, inode);
2683 if (!inode) {
2684 dout(" i don't have ino %llx\n", vino.ino);
2685 goto done;
2686 }
2687
2688 /* these will work even if we don't have a cap yet */
2689 switch (op) {
2690 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2691 handle_cap_flushsnap_ack(inode, tid, h, session);
2692 goto done;
2693
2694 case CEPH_CAP_OP_EXPORT:
2695 handle_cap_export(inode, h, session);
2696 goto done;
2697
2698 case CEPH_CAP_OP_IMPORT:
2699 handle_cap_import(mdsc, inode, h, session,
2700 snaptrace, le32_to_cpu(h->snap_trace_len));
2701 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2702 session);
2703 goto done_unlocked;
2704 }
2705
2706 /* the rest require a cap */
2707 spin_lock(&inode->i_lock);
2708 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2709 if (!cap) {
2710 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
2711 inode, ceph_ino(inode), ceph_snap(inode), mds);
2712 spin_unlock(&inode->i_lock);
2713 goto done;
2714 }
2715
2716 /* note that each of these drops i_lock for us */
2717 switch (op) {
2718 case CEPH_CAP_OP_REVOKE:
2719 case CEPH_CAP_OP_GRANT:
2720 handle_cap_grant(inode, h, session, cap, msg->middle);
2721 goto done_unlocked;
2722
2723 case CEPH_CAP_OP_FLUSH_ACK:
2724 handle_cap_flush_ack(inode, tid, h, session, cap);
2725 break;
2726
2727 case CEPH_CAP_OP_TRUNC:
2728 handle_cap_trunc(inode, h, session);
2729 break;
2730
2731 default:
2732 spin_unlock(&inode->i_lock);
2733 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2734 ceph_cap_op_name(op));
2735 }
2736
2737done:
2738 mutex_unlock(&session->s_mutex);
2739done_unlocked:
2740 if (inode)
2741 iput(inode);
2742 return;
2743
2744bad:
2745 pr_err("ceph_handle_caps: corrupt message\n");
2746 ceph_msg_dump(msg);
2747 return;
2748}
2749
2750/*
2751 * Delayed work handler to process end of delayed cap release LRU list.
2752 */
2753void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2754{
2755 struct ceph_inode_info *ci;
2756 int flags = CHECK_CAPS_NODELAY;
2757
2758 dout("check_delayed_caps\n");
2759 while (1) {
2760 spin_lock(&mdsc->cap_delay_lock);
2761 if (list_empty(&mdsc->cap_delay_list))
2762 break;
2763 ci = list_first_entry(&mdsc->cap_delay_list,
2764 struct ceph_inode_info,
2765 i_cap_delay_list);
2766 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2767 time_before(jiffies, ci->i_hold_caps_max))
2768 break;
2769 list_del_init(&ci->i_cap_delay_list);
2770 spin_unlock(&mdsc->cap_delay_lock);
2771 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2772 ceph_check_caps(ci, flags, NULL);
2773 }
2774 spin_unlock(&mdsc->cap_delay_lock);
2775}
2776
2777/*
2778 * Flush all dirty caps to the mds
2779 */
2780void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2781{
2782 struct ceph_inode_info *ci, *nci = NULL;
2783 struct inode *inode, *ninode = NULL;
2784 struct list_head *p, *n;
2785
2786 dout("flush_dirty_caps\n");
2787 spin_lock(&mdsc->cap_dirty_lock);
2788 list_for_each_safe(p, n, &mdsc->cap_dirty) {
2789 if (nci) {
2790 ci = nci;
2791 inode = ninode;
2792 ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
2793 dout("flush_dirty_caps inode %p (was next inode)\n",
2794 inode);
2795 } else {
2796 ci = list_entry(p, struct ceph_inode_info,
2797 i_dirty_item);
2798 inode = igrab(&ci->vfs_inode);
2799 BUG_ON(!inode);
2800 dout("flush_dirty_caps inode %p\n", inode);
2801 }
2802 if (n != &mdsc->cap_dirty) {
2803 nci = list_entry(n, struct ceph_inode_info,
2804 i_dirty_item);
2805 ninode = igrab(&nci->vfs_inode);
2806 BUG_ON(!ninode);
2807 nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2808 dout("flush_dirty_caps next inode %p, noflush\n",
2809 ninode);
2810 } else {
2811 nci = NULL;
2812 ninode = NULL;
2813 }
2814 spin_unlock(&mdsc->cap_dirty_lock);
2815 if (inode) {
2816 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
2817 NULL);
2818 iput(inode);
2819 }
2820 spin_lock(&mdsc->cap_dirty_lock);
2821 }
2822 spin_unlock(&mdsc->cap_dirty_lock);
2823}
2824
2825/*
2826 * Drop open file reference. If we were the last open file,
2827 * we may need to release capabilities to the MDS (or schedule
2828 * their delayed release).
2829 */
2830void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2831{
2832 struct inode *inode = &ci->vfs_inode;
2833 int last = 0;
2834
2835 spin_lock(&inode->i_lock);
2836 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2837 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2838 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2839 if (--ci->i_nr_by_mode[fmode] == 0)
2840 last++;
2841 spin_unlock(&inode->i_lock);
2842
2843 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2844 ceph_check_caps(ci, 0, NULL);
2845}
2846
2847/*
2848 * Helpers for embedding cap and dentry lease releases into mds
2849 * requests.
2850 *
2851 * @force is used by dentry_release (below) to force inclusion of a
2852 * record for the directory inode, even when there aren't any caps to
2853 * drop.
2854 */
2855int ceph_encode_inode_release(void **p, struct inode *inode,
2856 int mds, int drop, int unless, int force)
2857{
2858 struct ceph_inode_info *ci = ceph_inode(inode);
2859 struct ceph_cap *cap;
2860 struct ceph_mds_request_release *rel = *p;
2861 int ret = 0;
2862 int used = 0;
2863
2864 spin_lock(&inode->i_lock);
2865 used = __ceph_caps_used(ci);
2866
2867 dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
2868 mds, ceph_cap_string(used), ceph_cap_string(drop),
2869 ceph_cap_string(unless));
2870
2871 /* only drop unused caps */
2872 drop &= ~used;
2873
2874 cap = __get_cap_for_mds(ci, mds);
2875 if (cap && __cap_is_valid(cap)) {
2876 if (force ||
2877 ((cap->issued & drop) &&
2878 (cap->issued & unless) == 0)) {
2879 if ((cap->issued & drop) &&
2880 (cap->issued & unless) == 0) {
2881 dout("encode_inode_release %p cap %p %s -> "
2882 "%s\n", inode, cap,
2883 ceph_cap_string(cap->issued),
2884 ceph_cap_string(cap->issued & ~drop));
2885 cap->issued &= ~drop;
2886 cap->implemented &= ~drop;
2887 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2888 int wanted = __ceph_caps_wanted(ci);
2889 dout(" wanted %s -> %s (act %s)\n",
2890 ceph_cap_string(cap->mds_wanted),
2891 ceph_cap_string(cap->mds_wanted &
2892 ~wanted),
2893 ceph_cap_string(wanted));
2894 cap->mds_wanted &= wanted;
2895 }
2896 } else {
2897 dout("encode_inode_release %p cap %p %s"
2898 " (force)\n", inode, cap,
2899 ceph_cap_string(cap->issued));
2900 }
2901
2902 rel->ino = cpu_to_le64(ceph_ino(inode));
2903 rel->cap_id = cpu_to_le64(cap->cap_id);
2904 rel->seq = cpu_to_le32(cap->seq);
2905			rel->issue_seq = cpu_to_le32(cap->issue_seq);
2906 rel->mseq = cpu_to_le32(cap->mseq);
2907 rel->caps = cpu_to_le32(cap->issued);
2908 rel->wanted = cpu_to_le32(cap->mds_wanted);
2909 rel->dname_len = 0;
2910 rel->dname_seq = 0;
2911 *p += sizeof(*rel);
2912 ret = 1;
2913 } else {
2914 dout("encode_inode_release %p cap %p %s\n",
2915 inode, cap, ceph_cap_string(cap->issued));
2916 }
2917 }
2918 spin_unlock(&inode->i_lock);
2919 return ret;
2920}
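
The "drop &= ~used" step above means an in-flight operation silently shrinks the release. A standalone example, with the FILE cap constants duplicated from ceph_fs.h:

#include <stdio.h>

#define CEPH_CAP_SFILE      8
#define CEPH_CAP_FILE_CACHE (4 << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_RD    (8 << CEPH_CAP_SFILE)

int main(void)
{
	int drop = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD;
	int used = CEPH_CAP_FILE_RD;	/* a read is in flight */

	drop &= ~used;			/* only drop unused caps */
	printf("drop 0x%x\n", drop);	/* 0x400: FILE_CACHE only */
	return 0;
}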
2921
2922int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2923 int mds, int drop, int unless)
2924{
2925 struct inode *dir = dentry->d_parent->d_inode;
2926 struct ceph_mds_request_release *rel = *p;
2927 struct ceph_dentry_info *di = ceph_dentry(dentry);
2928 int force = 0;
2929 int ret;
2930
2931 /*
2932	 * force a record for the directory caps if we have a dentry lease.
2933 * this is racy (can't take i_lock and d_lock together), but it
2934 * doesn't have to be perfect; the mds will revoke anything we don't
2935 * release.
2936 */
2937 spin_lock(&dentry->d_lock);
2938 if (di->lease_session && di->lease_session->s_mds == mds)
2939 force = 1;
2940 spin_unlock(&dentry->d_lock);
2941
2942 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
2943
2944 spin_lock(&dentry->d_lock);
2945 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
2946 dout("encode_dentry_release %p mds%d seq %d\n",
2947 dentry, mds, (int)di->lease_seq);
2948 rel->dname_len = cpu_to_le32(dentry->d_name.len);
2949 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2950 *p += dentry->d_name.len;
2951 rel->dname_seq = cpu_to_le32(di->lease_seq);
2952 }
2953 spin_unlock(&dentry->d_lock);
2954 return ret;
2955}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
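
A user-space analogue of the pretty-debug variant, to show what the file:line prefix buys (file_part here is a stand-in for ceph_file_part, whose real definition lives elsewhere in the tree):

#include <stdio.h>
#include <string.h>

static const char *file_part(const char *s)
{
	const char *slash = strrchr(s, '/');
	return slash ? slash + 1 : s;	/* basename of __FILE__ */
}

#define dout(fmt, ...) \
	printf(" %12.12s:%-4d : " fmt, file_part(__FILE__), \
	       __LINE__, ##__VA_ARGS__)

int main(void)
{
	dout("handle_caps from mds%d\n", 0);
	return 0;
}

Output resembles "       demo.c:19   : handle_caps from mds0"; in the kernel, the pr_fmt definition above additionally prepends the module name.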
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
1/*
2 * Ceph 'frag' type
3 */
4#include "types.h"
5
6int ceph_frag_compare(__u32 a, __u32 b)
7{
8 unsigned va = ceph_frag_value(a);
9 unsigned vb = ceph_frag_value(b);
10 if (va < vb)
11 return -1;
12 if (va > vb)
13 return 1;
14 va = ceph_frag_bits(a);
15 vb = ceph_frag_bits(b);
16 if (va < vb)
17 return -1;
18 if (va > vb)
19 return 1;
20 return 0;
21}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef _FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
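
A worked example of the encoding (user space, duplicating two of the helpers above so it compiles standalone): the root frag splits into the one-bit children 0x1000000 (left, value 0) and 0x1800000 (right, value 0x800000), and any hash value lands in exactly one of them:

#include <stdio.h>

typedef unsigned int u32;

static u32 frag_make(u32 b, u32 v)
{
	return (b << 24) | (v & (0xffffffu << (24 - b)) & 0xffffffu);
}

static int frag_contains_value(u32 f, u32 v)
{
	u32 mask = (0xffffffu << (24 - (f >> 24))) & 0xffffffu;
	return (v & mask) == (f & 0xffffffu);
}

int main(void)
{
	u32 left = frag_make(1, 0);		/* 0x1000000 */
	u32 right = frag_make(1, 0x800000);	/* 0x1800000 */

	printf("left 0x%x right 0x%x\n", left, right);
	/* 0xabcdef has its top (24th) value bit set -> right half */
	printf("in right? %d in left? %d\n",
	       frag_contains_value(right, 0xabcdef),
	       frag_contains_value(left, 0xabcdef));
	return 0;
}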
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
28
29
30int ceph_flags_to_mode(int flags)
31{
32#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN;
35#endif
36#ifdef O_LAZY
37 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY;
39#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42
43 flags &= O_ACCMODE;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49}
50
51int ceph_caps_for_mode(int mode)
52{
53 switch (mode) {
54 case CEPH_FILE_MODE_PIN:
55 return CEPH_CAP_PIN;
56 case CEPH_FILE_MODE_RD:
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR:
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 }
73 return 0;
74}
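
Tracing an open(2) through the two helpers: O_RDWR maps to CEPH_FILE_MODE_RDWR, which in turn wants the full FILE cap set plus shared/exclusive auth and xattr caps. A trimmed user-space copy of ceph_flags_to_mode() (the O_DIRECTORY/O_LAZY special cases omitted):

#include <fcntl.h>
#include <stdio.h>

static int flags_to_mode(int flags)
{
	if ((flags & O_APPEND) == O_APPEND)
		flags |= O_WRONLY;	/* append implies write */
	flags &= O_ACCMODE;
	if ((flags & O_RDWR) == O_RDWR)
		return 3;		/* CEPH_FILE_MODE_RDWR */
	if ((flags & O_WRONLY) == O_WRONLY)
		return 2;		/* CEPH_FILE_MODE_WR */
	return 1;			/* CEPH_FILE_MODE_RD */
}

int main(void)
{
	printf("%d %d %d\n",
	       flags_to_mode(O_RDONLY),			/* 1 */
	       flags_to_mode(O_WRONLY | O_APPEND),	/* 2 */
	       flags_to_mode(O_RDWR));			/* 3 */
	return 0;
}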
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..0c2241ef3653
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific message types or high-level
34 * protocols change, bump the affected components. we keep revs for
35 * internal cluster protocols separate from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_SUPPORTED 0
57#define CEPH_FEATURE_REQUIRED 0
58
59
60/*
61 * ceph_file_layout - describe data layout for a file/inode
62 */
63struct ceph_file_layout {
64 /* file -> object mapping */
65 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
66 of page size. */
67 __le32 fl_stripe_count; /* over this many objects */
68 __le32 fl_object_size; /* until objects are this big, then move to
69 new objects */
70 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
71
72 /* pg -> disk layout */
73 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
74
75 /* object -> pg layout */
76 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
77 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
78} __attribute__ ((packed));
79
80#define CEPH_MIN_STRIPE_UNIT 65536
81
82int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
83
84
85/* crypto algorithms */
86#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1
88
89/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2
93
94
95/*********************************************
96 * message layer
97 */
98
99/*
100 * message types
101 */
102
103/* misc */
104#define CEPH_MSG_SHUTDOWN 1
105#define CEPH_MSG_PING 2
106
107/* client <-> monitor */
108#define CEPH_MSG_MON_MAP 4
109#define CEPH_MSG_MON_GET_MAP 5
110#define CEPH_MSG_STATFS 13
111#define CEPH_MSG_STATFS_REPLY 14
112#define CEPH_MSG_MON_SUBSCRIBE 15
113#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
114#define CEPH_MSG_AUTH 17
115#define CEPH_MSG_AUTH_REPLY 18
116
117/* client <-> mds */
118#define CEPH_MSG_MDS_MAP 21
119
120#define CEPH_MSG_CLIENT_SESSION 22
121#define CEPH_MSG_CLIENT_RECONNECT 23
122
123#define CEPH_MSG_CLIENT_REQUEST 24
124#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
125#define CEPH_MSG_CLIENT_REPLY 26
126#define CEPH_MSG_CLIENT_CAPS 0x310
127#define CEPH_MSG_CLIENT_LEASE 0x311
128#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130
131/* osd */
132#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43
135
136struct ceph_mon_request_header {
137 __le64 have_version;
138 __le16 session_mon;
139 __le64 session_mon_tid;
140} __attribute__ ((packed));
141
142struct ceph_mon_statfs {
143 struct ceph_mon_request_header monhdr;
144 struct ceph_fsid fsid;
145} __attribute__ ((packed));
146
147struct ceph_statfs {
148 __le64 kb, kb_used, kb_avail;
149 __le64 num_objects;
150} __attribute__ ((packed));
151
152struct ceph_mon_statfs_reply {
153 struct ceph_fsid fsid;
154 __le64 version;
155 struct ceph_statfs st;
156} __attribute__ ((packed));
157
158struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid;
161 __le32 start;
162} __attribute__ ((packed));
163
164struct ceph_mds_getmap {
165 struct ceph_mon_request_header monhdr;
166 struct ceph_fsid fsid;
167} __attribute__ ((packed));
168
169struct ceph_client_mount {
170 struct ceph_mon_request_header monhdr;
171} __attribute__ ((packed));
172
173struct ceph_mon_subscribe_item {
174 __le64 have_version; __le64 have;
175 __u8 onetime;
176} __attribute__ ((packed));
177
178struct ceph_mon_subscribe_ack {
179 __le32 duration; /* seconds */
180 struct ceph_fsid fsid;
181} __attribute__ ((packed));
182
183/*
184 * mds states
185 * > 0 -> in
186 * <= 0 -> out
187 */
188#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
189#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
190 empty log. */
191#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
192#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
193#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
194#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
195#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
196
197#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
198#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
199 operations (import, rename, etc.) */
200#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
201#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
202#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
203#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
204#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
205
206extern const char *ceph_mds_state_name(int s);
207
208
209/*
210 * metadata lock types.
211 * - these are bitmasks.. we can compose them
212 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds
214 */
215#define CEPH_LOCK_DN 1
216#define CEPH_LOCK_ISNAP 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */
218#define CEPH_LOCK_IFILE 8 /* mds internal */
219#define CEPH_LOCK_IAUTH 32
220#define CEPH_LOCK_ILINK 64
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */
222#define CEPH_LOCK_INEST 256 /* mds internal */
223#define CEPH_LOCK_IXATTR 512
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
225
226/* client_session ops */
227enum {
228 CEPH_SESSION_REQUEST_OPEN,
229 CEPH_SESSION_OPEN,
230 CEPH_SESSION_REQUEST_CLOSE,
231 CEPH_SESSION_CLOSE,
232 CEPH_SESSION_REQUEST_RENEWCAPS,
233 CEPH_SESSION_RENEWCAPS,
234 CEPH_SESSION_STALE,
235 CEPH_SESSION_RECALL_STATE,
236};
237
238extern const char *ceph_session_op_name(int op);
239
240struct ceph_mds_session_head {
241 __le32 op;
242 __le64 seq;
243 struct ceph_timespec stamp;
244 __le32 max_caps, max_leases;
245} __attribute__ ((packed));
246
247/* client_request */
248/*
249 * metadata ops.
250 * & 0x001000 -> write op
251 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
252 *  & 0x100000 -> use weird ino/path trace
253 */
254#define CEPH_MDS_OP_WRITE 0x001000
255enum {
256 CEPH_MDS_OP_LOOKUP = 0x00100,
257 CEPH_MDS_OP_GETATTR = 0x00101,
258 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
259 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
260
261 CEPH_MDS_OP_SETXATTR = 0x01105,
262 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108,
265
266 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202,
268 CEPH_MDS_OP_UNLINK = 0x01203,
269 CEPH_MDS_OP_RENAME = 0x01204,
270 CEPH_MDS_OP_MKDIR = 0x01220,
271 CEPH_MDS_OP_RMDIR = 0x01221,
272 CEPH_MDS_OP_SYMLINK = 0x01222,
273
274 CEPH_MDS_OP_CREATE = 0x01301,
275 CEPH_MDS_OP_OPEN = 0x00302,
276 CEPH_MDS_OP_READDIR = 0x00305,
277
278 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
279 CEPH_MDS_OP_MKSNAP = 0x01400,
280 CEPH_MDS_OP_RMSNAP = 0x01401,
281 CEPH_MDS_OP_LSSNAP = 0x00402,
282};
283
284extern const char *ceph_mds_op_name(int op);
285
286
287#define CEPH_SETATTR_MODE 1
288#define CEPH_SETATTR_UID 2
289#define CEPH_SETATTR_GID 4
290#define CEPH_SETATTR_MTIME 8
291#define CEPH_SETATTR_ATIME 16
292#define CEPH_SETATTR_SIZE 32
293#define CEPH_SETATTR_CTIME 64
294
295union ceph_mds_request_args {
296 struct {
297 __le32 mask; /* CEPH_CAP_* */
298 } __attribute__ ((packed)) getattr;
299 struct {
300 __le32 mode;
301 __le32 uid;
302 __le32 gid;
303 struct ceph_timespec mtime;
304 struct ceph_timespec atime;
305 __le64 size, old_size; /* old_size needed by truncate */
306 __le32 mask; /* CEPH_SETATTR_* */
307 } __attribute__ ((packed)) setattr;
308 struct {
309 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */
311 } __attribute__ ((packed)) readdir;
312 struct {
313 __le32 mode;
314 __le32 rdev;
315 } __attribute__ ((packed)) mknod;
316 struct {
317 __le32 mode;
318 } __attribute__ ((packed)) mkdir;
319 struct {
320 __le32 flags;
321 __le32 mode;
322 __le32 stripe_unit; /* layout for newly created file */
323 __le32 stripe_count; /* ... */
324 __le32 object_size;
325 __le32 file_replication;
326 __le32 preferred;
327 } __attribute__ ((packed)) open;
328 struct {
329 __le32 flags;
330 } __attribute__ ((packed)) setxattr;
331 struct {
332 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout;
334} __attribute__ ((packed));
335
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
337#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
338
339struct ceph_mds_request_head {
340 __le64 oldest_client_tid;
341 __le32 mdsmap_epoch; /* on client */
342 __le32 flags; /* CEPH_MDS_FLAG_* */
343 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
344	__le16 num_releases;	   /* # of cap/lease release records included */
345 __le32 op; /* mds op code */
346 __le32 caller_uid, caller_gid;
347 __le64 ino; /* use this ino for openc, mkdir, mknod,
348 etc. (if replaying) */
349 union ceph_mds_request_args args;
350} __attribute__ ((packed));
351
352/* cap/lease release record */
353struct ceph_mds_request_release {
354 __le64 ino, cap_id; /* ino and unique cap id */
355 __le32 caps, wanted; /* new issued, wanted */
356 __le32 seq, issue_seq, mseq;
357 __le32 dname_seq; /* if releasing a dentry lease, a */
358 __le32 dname_len; /* string follows. */
359} __attribute__ ((packed));
360
361/* client reply */
362struct ceph_mds_reply_head {
363 __le32 op;
364 __le32 result;
365 __le32 mdsmap_epoch;
366 __u8 safe; /* true if committed to disk */
367 __u8 is_dentry, is_target; /* true if dentry, target inode records
368 are included with reply */
369} __attribute__ ((packed));
370
371/* one for each node split */
372struct ceph_frag_tree_split {
373 __le32 frag; /* this frag splits... */
374 __le32 by; /* ...by this many bits */
375} __attribute__ ((packed));
376
377struct ceph_frag_tree_head {
378 __le32 nsplits; /* num ceph_frag_tree_split records */
379 struct ceph_frag_tree_split splits[];
380} __attribute__ ((packed));
381
382/* capability issue, for bundling with mds reply */
383struct ceph_mds_reply_cap {
384 __le32 caps, wanted; /* caps issued, wanted */
385 __le64 cap_id;
386 __le32 seq, mseq;
387 __le64 realm; /* snap realm */
388 __u8 flags; /* CEPH_CAP_FLAG_* */
389} __attribute__ ((packed));
390
391#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
392
393/* inode record, for bundling with mds reply */
394struct ceph_mds_reply_inode {
395 __le64 ino;
396 __le64 snapid;
397 __le32 rdev;
398 __le64 version; /* inode version */
399 __le64 xattr_version; /* version for xattr blob */
400 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
401 struct ceph_file_layout layout;
402 struct ceph_timespec ctime, mtime, atime;
403 __le32 time_warp_seq;
404 __le64 size, max_size, truncate_size;
405 __le32 truncate_seq;
406 __le32 mode, uid, gid;
407 __le32 nlink;
408 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
409 struct ceph_timespec rctime;
410 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
411} __attribute__ ((packed));
412/* followed by frag array, then symlink string, then xattr blob */
413
414/* reply_lease follows dname, and reply_inode */
415struct ceph_mds_reply_lease {
416 __le16 mask; /* lease type(s) */
417 __le32 duration_ms; /* lease duration */
418 __le32 seq;
419} __attribute__ ((packed));
420
421struct ceph_mds_reply_dirfrag {
422 __le32 frag; /* fragment */
423 __le32 auth; /* auth mds, if this is a delegation point */
424 __le32 ndist; /* number of mds' this is replicated on */
425 __le32 dist[];
426} __attribute__ ((packed));
427
428/* file access modes */
429#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1
431#define CEPH_FILE_MODE_WR 2
432#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
433#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
434#define CEPH_FILE_MODE_NUM 8 /* because these are bit fields.. mostly */
435
436int ceph_flags_to_mode(int flags);
437
438
439/* capability bits */
440#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
441
442/* generic cap bits */
443#define CEPH_CAP_GSHARED     1  /* client can read */
444#define CEPH_CAP_GEXCL 2 /* client can read and update */
445#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
446#define CEPH_CAP_GRD 8 /* (file) client can read */
447#define CEPH_CAP_GWR 16 /* (file) client can write */
448#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
449#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
450#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
451
452/* per-lock shift */
453#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
457
458#define CEPH_CAP_BITS 16
459
460/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
462#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
463#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
464#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
465#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
466#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
467#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
468#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
469#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
470#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
471#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
472#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
476
477/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
479#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
480#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
481#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
482#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
483#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
484#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
485#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
486#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
487#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
488#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
489#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
490#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
491 CEPH_CAP_AUTH_SHARED | \
492 CEPH_CAP_LINK_SHARED | \
493 CEPH_CAP_FILE_SHARED | \
494 CEPH_CAP_XATTR_SHARED)
495
496#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
497 CEPH_CAP_LINK_SHARED | \
498 CEPH_CAP_XATTR_SHARED | \
499 CEPH_CAP_FILE_SHARED)
500#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
501 CEPH_CAP_FILE_CACHE)
502
503#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
504 CEPH_CAP_LINK_EXCL | \
505 CEPH_CAP_XATTR_EXCL | \
506 CEPH_CAP_FILE_EXCL)
507#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
508 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
512
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR)
515
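
The composed values are plain shifts of the generic bits into each lock's slot; a few standalone sanity checks of that arithmetic (constants duplicated from above):

#include <assert.h>

#define CEPH_CAP_GEXCL    2
#define CEPH_CAP_GWR     16
#define CEPH_CAP_GLAZYIO 128
#define CEPH_CAP_SAUTH    2
#define CEPH_CAP_SFILE    8

int main(void)
{
	assert((CEPH_CAP_GWR << CEPH_CAP_SFILE) == 0x1000);	/* FILE_WR */
	assert((CEPH_CAP_GEXCL << CEPH_CAP_SAUTH) == 0x8);	/* AUTH_EXCL */
	/* the widest FILE bit still fits under CEPH_CAP_BITS (16) */
	assert((CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE) < (1 << 16));
	return 0;
}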
516int ceph_caps_for_mode(int mode);
517
518enum {
519 CEPH_CAP_OP_GRANT, /* mds->client grant */
520 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
521 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
522 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
523 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
524 CEPH_CAP_OP_UPDATE, /* client->mds update */
525 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
526 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
527 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
528 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
529 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
530 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
531 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
532};
533
534extern const char *ceph_cap_op_name(int op);
535
536/*
537 * caps message, used for capability callbacks, acks, requests, etc.
538 */
539struct ceph_mds_caps {
540 __le32 op; /* CEPH_CAP_OP_* */
541 __le64 ino, realm;
542 __le64 cap_id;
543 __le32 seq, issue_seq;
544 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
545 __le32 migrate_seq;
546 __le64 snap_follows;
547 __le32 snap_trace_len;
548
549 /* authlock */
550 __le32 uid, gid, mode;
551
552 /* linklock */
553 __le32 nlink;
554
555 /* xattrlock */
556 __le32 xattr_len;
557 __le64 xattr_version;
558
559 /* filelock */
560 __le64 size, max_size, truncate_size;
561 __le32 truncate_seq;
562 struct ceph_timespec mtime, atime, ctime;
563 struct ceph_file_layout layout;
564 __le32 time_warp_seq;
565} __attribute__ ((packed));
566
567/* cap release msg head */
568struct ceph_mds_cap_release {
569 __le32 num; /* number of cap_items that follow */
570} __attribute__ ((packed));
571
572struct ceph_mds_cap_item {
573 __le64 ino;
574 __le64 cap_id;
575 __le32 migrate_seq, seq;
576} __attribute__ ((packed));
577
578#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
579#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
580#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
581#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
582
583extern const char *ceph_lease_op_name(int o);
584
585/* lease msg header */
586struct ceph_mds_lease {
587 __u8 action; /* CEPH_MDS_LEASE_* */
588 __le16 mask; /* which lease */
589 __le64 ino;
590 __le64 first, last; /* snap range */
591 __le32 seq;
592 __le32 duration_ms; /* duration of renewal */
593} __attribute__ ((packed));
594/* followed by a __le32+string for dname */
595
596/* client reconnect */
597struct ceph_mds_cap_reconnect {
598 __le64 cap_id;
599 __le32 wanted;
600 __le32 issued;
601 __le64 size;
602 struct ceph_timespec mtime, atime;
603 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed));
606/* followed by encoded string */
607
608struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */
610 __le64 seq; /* snap seq for this snap realm */
611 __le64 parent; /* parent realm */
612} __attribute__ ((packed));
613
614/*
615 * snaps
616 */
617enum {
618 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
619 CEPH_SNAP_OP_CREATE,
620 CEPH_SNAP_OP_DESTROY,
621 CEPH_SNAP_OP_SPLIT,
622};
623
624extern const char *ceph_snap_op_name(int o);
625
626/* snap msg header */
627struct ceph_mds_snap_head {
628 __le32 op; /* CEPH_SNAP_OP_* */
629 __le64 split; /* ino to split off, if any */
630 __le32 num_split_inos; /* # inos belonging to new child realm */
631	__le32 num_split_realms;  /* # child realms under new child realm */
632 __le32 trace_len; /* size of snap trace blob */
633} __attribute__ ((packed));
634/* followed by split ino list, then split realms, then the trace blob */
635
636/*
637 * encode info about a snaprealm, as viewed by a client
638 */
639struct ceph_mds_snap_realm {
640 __le64 ino; /* ino */
641 __le64 created; /* snap: when created */
642 __le64 parent; /* ino: parent realm */
643 __le64 parent_since; /* snap: same parent since */
644 __le64 seq; /* snap: version */
645 __le32 num_snaps;
646 __le32 num_prior_parent_snaps;
647} __attribute__ ((packed));
648/* followed by my snap list, then prior parent snap list */
649
650#endif
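
The composite masks above let callers test whole groups of capability bits at once. As a hedged illustration of how they compose (the helper below is hypothetical, not part of this patch), a stat that only needs mtime is satisfied whenever the shared file cap is held:

	/* Hypothetical helper: CEPH_STAT_CAP_MTIME aliases CEPH_CAP_FILE_SHARED. */
	static inline int caps_cover_mtime(int issued)
	{
		return (issued & CEPH_STAT_CAP_MTIME) == CEPH_STAT_CAP_MTIME;
	}
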
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
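
ceph_str_hash() above is the dispatch point callers are expected to use; note that it returns (unsigned)-1 for an unrecognized type. A hedged sketch of a typical call site (the wrapper is illustrative only):

	/* Hash a directory entry name with the algorithm the dir specifies. */
	static unsigned hash_dentry_name(int dir_hash_type,
					 const char *name, unsigned len)
	{
		return ceph_str_hash(dir_hash_type, name, len);
	}
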
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68
69const char *ceph_mds_state_name(int s)
70{
71 switch (s) {
72 /* down and out */
73 case CEPH_MDS_STATE_DNE: return "down:dne";
74 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
75 /* up and out */
76 case CEPH_MDS_STATE_BOOT: return "up:boot";
77 case CEPH_MDS_STATE_STANDBY: return "up:standby";
78 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
79 case CEPH_MDS_STATE_CREATING: return "up:creating";
80 case CEPH_MDS_STATE_STARTING: return "up:starting";
81 /* up and in */
82 case CEPH_MDS_STATE_REPLAY: return "up:replay";
83 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
84 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
85 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
86 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
87 case CEPH_MDS_STATE_ACTIVE: return "up:active";
88 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
89 }
90 return "???";
91}
92
93const char *ceph_session_op_name(int op)
94{
95 switch (op) {
96 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
97 case CEPH_SESSION_OPEN: return "open";
98 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
99 case CEPH_SESSION_CLOSE: return "close";
100 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
101 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
102 case CEPH_SESSION_STALE: return "stale";
103 case CEPH_SESSION_RECALL_STATE: return "recall_state";
104 }
105 return "???";
106}
107
108const char *ceph_mds_op_name(int op)
109{
110 switch (op) {
111 case CEPH_MDS_OP_LOOKUP: return "lookup";
112 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
113 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
114 case CEPH_MDS_OP_GETATTR: return "getattr";
115 case CEPH_MDS_OP_SETXATTR: return "setxattr";
116 case CEPH_MDS_OP_SETATTR: return "setattr";
117 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
118 case CEPH_MDS_OP_READDIR: return "readdir";
119 case CEPH_MDS_OP_MKNOD: return "mknod";
120 case CEPH_MDS_OP_LINK: return "link";
121 case CEPH_MDS_OP_UNLINK: return "unlink";
122 case CEPH_MDS_OP_RENAME: return "rename";
123 case CEPH_MDS_OP_MKDIR: return "mkdir";
124 case CEPH_MDS_OP_RMDIR: return "rmdir";
125 case CEPH_MDS_OP_SYMLINK: return "symlink";
126 case CEPH_MDS_OP_CREATE: return "create";
127 case CEPH_MDS_OP_OPEN: return "open";
128 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
129 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
132 }
133 return "???";
134}
135
136const char *ceph_cap_op_name(int op)
137{
138 switch (op) {
139 case CEPH_CAP_OP_GRANT: return "grant";
140 case CEPH_CAP_OP_REVOKE: return "revoke";
141 case CEPH_CAP_OP_TRUNC: return "trunc";
142 case CEPH_CAP_OP_EXPORT: return "export";
143 case CEPH_CAP_OP_IMPORT: return "import";
144 case CEPH_CAP_OP_UPDATE: return "update";
145 case CEPH_CAP_OP_DROP: return "drop";
146 case CEPH_CAP_OP_FLUSH: return "flush";
147 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
148 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
149 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
150 case CEPH_CAP_OP_RELEASE: return "release";
151 case CEPH_CAP_OP_RENEW: return "renew";
152 }
153 return "???";
154}
155
156const char *ceph_lease_op_name(int o)
157{
158 switch (o) {
159 case CEPH_MDS_LEASE_REVOKE: return "revoke";
160 case CEPH_MDS_LEASE_RELEASE: return "release";
161 case CEPH_MDS_LEASE_RENEW: return "renew";
162 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
163 }
164 return "???";
165}
166
167const char *ceph_snap_op_name(int o)
168{
169 switch (o) {
170 case CEPH_SNAP_OP_UPDATE: return "update";
171 case CEPH_SNAP_OP_CREATE: return "create";
172 case CEPH_SNAP_OP_DESTROY: return "destroy";
173 case CEPH_SNAP_OP_SPLIT: return "split";
174 }
175 return "???";
176}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
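
crush_calc_parents() records, for each device and bucket, the id of the last bucket that contains it; a parent value of 0 means no parent was recorded. A hedged sketch of walking those arrays toward the root, mirroring the force-context loop in mapper.c later in this patch (the helper is illustrative and assumes an acyclic hierarchy):

	static int crush_root_of(struct crush_map *map, int item)
	{
		for (;;) {
			int parent = (item >= 0)
				? (int)map->device_parents[item]
				: (int)map->bucket_parents[-1 - item];
			if (parent == 0)
				return item;	/* nothing above us */
			item = parent;
		}
	}
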
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of a sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155	 * Parent pointers to identify the parent bucket of a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159	 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
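
Because struct crush_rule ends in a zero-length steps[] array, a rule and its steps are allocated as one block sized by crush_rule_size(). A hedged sketch of building a minimal take/choose/emit rule (illustrative only; the mask fields are left zeroed for the caller to fill in):

	static struct crush_rule *crush_alloc_simple_rule(int root, int n, int type)
	{
		struct crush_rule *r = kzalloc(crush_rule_size(3), GFP_NOFS);

		if (!r)
			return NULL;
		r->len = 3;
		r->steps[0].op = CRUSH_RULE_TAKE;
		r->steps[0].arg1 = root;	/* bucket id to start from */
		r->steps[1].op = CRUSH_RULE_CHOOSE_FIRSTN;
		r->steps[1].arg1 = n;		/* how many items to pick */
		r->steps[1].arg2 = type;	/* bucket type to pick at */
		r->steps[2].op = CRUSH_RULE_EMIT;
		return r;
	}
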
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
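
All of these hash functions are pure, so a given (type, inputs) tuple always yields the same draw; callers typically keep only the low 16 bits and treat them as a fixed-point fraction. A hedged sketch of that usage (the helper is illustrative):

	/* Stable 16-bit fixed-point draw for (x, bucket, r). */
	static __u32 crush_draw16(int hash_type, int x, int bucket_id, int r)
	{
		return crush_hash32_3(hash_type, x, bucket_id, r) & 0xffff;
	}
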
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..9ba54efb6543
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141		__u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if the device is marked "out" of the cluster
263 * (i.e., failed or fully offloaded)
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267	if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
309
310 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */
312 ftotal = 0;
313 skip_rep = 0;
314 do {
315 retry_descent = 0;
316 in = bucket; /* initial bucket */
317
318 /* choose through intervening buckets */
319 flocal = 0;
320 do {
321 collide = 0;
322 retry_bucket = 0;
323 r = rep;
324 if (in->alg == CRUSH_BUCKET_UNIFORM) {
325 /* be careful */
326 if (firstn || numrep >= in->size)
327 /* r' = r + f_total */
328 r += ftotal;
329 else if (in->size % numrep == 0)
330 /* r'=r+(n+1)*f_local */
331 r += (numrep+1) *
332 (flocal+ftotal);
333 else
334 /* r' = r + n*f_local */
335 r += numrep * (flocal+ftotal);
336 } else {
337 if (firstn)
338 /* r' = r + f_total */
339 r += ftotal;
340 else
341 /* r' = r + n*f_local */
342 r += numrep * (flocal+ftotal);
343 }
344
345 /* bucket choose */
346 if (in->size == 0) {
347 reject = 1;
348 goto reject;
349 }
350 if (flocal >= (in->size>>1) &&
351 flocal > orig_tries)
352 item = bucket_perm_choose(in, x, r);
353 else
354 item = crush_bucket_choose(in, x, r);
355 BUG_ON(item >= map->max_devices);
356
357 /* desired type? */
358 if (item < 0)
359 itemtype = map->buckets[-1-item]->type;
360 else
361 itemtype = 0;
362 dprintk(" item %d type %d\n", item, itemtype);
363
364 /* keep going? */
365 if (itemtype != type) {
366 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item];
369 continue;
370 }
371
372 /* collision? */
373 for (i = 0; i < outpos; i++) {
374 if (out[i] == item) {
375 collide = 1;
376 break;
377 }
378 }
379
380 if (recurse_to_leaf &&
381 item < 0 &&
382 crush_choose(map, map->buckets[-1-item],
383 weight,
384 x, outpos+1, 0,
385 out2, outpos,
386 firstn, 0, NULL) <= outpos) {
387 reject = 1;
388 } else {
389 /* out? */
390 if (itemtype == 0)
391 reject = is_out(map, weight,
392 item, x);
393 else
394 reject = 0;
395 }
396
397reject:
398 if (reject || collide) {
399 ftotal++;
400 flocal++;
401
402 if (collide && flocal < 3)
403 /* retry locally a few times */
404 retry_bucket = 1;
405 else if (flocal < in->size + orig_tries)
406 /* exhaustive bucket search */
407 retry_bucket = 1;
408 else if (ftotal < 20)
409 /* then retry descent */
410 retry_descent = 1;
411 else
412 /* else give up */
413 skip_rep = 1;
414 dprintk(" reject %d collide %d "
415 "ftotal %d flocal %d\n",
416 reject, collide, ftotal,
417 flocal);
418 }
419 } while (retry_bucket);
420 } while (retry_descent);
421
422 if (skip_rep) {
423 dprintk("skip rep\n");
424 continue;
425 }
426
427 dprintk("choose got %d\n", item);
428 out[outpos] = item;
429 outpos++;
430 }
431
432 dprintk("choose returns %d\n", outpos);
433 return outpos;
434}
435
436
437/**
438 * crush_do_rule - calculate a mapping with the given input and rule
439 * @map: the crush_map
440 * @ruleno: the rule id
441 * @x: hash input
442 * @result: pointer to result vector
443 * @result_max: maximum result size
444 * @force: force initial replica choice; -1 for none
445 */
446int crush_do_rule(struct crush_map *map,
447 int ruleno, int x, int *result, int result_max,
448 int force, __u32 *weight)
449{
450 int result_len;
451 int force_context[CRUSH_MAX_DEPTH];
452 int force_pos = -1;
453 int a[CRUSH_MAX_SET];
454 int b[CRUSH_MAX_SET];
455 int c[CRUSH_MAX_SET];
456 int recurse_to_leaf;
457 int *w;
458 int wsize = 0;
459 int *o;
460 int osize;
461 int *tmp;
462 struct crush_rule *rule;
463 int step;
464 int i, j;
465 int numrep;
466 int firstn;
467 int rc = -1;
468
469 BUG_ON(ruleno >= map->max_rules);
470
471 rule = map->rules[ruleno];
472 result_len = 0;
473 w = a;
474 o = b;
475
476 /*
477 * determine hierarchical context of force, if any. note
478 * that this may or may not correspond to the specific types
479 * referenced by the crush rule.
480 */
481 if (force >= 0) {
482 if (force >= map->max_devices ||
483 map->device_parents[force] == 0) {
484 /*dprintk("CRUSH: forcefed device dne\n");*/
485 rc = -1; /* force fed device dne */
486 goto out;
487 }
488 if (!is_out(map, weight, force, x)) {
489 while (1) {
490 force_context[++force_pos] = force;
491 if (force >= 0)
492 force = map->device_parents[force];
493 else
494 force = map->bucket_parents[-1-force];
495 if (force == 0)
496 break;
497 }
498 }
499 }
500
501 for (step = 0; step < rule->len; step++) {
502 firstn = 0;
503 switch (rule->steps[step].op) {
504 case CRUSH_RULE_TAKE:
505 w[0] = rule->steps[step].arg1;
506 if (force_pos >= 0) {
507 BUG_ON(force_context[force_pos] != w[0]);
508 force_pos--;
509 }
510 wsize = 1;
511 break;
512
513 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
514 case CRUSH_RULE_CHOOSE_FIRSTN:
515 firstn = 1;
516 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
517 case CRUSH_RULE_CHOOSE_INDEP:
518 BUG_ON(wsize == 0);
519
520 recurse_to_leaf =
521 rule->steps[step].op ==
522 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
523 rule->steps[step].op ==
524 CRUSH_RULE_CHOOSE_LEAF_INDEP;
525
526 /* reset output */
527 osize = 0;
528
529 for (i = 0; i < wsize; i++) {
530 /*
531				 * see CRUSH_CHOOSE_N, CRUSH_CHOOSE_N_MINUS macros.
532 * basically, numrep <= 0 means relative to
533 * the provided result_max
534 */
535 numrep = rule->steps[step].arg1;
536 if (numrep <= 0) {
537 numrep += result_max;
538 if (numrep <= 0)
539 continue;
540 }
541 j = 0;
542 if (osize == 0 && force_pos >= 0) {
543 /* skip any intermediate types */
544 while (force_pos &&
545 force_context[force_pos] < 0 &&
546 rule->steps[step].arg2 !=
547 map->buckets[-1 -
548 force_context[force_pos]]->type)
549 force_pos--;
550 o[osize] = force_context[force_pos];
551 if (recurse_to_leaf)
552 c[osize] = force_context[0];
553 j++;
554 force_pos--;
555 }
556 osize += crush_choose(map,
557 map->buckets[-1-w[i]],
558 weight,
559 x, numrep,
560 rule->steps[step].arg2,
561 o+osize, j,
562 firstn,
563 recurse_to_leaf, c+osize);
564 }
565
566 if (recurse_to_leaf)
567 /* copy final _leaf_ values to output set */
568 memcpy(o, c, osize*sizeof(*o));
569
570			/* swap o and w arrays */
571 tmp = o;
572 o = w;
573 w = tmp;
574 wsize = osize;
575 break;
576
577
578 case CRUSH_RULE_EMIT:
579 for (i = 0; i < wsize && result_len < result_max; i++) {
580 result[result_len] = w[i];
581 result_len++;
582 }
583 wsize = 0;
584 break;
585
586 default:
587 BUG_ON(1);
588 }
589 }
590 rc = result_len;
591
592out:
593 return rc;
594}
595
596
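
One detail worth highlighting from the code above: is_out() treats per-device weights as 16-bit fixed point, so 0x10000 means always in, 0 means always out, and intermediate weights reject a proportional share of inputs. A hedged sketch of that proportionality (illustrative only; is_out() is static to this file):

	/* Expect kept ~= nsamples * weight[item] / 0x10000. */
	static int devices_kept(struct crush_map *map, __u32 *weight,
				int item, int nsamples)
	{
		int x, kept = 0;

		for (x = 0; x < nsamples; x++)
			kept += !is_out(map, weight, item, x);
		return kept;
	}
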
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
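
The two entry points are meant to be used together: find the rule matching (ruleset, type, size), then run it against an input. A hedged sketch of the calling sequence (the wrapper is illustrative only):

	static int crush_map_input(struct crush_map *map, __u32 *weights,
				   int ruleset, int type, int size,
				   int x, int *result)
	{
		int ruleno = crush_find_rule(map, ruleset, type, size);

		if (ruleno < 0)
			return -1;		/* no matching rule */
		return crush_do_rule(map, ruleno, x, result, size,
				     -1 /* no forcefed item */, weights);
	}
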
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..f704b3b62424
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,409 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78const u8 *aes_iv = "cephsageyudagreg";
79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
81 const void *src, size_t src_len)
82{
83 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
85 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
86 int ret;
87 void *iv;
88 int ivsize;
89 size_t zero_padding = (0x10 - (src_len & 0x0f));
90 char pad[16];
91
92 if (IS_ERR(tfm))
93 return PTR_ERR(tfm);
94
95 memset(pad, zero_padding, zero_padding);
96
97 *dst_len = src_len + zero_padding;
98
99 crypto_blkcipher_setkey((void *)tfm, key, key_len);
100 sg_init_table(sg_in, 2);
101 sg_set_buf(&sg_in[0], src, src_len);
102 sg_set_buf(&sg_in[1], pad, zero_padding);
103 sg_init_table(sg_out, 1);
104 sg_set_buf(sg_out, dst, *dst_len);
105 iv = crypto_blkcipher_crt(tfm)->iv;
106 ivsize = crypto_blkcipher_ivsize(tfm);
107
108 memcpy(iv, aes_iv, ivsize);
109 /*
110 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
111 key, key_len, 1);
112 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
113 src, src_len, 1);
114 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
115 pad, zero_padding, 1);
116 */
117 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
118 src_len + zero_padding);
119 crypto_free_blkcipher(tfm);
120 if (ret < 0)
121		pr_err("ceph_aes_encrypt failed %d\n", ret);
122 /*
123 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
124 dst, *dst_len, 1);
125 */
126	return ret;
127}
128
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
130 const void *src1, size_t src1_len,
131 const void *src2, size_t src2_len)
132{
133 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
135 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
136 int ret;
137 void *iv;
138 int ivsize;
139 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
140 char pad[16];
141
142 if (IS_ERR(tfm))
143 return PTR_ERR(tfm);
144
145 memset(pad, zero_padding, zero_padding);
146
147 *dst_len = src1_len + src2_len + zero_padding;
148
149 crypto_blkcipher_setkey((void *)tfm, key, key_len);
150 sg_init_table(sg_in, 3);
151 sg_set_buf(&sg_in[0], src1, src1_len);
152 sg_set_buf(&sg_in[1], src2, src2_len);
153 sg_set_buf(&sg_in[2], pad, zero_padding);
154 sg_init_table(sg_out, 1);
155 sg_set_buf(sg_out, dst, *dst_len);
156 iv = crypto_blkcipher_crt(tfm)->iv;
157 ivsize = crypto_blkcipher_ivsize(tfm);
158
159 memcpy(iv, aes_iv, ivsize);
160 /*
161 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
162 key, key_len, 1);
163 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
164 src1, src1_len, 1);
165 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
166 src2, src2_len, 1);
167 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
168 pad, zero_padding, 1);
169 */
170 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
171 src1_len + src2_len + zero_padding);
172 crypto_free_blkcipher(tfm);
173 if (ret < 0)
174		pr_err("ceph_aes_encrypt2 failed %d\n", ret);
175 /*
176 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
177 dst, *dst_len, 1);
178 */
179	return ret;
180}
181
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
183 const void *src, size_t src_len)
184{
185 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
187 struct blkcipher_desc desc = { .tfm = tfm };
188 char pad[16];
189 void *iv;
190 int ivsize;
191 int ret;
192 int last_byte;
193
194 if (IS_ERR(tfm))
195 return PTR_ERR(tfm);
196
197 crypto_blkcipher_setkey((void *)tfm, key, key_len);
198 sg_init_table(sg_in, 1);
199 sg_init_table(sg_out, 2);
200 sg_set_buf(sg_in, src, src_len);
201 sg_set_buf(&sg_out[0], dst, *dst_len);
202 sg_set_buf(&sg_out[1], pad, sizeof(pad));
203
204 iv = crypto_blkcipher_crt(tfm)->iv;
205 ivsize = crypto_blkcipher_ivsize(tfm);
206
207 memcpy(iv, aes_iv, ivsize);
208
209 /*
210 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
211 key, key_len, 1);
212 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
213 src, src_len, 1);
214 */
215
216 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
217 crypto_free_blkcipher(tfm);
218 if (ret < 0) {
219 pr_err("ceph_aes_decrypt failed %d\n", ret);
220 return ret;
221 }
222
223 if (src_len <= *dst_len)
224 last_byte = ((char *)dst)[src_len - 1];
225 else
226 last_byte = pad[src_len - *dst_len - 1];
227 if (last_byte <= 16 && src_len >= last_byte) {
228 *dst_len = src_len - last_byte;
229 } else {
230 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
231 last_byte, (int)src_len);
232 return -EPERM; /* bad padding */
233 }
234 /*
235 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
236 dst, *dst_len, 1);
237 */
238 return 0;
239}
240
241int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len)
245{
246 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
248 struct blkcipher_desc desc = { .tfm = tfm };
249 char pad[16];
250 void *iv;
251 int ivsize;
252 int ret;
253 int last_byte;
254
255 if (IS_ERR(tfm))
256 return PTR_ERR(tfm);
257
258 sg_init_table(sg_in, 1);
259 sg_set_buf(sg_in, src, src_len);
260 sg_init_table(sg_out, 3);
261 sg_set_buf(&sg_out[0], dst1, *dst1_len);
262 sg_set_buf(&sg_out[1], dst2, *dst2_len);
263 sg_set_buf(&sg_out[2], pad, sizeof(pad));
264
265 crypto_blkcipher_setkey((void *)tfm, key, key_len);
266 iv = crypto_blkcipher_crt(tfm)->iv;
267 ivsize = crypto_blkcipher_ivsize(tfm);
268
269 memcpy(iv, aes_iv, ivsize);
270
271 /*
272 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
273 key, key_len, 1);
274 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
275 src, src_len, 1);
276 */
277
278 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
279 crypto_free_blkcipher(tfm);
280 if (ret < 0) {
281		pr_err("ceph_aes_decrypt2 failed %d\n", ret);
282 return ret;
283 }
284
285 if (src_len <= *dst1_len)
286 last_byte = ((char *)dst1)[src_len - 1];
287 else if (src_len <= *dst1_len + *dst2_len)
288 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
289 else
290 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
291 if (last_byte <= 16 && src_len >= last_byte) {
292 src_len -= last_byte;
293 } else {
294		pr_err("ceph_aes_decrypt2 got bad padding %d on src len %d\n",
295 last_byte, (int)src_len);
296 return -EPERM; /* bad padding */
297 }
298
299 if (src_len < *dst1_len) {
300 *dst1_len = src_len;
301 *dst2_len = 0;
302 } else {
303 *dst2_len = src_len - *dst1_len;
304 }
305 /*
306 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
307 dst1, *dst1_len, 1);
308 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
309 dst2, *dst2_len, 1);
310 */
311
312 return 0;
313}
314
315
316int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
317 const void *src, size_t src_len)
318{
319 switch (secret->type) {
320 case CEPH_CRYPTO_NONE:
321 if (*dst_len < src_len)
322 return -ERANGE;
323 memcpy(dst, src, src_len);
324 *dst_len = src_len;
325 return 0;
326
327 case CEPH_CRYPTO_AES:
328 return ceph_aes_decrypt(secret->key, secret->len, dst,
329 dst_len, src, src_len);
330
331 default:
332 return -EINVAL;
333 }
334}
335
336int ceph_decrypt2(struct ceph_crypto_key *secret,
337 void *dst1, size_t *dst1_len,
338 void *dst2, size_t *dst2_len,
339 const void *src, size_t src_len)
340{
341 size_t t;
342
343 switch (secret->type) {
344 case CEPH_CRYPTO_NONE:
345 if (*dst1_len + *dst2_len < src_len)
346 return -ERANGE;
347 t = min(*dst1_len, src_len);
348 memcpy(dst1, src, t);
349 *dst1_len = t;
350 src += t;
351 src_len -= t;
352 if (src_len) {
353 t = min(*dst2_len, src_len);
354 memcpy(dst2, src, t);
355 *dst2_len = t;
356 }
357 return 0;
358
359 case CEPH_CRYPTO_AES:
360 return ceph_aes_decrypt2(secret->key, secret->len,
361 dst1, dst1_len, dst2, dst2_len,
362 src, src_len);
363
364 default:
365 return -EINVAL;
366 }
367}
368
369int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
370 const void *src, size_t src_len)
371{
372 switch (secret->type) {
373 case CEPH_CRYPTO_NONE:
374 if (*dst_len < src_len)
375 return -ERANGE;
376 memcpy(dst, src, src_len);
377 *dst_len = src_len;
378 return 0;
379
380 case CEPH_CRYPTO_AES:
381 return ceph_aes_encrypt(secret->key, secret->len, dst,
382 dst_len, src, src_len);
383
384 default:
385 return -EINVAL;
386 }
387}
388
389int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
390 const void *src1, size_t src1_len,
391 const void *src2, size_t src2_len)
392{
393 switch (secret->type) {
394 case CEPH_CRYPTO_NONE:
395 if (*dst_len < src1_len + src2_len)
396 return -ERANGE;
397 memcpy(dst, src1, src1_len);
398 memcpy(dst + src1_len, src2, src2_len);
399 *dst_len = src1_len + src2_len;
400 return 0;
401
402 case CEPH_CRYPTO_AES:
403 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
404 src1, src1_len, src2, src2_len);
405
406 default:
407 return -EINVAL;
408 }
409}
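
ceph_encrypt() and ceph_decrypt() dispatch on the key type: CEPH_CRYPTO_NONE degenerates to a bounds-checked copy, while CEPH_CRYPTO_AES pads the plaintext to a 16-byte multiple and strips that padding again on decrypt. A hedged round-trip sketch (illustrative only; the destination buffers must allow for up to 16 bytes of AES padding):

	static int ceph_crypto_roundtrip(struct ceph_crypto_key *secret,
					 const void *msg, size_t msg_len)
	{
		char enc[64], dec[64];
		size_t enc_len = sizeof(enc), dec_len = sizeof(dec);
		int ret;

		if (msg_len + 16 > sizeof(enc))
			return -ERANGE;
		ret = ceph_encrypt(secret, enc, &enc_len, msg, msg_len);
		if (ret)
			return ret;
		return ceph_decrypt(secret, dec, &dec_len, enc, enc_len);
	}
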
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..f7048da92acc
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,484 @@
1#include "ceph_debug.h"
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53
54static int mdsmap_show(struct seq_file *s, void *p)
55{
56 int i;
57 struct ceph_client *client = s->private;
58
59 if (client->mdsc.mdsmap == NULL)
60 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state;
71
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state));
74 }
75 return 0;
76}
77
78static int osdmap_show(struct seq_file *s, void *p)
79{
80 int i;
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node);
131 seq_printf(s, "%lld statfs\n", req->tid);
132 }
133
134 mutex_unlock(&monc->mutex);
135 return 0;
136}
137
138static int mdsc_show(struct seq_file *s, void *p)
139{
140 struct ceph_client *client = s->private;
141 struct ceph_mds_client *mdsc = &client->mdsc;
142 struct ceph_mds_request *req;
143 struct rb_node *rp;
144 int pathlen;
145 u64 pathbase;
146 char *path;
147
148 mutex_lock(&mdsc->mutex);
149 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
150 req = rb_entry(rp, struct ceph_mds_request, r_node);
151
152 if (req->r_request)
153 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
154 else
155 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
156
157 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
158
159 if (req->r_got_unsafe)
160 seq_printf(s, "\t(unsafe)");
161 else
162 seq_printf(s, "\t");
163
164 if (req->r_inode) {
165 seq_printf(s, " #%llx", ceph_ino(req->r_inode));
166 } else if (req->r_dentry) {
167 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
168 &pathbase, 0);
169 spin_lock(&req->r_dentry->d_lock);
170 seq_printf(s, " #%llx/%.*s (%s)",
171 ceph_ino(req->r_dentry->d_parent->d_inode),
172 req->r_dentry->d_name.len,
173 req->r_dentry->d_name.name,
174 path ? path : "");
175 spin_unlock(&req->r_dentry->d_lock);
176 kfree(path);
177 } else if (req->r_path1) {
178 seq_printf(s, " #%llx/%s", req->r_ino1.ino,
179 req->r_path1);
180 }
181
182 if (req->r_old_dentry) {
183 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
184 &pathbase, 0);
185 spin_lock(&req->r_old_dentry->d_lock);
186 seq_printf(s, " #%llx/%.*s (%s)",
187 ceph_ino(req->r_old_dentry->d_parent->d_inode),
188 req->r_old_dentry->d_name.len,
189 req->r_old_dentry->d_name.name,
190 path ? path : "");
191 spin_unlock(&req->r_old_dentry->d_lock);
192 kfree(path);
193 } else if (req->r_path2) {
194 if (req->r_ino2.ino)
195 seq_printf(s, " #%llx/%s", req->r_ino2.ino,
196 req->r_path2);
197 else
198 seq_printf(s, " %s", req->r_path2);
199 }
200
201 seq_printf(s, "\n");
202 }
203 mutex_unlock(&mdsc->mutex);
204
205 return 0;
206}
207
208static int osdc_show(struct seq_file *s, void *pp)
209{
210 struct ceph_client *client = s->private;
211 struct ceph_osd_client *osdc = &client->osdc;
212 struct rb_node *p;
213
214 mutex_lock(&osdc->request_mutex);
215 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
216 struct ceph_osd_request *req;
217 struct ceph_osd_request_head *head;
218 struct ceph_osd_op *op;
219 int num_ops;
220 int opcode, olen;
221 int i;
222
223 req = rb_entry(p, struct ceph_osd_request, r_node);
224
225 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
226 req->r_osd ? req->r_osd->o_osd : -1,
227 le32_to_cpu(req->r_pgid.pool),
228 le16_to_cpu(req->r_pgid.ps));
229
230 head = req->r_request->front.iov_base;
231 op = (void *)(head + 1);
232
233 num_ops = le16_to_cpu(head->num_ops);
234 olen = le32_to_cpu(head->object_len);
235 seq_printf(s, "%.*s", olen,
236 (const char *)(head->ops + num_ops));
237
238 if (req->r_reassert_version.epoch)
239 seq_printf(s, "\t%u'%llu",
240 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
241 le64_to_cpu(req->r_reassert_version.version));
242 else
243 seq_printf(s, "\t");
244
245 for (i = 0; i < num_ops; i++) {
246 opcode = le16_to_cpu(op->op);
247 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
248 op++;
249 }
250
251 seq_printf(s, "\n");
252 }
253 mutex_unlock(&osdc->request_mutex);
254 return 0;
255}
256
257static int caps_show(struct seq_file *s, void *p)
258{
259	struct ceph_client *client = s->private;
260 int total, avail, used, reserved, min;
261
262 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
263 seq_printf(s, "total\t\t%d\n"
264 "avail\t\t%d\n"
265 "used\t\t%d\n"
266 "reserved\t%d\n"
267		   "min\t\t%d\n",
268 total, avail, used, reserved, min);
269 return 0;
270}
271
272static int dentry_lru_show(struct seq_file *s, void *ptr)
273{
274 struct ceph_client *client = s->private;
275 struct ceph_mds_client *mdsc = &client->mdsc;
276 struct ceph_dentry_info *di;
277
278 spin_lock(&mdsc->dentry_lru_lock);
279 list_for_each_entry(di, &mdsc->dentry_lru, lru) {
280 struct dentry *dentry = di->dentry;
281 seq_printf(s, "%p %p\t%.*s\n",
282 di, dentry, dentry->d_name.len, dentry->d_name.name);
283 }
284 spin_unlock(&mdsc->dentry_lru_lock);
285
286 return 0;
287}
288
289#define DEFINE_SHOW_FUNC(name) \
290static int name##_open(struct inode *inode, struct file *file) \
291{ \
292 struct seq_file *sf; \
293 int ret; \
294 \
295 ret = single_open(file, name, NULL); \
296 sf = file->private_data; \
297 sf->private = inode->i_private; \
298 return ret; \
299} \
300 \
301static const struct file_operations name##_fops = { \
302 .open = name##_open, \
303 .read = seq_read, \
304 .llseek = seq_lseek, \
305 .release = single_release, \
306};
307
308DEFINE_SHOW_FUNC(monmap_show)
309DEFINE_SHOW_FUNC(mdsmap_show)
310DEFINE_SHOW_FUNC(osdmap_show)
311DEFINE_SHOW_FUNC(monc_show)
312DEFINE_SHOW_FUNC(mdsc_show)
313DEFINE_SHOW_FUNC(osdc_show)
314DEFINE_SHOW_FUNC(dentry_lru_show)
315DEFINE_SHOW_FUNC(caps_show)
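/*
 * A sketch of how a further read-only entry would be added with the
 * same pattern ("foo_show" and the "foo" file name are hypothetical):
 *
 *	static int foo_show(struct seq_file *s, void *p)
 *	{
 *		struct ceph_client *client = s->private;
 *
 *		seq_printf(s, "fsid " FSID_FORMAT "\n",
 *			   PR_FSID(&client->fsid));
 *		return 0;
 *	}
 *
 *	DEFINE_SHOW_FUNC(foo_show)
 *
 * ...followed by a debugfs_create_file("foo", 0400, client->debugfs_dir,
 * client, &foo_show_fops) call in ceph_debugfs_client_init() below.
 */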
316
317static int congestion_kb_set(void *data, u64 val)
318{
319 struct ceph_client *client = (struct ceph_client *)data;
320
321 if (client)
322 client->mount_args->congestion_kb = (int)val;
323
324 return 0;
325}
326
327static int congestion_kb_get(void *data, u64 *val)
328{
329 struct ceph_client *client = (struct ceph_client *)data;
330
331 if (client)
332 *val = (u64)client->mount_args->congestion_kb;
333
334 return 0;
335}
336
337
338DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
339 congestion_kb_set, "%llu\n");
340
341int __init ceph_debugfs_init(void)
342{
343 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
344 if (!ceph_debugfs_dir)
345 return -ENOMEM;
346 return 0;
347}
348
349void ceph_debugfs_cleanup(void)
350{
351 debugfs_remove(ceph_debugfs_dir);
352}
353
354int ceph_debugfs_client_init(struct ceph_client *client)
355{
356	int ret = -ENOMEM;
357 char name[80];
358
359 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
360 PR_FSID(&client->fsid), client->monc.auth->global_id);
361
362 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
363 if (!client->debugfs_dir)
364 goto out;
365
366 client->monc.debugfs_file = debugfs_create_file("monc",
367 0600,
368 client->debugfs_dir,
369 client,
370 &monc_show_fops);
371 if (!client->monc.debugfs_file)
372 goto out;
373
374 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
375 0600,
376 client->debugfs_dir,
377 client,
378 &mdsc_show_fops);
379 if (!client->mdsc.debugfs_file)
380 goto out;
381
382 client->osdc.debugfs_file = debugfs_create_file("osdc",
383 0600,
384 client->debugfs_dir,
385 client,
386 &osdc_show_fops);
387 if (!client->osdc.debugfs_file)
388 goto out;
389
390 client->debugfs_monmap = debugfs_create_file("monmap",
391 0600,
392 client->debugfs_dir,
393 client,
394 &monmap_show_fops);
395 if (!client->debugfs_monmap)
396 goto out;
397
398 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
399 0600,
400 client->debugfs_dir,
401 client,
402 &mdsmap_show_fops);
403 if (!client->debugfs_mdsmap)
404 goto out;
405
406 client->debugfs_osdmap = debugfs_create_file("osdmap",
407 0600,
408 client->debugfs_dir,
409 client,
410 &osdmap_show_fops);
411 if (!client->debugfs_osdmap)
412 goto out;
413
414 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
415 0600,
416 client->debugfs_dir,
417 client,
418 &dentry_lru_show_fops);
419 if (!client->debugfs_dentry_lru)
420 goto out;
421
422 client->debugfs_caps = debugfs_create_file("caps",
423 0400,
424 client->debugfs_dir,
425 client,
426 &caps_show_fops);
427 if (!client->debugfs_caps)
428 goto out;
429
430 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
431 0600,
432 client->debugfs_dir,
433 client,
434 &congestion_kb_fops);
435 if (!client->debugfs_congestion_kb)
436 goto out;
437
438 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
439 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
440 name);
441
442 return 0;
443
444out:
445 ceph_debugfs_client_cleanup(client);
446 return ret;
447}
448
449void ceph_debugfs_client_cleanup(struct ceph_client *client)
450{
451 debugfs_remove(client->debugfs_bdi);
452 debugfs_remove(client->debugfs_caps);
453 debugfs_remove(client->debugfs_dentry_lru);
454 debugfs_remove(client->debugfs_osdmap);
455 debugfs_remove(client->debugfs_mdsmap);
456 debugfs_remove(client->debugfs_monmap);
457 debugfs_remove(client->osdc.debugfs_file);
458 debugfs_remove(client->mdsc.debugfs_file);
459 debugfs_remove(client->monc.debugfs_file);
460 debugfs_remove(client->debugfs_congestion_kb);
461 debugfs_remove(client->debugfs_dir);
462}
463
464#else  /* CONFIG_DEBUG_FS */
465
466int __init ceph_debugfs_init(void)
467{
468 return 0;
469}
470
471void ceph_debugfs_cleanup(void)
472{
473}
474
475int ceph_debugfs_client_init(struct ceph_client *client)
476{
477 return 0;
478}
479
480void ceph_debugfs_client_cleanup(struct ceph_client *client)
481{
482}
483
484#endif  /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
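/*
 * A minimal decoding sketch (hypothetical wire layout: a __le32 count
 * followed by that many __le64 values; "buf" and "len" are assumed to
 * describe a received buffer):
 *
 *	void *p = buf, *end = buf + len;
 *	u32 n;
 *	u64 v;
 *
 *	ceph_decode_32_safe(&p, end, n, bad);
 *	while (n--) {
 *		ceph_decode_64_safe(&p, end, v, bad);
 *		... use v ...
 *	}
 *	return 0;
 * bad:
 *	return -EINVAL;		(truncated buffer)
 */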
80
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family);
103}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
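	/* 512 is AF_INET (2) byte-swapped: catches an address that was
	 * already in host order, i.e. decoded twice or never encoded */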
107 WARN_ON(a->in_addr.ss_family == 512);
108}
109
110/*
111 * encoders
112 */
113static inline void ceph_encode_64(void **p, u64 v)
114{
115 put_unaligned_le64(v, (__le64 *)*p);
116 *p += sizeof(u64);
117}
118static inline void ceph_encode_32(void **p, u32 v)
119{
120 put_unaligned_le32(v, (__le32 *)*p);
121 *p += sizeof(u32);
122}
123static inline void ceph_encode_16(void **p, u16 v)
124{
125 put_unaligned_le16(v, (__le16 *)*p);
126 *p += sizeof(u16);
127}
128static inline void ceph_encode_8(void **p, u8 v)
129{
130 *(u8 *)*p = v;
131 (*p)++;
132}
133static inline void ceph_encode_copy(void **p, const void *s, int len)
134{
135 memcpy(*p, s, len);
136 *p += len;
137}
138
139/*
140 * filepath, string encoders
141 */
142static inline void ceph_encode_filepath(void **p, void *end,
143 u64 ino, const char *path)
144{
145 u32 len = path ? strlen(path) : 0;
146 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
147 ceph_encode_8(p, 1);
148 ceph_encode_64(p, ino);
149 ceph_encode_32(p, len);
150 if (len)
151 memcpy(*p, path, len);
152 *p += len;
153}
154
155static inline void ceph_encode_string(void **p, void *end,
156 const char *s, u32 len)
157{
158 BUG_ON(*p + sizeof(len) + len > end);
159 ceph_encode_32(p, len);
160 if (len)
161 memcpy(*p, s, len);
162 *p += len;
163}
164
165#define ceph_encode_need(p, end, n, bad) \
166 do { \
167 if (unlikely(*(p) + (n) > (end))) \
168 goto bad; \
169 } while (0)
170
171#define ceph_encode_64_safe(p, end, v, bad) \
172 do { \
173 ceph_encode_need(p, end, sizeof(u64), bad); \
174 ceph_encode_64(p, v); \
175 } while (0)
176#define ceph_encode_32_safe(p, end, v, bad) \
177 do { \
178 ceph_encode_need(p, end, sizeof(u32), bad); \
179 ceph_encode_32(p, v); \
180 } while (0)
181#define ceph_encode_16_safe(p, end, v, bad) \
182 do { \
183 ceph_encode_need(p, end, sizeof(u16), bad); \
184 ceph_encode_16(p, v); \
185 } while (0)
186
187#define ceph_encode_copy_safe(p, end, pv, n, bad) \
188 do { \
189 ceph_encode_need(p, end, n, bad); \
190 ceph_encode_copy(p, pv, n); \
191 } while (0)
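/*
 * A matching encode sketch (same hypothetical layout as the decode
 * example earlier in this file; "buf", "len", "n" and "vals" assumed):
 *
 *	void *p = buf, *end = buf + len;
 *	int i;
 *
 *	ceph_encode_32_safe(&p, end, n, bad);
 *	for (i = 0; i < n; i++)
 *		ceph_encode_64_safe(&p, end, vals[i], bad);
 *	return p - buf;		(bytes written)
 * bad:
 *	return -ERANGE;		(buffer too small)
 */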
192
193
194#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..ea8ee2e526aa
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1224 @@
1#include "ceph_debug.h"
2
3#include <linux/spinlock.h>
4#include <linux/fs_struct.h>
5#include <linux/namei.h>
6#include <linux/slab.h>
7#include <linux/sched.h>
8
9#include "super.h"
10
11/*
12 * Directory operations: readdir, lookup, create, link, unlink,
13 * rename, etc.
14 */
15
16/*
17 * Ceph MDS operations are specified in terms of a base ino and
18 * relative path. Thus, the client can specify an operation on a
19 * specific inode (e.g., a getattr due to fstat(2)), or as a path
20 * relative to, say, the root directory.
21 *
22 * Normally, we limit ourselves to strict inode ops (no path component)
23 * or dentry operations (a single path component relative to an ino). The
24 * exception to this is open_root_dentry(), which will open the mount
25 * point by name.
26 */
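/*
 * As a sketch, the two shapes look like this on a request (the field
 * names are the real ones used throughout this file; the surrounding
 * context is hypothetical):
 *
 *	(strict inode op, e.g. getattr:)
 *	req->r_inode = igrab(inode);
 *
 *	(dentry op, one component relative to its parent directory:)
 *	req->r_dentry = dget(dentry);
 *	req->r_locked_dir = dir;
 */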
27
28const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops;
31
32/*
33 * Initialize ceph dentry state.
34 */
35int ceph_init_dentry(struct dentry *dentry)
36{
37 struct ceph_dentry_info *di;
38
39 if (dentry->d_fsdata)
40 return 0;
41
42 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
43 dentry->d_op = &ceph_dentry_ops;
44 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
45 dentry->d_op = &ceph_snapdir_dentry_ops;
46 else
47 dentry->d_op = &ceph_snap_dentry_ops;
48
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
50 if (!di)
51 return -ENOMEM; /* oh well */
52
53 spin_lock(&dentry->d_lock);
54	if (dentry->d_fsdata) {		/* lost a race; free our copy */
		kmem_cache_free(ceph_dentry_cachep, di);
55		goto out_unlock;
	}
56 di->dentry = dentry;
57 di->lease_session = NULL;
58 dentry->d_fsdata = di;
59 dentry->d_time = jiffies;
60 ceph_dentry_lru_add(dentry);
61out_unlock:
62 spin_unlock(&dentry->d_lock);
63 return 0;
64}
65
66
67
68/*
69 * for readdir, we encode the directory frag and offset within that
70 * frag into f_pos.
71 */
72static unsigned fpos_frag(loff_t p)
73{
74 return p >> 32;
75}
76static unsigned fpos_off(loff_t p)
77{
78 return p & 0xffffffff;
79}
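/*
 * ceph_make_fpos(), used below and defined elsewhere (presumably in
 * super.h, which this file includes), composes the inverse of the two
 * helpers above; a sketch consistent with them:
 *
 *	static inline loff_t make_fpos(unsigned frag, unsigned off)
 *	{
 *		return ((loff_t)frag << 32) | (loff_t)off;
 *	}
 */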
80
81/*
82 * When possible, we try to satisfy a readdir by peeking at the
83 * dcache. We make this work by carefully ordering dentries on
84 * d_u.d_child when we initially get results back from the MDS, and
85 * falling back to a "normal" sync readdir if any dentries in the dir
86 * are dropped.
87 *
88 * I_COMPLETE indicates we have all dentries in the dir. It is
89 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
90 * the MDS if/when the directory is modified).
91 */
92static int __dcache_readdir(struct file *filp,
93 void *dirent, filldir_t filldir)
94{
95 struct inode *inode = filp->f_dentry->d_inode;
96 struct ceph_file_info *fi = filp->private_data;
97 struct dentry *parent = filp->f_dentry;
98 struct inode *dir = parent->d_inode;
99 struct list_head *p;
100 struct dentry *dentry, *last;
101 struct ceph_dentry_info *di;
102 int err = 0;
103
104 /* claim ref on last dentry we returned */
105 last = fi->dentry;
106 fi->dentry = NULL;
107
108 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
109 last);
110
111 spin_lock(&dcache_lock);
112
113 /* start at beginning? */
114 if (filp->f_pos == 2 || (last &&
115 filp->f_pos < ceph_dentry(last)->offset)) {
116 if (list_empty(&parent->d_subdirs))
117 goto out_unlock;
118 p = parent->d_subdirs.prev;
119 dout(" initial p %p/%p\n", p->prev, p->next);
120 } else {
121 p = last->d_u.d_child.prev;
122 }
123
124more:
125 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry);
127 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
129 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) {
131 fi->at_end = 1;
132 goto out_unlock;
133 }
134 if (!d_unhashed(dentry) && dentry->d_inode &&
135 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
136 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
137 filp->f_pos <= di->offset)
138 break;
139 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
140 dentry->d_name.len, dentry->d_name.name, di->offset,
141 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
142 !dentry->d_inode ? " null" : "");
143 p = p->prev;
144 dentry = list_entry(p, struct dentry, d_u.d_child);
145 di = ceph_dentry(dentry);
146 }
147
148 atomic_inc(&dentry->d_count);
149 spin_unlock(&dcache_lock);
150 spin_unlock(&inode->i_lock);
151
152 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
153 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
154 filp->f_pos = di->offset;
155 err = filldir(dirent, dentry->d_name.name,
156 dentry->d_name.len, di->offset,
157 dentry->d_inode->i_ino,
158 dentry->d_inode->i_mode >> 12);
159
160 if (last) {
161 if (err < 0) {
162 /* remember our position */
163 fi->dentry = last;
164 fi->next_offset = di->offset;
165 } else {
166 dput(last);
167 }
168 last = NULL;
169 }
170
171 spin_lock(&inode->i_lock);
172 spin_lock(&dcache_lock);
173
174 last = dentry;
175
176 if (err < 0)
177 goto out_unlock;
178
179 p = p->prev;
180 filp->f_pos++;
181
182 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
183 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
184 goto more;
185 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
186 err = -EAGAIN;
187
188out_unlock:
189 spin_unlock(&dcache_lock);
190
191 if (last) {
192 spin_unlock(&inode->i_lock);
193 dput(last);
194 spin_lock(&inode->i_lock);
195 }
196
197 return err;
198}
199
200/*
201 * make note of the last dentry we read, so we can
202 * continue at the same lexicographical point,
203 * regardless of what dir changes take place on the
204 * server.
205 */
206static int note_last_dentry(struct ceph_file_info *fi, const char *name,
207 int len)
208{
209 kfree(fi->last_name);
210 fi->last_name = kmalloc(len+1, GFP_NOFS);
211 if (!fi->last_name)
212 return -ENOMEM;
213 memcpy(fi->last_name, name, len);
214 fi->last_name[len] = 0;
215 dout("note_last_dentry '%s'\n", fi->last_name);
216 return 0;
217}
218
219static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
220{
221 struct ceph_file_info *fi = filp->private_data;
222 struct inode *inode = filp->f_dentry->d_inode;
223 struct ceph_inode_info *ci = ceph_inode(inode);
224 struct ceph_client *client = ceph_inode_to_client(inode);
225 struct ceph_mds_client *mdsc = &client->mdsc;
226 unsigned frag = fpos_frag(filp->f_pos);
227 int off = fpos_off(filp->f_pos);
228 int err;
229 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir;
232
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end)
235 return 0;
236
237 /* always start with . and .. */
238 if (filp->f_pos == 0) {
239 /* note dir version at start of readdir so we can tell
240 * if any dentries get dropped */
241 fi->dir_release_count = ci->i_release_count;
242
243 dout("readdir off 0 -> '.'\n");
244 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
245 inode->i_ino, inode->i_mode >> 12) < 0)
246 return 0;
247 filp->f_pos = 1;
248 off = 1;
249 }
250 if (filp->f_pos == 1) {
251 dout("readdir off 1 -> '..'\n");
252 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
253 filp->f_dentry->d_parent->d_inode->i_ino,
254 inode->i_mode >> 12) < 0)
255 return 0;
256 filp->f_pos = 2;
257 off = 2;
258 }
259
260 /* can we use the dcache? */
261 spin_lock(&inode->i_lock);
262 if ((filp->f_pos == 2 || fi->dentry) &&
263 !ceph_test_opt(client, NOASYNCREADDIR) &&
264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 err = __dcache_readdir(filp, dirent, filldir);
267 if (err != -EAGAIN) {
268 spin_unlock(&inode->i_lock);
269 return err;
270 }
271 }
272 spin_unlock(&inode->i_lock);
273 if (fi->dentry) {
274 err = note_last_dentry(fi, fi->dentry->d_name.name,
275 fi->dentry->d_name.len);
276 if (err)
277 return err;
278 dput(fi->dentry);
279 fi->dentry = NULL;
280 }
281
282 /* proceed with a normal readdir */
283
284more:
285 /* do we have the correct frag content buffered? */
286 if (fi->frag != frag || fi->last_readdir == NULL) {
287 struct ceph_mds_request *req;
288 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
289 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
290
291 /* discard old result, if any */
292 if (fi->last_readdir) {
293 ceph_mdsc_put_request(fi->last_readdir);
294 fi->last_readdir = NULL;
295 }
296
297 /* requery frag tree, as the frag topology may have changed */
298 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
299
300 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
301 ceph_vinop(inode), frag, fi->last_name);
302 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
303 if (IS_ERR(req))
304 return PTR_ERR(req);
305 req->r_inode = igrab(inode);
306 req->r_dentry = dget(filp->f_dentry);
307 /* hints to request -> mds selection code */
308 req->r_direct_mode = USE_AUTH_MDS;
309 req->r_direct_hash = ceph_frag_value(frag);
310 req->r_direct_is_hash = true;
311 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
312 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
315 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) {
318 ceph_mdsc_put_request(req);
319 return err;
320 }
321 dout("readdir got and parsed readdir result=%d"
322 " on frag %x, end=%d, complete=%d\n", err, frag,
323 (int)req->r_reply_info.dir_end,
324 (int)req->r_reply_info.dir_complete);
325
326 if (!req->r_did_prepopulate) {
327			dout("readdir !did_prepopulate\n");
328 fi->dir_release_count--; /* preclude I_COMPLETE */
329 }
330
331 /* note next offset and last dentry name */
332 fi->offset = fi->next_offset;
333 fi->last_readdir = req;
334
335 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name);
337 fi->last_name = NULL;
338 fi->next_offset = 0;
339 } else {
340 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi,
342 rinfo->dir_dname[rinfo->dir_nr-1],
343 rinfo->dir_dname_len[rinfo->dir_nr-1]);
344 if (err)
345 return err;
346 fi->next_offset += rinfo->dir_nr;
347 }
348 }
349
350 rinfo = &fi->last_readdir->r_reply_info;
351 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
352 rinfo->dir_nr, off, fi->offset);
353 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
354 u64 pos = ceph_make_fpos(frag, off);
355 struct ceph_mds_reply_inode *in =
356 rinfo->dir_in[off - fi->offset].in;
357 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
358 off, off - fi->offset, rinfo->dir_nr, pos,
359 rinfo->dir_dname_len[off - fi->offset],
360 rinfo->dir_dname[off - fi->offset], in);
361 BUG_ON(!in);
362 ftype = le32_to_cpu(in->mode) >> 12;
363 if (filldir(dirent,
364 rinfo->dir_dname[off - fi->offset],
365 rinfo->dir_dname_len[off - fi->offset],
366 pos,
367 le64_to_cpu(in->ino),
368 ftype) < 0) {
369 dout("filldir stopping us...\n");
370 return 0;
371 }
372 off++;
373 filp->f_pos = pos + 1;
374 }
375
376 if (fi->last_name) {
377 ceph_mdsc_put_request(fi->last_readdir);
378 fi->last_readdir = NULL;
379 goto more;
380 }
381
382 /* more frags? */
383 if (!ceph_frag_is_rightmost(frag)) {
384 frag = ceph_frag_next(frag);
385 off = 0;
386 filp->f_pos = ceph_make_fpos(frag, off);
387 dout("readdir next frag is %x\n", frag);
388 goto more;
389 }
390 fi->at_end = 1;
391
392 /*
393 * if dir_release_count still matches the dir, no dentries
394 * were released during the whole readdir, and we should have
395 * the complete dir contents in our cache.
396 */
397 spin_lock(&inode->i_lock);
398 if (ci->i_release_count == fi->dir_release_count) {
399 dout(" marking %p complete\n", inode);
400 ci->i_ceph_flags |= CEPH_I_COMPLETE;
401 ci->i_max_offset = filp->f_pos;
402 }
403 spin_unlock(&inode->i_lock);
404
405 dout("readdir %p filp %p done.\n", inode, filp);
406 return 0;
407}
408
409static void reset_readdir(struct ceph_file_info *fi)
410{
411 if (fi->last_readdir) {
412 ceph_mdsc_put_request(fi->last_readdir);
413 fi->last_readdir = NULL;
414 }
415 kfree(fi->last_name);
416 fi->next_offset = 2; /* compensate for . and .. */
417 if (fi->dentry) {
418 dput(fi->dentry);
419 fi->dentry = NULL;
420 }
421 fi->at_end = 0;
422}
423
424static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
425{
426 struct ceph_file_info *fi = file->private_data;
427 struct inode *inode = file->f_mapping->host;
428	loff_t old_offset = file->f_pos;	/* position before this seek */
429 loff_t retval;
430
431 mutex_lock(&inode->i_mutex);
432 switch (origin) {
433 case SEEK_END:
434 offset += inode->i_size + 2; /* FIXME */
435 break;
436 case SEEK_CUR:
437 offset += file->f_pos;
438 }
439 retval = -EINVAL;
440 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
441 if (offset != file->f_pos) {
442 file->f_pos = offset;
443 file->f_version = 0;
444 fi->at_end = 0;
445 }
446 retval = offset;
447
448 /*
449 * discard buffered readdir content on seekdir(0), or
450 * seek to new frag, or seek prior to current chunk.
451 */
452 if (offset == 0 ||
453 fpos_frag(offset) != fpos_frag(old_offset) ||
454 fpos_off(offset) < fi->offset) {
455 dout("dir_llseek dropping %p content\n", file);
456 reset_readdir(fi);
457 }
458
459 /* bump dir_release_count if we did a forward seek */
460 if (offset > old_offset)
461 fi->dir_release_count--;
462 }
463 mutex_unlock(&inode->i_mutex);
464 return retval;
465}
466
467/*
468 * Process result of a lookup/open request.
469 *
470 * Mainly, make sure we return the final req->r_dentry (if it already
471 * existed) in place of the original VFS-provided dentry when they
472 * differ.
473 *
474 * Gracefully handle the case where the MDS replies with -ENOENT and
475 * no trace (which it may do, at its discretion, e.g., if it doesn't
476 * care to issue a lease on the negative dentry).
477 */
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err)
480{
481 struct ceph_client *client = ceph_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode;
483
484 /* .snap dir? */
485 if (err == -ENOENT &&
486 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
487 strcmp(dentry->d_name.name,
488 client->mount_args->snapdir_name) == 0) {
489 struct inode *inode = ceph_get_snapdir(parent);
490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
492 BUG_ON(!d_unhashed(dentry));
493 d_add(dentry, inode);
494 err = 0;
495 }
496
497 if (err == -ENOENT) {
498 /* no trace? */
499 err = 0;
500 if (!req->r_reply_info.head->is_dentry) {
501 dout("ENOENT and no trace, dentry %p inode %p\n",
502 dentry, dentry->d_inode);
503 if (dentry->d_inode) {
504 d_drop(dentry);
505 err = -ENOENT;
506 } else {
507 d_add(dentry, NULL);
508 }
509 }
510 }
511 if (err)
512 dentry = ERR_PTR(err);
513 else if (dentry != req->r_dentry)
514 dentry = dget(req->r_dentry); /* we got spliced */
515 else
516 dentry = NULL;
517 return dentry;
518}
519
520static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
521{
522 return ceph_ino(inode) == CEPH_INO_ROOT &&
523 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
524}
525
526/*
527 * Look up a single dir entry. If there is a lookup intent, inform
528 * the MDS so that it gets our 'caps wanted' value in a single op.
529 */
530static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
531 struct nameidata *nd)
532{
533 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
534 struct ceph_mds_client *mdsc = &client->mdsc;
535 struct ceph_mds_request *req;
536 int op;
537 int err;
538
539 dout("lookup %p dentry %p '%.*s'\n",
540 dir, dentry, dentry->d_name.len, dentry->d_name.name);
541
542 if (dentry->d_name.len > NAME_MAX)
543 return ERR_PTR(-ENAMETOOLONG);
544
545 err = ceph_init_dentry(dentry);
546 if (err < 0)
547 return ERR_PTR(err);
548
549 /* open (but not create!) intent? */
550 if (nd &&
551 (nd->flags & LOOKUP_OPEN) &&
552 (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
553 !(nd->intent.open.flags & O_CREAT)) {
554 int mode = nd->intent.open.create_mode & ~current->fs->umask;
555 return ceph_lookup_open(dir, dentry, nd, mode, 1);
556 }
557
558 /* can we conclude ENOENT locally? */
559 if (dentry->d_inode == NULL) {
560 struct ceph_inode_info *ci = ceph_inode(dir);
561 struct ceph_dentry_info *di = ceph_dentry(dentry);
562
563 spin_lock(&dir->i_lock);
564 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
565 if (strncmp(dentry->d_name.name,
566 client->mount_args->snapdir_name,
567 dentry->d_name.len) &&
568 !is_root_ceph_dentry(dir, dentry) &&
569 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
570 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
571 di->offset = ci->i_max_offset++;
572 spin_unlock(&dir->i_lock);
573 dout(" dir %p complete, -ENOENT\n", dir);
574 d_add(dentry, NULL);
575 di->lease_shared_gen = ci->i_shared_gen;
576 return NULL;
577 }
578 spin_unlock(&dir->i_lock);
579 }
580
581 op = ceph_snap(dir) == CEPH_SNAPDIR ?
582 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
583 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
584 if (IS_ERR(req))
585		return ERR_CAST(req);
586 req->r_dentry = dget(dentry);
587 req->r_num_caps = 2;
588 /* we only need inode linkage */
589 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
590 req->r_locked_dir = dir;
591 err = ceph_mdsc_do_request(mdsc, NULL, req);
592 dentry = ceph_finish_lookup(req, dentry, err);
593 ceph_mdsc_put_request(req); /* will dput(dentry) */
594 dout("lookup result=%p\n", dentry);
595 return dentry;
596}
597
598/*
599 * If we do a create but get no trace back from the MDS, follow up with
600 * a lookup (the VFS expects us to link up the provided dentry).
601 */
602int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
603{
604 struct dentry *result = ceph_lookup(dir, dentry, NULL);
605
606 if (result && !IS_ERR(result)) {
607 /*
608 * We created the item, then did a lookup, and found
609 * it was already linked to another inode we already
610 * had in our cache (and thus got spliced). Link our
611 * dentry to that inode, but don't hash it, just in
612 * case the VFS wants to dereference it.
613 */
614 BUG_ON(!result->d_inode);
615 d_instantiate(dentry, result->d_inode);
616 return 0;
617 }
618 return PTR_ERR(result);
619}
620
621static int ceph_mknod(struct inode *dir, struct dentry *dentry,
622 int mode, dev_t rdev)
623{
624 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
625 struct ceph_mds_client *mdsc = &client->mdsc;
626 struct ceph_mds_request *req;
627 int err;
628
629 if (ceph_snap(dir) != CEPH_NOSNAP)
630 return -EROFS;
631
632 dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
633 dir, dentry, mode, rdev);
634 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
635 if (IS_ERR(req)) {
636 d_drop(dentry);
637 return PTR_ERR(req);
638 }
639 req->r_dentry = dget(dentry);
640 req->r_num_caps = 2;
641 req->r_locked_dir = dir;
642 req->r_args.mknod.mode = cpu_to_le32(mode);
643 req->r_args.mknod.rdev = cpu_to_le32(rdev);
644 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
645 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
646 err = ceph_mdsc_do_request(mdsc, dir, req);
647 if (!err && !req->r_reply_info.head->is_dentry)
648 err = ceph_handle_notrace_create(dir, dentry);
649 ceph_mdsc_put_request(req);
650 if (err)
651 d_drop(dentry);
652 return err;
653}
654
655static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
656 struct nameidata *nd)
657{
658 dout("create in dir %p dentry %p name '%.*s'\n",
659 dir, dentry, dentry->d_name.len, dentry->d_name.name);
660
661 if (ceph_snap(dir) != CEPH_NOSNAP)
662 return -EROFS;
663
664 if (nd) {
665 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
666 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
667 /* hrm, what should i do here if we get aliased? */
668 if (IS_ERR(dentry))
669 return PTR_ERR(dentry);
670 return 0;
671 }
672
673 /* fall back to mknod */
674 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
675}
676
677static int ceph_symlink(struct inode *dir, struct dentry *dentry,
678 const char *dest)
679{
680 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
681 struct ceph_mds_client *mdsc = &client->mdsc;
682 struct ceph_mds_request *req;
683 int err;
684
685 if (ceph_snap(dir) != CEPH_NOSNAP)
686 return -EROFS;
687
688 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
689 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
690 if (IS_ERR(req)) {
691 d_drop(dentry);
692 return PTR_ERR(req);
693 }
694 req->r_dentry = dget(dentry);
695 req->r_num_caps = 2;
696 req->r_path2 = kstrdup(dest, GFP_NOFS);
697 req->r_locked_dir = dir;
698 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
699 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
700 err = ceph_mdsc_do_request(mdsc, dir, req);
701 if (!err && !req->r_reply_info.head->is_dentry)
702 err = ceph_handle_notrace_create(dir, dentry);
703 ceph_mdsc_put_request(req);
704 if (err)
705 d_drop(dentry);
706 return err;
707}
708
709static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
710{
711 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
712 struct ceph_mds_client *mdsc = &client->mdsc;
713 struct ceph_mds_request *req;
714 int err = -EROFS;
715 int op;
716
717 if (ceph_snap(dir) == CEPH_SNAPDIR) {
718 /* mkdir .snap/foo is a MKSNAP */
719 op = CEPH_MDS_OP_MKSNAP;
720 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
721 dentry->d_name.len, dentry->d_name.name, dentry);
722 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
723 dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
724 op = CEPH_MDS_OP_MKDIR;
725 } else {
726 goto out;
727 }
728 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
729 if (IS_ERR(req)) {
730 err = PTR_ERR(req);
731 goto out;
732 }
733
734 req->r_dentry = dget(dentry);
735 req->r_num_caps = 2;
736 req->r_locked_dir = dir;
737 req->r_args.mkdir.mode = cpu_to_le32(mode);
738 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
739 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
740 err = ceph_mdsc_do_request(mdsc, dir, req);
741 if (!err && !req->r_reply_info.head->is_dentry)
742 err = ceph_handle_notrace_create(dir, dentry);
743 ceph_mdsc_put_request(req);
744out:
745 if (err < 0)
746 d_drop(dentry);
747 return err;
748}
749
750static int ceph_link(struct dentry *old_dentry, struct inode *dir,
751 struct dentry *dentry)
752{
753 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
754 struct ceph_mds_client *mdsc = &client->mdsc;
755 struct ceph_mds_request *req;
756 int err;
757
758 if (ceph_snap(dir) != CEPH_NOSNAP)
759 return -EROFS;
760
761 dout("link in dir %p old_dentry %p dentry %p\n", dir,
762 old_dentry, dentry);
763 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
764 if (IS_ERR(req)) {
765 d_drop(dentry);
766 return PTR_ERR(req);
767 }
768 req->r_dentry = dget(dentry);
769 req->r_num_caps = 2;
770 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
771 req->r_locked_dir = dir;
772 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
773 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
774 err = ceph_mdsc_do_request(mdsc, dir, req);
775 if (err)
776 d_drop(dentry);
777 else if (!req->r_reply_info.head->is_dentry)
778 d_instantiate(dentry, igrab(old_dentry->d_inode));
779 ceph_mdsc_put_request(req);
780 return err;
781}
782
783/*
784 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
785 * looks like the link count will hit 0, drop any other caps (other
786 * than PIN) we don't specifically want (due to the file still being
787 * open).
788 */
789static int drop_caps_for_unlink(struct inode *inode)
790{
791 struct ceph_inode_info *ci = ceph_inode(inode);
792 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
793
794 spin_lock(&inode->i_lock);
795 if (inode->i_nlink == 1) {
796 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
797 ci->i_ceph_flags |= CEPH_I_NODELAY;
798 }
799 spin_unlock(&inode->i_lock);
800 return drop;
801}
802
803/*
804 * rmdir and unlink differ only by the metadata op code
805 */
806static int ceph_unlink(struct inode *dir, struct dentry *dentry)
807{
808 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
809 struct ceph_mds_client *mdsc = &client->mdsc;
810 struct inode *inode = dentry->d_inode;
811 struct ceph_mds_request *req;
812 int err = -EROFS;
813 int op;
814
815 if (ceph_snap(dir) == CEPH_SNAPDIR) {
816 /* rmdir .snap/foo is RMSNAP */
817 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
818 dentry->d_name.name, dentry);
819 op = CEPH_MDS_OP_RMSNAP;
820 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
821 dout("unlink/rmdir dir %p dn %p inode %p\n",
822 dir, dentry, inode);
823		op = S_ISDIR(dentry->d_inode->i_mode) ?
824			CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
825 } else
826 goto out;
827 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
828 if (IS_ERR(req)) {
829 err = PTR_ERR(req);
830 goto out;
831 }
832 req->r_dentry = dget(dentry);
833 req->r_num_caps = 2;
834 req->r_locked_dir = dir;
835 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
836 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
837 req->r_inode_drop = drop_caps_for_unlink(inode);
838 err = ceph_mdsc_do_request(mdsc, dir, req);
839 if (!err && !req->r_reply_info.head->is_dentry)
840 d_delete(dentry);
841 ceph_mdsc_put_request(req);
842out:
843 return err;
844}
845
846static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
847 struct inode *new_dir, struct dentry *new_dentry)
848{
849 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
850 struct ceph_mds_client *mdsc = &client->mdsc;
851 struct ceph_mds_request *req;
852 int err;
853
854 if (ceph_snap(old_dir) != ceph_snap(new_dir))
855 return -EXDEV;
856 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
857 ceph_snap(new_dir) != CEPH_NOSNAP)
858 return -EROFS;
859 dout("rename dir %p dentry %p to dir %p dentry %p\n",
860 old_dir, old_dentry, new_dir, new_dentry);
861 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
862 if (IS_ERR(req))
863 return PTR_ERR(req);
864 req->r_dentry = dget(new_dentry);
865 req->r_num_caps = 2;
866 req->r_old_dentry = dget(old_dentry);
867 req->r_locked_dir = new_dir;
868 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
869 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
870 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
871 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
872 /* release LINK_RDCACHE on source inode (mds will lock it) */
873 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
874 if (new_dentry->d_inode)
875 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
876 err = ceph_mdsc_do_request(mdsc, old_dir, req);
877 if (!err && !req->r_reply_info.head->is_dentry) {
878 /*
879 * Normally d_move() is done by fill_trace (called by
880 * do_request, above). If there is no trace, we need
881 * to do it here.
882 */
883 d_move(old_dentry, new_dentry);
884 }
885 ceph_mdsc_put_request(req);
886 return err;
887}
888
889
890/*
891 * Check if dentry lease is valid. If not, delete the lease. Try to
892 * renew if the lease is more than half up.
893 */
894static int dentry_lease_is_valid(struct dentry *dentry)
895{
896 struct ceph_dentry_info *di;
897 struct ceph_mds_session *s;
898 int valid = 0;
899 u32 gen;
900 unsigned long ttl;
901 struct ceph_mds_session *session = NULL;
902 struct inode *dir = NULL;
903 u32 seq = 0;
904
905 spin_lock(&dentry->d_lock);
906 di = ceph_dentry(dentry);
907 if (di && di->lease_session) {
908 s = di->lease_session;
909 spin_lock(&s->s_cap_lock);
910 gen = s->s_cap_gen;
911 ttl = s->s_cap_ttl;
912 spin_unlock(&s->s_cap_lock);
913
914 if (di->lease_gen == gen &&
915 time_before(jiffies, dentry->d_time) &&
916 time_before(jiffies, ttl)) {
917 valid = 1;
918 if (di->lease_renew_after &&
919 time_after(jiffies, di->lease_renew_after)) {
920 /* we should renew */
921 dir = dentry->d_parent->d_inode;
922 session = ceph_get_mds_session(s);
923 seq = di->lease_seq;
924 di->lease_renew_after = 0;
925 di->lease_renew_from = jiffies;
926 }
927 }
928 }
929 spin_unlock(&dentry->d_lock);
930
931 if (session) {
932 ceph_mdsc_lease_send_msg(session, dir, dentry,
933 CEPH_MDS_LEASE_RENEW, seq);
934 ceph_put_mds_session(session);
935 }
936 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
937 return valid;
938}
939
940/*
941 * Check if directory-wide content lease/cap is valid.
942 */
943static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
944{
945 struct ceph_inode_info *ci = ceph_inode(dir);
946 struct ceph_dentry_info *di = ceph_dentry(dentry);
947 int valid = 0;
948
949 spin_lock(&dir->i_lock);
950 if (ci->i_shared_gen == di->lease_shared_gen)
951 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
952 spin_unlock(&dir->i_lock);
953 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
954 dir, (unsigned)ci->i_shared_gen, dentry,
955 (unsigned)di->lease_shared_gen, valid);
956 return valid;
957}
958
959/*
960 * Check if cached dentry can be trusted.
961 */
962static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
963{
964 struct inode *dir = dentry->d_parent->d_inode;
965
966 dout("d_revalidate %p '%.*s' inode %p\n", dentry,
967 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
968
969 /* always trust cached snapped dentries, snapdir dentry */
970 if (ceph_snap(dir) != CEPH_NOSNAP) {
971 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
972 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
973 goto out_touch;
974 }
975 if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
976 goto out_touch;
977
978 if (dentry_lease_is_valid(dentry) ||
979 dir_lease_is_valid(dir, dentry))
980 goto out_touch;
981
982 dout("d_revalidate %p invalid\n", dentry);
983 d_drop(dentry);
984 return 0;
985out_touch:
986 ceph_dentry_lru_touch(dentry);
987 return 1;
988}
989
990/*
991 * When a dentry is released, clear the dir I_COMPLETE if it was part
992 * of the current dir gen.
993 */
994static void ceph_dentry_release(struct dentry *dentry)
995{
996 struct ceph_dentry_info *di = ceph_dentry(dentry);
997 struct inode *parent_inode = dentry->d_parent->d_inode;
998
999 if (parent_inode) {
1000 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1001
1002 spin_lock(&parent_inode->i_lock);
1003 if (ci->i_shared_gen == di->lease_shared_gen) {
1004 dout(" clearing %p complete (d_release)\n",
1005 parent_inode);
1006 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1007 ci->i_release_count++;
1008 }
1009 spin_unlock(&parent_inode->i_lock);
1010 }
1011 if (di) {
1012 ceph_dentry_lru_del(dentry);
1013 if (di->lease_session)
1014 ceph_put_mds_session(di->lease_session);
1015 kmem_cache_free(ceph_dentry_cachep, di);
1016 dentry->d_fsdata = NULL;
1017 }
1018}
1019
1020static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1021 struct nameidata *nd)
1022{
1023 /*
1024 * Eventually, we'll want to revalidate snapped metadata
1025 * too... probably...
1026 */
1027 return 1;
1028}
1029
1030
1031
1032/*
1033 * read() on a dir. This weird interface hack only works if mounted
1034 * with '-o dirstat'.
1035 */
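/*
 * Usage sketch (monitor address and paths hypothetical):
 *
 *	# mount -t ceph -o dirstat monhost:/ /mnt/ceph
 *	# cat /mnt/ceph/some/dir
 */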
1036static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1037 loff_t *ppos)
1038{
1039 struct ceph_file_info *cf = file->private_data;
1040 struct inode *inode = file->f_dentry->d_inode;
1041 struct ceph_inode_info *ci = ceph_inode(inode);
1042 int left;
1043
1044 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
1045 return -EISDIR;
1046
1047 if (!cf->dir_info) {
1048 cf->dir_info = kmalloc(1024, GFP_NOFS);
1049 if (!cf->dir_info)
1050 return -ENOMEM;
1051 cf->dir_info_len =
1052 sprintf(cf->dir_info,
1053 "entries: %20lld\n"
1054 " files: %20lld\n"
1055 " subdirs: %20lld\n"
1056 "rentries: %20lld\n"
1057 " rfiles: %20lld\n"
1058 " rsubdirs: %20lld\n"
1059 "rbytes: %20lld\n"
1060 "rctime: %10ld.%09ld\n",
1061 ci->i_files + ci->i_subdirs,
1062 ci->i_files,
1063 ci->i_subdirs,
1064 ci->i_rfiles + ci->i_rsubdirs,
1065 ci->i_rfiles,
1066 ci->i_rsubdirs,
1067 ci->i_rbytes,
1068 (long)ci->i_rctime.tv_sec,
1069 (long)ci->i_rctime.tv_nsec);
1070 }
1071
1072 if (*ppos >= cf->dir_info_len)
1073 return 0;
1074 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1075 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1076 if (left == size)
1077 return -EFAULT;
1078 *ppos += (size - left);
1079 return size - left;
1080}
1081
1082/*
1083 * an fsync() on a dir will wait for any uncommitted directory
1084 * operations to commit.
1085 */
1086static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
1087 int datasync)
1088{
1089 struct inode *inode = dentry->d_inode;
1090 struct ceph_inode_info *ci = ceph_inode(inode);
1091 struct list_head *head = &ci->i_unsafe_dirops;
1092 struct ceph_mds_request *req;
1093 u64 last_tid;
1094 int ret = 0;
1095
1096 dout("dir_fsync %p\n", inode);
1097 spin_lock(&ci->i_unsafe_lock);
1098 if (list_empty(head))
1099 goto out;
1100
1101 req = list_entry(head->prev,
1102 struct ceph_mds_request, r_unsafe_dir_item);
1103 last_tid = req->r_tid;
1104
1105 do {
1106 ceph_mdsc_get_request(req);
1107 spin_unlock(&ci->i_unsafe_lock);
1108 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1109 inode, req->r_tid, last_tid);
1110 if (req->r_timeout) {
1111 ret = wait_for_completion_timeout(
1112 &req->r_safe_completion, req->r_timeout);
1113 if (ret > 0)
1114 ret = 0;
1115 else if (ret == 0)
1116 ret = -EIO; /* timed out */
1117 } else {
1118 wait_for_completion(&req->r_safe_completion);
1119 }
1120 spin_lock(&ci->i_unsafe_lock);
1121 ceph_mdsc_put_request(req);
1122
1123 if (ret || list_empty(head))
1124 break;
1125 req = list_entry(head->next,
1126 struct ceph_mds_request, r_unsafe_dir_item);
1127 } while (req->r_tid < last_tid);
1128out:
1129 spin_unlock(&ci->i_unsafe_lock);
1130 return ret;
1131}
1132
1133/*
1134 * We maintain a private dentry LRU.
1135 *
1136 * FIXME: this needs to be changed to a per-mds lru to be useful.
1137 */
1138void ceph_dentry_lru_add(struct dentry *dn)
1139{
1140 struct ceph_dentry_info *di = ceph_dentry(dn);
1141 struct ceph_mds_client *mdsc;
1142
1143 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1144 dn->d_name.len, dn->d_name.name);
1145 if (di) {
1146 mdsc = &ceph_client(dn->d_sb)->mdsc;
1147 spin_lock(&mdsc->dentry_lru_lock);
1148 list_add_tail(&di->lru, &mdsc->dentry_lru);
1149 mdsc->num_dentry++;
1150 spin_unlock(&mdsc->dentry_lru_lock);
1151 }
1152}
1153
1154void ceph_dentry_lru_touch(struct dentry *dn)
1155{
1156 struct ceph_dentry_info *di = ceph_dentry(dn);
1157 struct ceph_mds_client *mdsc;
1158
1159 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
1160 dn->d_name.len, dn->d_name.name);
1161 if (di) {
1162 mdsc = &ceph_client(dn->d_sb)->mdsc;
1163 spin_lock(&mdsc->dentry_lru_lock);
1164 list_move_tail(&di->lru, &mdsc->dentry_lru);
1165 spin_unlock(&mdsc->dentry_lru_lock);
1166 }
1167}
1168
1169void ceph_dentry_lru_del(struct dentry *dn)
1170{
1171 struct ceph_dentry_info *di = ceph_dentry(dn);
1172 struct ceph_mds_client *mdsc;
1173
1174 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1175 dn->d_name.len, dn->d_name.name);
1176 if (di) {
1177 mdsc = &ceph_client(dn->d_sb)->mdsc;
1178 spin_lock(&mdsc->dentry_lru_lock);
1179 list_del_init(&di->lru);
1180 mdsc->num_dentry--;
1181 spin_unlock(&mdsc->dentry_lru_lock);
1182 }
1183}
1184
1185const struct file_operations ceph_dir_fops = {
1186 .read = ceph_read_dir,
1187 .readdir = ceph_readdir,
1188 .llseek = ceph_dir_llseek,
1189 .open = ceph_open,
1190 .release = ceph_release,
1191 .unlocked_ioctl = ceph_ioctl,
1192 .fsync = ceph_dir_fsync,
1193};
1194
1195const struct inode_operations ceph_dir_iops = {
1196 .lookup = ceph_lookup,
1197 .permission = ceph_permission,
1198 .getattr = ceph_getattr,
1199 .setattr = ceph_setattr,
1200 .setxattr = ceph_setxattr,
1201 .getxattr = ceph_getxattr,
1202 .listxattr = ceph_listxattr,
1203 .removexattr = ceph_removexattr,
1204 .mknod = ceph_mknod,
1205 .symlink = ceph_symlink,
1206 .mkdir = ceph_mkdir,
1207 .link = ceph_link,
1208 .unlink = ceph_unlink,
1209 .rmdir = ceph_unlink,
1210 .rename = ceph_rename,
1211 .create = ceph_create,
1212};
1213
1214struct dentry_operations ceph_dentry_ops = {
1215 .d_revalidate = ceph_d_revalidate,
1216 .d_release = ceph_dentry_release,
1217};
1218
1219struct dentry_operations ceph_snapdir_dentry_ops = {
1220 .d_revalidate = ceph_snapdir_d_revalidate,
1221};
1222
1223struct dentry_operations ceph_snap_dentry_ops = {
1224};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..9d67572fb328
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,224 @@
1#include "ceph_debug.h"
2
3#include <linux/exportfs.h>
4#include <linux/slab.h>
5#include <asm/unaligned.h>
6
7#include "super.h"
8
9/*
10 * NFS export support
11 *
12 * NFS re-export of a ceph mount is, at present, only semireliable.
13 * The basic issue is that the Ceph architecture doesn't lend itself
14 * well to generating filehandles that will remain valid forever.
15 *
16 * So, we do our best. If you're lucky, your inode will be in the
17 * client's cache. If it's not, and you have a connectable fh, then
18 * the MDS server may be able to find it for you. Otherwise, you get
19 * ESTALE.
20 *
21 * There are ways to make this more reliable, but in the non-connectable fh
22 * case, it won't ever work perfectly, and in the connectable case,
23 * some changes are needed on the MDS side to work better.
24 */
25
26/*
27 * Basic fh
28 */
29struct ceph_nfs_fh {
30 u64 ino;
31} __attribute__ ((packed));
32
33/*
34 * Larger 'connectable' fh that includes parent ino and name hash.
35 * Use this whenever possible, as it works more reliably.
36 */
37struct ceph_nfs_confh {
38 u64 ino, parent_ino;
39 u32 parent_name_hash;
40} __attribute__ ((packed));
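/*
 * Size sanity sketch: with the packed attribute the two flavours come
 * to 8 and 20 bytes (illustrative BUILD_BUG_ONs, assuming placement
 * inside some function):
 *
 *	BUILD_BUG_ON(sizeof(struct ceph_nfs_fh) != 8);
 *	BUILD_BUG_ON(sizeof(struct ceph_nfs_confh) != 20);
 */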
41
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable)
44{
45 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode;
49 int type;
50
51 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL;
54
55 if (*max_len >= sizeof(*cfh)) {
56 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh);
61 type = 2;
62 } else if (*max_len > sizeof(*fh)) {
63 if (connectable)
64 return -ENOSPC;
65 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh);
68 type = 1;
69 } else {
70 return -ENOSPC;
71 }
72 return type;
73}
74
75/*
76 * convert regular fh to dentry
77 *
78 * FIXME: we should try harder by querying the mds for the ino.
79 */
80static struct dentry *__fh_to_dentry(struct super_block *sb,
81 struct ceph_nfs_fh *fh)
82{
83 struct inode *inode;
84 struct dentry *dentry;
85 struct ceph_vino vino;
86 int err;
87
88 dout("__fh_to_dentry %llx\n", fh->ino);
89 vino.ino = fh->ino;
90 vino.snap = CEPH_NOSNAP;
91 inode = ceph_find_inode(sb, vino);
92 if (!inode)
93 return ERR_PTR(-ESTALE);
94
95 dentry = d_obtain_alias(inode);
96	if (IS_ERR(dentry)) {
97		/* d_obtain_alias() already dropped the inode ref on failure */
98		pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
99		       fh->ino, inode);
100		return ERR_CAST(dentry);
101	}
102 err = ceph_init_dentry(dentry);
103
104 if (err < 0) {
105 iput(inode);
106 return ERR_PTR(err);
107 }
108 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
109 return dentry;
110}
111
112/*
113 * convert connectable fh to dentry
114 */
115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh)
117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
119 struct inode *inode;
120 struct dentry *dentry;
121 struct ceph_vino vino;
122 int err;
123
124 dout("__cfh_to_dentry %llx (%llx/%x)\n",
125 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
126
127 vino.ino = cfh->ino;
128 vino.snap = CEPH_NOSNAP;
129 inode = ceph_find_inode(sb, vino);
130 if (!inode) {
131 struct ceph_mds_request *req;
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS);
135 if (IS_ERR(req))
136			return ERR_CAST(req);
137
138 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino;
140 req->r_ino2.snap = CEPH_NOSNAP;
141 req->r_path2 = kmalloc(16, GFP_NOFS);
142 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
143 req->r_num_caps = 1;
144 err = ceph_mdsc_do_request(mdsc, NULL, req);
145 ceph_mdsc_put_request(req);
146 inode = ceph_find_inode(sb, vino);
147 if (!inode)
148 return ERR_PTR(err ? err : -ESTALE);
149 }
150
151 dentry = d_obtain_alias(inode);
152	if (IS_ERR(dentry)) {
153		/* d_obtain_alias() already dropped the inode ref on failure */
154		pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
155		       cfh->ino, inode);
156		return ERR_CAST(dentry);
157	}
158 err = ceph_init_dentry(dentry);
159 if (err < 0) {
160 iput(inode);
161 return ERR_PTR(err);
162 }
163 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
164 return dentry;
165}
166
167static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
168 int fh_len, int fh_type)
169{
170 if (fh_type == 1)
171 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
172 else
173 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
174}
175
176/*
177 * get parent, if possible.
178 *
179 * FIXME: we could do better by querying the mds to discover the
180 * parent.
181 */
182static struct dentry *ceph_fh_to_parent(struct super_block *sb,
183 struct fid *fid,
184 int fh_len, int fh_type)
185{
186 struct ceph_nfs_confh *cfh = (void *)fid->raw;
187 struct ceph_vino vino;
188 struct inode *inode;
189 struct dentry *dentry;
190 int err;
191
192 if (fh_type == 1)
193 return ERR_PTR(-ESTALE);
194
195 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
196 cfh->parent_name_hash);
197
198 vino.ino = cfh->ino;
199 vino.snap = CEPH_NOSNAP;
200 inode = ceph_find_inode(sb, vino);
201 if (!inode)
202 return ERR_PTR(-ESTALE);
203
204 dentry = d_obtain_alias(inode);
205	if (IS_ERR(dentry)) {
206		/* d_obtain_alias() already dropped the inode ref on failure */
207		pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
208		       cfh->ino, inode);
209		return ERR_CAST(dentry);
210	}
211 err = ceph_init_dentry(dentry);
212 if (err < 0) {
213 iput(inode);
214 return ERR_PTR(err);
215 }
216 dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
217 return dentry;
218}
219
220const struct export_operations ceph_export_ops = {
221 .encode_fh = ceph_encode_fh,
222 .fh_to_dentry = ceph_fh_to_dentry,
223 .fh_to_parent = ceph_fh_to_parent,
224};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..4add3d5da2c1
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,938 @@
1#include "ceph_debug.h"
2
3#include <linux/sched.h>
4#include <linux/slab.h>
5#include <linux/file.h>
6#include <linux/namei.h>
7#include <linux/writeback.h>
8
9#include "super.h"
10#include "mds_client.h"
11
12/*
13 * Ceph file operations
14 *
15 * Implement basic open/close functionality, along with
16 * read/write.
17 *
18 * We implement three modes of file I/O:
19 * - buffered uses the generic_file_aio_{read,write} helpers
20 *
21 * - synchronous is used when there is multi-client read/write
22 * sharing, avoids the page cache, and synchronously waits for an
23 * ack from the OSD.
24 *
25 * - direct io takes the variant of the sync path that references
26 * user pages directly.
27 *
28 * fsync() flushes and waits on dirty pages, but just queues metadata
29 * for writeback: since the MDS can recover size and mtime there is no
30 * need to wait for MDS acknowledgement.
31 */
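/*
 * [Editor's sketch, not in the original file] The buffered-vs-sync
 * decision described above reduces to a predicate like the one below,
 * mirroring the checks ceph_aio_read() makes later in this file
 * (ceph_aio_write() does the same with CEPH_CAP_FILE_BUFFER in place
 * of FILE_CACHE). The helper name is hypothetical:
 */
static inline bool ceph_want_sync_read_sketch(int got, unsigned int f_flags,
					      unsigned long s_flags)
{
	return (got & CEPH_CAP_FILE_CACHE) == 0 ||	/* no cache cap held */
	       (f_flags & O_DIRECT) ||			/* opened O_DIRECT */
	       (s_flags & MS_SYNCHRONOUS);		/* sync mount */
}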
32
33
34/*
35 * Prepare an open request. Preallocate ceph_cap to avoid an
36 * inopportune ENOMEM later.
37 */
38static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{
41 struct ceph_client *client = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc;
43 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
46
47 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
48 want_auth = USE_AUTH_MDS;
49
50 req = ceph_mdsc_create_request(mdsc, op, want_auth);
51 if (IS_ERR(req))
52 goto out;
53 req->r_fmode = ceph_flags_to_mode(flags);
54 req->r_args.open.flags = cpu_to_le32(flags);
55 req->r_args.open.mode = cpu_to_le32(create_mode);
56 req->r_args.open.preferred = cpu_to_le32(-1);
57out:
58 return req;
59}
60
61/*
62 * initialize private struct file data.
63 * if we fail, clean up by dropping fmode reference on the ceph_inode
64 */
65static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
66{
67 struct ceph_file_info *cf;
68 int ret = 0;
69
70 switch (inode->i_mode & S_IFMT) {
71 case S_IFREG:
72 case S_IFDIR:
73 dout("init_file %p %p 0%o (regular)\n", inode, file,
74 inode->i_mode);
75 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
76 if (cf == NULL) {
77 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
78 return -ENOMEM;
79 }
80 cf->fmode = fmode;
81 cf->next_offset = 2;
82 file->private_data = cf;
83 BUG_ON(inode->i_fop->release != ceph_release);
84 break;
85
86 case S_IFLNK:
87 dout("init_file %p %p 0%o (symlink)\n", inode, file,
88 inode->i_mode);
89 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
90 break;
91
92 default:
93 dout("init_file %p %p 0%o (special)\n", inode, file,
94 inode->i_mode);
95 /*
96 * we need to drop the open ref now, since we don't
97 * have .release set to ceph_release.
98 */
99 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
100 BUG_ON(inode->i_fop->release == ceph_release);
101
102 /* call the proper open fop */
103 ret = inode->i_fop->open(inode, file);
104 }
105 return ret;
106}
107
108/*
109 * If the filp already has private_data, that means the file was
110 * already opened by intent during lookup, and we do nothing.
111 *
112 * If we already have the requisite capabilities, we can satisfy
113 * the open request locally (no need to request new caps from the
114 * MDS). We do, however, need to inform the MDS (asynchronously)
115 * if our wanted caps set expands.
116 */
117int ceph_open(struct inode *inode, struct file *file)
118{
119 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc;
122 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
125 int err;
126 int flags, fmode, wanted;
127
128 if (cf) {
129 dout("open file %p is already opened\n", file);
130 return 0;
131 }
132
133 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
134 flags = file->f_flags & ~(O_CREAT|O_EXCL);
135 if (S_ISDIR(inode->i_mode))
136 flags = O_DIRECTORY; /* mds likes to know */
137
138 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
139 ceph_vinop(inode), file, flags, file->f_flags);
140 fmode = ceph_flags_to_mode(flags);
141 wanted = ceph_caps_for_mode(fmode);
142
143 /* snapped files are read-only */
144 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
145 return -EROFS;
146
147 /* trivially open snapdir */
148 if (ceph_snap(inode) == CEPH_SNAPDIR) {
149 spin_lock(&inode->i_lock);
150 __ceph_get_fmode(ci, fmode);
151 spin_unlock(&inode->i_lock);
152 return ceph_init_file(inode, file, fmode);
153 }
154
155 /*
156 * No need to block if we have any caps. Update wanted set
157 * asynchronously.
158 */
159 spin_lock(&inode->i_lock);
160 if (__ceph_is_any_real_caps(ci)) {
161 int mds_wanted = __ceph_caps_mds_wanted(ci);
162 int issued = __ceph_caps_issued(ci, NULL);
163
164 dout("open %p fmode %d want %s issued %s using existing\n",
165 inode, fmode, ceph_cap_string(wanted),
166 ceph_cap_string(issued));
167 __ceph_get_fmode(ci, fmode);
168 spin_unlock(&inode->i_lock);
169
170 /* adjust wanted? */
171 if ((issued & wanted) != wanted &&
172 (mds_wanted & wanted) != wanted &&
173 ceph_snap(inode) != CEPH_SNAPDIR)
174 ceph_check_caps(ci, 0, NULL);
175
176 return ceph_init_file(inode, file, fmode);
177 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
178 (ci->i_snap_caps & wanted) == wanted) {
179 __ceph_get_fmode(ci, fmode);
180 spin_unlock(&inode->i_lock);
181 return ceph_init_file(inode, file, fmode);
182 }
183 spin_unlock(&inode->i_lock);
184
185 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
186 req = prepare_open_request(inode->i_sb, flags, 0);
187 if (IS_ERR(req)) {
188 err = PTR_ERR(req);
189 goto out;
190 }
191 req->r_inode = igrab(inode);
192 req->r_num_caps = 1;
193 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
194 if (!err)
195 err = ceph_init_file(inode, file, req->r_fmode);
196 ceph_mdsc_put_request(req);
197 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
198out:
199 return err;
200}
201
202
203/*
204 * Do a lookup + open with a single request.
205 *
206 * If this succeeds, but some subsequent check in the vfs
207 * may_open() fails, the struct *file gets cleaned up (i.e.
208 * ceph_release gets called). So fear not!
209 */
210/*
211 * flags
212 * path_lookup_open -> LOOKUP_OPEN
213 * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
214 */
215struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode,
217 int locked_dir)
218{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc;
221 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req;
224 int err;
225 int flags = nd->intent.open.flags - 1; /* silly vfs! */
226
227 dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
228 dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
229
230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req))
233 return ERR_PTR(PTR_ERR(req));
234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2;
236 if (flags & O_CREAT) {
237 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
238 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
239 }
240 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
241 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
242 dentry = ceph_finish_lookup(req, dentry, err);
243 if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
244 err = ceph_handle_notrace_create(dir, dentry);
245 if (!err)
246 err = ceph_init_file(req->r_dentry->d_inode, file,
247 req->r_fmode);
248 ceph_mdsc_put_request(req);
249 dout("ceph_lookup_open result=%p\n", dentry);
250 return dentry;
251}
252
253int ceph_release(struct inode *inode, struct file *file)
254{
255 struct ceph_inode_info *ci = ceph_inode(inode);
256 struct ceph_file_info *cf = file->private_data;
257
258 dout("release inode %p file %p\n", inode, file);
259 ceph_put_fmode(ci, cf->fmode);
260 if (cf->last_readdir)
261 ceph_mdsc_put_request(cf->last_readdir);
262 kfree(cf->last_name);
263 kfree(cf->dir_info);
264 dput(cf->dentry);
265 kmem_cache_free(ceph_file_cachep, cf);
266
267 /* wake up anyone waiting for caps on this inode */
268 wake_up(&ci->i_cap_wq);
269 return 0;
270}
271
272/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
318 * allocate a vector of new pages
319 */
320static struct page **alloc_page_vector(int num_pages)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
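/*
 * [Editor's example] With PAGE_CACHE_SIZE == 4096, copying len == 5000
 * bytes at off == 5000 starts in pages[0] at in-page offset
 * po == 5000 & ~PAGE_CACHE_MASK == 904: the first chunk copies
 * l == 4096 - 904 == 3192 bytes, po wraps back to 0, and the remaining
 * 1808 bytes land at the start of pages[1].
 */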
365
366/*
367 * copy data from a page vector into a user pointer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.)
432 *
433 * If we get a short result from the OSD, check against i_size; we need to
434 * only return a short read to the caller if we hit EOF.
435 */
436static int striped_read(struct inode *inode,
437 u64 off, u64 len,
438 struct page **pages, int num_pages,
439 int *checkeof)
440{
441 struct ceph_client *client = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
445 int left, pages_left;
446 int read;
447 struct page **page_pos;
448 int ret;
449 bool hit_stripe, was_short;
450
451 /*
452 * we may need to do multiple reads. not atomic, unfortunately.
453 */
454 pos = off;
455 left = len;
456 page_pos = pages;
457 pages_left = num_pages;
458 read = 0;
459
460more:
461 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq,
465 ci->i_truncate_size,
466 page_pos, pages_left);
467 hit_stripe = this_len < left;
468 was_short = ret >= 0 && ret < this_len;
469 if (ret == -ENOENT)
470 ret = 0;
471 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
472 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
473
474 if (ret > 0) {
475 int didpages =
476 ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
477
478 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read,
481 pos - off - read, pages);
482 }
483 pos += ret;
484 read = pos - off;
485 left -= ret;
486 page_pos += didpages;
487 pages_left -= didpages;
488
489 /* hit stripe? */
490 if (left && hit_stripe)
491 goto more;
492 }
493
494 if (was_short) {
495 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) {
497 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read,
499 pages);
500 read = len;
501 goto out;
502 }
503
504 /* check i_size */
505 *checkeof = 1;
506 }
507
508out:
509 if (ret >= 0)
510 ret = read;
511 dout("striped_read returns %d\n", ret);
512 return ret;
513}
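/*
 * [Editor's example] Suppose i_size == 6000 and the caller reads 8192
 * bytes at offset 0: the OSD returns 6000 (was_short), and since
 * pos + left == 8192 > i_size, *checkeof is set and the short read is
 * returned. If instead i_size were 10000 (a hole beyond this object's
 * data), the tail would be zero-filled and the full 8192 returned.
 */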
514
515/*
516 * Completely synchronous read and write methods. Direct from __user
517 * buffer to osd, or directly to user pages (if O_DIRECT).
518 *
519 * If the read spans an object boundary, just do multiple reads.
520 */
521static ssize_t ceph_sync_read(struct file *file, char __user *data,
522 unsigned len, loff_t *poff, int *checkeof)
523{
524 struct inode *inode = file->f_dentry->d_inode;
525 struct page **pages;
526 u64 off = *poff;
527 int num_pages = calc_pages_for(off, len);
528 int ret;
529
530 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532
533 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len);
535
536 /*
537 * flush any page cache pages in this range. this
538 * will make concurrent normal and O_DIRECT io slow,
539 * but it will at least behave sensibly when they are
540 * in sequence.
541 */
542 } else {
543 pages = alloc_page_vector(num_pages);
544 }
545 if (IS_ERR(pages))
546 return PTR_ERR(pages);
547
548 ret = filemap_write_and_wait(inode->i_mapping);
549 if (ret < 0)
550 goto done;
551
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0)
557 *poff = off + ret;
558
559done:
560 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages);
562 else
563 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret);
565 return ret;
566}
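/*
 * [Editor's sketch] calc_pages_for() is defined elsewhere in the ceph
 * tree; judging from its use above, it counts the pages an extent
 * touches, along these lines (an assumption, not a quote of the real
 * helper):
 */
static inline int calc_pages_for_sketch(u64 off, u64 len)
{
	/* page index one past the last byte, minus index of the first */
	return ((off + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
		(off >> PAGE_CACHE_SHIFT);
}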
567
568/*
569 * Write commit callback, called if we requested both an ACK and
570 * ONDISK commit reply from the OSD.
571 */
572static void sync_write_commit(struct ceph_osd_request *req,
573 struct ceph_msg *msg)
574{
575 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
576
577 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
578 spin_lock(&ci->i_unsafe_lock);
579 list_del_init(&req->r_unsafe_item);
580 spin_unlock(&ci->i_unsafe_lock);
581 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
582}
583
584/*
585 * Synchronous write, straight from __user pointer or user pages (if
586 * O_DIRECT).
587 *
588 * If the write spans an object boundary, just do multiple writes. (For a
589 * correct atomic write, we should e.g. take write locks on all
590 * objects, rollback on failure, etc.)
591 */
592static ssize_t ceph_sync_write(struct file *file, const char __user *data,
593 size_t left, loff_t *offset)
594{
595 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req;
599 struct page **pages;
600 int num_pages;
601 long long unsigned pos;
602 u64 len;
603 int written = 0;
604 int flags;
605 int do_sync = 0;
606 int check_caps = 0;
607 int ret;
608 struct timespec mtime = CURRENT_TIME;
609
610 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
611 return -EROFS;
612
613 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
614 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
615
616 if (file->f_flags & O_APPEND)
617 pos = i_size_read(inode);
618 else
619 pos = *offset;
620
621 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
622 if (ret < 0)
623 return ret;
624
625 ret = invalidate_inode_pages2_range(inode->i_mapping,
626 pos >> PAGE_CACHE_SHIFT,
627 (pos + left) >> PAGE_CACHE_SHIFT);
628 if (ret < 0)
629 dout("invalidate_inode_pages2_range returned %d\n", ret);
630
631 flags = CEPH_OSD_FLAG_ORDERSNAP |
632 CEPH_OSD_FLAG_ONDISK |
633 CEPH_OSD_FLAG_WRITE;
634 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
635 flags |= CEPH_OSD_FLAG_ACK;
636 else
637 do_sync = 1;
638
639 /*
640 * we may need to do multiple writes here if we span an object
641 * boundary. this isn't atomic, unfortunately. :(
642 */
643more:
644 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context,
649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2);
652 if (IS_ERR(req))
653 return PTR_ERR(req);
654
655 num_pages = calc_pages_for(pos, len);
656
657 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages);
661 goto out;
662 }
663
664 /*
665 * throw out any page cache pages in this range. this
666 * may block.
667 */
668 truncate_inode_pages_range(inode->i_mapping, pos, (pos+len) | (PAGE_CACHE_SIZE-1));
669 } else {
670 pages = alloc_page_vector(num_pages);
671 if (IS_ERR(pages)) {
672 ret = PTR_ERR(pages);
673 goto out;
674 }
675 ret = copy_user_to_page_vector(pages, data, pos, len);
676 if (ret < 0) {
677 ceph_release_page_vector(pages, num_pages);
678 goto out;
679 }
680
681 if ((file->f_flags & O_SYNC) == 0) {
682 /* get a second commit callback */
683 req->r_safe_callback = sync_write_commit;
684 req->r_own_pages = 1;
685 }
686 }
687 req->r_pages = pages;
688 req->r_num_pages = num_pages;
689 req->r_inode = inode;
690
691 ret = ceph_osdc_start_request(&client->osdc, req, false);
692 if (!ret) {
693 if (req->r_safe_callback) {
694 /*
695 * Add to inode unsafe list only after we
696 * start_request so that a tid has been assigned.
697 */
698 spin_lock(&ci->i_unsafe_lock);
699 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
700 spin_unlock(&ci->i_unsafe_lock);
701 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
702 }
703 ret = ceph_osdc_wait_request(&client->osdc, req);
704 }
705
706 if (file->f_flags & O_DIRECT)
707 put_page_vector(pages, num_pages);
708 else if (file->f_flags & O_SYNC)
709 ceph_release_page_vector(pages, num_pages);
710
711out:
712 ceph_osdc_put_request(req);
713 if (ret == 0) {
714 pos += len;
715 written += len;
716 left -= len;
717 if (left)
718 goto more;
719
720 ret = written;
721 *offset = pos;
722 if (pos > i_size_read(inode))
723 check_caps = ceph_inode_set_size(inode, pos);
724 if (check_caps)
725 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
726 NULL);
727 }
728 return ret;
729}
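/*
 * [Editor's example] For a plain buffered write (no O_SYNC, no
 * O_DIRECT) the request carries CEPH_OSD_FLAG_ACK in addition to
 * ONDISK, so ceph_osdc_wait_request() can return on the OSD's ack;
 * sync_write_commit() then runs later, when the ONDISK commit arrives,
 * and drops the unsafe-write tracking. With O_SYNC or O_DIRECT no ACK
 * is requested and the wait lasts until the data is on disk.
 */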
730
731/*
732 * Wrap generic_file_aio_read with checks for cap bits on the inode.
733 * Atomically grab references, so that those bits are not released
734 * back to the MDS mid-read.
735 *
736 * Hmm, the sync read case isn't actually async... should it be?
737 */
738static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
739 unsigned long nr_segs, loff_t pos)
740{
741 struct file *filp = iocb->ki_filp;
742 loff_t *ppos = &iocb->ki_pos;
743 size_t len = iov->iov_len;
744 struct inode *inode = filp->f_dentry->d_inode;
745 struct ceph_inode_info *ci = ceph_inode(inode);
746 void *base = iov->iov_base;
747 ssize_t ret;
748 int got = 0;
749 int checkeof = 0, read = 0;
750
751 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
752 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
753again:
754 __ceph_do_pending_vmtruncate(inode);
755 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
756 &got, -1);
757 if (ret < 0)
758 goto out;
759 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
760 inode, ceph_vinop(inode), pos, (unsigned)len,
761 ceph_cap_string(got));
762
763 if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
764 (iocb->ki_filp->f_flags & O_DIRECT) ||
765 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
766 /* hmm, this isn't really async... */
767 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
768 else
769 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
770
771out:
772 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
773 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
774 ceph_put_cap_refs(ci, got);
775
776 if (checkeof && ret >= 0) {
777 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
778
779 /* hit EOF or hole? */
780 if (statret == 0 && *ppos < inode->i_size) {
781 dout("aio_read sync_read hit hole, reading more\n");
782 read += ret;
783 base += ret;
784 len -= ret;
785 checkeof = 0;
786 goto again;
787 }
788 }
789 if (ret >= 0)
790 ret += read;
791
792 return ret;
793}
794
795/*
796 * Take cap references to avoid releasing caps to MDS mid-write.
797 *
798 * If we are synchronous, and write with an old snap context, the OSD
799 * may return EOLDSNAPC. In that case, retry the write.. _after_
800 * dropping our cap refs and allowing the pending snap to logically
801 * complete _before_ this write occurs.
802 *
803 * If we are near ENOSPC, write synchronously.
804 */
805static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
806 unsigned long nr_segs, loff_t pos)
807{
808 struct file *file = iocb->ki_filp;
809 struct inode *inode = file->f_dentry->d_inode;
810 struct ceph_inode_info *ci = ceph_inode(inode);
811 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
812 loff_t endoff = pos + iov->iov_len;
813 int got = 0;
814 int ret, err;
815
816 if (ceph_snap(inode) != CEPH_NOSNAP)
817 return -EROFS;
818
819retry_snap:
820 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
821 return -ENOSPC;
822 __ceph_do_pending_vmtruncate(inode);
823 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
824 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
825 inode->i_size);
826 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
827 &got, endoff);
828 if (ret < 0)
829 goto out;
830
831 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
832 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
833 ceph_cap_string(got));
834
835 if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
836 (iocb->ki_filp->f_flags & O_DIRECT) ||
837 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
838 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
839 &iocb->ki_pos);
840 } else {
841 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
842
843 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
844 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
845 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
846 err = vfs_fsync_range(file, file->f_path.dentry,
847 pos, pos + ret - 1, 1);
848 if (err < 0)
849 ret = err;
850 }
851 }
852 if (ret >= 0) {
853 spin_lock(&inode->i_lock);
854 __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
855 spin_unlock(&inode->i_lock);
856 }
857
858out:
859 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
860 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
861 ceph_cap_string(got));
862 ceph_put_cap_refs(ci, got);
863
864 if (ret == -EOLDSNAPC) {
865 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
866 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
867 goto retry_snap;
868 }
869
870 return ret;
871}
872
873/*
874 * llseek. be sure to verify file size on SEEK_END.
875 */
876static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
877{
878 struct inode *inode = file->f_mapping->host;
879 int ret;
880
881 mutex_lock(&inode->i_mutex);
882 __ceph_do_pending_vmtruncate(inode);
883 switch (origin) {
884 case SEEK_END:
885 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
886 if (ret < 0) {
887 offset = ret;
888 goto out;
889 }
890 offset += inode->i_size;
891 break;
892 case SEEK_CUR:
893 /*
894 * Here we special-case the lseek(fd, 0, SEEK_CUR)
895 * position-querying operation. Avoid rewriting the "same"
896 * f_pos value back to the file because a concurrent read(),
897 * write() or lseek() might have altered it.
898 */
899 if (offset == 0) {
900 offset = file->f_pos;
901 goto out;
902 }
903 offset += file->f_pos;
904 break;
905 }
906
907 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
908 offset = -EINVAL;
909 goto out;
910 }
911
912 /* Special lock needed here? */
913 if (offset != file->f_pos) {
914 file->f_pos = offset;
915 file->f_version = 0;
916 }
917
918out:
919 mutex_unlock(&inode->i_mutex);
920 return offset;
921}
922
923const struct file_operations ceph_file_fops = {
924 .open = ceph_open,
925 .release = ceph_release,
926 .llseek = ceph_llseek,
927 .read = do_sync_read,
928 .write = do_sync_write,
929 .aio_read = ceph_aio_read,
930 .aio_write = ceph_aio_write,
931 .mmap = ceph_mmap,
932 .fsync = ceph_fsync,
933 .splice_read = generic_file_splice_read,
934 .splice_write = generic_file_splice_write,
935 .unlocked_ioctl = ceph_ioctl,
936 .compat_ioctl = ceph_ioctl,
937};
938
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..26f883c275e8
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1774 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
10#include <linux/namei.h>
11#include <linux/writeback.h>
12#include <linux/vmalloc.h>
13#include <linux/pagevec.h>
14
15#include "super.h"
16#include "decode.h"
17
18/*
19 * Ceph inode operations
20 *
21 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
22 * setattr, etc.), xattr helpers, and helpers for assimilating
23 * metadata returned by the MDS into our cache.
24 *
25 * Also define helpers for doing asynchronous writeback, invalidation,
26 * and truncation for the benefit of those who can't afford to block
27 * (typically because they are in the message handler path).
28 */
29
30static const struct inode_operations ceph_symlink_iops;
31
32static void ceph_invalidate_work(struct work_struct *work);
33static void ceph_writeback_work(struct work_struct *work);
34static void ceph_vmtruncate_work(struct work_struct *work);
35
36/*
37 * find or create an inode, given the ceph ino number
38 */
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{
41 struct inode *inode;
42 ino_t t = ceph_vino_to_ino(vino);
43
44 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
45 if (inode == NULL)
46 return ERR_PTR(-ENOMEM);
47 if (inode->i_state & I_NEW) {
48 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
49 inode, ceph_vinop(inode), (u64)inode->i_ino);
50 unlock_new_inode(inode);
51 }
52
53 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
54 vino.snap, inode);
55 return inode;
56}
57
58/*
59 * get/construct snapdir inode for a given directory
60 */
61struct inode *ceph_get_snapdir(struct inode *parent)
62{
63 struct ceph_vino vino = {
64 .ino = ceph_ino(parent),
65 .snap = CEPH_SNAPDIR,
66 };
67 struct inode *inode = ceph_get_inode(parent->i_sb, vino);
68 struct ceph_inode_info *ci = ceph_inode(inode);
69
70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode));
73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid;
76 inode->i_op = &ceph_dir_iops;
77 inode->i_fop = &ceph_dir_fops;
78 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
79 ci->i_rbytes = 0;
80 return inode;
81}
82
83const struct inode_operations ceph_file_iops = {
84 .permission = ceph_permission,
85 .setattr = ceph_setattr,
86 .getattr = ceph_getattr,
87 .setxattr = ceph_setxattr,
88 .getxattr = ceph_getxattr,
89 .listxattr = ceph_listxattr,
90 .removexattr = ceph_removexattr,
91};
92
93
94/*
95 * We use a 'frag tree' to keep track of the MDS's directory fragments
96 * for a given inode (usually there is just a single fragment). We
97 * need to know when a child frag is delegated to a new MDS, or when
98 * it is flagged as replicated, so we can direct our requests
99 * accordingly.
100 */
101
102/*
103 * find/create a frag in the tree
104 */
105static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
106 u32 f)
107{
108 struct rb_node **p;
109 struct rb_node *parent = NULL;
110 struct ceph_inode_frag *frag;
111 int c;
112
113 p = &ci->i_fragtree.rb_node;
114 while (*p) {
115 parent = *p;
116 frag = rb_entry(parent, struct ceph_inode_frag, node);
117 c = ceph_frag_compare(f, frag->frag);
118 if (c < 0)
119 p = &(*p)->rb_left;
120 else if (c > 0)
121 p = &(*p)->rb_right;
122 else
123 return frag;
124 }
125
126 frag = kmalloc(sizeof(*frag), GFP_NOFS);
127 if (!frag) {
128 pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
129 "frag %x\n", &ci->vfs_inode,
130 ceph_vinop(&ci->vfs_inode), f);
131 return ERR_PTR(-ENOMEM);
132 }
133 frag->frag = f;
134 frag->split_by = 0;
135 frag->mds = -1;
136 frag->ndist = 0;
137
138 rb_link_node(&frag->node, parent, p);
139 rb_insert_color(&frag->node, &ci->i_fragtree);
140
141 dout("get_or_create_frag added %llx.%llx frag %x\n",
142 ceph_vinop(&ci->vfs_inode), f);
143 return frag;
144}
145
146/*
147 * find a specific frag @f
148 */
149struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
150{
151 struct rb_node *n = ci->i_fragtree.rb_node;
152
153 while (n) {
154 struct ceph_inode_frag *frag =
155 rb_entry(n, struct ceph_inode_frag, node);
156 int c = ceph_frag_compare(f, frag->frag);
157 if (c < 0)
158 n = n->rb_left;
159 else if (c > 0)
160 n = n->rb_right;
161 else
162 return frag;
163 }
164 return NULL;
165}
166
167/*
168 * Choose frag containing the given value @v. If @pfrag is
169 * specified, copy the frag delegation info to the caller if
170 * it is present.
171 */
172u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
173 struct ceph_inode_frag *pfrag,
174 int *found)
175{
176 u32 t = ceph_frag_make(0, 0);
177 struct ceph_inode_frag *frag;
178 unsigned nway, i;
179 u32 n;
180
181 if (found)
182 *found = 0;
183
184 mutex_lock(&ci->i_fragtree_mutex);
185 while (1) {
186 WARN_ON(!ceph_frag_contains_value(t, v));
187 frag = __ceph_find_frag(ci, t);
188 if (!frag)
189 break; /* t is a leaf */
190 if (frag->split_by == 0) {
191 if (pfrag)
192 memcpy(pfrag, frag, sizeof(*pfrag));
193 if (found)
194 *found = 1;
195 break;
196 }
197
198 /* choose child */
199 nway = 1 << frag->split_by;
200 dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
201 frag->split_by, nway);
202 for (i = 0; i < nway; i++) {
203 n = ceph_frag_make_child(t, frag->split_by, i);
204 if (ceph_frag_contains_value(n, v)) {
205 t = n;
206 break;
207 }
208 }
209 BUG_ON(i == nway);
210 }
211 dout("choose_frag(%x) = %x\n", v, t);
212
213 mutex_unlock(&ci->i_fragtree_mutex);
214 return t;
215}
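/*
 * [Editor's example] If the root frag is split 2 ways (split_by == 1)
 * and neither child appears in i_fragtree, a value v descends one
 * step: t becomes whichever ceph_frag_make_child(root, 1, i) contains
 * v, no entry is found for t, and t is returned as the leaf.
 */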
216
217/*
218 * Process dirfrag (delegation) info from the mds. Include leaf
219 * fragment in tree ONLY if ndist > 0. Otherwise, only
220 * branches/splits are included in i_fragtree.
221 */
222static int ceph_fill_dirfrag(struct inode *inode,
223 struct ceph_mds_reply_dirfrag *dirinfo)
224{
225 struct ceph_inode_info *ci = ceph_inode(inode);
226 struct ceph_inode_frag *frag;
227 u32 id = le32_to_cpu(dirinfo->frag);
228 int mds = le32_to_cpu(dirinfo->auth);
229 int ndist = le32_to_cpu(dirinfo->ndist);
230 int i;
231 int err = 0;
232
233 mutex_lock(&ci->i_fragtree_mutex);
234 if (ndist == 0) {
235 /* no delegation info needed. */
236 frag = __ceph_find_frag(ci, id);
237 if (!frag)
238 goto out;
239 if (frag->split_by == 0) {
240 /* tree leaf, remove */
241 dout("fill_dirfrag removed %llx.%llx frag %x"
242 " (no ref)\n", ceph_vinop(inode), id);
243 rb_erase(&frag->node, &ci->i_fragtree);
244 kfree(frag);
245 } else {
246 /* tree branch, keep and clear */
247 dout("fill_dirfrag cleared %llx.%llx frag %x"
248 " referral\n", ceph_vinop(inode), id);
249 frag->mds = -1;
250 frag->ndist = 0;
251 }
252 goto out;
253 }
254
255
256 /* find/add this frag to store mds delegation info */
257 frag = __get_or_create_frag(ci, id);
258 if (IS_ERR(frag)) {
259 /* this is not the end of the world; we can continue
260 with bad/inaccurate delegation info */
261 pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
262 ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
263 err = -ENOMEM;
264 goto out;
265 }
266
267 frag->mds = mds;
268 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
269 for (i = 0; i < frag->ndist; i++)
270 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
271 dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
272 ceph_vinop(inode), frag->frag, frag->ndist);
273
274out:
275 mutex_unlock(&ci->i_fragtree_mutex);
276 return err;
277}
278
279
280/*
281 * initialize a newly allocated inode.
282 */
283struct inode *ceph_alloc_inode(struct super_block *sb)
284{
285 struct ceph_inode_info *ci;
286 int i;
287
288 ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
289 if (!ci)
290 return NULL;
291
292 dout("alloc_inode %p\n", &ci->vfs_inode);
293
294 ci->i_version = 0;
295 ci->i_time_warp_seq = 0;
296 ci->i_ceph_flags = 0;
297 ci->i_release_count = 0;
298 ci->i_symlink = NULL;
299
300 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex);
302
303 ci->i_xattrs.blob = NULL;
304 ci->i_xattrs.prealloc_blob = NULL;
305 ci->i_xattrs.dirty = false;
306 ci->i_xattrs.index = RB_ROOT;
307 ci->i_xattrs.count = 0;
308 ci->i_xattrs.names_size = 0;
309 ci->i_xattrs.vals_size = 0;
310 ci->i_xattrs.version = 0;
311 ci->i_xattrs.index_version = 0;
312
313 ci->i_caps = RB_ROOT;
314 ci->i_auth_cap = NULL;
315 ci->i_dirty_caps = 0;
316 ci->i_flushing_caps = 0;
317 INIT_LIST_HEAD(&ci->i_dirty_item);
318 INIT_LIST_HEAD(&ci->i_flushing_item);
319 ci->i_cap_flush_seq = 0;
320 ci->i_cap_flush_last_tid = 0;
321 memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
322 init_waitqueue_head(&ci->i_cap_wq);
323 ci->i_hold_caps_min = 0;
324 ci->i_hold_caps_max = 0;
325 INIT_LIST_HEAD(&ci->i_cap_delay_list);
326 ci->i_cap_exporting_mds = 0;
327 ci->i_cap_exporting_mseq = 0;
328 ci->i_cap_exporting_issued = 0;
329 INIT_LIST_HEAD(&ci->i_cap_snaps);
330 ci->i_head_snapc = NULL;
331 ci->i_snap_caps = 0;
332
333 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
334 ci->i_nr_by_mode[i] = 0;
335
336 ci->i_truncate_seq = 0;
337 ci->i_truncate_size = 0;
338 ci->i_truncate_pending = 0;
339
340 ci->i_max_size = 0;
341 ci->i_reported_size = 0;
342 ci->i_wanted_max_size = 0;
343 ci->i_requested_max_size = 0;
344
345 ci->i_pin_ref = 0;
346 ci->i_rd_ref = 0;
347 ci->i_rdcache_ref = 0;
348 ci->i_wr_ref = 0;
349 ci->i_wrbuffer_ref = 0;
350 ci->i_wrbuffer_ref_head = 0;
351 ci->i_shared_gen = 0;
352 ci->i_rdcache_gen = 0;
353 ci->i_rdcache_revoking = 0;
354
355 INIT_LIST_HEAD(&ci->i_unsafe_writes);
356 INIT_LIST_HEAD(&ci->i_unsafe_dirops);
357 spin_lock_init(&ci->i_unsafe_lock);
358
359 ci->i_snap_realm = NULL;
360 INIT_LIST_HEAD(&ci->i_snap_realm_item);
361 INIT_LIST_HEAD(&ci->i_snap_flush_item);
362
363 INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
364 INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
365
366 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
367
368 return &ci->vfs_inode;
369}
370
371void ceph_destroy_inode(struct inode *inode)
372{
373 struct ceph_inode_info *ci = ceph_inode(inode);
374 struct ceph_inode_frag *frag;
375 struct rb_node *n;
376
377 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
378
379 ceph_queue_caps_release(inode);
380
381 /*
382 * we may still have a snap_realm reference if there are stray
383 * caps in i_cap_exporting_issued or i_snap_caps.
384 */
385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389
390 dout(" dropping residual ref to snap realm %p\n", realm);
391 spin_lock(&realm->inodes_with_caps_lock);
392 list_del_init(&ci->i_snap_realm_item);
393 spin_unlock(&realm->inodes_with_caps_lock);
394 ceph_put_snap_realm(mdsc, realm);
395 }
396
397 kfree(ci->i_symlink);
398 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
399 frag = rb_entry(n, struct ceph_inode_frag, node);
400 rb_erase(n, &ci->i_fragtree);
401 kfree(frag);
402 }
403
404 __ceph_destroy_xattrs(ci);
405 if (ci->i_xattrs.blob)
406 ceph_buffer_put(ci->i_xattrs.blob);
407 if (ci->i_xattrs.prealloc_blob)
408 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
409
410 kmem_cache_free(ceph_inode_cachep, ci);
411}
412
413
414/*
415 * Helpers to fill in size, ctime, mtime, and atime. We have to be
416 * careful because either the client or MDS may have more up to date
417 * info, depending on which capabilities are held, and whether
418 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
419 * and size are monotonically increasing, except when utimes() or
420 * truncate() increments the corresponding _seq values.)
421 */
422int ceph_fill_file_size(struct inode *inode, int issued,
423 u32 truncate_seq, u64 truncate_size, u64 size)
424{
425 struct ceph_inode_info *ci = ceph_inode(inode);
426 int queue_trunc = 0;
427
428 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
429 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
430 dout("size %lld -> %llu\n", inode->i_size, size);
431 inode->i_size = size;
432 inode->i_blocks = (size + (1<<9) - 1) >> 9;
433 ci->i_reported_size = size;
434 if (truncate_seq != ci->i_truncate_seq) {
435 dout("truncate_seq %u -> %u\n",
436 ci->i_truncate_seq, truncate_seq);
437 ci->i_truncate_seq = truncate_seq;
438 /*
439 * If we hold relevant caps, or in the case where we're
440 * not the only client referencing this file and we
441 * don't hold those caps, then we need to check whether
442 * the file is either opened or mmaped
443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) ||
447 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++;
450 queue_trunc = 1;
451 }
452 }
453 }
454 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
455 ci->i_truncate_size != truncate_size) {
456 dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
457 truncate_size);
458 ci->i_truncate_size = truncate_size;
459 }
460 return queue_trunc;
461}
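/*
 * [Editor's example] If we hold truncate_seq 3 with i_size 8192 and
 * the MDS reports truncate_seq 4 with size 4096, the shrink is
 * accepted (someone truncated the file) and, if it is open or mapped,
 * a vmtruncate is queued. At equal seqs only a larger size is applied.
 */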
462
463void ceph_fill_file_time(struct inode *inode, int issued,
464 u64 time_warp_seq, struct timespec *ctime,
465 struct timespec *mtime, struct timespec *atime)
466{
467 struct ceph_inode_info *ci = ceph_inode(inode);
468 int warn = 0;
469
470 if (issued & (CEPH_CAP_FILE_EXCL|
471 CEPH_CAP_FILE_WR|
472 CEPH_CAP_FILE_BUFFER)) {
473 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
474 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
475 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
476 ctime->tv_sec, ctime->tv_nsec);
477 inode->i_ctime = *ctime;
478 }
479 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
480 /* the MDS did a utimes() */
481 dout("mtime %ld.%09ld -> %ld.%09ld "
482 "tw %d -> %d\n",
483 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
484 mtime->tv_sec, mtime->tv_nsec,
485 ci->i_time_warp_seq, (int)time_warp_seq);
486
487 inode->i_mtime = *mtime;
488 inode->i_atime = *atime;
489 ci->i_time_warp_seq = time_warp_seq;
490 } else if (time_warp_seq == ci->i_time_warp_seq) {
491 /* nobody did utimes(); take the max */
492 if (timespec_compare(mtime, &inode->i_mtime) > 0) {
493 dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
494 inode->i_mtime.tv_sec,
495 inode->i_mtime.tv_nsec,
496 mtime->tv_sec, mtime->tv_nsec);
497 inode->i_mtime = *mtime;
498 }
499 if (timespec_compare(atime, &inode->i_atime) > 0) {
500 dout("atime %ld.%09ld -> %ld.%09ld inc\n",
501 inode->i_atime.tv_sec,
502 inode->i_atime.tv_nsec,
503 atime->tv_sec, atime->tv_nsec);
504 inode->i_atime = *atime;
505 }
506 } else if (issued & CEPH_CAP_FILE_EXCL) {
507 /* we did a utimes(); ignore mds values */
508 } else {
509 warn = 1;
510 }
511 } else {
512 /* we have no write caps; whatever the MDS says is true */
513 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
514 inode->i_ctime = *ctime;
515 inode->i_mtime = *mtime;
516 inode->i_atime = *atime;
517 ci->i_time_warp_seq = time_warp_seq;
518 } else {
519 warn = 1;
520 }
521 }
522 if (warn) /* time_warp_seq shouldn't go backwards */
523 dout("%p mds time_warp_seq %llu < %u\n",
524 inode, time_warp_seq, ci->i_time_warp_seq);
525}
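/*
 * [Editor's example] Holding FILE_EXCL with local time_warp_seq 7: an
 * MDS report at seq 7 only moves mtime/atime forward (take-the-max
 * rule); seq 8 means the MDS performed a utimes() and its values are
 * applied verbatim; seq 6 is ignored, since our own utimes() is newer.
 * With no write caps at all, a lower seq instead logs the time_warp
 * warning.
 */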
526
527/*
528 * Populate an inode based on info from mds. May be called on new or
529 * existing inodes.
530 */
531static int fill_inode(struct inode *inode,
532 struct ceph_mds_reply_info_in *iinfo,
533 struct ceph_mds_reply_dirfrag *dirinfo,
534 struct ceph_mds_session *session,
535 unsigned long ttl_from, int cap_fmode,
536 struct ceph_cap_reservation *caps_reservation)
537{
538 struct ceph_mds_reply_inode *info = iinfo->in;
539 struct ceph_inode_info *ci = ceph_inode(inode);
540 int i;
541 int issued, implemented;
542 struct timespec mtime, atime, ctime;
543 u32 nsplits;
544 struct ceph_buffer *xattr_blob = NULL;
545 int err = 0;
546 int queue_trunc = 0;
547
548 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
549 inode, ceph_vinop(inode), le64_to_cpu(info->version),
550 ci->i_version);
551
552 /*
553 * prealloc xattr data, if it looks like we'll need it. only
554 * if len > 4 (meaning there are actually xattrs; the first 4
555 * bytes are the xattr count).
556 */
557 if (iinfo->xattr_len > 4) {
558 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
559 if (!xattr_blob)
560 pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
561 iinfo->xattr_len);
562 }
563
564 spin_lock(&inode->i_lock);
565
566 /*
567 * provided version will be odd if inode value is projected,
568 * even if stable. skip the update if we have a newer info
569 * (e.g., due to inode info racing form multiple MDSs), or if
570 * we are getting projected (unstable) inode info.
571 */
572 if (le64_to_cpu(info->version) > 0 &&
573 (ci->i_version & ~1) > le64_to_cpu(info->version))
574 goto no_change;
575
576 issued = __ceph_caps_issued(ci, &implemented);
577 issued |= implemented | __ceph_caps_dirty(ci);
578
579 /* update inode */
580 ci->i_version = le64_to_cpu(info->version);
581 inode->i_version++;
582 inode->i_rdev = le32_to_cpu(info->rdev);
583
584 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
585 inode->i_mode = le32_to_cpu(info->mode);
586 inode->i_uid = le32_to_cpu(info->uid);
587 inode->i_gid = le32_to_cpu(info->gid);
588 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
589 inode->i_uid, inode->i_gid);
590 }
591
592 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
593 inode->i_nlink = le32_to_cpu(info->nlink);
594
595 /* be careful with mtime, atime, size */
596 ceph_decode_timespec(&atime, &info->atime);
597 ceph_decode_timespec(&mtime, &info->mtime);
598 ceph_decode_timespec(&ctime, &info->ctime);
599 queue_trunc = ceph_fill_file_size(inode, issued,
600 le32_to_cpu(info->truncate_seq),
601 le64_to_cpu(info->truncate_size),
602 le64_to_cpu(info->size));
603 ceph_fill_file_time(inode, issued,
604 le32_to_cpu(info->time_warp_seq),
605 &ctime, &mtime, &atime);
606
607 ci->i_max_size = le64_to_cpu(info->max_size);
608 ci->i_layout = info->layout;
609 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
610
611 /* xattrs */
612 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
613 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
614 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
615 if (ci->i_xattrs.blob)
616 ceph_buffer_put(ci->i_xattrs.blob);
617 ci->i_xattrs.blob = xattr_blob;
618 if (xattr_blob)
619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 }
623
624 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info;
627
628 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO:
630 case S_IFBLK:
631 case S_IFCHR:
632 case S_IFSOCK:
633 init_special_inode(inode, inode->i_mode, inode->i_rdev);
634 inode->i_op = &ceph_file_iops;
635 break;
636 case S_IFREG:
637 inode->i_op = &ceph_file_iops;
638 inode->i_fop = &ceph_file_fops;
639 break;
640 case S_IFLNK:
641 inode->i_op = &ceph_symlink_iops;
642 if (!ci->i_symlink) {
643 int symlen = iinfo->symlink_len;
644 char *sym;
645
646 BUG_ON(symlen != inode->i_size);
647 spin_unlock(&inode->i_lock);
648
649 err = -ENOMEM;
650 sym = kmalloc(symlen+1, GFP_NOFS);
651 if (!sym)
652 goto out;
653 memcpy(sym, iinfo->symlink, symlen);
654 sym[symlen] = 0;
655
656 spin_lock(&inode->i_lock);
657 if (!ci->i_symlink)
658 ci->i_symlink = sym;
659 else
660 kfree(sym); /* lost a race */
661 }
662 break;
663 case S_IFDIR:
664 inode->i_op = &ceph_dir_iops;
665 inode->i_fop = &ceph_dir_fops;
666
667 ci->i_files = le64_to_cpu(info->files);
668 ci->i_subdirs = le64_to_cpu(info->subdirs);
669 ci->i_rbytes = le64_to_cpu(info->rbytes);
670 ci->i_rfiles = le64_to_cpu(info->rfiles);
671 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
672 ceph_decode_timespec(&ci->i_rctime, &info->rctime);
673
674 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
678 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2;
681 }
682
683 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes;
686 break;
687 default:
688 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
689 ceph_vinop(inode), inode->i_mode);
690 }
691
692no_change:
693 spin_unlock(&inode->i_lock);
694
695 /* queue truncate if we saw i_size decrease */
696 if (queue_trunc)
697 ceph_queue_vmtruncate(inode);
698
699 /* populate frag tree */
700 /* FIXME: move me up, if/when version reflects fragtree changes */
701 nsplits = le32_to_cpu(info->fragtree.nsplits);
702 mutex_lock(&ci->i_fragtree_mutex);
703 for (i = 0; i < nsplits; i++) {
704 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
705 struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
706
707 if (IS_ERR(frag))
708 continue;
709 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
710 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
711 }
712 mutex_unlock(&ci->i_fragtree_mutex);
713
714 /* were we issued a capability? */
715 if (info->cap.caps) {
716 if (ceph_snap(inode) == CEPH_NOSNAP) {
717 ceph_add_cap(inode, session,
718 le64_to_cpu(info->cap.cap_id),
719 cap_fmode,
720 le32_to_cpu(info->cap.caps),
721 le32_to_cpu(info->cap.wanted),
722 le32_to_cpu(info->cap.seq),
723 le32_to_cpu(info->cap.mseq),
724 le64_to_cpu(info->cap.realm),
725 info->cap.flags,
726 caps_reservation);
727 } else {
728 spin_lock(&inode->i_lock);
729 dout(" %p got snap_caps %s\n", inode,
730 ceph_cap_string(le32_to_cpu(info->cap.caps)));
731 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
732 if (cap_fmode >= 0)
733 __ceph_get_fmode(ci, cap_fmode);
734 spin_unlock(&inode->i_lock);
735 }
736 }
737
738 /* update delegation info? */
739 if (dirinfo)
740 ceph_fill_dirfrag(inode, dirinfo);
741
742 err = 0;
743
744out:
745 if (xattr_blob)
746 ceph_buffer_put(xattr_blob);
747 return err;
748}
749
750/*
751 * caller should hold session s_mutex.
752 */
753static void update_dentry_lease(struct dentry *dentry,
754 struct ceph_mds_reply_lease *lease,
755 struct ceph_mds_session *session,
756 unsigned long from_time)
757{
758 struct ceph_dentry_info *di = ceph_dentry(dentry);
759 long unsigned duration = le32_to_cpu(lease->duration_ms);
760 long unsigned ttl = from_time + (duration * HZ) / 1000;
761 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
762 struct inode *dir;
763
764 /* only track leases on regular dentries */
765 if (dentry->d_op != &ceph_dentry_ops)
766 return;
767
768 spin_lock(&dentry->d_lock);
769 dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
770 dentry, le16_to_cpu(lease->mask), duration, ttl);
771
772 /* make lease_rdcache_gen match directory */
773 dir = dentry->d_parent->d_inode;
774 di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
775
776 if (lease->mask == 0)
777 goto out_unlock;
778
779 if (di->lease_gen == session->s_cap_gen &&
780 time_before(ttl, dentry->d_time))
781 goto out_unlock; /* we already have a newer lease. */
782
783 if (di->lease_session && di->lease_session != session)
784 goto out_unlock;
785
786 ceph_dentry_lru_touch(dentry);
787
788 if (!di->lease_session)
789 di->lease_session = ceph_get_mds_session(session);
790 di->lease_gen = session->s_cap_gen;
791 di->lease_seq = le32_to_cpu(lease->seq);
792 di->lease_renew_after = half_ttl;
793 di->lease_renew_from = 0;
794 dentry->d_time = ttl;
795out_unlock:
796 spin_unlock(&dentry->d_lock);
797 return;
798}
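/*
 * [Editor's example] With HZ == 250 and lease->duration_ms == 30000,
 * ttl == from_time + 30000*250/1000 == from_time + 7500 jiffies, and
 * lease_renew_after lands at the halfway mark, from_time + 3750.
 */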
799
800/*
801 * splice a dentry to an inode.
802 * caller must hold directory i_mutex for this to be safe.
803 *
804 * we will only rehash the resulting dentry if @prehash is
805 * true; @prehash will be set to false (for the benefit of
806 * the caller) if we fail.
807 */
808static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
809 bool *prehash)
810{
811 struct dentry *realdn;
812
813 /* dn must be unhashed */
814 if (!d_unhashed(dn))
815 d_drop(dn);
816 realdn = d_materialise_unique(dn, in);
817 if (IS_ERR(realdn)) {
818 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
819 dn, in, ceph_vinop(in));
820 if (prehash)
821 *prehash = false; /* don't rehash on error */
822 dn = realdn; /* note realdn contains the error */
823 goto out;
824 } else if (realdn) {
825 dout("dn %p (%d) spliced with %p (%d) "
826 "inode %p ino %llx.%llx\n",
827 dn, atomic_read(&dn->d_count),
828 realdn, atomic_read(&realdn->d_count),
829 realdn->d_inode, ceph_vinop(realdn->d_inode));
830 dput(dn);
831 dn = realdn;
832 } else {
833 BUG_ON(!ceph_dentry(dn));
834
835 dout("dn %p attached to %p ino %llx.%llx\n",
836 dn, dn->d_inode, ceph_vinop(dn->d_inode));
837 }
838 if ((!prehash || *prehash) && d_unhashed(dn))
839 d_rehash(dn);
840out:
841 return dn;
842}
843
844/*
845 * Set dentry's directory position based on the current dir's max, and
846 * order it in d_subdirs, so that dcache_readdir behaves.
847 */
848static void ceph_set_dentry_offset(struct dentry *dn)
849{
850 struct dentry *dir = dn->d_parent;
851 struct inode *inode = dn->d_parent->d_inode;
852 struct ceph_dentry_info *di;
853
854 BUG_ON(!inode);
855
856 di = ceph_dentry(dn);
857
858 spin_lock(&inode->i_lock);
859 di->offset = ceph_inode(inode)->i_max_offset++;
860 spin_unlock(&inode->i_lock);
861
862 spin_lock(&dcache_lock);
863 spin_lock(&dn->d_lock);
864 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
865 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
866 dn->d_u.d_child.prev, dn->d_u.d_child.next);
867 spin_unlock(&dn->d_lock);
868 spin_unlock(&dcache_lock);
869}
870
871/*
872 * Incorporate results into the local cache. This is either just
873 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
874 * after a lookup).
875 *
876 * A reply may contain
877 * a directory inode along with a dentry.
878 * and/or a target inode
879 *
880 * Called with snap_rwsem (read).
881 */
882int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
883 struct ceph_mds_session *session)
884{
885 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
886 struct inode *in = NULL;
887 struct ceph_mds_reply_inode *ininfo;
888 struct ceph_vino vino;
889 struct ceph_client *client = ceph_sb_to_client(sb);
890 int i = 0;
891 int err = 0;
892
893 dout("fill_trace %p is_dentry %d is_target %d\n", req,
894 rinfo->head->is_dentry, rinfo->head->is_target);
895
896#if 0
897 /*
898 * Debugging hook:
899 *
900 * If we resend completed ops to a recovering mds, we get no
901 * trace. Since that is very rare, pretend this is the case
902 * to ensure the 'no trace' handlers in the callers behave.
903 *
904 * Fill in inodes unconditionally to avoid breaking cap
905 * invariants.
906 */
907 if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
908 pr_info("fill_trace faking empty trace on %lld %s\n",
909 req->r_tid, ceph_mds_op_name(rinfo->head->op));
910 if (rinfo->head->is_dentry) {
911 rinfo->head->is_dentry = 0;
912 err = fill_inode(req->r_locked_dir,
913 &rinfo->diri, rinfo->dirfrag,
914 session, req->r_request_started, -1);
915 }
916 if (rinfo->head->is_target) {
917 rinfo->head->is_target = 0;
918 ininfo = rinfo->targeti.in;
919 vino.ino = le64_to_cpu(ininfo->ino);
920 vino.snap = le64_to_cpu(ininfo->snapid);
921 in = ceph_get_inode(sb, vino);
922 err = fill_inode(in, &rinfo->targeti, NULL,
923 session, req->r_request_started,
924 req->r_fmode);
925 iput(in);
926 }
927 }
928#endif
929
930 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
931 dout("fill_trace reply is empty!\n");
932 if (rinfo->head->result == 0 && req->r_locked_dir) {
933 struct ceph_inode_info *ci =
934 ceph_inode(req->r_locked_dir);
935 dout(" clearing %p complete (empty trace)\n",
936 req->r_locked_dir);
937 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
938 ci->i_release_count++;
939 }
940 return 0;
941 }
942
943 if (rinfo->head->is_dentry) {
944 struct inode *dir = req->r_locked_dir;
945
946 err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
947 session, req->r_request_started, -1,
948 &req->r_caps_reservation);
949 if (err < 0)
950 return err;
951 }
952
953 /*
954 * ignore null lease/binding on snapdir ENOENT, or else we
955 * will have trouble splicing in the virtual snapdir later
956 */
957 if (rinfo->head->is_dentry && !req->r_aborted &&
958 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
959 client->mount_args->snapdir_name,
960 req->r_dentry->d_name.len))) {
961 /*
962 * lookup link rename : null -> possibly existing inode
963 * mknod symlink mkdir : null -> new inode
964 * unlink : linked -> null
965 */
966 struct inode *dir = req->r_locked_dir;
967 struct dentry *dn = req->r_dentry;
968 bool have_dir_cap, have_lease;
969
970 BUG_ON(!dn);
971 BUG_ON(!dir);
972 BUG_ON(dn->d_parent->d_inode != dir);
973 BUG_ON(ceph_ino(dir) !=
974 le64_to_cpu(rinfo->diri.in->ino));
975 BUG_ON(ceph_snap(dir) !=
976 le64_to_cpu(rinfo->diri.in->snapid));
977
978 /* do we have a lease on the whole dir? */
979 have_dir_cap =
980 (le32_to_cpu(rinfo->diri.in->cap.caps) &
981 CEPH_CAP_FILE_SHARED);
982
983 /* do we have a dn lease? */
984 have_lease = have_dir_cap ||
985 (le16_to_cpu(rinfo->dlease->mask) &
986 CEPH_LOCK_DN);
987
988 if (!have_lease)
989 dout("fill_trace no dentry lease or dir cap\n");
990
991 /* rename? */
992 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
993 dout(" src %p '%.*s' dst %p '%.*s'\n",
994 req->r_old_dentry,
995 req->r_old_dentry->d_name.len,
996 req->r_old_dentry->d_name.name,
997 dn, dn->d_name.len, dn->d_name.name);
998 dout("fill_trace doing d_move %p -> %p\n",
999 req->r_old_dentry, dn);
1000 d_move(req->r_old_dentry, dn);
1001 dout(" src %p '%.*s' dst %p '%.*s'\n",
1002 req->r_old_dentry,
1003 req->r_old_dentry->d_name.len,
1004 req->r_old_dentry->d_name.name,
1005 dn, dn->d_name.len, dn->d_name.name);
1006 /* ensure target dentry is invalidated, despite
1007 rehashing bug in vfs_rename_dir */
1008 dn->d_time = jiffies;
1009 ceph_dentry(dn)->lease_shared_gen = 0;
1010 /* take overwritten dentry's readdir offset */
1011 ceph_dentry(req->r_old_dentry)->offset =
1012 ceph_dentry(dn)->offset;
1013 dn = req->r_old_dentry; /* use old_dentry */
1014 in = dn->d_inode;
1015 }
1016
1017 /* null dentry? */
1018 if (!rinfo->head->is_target) {
1019 dout("fill_trace null dentry\n");
1020 if (dn->d_inode) {
1021 dout("d_delete %p\n", dn);
1022 d_delete(dn);
1023 } else {
1024 dout("d_instantiate %p NULL\n", dn);
1025 d_instantiate(dn, NULL);
1026 if (have_lease && d_unhashed(dn))
1027 d_rehash(dn);
1028 update_dentry_lease(dn, rinfo->dlease,
1029 session,
1030 req->r_request_started);
1031 }
1032 goto done;
1033 }
1034
1035 /* attach proper inode */
1036 ininfo = rinfo->targeti.in;
1037 vino.ino = le64_to_cpu(ininfo->ino);
1038 vino.snap = le64_to_cpu(ininfo->snapid);
1039 if (!dn->d_inode) {
1040 in = ceph_get_inode(sb, vino);
1041 if (IS_ERR(in)) {
1042 pr_err("fill_trace bad get_inode "
1043 "%llx.%llx\n", vino.ino, vino.snap);
1044 err = PTR_ERR(in);
1045 d_delete(dn);
1046 goto done;
1047 }
1048 dn = splice_dentry(dn, in, &have_lease);
1049 if (IS_ERR(dn)) {
1050 err = PTR_ERR(dn);
1051 goto done;
1052 }
1053 req->r_dentry = dn; /* may have spliced */
1054 ceph_set_dentry_offset(dn);
1055 igrab(in);
1056 } else if (ceph_ino(in) == vino.ino &&
1057 ceph_snap(in) == vino.snap) {
1058 igrab(in);
1059 } else {
1060 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1061 dn, in, ceph_ino(in), ceph_snap(in),
1062 vino.ino, vino.snap);
1063 have_lease = false;
1064 in = NULL;
1065 }
1066
1067 if (have_lease)
1068 update_dentry_lease(dn, rinfo->dlease, session,
1069 req->r_request_started);
1070 dout(" final dn %p\n", dn);
1071 i++;
1072 } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1073 req->r_op == CEPH_MDS_OP_MKSNAP) {
1074 struct dentry *dn = req->r_dentry;
1075
1076 /* fill out a snapdir LOOKUPSNAP dentry */
1077 BUG_ON(!dn);
1078 BUG_ON(!req->r_locked_dir);
1079 BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
1080 ininfo = rinfo->targeti.in;
1081 vino.ino = le64_to_cpu(ininfo->ino);
1082 vino.snap = le64_to_cpu(ininfo->snapid);
1083 in = ceph_get_inode(sb, vino);
1084 if (IS_ERR(in)) {
1085 pr_err("fill_inode get_inode badness %llx.%llx\n",
1086 vino.ino, vino.snap);
1087 err = PTR_ERR(in);
1088 d_delete(dn);
1089 goto done;
1090 }
1091 dout(" linking snapped dir %p to dn %p\n", in, dn);
1092 dn = splice_dentry(dn, in, NULL);
1093 if (IS_ERR(dn)) {
1094 err = PTR_ERR(dn);
1095 goto done;
1096 }
1097 ceph_set_dentry_offset(dn);
1098 req->r_dentry = dn; /* may have spliced */
1099 igrab(in);
1100 rinfo->head->is_dentry = 1; /* fool notrace handlers */
1101 }
1102
1103 if (rinfo->head->is_target) {
1104 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1105 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1106
1107 if (in == NULL || ceph_ino(in) != vino.ino ||
1108 ceph_snap(in) != vino.snap) {
1109 in = ceph_get_inode(sb, vino);
1110 if (IS_ERR(in)) {
1111 err = PTR_ERR(in);
1112 goto done;
1113 }
1114 }
1115 req->r_target_inode = in;
1116
1117 err = fill_inode(in,
1118 &rinfo->targeti, NULL,
1119 session, req->r_request_started,
1120 (le32_to_cpu(rinfo->head->result) == 0) ?
1121 req->r_fmode : -1,
1122 &req->r_caps_reservation);
1123 if (err < 0) {
1124 pr_err("fill_inode badness %p %llx.%llx\n",
1125 in, ceph_vinop(in));
1126 goto done;
1127 }
1128 }
1129
1130done:
1131 dout("fill_trace done err=%d\n", err);
1132 return err;
1133}
1134
1135/*
1136 * Prepopulate our cache with readdir results, leases, etc.
1137 */
1138int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1139 struct ceph_mds_session *session)
1140{
1141 struct dentry *parent = req->r_dentry;
1142 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1143 struct qstr dname;
1144 struct dentry *dn;
1145 struct inode *in;
1146 int err = 0, i;
1147 struct inode *snapdir = NULL;
1148 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1149 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1150 struct ceph_dentry_info *di;
1151
1152 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1153 snapdir = ceph_get_snapdir(parent->d_inode);
1154 parent = d_find_alias(snapdir);
1155 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1156 rinfo->dir_nr, parent);
1157 } else {
1158 dout("readdir_prepopulate %d items under dn %p\n",
1159 rinfo->dir_nr, parent);
1160 if (rinfo->dir_dir)
1161 ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1162 }
1163
1164 for (i = 0; i < rinfo->dir_nr; i++) {
1165 struct ceph_vino vino;
1166
1167 dname.name = rinfo->dir_dname[i];
1168 dname.len = rinfo->dir_dname_len[i];
1169 dname.hash = full_name_hash(dname.name, dname.len);
1170
1171 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1172 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1173
1174retry_lookup:
1175 dn = d_lookup(parent, &dname);
1176 dout("d_lookup on parent=%p name=%.*s got %p\n",
1177 parent, dname.len, dname.name, dn);
1178
1179 if (!dn) {
1180 dn = d_alloc(parent, &dname);
1181 dout("d_alloc %p '%.*s' = %p\n", parent,
1182 dname.len, dname.name, dn);
1183 if (dn == NULL) {
1184 dout("d_alloc badness\n");
1185 err = -ENOMEM;
1186 goto out;
1187 }
1188 err = ceph_init_dentry(dn);
1189 if (err < 0)
1190 goto out;
1191 } else if (dn->d_inode &&
1192 (ceph_ino(dn->d_inode) != vino.ino ||
1193 ceph_snap(dn->d_inode) != vino.snap)) {
1194 dout(" dn %p points to wrong inode %p\n",
1195 dn, dn->d_inode);
1196 d_delete(dn);
1197 dput(dn);
1198 goto retry_lookup;
1199 } else {
1200 /* reorder parent's d_subdirs */
1201 spin_lock(&dcache_lock);
1202 spin_lock(&dn->d_lock);
1203 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1204 spin_unlock(&dn->d_lock);
1205 spin_unlock(&dcache_lock);
1206 }
1207
1208 di = dn->d_fsdata;
1209 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1210
1211 /* inode */
1212 if (dn->d_inode) {
1213 in = dn->d_inode;
1214 } else {
1215 in = ceph_get_inode(parent->d_sb, vino);
1216 if (IS_ERR(in)) {
1217 dout("new_inode badness\n");
1218 d_delete(dn);
1219 dput(dn);
1220 err = PTR_ERR(in);
1221 goto out;
1222 }
1223 dn = splice_dentry(dn, in, NULL);
 if (IS_ERR(dn)) {
 err = PTR_ERR(dn);
 goto out;
 }
1224 }
1225
1226 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1227 req->r_request_started, -1,
1228 &req->r_caps_reservation) < 0) {
1229 pr_err("fill_inode badness on %p\n", in);
1230 dput(dn);
1231 continue;
1232 }
1233 update_dentry_lease(dn, rinfo->dir_dlease[i],
1234 req->r_session, req->r_request_started);
1235 dput(dn);
1236 }
1237 req->r_did_prepopulate = true;
1238
1239out:
1240 if (snapdir) {
1241 iput(snapdir);
1242 dput(parent);
1243 }
1244 dout("readdir_prepopulate done\n");
1245 return err;
1246}
1247
1248int ceph_inode_set_size(struct inode *inode, loff_t size)
1249{
1250 struct ceph_inode_info *ci = ceph_inode(inode);
1251 int ret = 0;
1252
1253 spin_lock(&inode->i_lock);
1254 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1255 inode->i_size = size;
1256 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1257
1258 /* tell the MDS if we are approaching max_size */
1259 if ((size << 1) >= ci->i_max_size &&
1260 (ci->i_reported_size << 1) < ci->i_max_size)
1261 ret = 1;
1262
1263 spin_unlock(&inode->i_lock);
1264 return ret;
1265}
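/*
 * Worked example of the check above (a reading of the code, nothing
 * new): with i_max_size of 4MB, (size << 1) >= i_max_size first holds
 * at size 2MB, so we return 1 -- asking the caller to report to the
 * MDS -- exactly when the size crosses half of max_size while the
 * last reported size was still below that halfway point.
 */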
1266
1267/*
1268 * Write back inode data in a worker thread. (This can't be done
1269 * in the message handler context.)
1270 */
1271void ceph_queue_writeback(struct inode *inode)
1272{
1273 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1274 &ceph_inode(inode)->i_wb_work)) {
1275 dout("ceph_queue_writeback %p\n", inode);
1276 igrab(inode);
1277 } else {
1278 dout("ceph_queue_writeback %p failed\n", inode);
1279 }
1280}
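/*
 * Note the reference pattern shared by the queue_* helpers in this
 * file: igrab() is taken only if queue_work() actually queued the
 * item (an already-queued instance holds its own reference), and the
 * worker drops it with iput() when it runs.
 */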
1281
1282static void ceph_writeback_work(struct work_struct *work)
1283{
1284 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1285 i_wb_work);
1286 struct inode *inode = &ci->vfs_inode;
1287
1288 dout("writeback %p\n", inode);
1289 filemap_fdatawrite(&inode->i_data);
1290 iput(inode);
1291}
1292
1293/*
1294 * queue an async invalidation
1295 */
1296void ceph_queue_invalidate(struct inode *inode)
1297{
1298 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1299 &ceph_inode(inode)->i_pg_inv_work)) {
1300 dout("ceph_queue_invalidate %p\n", inode);
1301 igrab(inode);
1302 } else {
1303 dout("ceph_queue_invalidate %p failed\n", inode);
1304 }
1305}
1306
1307/*
1308 * invalidate any pages that are not dirty or under writeback. this
1309 * includes pages that are clean and mapped.
1310 */
1311static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1312{
1313 struct pagevec pvec;
1314 pgoff_t next = 0;
1315 int i;
1316
1317 pagevec_init(&pvec, 0);
1318 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1319 for (i = 0; i < pagevec_count(&pvec); i++) {
1320 struct page *page = pvec.pages[i];
1321 pgoff_t index;
1322 int skip_page =
1323 (PageDirty(page) || PageWriteback(page));
1324
1325 if (!skip_page)
1326 skip_page = !trylock_page(page);
1327
1328 /*
1329 * We really shouldn't be looking at the ->index of an
1330 * unlocked page. But we're not allowed to lock these
1331 * pages. So we rely upon nobody altering the ->index
1332 * of this (pinned-by-us) page.
1333 */
1334 index = page->index;
1335 if (index > next)
1336 next = index;
1337 next++;
1338
1339 if (skip_page)
1340 continue;
1341
1342 generic_error_remove_page(mapping, page);
1343 unlock_page(page);
1344 }
1345 pagevec_release(&pvec);
1346 cond_resched();
1347 }
1348}
1349
1350/*
1351 * Invalidate inode pages in a worker thread. (This can't be done
1352 * in the message handler context.)
1353 */
1354static void ceph_invalidate_work(struct work_struct *work)
1355{
1356 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1357 i_pg_inv_work);
1358 struct inode *inode = &ci->vfs_inode;
1359 u32 orig_gen;
1360 int check = 0;
1361
1362 spin_lock(&inode->i_lock);
1363 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1364 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1365 if (ci->i_rdcache_gen == 0 ||
1366 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1367 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1368 /* nevermind! */
1369 ci->i_rdcache_revoking = 0;
1370 spin_unlock(&inode->i_lock);
1371 goto out;
1372 }
1373 orig_gen = ci->i_rdcache_gen;
1374 spin_unlock(&inode->i_lock);
1375
1376 ceph_invalidate_nondirty_pages(inode->i_mapping);
1377
1378 spin_lock(&inode->i_lock);
1379 if (orig_gen == ci->i_rdcache_gen) {
1380 dout("invalidate_pages %p gen %d successful\n", inode,
1381 ci->i_rdcache_gen);
1382 ci->i_rdcache_gen = 0;
1383 ci->i_rdcache_revoking = 0;
1384 check = 1;
1385 } else {
1386 dout("invalidate_pages %p gen %d raced, gen now %d\n",
1387 inode, orig_gen, ci->i_rdcache_gen);
1388 }
1389 spin_unlock(&inode->i_lock);
1390
1391 if (check)
1392 ceph_check_caps(ci, 0, NULL);
1393out:
1394 iput(inode);
1395}
1396
1397
1398/*
1399 * called by trunc_wq; take i_mutex ourselves
1400 *
1401 * We also truncate in a separate thread.
1402 */
1403static void ceph_vmtruncate_work(struct work_struct *work)
1404{
1405 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1406 i_vmtruncate_work);
1407 struct inode *inode = &ci->vfs_inode;
1408
1409 dout("vmtruncate_work %p\n", inode);
1410 mutex_lock(&inode->i_mutex);
1411 __ceph_do_pending_vmtruncate(inode);
1412 mutex_unlock(&inode->i_mutex);
1413 iput(inode);
1414}
1415
1416/*
1417 * Queue an async vmtruncate. If we fail to queue work, we will handle
1418 * the truncation the next time we call __ceph_do_pending_vmtruncate.
1419 */
1420void ceph_queue_vmtruncate(struct inode *inode)
1421{
1422 struct ceph_inode_info *ci = ceph_inode(inode);
1423
1424 if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
1425 &ci->i_vmtruncate_work)) {
1426 dout("ceph_queue_vmtruncate %p\n", inode);
1427 igrab(inode);
1428 } else {
1429 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1430 inode, ci->i_truncate_pending);
1431 }
1432}
1433
1434/*
1435 * called with i_mutex held.
1436 *
1437 * Make sure any pending truncation is applied before doing anything
1438 * that may depend on it.
1439 */
1440void __ceph_do_pending_vmtruncate(struct inode *inode)
1441{
1442 struct ceph_inode_info *ci = ceph_inode(inode);
1443 u64 to;
1444 int wrbuffer_refs, wake = 0;
1445
1446retry:
1447 spin_lock(&inode->i_lock);
1448 if (ci->i_truncate_pending == 0) {
1449 dout("__do_pending_vmtruncate %p none pending\n", inode);
1450 spin_unlock(&inode->i_lock);
1451 return;
1452 }
1453
1454 /*
1455 * make sure any dirty snapped pages are flushed before we
1456 * possibly truncate them.. so write AND block!
1457 */
1458 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1459 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1460 inode);
1461 spin_unlock(&inode->i_lock);
1462 filemap_write_and_wait_range(&inode->i_data, 0,
1463 inode->i_sb->s_maxbytes);
1464 goto retry;
1465 }
1466
1467 to = ci->i_truncate_size;
1468 wrbuffer_refs = ci->i_wrbuffer_ref;
1469 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1470 ci->i_truncate_pending, to);
1471 spin_unlock(&inode->i_lock);
1472
1473 truncate_inode_pages(inode->i_mapping, to);
1474
1475 spin_lock(&inode->i_lock);
1476 ci->i_truncate_pending--;
1477 if (ci->i_truncate_pending == 0)
1478 wake = 1;
1479 spin_unlock(&inode->i_lock);
1480
1481 if (wrbuffer_refs == 0)
1482 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1483 if (wake)
1484 wake_up(&ci->i_cap_wq);
1485}
1486
1487
1488/*
1489 * symlinks
1490 */
1491static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1492{
1493 struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1494 nd_set_link(nd, ci->i_symlink);
1495 return NULL;
1496}
1497
1498static const struct inode_operations ceph_symlink_iops = {
1499 .readlink = generic_readlink,
1500 .follow_link = ceph_sym_follow_link,
1501};
1502
1503/*
1504 * setattr
1505 */
1506int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1507{
1508 struct inode *inode = dentry->d_inode;
1509 struct ceph_inode_info *ci = ceph_inode(inode);
1510 struct inode *parent_inode = dentry->d_parent->d_inode;
1511 const unsigned int ia_valid = attr->ia_valid;
1512 struct ceph_mds_request *req;
1513 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
1514 int issued;
1515 int release = 0, dirtied = 0;
1516 int mask = 0;
1517 int err = 0;
1518
1519 if (ceph_snap(inode) != CEPH_NOSNAP)
1520 return -EROFS;
1521
1522 __ceph_do_pending_vmtruncate(inode);
1523
1524 err = inode_change_ok(inode, attr);
1525 if (err != 0)
1526 return err;
1527
1528 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
1529 USE_AUTH_MDS);
1530 if (IS_ERR(req))
1531 return PTR_ERR(req);
1532
1533 spin_lock(&inode->i_lock);
1534 issued = __ceph_caps_issued(ci, NULL);
1535 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
1536
1537 if (ia_valid & ATTR_UID) {
1538 dout("setattr %p uid %d -> %d\n", inode,
1539 inode->i_uid, attr->ia_uid);
1540 if (issued & CEPH_CAP_AUTH_EXCL) {
1541 inode->i_uid = attr->ia_uid;
1542 dirtied |= CEPH_CAP_AUTH_EXCL;
1543 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1544 attr->ia_uid != inode->i_uid) {
1545 req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
1546 mask |= CEPH_SETATTR_UID;
1547 release |= CEPH_CAP_AUTH_SHARED;
1548 }
1549 }
1550 if (ia_valid & ATTR_GID) {
1551 dout("setattr %p gid %d -> %d\n", inode,
1552 inode->i_gid, attr->ia_gid);
1553 if (issued & CEPH_CAP_AUTH_EXCL) {
1554 inode->i_gid = attr->ia_gid;
1555 dirtied |= CEPH_CAP_AUTH_EXCL;
1556 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1557 attr->ia_gid != inode->i_gid) {
1558 req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
1559 mask |= CEPH_SETATTR_GID;
1560 release |= CEPH_CAP_AUTH_SHARED;
1561 }
1562 }
1563 if (ia_valid & ATTR_MODE) {
1564 dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
1565 attr->ia_mode);
1566 if (issued & CEPH_CAP_AUTH_EXCL) {
1567 inode->i_mode = attr->ia_mode;
1568 dirtied |= CEPH_CAP_AUTH_EXCL;
1569 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1570 attr->ia_mode != inode->i_mode) {
1571 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1572 mask |= CEPH_SETATTR_MODE;
1573 release |= CEPH_CAP_AUTH_SHARED;
1574 }
1575 }
1576
1577 if (ia_valid & ATTR_ATIME) {
1578 dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
1579 inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
1580 attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
1581 if (issued & CEPH_CAP_FILE_EXCL) {
1582 ci->i_time_warp_seq++;
1583 inode->i_atime = attr->ia_atime;
1584 dirtied |= CEPH_CAP_FILE_EXCL;
1585 } else if ((issued & CEPH_CAP_FILE_WR) &&
1586 timespec_compare(&inode->i_atime,
1587 &attr->ia_atime) < 0) {
1588 inode->i_atime = attr->ia_atime;
1589 dirtied |= CEPH_CAP_FILE_WR;
1590 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1591 !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
1592 ceph_encode_timespec(&req->r_args.setattr.atime,
1593 &attr->ia_atime);
1594 mask |= CEPH_SETATTR_ATIME;
1595 release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
1596 CEPH_CAP_FILE_WR;
1597 }
1598 }
1599 if (ia_valid & ATTR_MTIME) {
1600 dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
1601 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
1602 attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
1603 if (issued & CEPH_CAP_FILE_EXCL) {
1604 ci->i_time_warp_seq++;
1605 inode->i_mtime = attr->ia_mtime;
1606 dirtied |= CEPH_CAP_FILE_EXCL;
1607 } else if ((issued & CEPH_CAP_FILE_WR) &&
1608 timespec_compare(&inode->i_mtime,
1609 &attr->ia_mtime) < 0) {
1610 inode->i_mtime = attr->ia_mtime;
1611 dirtied |= CEPH_CAP_FILE_WR;
1612 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1613 !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
1614 ceph_encode_timespec(&req->r_args.setattr.mtime,
1615 &attr->ia_mtime);
1616 mask |= CEPH_SETATTR_MTIME;
1617 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1618 CEPH_CAP_FILE_WR;
1619 }
1620 }
1621 if (ia_valid & ATTR_SIZE) {
1622 dout("setattr %p size %lld -> %lld\n", inode,
1623 inode->i_size, attr->ia_size);
1624 if (attr->ia_size > inode->i_sb->s_maxbytes) {
1625 err = -EINVAL;
1626 goto out;
1627 }
1628 if ((issued & CEPH_CAP_FILE_EXCL) &&
1629 attr->ia_size > inode->i_size) {
1630 inode->i_size = attr->ia_size;
1631 inode->i_blocks =
1632 (attr->ia_size + (1 << 9) - 1) >> 9;
1633 inode->i_ctime = attr->ia_ctime;
1634 ci->i_reported_size = attr->ia_size;
1635 dirtied |= CEPH_CAP_FILE_EXCL;
1636 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1637 attr->ia_size != inode->i_size) {
1638 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
1639 req->r_args.setattr.old_size =
1640 cpu_to_le64(inode->i_size);
1641 mask |= CEPH_SETATTR_SIZE;
1642 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1643 CEPH_CAP_FILE_WR;
1644 }
1645 }
1646
1647 /* these do nothing */
1648 if (ia_valid & ATTR_CTIME) {
1649 bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
1650 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
1651 dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
1652 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
1653 attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
1654 only ? "ctime only" : "ignored");
1655 inode->i_ctime = attr->ia_ctime;
1656 if (only) {
1657 /*
1658 * if the kernel wants to dirty ctime but nothing else,
1659 * we need to choose a cap to dirty under, or do
1660 * an almost-no-op setattr
1661 */
1662 if (issued & CEPH_CAP_AUTH_EXCL)
1663 dirtied |= CEPH_CAP_AUTH_EXCL;
1664 else if (issued & CEPH_CAP_FILE_EXCL)
1665 dirtied |= CEPH_CAP_FILE_EXCL;
1666 else if (issued & CEPH_CAP_XATTR_EXCL)
1667 dirtied |= CEPH_CAP_XATTR_EXCL;
1668 else
1669 mask |= CEPH_SETATTR_CTIME;
1670 }
1671 }
1672 if (ia_valid & ATTR_FILE)
1673 dout("setattr %p ATTR_FILE ... hrm!\n", inode);
1674
1675 if (dirtied) {
1676 __ceph_mark_dirty_caps(ci, dirtied);
1677 inode->i_ctime = CURRENT_TIME;
1678 }
1679
1680 release &= issued;
1681 spin_unlock(&inode->i_lock);
1682
1683 if (mask) {
1684 req->r_inode = igrab(inode);
1685 req->r_inode_drop = release;
1686 req->r_args.setattr.mask = cpu_to_le32(mask);
1687 req->r_num_caps = 1;
1688 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1689 }
1690 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1691 ceph_cap_string(dirtied), mask);
1692
1693 ceph_mdsc_put_request(req);
1694 __ceph_do_pending_vmtruncate(inode);
1695 return err;
1696out:
1697 spin_unlock(&inode->i_lock);
1698 ceph_mdsc_put_request(req);
1699 return err;
1700}
1701
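/*
 * The per-attribute pattern in ceph_setattr() above, summarized for
 * reference (a reading of the code, not a separate interface): if we
 * hold the relevant *_EXCL cap we apply the change locally and mark
 * that cap dirty; if we only hold *_SHARED and the value is already
 * correct we do nothing; otherwise we encode the new value into
 * req->r_args.setattr and ask the MDS to make the change, releasing
 * the shared cap.
 */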
1702/*
1703 * Verify that we have a lease on the given mask. If not,
1704 * do a getattr against an mds.
1705 */
1706int ceph_do_getattr(struct inode *inode, int mask)
1707{
1708 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
1709 struct ceph_mds_client *mdsc = &client->mdsc;
1710 struct ceph_mds_request *req;
1711 int err;
1712
1713 if (ceph_snap(inode) == CEPH_SNAPDIR) {
1714 dout("do_getattr inode %p SNAPDIR\n", inode);
1715 return 0;
1716 }
1717
1718 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
1719 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1720 return 0;
1721
1722 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1723 if (IS_ERR(req))
1724 return PTR_ERR(req);
1725 req->r_inode = igrab(inode);
1726 req->r_num_caps = 1;
1727 req->r_args.getattr.mask = cpu_to_le32(mask);
1728 err = ceph_mdsc_do_request(mdsc, NULL, req);
1729 ceph_mdsc_put_request(req);
1730 dout("do_getattr result=%d\n", err);
1731 return err;
1732}
1733
1734
1735/*
1736 * Check inode permissions. We verify we have a valid value for
1737 * the AUTH cap, then call the generic handler.
1738 */
1739int ceph_permission(struct inode *inode, int mask)
1740{
1741 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1742
1743 if (!err)
1744 err = generic_permission(inode, mask, NULL);
1745 return err;
1746}
1747
1748/*
1749 * Get all attributes. Hopefully someday we'll have a statlite()
1750 * and can limit the fields we require to be accurate.
1751 */
1752int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1753 struct kstat *stat)
1754{
1755 struct inode *inode = dentry->d_inode;
1756 struct ceph_inode_info *ci = ceph_inode(inode);
1757 int err;
1758
1759 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1760 if (!err) {
1761 generic_fillattr(inode, stat);
1762 stat->ino = inode->i_ino;
1763 if (ceph_snap(inode) != CEPH_NOSNAP)
1764 stat->dev = ceph_snap(inode);
1765 else
1766 stat->dev = 0;
1767 if (S_ISDIR(inode->i_mode)) {
1768 stat->size = ci->i_rbytes;
1769 stat->blocks = 0;
1770 stat->blksize = 65536;
1771 }
1772 }
1773 return err;
1774}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..8a5bcae62846
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
1#include <linux/in.h>
2
3#include "ioctl.h"
4#include "super.h"
5#include "ceph_debug.h"
6
7
8/*
9 * ioctls
10 */
11
12/*
13 * get and set the file layout
14 */
15static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
16{
17 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
18 struct ceph_ioctl_layout l;
19 int err;
20
21 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
22 if (!err) {
23 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
24 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
25 l.object_size = ceph_file_layout_object_size(ci->i_layout);
26 l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
27 l.preferred_osd =
28 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
29 if (copy_to_user(arg, &l, sizeof(l)))
30 return -EFAULT;
31 }
32
33 return err;
34}
35
36static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{
38 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l;
43 int err, i;
44
45 /* copy and validate */
46 if (copy_from_user(&l, arg, sizeof(l)))
47 return -EFAULT;
48
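 /*
 * illustrative values (not defaults) that pass the checks below:
 * stripe_unit 65536, stripe_count 4, object_size 4194304; both
 * sizes are page-aligned and 4194304 % 65536 == 0
 */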
49 if ((l.object_size & ~PAGE_MASK) ||
50 (l.stripe_unit & ~PAGE_MASK) ||
51 !l.stripe_unit ||
52 (l.object_size &&
53 (unsigned)l.object_size % (unsigned)l.stripe_unit))
54 return -EINVAL;
55
56 /* make sure it's a valid data pool */
57 if (l.data_pool > 0) {
58 mutex_lock(&mdsc->mutex);
59 err = -EINVAL;
60 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
61 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
62 err = 0;
63 break;
64 }
65 mutex_unlock(&mdsc->mutex);
66 if (err)
67 return err;
68 }
69
70 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
71 USE_AUTH_MDS);
72 if (IS_ERR(req))
73 return PTR_ERR(req);
74 req->r_inode = igrab(inode);
75 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
76
77 req->r_args.setlayout.layout.fl_stripe_unit =
78 cpu_to_le32(l.stripe_unit);
79 req->r_args.setlayout.layout.fl_stripe_count =
80 cpu_to_le32(l.stripe_count);
81 req->r_args.setlayout.layout.fl_object_size =
82 cpu_to_le32(l.object_size);
83 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
84 req->r_args.setlayout.layout.fl_pg_preferred =
85 cpu_to_le32(l.preferred_osd);
86
87 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
88 ceph_mdsc_put_request(req);
89 return err;
90}
91
92/*
93 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset.
95 */
96static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
97{
98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
102 u64 len = 1, olen;
103 u64 tmp;
104 struct ceph_object_layout ol;
105 struct ceph_pg pgid;
106
107 /* copy and validate */
108 if (copy_from_user(&dl, arg, sizeof(dl)))
109 return -EFAULT;
110
111 down_read(&osdc->map_sem);
112 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
113 &dl.object_no, &dl.object_offset, &olen);
114 dl.file_offset -= dl.object_offset;
115 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
116 dl.block_size = ceph_file_layout_su(ci->i_layout);
117
118 /* block_offset = object_offset % block_size */
119 tmp = dl.object_offset;
120 dl.block_offset = do_div(tmp, dl.block_size);
121
122 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
123 ceph_ino(inode), dl.object_no);
124 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
125 osdc->osdmap);
126
127 pgid = ol.ol_pgid;
128 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
129 if (dl.osd >= 0) {
130 struct ceph_entity_addr *a =
131 ceph_osd_addr(osdc->osdmap, dl.osd);
132 if (a)
133 memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
134 } else {
135 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
136 }
137 up_read(&osdc->map_sem);
138
139 /* send result back to user */
140 if (copy_to_user(arg, &dl, sizeof(dl)))
141 return -EFAULT;
142
143 return 0;
144}
145
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
149 switch (cmd) {
150 case CEPH_IOC_GET_LAYOUT:
151 return ceph_ioctl_get_layout(file, (void __user *)arg);
152
153 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg);
155
156 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
158 }
159 return -ENOTTY;
160}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
1#ifndef FS_CEPH_IOCTL_H
2#define FS_CEPH_IOCTL_H
3
4#include <linux/ioctl.h>
5#include <linux/types.h>
6
7#define CEPH_IOCTL_MAGIC 0x97
8
9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout {
11 __u64 stripe_unit, stripe_count, object_size;
12 __u64 data_pool;
13 __s64 preferred_osd;
14};
15
16#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout)
20
21/*
22 * Extract the identity and address of the OSD, and the object,
23 * storing a given file offset.
24 */
25struct ceph_ioctl_dataloc {
26 __u64 file_offset; /* in+out: file offset */
27 __u64 object_offset; /* out: offset in object */
28 __u64 object_no; /* out: object # */
29 __u64 object_size; /* out: object size */
30 char object_name[64]; /* out: object name */
31 __u64 block_offset; /* out: offset in block */
32 __u64 block_size; /* out: block length */
33 __s64 osd; /* out: osd # */
34 struct sockaddr_storage osd_addr; /* out: osd address */
35};
36
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc)
39
40#endif
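As a usage illustration (not part of this changeset), userspace might
query a file's layout through the CEPH_IOC_GET_LAYOUT ioctl defined
above roughly as follows; the include path and the path argument are
assumptions:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "ioctl.h" /* assumed local copy of the header above */

int main(int argc, char **argv)
{
 struct ceph_ioctl_layout l;
 int fd;

 if (argc < 2)
 return 1;
 fd = open(argv[1], O_RDONLY); /* a file on a ceph mount */
 if (fd < 0 || ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) < 0) {
 perror("CEPH_IOC_GET_LAYOUT");
 return 1;
 }
 printf("stripe_unit %llu stripe_count %llu object_size %llu pool %llu\n",
 (unsigned long long)l.stripe_unit,
 (unsigned long long)l.stripe_count,
 (unsigned long long)l.object_size,
 (unsigned long long)l.data_pool);
 close(fd);
 return 0;
}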
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..60a9a4ae47be
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3043 @@
1#include "ceph_debug.h"
2
3#include <linux/wait.h>
4#include <linux/slab.h>
5#include <linux/sched.h>
6
7#include "mds_client.h"
8#include "mon_client.h"
9#include "super.h"
10#include "messenger.h"
11#include "decode.h"
12#include "auth.h"
13#include "pagelist.h"
14
15/*
16 * A cluster of MDS (metadata server) daemons is responsible for
17 * managing the file system namespace (the directory hierarchy and
18 * inodes) and for coordinating shared access to storage. Metadata is
19 * partitioned hierarchically across a number of servers, and that
20 * partition varies over time as the cluster adjusts the distribution
21 * in order to balance load.
22 *
23 * The MDS client is primarily responsible for managing synchronous
24 * metadata requests for operations like open, unlink, and so forth.
25 * If there is an MDS failure, we find out about it when we (possibly
26 * request and) receive a new MDS map, and can resubmit affected
27 * requests.
28 *
29 * For the most part, though, we take advantage of a lossless
30 * communications channel to the MDS, and do not need to worry about
31 * timing out or resubmitting requests.
32 *
33 * We maintain a stateful "session" with each MDS we interact with.
34 * Within each session, we send periodic heartbeat messages to ensure
35 * any capabilities or leases we have been issued remain valid. If
36 * the session times out and goes stale, our leases and capabilities
37 * are no longer valid.
38 */
39
40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head);
42
43static const struct ceph_connection_operations mds_con_ops;
44
45
46/*
47 * mds reply parsing
48 */
49
50/*
51 * parse individual inode info
52 */
53static int parse_reply_info_in(void **p, void *end,
54 struct ceph_mds_reply_info_in *info)
55{
56 int err = -EIO;
57
58 info->in = *p;
59 *p += sizeof(struct ceph_mds_reply_inode) +
60 sizeof(*info->in->fragtree.splits) *
61 le32_to_cpu(info->in->fragtree.nsplits);
62
63 ceph_decode_32_safe(p, end, info->symlink_len, bad);
64 ceph_decode_need(p, end, info->symlink_len, bad);
65 info->symlink = *p;
66 *p += info->symlink_len;
67
68 ceph_decode_32_safe(p, end, info->xattr_len, bad);
69 ceph_decode_need(p, end, info->xattr_len, bad);
70 info->xattr_data = *p;
71 *p += info->xattr_len;
72 return 0;
73bad:
74 return err;
75}
76
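/*
 * The ceph_decode_*_safe() helpers used above bounds-check against
 * 'end' before reading and jump to the supplied label on a short
 * buffer; that is why each parser in this file carries a 'bad:' exit
 * returning -EIO.
 */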
77/*
78 * parse a normal reply, which may contain a (dir+)dentry and/or a
79 * target inode.
80 */
81static int parse_reply_info_trace(void **p, void *end,
82 struct ceph_mds_reply_info_parsed *info)
83{
84 int err;
85
86 if (info->head->is_dentry) {
87 err = parse_reply_info_in(p, end, &info->diri);
88 if (err < 0)
89 goto out_bad;
90
91 if (unlikely(*p + sizeof(*info->dirfrag) > end))
92 goto bad;
93 info->dirfrag = *p;
94 *p += sizeof(*info->dirfrag) +
95 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
96 if (unlikely(*p > end))
97 goto bad;
98
99 ceph_decode_32_safe(p, end, info->dname_len, bad);
100 ceph_decode_need(p, end, info->dname_len, bad);
101 info->dname = *p;
102 *p += info->dname_len;
103 info->dlease = *p;
104 *p += sizeof(*info->dlease);
105 }
106
107 if (info->head->is_target) {
108 err = parse_reply_info_in(p, end, &info->targeti);
109 if (err < 0)
110 goto out_bad;
111 }
112
113 if (unlikely(*p != end))
114 goto bad;
115 return 0;
116
117bad:
118 err = -EIO;
119out_bad:
120 pr_err("problem parsing mds trace %d\n", err);
121 return err;
122}
123
124/*
125 * parse readdir results
126 */
127static int parse_reply_info_dir(void **p, void *end,
128 struct ceph_mds_reply_info_parsed *info)
129{
130 u32 num, i = 0;
131 int err;
132
133 info->dir_dir = *p;
134 if (*p + sizeof(*info->dir_dir) > end)
135 goto bad;
136 *p += sizeof(*info->dir_dir) +
137 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
138 if (*p > end)
139 goto bad;
140
141 ceph_decode_need(p, end, sizeof(num) + 2, bad);
142 num = ceph_decode_32(p);
143 info->dir_end = ceph_decode_8(p);
144 info->dir_complete = ceph_decode_8(p);
145 if (num == 0)
146 goto done;
147
148 /* alloc large array */
149 info->dir_nr = num;
150 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
151 sizeof(*info->dir_dname) +
152 sizeof(*info->dir_dname_len) +
153 sizeof(*info->dir_dlease),
154 GFP_NOFS);
155 if (info->dir_in == NULL) {
156 err = -ENOMEM;
157 goto out_bad;
158 }
159 info->dir_dname = (void *)(info->dir_in + num);
160 info->dir_dname_len = (void *)(info->dir_dname + num);
161 info->dir_dlease = (void *)(info->dir_dname_len + num);
162
163 while (num) {
164 /* dentry */
165 ceph_decode_need(p, end, sizeof(u32)*2, bad);
166 info->dir_dname_len[i] = ceph_decode_32(p);
167 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
168 info->dir_dname[i] = *p;
169 *p += info->dir_dname_len[i];
170 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
171 info->dir_dname[i]);
172 info->dir_dlease[i] = *p;
173 *p += sizeof(struct ceph_mds_reply_lease);
174
175 /* inode */
176 err = parse_reply_info_in(p, end, &info->dir_in[i]);
177 if (err < 0)
178 goto out_bad;
179 i++;
180 num--;
181 }
182
183done:
184 if (*p != end)
185 goto bad;
186 return 0;
187
188bad:
189 err = -EIO;
190out_bad:
191 pr_err("problem parsing dir contents %d\n", err);
192 return err;
193}
194
195/*
196 * parse entire mds reply
197 */
198static int parse_reply_info(struct ceph_msg *msg,
199 struct ceph_mds_reply_info_parsed *info)
200{
201 void *p, *end;
202 u32 len;
203 int err;
204
205 info->head = msg->front.iov_base;
206 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
207 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
208
209 /* trace */
210 ceph_decode_32_safe(&p, end, len, bad);
211 if (len > 0) {
212 err = parse_reply_info_trace(&p, p+len, info);
213 if (err < 0)
214 goto out_bad;
215 }
216
217 /* dir content */
218 ceph_decode_32_safe(&p, end, len, bad);
219 if (len > 0) {
220 err = parse_reply_info_dir(&p, p+len, info);
221 if (err < 0)
222 goto out_bad;
223 }
224
225 /* snap blob */
226 ceph_decode_32_safe(&p, end, len, bad);
227 info->snapblob_len = len;
228 info->snapblob = p;
229 p += len;
230
231 if (p != end)
232 goto bad;
233 return 0;
234
235bad:
236 err = -EIO;
237out_bad:
238 pr_err("mds parse_reply err %d\n", err);
239 return err;
240}
241
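/*
 * For reference, the reply front decoded above consists of three
 * length-prefixed regions following the ceph_mds_reply_head:
 *
 * u32 trace_len; trace bytes (dentry and/or target inode info)
 * u32 dir_len; readdir bytes (dirfrag, then dentry/inode array)
 * u32 snap_len; snap blob
 */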
242static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
243{
244 kfree(info->dir_in);
245}
246
247
248/*
249 * sessions
250 */
251static const char *session_state_name(int s)
252{
253 switch (s) {
254 case CEPH_MDS_SESSION_NEW: return "new";
255 case CEPH_MDS_SESSION_OPENING: return "opening";
256 case CEPH_MDS_SESSION_OPEN: return "open";
257 case CEPH_MDS_SESSION_HUNG: return "hung";
258 case CEPH_MDS_SESSION_CLOSING: return "closing";
259 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
260 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
261 default: return "???";
262 }
263}
264
265static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
266{
267 if (atomic_inc_not_zero(&s->s_ref)) {
268 dout("mdsc get_session %p %d -> %d\n", s,
269 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
270 return s;
271 } else {
272 dout("mdsc get_session %p 0 -- FAIL", s);
273 return NULL;
274 }
275}
276
277void ceph_put_mds_session(struct ceph_mds_session *s)
278{
279 dout("mdsc put_session %p %d -> %d\n", s,
280 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
281 if (atomic_dec_and_test(&s->s_ref)) {
282 if (s->s_authorizer)
283 s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
284 s->s_mdsc->client->monc.auth, s->s_authorizer);
285 kfree(s);
286 }
287}
288
289/*
290 * called under mdsc->mutex
291 */
292struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
293 int mds)
294{
295 struct ceph_mds_session *session;
296
297 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
298 return NULL;
299 session = mdsc->sessions[mds];
300 dout("lookup_mds_session %p %d\n", session,
301 atomic_read(&session->s_ref));
302 get_session(session);
303 return session;
304}
305
306static bool __have_session(struct ceph_mds_client *mdsc, int mds)
307{
308 if (mds >= mdsc->max_sessions)
309 return false;
310 return mdsc->sessions[mds];
311}
312
313static int __verify_registered_session(struct ceph_mds_client *mdsc,
314 struct ceph_mds_session *s)
315{
316 if (s->s_mds >= mdsc->max_sessions ||
317 mdsc->sessions[s->s_mds] != s)
318 return -ENOENT;
319 return 0;
320}
321
322/*
323 * create+register a new session for given mds.
324 * called under mdsc->mutex.
325 */
326static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
327 int mds)
328{
329 struct ceph_mds_session *s;
330
331 s = kzalloc(sizeof(*s), GFP_NOFS);
332 if (!s)
333 return ERR_PTR(-ENOMEM);
334 s->s_mdsc = mdsc;
335 s->s_mds = mds;
336 s->s_state = CEPH_MDS_SESSION_NEW;
337 s->s_ttl = 0;
338 s->s_seq = 0;
339 mutex_init(&s->s_mutex);
340
341 ceph_con_init(mdsc->client->msgr, &s->s_con);
342 s->s_con.private = s;
343 s->s_con.ops = &mds_con_ops;
344 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
345 s->s_con.peer_name.num = cpu_to_le64(mds);
346
347 spin_lock_init(&s->s_cap_lock);
348 s->s_cap_gen = 0;
349 s->s_cap_ttl = 0;
350 s->s_renew_requested = 0;
351 s->s_renew_seq = 0;
352 INIT_LIST_HEAD(&s->s_caps);
353 s->s_nr_caps = 0;
354 s->s_trim_caps = 0;
355 atomic_set(&s->s_ref, 1);
356 INIT_LIST_HEAD(&s->s_waiting);
357 INIT_LIST_HEAD(&s->s_unsafe);
358 s->s_num_cap_releases = 0;
359 s->s_cap_iterator = NULL;
360 INIT_LIST_HEAD(&s->s_cap_releases);
361 INIT_LIST_HEAD(&s->s_cap_releases_done);
362 INIT_LIST_HEAD(&s->s_cap_flushing);
363 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
364
365 dout("register_session mds%d\n", mds);
366 if (mds >= mdsc->max_sessions) {
367 int newmax = 1 << get_count_order(mds+1);
368 struct ceph_mds_session **sa;
369
370 dout("register_session realloc to %d\n", newmax);
371 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
372 if (sa == NULL)
373 goto fail_realloc;
374 if (mdsc->sessions) {
375 memcpy(sa, mdsc->sessions,
376 mdsc->max_sessions * sizeof(void *));
377 kfree(mdsc->sessions);
378 }
379 mdsc->sessions = sa;
380 mdsc->max_sessions = newmax;
381 }
382 mdsc->sessions[mds] = s;
383 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
384
385 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
386
387 return s;
388
389fail_realloc:
390 kfree(s);
391 return ERR_PTR(-ENOMEM);
392}
393
394/*
395 * called under mdsc->mutex
396 */
397static void __unregister_session(struct ceph_mds_client *mdsc,
398 struct ceph_mds_session *s)
399{
400 dout("__unregister_session mds%d %p\n", s->s_mds, s);
401 BUG_ON(mdsc->sessions[s->s_mds] != s);
402 mdsc->sessions[s->s_mds] = NULL;
403 ceph_con_close(&s->s_con);
404 ceph_put_mds_session(s);
405}
406
407/*
408 * drop session refs in request.
409 *
410 * should be last request ref, or hold mdsc->mutex
411 */
412static void put_request_session(struct ceph_mds_request *req)
413{
414 if (req->r_session) {
415 ceph_put_mds_session(req->r_session);
416 req->r_session = NULL;
417 }
418}
419
420void ceph_mdsc_release_request(struct kref *kref)
421{
422 struct ceph_mds_request *req = container_of(kref,
423 struct ceph_mds_request,
424 r_kref);
425 if (req->r_request)
426 ceph_msg_put(req->r_request);
427 if (req->r_reply) {
428 ceph_msg_put(req->r_reply);
429 destroy_reply_info(&req->r_reply_info);
430 }
431 if (req->r_inode) {
432 ceph_put_cap_refs(ceph_inode(req->r_inode),
433 CEPH_CAP_PIN);
434 iput(req->r_inode);
435 }
436 if (req->r_locked_dir)
437 ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
438 CEPH_CAP_PIN);
439 if (req->r_target_inode)
440 iput(req->r_target_inode);
441 if (req->r_dentry)
442 dput(req->r_dentry);
443 if (req->r_old_dentry) {
444 ceph_put_cap_refs(
445 ceph_inode(req->r_old_dentry->d_parent->d_inode),
446 CEPH_CAP_PIN);
447 dput(req->r_old_dentry);
448 }
449 kfree(req->r_path1);
450 kfree(req->r_path2);
451 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation);
453 kfree(req);
454}
455
456/*
457 * lookup request, bump ref if found.
458 *
459 * called under mdsc->mutex.
460 */
461static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
462 u64 tid)
463{
464 struct ceph_mds_request *req;
465 struct rb_node *n = mdsc->request_tree.rb_node;
466
467 while (n) {
468 req = rb_entry(n, struct ceph_mds_request, r_node);
469 if (tid < req->r_tid)
470 n = n->rb_left;
471 else if (tid > req->r_tid)
472 n = n->rb_right;
473 else {
474 ceph_mdsc_get_request(req);
475 return req;
476 }
477 }
478 return NULL;
479}
480
481static void __insert_request(struct ceph_mds_client *mdsc,
482 struct ceph_mds_request *new)
483{
484 struct rb_node **p = &mdsc->request_tree.rb_node;
485 struct rb_node *parent = NULL;
486 struct ceph_mds_request *req = NULL;
487
488 while (*p) {
489 parent = *p;
490 req = rb_entry(parent, struct ceph_mds_request, r_node);
491 if (new->r_tid < req->r_tid)
492 p = &(*p)->rb_left;
493 else if (new->r_tid > req->r_tid)
494 p = &(*p)->rb_right;
495 else
496 BUG();
497 }
498
499 rb_link_node(&new->r_node, parent, p);
500 rb_insert_color(&new->r_node, &mdsc->request_tree);
501}
502
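/*
 * Requests live in mdsc->request_tree, an rbtree keyed by tid, so
 * __lookup_request() above costs O(log n) in the number of requests
 * in flight.
 */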
503/*
504 * Register an in-flight request, and assign a tid. Link to directory
505 * are modifying (if any).
506 *
507 * Called under mdsc->mutex.
508 */
509static void __register_request(struct ceph_mds_client *mdsc,
510 struct ceph_mds_request *req,
511 struct inode *dir)
512{
513 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req);
519
520 if (dir) {
521 struct ceph_inode_info *ci = ceph_inode(dir);
522
523 spin_lock(&ci->i_unsafe_lock);
524 req->r_unsafe_dir = dir;
525 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
526 spin_unlock(&ci->i_unsafe_lock);
527 }
528}
529
530static void __unregister_request(struct ceph_mds_client *mdsc,
531 struct ceph_mds_request *req)
532{
533 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
534 rb_erase(&req->r_node, &mdsc->request_tree);
535 RB_CLEAR_NODE(&req->r_node);
536
537 if (req->r_unsafe_dir) {
538 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
539
540 spin_lock(&ci->i_unsafe_lock);
541 list_del_init(&req->r_unsafe_dir_item);
542 spin_unlock(&ci->i_unsafe_lock);
543 }
544
545 ceph_mdsc_put_request(req);
546}
547
548/*
549 * Choose mds to send request to next. If there is a hint set in the
550 * request (e.g., due to a prior forward hint from the mds), use that.
551 * Otherwise, consult frag tree and/or caps to identify the
552 * appropriate mds. If all else fails, choose randomly.
553 *
554 * Called under mdsc->mutex.
555 */
556static int __choose_mds(struct ceph_mds_client *mdsc,
557 struct ceph_mds_request *req)
558{
559 struct inode *inode;
560 struct ceph_inode_info *ci;
561 struct ceph_cap *cap;
562 int mode = req->r_direct_mode;
563 int mds = -1;
564 u32 hash = req->r_direct_hash;
565 bool is_hash = req->r_direct_is_hash;
566
567 /*
568 * is there a specific mds we should try? ignore hint if we have
569 * no session and the mds is not up (active or recovering).
570 */
571 if (req->r_resend_mds >= 0 &&
572 (__have_session(mdsc, req->r_resend_mds) ||
573 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
574 dout("choose_mds using resend_mds mds%d\n",
575 req->r_resend_mds);
576 return req->r_resend_mds;
577 }
578
579 if (mode == USE_RANDOM_MDS)
580 goto random;
581
582 inode = NULL;
583 if (req->r_inode) {
584 inode = req->r_inode;
585 } else if (req->r_dentry) {
586 if (req->r_dentry->d_inode) {
587 inode = req->r_dentry->d_inode;
588 } else {
589 inode = req->r_dentry->d_parent->d_inode;
590 hash = req->r_dentry->d_name.hash;
591 is_hash = true;
592 }
593 }
594 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
595 (int)hash, mode);
596 if (!inode)
597 goto random;
598 ci = ceph_inode(inode);
599
600 if (is_hash && S_ISDIR(inode->i_mode)) {
601 struct ceph_inode_frag frag;
602 int found;
603
604 ceph_choose_frag(ci, hash, &frag, &found);
605 if (found) {
606 if (mode == USE_ANY_MDS && frag.ndist > 0) {
607 u8 r;
608
609 /* choose a random replica */
610 get_random_bytes(&r, 1);
611 r %= frag.ndist;
612 mds = frag.dist[r];
613 dout("choose_mds %p %llx.%llx "
614 "frag %u mds%d (%d/%d)\n",
615 inode, ceph_vinop(inode),
616 frag.frag, mds,
617 (int)r, frag.ndist);
618 return mds;
619 }
620
621 /* this file/dir wasn't known to be
622 * replicated, so we want to look for the
623 * authoritative mds. */
624 mode = USE_AUTH_MDS;
625 if (frag.mds >= 0) {
626 /* choose auth mds */
627 mds = frag.mds;
628 dout("choose_mds %p %llx.%llx "
629 "frag %u mds%d (auth)\n",
630 inode, ceph_vinop(inode), frag.frag, mds);
631 return mds;
632 }
633 }
634 }
635
636 spin_lock(&inode->i_lock);
637 cap = NULL;
638 if (mode == USE_AUTH_MDS)
639 cap = ci->i_auth_cap;
640 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
641 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
642 if (!cap) {
643 spin_unlock(&inode->i_lock);
644 goto random;
645 }
646 mds = cap->session->s_mds;
647 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
648 inode, ceph_vinop(inode), mds,
649 cap == ci->i_auth_cap ? "auth " : "", cap);
650 spin_unlock(&inode->i_lock);
651 return mds;
652
653random:
654 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
655 dout("choose_mds chose random mds%d\n", mds);
656 return mds;
657}
658
659
660/*
661 * session messages
662 */
663static struct ceph_msg *create_session_msg(u32 op, u64 seq)
664{
665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h;
667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
669 if (IS_ERR(msg)) {
670 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg));
672 }
673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op);
675 h->seq = cpu_to_le64(seq);
676 return msg;
677}
678
679/*
680 * send session open request.
681 *
682 * called under mdsc->mutex
683 */
684static int __open_session(struct ceph_mds_client *mdsc,
685 struct ceph_mds_session *session)
686{
687 struct ceph_msg *msg;
688 int mstate;
689 int mds = session->s_mds;
690 int err = 0;
691
692 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
694 dout("open_session to mds%d (%s)\n", mds,
695 ceph_mds_state_name(mstate));
696 session->s_state = CEPH_MDS_SESSION_OPENING;
697 session->s_renew_requested = jiffies;
698
699 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) {
702 err = PTR_ERR(msg);
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg);
706
707out:
708 return err;
709}
710
711/*
712 * session caps
713 */
714
715/*
716 * Free preallocated cap messages assigned to this session
717 */
718static void cleanup_cap_releases(struct ceph_mds_session *session)
719{
720 struct ceph_msg *msg;
721
722 spin_lock(&session->s_cap_lock);
723 while (!list_empty(&session->s_cap_releases)) {
724 msg = list_first_entry(&session->s_cap_releases,
725 struct ceph_msg, list_head);
726 list_del_init(&msg->list_head);
727 ceph_msg_put(msg);
728 }
729 while (!list_empty(&session->s_cap_releases_done)) {
730 msg = list_first_entry(&session->s_cap_releases_done,
731 struct ceph_msg, list_head);
732 list_del_init(&msg->list_head);
733 ceph_msg_put(msg);
734 }
735 spin_unlock(&session->s_cap_lock);
736}
737
738/*
739 * Helper to safely iterate over all caps associated with a session.
740 *
741 * caller must hold session s_mutex
742 */
743static int iterate_session_caps(struct ceph_mds_session *session,
744 int (*cb)(struct inode *, struct ceph_cap *,
745 void *), void *arg)
746{
747 struct list_head *p;
748 struct ceph_cap *cap;
749 struct inode *inode, *last_inode = NULL;
750 struct ceph_cap *old_cap = NULL;
751 int ret;
752
753 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
754 spin_lock(&session->s_cap_lock);
755 p = session->s_caps.next;
756 while (p != &session->s_caps) {
757 cap = list_entry(p, struct ceph_cap, session_caps);
758 inode = igrab(&cap->ci->vfs_inode);
759 if (!inode) {
760 p = p->next;
761 continue;
762 }
763 session->s_cap_iterator = cap;
764 spin_unlock(&session->s_cap_lock);
765
766 if (last_inode) {
767 iput(last_inode);
768 last_inode = NULL;
769 }
770 if (old_cap) {
771 ceph_put_cap(old_cap);
772 old_cap = NULL;
773 }
774
775 ret = cb(inode, cap, arg);
776 last_inode = inode;
777
778 spin_lock(&session->s_cap_lock);
779 p = p->next;
780 if (cap->ci == NULL) {
781 dout("iterate_session_caps finishing cap %p removal\n",
782 cap);
783 BUG_ON(cap->session != session);
784 list_del_init(&cap->session_caps);
785 session->s_nr_caps--;
786 cap->session = NULL;
787 old_cap = cap; /* put_cap it w/o locks held */
788 }
789 if (ret < 0)
790 goto out;
791 }
792 ret = 0;
793out:
794 session->s_cap_iterator = NULL;
795 spin_unlock(&session->s_cap_lock);
796
797 if (last_inode)
798 iput(last_inode);
799 if (old_cap)
800 ceph_put_cap(old_cap);
801
802 return ret;
803}
804
805static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
806 void *arg)
807{
808 struct ceph_inode_info *ci = ceph_inode(inode);
809 dout("removing cap %p, ci is %p, inode is %p\n",
810 cap, ci, &ci->vfs_inode);
811 ceph_remove_cap(cap);
812 return 0;
813}
814
815/*
816 * caller must hold session s_mutex
817 */
818static void remove_session_caps(struct ceph_mds_session *session)
819{
820 dout("remove_session_caps on %p\n", session);
821 iterate_session_caps(session, remove_session_caps_cb, NULL);
822 BUG_ON(session->s_nr_caps > 0);
823 cleanup_cap_releases(session);
824}
825
826/*
827 * wake up any threads waiting on this session's caps. if requested
828 * (reconnect), also reset the wanted/requested max_size fields.
829 *
830 * caller must hold s_mutex.
831 */
832static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
833 void *arg)
834{
835 struct ceph_inode_info *ci = ceph_inode(inode);
836
837 wake_up(&ci->i_cap_wq);
838 if (arg) {
839 spin_lock(&inode->i_lock);
840 ci->i_wanted_max_size = 0;
841 ci->i_requested_max_size = 0;
842 spin_unlock(&inode->i_lock);
843 }
844 return 0;
845}
846
847static void wake_up_session_caps(struct ceph_mds_session *session,
848 int reconnect)
849{
850 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
851 iterate_session_caps(session, wake_up_session_cb,
852 (void *)(unsigned long)reconnect);
853}
854
855/*
856 * Send periodic message to MDS renewing all currently held caps. The
857 * ack will reset the expiration for all caps from this session.
858 *
859 * caller holds s_mutex
860 */
861static int send_renew_caps(struct ceph_mds_client *mdsc,
862 struct ceph_mds_session *session)
863{
864 struct ceph_msg *msg;
865 int state;
866
867 if (time_after_eq(jiffies, session->s_cap_ttl) &&
868 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
869 pr_info("mds%d caps stale\n", session->s_mds);
870 session->s_renew_requested = jiffies;
871
872 /* do not try to renew caps until a recovering mds has reconnected
873 * with its clients. */
874 state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
875 if (state < CEPH_MDS_STATE_RECONNECT) {
876 dout("send_renew_caps ignoring mds%d (%s)\n",
877 session->s_mds, ceph_mds_state_name(state));
878 return 0;
879 }
880
881 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
882 ceph_mds_state_name(state));
883 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
884 ++session->s_renew_seq);
885 if (IS_ERR(msg))
886 return PTR_ERR(msg);
887 ceph_con_send(&session->s_con, msg);
888 return 0;
889}
890
891/*
892 * Note new cap ttl, and any transition from stale -> fresh.
893 *
894 * Called under session->s_mutex
895 */
896static void renewed_caps(struct ceph_mds_client *mdsc,
897 struct ceph_mds_session *session, int is_renew)
898{
899 int was_stale;
900 int wake = 0;
901
902 spin_lock(&session->s_cap_lock);
903 was_stale = is_renew && (session->s_cap_ttl == 0 ||
904 time_after_eq(jiffies, session->s_cap_ttl));
905
906 session->s_cap_ttl = session->s_renew_requested +
907 mdsc->mdsmap->m_session_timeout*HZ;
908
909 if (was_stale) {
910 if (time_before(jiffies, session->s_cap_ttl)) {
911 pr_info("mds%d caps renewed\n", session->s_mds);
912 wake = 1;
913 } else {
914 pr_info("mds%d caps still stale\n", session->s_mds);
915 }
916 }
917 dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
918 session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
919 time_before(jiffies, session->s_cap_ttl) ? "fresh" : "stale");
920 spin_unlock(&session->s_cap_lock);
921
922 if (wake)
923 wake_up_session_caps(session, 0);
924}
925
926/*
927 * send a session close request
928 */
929static int request_close_session(struct ceph_mds_client *mdsc,
930 struct ceph_mds_session *session)
931{
932 struct ceph_msg *msg;
933 int err = 0;
934
935 dout("request_close_session mds%d state %s seq %lld\n",
936 session->s_mds, session_state_name(session->s_state),
937 session->s_seq);
938 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
939 if (IS_ERR(msg))
940 err = PTR_ERR(msg);
941 else
942 ceph_con_send(&session->s_con, msg);
943 return err;
944}
945
946/*
947 * Called with s_mutex held.
948 */
949static int __close_session(struct ceph_mds_client *mdsc,
950 struct ceph_mds_session *session)
951{
952 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
953 return 0;
954 session->s_state = CEPH_MDS_SESSION_CLOSING;
955 return request_close_session(mdsc, session);
956}
957
958/*
959 * Trim old(er) caps.
960 *
961 * Because we can't cache an inode without one or more caps, we do
962 * this indirectly: if a cap is unused, we prune its aliases, at which
963 * point the inode will hopefully get dropped too.
964 *
965 * Yes, this is a bit sloppy. Our only real goal here is to respond to
966 * memory pressure from the MDS, though, so it needn't be perfect.
967 */
968static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
969{
970 struct ceph_mds_session *session = arg;
971 struct ceph_inode_info *ci = ceph_inode(inode);
972 int used, oissued, mine;
973
974 if (session->s_trim_caps <= 0)
975 return -1;
976
977 spin_lock(&inode->i_lock);
978 mine = cap->issued | cap->implemented;
979 used = __ceph_caps_used(ci);
980 oissued = __ceph_caps_issued_other(ci, cap);
981
982 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
983 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
984 ceph_cap_string(used));
985 if (ci->i_dirty_caps)
986 goto out; /* dirty caps */
987 if ((used & ~oissued) & mine)
988 goto out; /* we need these caps */
989
990 session->s_trim_caps--;
991 if (oissued) {
992 /* we aren't the only cap.. just remove us */
993 __ceph_remove_cap(cap);
994 } else {
995 /* try to drop referring dentries */
996 spin_unlock(&inode->i_lock);
997 d_prune_aliases(inode);
998 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
999 inode, cap, atomic_read(&inode->i_count));
1000 return 0;
1001 }
1002
1003out:
1004 spin_unlock(&inode->i_lock);
1005 return 0;
1006}
1007
1008/*
1009 * Trim session cap count down to some max number.
1010 */
1011static int trim_caps(struct ceph_mds_client *mdsc,
1012 struct ceph_mds_session *session,
1013 int max_caps)
1014{
1015 int trim_caps = session->s_nr_caps - max_caps;
1016
1017 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1018 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1019 if (trim_caps > 0) {
1020 session->s_trim_caps = trim_caps;
1021 iterate_session_caps(session, trim_caps_cb, session);
1022 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1023 session->s_mds, session->s_nr_caps, max_caps,
1024 trim_caps - session->s_trim_caps);
1025 session->s_trim_caps = 0;
1026 }
1027 return 0;
1028}
1029
1030/*
1031 * Allocate cap_release messages. If there is a partially full message
1032 * in the queue, try to allocate enough to cover its remainder, so that
1033 * we can send it immediately.
1034 *
1035 * Called under s_mutex.
1036 */
1037static int add_cap_releases(struct ceph_mds_client *mdsc,
1038 struct ceph_mds_session *session,
1039 int extra)
1040{
1041 struct ceph_msg *msg;
1042 struct ceph_mds_cap_release *head;
1043 int err = -ENOMEM;
1044
1045 if (extra < 0)
1046 extra = mdsc->client->mount_args->cap_release_safety;
1047
1048 spin_lock(&session->s_cap_lock);
1049
1050 if (!list_empty(&session->s_cap_releases)) {
1051 msg = list_first_entry(&session->s_cap_releases,
1052 struct ceph_msg,
1053 list_head);
1054 head = msg->front.iov_base;
1055 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1056 }
1057
1058 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1059 spin_unlock(&session->s_cap_lock);
1060 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1061 0, 0, NULL);
1062 if (!msg)
1063 goto out_unlocked;
1064 dout("add_cap_releases %p msg %p now %d\n", session, msg,
1065 (int)msg->front.iov_len);
1066 head = msg->front.iov_base;
1067 head->num = cpu_to_le32(0);
1068 msg->front.iov_len = sizeof(*head);
1069 spin_lock(&session->s_cap_lock);
1070 list_add(&msg->list_head, &session->s_cap_releases);
1071 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1072 }
1073
1074 if (!list_empty(&session->s_cap_releases)) {
1075 msg = list_first_entry(&session->s_cap_releases,
1076 struct ceph_msg,
1077 list_head);
1078 head = msg->front.iov_base;
1079 if (head->num) {
1080 dout(" queueing non-full %p (%d)\n", msg,
1081 le32_to_cpu(head->num));
1082 list_move_tail(&msg->list_head,
1083 &session->s_cap_releases_done);
1084 session->s_num_cap_releases -=
1085 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1086 }
1087 }
1088 err = 0;
1089 spin_unlock(&session->s_cap_lock);
1090out_unlocked:
1091 return err;
1092}
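
/*
 * Accounting sketch for the above: s_num_cap_releases counts the
 * preallocated release slots across all queued messages. The loop
 * allocates one PAGE_CACHE_SIZE message (CEPH_CAPS_PER_RELEASE slots;
 * see mds_client.h) at a time until
 *
 *	s_num_cap_releases >= s_nr_caps + extra
 *
 * i.e., until every cap we currently hold could be released without
 * further allocation; any partially filled message is then moved to
 * s_cap_releases_done so it can be sent immediately.
 */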
1093
1094/*
1095 * check whether dirty inode (cap) data has been flushed through to the mds.
1096 *
1097 * returns true if we've flushed through want_flush_seq
1098 */
1099static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1100{
1101 int mds, ret = 1;
1102
1103 dout("check_cap_flush want %lld\n", want_flush_seq);
1104 mutex_lock(&mdsc->mutex);
1105 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
1106 struct ceph_mds_session *session = mdsc->sessions[mds];
1107
1108 if (!session)
1109 continue;
1110 get_session(session);
1111 mutex_unlock(&mdsc->mutex);
1112
1113 mutex_lock(&session->s_mutex);
1114 if (!list_empty(&session->s_cap_flushing)) {
1115 struct ceph_inode_info *ci =
1116 list_entry(session->s_cap_flushing.next,
1117 struct ceph_inode_info,
1118 i_flushing_item);
1119 struct inode *inode = &ci->vfs_inode;
1120
1121 spin_lock(&inode->i_lock);
1122 if (ci->i_cap_flush_seq <= want_flush_seq) {
1123 dout("check_cap_flush still flushing %p "
1124 "seq %lld <= %lld to mds%d\n", inode,
1125 ci->i_cap_flush_seq, want_flush_seq,
1126 session->s_mds);
1127 ret = 0;
1128 }
1129 spin_unlock(&inode->i_lock);
1130 }
1131 mutex_unlock(&session->s_mutex);
1132 ceph_put_mds_session(session);
1133
1134 if (!ret)
1135 return ret;
1136 mutex_lock(&mdsc->mutex);
1137 }
1138
1139 mutex_unlock(&mdsc->mutex);
1140 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1141 return ret;
1142}
1143
1144/*
1145 * called under s_mutex
1146 */
1147static void send_cap_releases(struct ceph_mds_client *mdsc,
1148 struct ceph_mds_session *session)
1149{
1150 struct ceph_msg *msg;
1151
1152 dout("send_cap_releases mds%d\n", session->s_mds);
1153 while (1) {
1154 spin_lock(&session->s_cap_lock);
1155 if (list_empty(&session->s_cap_releases_done))
1156 break;
1157 msg = list_first_entry(&session->s_cap_releases_done,
1158 struct ceph_msg, list_head);
1159 list_del_init(&msg->list_head);
1160 spin_unlock(&session->s_cap_lock);
1161 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1162 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1163 ceph_con_send(&session->s_con, msg);
1164 }
1165 spin_unlock(&session->s_cap_lock);
1166}
1167
1168/*
1169 * requests
1170 */
1171
1172/*
1173 * Create an mds request.
1174 */
1175struct ceph_mds_request *
1176ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1177{
1178 struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1179
1180 if (!req)
1181 return ERR_PTR(-ENOMEM);
1182
1183 req->r_started = jiffies;
1184 req->r_resend_mds = -1;
1185 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1186 req->r_fmode = -1;
1187 kref_init(&req->r_kref);
1188 INIT_LIST_HEAD(&req->r_wait);
1189 init_completion(&req->r_completion);
1190 init_completion(&req->r_safe_completion);
1191 INIT_LIST_HEAD(&req->r_unsafe_item);
1192
1193 req->r_op = op;
1194 req->r_direct_mode = mode;
1195 return req;
1196}
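
/*
 * Typical lifecycle, modeled on callers elsewhere in fs/ceph (sketch
 * only; the op and fields are illustrative, error handling elided):
 *
 *	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUP,
 *				       USE_ANY_MDS);
 *	if (IS_ERR(req))
 *		return PTR_ERR(req);
 *	req->r_dentry = dget(dentry);      /* what to look up */
 *	err = ceph_mdsc_do_request(mdsc, dir, req);
 *	ceph_mdsc_put_request(req);        /* drop our reference */
 */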
1197
1198/*
1199 * return oldest (lowest tid) request in the request tree, or NULL if none.
1200 *
1201 * called under mdsc->mutex.
1202 */
1203static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1204{
1205 if (RB_EMPTY_ROOT(&mdsc->request_tree))
1206 return NULL;
1207 return rb_entry(rb_first(&mdsc->request_tree),
1208 struct ceph_mds_request, r_node);
1209}
1210
1211static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1212{
1213 struct ceph_mds_request *req = __get_oldest_req(mdsc);
1214
1215 if (req)
1216 return req->r_tid;
1217 return 0;
1218}
1219
1220/*
1221 * Build a dentry's path. Allocate on heap; caller must kfree. Based
1222 * on build_path_from_dentry in fs/cifs/dir.c.
1223 *
1224 * If @stop_on_nosnap, generate path relative to the first non-snapped
1225 * inode.
1226 *
1227 * Encode hidden .snap dirs as a double /, i.e.
1228 * foo/.snap/bar -> foo//bar
1229 */
1230char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1231 int stop_on_nosnap)
1232{
1233 struct dentry *temp;
1234 char *path;
1235 int len, pos;
1236
1237 if (dentry == NULL)
1238 return ERR_PTR(-EINVAL);
1239
1240retry:
1241 len = 0;
1242 for (temp = dentry; !IS_ROOT(temp);) {
1243 struct inode *inode = temp->d_inode;
1244 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1245 len++; /* slash only */
1246 else if (stop_on_nosnap && inode &&
1247 ceph_snap(inode) == CEPH_NOSNAP)
1248 break;
1249 else
1250 len += 1 + temp->d_name.len;
1251 temp = temp->d_parent;
1252 if (temp == NULL) {
1253 pr_err("build_path_dentry corrupt dentry %p\n", dentry);
1254 return ERR_PTR(-EINVAL);
1255 }
1256 }
1257 if (len)
1258 len--; /* no leading '/' */
1259
1260 path = kmalloc(len+1, GFP_NOFS);
1261 if (path == NULL)
1262 return ERR_PTR(-ENOMEM);
1263 pos = len;
1264 path[pos] = 0; /* trailing null */
1265 for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1266 struct inode *inode = temp->d_inode;
1267
1268 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1269 dout("build_path_dentry path+%d: %p SNAPDIR\n",
1270 pos, temp);
1271 } else if (stop_on_nosnap && inode &&
1272 ceph_snap(inode) == CEPH_NOSNAP) {
1273 break;
1274 } else {
1275 pos -= temp->d_name.len;
1276 if (pos < 0)
1277 break;
1278 strncpy(path + pos, temp->d_name.name,
1279 temp->d_name.len);
1280 dout("build_path_dentry path+%d: %p '%.*s'\n",
1281 pos, temp, temp->d_name.len, path + pos);
1282 }
1283 if (pos)
1284 path[--pos] = '/';
1285 temp = temp->d_parent;
1286 if (temp == NULL) {
1287 pr_err("build_path_dentry corrupt dentry\n");
1288 kfree(path);
1289 return ERR_PTR(-EINVAL);
1290 }
1291 }
1292 if (pos != 0) {
1293 pr_err("build_path_dentry did not end path lookup where "
1294 "expected, namelen is %d, pos is %d\n", len, pos);
1295 /* presumably this is only possible if racing with a
1296 rename of one of the parent directories (we cannot
1297 lock the dentries above us to prevent this, but
1298 retrying should be harmless) */
1299 kfree(path);
1300 goto retry;
1301 }
1302
1303 *base = ceph_ino(temp->d_inode);
1304 *plen = len;
1305 dout("build_path_dentry on %p %d built %llx '%.*s'\n",
1306 dentry, atomic_read(&dentry->d_count), *base, len, path);
1307 return path;
1308}
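
/*
 * Example (sketch; error handling elided): for a dentry at
 * foo/.snap/bar directly under the root,
 *
 *	char *path; int len; u64 base;
 *	path = ceph_mdsc_build_path(dentry, &len, &base, 0);
 *
 * yields path = "foo//bar", len = 8, and base = the root's ino; the
 * caller must kfree(path).
 */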
1309
1310static int build_dentry_path(struct dentry *dentry,
1311 const char **ppath, int *ppathlen, u64 *pino,
1312 int *pfreepath)
1313{
1314 char *path;
1315
1316 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
1317 *pino = ceph_ino(dentry->d_parent->d_inode);
1318 *ppath = dentry->d_name.name;
1319 *ppathlen = dentry->d_name.len;
1320 return 0;
1321 }
1322 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1323 if (IS_ERR(path))
1324 return PTR_ERR(path);
1325 *ppath = path;
1326 *pfreepath = 1;
1327 return 0;
1328}
1329
1330static int build_inode_path(struct inode *inode,
1331 const char **ppath, int *ppathlen, u64 *pino,
1332 int *pfreepath)
1333{
1334 struct dentry *dentry;
1335 char *path;
1336
1337 if (ceph_snap(inode) == CEPH_NOSNAP) {
1338 *pino = ceph_ino(inode);
1339 *ppathlen = 0;
1340 return 0;
1341 }
1342 dentry = d_find_alias(inode);
1343 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1344 dput(dentry);
1345 if (IS_ERR(path))
1346 return PTR_ERR(path);
1347 *ppath = path;
1348 *pfreepath = 1;
1349 return 0;
1350}
1351
1352/*
1353 * request arguments may be specified via an inode *, a dentry *, or
1354 * an explicit ino+path.
1355 */
1356static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1357 const char *rpath, u64 rino,
1358 const char **ppath, int *pathlen,
1359 u64 *ino, int *freepath)
1360{
1361 int r = 0;
1362
1363 if (rinode) {
1364 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1365 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1366 ceph_snap(rinode));
1367 } else if (rdentry) {
1368 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1369 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1370 *ppath);
1371 } else if (rpath) {
1372 *ino = rino;
1373 *ppath = rpath;
1374 *pathlen = strlen(rpath);
1375 dout(" path %.*s\n", *pathlen, rpath);
1376 }
1377
1378 return r;
1379}
1380
1381/*
1382 * called under mdsc->mutex
1383 */
1384static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1385 struct ceph_mds_request *req,
1386 int mds)
1387{
1388 struct ceph_msg *msg;
1389 struct ceph_mds_request_head *head;
1390 const char *path1 = NULL;
1391 const char *path2 = NULL;
1392 u64 ino1 = 0, ino2 = 0;
1393 int pathlen1 = 0, pathlen2 = 0;
1394 int freepath1 = 0, freepath2 = 0;
1395 int len;
1396 u16 releases;
1397 void *p, *end;
1398 int ret;
1399
1400 ret = set_request_path_attr(req->r_inode, req->r_dentry,
1401 req->r_path1, req->r_ino1.ino,
1402 &path1, &pathlen1, &ino1, &freepath1);
1403 if (ret < 0) {
1404 msg = ERR_PTR(ret);
1405 goto out;
1406 }
1407
1408 ret = set_request_path_attr(NULL, req->r_old_dentry,
1409 req->r_path2, req->r_ino2.ino,
1410 &path2, &pathlen2, &ino2, &freepath2);
1411 if (ret < 0) {
1412 msg = ERR_PTR(ret);
1413 goto out_free1;
1414 }
1415
1416 len = sizeof(*head) +
1417 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
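 /*
 * Each encoded filepath is a one-byte encoding version, a u64 ino,
 * and a length-prefixed (u32) string (see ceph_encode_filepath());
 * hence the 2*(1 + sizeof(u32) + sizeof(u64)) term above for the two
 * (possibly empty) paths.
 */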
1418
1419 /* calculate (max) length for cap releases */
1420 len += sizeof(struct ceph_mds_request_release) *
1421 (!!req->r_inode_drop + !!req->r_dentry_drop +
1422 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1423 if (req->r_dentry_drop)
1424 len += req->r_dentry->d_name.len;
1425 if (req->r_old_dentry_drop)
1426 len += req->r_old_dentry->d_name.len;
1427
1428 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
1429 if (IS_ERR(msg))
1430 goto out_free2;
1431
1432 msg->hdr.tid = cpu_to_le64(req->r_tid);
1433
1434 head = msg->front.iov_base;
1435 p = msg->front.iov_base + sizeof(*head);
1436 end = msg->front.iov_base + msg->front.iov_len;
1437
1438 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1439 head->op = cpu_to_le32(req->r_op);
1440 head->caller_uid = cpu_to_le32(current_fsuid());
1441 head->caller_gid = cpu_to_le32(current_fsgid());
1442 head->args = req->r_args;
1443
1444 ceph_encode_filepath(&p, end, ino1, path1);
1445 ceph_encode_filepath(&p, end, ino2, path2);
1446
1447 /* cap releases */
1448 releases = 0;
1449 if (req->r_inode_drop)
1450 releases += ceph_encode_inode_release(&p,
1451 req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1452 mds, req->r_inode_drop, req->r_inode_unless, 0);
1453 if (req->r_dentry_drop)
1454 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1455 mds, req->r_dentry_drop, req->r_dentry_unless);
1456 if (req->r_old_dentry_drop)
1457 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1458 mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1459 if (req->r_old_inode_drop)
1460 releases += ceph_encode_inode_release(&p,
1461 req->r_old_dentry->d_inode,
1462 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1463 head->num_releases = cpu_to_le16(releases);
1464
1465 BUG_ON(p > end);
1466 msg->front.iov_len = p - msg->front.iov_base;
1467 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1468
1469 msg->pages = req->r_pages;
1470 msg->nr_pages = req->r_num_pages;
1471 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1472 msg->hdr.data_off = cpu_to_le16(0);
1473
1474out_free2:
1475 if (freepath2)
1476 kfree((char *)path2);
1477out_free1:
1478 if (freepath1)
1479 kfree((char *)path1);
1480out:
1481 return msg;
1482}
1483
1484/*
1485 * called under mdsc->mutex if error, under no mutex if
1486 * success.
1487 */
1488static void complete_request(struct ceph_mds_client *mdsc,
1489 struct ceph_mds_request *req)
1490{
1491 if (req->r_callback)
1492 req->r_callback(mdsc, req);
1493 else
1494 complete(&req->r_completion);
1495}
1496
1497/*
1498 * called under mdsc->mutex
1499 */
1500static int __prepare_send_request(struct ceph_mds_client *mdsc,
1501 struct ceph_mds_request *req,
1502 int mds)
1503{
1504 struct ceph_mds_request_head *rhead;
1505 struct ceph_msg *msg;
1506 int flags = 0;
1507
1508 req->r_mds = mds;
1509 req->r_attempts++;
1510 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1511 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1512
1513 if (req->r_request) {
1514 ceph_msg_put(req->r_request);
1515 req->r_request = NULL;
1516 }
1517 msg = create_request_message(mdsc, req, mds);
1518 if (IS_ERR(msg)) {
1519 req->r_reply = ERR_CAST(msg);
1520 complete_request(mdsc, req);
1521 return PTR_ERR(msg);
1522 }
1523 req->r_request = msg;
1524
1525 rhead = msg->front.iov_base;
1526 rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1527 if (req->r_got_unsafe)
1528 flags |= CEPH_MDS_FLAG_REPLAY;
1529 if (req->r_locked_dir)
1530 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1531 rhead->flags = cpu_to_le32(flags);
1532 rhead->num_fwd = req->r_num_fwd;
1533 rhead->num_retry = req->r_attempts - 1;
1534
1535 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1536
1537 if (req->r_target_inode && req->r_got_unsafe)
1538 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1539 else
1540 rhead->ino = 0;
1541 return 0;
1542}
1543
1544/*
1545 * send request, or put it on the appropriate wait list.
1546 */
1547static int __do_request(struct ceph_mds_client *mdsc,
1548 struct ceph_mds_request *req)
1549{
1550 struct ceph_mds_session *session = NULL;
1551 int mds = -1;
1552 int err = -EAGAIN;
1553
1554 if (req->r_reply)
1555 goto out;
1556
1557 if (req->r_timeout &&
1558 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1559 dout("do_request timed out\n");
1560 err = -EIO;
1561 goto finish;
1562 }
1563
1564 mds = __choose_mds(mdsc, req);
1565 if (mds < 0 ||
1566 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1567 dout("do_request no mds or not active, waiting for map\n");
1568 list_add(&req->r_wait, &mdsc->waiting_for_map);
1569 goto out;
1570 }
1571
1572 /* get, open session */
1573 session = __ceph_lookup_mds_session(mdsc, mds);
1574 if (!session) {
1575 session = register_session(mdsc, mds);
1576 if (IS_ERR(session)) {
1577 err = PTR_ERR(session);
1578 goto finish;
1579 }
1580 }
1581 dout("do_request mds%d session %p state %s\n", mds, session,
1582 session_state_name(session->s_state));
1583 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1584 session->s_state != CEPH_MDS_SESSION_HUNG) {
1585 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1586 session->s_state == CEPH_MDS_SESSION_CLOSING)
1587 __open_session(mdsc, session);
1588 list_add(&req->r_wait, &session->s_waiting);
1589 goto out_session;
1590 }
1591
1592 /* send request */
1593 req->r_session = get_session(session);
1594 req->r_resend_mds = -1; /* forget any previous mds hint */
1595
1596 if (req->r_request_started == 0) /* note request start time */
1597 req->r_request_started = jiffies;
1598
1599 err = __prepare_send_request(mdsc, req, mds);
1600 if (!err) {
1601 ceph_msg_get(req->r_request);
1602 ceph_con_send(&session->s_con, req->r_request);
1603 }
1604
1605out_session:
1606 ceph_put_mds_session(session);
1607out:
1608 return err;
1609
1610finish:
1611 req->r_reply = ERR_PTR(err);
1612 complete_request(mdsc, req);
1613 goto out;
1614}
1615
1616/*
1617 * called under mdsc->mutex
1618 */
1619static void __wake_requests(struct ceph_mds_client *mdsc,
1620 struct list_head *head)
1621{
1622 struct ceph_mds_request *req, *nreq;
1623
1624 list_for_each_entry_safe(req, nreq, head, r_wait) {
1625 list_del_init(&req->r_wait);
1626 __do_request(mdsc, req);
1627 }
1628}
1629
1630/*
1631 * Wake up threads with requests pending for @mds, so that they can
1632 * resubmit their requests to a possibly different mds. If @all is set,
1633 * wake up if their requests have been forwarded to @mds, too.
1634 */
1635static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
1636{
1637 struct ceph_mds_request *req;
1638 struct rb_node *p;
1639
1640 dout("kick_requests mds%d\n", mds);
1641 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
1642 req = rb_entry(p, struct ceph_mds_request, r_node);
1643 if (req->r_got_unsafe)
1644 continue;
1645 if (req->r_session &&
1646 req->r_session->s_mds == mds) {
1647 dout(" kicking tid %llu\n", req->r_tid);
1648 put_request_session(req);
1649 __do_request(mdsc, req);
1650 }
1651 }
1652}
1653
1654void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
1655 struct ceph_mds_request *req)
1656{
1657 dout("submit_request on %p\n", req);
1658 mutex_lock(&mdsc->mutex);
1659 __register_request(mdsc, req, NULL);
1660 __do_request(mdsc, req);
1661 mutex_unlock(&mdsc->mutex);
1662}
1663
1664/*
1665 * Synchronously perform an mds request. Take care of all of the
1666 * session setup, forwarding, and retry details.
1667 */
1668int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1669 struct inode *dir,
1670 struct ceph_mds_request *req)
1671{
1672 int err;
1673
1674 dout("do_request on %p\n", req);
1675
1676 /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
1677 if (req->r_inode)
1678 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
1679 if (req->r_locked_dir)
1680 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1681 if (req->r_old_dentry)
1682 ceph_get_cap_refs(
1683 ceph_inode(req->r_old_dentry->d_parent->d_inode),
1684 CEPH_CAP_PIN);
1685
1686 /* issue */
1687 mutex_lock(&mdsc->mutex);
1688 __register_request(mdsc, req, dir);
1689 __do_request(mdsc, req);
1690
1691 /* wait */
1692 if (!req->r_reply) {
1693 mutex_unlock(&mdsc->mutex);
1694 if (req->r_timeout) {
1695 err = (long)wait_for_completion_interruptible_timeout(
1696 &req->r_completion, req->r_timeout);
1697 if (err == 0)
1698 req->r_reply = ERR_PTR(-EIO);
1699 else if (err < 0)
1700 req->r_reply = ERR_PTR(err);
1701 } else {
1702 err = wait_for_completion_interruptible(
1703 &req->r_completion);
1704 if (err)
1705 req->r_reply = ERR_PTR(err);
1706 }
1707 mutex_lock(&mdsc->mutex);
1708 }
1709
1710 if (IS_ERR(req->r_reply)) {
1711 err = PTR_ERR(req->r_reply);
1712 req->r_reply = NULL;
1713
1714 if (err == -ERESTARTSYS) {
1715 /* aborted */
1716 req->r_aborted = true;
1717
1718 if (req->r_locked_dir &&
1719 (req->r_op & CEPH_MDS_OP_WRITE)) {
1720 struct ceph_inode_info *ci =
1721 ceph_inode(req->r_locked_dir);
1722
1723 dout("aborted, clearing I_COMPLETE on %p\n",
1724 req->r_locked_dir);
1725 spin_lock(&req->r_locked_dir->i_lock);
1726 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1727 ci->i_release_count++;
1728 spin_unlock(&req->r_locked_dir->i_lock);
1729 }
1730 } else {
1731 /* clean up this request */
1732 __unregister_request(mdsc, req);
1733 if (!list_empty(&req->r_unsafe_item))
1734 list_del_init(&req->r_unsafe_item);
1735 complete(&req->r_safe_completion);
1736 }
1737 } else if (req->r_err) {
1738 err = req->r_err;
1739 } else {
1740 err = le32_to_cpu(req->r_reply_info.head->result);
1741 }
1742 mutex_unlock(&mdsc->mutex);
1743
1744 dout("do_request %p done, result %d\n", req, err);
1745 return err;
1746}
1747
1748/*
1749 * Handle mds reply.
1750 *
1751 * We take the session mutex and parse and process the reply immediately.
1752 * This preserves the logical ordering of replies, capabilities, etc., sent
1753 * by the MDS as they are applied to our local cache.
1754 */
1755static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1756{
1757 struct ceph_mds_client *mdsc = session->s_mdsc;
1758 struct ceph_mds_request *req;
1759 struct ceph_mds_reply_head *head = msg->front.iov_base;
1760 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
1761 u64 tid;
1762 int err, result;
1763 int mds = session->s_mds;
1764
1765 if (msg->front.iov_len < sizeof(*head)) {
1766 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
1767 ceph_msg_dump(msg);
1768 return;
1769 }
1770
1771 /* get request, session */
1772 tid = le64_to_cpu(msg->hdr.tid);
1773 mutex_lock(&mdsc->mutex);
1774 req = __lookup_request(mdsc, tid);
1775 if (!req) {
1776 dout("handle_reply on unknown tid %llu\n", tid);
1777 mutex_unlock(&mdsc->mutex);
1778 return;
1779 }
1780 dout("handle_reply %p\n", req);
1781
1782 /* correct session? */
1783 if (req->r_session != session) {
1784 pr_err("mdsc_handle_reply got %llu on session mds%d"
1785 " not mds%d\n", tid, session->s_mds,
1786 req->r_session ? req->r_session->s_mds : -1);
1787 mutex_unlock(&mdsc->mutex);
1788 goto out;
1789 }
1790
1791 /* dup? */
1792 if ((req->r_got_unsafe && !head->safe) ||
1793 (req->r_got_safe && head->safe)) {
1794 pr_warning("got a dup %s reply on %llu from mds%d\n",
1795 head->safe ? "safe" : "unsafe", tid, mds);
1796 mutex_unlock(&mdsc->mutex);
1797 goto out;
1798 }
1799
1800 result = le32_to_cpu(head->result);
1801
1802 /*
1803 * Tolerate 2 consecutive ESTALEs from the same mds.
1804 * FIXME: we should be looking at the cap migrate_seq.
1805 */
1806 if (result == -ESTALE) {
1807 req->r_direct_mode = USE_AUTH_MDS;
1808 req->r_num_stale++;
1809 if (req->r_num_stale <= 2) {
1810 __do_request(mdsc, req);
1811 mutex_unlock(&mdsc->mutex);
1812 goto out;
1813 }
1814 } else {
1815 req->r_num_stale = 0;
1816 }
1817
1818 if (head->safe) {
1819 req->r_got_safe = true;
1820 __unregister_request(mdsc, req);
1821 complete(&req->r_safe_completion);
1822
1823 if (req->r_got_unsafe) {
1824 /*
1825 * We already handled the unsafe response, now do the
1826 * cleanup. No need to examine the response; the MDS
1827 * doesn't include any result info in the safe
1828 * response. And even if it did, there is nothing
1829 * useful we could do with a revised return value.
1830 */
1831 dout("got safe reply %llu, mds%d\n", tid, mds);
1832 list_del_init(&req->r_unsafe_item);
1833
1834 /* last unsafe request during umount? */
1835 if (mdsc->stopping && !__get_oldest_req(mdsc))
1836 complete(&mdsc->safe_umount_waiters);
1837 mutex_unlock(&mdsc->mutex);
1838 goto out;
1839 }
1840 }
1841
1842 BUG_ON(req->r_reply);
1843
1844 if (!head->safe) {
1845 req->r_got_unsafe = true;
1846 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1847 }
1848
1849 dout("handle_reply tid %lld result %d\n", tid, result);
1850 rinfo = &req->r_reply_info;
1851 err = parse_reply_info(msg, rinfo);
1852 mutex_unlock(&mdsc->mutex);
1853
1854 mutex_lock(&session->s_mutex);
1855 if (err < 0) {
1856 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
1857 ceph_msg_dump(msg);
1858 goto out_err;
1859 }
1860
1861 /* snap trace */
1862 if (rinfo->snapblob_len) {
1863 down_write(&mdsc->snap_rwsem);
1864 ceph_update_snap_trace(mdsc, rinfo->snapblob,
1865 rinfo->snapblob + rinfo->snapblob_len,
1866 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
1867 downgrade_write(&mdsc->snap_rwsem);
1868 } else {
1869 down_read(&mdsc->snap_rwsem);
1870 }
1871
1872 /* insert trace into our cache */
1873 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1874 if (err == 0) {
1875 if (result == 0 && rinfo->dir_nr)
1876 ceph_readdir_prepopulate(req, req->r_session);
1877 ceph_unreserve_caps(&req->r_caps_reservation);
1878 }
1879
1880 up_read(&mdsc->snap_rwsem);
1881out_err:
1882 if (err) {
1883 req->r_err = err;
1884 } else {
1885 req->r_reply = msg;
1886 ceph_msg_get(msg);
1887 }
1888
1889 add_cap_releases(mdsc, req->r_session, -1);
1890 mutex_unlock(&session->s_mutex);
1891
1892 /* kick calling process */
1893 complete_request(mdsc, req);
1894out:
1895 ceph_mdsc_put_request(req);
1896 return;
1897}
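
/*
 * Reply-flow sketch for a write request (derived from the code above):
 * the MDS typically sends an "unsafe" reply once the operation is
 * applied but not yet journaled -- we set r_got_unsafe and keep the
 * request on s_unsafe -- and later a "safe" reply once it is durably
 * committed, at which point we unregister the request and complete
 * r_safe_completion so ceph_mdsc_sync() can make progress.
 */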
1898
1899
1900
1901/*
1902 * handle mds notification that our request has been forwarded.
1903 */
1904static void handle_forward(struct ceph_mds_client *mdsc,
1905 struct ceph_mds_session *session,
1906 struct ceph_msg *msg)
1907{
1908 struct ceph_mds_request *req;
1909 u64 tid = le64_to_cpu(msg->hdr.tid);
1910 u32 next_mds;
1911 u32 fwd_seq;
1912 int err = -EINVAL;
1913 void *p = msg->front.iov_base;
1914 void *end = p + msg->front.iov_len;
1915
1916 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1917 next_mds = ceph_decode_32(&p);
1918 fwd_seq = ceph_decode_32(&p);
1919
1920 mutex_lock(&mdsc->mutex);
1921 req = __lookup_request(mdsc, tid);
1922 if (!req) {
1923 dout("forward %llu to mds%d - req dne\n", tid, next_mds);
1924 goto out; /* dup reply? */
1925 }
1926
1927 if (fwd_seq <= req->r_num_fwd) {
1928 dout("forward %llu to mds%d - old seq %d <= %d\n",
1929 tid, next_mds, req->r_num_fwd, fwd_seq);
1930 } else {
1931 /* resend. forward race not possible; mds would drop */
1932 dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
1933 req->r_num_fwd = fwd_seq;
1934 req->r_resend_mds = next_mds;
1935 put_request_session(req);
1936 __do_request(mdsc, req);
1937 }
1938 ceph_mdsc_put_request(req);
1939out:
1940 mutex_unlock(&mdsc->mutex);
1941 return;
1942
1943bad:
1944 pr_err("mdsc_handle_forward decode error err=%d\n", err);
1945}
1946
1947/*
1948 * handle a mds session control message
1949 */
1950static void handle_session(struct ceph_mds_session *session,
1951 struct ceph_msg *msg)
1952{
1953 struct ceph_mds_client *mdsc = session->s_mdsc;
1954 u32 op;
1955 u64 seq;
1956 int mds = session->s_mds;
1957 struct ceph_mds_session_head *h = msg->front.iov_base;
1958 int wake = 0;
1959
1960 /* decode */
1961 if (msg->front.iov_len != sizeof(*h))
1962 goto bad;
1963 op = le32_to_cpu(h->op);
1964 seq = le64_to_cpu(h->seq);
1965
1966 mutex_lock(&mdsc->mutex);
1967 if (op == CEPH_SESSION_CLOSE)
1968 __unregister_session(mdsc, session);
1969 /* FIXME: this ttl calculation is generous */
1970 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
1971 mutex_unlock(&mdsc->mutex);
1972
1973 mutex_lock(&session->s_mutex);
1974
1975 dout("handle_session mds%d %s %p state %s seq %llu\n",
1976 mds, ceph_session_op_name(op), session,
1977 session_state_name(session->s_state), seq);
1978
1979 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
1980 session->s_state = CEPH_MDS_SESSION_OPEN;
1981 pr_info("mds%d came back\n", session->s_mds);
1982 }
1983
1984 switch (op) {
1985 case CEPH_SESSION_OPEN:
1986 session->s_state = CEPH_MDS_SESSION_OPEN;
1987 renewed_caps(mdsc, session, 0);
1988 wake = 1;
1989 if (mdsc->stopping)
1990 __close_session(mdsc, session);
1991 break;
1992
1993 case CEPH_SESSION_RENEWCAPS:
1994 if (session->s_renew_seq == seq)
1995 renewed_caps(mdsc, session, 1);
1996 break;
1997
1998 case CEPH_SESSION_CLOSE:
1999 remove_session_caps(session);
2000 wake = 1; /* for good measure */
2001 complete(&mdsc->session_close_waiters);
2002 kick_requests(mdsc, mds, 0); /* cur only */
2003 break;
2004
2005 case CEPH_SESSION_STALE:
2006 pr_info("mds%d caps went stale, renewing\n",
2007 session->s_mds);
2008 spin_lock(&session->s_cap_lock);
2009 session->s_cap_gen++;
2010 session->s_cap_ttl = 0;
2011 spin_unlock(&session->s_cap_lock);
2012 send_renew_caps(mdsc, session);
2013 break;
2014
2015 case CEPH_SESSION_RECALL_STATE:
2016 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2017 break;
2018
2019 default:
2020 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2021 WARN_ON(1);
2022 }
2023
2024 mutex_unlock(&session->s_mutex);
2025 if (wake) {
2026 mutex_lock(&mdsc->mutex);
2027 __wake_requests(mdsc, &session->s_waiting);
2028 mutex_unlock(&mdsc->mutex);
2029 }
2030 return;
2031
2032bad:
2033 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2034 (int)msg->front.iov_len);
2035 ceph_msg_dump(msg);
2036 return;
2037}
2038
2039
2040/*
2041 * called under session->mutex.
2042 */
2043static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2044 struct ceph_mds_session *session)
2045{
2046 struct ceph_mds_request *req, *nreq;
2047 int err;
2048
2049 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2050
2051 mutex_lock(&mdsc->mutex);
2052 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2053 err = __prepare_send_request(mdsc, req, session->s_mds);
2054 if (!err) {
2055 ceph_msg_get(req->r_request);
2056 ceph_con_send(&session->s_con, req->r_request);
2057 }
2058 }
2059 mutex_unlock(&mdsc->mutex);
2060}
2061
2062/*
2063 * Encode information about a cap for a reconnect with the MDS.
2064 */
2065static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2066 void *arg)
2067{
2068 struct ceph_mds_cap_reconnect rec;
2069 struct ceph_inode_info *ci;
2070 struct ceph_pagelist *pagelist = arg;
2071 char *path;
2072 int pathlen, err;
2073 u64 pathbase;
2074 struct dentry *dentry;
2075
2076 ci = cap->ci;
2077
2078 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2079 inode, ceph_vinop(inode), cap, cap->cap_id,
2080 ceph_cap_string(cap->issued));
2081 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2082 if (err)
2083 return err;
2084
2085 dentry = d_find_alias(inode);
2086 if (dentry) {
2087 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2088 if (IS_ERR(path)) {
2089 err = PTR_ERR(path);
2090 BUG_ON(err);
2091 }
2092 } else {
2093 path = NULL;
2094 pathlen = 0;
2095 }
2096 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2097 if (err)
2098 goto out;
2099
2100 spin_lock(&inode->i_lock);
2101 cap->seq = 0; /* reset cap seq */
2102 cap->issue_seq = 0; /* and issue_seq */
2103 rec.cap_id = cpu_to_le64(cap->cap_id);
2104 rec.pathbase = cpu_to_le64(pathbase);
2105 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2106 rec.issued = cpu_to_le32(cap->issued);
2107 rec.size = cpu_to_le64(inode->i_size);
2108 ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
2109 ceph_encode_timespec(&rec.atime, &inode->i_atime);
2110 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2111 spin_unlock(&inode->i_lock);
2112
2113 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
2114
2115out:
2116 kfree(path);
2117 dput(dentry);
2118 return err;
2119}
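
/*
 * Per-cap reconnect payload produced above (sketch):
 *
 *	u64	ino
 *	u32	path length, followed by that many path bytes
 *	struct ceph_mds_cap_reconnect as filled in above (cap_id,
 *		wanted, issued, size, mtime, atime, snaprealm, pathbase)
 */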
2120
2121
2122/*
2123 * If an MDS fails and recovers, clients need to reconnect in order to
2124 * reestablish shared state. This includes all caps issued through
2125 * this session _and_ the snap_realm hierarchy. Because it's not
2126 * clear which snap realms the MDS cares about, we send everything we
2127 * know about; that ensures we'll then get any new info the
2128 * recovering MDS might have.
2129 *
2130 * This is a relatively heavyweight operation, but it's rare.
2131 *
2132 * called with mdsc->mutex held.
2133 */
2134static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2135{
2136 struct ceph_mds_session *session = NULL;
2137 struct ceph_msg *reply;
2138 struct rb_node *p;
2139 int err;
2140 struct ceph_pagelist *pagelist;
2141
2142 pr_info("reconnect to recovering mds%d\n", mds);
2143
2144 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2145 if (!pagelist)
2146 goto fail_nopagelist;
2147 ceph_pagelist_init(pagelist);
2148
2149 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
2150 if (IS_ERR(reply)) {
2151 err = PTR_ERR(reply);
2152 goto fail_nomsg;
2153 }
2154
2155 /* find session */
2156 session = __ceph_lookup_mds_session(mdsc, mds);
2157 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2158
2159 if (session) {
2160 mutex_lock(&session->s_mutex);
2161
2162 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2163 session->s_seq = 0;
2164
2165 ceph_con_open(&session->s_con,
2166 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2167
2168 /* replay unsafe requests */
2169 replay_unsafe_requests(mdsc, session);
2170 } else {
2171 dout("no session for mds%d, will send short reconnect\n",
2172 mds);
2173 }
2174
2175 down_read(&mdsc->snap_rwsem);
2176
2177 if (!session)
2178 goto send;
2179 dout("session %p state %s\n", session,
2180 session_state_name(session->s_state));
2181
2182 /* traverse this session's caps */
2183 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2184 if (err)
2185 goto fail;
2186 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2187 if (err < 0)
2188 goto out;
2189
2190 /*
2191 * snaprealms. we provide mds with the ino, seq (version), and
2192 * parent for all of our realms. If the mds has any newer info,
2193 * it will tell us.
2194 */
2195 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2196 struct ceph_snap_realm *realm =
2197 rb_entry(p, struct ceph_snap_realm, node);
2198 struct ceph_mds_snaprealm_reconnect sr_rec;
2199
2200 dout(" adding snap realm %llx seq %lld parent %llx\n",
2201 realm->ino, realm->seq, realm->parent_ino);
2202 sr_rec.ino = cpu_to_le64(realm->ino);
2203 sr_rec.seq = cpu_to_le64(realm->seq);
2204 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2205 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2206 if (err)
2207 goto fail;
2208 }
2209
2210send:
2211 reply->pagelist = pagelist;
2212 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2213 reply->nr_pages = calc_pages_for(0, pagelist->length);
2214 ceph_con_send(&session->s_con, reply);
2215
2216 if (session) {
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 __wake_requests(mdsc, &session->s_waiting);
2219 }
2220
2221out:
2222 up_read(&mdsc->snap_rwsem);
2223 if (session) {
2224 mutex_unlock(&session->s_mutex);
2225 ceph_put_mds_session(session);
2226 }
2227 mutex_lock(&mdsc->mutex);
2228 return;
2229
2230fail:
2231 ceph_msg_put(reply);
2232fail_nomsg:
2233 ceph_pagelist_release(pagelist);
2234 kfree(pagelist);
2235fail_nopagelist:
2236 pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
2237 goto out;
2238}
2239
2240
2241/*
2242 * compare old and new mdsmaps, kicking requests
2243 * and closing out old connections as necessary
2244 *
2245 * called under mdsc->mutex.
2246 */
2247static void check_new_map(struct ceph_mds_client *mdsc,
2248 struct ceph_mdsmap *newmap,
2249 struct ceph_mdsmap *oldmap)
2250{
2251 int i;
2252 int oldstate, newstate;
2253 struct ceph_mds_session *s;
2254
2255 dout("check_new_map new %u old %u\n",
2256 newmap->m_epoch, oldmap->m_epoch);
2257
2258 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2259 if (mdsc->sessions[i] == NULL)
2260 continue;
2261 s = mdsc->sessions[i];
2262 oldstate = ceph_mdsmap_get_state(oldmap, i);
2263 newstate = ceph_mdsmap_get_state(newmap, i);
2264
2265 dout("check_new_map mds%d state %s -> %s (session %s)\n",
2266 i, ceph_mds_state_name(oldstate),
2267 ceph_mds_state_name(newstate),
2268 session_state_name(s->s_state));
2269
2270 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2271 ceph_mdsmap_get_addr(newmap, i),
2272 sizeof(struct ceph_entity_addr))) {
2273 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2274 /* the session never opened, just close it
2275 * out now */
2276 __wake_requests(mdsc, &s->s_waiting);
2277 __unregister_session(mdsc, s);
2278 } else {
2279 /* just close it */
2280 mutex_unlock(&mdsc->mutex);
2281 mutex_lock(&s->s_mutex);
2282 mutex_lock(&mdsc->mutex);
2283 ceph_con_close(&s->s_con);
2284 mutex_unlock(&s->s_mutex);
2285 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2286 }
2287
2288 /* kick any requests waiting on the recovering mds */
2289 kick_requests(mdsc, i, 1);
2290 } else if (oldstate == newstate) {
2291 continue; /* nothing new with this mds */
2292 }
2293
2294 /*
2295 * send reconnect?
2296 */
2297 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2298 newstate >= CEPH_MDS_STATE_RECONNECT)
2299 send_mds_reconnect(mdsc, i);
2300
2301 /*
2302 * kick requests on any mds that has gone active.
2303 *
2304 * kick requests on cur or forwarder: we may have sent
2305 * the request to mds1, mds1 told us it forwarded it
2306 * to mds2, but then we learn mds1 failed and can't be
2307 * sure it successfully forwarded our request before
2308 * it died.
2309 */
2310 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2311 newstate >= CEPH_MDS_STATE_ACTIVE) {
2312 pr_info("mds%d reconnect completed\n", s->s_mds);
2313 kick_requests(mdsc, i, 1);
2314 ceph_kick_flushing_caps(mdsc, s);
2315 wake_up_session_caps(s, 1);
2316 }
2317 }
2318}
2319
2320
2321
2322/*
2323 * leases
2324 */
2325
2326/*
2327 * caller must hold session s_mutex, dentry->d_lock
2328 */
2329void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2330{
2331 struct ceph_dentry_info *di = ceph_dentry(dentry);
2332
2333 ceph_put_mds_session(di->lease_session);
2334 di->lease_session = NULL;
2335}
2336
2337static void handle_lease(struct ceph_mds_client *mdsc,
2338 struct ceph_mds_session *session,
2339 struct ceph_msg *msg)
2340{
2341 struct super_block *sb = mdsc->client->sb;
2342 struct inode *inode;
2343 struct ceph_inode_info *ci;
2344 struct dentry *parent, *dentry;
2345 struct ceph_dentry_info *di;
2346 int mds = session->s_mds;
2347 struct ceph_mds_lease *h = msg->front.iov_base;
2348 struct ceph_vino vino;
2349 int mask;
2350 struct qstr dname;
2351 int release = 0;
2352
2353 dout("handle_lease from mds%d\n", mds);
2354
2355 /* decode */
2356 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2357 goto bad;
2358 vino.ino = le64_to_cpu(h->ino);
2359 vino.snap = CEPH_NOSNAP;
2360 mask = le16_to_cpu(h->mask);
2361 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2362 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2363 if (dname.len != get_unaligned_le32(h+1))
2364 goto bad;
2365
2366 mutex_lock(&session->s_mutex);
2367 session->s_seq++;
2368
2369 /* lookup inode */
2370 inode = ceph_find_inode(sb, vino);
2371 dout("handle_lease '%s', mask %d, ino %llx %p\n",
2372 ceph_lease_op_name(h->action), mask, vino.ino, inode);
2373 if (inode == NULL) {
2374 dout("handle_lease no inode %llx\n", vino.ino);
2375 goto release;
2376 }
2377 ci = ceph_inode(inode);
2378
2379 /* dentry */
2380 parent = d_find_alias(inode);
2381 if (!parent) {
2382 dout("no parent dentry on inode %p\n", inode);
2383 WARN_ON(1);
2384 goto release; /* hrm... */
2385 }
2386 dname.hash = full_name_hash(dname.name, dname.len);
2387 dentry = d_lookup(parent, &dname);
2388 dput(parent);
2389 if (!dentry)
2390 goto release;
2391
2392 spin_lock(&dentry->d_lock);
2393 di = ceph_dentry(dentry);
2394 switch (h->action) {
2395 case CEPH_MDS_LEASE_REVOKE:
2396 if (di && di->lease_session == session) {
2397 h->seq = cpu_to_le32(di->lease_seq);
2398 __ceph_mdsc_drop_dentry_lease(dentry);
2399 }
2400 release = 1;
2401 break;
2402
2403 case CEPH_MDS_LEASE_RENEW:
2404 if (di && di->lease_session == session &&
2405 di->lease_gen == session->s_cap_gen &&
2406 di->lease_renew_from &&
2407 di->lease_renew_after == 0) {
2408 unsigned long duration =
2409 le32_to_cpu(h->duration_ms) * HZ / 1000;
2410
2411 di->lease_seq = le32_to_cpu(h->seq);
2412 dentry->d_time = di->lease_renew_from + duration;
2413 di->lease_renew_after = di->lease_renew_from +
2414 (duration >> 1);
2415 di->lease_renew_from = 0;
2416 }
2417 break;
2418 }
2419 spin_unlock(&dentry->d_lock);
2420 dput(dentry);
2421
2422 if (!release)
2423 goto out;
2424
2425release:
2426 /* let's just reuse the same message */
2427 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2428 ceph_msg_get(msg);
2429 ceph_con_send(&session->s_con, msg);
2430
2431out:
2432 iput(inode);
2433 mutex_unlock(&session->s_mutex);
2434 return;
2435
2436bad:
2437 pr_err("corrupt lease message\n");
2438 ceph_msg_dump(msg);
2439}
2440
2441void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2442 struct inode *inode,
2443 struct dentry *dentry, char action,
2444 u32 seq)
2445{
2446 struct ceph_msg *msg;
2447 struct ceph_mds_lease *lease;
2448 int len = sizeof(*lease) + sizeof(u32);
2449 int dnamelen = 0;
2450
2451 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2452 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2453 dnamelen = dentry->d_name.len;
2454 len += dnamelen;
2455
2456 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
2457 if (IS_ERR(msg))
2458 return;
2459 lease = msg->front.iov_base;
2460 lease->action = action;
2461 lease->mask = cpu_to_le16(CEPH_LOCK_DN);
2462 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2463 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2464 lease->seq = cpu_to_le32(seq);
2465 put_unaligned_le32(dnamelen, lease + 1);
2466 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2467
2468 /*
2469 * if this is a preemptive lease RELEASE, no need to
2470 * flush request stream, since the actual request will
2471 * soon follow.
2472 */
2473 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2474
2475 ceph_con_send(&session->s_con, msg);
2476}
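
/*
 * Wire layout of the lease message built above (sketch):
 *
 *	struct ceph_mds_lease	(action, mask, ino, first/last snap, seq)
 *	__le32			dname_len
 *	char			dname[dname_len]  (not null-terminated)
 */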
2477
2478/*
2479 * Preemptively release a lease we expect to invalidate anyway.
2480 * Pass both @inode and @dentry; only CEPH_LOCK_DN (dentry) leases are handled.
2481 */
2482void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2483 struct dentry *dentry, int mask)
2484{
2485 struct ceph_dentry_info *di;
2486 struct ceph_mds_session *session;
2487 u32 seq;
2488
2489 BUG_ON(inode == NULL);
2490 BUG_ON(dentry == NULL);
2491 BUG_ON(mask != CEPH_LOCK_DN);
2492
2493 /* is dentry lease valid? */
2494 spin_lock(&dentry->d_lock);
2495 di = ceph_dentry(dentry);
2496 if (!di || !di->lease_session ||
2497 di->lease_session->s_mds < 0 ||
2498 di->lease_gen != di->lease_session->s_cap_gen ||
2499 !time_before(jiffies, dentry->d_time)) {
2500 dout("lease_release inode %p dentry %p -- "
2501 "no lease on %d\n",
2502 inode, dentry, mask);
2503 spin_unlock(&dentry->d_lock);
2504 return;
2505 }
2506
2507 /* we do have a lease on this dentry; note mds and seq */
2508 session = ceph_get_mds_session(di->lease_session);
2509 seq = di->lease_seq;
2510 __ceph_mdsc_drop_dentry_lease(dentry);
2511 spin_unlock(&dentry->d_lock);
2512
2513 dout("lease_release inode %p dentry %p mask %d to mds%d\n",
2514 inode, dentry, mask, session->s_mds);
2515 ceph_mdsc_lease_send_msg(session, inode, dentry,
2516 CEPH_MDS_LEASE_RELEASE, seq);
2517 ceph_put_mds_session(session);
2518}
2519
2520/*
2521 * drop all leases (and dentry refs) in preparation for umount
2522 */
2523static void drop_leases(struct ceph_mds_client *mdsc)
2524{
2525 int i;
2526
2527 dout("drop_leases\n");
2528 mutex_lock(&mdsc->mutex);
2529 for (i = 0; i < mdsc->max_sessions; i++) {
2530 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2531 if (!s)
2532 continue;
2533 mutex_unlock(&mdsc->mutex);
2534 mutex_lock(&s->s_mutex);
2535 mutex_unlock(&s->s_mutex);
2536 ceph_put_mds_session(s);
2537 mutex_lock(&mdsc->mutex);
2538 }
2539 mutex_unlock(&mdsc->mutex);
2540}
2541
2542
2543
2544/*
2545 * delayed work -- periodically trim expired leases, renew caps with mds
2546 */
2547static void schedule_delayed(struct ceph_mds_client *mdsc)
2548{
2549 int delay = 5;
2550 unsigned hz = round_jiffies_relative(HZ * delay);
2551 schedule_delayed_work(&mdsc->delayed_work, hz);
2552}
2553
2554static void delayed_work(struct work_struct *work)
2555{
2556 int i;
2557 struct ceph_mds_client *mdsc =
2558 container_of(work, struct ceph_mds_client, delayed_work.work);
2559 int renew_interval;
2560 int renew_caps;
2561
2562 dout("mdsc delayed_work\n");
2563 ceph_check_delayed_caps(mdsc);
2564
2565 mutex_lock(&mdsc->mutex);
2566 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2567 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2568 mdsc->last_renew_caps);
2569 if (renew_caps)
2570 mdsc->last_renew_caps = jiffies;
2571
2572 for (i = 0; i < mdsc->max_sessions; i++) {
2573 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2574 if (s == NULL)
2575 continue;
2576 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2577 dout("resending session close request for mds%d\n",
2578 s->s_mds);
2579 request_close_session(mdsc, s);
2580 ceph_put_mds_session(s);
2581 continue;
2582 }
2583 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2584 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2585 s->s_state = CEPH_MDS_SESSION_HUNG;
2586 pr_info("mds%d hung\n", s->s_mds);
2587 }
2588 }
2589 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2590 /* this mds has failed or is recovering, just wait */
2591 ceph_put_mds_session(s);
2592 continue;
2593 }
2594 mutex_unlock(&mdsc->mutex);
2595
2596 mutex_lock(&s->s_mutex);
2597 if (renew_caps)
2598 send_renew_caps(mdsc, s);
2599 else
2600 ceph_con_keepalive(&s->s_con);
2601 add_cap_releases(mdsc, s, -1);
2602 send_cap_releases(mdsc, s);
2603 mutex_unlock(&s->s_mutex);
2604 ceph_put_mds_session(s);
2605
2606 mutex_lock(&mdsc->mutex);
2607 }
2608 mutex_unlock(&mdsc->mutex);
2609
2610 schedule_delayed(mdsc);
2611}
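
/*
 * Timing sketch: delayed_work fires roughly every 5 seconds (see
 * schedule_delayed()), but caps are only renewed once per
 * m_session_timeout / 4 seconds. Assuming the usual 60 second MDS
 * session timeout, that is a renewal about every 15 seconds --
 * comfortably inside the window before the MDS would consider us
 * stale.
 */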
2612
2613
2614int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2615{
2616 mdsc->client = client;
2617 mutex_init(&mdsc->mutex);
2618 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
 if (mdsc->mdsmap == NULL)
 return -ENOMEM;
2619 init_completion(&mdsc->safe_umount_waiters);
2620 init_completion(&mdsc->session_close_waiters);
2621 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2622 mdsc->sessions = NULL;
2623 mdsc->max_sessions = 0;
2624 mdsc->stopping = 0;
2625 init_rwsem(&mdsc->snap_rwsem);
2626 mdsc->snap_realms = RB_ROOT;
2627 INIT_LIST_HEAD(&mdsc->snap_empty);
2628 spin_lock_init(&mdsc->snap_empty_lock);
2629 mdsc->last_tid = 0;
2630 mdsc->request_tree = RB_ROOT;
2631 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
2632 mdsc->last_renew_caps = jiffies;
2633 INIT_LIST_HEAD(&mdsc->cap_delay_list);
2634 spin_lock_init(&mdsc->cap_delay_lock);
2635 INIT_LIST_HEAD(&mdsc->snap_flush_list);
2636 spin_lock_init(&mdsc->snap_flush_lock);
2637 mdsc->cap_flush_seq = 0;
2638 INIT_LIST_HEAD(&mdsc->cap_dirty);
2639 mdsc->num_cap_flushing = 0;
2640 spin_lock_init(&mdsc->cap_dirty_lock);
2641 init_waitqueue_head(&mdsc->cap_flushing_wq);
2642 spin_lock_init(&mdsc->dentry_lru_lock);
2643 INIT_LIST_HEAD(&mdsc->dentry_lru);
2644 return 0;
2645}
2646
2647/*
2648 * Wait for safe replies on open mds requests. If we time out, drop
2649 * all requests from the tree to avoid dangling dentry refs.
2650 */
2651static void wait_requests(struct ceph_mds_client *mdsc)
2652{
2653 struct ceph_mds_request *req;
2654 struct ceph_client *client = mdsc->client;
2655
2656 mutex_lock(&mdsc->mutex);
2657 if (__get_oldest_req(mdsc)) {
2658 mutex_unlock(&mdsc->mutex);
2659
2660 dout("wait_requests waiting for requests\n");
2661 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2662 client->mount_args->mount_timeout * HZ);
2663
2664 /* tear down remaining requests */
2665 mutex_lock(&mdsc->mutex);
2666 while ((req = __get_oldest_req(mdsc))) {
2667 dout("wait_requests timed out on tid %llu\n",
2668 req->r_tid);
2669 __unregister_request(mdsc, req);
2670 }
2671 }
2672 mutex_unlock(&mdsc->mutex);
2673 dout("wait_requests done\n");
2674}
2675
2676/*
2677 * called before mount is ro, and before dentries are torn down.
2678 * (hmm, does this still race with new lookups?)
2679 */
2680void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2681{
2682 dout("pre_umount\n");
2683 mdsc->stopping = 1;
2684
2685 drop_leases(mdsc);
2686 ceph_flush_dirty_caps(mdsc);
2687 wait_requests(mdsc);
2688}
2689
2690/*
2691 * wait for all write mds requests to flush.
2692 */
2693static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2694{
2695 struct ceph_mds_request *req = NULL, *nextreq;
2696 struct rb_node *n;
2697
2698 mutex_lock(&mdsc->mutex);
2699 dout("wait_unsafe_requests want %lld\n", want_tid);
2700restart:
2701 req = __get_oldest_req(mdsc);
2702 while (req && req->r_tid <= want_tid) {
2703 /* find next request */
2704 n = rb_next(&req->r_node);
2705 if (n)
2706 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
2707 else
2708 nextreq = NULL;
2709 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2710 /* write op */
2711 ceph_mdsc_get_request(req);
2712 if (nextreq)
2713 ceph_mdsc_get_request(nextreq);
2714 mutex_unlock(&mdsc->mutex);
2715 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2716 req->r_tid, want_tid);
2717 wait_for_completion(&req->r_safe_completion);
2718 mutex_lock(&mdsc->mutex);
2719 ceph_mdsc_put_request(req);
2720 if (!nextreq)
2721 break; /* next dne before, so we're done! */
2722 if (RB_EMPTY_NODE(&nextreq->r_node)) {
2723 /* next request was removed from tree */
2724 ceph_mdsc_put_request(nextreq);
2725 goto restart;
2726 }
2727 ceph_mdsc_put_request(nextreq); /* won't go away */
2728 }
2729 req = nextreq;
2730 }
2731 mutex_unlock(&mdsc->mutex);
2732 dout("wait_unsafe_requests done\n");
2733}
2734
2735void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2736{
2737 u64 want_tid, want_flush;
2738
2739 dout("sync\n");
2740 mutex_lock(&mdsc->mutex);
2741 want_tid = mdsc->last_tid;
2742 want_flush = mdsc->cap_flush_seq;
2743 mutex_unlock(&mdsc->mutex);
2744 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
2745
2746 ceph_flush_dirty_caps(mdsc);
2747
2748 wait_unsafe_requests(mdsc, want_tid);
2749 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2750}
2751
2752
2753/*
2754 * called after sb is ro.
2755 */
2756void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2757{
2758 struct ceph_mds_session *session;
2759 int i;
2760 int n;
2761 struct ceph_client *client = mdsc->client;
2762 unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
2763
2764 dout("close_sessions\n");
2765
2766 mutex_lock(&mdsc->mutex);
2767
2768 /* close sessions */
2769 started = jiffies;
2770 while (time_before(jiffies, started + timeout)) {
2771 dout("closing sessions\n");
2772 n = 0;
2773 for (i = 0; i < mdsc->max_sessions; i++) {
2774 session = __ceph_lookup_mds_session(mdsc, i);
2775 if (!session)
2776 continue;
2777 mutex_unlock(&mdsc->mutex);
2778 mutex_lock(&session->s_mutex);
2779 __close_session(mdsc, session);
2780 mutex_unlock(&session->s_mutex);
2781 ceph_put_mds_session(session);
2782 mutex_lock(&mdsc->mutex);
2783 n++;
2784 }
2785 if (n == 0)
2786 break;
2787
2788 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2789 break;
2790
2791 dout("waiting for sessions to close\n");
2792 mutex_unlock(&mdsc->mutex);
2793 wait_for_completion_timeout(&mdsc->session_close_waiters,
2794 timeout);
2795 mutex_lock(&mdsc->mutex);
2796 }
2797
2798 /* tear down remaining sessions */
2799 for (i = 0; i < mdsc->max_sessions; i++) {
2800 if (mdsc->sessions[i]) {
2801 session = get_session(mdsc->sessions[i]);
2802 __unregister_session(mdsc, session);
2803 mutex_unlock(&mdsc->mutex);
2804 mutex_lock(&session->s_mutex);
2805 remove_session_caps(session);
2806 mutex_unlock(&session->s_mutex);
2807 ceph_put_mds_session(session);
2808 mutex_lock(&mdsc->mutex);
2809 }
2810 }
2811
2812 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2813
2814 mutex_unlock(&mdsc->mutex);
2815
2816 ceph_cleanup_empty_realms(mdsc);
2817
2818 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2819
2820 dout("stopped\n");
2821}
2822
2823void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2824{
2825 dout("stop\n");
2826 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2827 if (mdsc->mdsmap)
2828 ceph_mdsmap_destroy(mdsc->mdsmap);
2829 kfree(mdsc->sessions);
2830}
2831
2832
2833/*
2834 * handle mds map update.
2835 */
2836void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2837{
2838 u32 epoch;
2839 u32 maplen;
2840 void *p = msg->front.iov_base;
2841 void *end = p + msg->front.iov_len;
2842 struct ceph_mdsmap *newmap, *oldmap;
2843 struct ceph_fsid fsid;
2844 int err = -EINVAL;
2845
2846 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
2847 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2848 if (ceph_check_fsid(mdsc->client, &fsid) < 0)
2849 return;
2850 epoch = ceph_decode_32(&p);
2851 maplen = ceph_decode_32(&p);
2852 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
2853
2854 /* do we need it? */
2855 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
2856 mutex_lock(&mdsc->mutex);
2857 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
2858 dout("handle_map epoch %u <= our %u\n",
2859 epoch, mdsc->mdsmap->m_epoch);
2860 mutex_unlock(&mdsc->mutex);
2861 return;
2862 }
2863
2864 newmap = ceph_mdsmap_decode(&p, end);
2865 if (IS_ERR(newmap)) {
2866 err = PTR_ERR(newmap);
2867 goto bad_unlock;
2868 }
2869
2870 /* swap into place */
2871 if (mdsc->mdsmap) {
2872 oldmap = mdsc->mdsmap;
2873 mdsc->mdsmap = newmap;
2874 check_new_map(mdsc, newmap, oldmap);
2875 ceph_mdsmap_destroy(oldmap);
2876 } else {
2877 mdsc->mdsmap = newmap; /* first mds map */
2878 }
2879 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
2880
2881 __wake_requests(mdsc, &mdsc->waiting_for_map);
2882
2883 mutex_unlock(&mdsc->mutex);
2884 schedule_delayed(mdsc);
2885 return;
2886
2887bad_unlock:
2888 mutex_unlock(&mdsc->mutex);
2889bad:
2890 pr_err("error decoding mdsmap %d\n", err);
2891 return;
2892}
2893
2894static struct ceph_connection *con_get(struct ceph_connection *con)
2895{
2896 struct ceph_mds_session *s = con->private;
2897
2898 if (get_session(s)) {
2899 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
2900 return con;
2901 }
2902 dout("mdsc con_get %p FAIL\n", s);
2903 return NULL;
2904}
2905
2906static void con_put(struct ceph_connection *con)
2907{
2908 struct ceph_mds_session *s = con->private;
2909
2910 ceph_put_mds_session(s);
2911 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
2912}
2913
2914/*
2915 * if the client is unresponsive for long enough, the mds will kill
2916 * the session entirely.
2917 */
2918static void peer_reset(struct ceph_connection *con)
2919{
2920 struct ceph_mds_session *s = con->private;
2921
2922 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
2923 s->s_mds);
2924}
2925
2926static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2927{
2928 struct ceph_mds_session *s = con->private;
2929 struct ceph_mds_client *mdsc = s->s_mdsc;
2930 int type = le16_to_cpu(msg->hdr.type);
2931
2932 mutex_lock(&mdsc->mutex);
2933 if (__verify_registered_session(mdsc, s) < 0) {
2934 mutex_unlock(&mdsc->mutex);
2935 goto out;
2936 }
2937 mutex_unlock(&mdsc->mutex);
2938
2939 switch (type) {
2940 case CEPH_MSG_MDS_MAP:
2941 ceph_mdsc_handle_map(mdsc, msg);
2942 break;
2943 case CEPH_MSG_CLIENT_SESSION:
2944 handle_session(s, msg);
2945 break;
2946 case CEPH_MSG_CLIENT_REPLY:
2947 handle_reply(s, msg);
2948 break;
2949 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2950 handle_forward(mdsc, s, msg);
2951 break;
2952 case CEPH_MSG_CLIENT_CAPS:
2953 ceph_handle_caps(s, msg);
2954 break;
2955 case CEPH_MSG_CLIENT_SNAP:
2956 ceph_handle_snap(mdsc, s, msg);
2957 break;
2958 case CEPH_MSG_CLIENT_LEASE:
2959 handle_lease(mdsc, s, msg);
2960 break;
2961
2962 default:
2963 pr_err("received unknown message type %d %s\n", type,
2964 ceph_msg_type_name(type));
2965 }
2966out:
2967 ceph_msg_put(msg);
2968}
2969
2970/*
2971 * authentication
2972 */
2973static int get_authorizer(struct ceph_connection *con,
2974 void **buf, int *len, int *proto,
2975 void **reply_buf, int *reply_len, int force_new)
2976{
2977 struct ceph_mds_session *s = con->private;
2978 struct ceph_mds_client *mdsc = s->s_mdsc;
2979 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2980 int ret = 0;
2981
2982 if (force_new && s->s_authorizer) {
2983 ac->ops->destroy_authorizer(ac, s->s_authorizer);
2984 s->s_authorizer = NULL;
2985 }
2986 if (s->s_authorizer == NULL) {
2987 if (ac->ops->create_authorizer) {
2988 ret = ac->ops->create_authorizer(
2989 ac, CEPH_ENTITY_TYPE_MDS,
2990 &s->s_authorizer,
2991 &s->s_authorizer_buf,
2992 &s->s_authorizer_buf_len,
2993 &s->s_authorizer_reply_buf,
2994 &s->s_authorizer_reply_buf_len);
2995 if (ret)
2996 return ret;
2997 }
2998 }
2999
3000 *proto = ac->protocol;
3001 *buf = s->s_authorizer_buf;
3002 *len = s->s_authorizer_buf_len;
3003 *reply_buf = s->s_authorizer_reply_buf;
3004 *reply_len = s->s_authorizer_reply_buf_len;
3005 return 0;
3006}
3007
3008
3009static int verify_authorizer_reply(struct ceph_connection *con, int len)
3010{
3011 struct ceph_mds_session *s = con->private;
3012 struct ceph_mds_client *mdsc = s->s_mdsc;
3013 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3014
3015 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3016}
3017
3018static int invalidate_authorizer(struct ceph_connection *con)
3019{
3020 struct ceph_mds_session *s = con->private;
3021 struct ceph_mds_client *mdsc = s->s_mdsc;
3022 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3023
3024 if (ac->ops->invalidate_authorizer)
3025 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3026
3027 return ceph_monc_validate_auth(&mdsc->client->monc);
3028}
3029
3030static const struct ceph_connection_operations mds_con_ops = {
3031 .get = con_get,
3032 .put = con_put,
3033 .dispatch = dispatch,
3034 .get_authorizer = get_authorizer,
3035 .verify_authorizer_reply = verify_authorizer_reply,
3036 .invalidate_authorizer = invalidate_authorizer,
3037 .peer_reset = peer_reset,
3038};
3039
3040
3041
3042
3043/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..961cc6f65878
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/rbtree.h>
9#include <linux/spinlock.h>
10
11#include "types.h"
12#include "messenger.h"
13#include "mdsmap.h"
14
15/*
16 * Some lock dependencies:
17 *
18 * session->s_mutex
19 * mdsc->mutex
20 *
21 * mdsc->snap_rwsem
22 *
23 * inode->i_lock
24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock
26 *
27 */
28
29struct ceph_client;
30struct ceph_cap;
31
32/*
33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload.
35 */
36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in;
38 u32 symlink_len;
39 char *symlink;
40 u32 xattr_len;
41 char *xattr_data;
42};
43
44/*
45 * parsed info about an mds reply, including information about the
46 * target inode and/or its parent directory and dentry, and directory
47 * contents (for readdir results).
48 */
49struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head;
51
52 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname;
55 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease;
57
58 struct ceph_mds_reply_dirfrag *dir_dir;
59 int dir_nr;
60 char **dir_dname;
61 u32 *dir_dname_len;
62 struct ceph_mds_reply_lease **dir_dlease;
63 struct ceph_mds_reply_info_in *dir_in;
64 u8 dir_complete, dir_end;
65
66 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */
68 void *snapblob;
69 int snapblob_len;
70};
71
72
73/*
74 * cap releases are batched and sent to the MDS en masse.
75 */
76#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
77 sizeof(struct ceph_mds_cap_release)) / \
78 sizeof(struct ceph_mds_cap_item))
79
80
81/*
82 * state associated with each MDS<->client session
83 */
84enum {
85 CEPH_MDS_SESSION_NEW = 1,
86 CEPH_MDS_SESSION_OPENING = 2,
87 CEPH_MDS_SESSION_OPEN = 3,
88 CEPH_MDS_SESSION_HUNG = 4,
89 CEPH_MDS_SESSION_CLOSING = 5,
90 CEPH_MDS_SESSION_RESTARTING = 6,
91 CEPH_MDS_SESSION_RECONNECTING = 7,
92};
93
94struct ceph_mds_session {
95 struct ceph_mds_client *s_mdsc;
96 int s_mds;
97 int s_state;
98 unsigned long s_ttl; /* time until mds kills us */
99 u64 s_seq; /* incoming msg seq # */
100 struct mutex s_mutex; /* serialize session messages */
101
102 struct ceph_connection s_con;
103
104 struct ceph_authorizer *s_authorizer;
105 void *s_authorizer_buf, *s_authorizer_reply_buf;
106 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
107
108 /* protected by s_cap_lock */
109 spinlock_t s_cap_lock;
110 u32 s_cap_gen; /* inc each time we get mds stale msg */
111 unsigned long s_cap_ttl; /* when session caps expire */
112 struct list_head s_caps; /* all caps issued by this session */
113 int s_nr_caps, s_trim_caps;
114 int s_num_cap_releases;
115 struct list_head s_cap_releases; /* waiting cap_release messages */
116 struct list_head s_cap_releases_done; /* ready to send */
117 struct ceph_cap *s_cap_iterator;
118
119 /* protected by mutex */
120 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
121 struct list_head s_cap_snaps_flushing;
122 unsigned long s_renew_requested; /* last time we sent a renew req */
123 u64 s_renew_seq;
124
125 atomic_t s_ref;
126 struct list_head s_waiting; /* waiting requests */
127 struct list_head s_unsafe; /* unsafe requests */
128};
129
130/*
131 * modes of choosing which MDS to send a request to
132 */
133enum {
134 USE_ANY_MDS,
135 USE_RANDOM_MDS,
136 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
137};
138
139struct ceph_mds_request;
140struct ceph_mds_client;
141
142/*
143 * request completion callback
144 */
145typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
146 struct ceph_mds_request *req);
147
148/*
149 * an in-flight mds request
150 */
151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */
153 struct rb_node r_node;
154
155 int r_op; /* mds op code */
156 int r_mds;
157
158 /* operation on what? */
159 struct inode *r_inode; /* arg1 */
160 struct dentry *r_dentry; /* arg1 */
161 struct dentry *r_old_dentry; /* arg2: rename from or link from */
162 char *r_path1, *r_path2;
163 struct ceph_vino r_ino1, r_ino2;
164
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */
167
168 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */
170
171 /* for choosing which mds to send this request to */
172 int r_direct_mode;
173 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
174 bool r_direct_is_hash; /* true if r_direct_hash is valid */
175
176 /* data payload is used for xattr ops */
177 struct page **r_pages;
178 int r_num_pages;
179 int r_data_len;
180
181 /* what caps shall we drop? */
182 int r_inode_drop, r_inode_unless;
183 int r_dentry_drop, r_dentry_unless;
184 int r_old_dentry_drop, r_old_dentry_unless;
185 struct inode *r_old_inode;
186 int r_old_inode_drop, r_old_inode_unless;
187
188 struct ceph_msg *r_request; /* original request */
189 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err;
192 bool r_aborted;
193
194 unsigned long r_timeout; /* optional. jiffies */
195 unsigned long r_started; /* start time to measure timeout against */
196 unsigned long r_request_started; /* start time for mds request only,
197 used to measure lease durations */
198
199 /* link unsafe requests to parent directory, for fsync */
200 struct inode *r_unsafe_dir;
201 struct list_head r_unsafe_dir_item;
202
203 struct ceph_mds_session *r_session;
204
205 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208 int r_resend_mds; /* mds to resend to next, if any */
209
210 struct kref r_kref;
211 struct list_head r_wait;
212 struct completion r_completion;
213 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe;
217
218 bool r_did_prepopulate;
219 u32 r_readdir_offset;
220
221 struct ceph_cap_reservation r_caps_reservation;
222 int r_num_caps;
223};
224
225/*
226 * mds client state
227 */
228struct ceph_mds_client {
229 struct ceph_client *client;
230 struct mutex mutex; /* all nested structures */
231
232 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters;
234 struct list_head waiting_for_map;
235
236 struct ceph_mds_session **sessions; /* NULL for mds if no session */
237 int max_sessions; /* len of sessions array */
238 int stopping; /* true if shutting down */
239
240 /*
241 * snap_rwsem will cover cap linkage into snaprealms, and
242 * realm snap contexts. (later, we can do per-realm snap
243 * contexts locks..) the empty list contains realms with no
244 * references (implying they contain no inodes with caps) that
245 * should be destroyed.
246 */
247 struct rw_semaphore snap_rwsem;
248 struct rb_root snap_realms;
249 struct list_head snap_empty;
250 spinlock_t snap_empty_lock; /* protect snap_empty */
251
252 u64 last_tid; /* most recent mds request */
253 struct rb_root request_tree; /* pending mds requests */
254 struct delayed_work delayed_work; /* delayed work */
255 unsigned long last_renew_caps; /* last time we renewed our caps */
256 struct list_head cap_delay_list; /* caps with delayed release */
257 spinlock_t cap_delay_lock; /* protects cap_delay_list */
258 struct list_head snap_flush_list; /* cap_snaps ready to flush */
259 spinlock_t snap_flush_lock;
260
261 u64 cap_flush_seq;
262 struct list_head cap_dirty; /* inodes with dirty caps */
263 int num_cap_flushing; /* # caps we are flushing */
264 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq;
266
267#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file;
269#endif
270
271 spinlock_t dentry_lru_lock;
272 struct list_head dentry_lru;
273 int num_dentry;
274};
275
276extern const char *ceph_mds_op_name(int op);
277
278extern struct ceph_mds_session *
279__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
280
281static inline struct ceph_mds_session *
282ceph_get_mds_session(struct ceph_mds_session *s)
283{
284 atomic_inc(&s->s_ref);
285 return s;
286}
287
288extern void ceph_put_mds_session(struct ceph_mds_session *s);
289
290extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
291 struct ceph_msg *msg, int mds);
292
293extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
294 struct ceph_client *client);
295extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
296extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
297
298extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
299
300extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode,
302 struct dentry *dn, int mask);
303
304extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
307 struct ceph_mds_request *req);
308extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
309 struct inode *dir,
310 struct ceph_mds_request *req);
311static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
312{
313 kref_get(&req->r_kref);
314}
315extern void ceph_mdsc_release_request(struct kref *kref);
316static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
317{
318 kref_put(&req->r_kref, ceph_mdsc_release_request);
319}
320
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
324 int stop_on_nosnap);
325
326extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
327extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
328 struct inode *inode,
329 struct dentry *dentry, char action,
330 u32 seq);
331
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg);
334
335#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
17 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
23 u8 r; /* unsigned, so r % n stays non-negative */
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
32 /* pick the nth "up" mds */
33 get_random_bytes(&r, 1);
34 n = r % n;
35 i = 0;
36 for (i = 0; n > 0; i++, n--)
37 while (m->m_info[i].state <= 0)
38 i++;
/* skip any trailing down ranks so we land on an up mds */
while (m->m_info[i].state <= 0)
i++;
39
40 return i;
41}
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 const void *start = *p;
53 int i, j, n;
54 int err = -EINVAL;
55 u16 version;
56
57 m = kzalloc(sizeof(*m), GFP_NOFS);
58 if (m == NULL)
59 return ERR_PTR(-ENOMEM);
60
61 ceph_decode_16_safe(p, end, version, bad);
62
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p);
65 m->m_client_epoch = ceph_decode_32(p);
66 m->m_last_failure = ceph_decode_32(p);
67 m->m_root = ceph_decode_32(p);
68 m->m_session_timeout = ceph_decode_32(p);
69 m->m_session_autoclose = ceph_decode_32(p);
70 m->m_max_file_size = ceph_decode_64(p);
71 m->m_max_mds = ceph_decode_32(p);
72
73 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
74 if (m->m_info == NULL)
75 goto badmem;
76
77 /* pick out active nodes from mds_info (state > 0) */
78 n = ceph_decode_32(p);
79 for (i = 0; i < n; i++) {
80 u64 global_id;
81 u32 namelen;
82 s32 mds, inc, state;
83 u64 state_seq;
84 u8 infoversion;
85 struct ceph_entity_addr addr;
86 u32 num_export_targets;
87 void *pexport_targets = NULL;
88
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p);
91 infoversion = ceph_decode_8(p);
92 *p += sizeof(u64);
93 namelen = ceph_decode_32(p); /* skip mds name */
94 *p += namelen;
95
96 ceph_decode_need(p, end,
97 4*sizeof(u32) + sizeof(u64) +
98 sizeof(addr) + sizeof(struct ceph_timespec),
99 bad);
100 mds = ceph_decode_32(p);
101 inc = ceph_decode_32(p);
102 state = ceph_decode_32(p);
103 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec);
107 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen;
110 if (infoversion >= 2) {
111 ceph_decode_32_safe(p, end, num_export_targets, bad);
112 pexport_targets = *p;
113 *p += num_export_targets * sizeof(u32);
114 } else {
115 num_export_targets = 0;
116 }
117
118 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
119 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
120 ceph_mds_state_name(state));
121 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
122 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr;
125 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) {
127 m->m_info[mds].export_targets =
128 kcalloc(num_export_targets, sizeof(u32),
129 GFP_NOFS);
if (m->m_info[mds].export_targets == NULL)
goto badmem;
130 for (j = 0; j < num_export_targets; j++)
131 m->m_info[mds].export_targets[j] =
132 ceph_decode_32(&pexport_targets);
133 } else {
134 m->m_info[mds].export_targets = NULL;
135 }
136 }
137 }
138
139 /* pg_pools */
140 ceph_decode_32_safe(p, end, n, bad);
141 m->m_num_data_pg_pools = n;
142 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
143 if (!m->m_data_pg_pools)
144 goto badmem;
145 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
146 for (i = 0; i < n; i++)
147 m->m_data_pg_pools[i] = ceph_decode_32(p);
148 m->m_cas_pg_pool = ceph_decode_32(p);
149
150 /* ok, we don't care about the rest. */
151 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
152 return m;
153
154badmem:
155 err = -ENOMEM;
156bad:
157 pr_err("corrupt mdsmap\n");
158 print_hex_dump(KERN_DEBUG, "mdsmap: ",
159 DUMP_PREFIX_OFFSET, 16, 1,
160 start, end - start, true);
161 ceph_mdsmap_destroy(m);
162 return ERR_PTR(err);
163}
164
165void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
166{
167 int i;
168
169 for (i = 0; i < m->m_max_mds; i++)
170 kfree(m->m_info[i].export_targets);
171 kfree(m->m_info);
172 kfree(m->m_data_pg_pools);
173 kfree(m);
174}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 u32 *export_targets;
17};
18
19struct ceph_mdsmap {
20 u32 m_epoch, m_client_epoch, m_last_failure;
21 u32 m_root;
22 u32 m_session_timeout; /* seconds */
23 u32 m_session_autoclose; /* seconds */
24 u64 m_max_file_size;
25 u32 m_max_mds; /* size of m_info array */
26 struct ceph_mds_info *m_info;
27
28 /* which object pools file data can be stored in */
29 int m_num_data_pg_pools;
30 u32 *m_data_pg_pools;
31 u32 m_cas_pg_pool;
32};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
42static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
43{
44 BUG_ON(w < 0);
45 if (w >= m->m_max_mds)
46 return CEPH_MDS_STATE_DNE;
47 return m->m_info[w].state;
48}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..cdaaa131add3
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2249 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
27
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
37
38static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con);
41
42const char *ceph_name_type_str(int t)
43{
44 switch (t) {
45 case CEPH_ENTITY_TYPE_MON: return "mon";
46 case CEPH_ENTITY_TYPE_MDS: return "mds";
47 case CEPH_ENTITY_TYPE_OSD: return "osd";
48 case CEPH_ENTITY_TYPE_CLIENT: return "client";
49 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
50 default: return "???";
51 }
52}
53
54/*
55 * nicely render a sockaddr as a string.
56 */
57#define MAX_ADDR_STR 20
58static char addr_str[MAX_ADDR_STR][40];
59static DEFINE_SPINLOCK(addr_str_lock);
60static int last_addr_str;
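/*
 * note: returned strings come from a small rotating pool, so each
 * pointer is only valid until MAX_ADDR_STR later pr_addr() calls;
 * good enough for the dout/pr_err output this feeds.
 */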
61
62const char *pr_addr(const struct sockaddr_storage *ss)
63{
64 int i;
65 char *s;
66 struct sockaddr_in *in4 = (void *)ss;
67 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
68 struct sockaddr_in6 *in6 = (void *)ss;
69
70 spin_lock(&addr_str_lock);
71 i = last_addr_str++;
72 if (last_addr_str == MAX_ADDR_STR)
73 last_addr_str = 0;
74 spin_unlock(&addr_str_lock);
75 s = addr_str[i];
76
77 switch (ss->ss_family) {
78 case AF_INET:
79 sprintf(s, "%u.%u.%u.%u:%u",
80 (unsigned int)quad[0],
81 (unsigned int)quad[1],
82 (unsigned int)quad[2],
83 (unsigned int)quad[3],
84 (unsigned int)ntohs(in4->sin_port));
85 break;
86
87 case AF_INET6:
88 sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
89 ntohs(in6->sin6_addr.s6_addr16[0]),
90 ntohs(in6->sin6_addr.s6_addr16[1]),
91 ntohs(in6->sin6_addr.s6_addr16[2]),
92 ntohs(in6->sin6_addr.s6_addr16[3]),
93 ntohs(in6->sin6_addr.s6_addr16[4]),
94 ntohs(in6->sin6_addr.s6_addr16[5]),
95 ntohs(in6->sin6_addr.s6_addr16[6]),
96 ntohs(in6->sin6_addr.s6_addr16[7]),
97 (unsigned int)ntohs(in6->sin6_port));
98 break;
99
100 default:
101 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
102 }
103
104 return s;
105}
106
107static void encode_my_addr(struct ceph_messenger *msgr)
108{
109 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
110 ceph_encode_addr(&msgr->my_enc_addr);
111}
112
113/*
114 * work queue for all reading and writing to/from the socket.
115 */
116struct workqueue_struct *ceph_msgr_wq;
117
118int __init ceph_msgr_init(void)
119{
120 ceph_msgr_wq = create_workqueue("ceph-msgr");
121 if (ceph_msgr_wq == NULL) {
122 /* create_workqueue() returns NULL on failure, not ERR_PTR() */
123 pr_err("msgr_init failed to create workqueue\n");
124 return -ENOMEM;
125 }
127 return 0;
128}
129
130void ceph_msgr_exit(void)
131{
132 destroy_workqueue(ceph_msgr_wq);
133}
134
135/*
136 * socket callback functions
137 */
138
139/* data available on socket, or listen socket received a connect */
140static void ceph_data_ready(struct sock *sk, int count_unused)
141{
142 struct ceph_connection *con =
143 (struct ceph_connection *)sk->sk_user_data;
144 if (sk->sk_state != TCP_CLOSE_WAIT) {
145 dout("ceph_data_ready on %p state = %lu, queueing work\n",
146 con, con->state);
147 queue_con(con);
148 }
149}
150
151/* socket has buffer space for writing */
152static void ceph_write_space(struct sock *sk)
153{
154 struct ceph_connection *con =
155 (struct ceph_connection *)sk->sk_user_data;
156
157 /* only queue to workqueue if there is data we want to write. */
158 if (test_bit(WRITE_PENDING, &con->state)) {
159 dout("ceph_write_space %p queueing write work\n", con);
160 queue_con(con);
161 } else {
162 dout("ceph_write_space %p nothing to write\n", con);
163 }
164
165 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
166 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
167}
168
169/* socket's state has changed */
170static void ceph_state_change(struct sock *sk)
171{
172 struct ceph_connection *con =
173 (struct ceph_connection *)sk->sk_user_data;
174
175 dout("ceph_state_change %p state = %lu sk_state = %u\n",
176 con, con->state, sk->sk_state);
177
178 if (test_bit(CLOSED, &con->state))
179 return;
180
181 switch (sk->sk_state) {
182 case TCP_CLOSE:
183 dout("ceph_state_change TCP_CLOSE\n");
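/* fall through */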
184 case TCP_CLOSE_WAIT:
185 dout("ceph_state_change TCP_CLOSE_WAIT\n");
186 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
187 if (test_bit(CONNECTING, &con->state))
188 con->error_msg = "connection failed";
189 else
190 con->error_msg = "socket closed";
191 queue_con(con);
192 }
193 break;
194 case TCP_ESTABLISHED:
195 dout("ceph_state_change TCP_ESTABLISHED\n");
196 queue_con(con);
197 break;
198 }
199}
200
201/*
202 * set up socket callbacks
203 */
204static void set_sock_callbacks(struct socket *sock,
205 struct ceph_connection *con)
206{
207 struct sock *sk = sock->sk;
208 sk->sk_user_data = (void *)con;
209 sk->sk_data_ready = ceph_data_ready;
210 sk->sk_write_space = ceph_write_space;
211 sk->sk_state_change = ceph_state_change;
212}
213
214
215/*
216 * socket helpers
217 */
218
219/*
220 * initiate connection to a remote socket.
221 */
222static struct socket *ceph_tcp_connect(struct ceph_connection *con)
223{
224 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
225 struct socket *sock;
226 int ret;
227
228 BUG_ON(con->sock);
229 ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
230 if (ret)
231 return ERR_PTR(ret);
232 con->sock = sock;
233 sock->sk->sk_allocation = GFP_NOFS;
234
235#ifdef CONFIG_LOCKDEP
236 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
237#endif
238
239 set_sock_callbacks(sock, con);
240
241 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
242
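/* note: sizeof(*paddr) only covers a sockaddr_in-sized address; an
 * AF_INET6 peer would need sizeof(struct sockaddr_in6) here */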
243 ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
244 if (ret == -EINPROGRESS) {
245 dout("connect %s EINPROGRESS sk_state = %u\n",
246 pr_addr(&con->peer_addr.in_addr),
247 sock->sk->sk_state);
248 ret = 0;
249 }
250 if (ret < 0) {
251 pr_err("connect %s error %d\n",
252 pr_addr(&con->peer_addr.in_addr), ret);
253 sock_release(sock);
254 con->sock = NULL;
255 con->error_msg = "connect error";
256 }
257
258 if (ret < 0)
259 return ERR_PTR(ret);
260 return sock;
261}
262
263static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
264{
265 struct kvec iov = {buf, len};
266 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
267
268 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
269}
270
271/*
272 * write something. @more is true if caller will be sending more data
273 * shortly.
274 */
275static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
276 size_t kvlen, size_t len, int more)
277{
278 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
279
280 if (more)
281 msg.msg_flags |= MSG_MORE;
282 else
283 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
284
285 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
286}
287
288
289/*
290 * Shutdown/close the socket for the given connection.
291 */
292static int con_close_socket(struct ceph_connection *con)
293{
294 int rc;
295
296 dout("con_close_socket on %p sock %p\n", con, con->sock);
297 if (!con->sock)
298 return 0;
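/* mark the close as deliberate so the shutdown-triggered socket
 * callbacks don't flag it as an error */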
299 set_bit(SOCK_CLOSED, &con->state);
300 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
301 sock_release(con->sock);
302 con->sock = NULL;
303 clear_bit(SOCK_CLOSED, &con->state);
304 return rc;
305}
306
307/*
308 * Reset a connection. Discard all incoming and outgoing messages
309 * and clear *_seq state.
310 */
311static void ceph_msg_remove(struct ceph_msg *msg)
312{
313 list_del_init(&msg->list_head);
314 ceph_msg_put(msg);
315}
316static void ceph_msg_remove_list(struct list_head *head)
317{
318 while (!list_empty(head)) {
319 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
320 list_head);
321 ceph_msg_remove(msg);
322 }
323}
324
325static void reset_connection(struct ceph_connection *con)
326{
327 /* reset connection, out_queue, msg_ and connect_seq */
328 /* discard existing out_queue and msg_seq */
329 ceph_msg_remove_list(&con->out_queue);
330 ceph_msg_remove_list(&con->out_sent);
331
332 if (con->in_msg) {
333 ceph_msg_put(con->in_msg);
334 con->in_msg = NULL;
335 }
336
337 con->connect_seq = 0;
338 con->out_seq = 0;
339 if (con->out_msg) {
340 ceph_msg_put(con->out_msg);
341 con->out_msg = NULL;
342 }
343 con->in_seq = 0;
344 con->in_seq_acked = 0;
345}
346
347/*
348 * mark a peer down. drop any open connections.
349 */
350void ceph_con_close(struct ceph_connection *con)
351{
352 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
353 set_bit(CLOSED, &con->state); /* in case there's queued work */
354 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
355 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
356 clear_bit(KEEPALIVE_PENDING, &con->state);
357 clear_bit(WRITE_PENDING, &con->state);
358 mutex_lock(&con->mutex);
359 reset_connection(con);
360 cancel_delayed_work(&con->work);
361 mutex_unlock(&con->mutex);
362 queue_con(con);
363}
364
365/*
366 * Reopen a closed connection, with a new peer address.
367 */
368void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
369{
370 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
371 set_bit(OPENING, &con->state);
372 clear_bit(CLOSED, &con->state);
373 memcpy(&con->peer_addr, addr, sizeof(*addr));
374 con->delay = 0; /* reset backoff memory */
375 queue_con(con);
376}
377
378/*
379 * return true if this connection ever successfully opened
380 */
381bool ceph_con_opened(struct ceph_connection *con)
382{
383 return con->connect_seq > 0;
384}
385
386/*
387 * generic get/put
388 */
389struct ceph_connection *ceph_con_get(struct ceph_connection *con)
390{
391 dout("con_get %p nref = %d -> %d\n", con,
392 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
393 if (atomic_inc_not_zero(&con->nref))
394 return con;
395 return NULL;
396}
397
398void ceph_con_put(struct ceph_connection *con)
399{
400 dout("con_put %p nref = %d -> %d\n", con,
401 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
402 BUG_ON(atomic_read(&con->nref) == 0);
403 if (atomic_dec_and_test(&con->nref)) {
404 BUG_ON(con->sock);
405 kfree(con);
406 }
407}
408
409/*
410 * initialize a new connection.
411 */
412void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
413{
414 dout("con_init %p\n", con);
415 memset(con, 0, sizeof(*con));
416 atomic_set(&con->nref, 1);
417 con->msgr = msgr;
418 mutex_init(&con->mutex);
419 INIT_LIST_HEAD(&con->out_queue);
420 INIT_LIST_HEAD(&con->out_sent);
421 INIT_DELAYED_WORK(&con->work, con_work);
422}
423
424
425/*
426 * We maintain a global counter to order connection attempts. Get
427 * a unique seq greater than @gt.
428 */
429static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
430{
431 u32 ret;
432
433 spin_lock(&msgr->global_seq_lock);
434 if (msgr->global_seq < gt)
435 msgr->global_seq = gt;
436 ret = ++msgr->global_seq;
437 spin_unlock(&msgr->global_seq_lock);
438 return ret;
439}
440
441
442/*
443 * Prepare footer for currently outgoing message, and finish things
444 * off. Assumes out_kvec* are already valid.. we just add on to the end.
445 */
446static void prepare_write_message_footer(struct ceph_connection *con, int v)
447{
448 struct ceph_msg *m = con->out_msg;
449
450 dout("prepare_write_message_footer %p\n", con);
451 con->out_kvec_is_msg = true;
452 con->out_kvec[v].iov_base = &m->footer;
453 con->out_kvec[v].iov_len = sizeof(m->footer);
454 con->out_kvec_bytes += sizeof(m->footer);
455 con->out_kvec_left++;
456 con->out_more = m->more_to_follow;
457 con->out_msg_done = true;
458}
459
460/*
461 * Prepare headers for the next outgoing message.
462 */
463static void prepare_write_message(struct ceph_connection *con)
464{
465 struct ceph_msg *m;
466 int v = 0;
467
468 con->out_kvec_bytes = 0;
469 con->out_kvec_is_msg = true;
470 con->out_msg_done = false;
471
472 /* Sneak an ack in there first? If we can get it into the same
473 * TCP packet that's a good thing. */
474 if (con->in_seq > con->in_seq_acked) {
475 con->in_seq_acked = con->in_seq;
476 con->out_kvec[v].iov_base = &tag_ack;
477 con->out_kvec[v++].iov_len = 1;
478 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
479 con->out_kvec[v].iov_base = &con->out_temp_ack;
480 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
481 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
482 }
483
484 m = list_first_entry(&con->out_queue,
485 struct ceph_msg, list_head);
486 con->out_msg = m;
487 if (test_bit(LOSSYTX, &con->state)) {
488 list_del_init(&m->list_head);
489 } else {
490 /* put message on sent list */
491 ceph_msg_get(m);
492 list_move_tail(&m->list_head, &con->out_sent);
493 }
494
495 m->hdr.seq = cpu_to_le64(++con->out_seq);
496
497 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
498 m, con->out_seq, le16_to_cpu(m->hdr.type),
499 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
500 le32_to_cpu(m->hdr.data_len),
501 m->nr_pages);
502 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
503
504 /* tag + hdr + front + middle */
505 con->out_kvec[v].iov_base = &tag_msg;
506 con->out_kvec[v++].iov_len = 1;
507 con->out_kvec[v].iov_base = &m->hdr;
508 con->out_kvec[v++].iov_len = sizeof(m->hdr);
509 con->out_kvec[v++] = m->front;
510 if (m->middle)
511 con->out_kvec[v++] = m->middle->vec;
512 con->out_kvec_left = v;
513 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
514 (m->middle ? m->middle->vec.iov_len : 0);
515 con->out_kvec_cur = con->out_kvec;
516
517 /* fill in crc (except data pages), footer */
518 con->out_msg->hdr.crc =
519 cpu_to_le32(crc32c(0, (void *)&m->hdr,
520 sizeof(m->hdr) - sizeof(m->hdr.crc)));
521 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
522 con->out_msg->footer.front_crc =
523 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
524 if (m->middle)
525 con->out_msg->footer.middle_crc =
526 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
527 m->middle->vec.iov_len));
528 else
529 con->out_msg->footer.middle_crc = 0;
530 con->out_msg->footer.data_crc = 0;
531 dout("prepare_write_message front_crc %u data_crc %u\n",
532 le32_to_cpu(con->out_msg->footer.front_crc),
533 le32_to_cpu(con->out_msg->footer.middle_crc));
534
535 /* is there a data payload? */
536 if (le32_to_cpu(m->hdr.data_len) > 0) {
537 /* initialize page iterator */
538 con->out_msg_pos.page = 0;
539 con->out_msg_pos.page_pos =
540 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
541 con->out_msg_pos.data_pos = 0;
542 con->out_msg_pos.did_page_crc = 0;
543 con->out_more = 1; /* data + footer will follow */
544 } else {
545 /* no, queue up footer too and be done */
546 prepare_write_message_footer(con, v);
547 }
548
549 set_bit(WRITE_PENDING, &con->state);
550}
551
552/*
553 * Prepare an ack.
554 */
555static void prepare_write_ack(struct ceph_connection *con)
556{
557 dout("prepare_write_ack %p %llu -> %llu\n", con,
558 con->in_seq_acked, con->in_seq);
559 con->in_seq_acked = con->in_seq;
560
561 con->out_kvec[0].iov_base = &tag_ack;
562 con->out_kvec[0].iov_len = 1;
563 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
564 con->out_kvec[1].iov_base = &con->out_temp_ack;
565 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
566 con->out_kvec_left = 2;
567 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
568 con->out_kvec_cur = con->out_kvec;
569 con->out_more = 1; /* more will follow.. eventually.. */
570 set_bit(WRITE_PENDING, &con->state);
571}
572
573/*
574 * Prepare to write keepalive byte.
575 */
576static void prepare_write_keepalive(struct ceph_connection *con)
577{
578 dout("prepare_write_keepalive %p\n", con);
579 con->out_kvec[0].iov_base = &tag_keepalive;
580 con->out_kvec[0].iov_len = 1;
581 con->out_kvec_left = 1;
582 con->out_kvec_bytes = 1;
583 con->out_kvec_cur = con->out_kvec;
584 set_bit(WRITE_PENDING, &con->state);
585}
586
587/*
588 * Connection negotiation.
589 */
590
591static void prepare_connect_authorizer(struct ceph_connection *con)
592{
593 void *auth_buf;
594 int auth_len = 0;
595 int auth_protocol = 0;
596
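/* drop con->mutex while calling out: get_authorizer may block or
 * take other locks */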
597 mutex_unlock(&con->mutex);
598 if (con->ops->get_authorizer)
599 con->ops->get_authorizer(con, &auth_buf, &auth_len,
600 &auth_protocol, &con->auth_reply_buf,
601 &con->auth_reply_buf_len,
602 con->auth_retry);
603 mutex_lock(&con->mutex);
604
605 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
606 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
607
608 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
609 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
610 con->out_kvec_left++;
611 con->out_kvec_bytes += auth_len;
612}
613
614/*
615 * We connected to a peer and are saying hello.
616 */
617static void prepare_write_banner(struct ceph_messenger *msgr,
618 struct ceph_connection *con)
619{
620 int len = strlen(CEPH_BANNER);
621
622 con->out_kvec[0].iov_base = CEPH_BANNER;
623 con->out_kvec[0].iov_len = len;
624 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
625 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
626 con->out_kvec_left = 2;
627 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
628 con->out_kvec_cur = con->out_kvec;
629 con->out_more = 0;
630 set_bit(WRITE_PENDING, &con->state);
631}
632
633static void prepare_write_connect(struct ceph_messenger *msgr,
634 struct ceph_connection *con,
635 int after_banner)
636{
637 unsigned global_seq = get_global_seq(con->msgr, 0);
638 int proto;
639
640 switch (con->peer_name.type) {
641 case CEPH_ENTITY_TYPE_MON:
642 proto = CEPH_MONC_PROTOCOL;
643 break;
644 case CEPH_ENTITY_TYPE_OSD:
645 proto = CEPH_OSDC_PROTOCOL;
646 break;
647 case CEPH_ENTITY_TYPE_MDS:
648 proto = CEPH_MDSC_PROTOCOL;
649 break;
650 default:
651 BUG();
652 }
653
654 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
655 con->connect_seq, global_seq, proto);
656
657 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
658 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
659 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
660 con->out_connect.global_seq = cpu_to_le32(global_seq);
661 con->out_connect.protocol_version = cpu_to_le32(proto);
662 con->out_connect.flags = 0;
663
664 if (!after_banner) {
665 con->out_kvec_left = 0;
666 con->out_kvec_bytes = 0;
667 }
668 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
669 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
670 con->out_kvec_left++;
671 con->out_kvec_bytes += sizeof(con->out_connect);
672 con->out_kvec_cur = con->out_kvec;
673 con->out_more = 0;
674 set_bit(WRITE_PENDING, &con->state);
675
676 prepare_connect_authorizer(con);
677}
678
679
680/*
681 * write as much of pending kvecs to the socket as we can.
682 * 1 -> done
683 * 0 -> socket full, but more to do
684 * <0 -> error
685 */
686static int write_partial_kvec(struct ceph_connection *con)
687{
688 int ret;
689
690 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
691 while (con->out_kvec_bytes > 0) {
692 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
693 con->out_kvec_left, con->out_kvec_bytes,
694 con->out_more);
695 if (ret <= 0)
696 goto out;
697 con->out_kvec_bytes -= ret;
698 if (con->out_kvec_bytes == 0)
699 break; /* done */
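/* account for the iovs (whole and partial) consumed by this send */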
700 while (ret > 0) {
701 if (ret >= con->out_kvec_cur->iov_len) {
702 ret -= con->out_kvec_cur->iov_len;
703 con->out_kvec_cur++;
704 con->out_kvec_left--;
705 } else {
706 con->out_kvec_cur->iov_len -= ret;
707 con->out_kvec_cur->iov_base += ret;
708 ret = 0;
709 break;
710 }
711 }
712 }
713 con->out_kvec_left = 0;
714 con->out_kvec_is_msg = false;
715 ret = 1;
716out:
717 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
718 con->out_kvec_bytes, con->out_kvec_left, ret);
719 return ret; /* done! */
720}
721
722/*
723 * Write as much message data payload as we can. If we finish, queue
724 * up the footer.
725 * 1 -> done, footer is now queued in out_kvec[].
726 * 0 -> socket full, but more to do
727 * <0 -> error
728 */
729static int write_partial_msg_pages(struct ceph_connection *con)
730{
731 struct ceph_msg *msg = con->out_msg;
732 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
733 size_t len;
734 int crc = con->msgr->nocrc;
735 int ret;
736
737 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
738 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
739 con->out_msg_pos.page_pos);
740
741 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
742 struct page *page = NULL;
743 void *kaddr = NULL;
744
745 /*
746 * if we are calculating the data crc (the default), we need
747 * to map the page. if our pages[] has been revoked, use the
748 * zero page.
749 */
750 if (msg->pages) {
751 page = msg->pages[con->out_msg_pos.page];
752 if (crc)
753 kaddr = kmap(page);
754 } else if (msg->pagelist) {
755 page = list_first_entry(&msg->pagelist->head,
756 struct page, lru);
757 if (crc)
758 kaddr = kmap(page);
759 } else {
760 page = con->msgr->zero_page;
761 if (crc)
762 kaddr = page_address(con->msgr->zero_page);
763 }
764 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
765 (int)(data_len - con->out_msg_pos.data_pos));
766 if (crc && !con->out_msg_pos.did_page_crc) {
767 void *base = kaddr + con->out_msg_pos.page_pos;
768 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
769
770 BUG_ON(kaddr == NULL);
771 con->out_msg->footer.data_crc =
772 cpu_to_le32(crc32c(tmpcrc, base, len));
773 con->out_msg_pos.did_page_crc = 1;
774 }
775
776 ret = kernel_sendpage(con->sock, page,
777 con->out_msg_pos.page_pos, len,
778 MSG_DONTWAIT | MSG_NOSIGNAL |
779 MSG_MORE);
780
781 if (crc && (msg->pages || msg->pagelist))
782 kunmap(page);
783
784 if (ret <= 0)
785 goto out;
786
787 con->out_msg_pos.data_pos += ret;
788 con->out_msg_pos.page_pos += ret;
789 if (ret == len) {
790 con->out_msg_pos.page_pos = 0;
791 con->out_msg_pos.page++;
792 con->out_msg_pos.did_page_crc = 0;
793 if (msg->pagelist)
794 list_move_tail(&page->lru,
795 &msg->pagelist->head);
796 }
797 }
798
799 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
800
801 /* prepare and queue up footer, too */
802 if (!crc)
803 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
804 con->out_kvec_bytes = 0;
805 con->out_kvec_left = 0;
806 con->out_kvec_cur = con->out_kvec;
807 prepare_write_message_footer(con, 0);
808 ret = 1;
809out:
810 return ret;
811}
812
813/*
814 * write some zeros
815 */
816static int write_partial_skip(struct ceph_connection *con)
817{
818 int ret;
819
820 while (con->out_skip > 0) {
821 struct kvec iov = {
822 .iov_base = page_address(con->msgr->zero_page),
823 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
824 };
825
826 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
827 if (ret <= 0)
828 goto out;
829 con->out_skip -= ret;
830 }
831 ret = 1;
832out:
833 return ret;
834}
835
836/*
837 * Prepare to read connection handshake, or an ack.
838 */
839static void prepare_read_banner(struct ceph_connection *con)
840{
841 dout("prepare_read_banner %p\n", con);
842 con->in_base_pos = 0;
843}
844
845static void prepare_read_connect(struct ceph_connection *con)
846{
847 dout("prepare_read_connect %p\n", con);
848 con->in_base_pos = 0;
849}
850
851static void prepare_read_ack(struct ceph_connection *con)
852{
853 dout("prepare_read_ack %p\n", con);
854 con->in_base_pos = 0;
855}
856
857static void prepare_read_tag(struct ceph_connection *con)
858{
859 dout("prepare_read_tag %p\n", con);
860 con->in_base_pos = 0;
861 con->in_tag = CEPH_MSGR_TAG_READY;
862}
863
864/*
865 * Prepare to read a message.
866 */
867static int prepare_read_message(struct ceph_connection *con)
868{
869 dout("prepare_read_message %p\n", con);
870 BUG_ON(con->in_msg != NULL);
871 con->in_base_pos = 0;
872 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
873 return 0;
874}
875
876
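/*
 * read into @object until it holds @size bytes.  *to accumulates the
 * offset at which this object ends within the current exchange, and
 * con->in_base_pos tracks overall progress, so a short read resumes
 * where it left off.  returns 1 when the object is complete, 0 if the
 * socket had no more data, or a negative error.
 */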
877static int read_partial(struct ceph_connection *con,
878 int *to, int size, void *object)
879{
880 *to += size;
881 while (con->in_base_pos < *to) {
882 int left = *to - con->in_base_pos;
883 int have = size - left;
884 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
885 if (ret <= 0)
886 return ret;
887 con->in_base_pos += ret;
888 }
889 return 1;
890}
891
892
893/*
894 * Read all or part of the connect-side handshake on a new connection
895 */
896static int read_partial_banner(struct ceph_connection *con)
897{
898 int ret, to = 0;
899
900 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
901
902 /* peer's banner */
903 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
904 if (ret <= 0)
905 goto out;
906 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
907 &con->actual_peer_addr);
908 if (ret <= 0)
909 goto out;
910 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
911 &con->peer_addr_for_me);
912 if (ret <= 0)
913 goto out;
914out:
915 return ret;
916}
917
918static int read_partial_connect(struct ceph_connection *con)
919{
920 int ret, to = 0;
921
922 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
923
924 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
925 if (ret <= 0)
926 goto out;
927 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
928 con->auth_reply_buf);
929 if (ret <= 0)
930 goto out;
931
932 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
933 con, (int)con->in_reply.tag,
934 le32_to_cpu(con->in_reply.connect_seq),
935 le32_to_cpu(con->in_reply.global_seq));
936out:
937 return ret;
938
939}
940
941/*
942 * Verify the hello banner looks okay.
943 */
944static int verify_hello(struct ceph_connection *con)
945{
946 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
947 pr_err("connect to %s got bad banner\n",
948 pr_addr(&con->peer_addr.in_addr));
949 con->error_msg = "protocol error, bad banner";
950 return -1;
951 }
952 return 0;
953}
954
955static bool addr_is_blank(struct sockaddr_storage *ss)
956{
957 switch (ss->ss_family) {
958 case AF_INET:
959 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
960 case AF_INET6:
961 return
962 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
963 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
964 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
965 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
966 }
967 return false;
968}
969
970static int addr_port(struct sockaddr_storage *ss)
971{
972 switch (ss->ss_family) {
973 case AF_INET:
974 return ntohs(((struct sockaddr_in *)ss)->sin_port);
975 case AF_INET6:
976 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
977 }
978 return 0;
979}
980
981static void addr_set_port(struct sockaddr_storage *ss, int p)
982{
983 switch (ss->ss_family) {
984 case AF_INET:
985 ((struct sockaddr_in *)ss)->sin_port = htons(p);
break;
986 case AF_INET6:
987 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
break;
988 }
989}
990
991/*
992 * Parse an ip[:port] list into an addr array. Use the default
993 * monitor port if a port isn't specified.
994 */
995int ceph_parse_ips(const char *c, const char *end,
996 struct ceph_entity_addr *addr,
997 int max_count, int *count)
998{
999 int i;
1000 const char *p = c;
1001
1002 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
1003 for (i = 0; i < max_count; i++) {
1004 const char *ipend;
1005 struct sockaddr_storage *ss = &addr[i].in_addr;
1006 struct sockaddr_in *in4 = (void *)ss;
1007 struct sockaddr_in6 *in6 = (void *)ss;
1008 int port;
1009
1010 memset(ss, 0, sizeof(*ss));
1011 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1012 ',', &ipend)) {
1013 ss->ss_family = AF_INET;
1014 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1015 ',', &ipend)) {
1016 ss->ss_family = AF_INET6;
1017 } else {
1018 goto bad;
1019 }
1020 p = ipend;
1021
1022 /* port? */
1023 if (p < end && *p == ':') {
1024 port = 0;
1025 p++;
1026 while (p < end && *p >= '0' && *p <= '9') {
1027 port = (port * 10) + (*p - '0');
1028 p++;
1029 }
1030 if (port > 65535 || port == 0)
1031 goto bad;
1032 } else {
1033 port = CEPH_MON_PORT;
1034 }
1035
1036 addr_set_port(ss, port);
1037
1038 dout("parse_ips got %s\n", pr_addr(ss));
1039
1040 if (p == end)
1041 break;
1042 if (*p != ',')
1043 goto bad;
1044 p++;
1045 }
1046
1047 if (p != end)
1048 goto bad;
1049
1050 if (count)
1051 *count = i + 1;
1052 return 0;
1053
1054bad:
1055 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1056 return -EINVAL;
1057}
1058
1059static int process_banner(struct ceph_connection *con)
1060{
1061 dout("process_banner on %p\n", con);
1062
1063 if (verify_hello(con) < 0)
1064 return -1;
1065
1066 ceph_decode_addr(&con->actual_peer_addr);
1067 ceph_decode_addr(&con->peer_addr_for_me);
1068
1069 /*
1070 * Make sure the other end is who we wanted. note that the other
1071 * end may not yet know their ip address, so if it's 0.0.0.0, give
1072 * them the benefit of the doubt.
1073 */
1074 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1075 sizeof(con->peer_addr)) != 0 &&
1076 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1077 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1078 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
1079 pr_addr(&con->peer_addr.in_addr),
1080 le64_to_cpu(con->peer_addr.nonce),
1081 pr_addr(&con->actual_peer_addr.in_addr),
1082 le64_to_cpu(con->actual_peer_addr.nonce));
1083 con->error_msg = "wrong peer at address";
1084 return -1;
1085 }
1086
1087 /*
1088 * did we learn our address?
1089 */
1090 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1091 int port = addr_port(&con->msgr->inst.addr.in_addr);
1092
1093 memcpy(&con->msgr->inst.addr.in_addr,
1094 &con->peer_addr_for_me.in_addr,
1095 sizeof(con->peer_addr_for_me.in_addr));
1096 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1097 encode_my_addr(con->msgr);
1098 dout("process_banner learned my addr is %s\n",
1099 pr_addr(&con->msgr->inst.addr.in_addr));
1100 }
1101
1102 set_bit(NEGOTIATING, &con->state);
1103 prepare_read_connect(con);
1104 return 0;
1105}
1106
1107static void fail_protocol(struct ceph_connection *con)
1108{
1109 reset_connection(con);
1110 set_bit(CLOSED, &con->state); /* in case there's queued work */
1111
1112 mutex_unlock(&con->mutex);
1113 if (con->ops->bad_proto)
1114 con->ops->bad_proto(con);
1115 mutex_lock(&con->mutex);
1116}
1117
1118static int process_connect(struct ceph_connection *con)
1119{
1120 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1121 u64 req_feat = CEPH_FEATURE_REQUIRED;
1122 u64 server_feat = le64_to_cpu(con->in_reply.features);
1123
1124 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1125
1126 switch (con->in_reply.tag) {
1127 case CEPH_MSGR_TAG_FEATURES:
1128 pr_err("%s%lld %s feature set mismatch,"
1129 " my %llx < server's %llx, missing %llx\n",
1130 ENTITY_NAME(con->peer_name),
1131 pr_addr(&con->peer_addr.in_addr),
1132 sup_feat, server_feat, server_feat & ~sup_feat);
1133 con->error_msg = "missing required protocol features";
1134 fail_protocol(con);
1135 return -1;
1136
1137 case CEPH_MSGR_TAG_BADPROTOVER:
1138 pr_err("%s%lld %s protocol version mismatch,"
1139 " my %d != server's %d\n",
1140 ENTITY_NAME(con->peer_name),
1141 pr_addr(&con->peer_addr.in_addr),
1142 le32_to_cpu(con->out_connect.protocol_version),
1143 le32_to_cpu(con->in_reply.protocol_version));
1144 con->error_msg = "protocol version mismatch";
1145 fail_protocol(con);
1146 return -1;
1147
1148 case CEPH_MSGR_TAG_BADAUTHORIZER:
1149 con->auth_retry++;
1150 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1151 con->auth_retry);
1152 if (con->auth_retry == 2) {
1153 con->error_msg = "connect authorization failure";
1154 reset_connection(con);
1155 set_bit(CLOSED, &con->state);
1156 return -1;
1157 }
1158 con->auth_retry = 1;
1159 prepare_write_connect(con->msgr, con, 0);
1160 prepare_read_connect(con);
1161 break;
1162
1163 case CEPH_MSGR_TAG_RESETSESSION:
1164 /*
1165 * If we connected with a large connect_seq but the peer
1166 * has no record of a session with us (no connection, or
1167 * connect_seq == 0), they will send RESETSESSION to indicate
1168 * that they must have reset their session, and may have
1169 * dropped messages.
1170 */
1171 dout("process_connect got RESET peer seq %u\n",
1172 le32_to_cpu(con->in_connect.connect_seq));
1173 pr_err("%s%lld %s connection reset\n",
1174 ENTITY_NAME(con->peer_name),
1175 pr_addr(&con->peer_addr.in_addr));
1176 reset_connection(con);
1177 prepare_write_connect(con->msgr, con, 0);
1178 prepare_read_connect(con);
1179
1180 /* Tell ceph about it. */
1181 mutex_unlock(&con->mutex);
1182 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1183 if (con->ops->peer_reset)
1184 con->ops->peer_reset(con);
1185 mutex_lock(&con->mutex);
1186 break;
1187
1188 case CEPH_MSGR_TAG_RETRY_SESSION:
1189 /*
1190 * If we sent a smaller connect_seq than the peer has, try
1191 * again with a larger value.
1192 */
1193 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1194 le32_to_cpu(con->out_connect.connect_seq),
1195 le32_to_cpu(con->in_connect.connect_seq));
1196 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1197 prepare_write_connect(con->msgr, con, 0);
1198 prepare_read_connect(con);
1199 break;
1200
1201 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1202 /*
1203 * If we sent a smaller global_seq than the peer has, try
1204 * again with a larger value.
1205 */
1206 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1207 con->peer_global_seq,
1208 le32_to_cpu(con->in_connect.global_seq));
1209 get_global_seq(con->msgr,
1210 le32_to_cpu(con->in_connect.global_seq));
1211 prepare_write_connect(con->msgr, con, 0);
1212 prepare_read_connect(con);
1213 break;
1214
1215 case CEPH_MSGR_TAG_READY:
1216 if (req_feat & ~server_feat) {
1217 pr_err("%s%lld %s protocol feature mismatch,"
1218 " my required %llx > server's %llx, need %llx\n",
1219 ENTITY_NAME(con->peer_name),
1220 pr_addr(&con->peer_addr.in_addr),
1221 req_feat, server_feat, req_feat & ~server_feat);
1222 con->error_msg = "missing required protocol features";
1223 fail_protocol(con);
1224 return -1;
1225 }
1226 clear_bit(CONNECTING, &con->state);
1227 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1228 con->connect_seq++;
1229 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1230 con->peer_global_seq,
1231 le32_to_cpu(con->in_reply.connect_seq),
1232 con->connect_seq);
1233 WARN_ON(con->connect_seq !=
1234 le32_to_cpu(con->in_reply.connect_seq));
1235
1236 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1237 set_bit(LOSSYTX, &con->state);
1238
1239 prepare_read_tag(con);
1240 break;
1241
1242 case CEPH_MSGR_TAG_WAIT:
1243 /*
1244 * If there is a connection race (we are opening
1245 * connections to each other), one of us may just have
1246 * to WAIT. This shouldn't happen if we are the
1247 * client.
1248 */
1249 pr_err("process_connect peer connecting WAIT\n");
1250
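/* fall through */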
1251 default:
1252 pr_err("connect protocol error, will retry\n");
1253 con->error_msg = "protocol error, garbage tag during connect";
1254 return -1;
1255 }
1256 return 0;
1257}
1258
1259
1260/*
1261 * read (part of) an ack
1262 */
1263static int read_partial_ack(struct ceph_connection *con)
1264{
1265 int to = 0;
1266
1267 return read_partial(con, &to, sizeof(con->in_temp_ack),
1268 &con->in_temp_ack);
1269}
1270
1271
1272/*
1273 * We can finally discard anything that's been acked.
1274 */
1275static void process_ack(struct ceph_connection *con)
1276{
1277 struct ceph_msg *m;
1278 u64 ack = le64_to_cpu(con->in_temp_ack);
1279 u64 seq;
1280
1281 while (!list_empty(&con->out_sent)) {
1282 m = list_first_entry(&con->out_sent, struct ceph_msg,
1283 list_head);
1284 seq = le64_to_cpu(m->hdr.seq);
1285 if (seq > ack)
1286 break;
1287 dout("got ack for seq %llu type %d at %p\n", seq,
1288 le16_to_cpu(m->hdr.type), m);
1289 ceph_msg_remove(m);
1290 }
1291 prepare_read_tag(con);
1292}
1293
1294
1295
1296
1297static int read_partial_message_section(struct ceph_connection *con,
1298 struct kvec *section, unsigned int sec_len,
1299 u32 *crc)
1300{
1301 int left;
1302 int ret;
1303
1304 BUG_ON(!section);
1305
1306 while (section->iov_len < sec_len) {
1307 BUG_ON(section->iov_base == NULL);
1308 left = sec_len - section->iov_len;
1309 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1310 section->iov_len, left);
1311 if (ret <= 0)
1312 return ret;
1313 section->iov_len += ret;
1314 if (section->iov_len == sec_len)
1315 *crc = crc32c(0, section->iov_base,
1316 section->iov_len);
1317 }
1318
1319 return 1;
1320}
1321
1322static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1323 struct ceph_msg_header *hdr,
1324 int *skip);
1325/*
1326 * read (part of) a message.
1327 */
1328static int read_partial_message(struct ceph_connection *con)
1329{
1330 struct ceph_msg *m = con->in_msg;
1331 void *p;
1332 int ret;
1333 int to, left;
1334 unsigned front_len, middle_len, data_len, data_off;
1335 int datacrc = con->msgr->nocrc;
1336 int skip;
1337
1338 dout("read_partial_message con %p msg %p\n", con, m);
1339
1340 /* header */
1341 while (con->in_base_pos < sizeof(con->in_hdr)) {
1342 left = sizeof(con->in_hdr) - con->in_base_pos;
1343 ret = ceph_tcp_recvmsg(con->sock,
1344 (char *)&con->in_hdr + con->in_base_pos,
1345 left);
1346 if (ret <= 0)
1347 return ret;
1348 con->in_base_pos += ret;
1349 if (con->in_base_pos == sizeof(con->in_hdr)) {
1350 u32 crc = crc32c(0, (void *)&con->in_hdr,
1351 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1352 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1353 pr_err("read_partial_message bad hdr "
1354 "crc %u != expected %u\n",
1355 crc, le32_to_cpu(con->in_hdr.crc));
1356 return -EBADMSG;
1357 }
1358 }
1359 }
1360 front_len = le32_to_cpu(con->in_hdr.front_len);
1361 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1362 return -EIO;
1363 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1364 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1365 return -EIO;
1366 data_len = le32_to_cpu(con->in_hdr.data_len);
1367 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1368 return -EIO;
1369 data_off = le16_to_cpu(con->in_hdr.data_off);
1370
1371 /* allocate message? */
1372 if (!con->in_msg) {
1373 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1374 con->in_hdr.front_len, con->in_hdr.data_len);
1375 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1376 if (skip) {
1377 /* skip this message */
1378 dout("alloc_msg returned NULL, skipping message\n");
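/* a negative in_base_pos tells the read path to discard that many
 * incoming bytes */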
1379 con->in_base_pos = -front_len - middle_len - data_len -
1380 sizeof(m->footer);
1381 con->in_tag = CEPH_MSGR_TAG_READY;
1382 return 0;
1383 }
1384 if (IS_ERR(con->in_msg)) {
1385 ret = PTR_ERR(con->in_msg);
1386 con->in_msg = NULL;
1387 con->error_msg =
1388 "error allocating memory for incoming message";
1389 return ret;
1390 }
1391 m = con->in_msg;
1392 m->front.iov_len = 0; /* haven't read it yet */
1393 if (m->middle)
1394 m->middle->vec.iov_len = 0;
1395
1396 con->in_msg_pos.page = 0;
1397 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1398 con->in_msg_pos.data_pos = 0;
1399 }
1400
1401 /* front */
1402 ret = read_partial_message_section(con, &m->front, front_len,
1403 &con->in_front_crc);
1404 if (ret <= 0)
1405 return ret;
1406
1407 /* middle */
1408 if (m->middle) {
1409 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1410 &con->in_middle_crc);
1411 if (ret <= 0)
1412 return ret;
1413 }
1414
1415 /* (page) data */
1416 while (con->in_msg_pos.data_pos < data_len) {
1417 left = min((int)(data_len - con->in_msg_pos.data_pos),
1418 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1419 BUG_ON(m->pages == NULL);
1420 p = kmap(m->pages[con->in_msg_pos.page]);
1421 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1422 left);
1423 if (ret > 0 && datacrc)
1424 con->in_data_crc =
1425 crc32c(con->in_data_crc,
1426 p + con->in_msg_pos.page_pos, ret);
1427 kunmap(m->pages[con->in_msg_pos.page]);
1428 if (ret <= 0)
1429 return ret;
1430 con->in_msg_pos.data_pos += ret;
1431 con->in_msg_pos.page_pos += ret;
1432 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1433 con->in_msg_pos.page_pos = 0;
1434 con->in_msg_pos.page++;
1435 }
1436 }
1437
1438 /* footer */
1439 to = sizeof(m->hdr) + sizeof(m->footer);
1440 while (con->in_base_pos < to) {
1441 left = to - con->in_base_pos;
1442 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1443 (con->in_base_pos - sizeof(m->hdr)),
1444 left);
1445 if (ret <= 0)
1446 return ret;
1447 con->in_base_pos += ret;
1448 }
1449 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1450 m, front_len, m->footer.front_crc, middle_len,
1451 m->footer.middle_crc, data_len, m->footer.data_crc);
1452
1453 /* crc ok? */
1454 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1455 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1456 m, con->in_front_crc, m->footer.front_crc);
1457 return -EBADMSG;
1458 }
1459 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1460 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1461 m, con->in_middle_crc, m->footer.middle_crc);
1462 return -EBADMSG;
1463 }
1464 if (datacrc &&
1465 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1466 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1467 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1468 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1469 return -EBADMSG;
1470 }
1471
1472 return 1; /* done! */
1473}
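The front and middle CRCs above are computed in one pass once a section is complete, while the page-data CRC is accumulated chunk by chunk as recvmsg() delivers bytes. That works because crc32c() chains: seeding each call with the previous result yields the same value as a single pass over the whole buffer. A minimal sketch of the property; the chunk size is arbitrary:

    /* returns the same value as crc32c(0, buf, len) */
    static u32 crc_in_chunks(const u8 *buf, size_t len, size_t chunk)
    {
            u32 crc = 0;
            size_t off = 0;

            while (off < len) {
                    size_t n = min(len - off, chunk);

                    crc = crc32c(crc, buf + off, n);  /* chain the seed */
                    off += n;
            }
            return crc;
    }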
1474
1475/*
1476 * Process message. This happens in the worker thread. The callback should
1477 * be careful not to do anything that waits on other incoming messages or it
1478 * may deadlock.
1479 */
1480static void process_message(struct ceph_connection *con)
1481{
1482 struct ceph_msg *msg;
1483
1484 msg = con->in_msg;
1485 con->in_msg = NULL;
1486
1487 /* if first message, set peer_name */
1488 if (con->peer_name.type == 0)
1489 con->peer_name = msg->hdr.src.name;
1490
1491 con->in_seq++;
1492 mutex_unlock(&con->mutex);
1493
1494 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1495 msg, le64_to_cpu(msg->hdr.seq),
1496 ENTITY_NAME(msg->hdr.src.name),
1497 le16_to_cpu(msg->hdr.type),
1498 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1499 le32_to_cpu(msg->hdr.front_len),
1500 le32_to_cpu(msg->hdr.data_len),
1501 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1502 con->ops->dispatch(con, msg);
1503
1504 mutex_lock(&con->mutex);
1505 prepare_read_tag(con);
1506}
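As the comment above warns, dispatch runs from the messenger worker with con->mutex dropped, so it may queue replies on the same connection but must never sleep waiting for another incoming message. A hedged sketch of a well-behaved handler; EXAMPLE_MSG_REPLY and example_complete_request() are hypothetical:

    static void example_dispatch(struct ceph_connection *con,
                                 struct ceph_msg *m)
    {
            int type = le16_to_cpu(m->hdr.type);

            switch (type) {
            case EXAMPLE_MSG_REPLY:
                    example_complete_request(con, m); /* wake a waiter, no blocking */
                    break;
            default:
                    pr_err("unexpected message type %d\n", type);
            }
            ceph_msg_put(m);        /* dispatch owns the message reference */
    }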
1507
1508
1509/*
1510 * Write something to the socket. Called in a worker thread when the
1511 * socket appears to be writeable and we have something ready to send.
1512 */
1513static int try_write(struct ceph_connection *con)
1514{
1515 struct ceph_messenger *msgr = con->msgr;
1516 int ret = 1;
1517
1518 dout("try_write start %p state %lu nref %d\n", con, con->state,
1519 atomic_read(&con->nref));
1520
1521 mutex_lock(&con->mutex);
1522more:
1523 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1524
1525 /* open the socket first? */
1526 if (con->sock == NULL) {
1527 /*
1528 * if we were STANDBY and are reconnecting _this_
1529 * connection, bump connect_seq now. Always bump
1530 * global_seq.
1531 */
1532 if (test_and_clear_bit(STANDBY, &con->state))
1533 con->connect_seq++;
1534
1535 prepare_write_banner(msgr, con);
1536 prepare_write_connect(msgr, con, 1);
1537 prepare_read_banner(con);
1538 set_bit(CONNECTING, &con->state);
1539 clear_bit(NEGOTIATING, &con->state);
1540
1541 BUG_ON(con->in_msg);
1542 con->in_tag = CEPH_MSGR_TAG_READY;
1543 dout("try_write initiating connect on %p new state %lu\n",
1544 con, con->state);
1545 con->sock = ceph_tcp_connect(con);
1546 if (IS_ERR(con->sock)) {
1547 con->sock = NULL;
1548 con->error_msg = "connect error";
1549 ret = -1;
1550 goto out;
1551 }
1552 }
1553
1554more_kvec:
1555 /* kvec data queued? */
1556 if (con->out_skip) {
1557 ret = write_partial_skip(con);
1558 if (ret < 0) {
1559 dout("try_write write_partial_skip err %d\n", ret);
1560 goto done;
1561 }
1562 if (ret == 0)
1563 goto done;
1564 }
1565 if (con->out_kvec_left) {
1566 ret = write_partial_kvec(con);
1567 if (ret <= 0)
1568 goto done;
1569 }
1570
1571 /* msg pages? */
1572 if (con->out_msg) {
1573 if (con->out_msg_done) {
1574 ceph_msg_put(con->out_msg);
1575 con->out_msg = NULL; /* we're done with this one */
1576 goto do_next;
1577 }
1578
1579 ret = write_partial_msg_pages(con);
1580 if (ret == 1)
1581 goto more_kvec; /* we need to send the footer, too! */
1582 if (ret == 0)
1583 goto done;
1584 if (ret < 0) {
1585 dout("try_write write_partial_msg_pages err %d\n",
1586 ret);
1587 goto done;
1588 }
1589 }
1590
1591do_next:
1592 if (!test_bit(CONNECTING, &con->state)) {
1593 /* is anything else pending? */
1594 if (!list_empty(&con->out_queue)) {
1595 prepare_write_message(con);
1596 goto more;
1597 }
1598 if (con->in_seq > con->in_seq_acked) {
1599 prepare_write_ack(con);
1600 goto more;
1601 }
1602 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1603 prepare_write_keepalive(con);
1604 goto more;
1605 }
1606 }
1607
1608 /* Nothing to do! */
1609 clear_bit(WRITE_PENDING, &con->state);
1610 dout("try_write nothing else to write.\n");
1611done:
1612 ret = 0;
1613out:
1614 mutex_unlock(&con->mutex);
1615 dout("try_write done on %p\n", con);
1616 return ret;
1617}
1618
1619
1620
1621/*
1622 * Read what we can from the socket.
1623 */
1624static int try_read(struct ceph_connection *con)
1625{
1626 struct ceph_messenger *msgr;
1627 int ret = -1;
1628
1629 if (!con->sock)
1630 return 0;
1631
1632 if (test_bit(STANDBY, &con->state))
1633 return 0;
1634
1635 dout("try_read start on %p\n", con);
1636 msgr = con->msgr;
1637
1638 mutex_lock(&con->mutex);
1639
1640more:
1641 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1642 con->in_base_pos);
1643 if (test_bit(CONNECTING, &con->state)) {
1644 if (!test_bit(NEGOTIATING, &con->state)) {
1645 dout("try_read connecting\n");
1646 ret = read_partial_banner(con);
1647 if (ret <= 0)
1648 goto done;
1649 if (process_banner(con) < 0) {
1650 ret = -1;
1651 goto out;
1652 }
1653 }
1654 ret = read_partial_connect(con);
1655 if (ret <= 0)
1656 goto done;
1657 if (process_connect(con) < 0) {
1658 ret = -1;
1659 goto out;
1660 }
1661 goto more;
1662 }
1663
1664 if (con->in_base_pos < 0) {
1665 /*
1666 * skipping + discarding content.
1667 *
1668 * FIXME: there must be a better way to do this!
1669 */
1670 static char buf[1024];
1671 int skip = min(1024, -con->in_base_pos);
1672 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1673 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1674 if (ret <= 0)
1675 goto done;
1676 con->in_base_pos += ret;
1677 if (con->in_base_pos)
1678 goto more;
1679 }
1680 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1681 /*
1682 * what's next?
1683 */
1684 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1685 if (ret <= 0)
1686 goto done;
1687 dout("try_read got tag %d\n", (int)con->in_tag);
1688 switch (con->in_tag) {
1689 case CEPH_MSGR_TAG_MSG:
1690 prepare_read_message(con);
1691 break;
1692 case CEPH_MSGR_TAG_ACK:
1693 prepare_read_ack(con);
1694 break;
1695 case CEPH_MSGR_TAG_CLOSE:
1696 set_bit(CLOSED, &con->state); /* fixme */
1697 goto done;
1698 default:
1699 goto bad_tag;
1700 }
1701 }
1702 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1703 ret = read_partial_message(con);
1704 if (ret <= 0) {
1705 switch (ret) {
1706 case -EBADMSG:
1707 con->error_msg = "bad crc";
1708 ret = -EIO;
1709 goto out;
1710 case -EIO:
1711 con->error_msg = "io error";
1712 goto out;
1713 default:
1714 goto done;
1715 }
1716 }
1717 if (con->in_tag == CEPH_MSGR_TAG_READY)
1718 goto more;
1719 process_message(con);
1720 goto more;
1721 }
1722 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1723 ret = read_partial_ack(con);
1724 if (ret <= 0)
1725 goto done;
1726 process_ack(con);
1727 goto more;
1728 }
1729
1730done:
1731 ret = 0;
1732out:
1733 mutex_unlock(&con->mutex);
1734 dout("try_read done on %p\n", con);
1735 return ret;
1736
1737bad_tag:
1738 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1739 con->error_msg = "protocol error, garbage tag";
1740 ret = -1;
1741 goto out;
1742}
1743
1744
1745/*
1746 * Atomically queue work on a connection. Bump @con reference to
1747 * avoid races with connection teardown.
1748 *
1749 * There is some trickery going on with QUEUED and BUSY because we
1750 * only want a _single_ thread operating on each connection at any
1751 * point in time, but we want to use all available CPUs.
1752 *
1753 * The worker thread only proceeds if it can atomically set BUSY. It
1754 * clears QUEUED and does its thing. When it thinks it's done, it
1755 * clears BUSY, then rechecks QUEUED; if it's set again, it loops
1756 * (tries again to set BUSY).
1757 *
1758 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1759 * try to queue work. If that fails (work is already queued, or BUSY
1760 * is set), we give up, since the work is already running or pending,
1761 * but we leave QUEUED set so that the worker thread will loop if necessary.
1762 */
1763static void queue_con(struct ceph_connection *con)
1764{
1765 if (test_bit(DEAD, &con->state)) {
1766 dout("queue_con %p ignoring: DEAD\n",
1767 con);
1768 return;
1769 }
1770
1771 if (!con->ops->get(con)) {
1772 dout("queue_con %p ref count 0\n", con);
1773 return;
1774 }
1775
1776 set_bit(QUEUED, &con->state);
1777 if (test_bit(BUSY, &con->state)) {
1778 dout("queue_con %p - already BUSY\n", con);
1779 con->ops->put(con);
1780 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1781 dout("queue_con %p - already queued\n", con);
1782 con->ops->put(con);
1783 } else {
1784 dout("queue_con %p\n", con);
1785 }
1786}
1787
1788/*
1789 * Do some work on a connection. Drop a connection ref when we're done.
1790 */
1791static void con_work(struct work_struct *work)
1792{
1793 struct ceph_connection *con = container_of(work, struct ceph_connection,
1794 work.work);
1795 int backoff = 0;
1796
1797more:
1798 if (test_and_set_bit(BUSY, &con->state) != 0) {
1799 dout("con_work %p BUSY already set\n", con);
1800 goto out;
1801 }
1802 dout("con_work %p start, clearing QUEUED\n", con);
1803 clear_bit(QUEUED, &con->state);
1804
1805 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1806 dout("con_work CLOSED\n");
1807 con_close_socket(con);
1808 goto done;
1809 }
1810 if (test_and_clear_bit(OPENING, &con->state)) {
1811 /* reopen w/ new peer */
1812 dout("con_work OPENING\n");
1813 con_close_socket(con);
1814 }
1815
1816 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1817 try_read(con) < 0 ||
1818 try_write(con) < 0) {
1819 backoff = 1;
1820 ceph_fault(con); /* error/fault path */
1821 }
1822
1823done:
1824 clear_bit(BUSY, &con->state);
1825 dout("con->state=%lu\n", con->state);
1826 if (test_bit(QUEUED, &con->state)) {
1827 if (!backoff || test_bit(OPENING, &con->state)) {
1828 dout("con_work %p QUEUED reset, looping\n", con);
1829 goto more;
1830 }
1831 dout("con_work %p QUEUED reset, but just faulted\n", con);
1832 clear_bit(QUEUED, &con->state);
1833 }
1834 dout("con_work %p done\n", con);
1835
1836out:
1837 con->ops->put(con);
1838}
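The QUEUED/BUSY handshake between queue_con() and con_work() generalizes to any one-worker-per-object scheme. A compressed sketch of both sides, assuming a hypothetical struct conn with an unsigned long state word, a workqueue wq, and the same atomic bit helpers:

    static void producer(struct conn *c)
    {
            set_bit(QUEUED, &c->state);        /* record the request first */
            if (!test_bit(BUSY, &c->state))
                    queue_work(wq, &c->work);  /* may fail; QUEUED stays set */
    }

    static void worker(struct conn *c)
    {
    more:
            if (test_and_set_bit(BUSY, &c->state))
                    return;                    /* another thread is running */
            clear_bit(QUEUED, &c->state);
            do_work(c);                        /* hypothetical payload */
            clear_bit(BUSY, &c->state);
            if (test_bit(QUEUED, &c->state))   /* did a producer race in? */
                    goto more;
    }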
1839
1840
1841/*
1842 * Generic error/fault handler. A retry mechanism is used with
1843 * exponential backoff.
1844 */
1845static void ceph_fault(struct ceph_connection *con)
1846{
1847 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1848 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1849 dout("fault %p state %lu to peer %s\n",
1850 con, con->state, pr_addr(&con->peer_addr.in_addr));
1851
1852 if (test_bit(LOSSYTX, &con->state)) {
1853 dout("fault on LOSSYTX channel\n");
1854 goto out;
1855 }
1856
1857 mutex_lock(&con->mutex);
1858 if (test_bit(CLOSED, &con->state))
1859 goto out_unlock;
1860
1861 con_close_socket(con);
1862
1863 if (con->in_msg) {
1864 ceph_msg_put(con->in_msg);
1865 con->in_msg = NULL;
1866 }
1867
1868 /* Requeue anything that hasn't been acked */
1869 list_splice_init(&con->out_sent, &con->out_queue);
1870
1871 /* If there are no messages in the queue, place the connection
1872 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1873 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1874 dout("fault setting STANDBY\n");
1875 set_bit(STANDBY, &con->state);
1876 } else {
1877 /* retry after a delay. */
1878 if (con->delay == 0)
1879 con->delay = BASE_DELAY_INTERVAL;
1880 else if (con->delay < MAX_DELAY_INTERVAL)
1881 con->delay *= 2;
1882 dout("fault queueing %p delay %lu\n", con, con->delay);
1883 con->ops->get(con);
1884 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1885 round_jiffies_relative(con->delay)) == 0)
1886 con->ops->put(con);
1887 }
1888
1889out_unlock:
1890 mutex_unlock(&con->mutex);
1891out:
1892 /*
1893 * in case we faulted due to authentication, invalidate our
1894 * current tickets so that we can get new ones.
1895 */
1896 if (con->auth_retry && con->ops->invalidate_authorizer) {
1897 dout("calling invalidate_authorizer()\n");
1898 con->ops->invalidate_authorizer(con);
1899 }
1900
1901 if (con->ops->fault)
1902 con->ops->fault(con);
1903}
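With BASE_DELAY_INTERVAL (HZ/2) and MAX_DELAY_INTERVAL (5*60*HZ) defined in messenger.h below, the retry delay doubles from half a second toward a five-minute cap. The doubling rule above, isolated as a sketch:

    /* delay sequence: HZ/2, HZ, 2*HZ, 4*HZ, ... capped near five minutes */
    static unsigned long next_fault_delay(unsigned long delay)
    {
            if (delay == 0)
                    return BASE_DELAY_INTERVAL;
            if (delay < MAX_DELAY_INTERVAL)
                    return delay * 2;
            return delay;
    }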
1904
1905
1906
1907/*
1908 * create a new messenger instance
1909 */
1910struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1911{
1912 struct ceph_messenger *msgr;
1913
1914 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1915 if (msgr == NULL)
1916 return ERR_PTR(-ENOMEM);
1917
1918 spin_lock_init(&msgr->global_seq_lock);
1919
1920 /* the zero page is needed if a request is "canceled" while the message
1921 * is being written over the socket */
1922 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1923 if (!msgr->zero_page) {
1924 kfree(msgr);
1925 return ERR_PTR(-ENOMEM);
1926 }
1927 kmap(msgr->zero_page);
1928
1929 if (myaddr)
1930 msgr->inst.addr = *myaddr;
1931
1932 /* select a random nonce */
1933 msgr->inst.addr.type = 0;
1934 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1935 encode_my_addr(msgr);
1936
1937 dout("messenger_create %p\n", msgr);
1938 return msgr;
1939}
1940
1941void ceph_messenger_destroy(struct ceph_messenger *msgr)
1942{
1943 dout("destroy %p\n", msgr);
1944 kunmap(msgr->zero_page);
1945 __free_page(msgr->zero_page);
1946 kfree(msgr);
1947 dout("destroyed messenger %p\n", msgr);
1948}
1949
1950/*
1951 * Queue up an outgoing message on the given connection.
1952 */
1953void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1954{
1955 if (test_bit(CLOSED, &con->state)) {
1956 dout("con_send %p closed, dropping %p\n", con, msg);
1957 ceph_msg_put(msg);
1958 return;
1959 }
1960
1961 /* set src+dst */
1962 msg->hdr.src.name = con->msgr->inst.name;
1963 msg->hdr.src.addr = con->msgr->my_enc_addr;
1964 msg->hdr.orig_src = msg->hdr.src;
1965
1966 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1967
1968 /* queue */
1969 mutex_lock(&con->mutex);
1970 BUG_ON(!list_empty(&msg->list_head));
1971 list_add_tail(&msg->list_head, &con->out_queue);
1972 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
1973 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
1974 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1975 le32_to_cpu(msg->hdr.front_len),
1976 le32_to_cpu(msg->hdr.middle_len),
1977 le32_to_cpu(msg->hdr.data_len));
1978 mutex_unlock(&con->mutex);
1979
1980 /* if there wasn't anything waiting to send before, queue
1981 * new work */
1982 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
1983 queue_con(con);
1984}
1985
1986/*
1987 * Revoke a message that was previously queued for send
1988 */
1989void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
1990{
1991 mutex_lock(&con->mutex);
1992 if (!list_empty(&msg->list_head)) {
1993 dout("con_revoke %p msg %p\n", con, msg);
1994 list_del_init(&msg->list_head);
1995 msg->hdr.seq = 0;
1996 ceph_msg_put(msg);
1997 if (con->out_msg == msg) {
1998 ceph_msg_put(con->out_msg);
1999 con->out_msg = NULL;
2000 }
2001 if (con->out_kvec_is_msg) {
2002 con->out_skip = con->out_kvec_bytes;
2003 con->out_kvec_is_msg = false;
2004 }
2005 } else {
2006 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
2007 }
2008 mutex_unlock(&con->mutex);
2009}
2010
2011/*
2012 * Revoke a message that we may be reading data into
2013 */
2014void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2015{
2016 mutex_lock(&con->mutex);
2017 if (con->in_msg && con->in_msg == msg) {
2018 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2019 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2020 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2021
2022 /* skip rest of message */
2023 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2024 con->in_base_pos = con->in_base_pos -
2025 sizeof(struct ceph_msg_header) -
2026 front_len -
2027 middle_len -
2028 data_len -
2029 sizeof(struct ceph_msg_footer);
2030 ceph_msg_put(con->in_msg);
2031 con->in_msg = NULL;
2032 con->in_tag = CEPH_MSGR_TAG_READY;
2033 } else {
2034 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2035 con, con->in_msg, msg);
2036 }
2037 mutex_unlock(&con->mutex);
2038}
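The negative in_base_pos set above reuses the discard convention in try_read(): a negative value means "throw away this many more bytes", and the skip loop reads into a scratch buffer until the counter climbs back to zero. The arithmetic, written out:

    /*
     * after the update above:
     *
     *   -con->in_base_pos == (header + front + middle + data + footer)
     *                        - bytes_already_consumed
     *
     * i.e. exactly how many bytes try_read() must still pull off the
     * socket and discard before reading the next protocol tag.
     */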
2039
2040/*
2041 * Queue a keepalive byte to ensure the tcp connection is alive.
2042 */
2043void ceph_con_keepalive(struct ceph_connection *con)
2044{
2045 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2046 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2047 queue_con(con);
2048}
2049
2050
2051/*
2052 * construct a new message with the given type and size.
2053 * the new msg has a ref count of 1.
2054 */
2055struct ceph_msg *ceph_msg_new(int type, int front_len,
2056 int page_len, int page_off, struct page **pages)
2057{
2058 struct ceph_msg *m;
2059
2060 m = kmalloc(sizeof(*m), GFP_NOFS);
2061 if (m == NULL)
2062 goto out;
2063 kref_init(&m->kref);
2064 INIT_LIST_HEAD(&m->list_head);
2065
2066 m->hdr.type = cpu_to_le16(type);
2067 m->hdr.front_len = cpu_to_le32(front_len);
2068 m->hdr.middle_len = 0;
2069 m->hdr.data_len = cpu_to_le32(page_len);
2070 m->hdr.data_off = cpu_to_le16(page_off);
2071 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2072 m->footer.front_crc = 0;
2073 m->footer.middle_crc = 0;
2074 m->footer.data_crc = 0;
2075 m->front_max = front_len;
2076 m->front_is_vmalloc = false;
2077 m->more_to_follow = false;
2078 m->pool = NULL;
2079
2080 /* front */
2081 if (front_len) {
2082 if (front_len > PAGE_CACHE_SIZE) {
2083 m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
2084 PAGE_KERNEL);
2085 m->front_is_vmalloc = true;
2086 } else {
2087 m->front.iov_base = kmalloc(front_len, GFP_NOFS);
2088 }
2089 if (m->front.iov_base == NULL) {
2090 pr_err("msg_new can't allocate %d bytes\n",
2091 front_len);
2092 goto out2;
2093 }
2094 } else {
2095 m->front.iov_base = NULL;
2096 }
2097 m->front.iov_len = front_len;
2098
2099 /* middle */
2100 m->middle = NULL;
2101
2102 /* data */
2103 m->nr_pages = calc_pages_for(page_off, page_len);
2104 m->pages = pages;
2105 m->pagelist = NULL;
2106
2107 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
2108 m->nr_pages);
2109 return m;
2110
2111out2:
2112 ceph_msg_put(m);
2113out:
2114 pr_err("msg_new can't create type %d len %d\n", type, front_len);
2115 return ERR_PTR(-ENOMEM);
2116}
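Typical usage pairs ceph_msg_new() with a handoff or a ceph_msg_put(), as send_statfs() in mon_client.c below does. A minimal sketch reusing the statfs message type; 'con' is an assumed established connection:

    struct ceph_msg *m;

    m = ceph_msg_new(CEPH_MSG_STATFS, sizeof(struct ceph_mon_statfs),
                     0, 0, NULL);
    if (!IS_ERR(m))
            ceph_con_send(con, m);  /* the connection now owns our ref */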
2117
2118/*
2119 * Allocate "middle" portion of a message, if it is needed and wasn't
2120 * allocated by alloc_msg. This allows us to read a small fixed-size
2121 * per-type header in the front and then gracefully fail (i.e.,
2122 * propagate the error to the caller based on info in the front) when
2123 * the middle is too large.
2124 */
2125static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2126{
2127 int type = le16_to_cpu(msg->hdr.type);
2128 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2129
2130 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2131 ceph_msg_type_name(type), middle_len);
2132 BUG_ON(!middle_len);
2133 BUG_ON(msg->middle);
2134
2135 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2136 if (!msg->middle)
2137 return -ENOMEM;
2138 return 0;
2139}
2140
2141/*
2142 * Generic message allocator, for incoming messages.
2143 */
2144static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2145 struct ceph_msg_header *hdr,
2146 int *skip)
2147{
2148 int type = le16_to_cpu(hdr->type);
2149 int front_len = le32_to_cpu(hdr->front_len);
2150 int middle_len = le32_to_cpu(hdr->middle_len);
2151 struct ceph_msg *msg = NULL;
2152 int ret;
2153
2154 if (con->ops->alloc_msg) {
2155 mutex_unlock(&con->mutex);
2156 msg = con->ops->alloc_msg(con, hdr, skip);
2157 mutex_lock(&con->mutex);
2158 if (IS_ERR(msg))
2159 return msg;
2160
2161 if (*skip)
2162 return NULL;
2163 }
2164 if (!msg) {
2165 *skip = 0;
2166 msg = ceph_msg_new(type, front_len, 0, 0, NULL);
2167 if (IS_ERR(msg)) {
2168 pr_err("unable to allocate msg type %d len %d\n",
2169 type, front_len);
2170 return ERR_PTR(-ENOMEM);
2171 }
2172 }
2173 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2174
2175 if (middle_len) {
2176 ret = ceph_alloc_middle(con, msg);
2177
2178 if (ret < 0) {
2179 ceph_msg_put(msg);
2180 return ERR_PTR(ret);
2181 }
2182 }
2183
2184 return msg;
2185}
2186
2187
2188/*
2189 * Free a generically kmalloc'd message.
2190 */
2191void ceph_msg_kfree(struct ceph_msg *m)
2192{
2193 dout("msg_kfree %p\n", m);
2194 if (m->front_is_vmalloc)
2195 vfree(m->front.iov_base);
2196 else
2197 kfree(m->front.iov_base);
2198 kfree(m);
2199}
2200
2201/*
2202 * Drop a msg ref. Destroy as needed.
2203 */
2204void ceph_msg_last_put(struct kref *kref)
2205{
2206 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2207
2208 dout("ceph_msg_put last one on %p\n", m);
2209 WARN_ON(!list_empty(&m->list_head));
2210
2211 /* drop middle, data, if any */
2212 if (m->middle) {
2213 ceph_buffer_put(m->middle);
2214 m->middle = NULL;
2215 }
2216 m->nr_pages = 0;
2217 m->pages = NULL;
2218
2219 if (m->pagelist) {
2220 ceph_pagelist_release(m->pagelist);
2221 kfree(m->pagelist);
2222 m->pagelist = NULL;
2223 }
2224
2225 if (m->pool)
2226 ceph_msgpool_put(m->pool, m);
2227 else
2228 ceph_msg_kfree(m);
2229}
2230
2231void ceph_msg_dump(struct ceph_msg *msg)
2232{
2233 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2234 msg->front_max, msg->nr_pages);
2235 print_hex_dump(KERN_DEBUG, "header: ",
2236 DUMP_PREFIX_OFFSET, 16, 1,
2237 &msg->hdr, sizeof(msg->hdr), true);
2238 print_hex_dump(KERN_DEBUG, " front: ",
2239 DUMP_PREFIX_OFFSET, 16, 1,
2240 msg->front.iov_base, msg->front.iov_len, true);
2241 if (msg->middle)
2242 print_hex_dump(KERN_DEBUG, "middle: ",
2243 DUMP_PREFIX_OFFSET, 16, 1,
2244 msg->middle->vec.iov_base,
2245 msg->middle->vec.iov_len, true);
2246 print_hex_dump(KERN_DEBUG, "footer: ",
2247 DUMP_PREFIX_OFFSET, 16, 1,
2248 &msg->footer, sizeof(msg->footer), true);
2249}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..a343dae73cdc
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,255 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%lld */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
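ENTITY_NAME() expands to two arguments, a type string and a 64-bit number, so it pairs with a %s%lld format, as the messenger's own logging does:

    /* prints something like "osd3" or "client4102" */
    dout("peer is %s%lld\n", ENTITY_NAME(con->peer_name));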
56
57struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */
59 struct ceph_entity_addr my_enc_addr;
60 struct page *zero_page; /* used in certain error cases */
61
62 bool nocrc;
63
64 /*
65 * the global_seq counts connections I (attempt to) initiate
66 * in order to disambiguate certain connect race conditions.
67 */
68 u32 global_seq;
69 spinlock_t global_seq_lock;
70};
71
72/*
73 * a single message. it contains a header (src, dest, message type, etc.),
74 * footer (crc values, mainly), a "front" message body, and possibly a
75 * data payload (stored in some number of pages).
76 */
77struct ceph_msg {
78 struct ceph_msg_header hdr; /* header */
79 struct ceph_msg_footer footer; /* footer */
80 struct kvec front; /* unaligned blobs of message */
81 struct ceph_buffer *middle;
82 struct page **pages; /* data payload. NOT OWNER. */
83 unsigned nr_pages; /* size of page array */
84 struct ceph_pagelist *pagelist; /* instead of pages */
85 struct list_head list_head;
86 struct kref kref;
87 bool front_is_vmalloc;
88 bool more_to_follow;
89 int front_max;
90
91 struct ceph_msgpool *pool;
92};
93
94struct ceph_msg_pos {
95 int page, page_pos; /* which page; offset in page */
96 int data_pos; /* offset in data payload */
97 int did_page_crc; /* true if we've calculated crc for current page */
98};
99
100/* ceph connection fault delay defaults, for exponential backoff */
101#define BASE_DELAY_INTERVAL (HZ/2)
102#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
103
104/*
105 * ceph_connection state bit flags
106 *
107 * QUEUED and BUSY are used together to ensure that only a single
108 * thread is currently opening, reading or writing data to the socket.
109 */
110#define LOSSYTX 0 /* we can close channel or drop messages on errors */
111#define CONNECTING 1
112#define NEGOTIATING 2
113#define KEEPALIVE_PENDING 3
114#define WRITE_PENDING 4 /* we have data ready to send */
115#define QUEUED 5 /* there is work queued on this connection */
116#define BUSY 6 /* work is being done */
117#define STANDBY 8 /* no outgoing messages, socket closed. we keep
118 * the ceph_connection around to maintain shared
119 * state with the peer. */
120#define CLOSED 10 /* we've closed the connection */
121#define SOCK_CLOSED 11 /* socket state changed to closed */
122#define OPENING 13 /* open connection w/ (possibly new) peer */
123#define DEAD 14 /* dead, about to kfree */
124
125/*
126 * A single connection with another host.
127 *
128 * We maintain a queue of outgoing messages, and some session state to
129 * ensure that we can preserve the lossless, ordered delivery of
130 * messages in the case of a TCP disconnect.
131 */
132struct ceph_connection {
133 void *private;
134 atomic_t nref;
135
136 const struct ceph_connection_operations *ops;
137
138 struct ceph_messenger *msgr;
139 struct socket *sock;
140 unsigned long state; /* connection state (see flags above) */
141 const char *error_msg; /* error message, if any */
142
143 struct ceph_entity_addr peer_addr; /* peer address */
144 struct ceph_entity_name peer_name; /* peer name */
145 struct ceph_entity_addr peer_addr_for_me;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection, client */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 u64 out_seq_sent; /* last message sent */
161 bool out_keepalive_pending;
162
163 u64 in_seq, in_seq_acked; /* last message received, acked */
164
165 /* connection negotiation temps */
166 char in_banner[CEPH_BANNER_MAX_LEN];
167 union {
168 struct { /* outgoing connection */
169 struct ceph_msg_connect out_connect;
170 struct ceph_msg_connect_reply in_reply;
171 };
172 struct { /* incoming */
173 struct ceph_msg_connect in_connect;
174 struct ceph_msg_connect_reply out_reply;
175 };
176 };
177 struct ceph_entity_addr actual_peer_addr;
178
179 /* message out temps */
180 struct ceph_msg *out_msg; /* sending message (== tail of
181 out_sent) */
182 bool out_msg_done;
183 struct ceph_msg_pos out_msg_pos;
184
185 struct kvec out_kvec[8], /* sending header/footer data */
186 *out_kvec_cur;
187 int out_kvec_left; /* kvec's left in out_kvec */
188 int out_skip; /* skip this many bytes */
189 int out_kvec_bytes; /* total bytes left */
190 bool out_kvec_is_msg; /* kvec refers to out_msg */
191 int out_more; /* there is more data after the kvecs */
192 __le64 out_temp_ack; /* for writing an ack */
193
194 /* message in temps */
195 struct ceph_msg_header in_hdr;
196 struct ceph_msg *in_msg;
197 struct ceph_msg_pos in_msg_pos;
198 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
199
200 char in_tag; /* protocol control byte */
201 int in_base_pos; /* bytes read */
202 __le64 in_temp_ack; /* for reading an ack */
203
204 struct delayed_work work; /* send|recv work */
205 unsigned long delay; /* current delay interval */
206};
207
208
209extern const char *pr_addr(const struct sockaddr_storage *ss);
210extern int ceph_parse_ips(const char *c, const char *end,
211 struct ceph_entity_addr *addr,
212 int max_count, int *count);
213
214
215extern int ceph_msgr_init(void);
216extern void ceph_msgr_exit(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
227extern void ceph_con_close(struct ceph_connection *con);
228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke_message(struct ceph_connection *con,
231 struct ceph_msg *msg);
232extern void ceph_con_keepalive(struct ceph_connection *con);
233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
234extern void ceph_con_put(struct ceph_connection *con);
235
236extern struct ceph_msg *ceph_msg_new(int type, int front_len,
237 int page_len, int page_off,
238 struct page **pages);
239extern void ceph_msg_kfree(struct ceph_msg *m);
240
241
242static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
243{
244 kref_get(&msg->kref);
245 return msg;
246}
247extern void ceph_msg_last_put(struct kref *kref);
248static inline void ceph_msg_put(struct ceph_msg *msg)
249{
250 kref_put(&msg->kref, ceph_msg_last_put);
251}
252
253extern void ceph_msg_dump(struct ceph_msg *msg);
254
255#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..8fdc011ca956
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,835 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
18 * A small cluster of Ceph "monitors" is responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
31static const struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
60
61 if (num_mon >= CEPH_MAX_MON)
62 goto bad;
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth);
109}
110
111/*
112 * Close monitor session, if any.
113 */
114static void __close_session(struct ceph_mon_client *monc)
115{
116 if (monc->con) {
117 dout("__close_session closing mon%d\n", monc->cur_mon);
118 ceph_con_revoke(monc->con, monc->m_auth);
119 ceph_con_close(monc->con);
120 monc->cur_mon = -1;
121 monc->pending_auth = 0;
122 ceph_auth_reset(monc->auth);
123 }
124}
125
126/*
127 * Open a session with a (new) monitor.
128 */
129static int __open_session(struct ceph_mon_client *monc)
130{
131 u8 r; /* unsigned, so cur_mon can't go negative */
132 int ret;
133
134 if (monc->cur_mon < 0) {
135 get_random_bytes(&r, 1);
136 monc->cur_mon = r % monc->monmap->num_mon;
137 dout("open_session num=%d r=%d -> mon%d\n",
138 monc->monmap->num_mon, r, monc->cur_mon);
139 monc->sub_sent = 0;
140 monc->sub_renew_after = jiffies; /* i.e., expired */
141 monc->want_next_osdmap = !!monc->want_next_osdmap;
142
143 dout("open_session mon%d opening\n", monc->cur_mon);
144 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
145 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
146 ceph_con_open(monc->con,
147 &monc->monmap->mon_inst[monc->cur_mon].addr);
148
149 /* initiate authentication handshake */
150 ret = ceph_auth_build_hello(monc->auth,
151 monc->m_auth->front.iov_base,
152 monc->m_auth->front_max);
153 __send_prepared_auth_request(monc, ret);
154 } else {
155 dout("open_session mon%d already open\n", monc->cur_mon);
156 }
157 return 0;
158}
159
160static bool __sub_expired(struct ceph_mon_client *monc)
161{
162 return time_after_eq(jiffies, monc->sub_renew_after);
163}
164
165/*
166 * Reschedule delayed work timer.
167 */
168static void __schedule_delayed(struct ceph_mon_client *monc)
169{
170 unsigned delay;
171
172 if (monc->cur_mon < 0 || __sub_expired(monc))
173 delay = 10 * HZ;
174 else
175 delay = 20 * HZ;
176 dout("__schedule_delayed after %u\n", delay);
177 schedule_delayed_work(&monc->delayed_work, delay);
178}
179
180/*
181 * Send subscribe request for mdsmap and/or osdmap.
182 */
183static void __send_subscribe(struct ceph_mon_client *monc)
184{
185 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
186 (unsigned)monc->sub_sent, __sub_expired(monc),
187 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg;
191 struct ceph_mon_subscribe_item *i;
192 void *p, *end;
193
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (IS_ERR(msg))
196 return;
197
198 p = msg->front.iov_base;
199 end = p + msg->front.iov_len;
200
201 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap);
203 if (monc->want_next_osdmap) {
204 dout("__send_subscribe to 'osdmap' %u\n",
205 (unsigned)monc->have_osdmap);
206 ceph_encode_32(&p, 3);
207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap);
210 i->onetime = 1;
211 p += sizeof(*i);
212 monc->want_next_osdmap = 2; /* requested */
213 } else {
214 ceph_encode_32(&p, 2);
215 }
216 ceph_encode_string(&p, end, "mdsmap", 6);
217 i = p;
218 i->have = cpu_to_le64(monc->have_mdsmap);
219 i->onetime = 0;
220 p += sizeof(*i);
221 ceph_encode_string(&p, end, "monmap", 6);
222 i = p;
223 i->have = 0;
224 i->onetime = 0;
225 p += sizeof(*i);
226
227 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg);
230
231 monc->sub_sent = jiffies | 1; /* never 0 */
232 }
233}
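The payload assembled above follows a simple pattern: a 32-bit item count, then one length-prefixed map name plus a subscribe item per entry. The layout below is inferred from the encode calls, not from a spec:

    /*
     * assumed wire layout of the subscribe message body:
     *
     *   __le32 num_items;                  (2 or 3 above)
     *   repeated num_items times:
     *     __le32 name_len;
     *     char   name[name_len];           ("osdmap", "mdsmap", "monmap")
     *     __le64 have;                     (map epoch we already hold)
     *     __u8   onetime;                  (1 = send only the next map)
     */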
234
235static void handle_subscribe_ack(struct ceph_mon_client *monc,
236 struct ceph_msg *msg)
237{
238 unsigned seconds;
239 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
240
241 if (msg->front.iov_len < sizeof(*h))
242 goto bad;
243 seconds = le32_to_cpu(h->duration);
244
245 mutex_lock(&monc->mutex);
246 if (monc->hunting) {
247 pr_info("mon%d %s session established\n",
248 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
249 monc->hunting = false;
250 }
251 dout("handle_subscribe_ack after %d seconds\n", seconds);
252 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
253 monc->sub_sent = 0;
254 mutex_unlock(&monc->mutex);
255 return;
256bad:
257 pr_err("got corrupt subscribe-ack msg\n");
258 ceph_msg_dump(msg);
259}
260
261/*
262 * Keep track of which maps we have
263 */
264int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
265{
266 mutex_lock(&monc->mutex);
267 monc->have_mdsmap = got;
268 mutex_unlock(&monc->mutex);
269 return 0;
270}
271
272int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
273{
274 mutex_lock(&monc->mutex);
275 monc->have_osdmap = got;
276 monc->want_next_osdmap = 0;
277 mutex_unlock(&monc->mutex);
278 return 0;
279}
280
281/*
282 * Register interest in the next osdmap
283 */
284void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
285{
286 dout("request_next_osdmap have %u\n", monc->have_osdmap);
287 mutex_lock(&monc->mutex);
288 if (!monc->want_next_osdmap)
289 monc->want_next_osdmap = 1;
290 if (monc->want_next_osdmap < 2)
291 __send_subscribe(monc);
292 mutex_unlock(&monc->mutex);
293}
294
295/*
296 * Open a session with a monitor, creating the connection first if needed.
297 */
298int ceph_monc_open_session(struct ceph_mon_client *monc)
299{
300 if (!monc->con) {
301 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
302 if (!monc->con)
303 return -ENOMEM;
304 ceph_con_init(monc->client->msgr, monc->con);
305 monc->con->private = monc;
306 monc->con->ops = &mon_con_ops;
307 }
308
309 mutex_lock(&monc->mutex);
310 __open_session(monc);
311 __schedule_delayed(monc);
312 mutex_unlock(&monc->mutex);
313 return 0;
314}
315
316/*
317 * The monitor responds with a mount ack to indicate mount success. The
318 * included client ticket allows the client to talk to MDSs and OSDs.
319 */
320static void ceph_monc_handle_map(struct ceph_mon_client *monc,
321 struct ceph_msg *msg)
322{
323 struct ceph_client *client = monc->client;
324 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
325 void *p, *end;
326
327 mutex_lock(&monc->mutex);
328
329 dout("handle_monmap\n");
330 p = msg->front.iov_base;
331 end = p + msg->front.iov_len;
332
333 monmap = ceph_monmap_decode(p, end);
334 if (IS_ERR(monmap)) {
335 pr_err("problem decoding monmap, %d\n",
336 (int)PTR_ERR(monmap));
337 goto out;
338 }
339
340 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
341 kfree(monmap);
342 goto out;
343 }
344
345 client->monc.monmap = monmap;
346 kfree(old);
347
348out:
349 mutex_unlock(&monc->mutex);
350 wake_up(&client->auth_wq);
351}
352
353/*
354 * statfs
355 */
356static struct ceph_mon_statfs_request *__lookup_statfs(
357 struct ceph_mon_client *monc, u64 tid)
358{
359 struct ceph_mon_statfs_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node;
361
362 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node);
364 if (tid < req->tid)
365 n = n->rb_left;
366 else if (tid > req->tid)
367 n = n->rb_right;
368 else
369 return req;
370 }
371 return NULL;
372}
373
374static void __insert_statfs(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new)
376{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node;
378 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL;
380
381 while (*p) {
382 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node);
384 if (new->tid < req->tid)
385 p = &(*p)->rb_left;
386 else if (new->tid > req->tid)
387 p = &(*p)->rb_right;
388 else
389 BUG();
390 }
391
392 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree);
394}
395
396static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg)
398{
399 struct ceph_mon_statfs_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid;
402
403 if (msg->front.iov_len != sizeof(*reply))
404 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407
408 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid);
410 if (req) {
411 *req->buf = reply->st;
412 req->result = 0;
413 }
414 mutex_unlock(&monc->mutex);
415 if (req)
416 complete(&req->completion);
417 return;
418
419bad:
420 pr_err("corrupt statfs reply, no tid\n");
421 ceph_msg_dump(msg);
422}
423
424/*
425 * (re)send a statfs request
426 */
427static int send_statfs(struct ceph_mon_client *monc,
428 struct ceph_mon_statfs_request *req)
429{
430 struct ceph_msg *msg;
431 struct ceph_mon_statfs *h;
432
433 dout("send_statfs tid %llu\n", req->tid);
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
435 if (IS_ERR(msg))
436 return PTR_ERR(msg);
437 req->request = msg;
438 msg->hdr.tid = cpu_to_le64(req->tid);
439 h = msg->front.iov_base;
440 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463
464 /* register request */
465 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid;
467 req.last_attempt = jiffies;
468 req.delay = BASE_DELAY_INTERVAL;
469 __insert_statfs(monc, &req);
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex);
472
473 /* send request and wait */
474 err = send_statfs(monc, &req);
475 if (!err)
476 err = wait_for_completion_interruptible(&req.completion);
477
478 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree);
480 monc->num_statfs_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex);
483
484 if (!err)
485 err = req.result;
486 return err;
487}
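A hedged sketch of a caller, e.g. a statfs(2) handler; the translation into a struct kstatfs is elided since the ceph_statfs fields are not shown here:

    static int example_statfs(struct ceph_client *client)
    {
            struct ceph_statfs st;
            int err;

            err = ceph_monc_do_statfs(&client->monc, &st);
            if (err < 0)
                    return err;     /* send error, or interrupted wait */
            /* ... copy st into a struct kstatfs for the VFS ... */
            return 0;
    }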
488
489/*
490 * Resend pending statfs requests.
491 */
492static void __resend_statfs(struct ceph_mon_client *monc)
493{
494 struct ceph_mon_statfs_request *req;
495 struct rb_node *p;
496
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node);
499 send_statfs(monc, req);
500 }
501}
502
503/*
504 * Delayed work. If we haven't mounted yet, retry. Otherwise,
505 * renew/retry subscription as needed (in case it is timing out, or we
506 * got an ENOMEM). And keep the monitor connection alive.
507 */
508static void delayed_work(struct work_struct *work)
509{
510 struct ceph_mon_client *monc =
511 container_of(work, struct ceph_mon_client, delayed_work.work);
512
513 dout("monc delayed_work\n");
514 mutex_lock(&monc->mutex);
515 if (monc->hunting) {
516 __close_session(monc);
517 __open_session(monc); /* continue hunting */
518 } else {
519 ceph_con_keepalive(monc->con);
520
521 __validate_auth(monc);
522
523 if (monc->auth->ops->is_authenticated(monc->auth))
524 __send_subscribe(monc);
525 }
526 __schedule_delayed(monc);
527 mutex_unlock(&monc->mutex);
528}
529
530/*
531 * On startup, we build a temporary monmap populated with the IPs
532 * provided by mount(2).
533 */
534static int build_initial_monmap(struct ceph_mon_client *monc)
535{
536 struct ceph_mount_args *args = monc->client->mount_args;
537 struct ceph_entity_addr *mon_addr = args->mon_addr;
538 int num_mon = args->num_mon;
539 int i;
540
541 /* build initial monmap */
542 monc->monmap = kzalloc(sizeof(*monc->monmap) +
543 num_mon*sizeof(monc->monmap->mon_inst[0]),
544 GFP_KERNEL);
545 if (!monc->monmap)
546 return -ENOMEM;
547 for (i = 0; i < num_mon; i++) {
548 monc->monmap->mon_inst[i].addr = mon_addr[i];
549 monc->monmap->mon_inst[i].addr.nonce = 0;
550 monc->monmap->mon_inst[i].name.type =
551 CEPH_ENTITY_TYPE_MON;
552 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
553 }
554 monc->monmap->num_mon = num_mon;
555 monc->have_fsid = false;
556
557 /* release addr memory */
558 kfree(args->mon_addr);
559 args->mon_addr = NULL;
560 args->num_mon = 0;
561 return 0;
562}
563
564int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
565{
566 int err = 0;
567
568 dout("init\n");
569 memset(monc, 0, sizeof(*monc));
570 monc->client = cl;
571 monc->monmap = NULL;
572 mutex_init(&monc->mutex);
573
574 err = build_initial_monmap(monc);
575 if (err)
576 goto out;
577
578 monc->con = NULL;
579
580 /* authentication */
581 monc->auth = ceph_auth_init(cl->mount_args->name,
582 cl->mount_args->secret);
583 if (IS_ERR(monc->auth))
584 return PTR_ERR(monc->auth);
585 monc->auth->want_keys =
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588
589 /* msg pools */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
591 sizeof(struct ceph_mon_subscribe_ack), 1, false);
592 if (err < 0)
593 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
595 sizeof(struct ceph_mon_statfs_reply), 0, false);
596 if (err < 0)
597 goto out_pool1;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
599 if (err < 0)
600 goto out_pool2;
601
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
603 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) {
605 err = PTR_ERR(monc->m_auth);
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609
610 monc->cur_mon = -1;
611 monc->hunting = true;
612 monc->sub_renew_after = jiffies;
613 monc->sub_sent = 0;
614
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0;
618 monc->last_tid = 0;
619
620 monc->have_mdsmap = 0;
621 monc->have_osdmap = 0;
622 monc->want_next_osdmap = 1;
623 return 0;
624
625out_pool3:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
627out_pool2:
628 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
629out_pool1:
630 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
631out_monmap:
632 kfree(monc->monmap);
633out:
634 return err;
635}
636
637void ceph_monc_stop(struct ceph_mon_client *monc)
638{
639 dout("stop\n");
640 cancel_delayed_work_sync(&monc->delayed_work);
641
642 mutex_lock(&monc->mutex);
643 __close_session(monc);
644 if (monc->con) {
645 monc->con->private = NULL;
646 monc->con->ops->put(monc->con);
647 monc->con = NULL;
648 }
649 mutex_unlock(&monc->mutex);
650
651 ceph_auth_destroy(monc->auth);
652
653 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
657
658 kfree(monc->monmap);
659}
660
661static void handle_auth_reply(struct ceph_mon_client *monc,
662 struct ceph_msg *msg)
663{
664 int ret;
665
666 mutex_lock(&monc->mutex);
667 monc->pending_auth = 0;
668 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
669 msg->front.iov_len,
670 monc->m_auth->front.iov_base,
671 monc->m_auth->front_max);
672 if (ret < 0) {
673 monc->client->auth_err = ret;
674 wake_up(&monc->client->auth_wq);
675 } else if (ret > 0) {
676 __send_prepared_auth_request(monc, ret);
677 } else if (monc->auth->ops->is_authenticated(monc->auth)) {
678 dout("authenticated, starting session\n");
679
680 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
681 monc->client->msgr->inst.name.num = monc->auth->global_id;
682
683 __send_subscribe(monc);
684 __resend_statfs(monc);
685 }
686 mutex_unlock(&monc->mutex);
687}
688
689static int __validate_auth(struct ceph_mon_client *monc)
690{
691 int ret;
692
693 if (monc->pending_auth)
694 return 0;
695
696 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
697 monc->m_auth->front_max);
698 if (ret <= 0)
699 return ret; /* either an error, or no need to authenticate */
700 __send_prepared_auth_request(monc, ret);
701 return 0;
702}
703
704int ceph_monc_validate_auth(struct ceph_mon_client *monc)
705{
706 int ret;
707
708 mutex_lock(&monc->mutex);
709 ret = __validate_auth(monc);
710 mutex_unlock(&monc->mutex);
711 return ret;
712}
713
714/*
715 * handle incoming message
716 */
717static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
718{
719 struct ceph_mon_client *monc = con->private;
720 int type = le16_to_cpu(msg->hdr.type);
721
722 if (!monc)
723 return;
724
725 switch (type) {
726 case CEPH_MSG_AUTH_REPLY:
727 handle_auth_reply(monc, msg);
728 break;
729
730 case CEPH_MSG_MON_SUBSCRIBE_ACK:
731 handle_subscribe_ack(monc, msg);
732 break;
733
734 case CEPH_MSG_STATFS_REPLY:
735 handle_statfs_reply(monc, msg);
736 break;
737
738 case CEPH_MSG_MON_MAP:
739 ceph_monc_handle_map(monc, msg);
740 break;
741
742 case CEPH_MSG_MDS_MAP:
743 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
744 break;
745
746 case CEPH_MSG_OSD_MAP:
747 ceph_osdc_handle_map(&monc->client->osdc, msg);
748 break;
749
750 default:
751 pr_err("received unknown message type %d %s\n", type,
752 ceph_msg_type_name(type));
753 }
754 ceph_msg_put(msg);
755}
756
757/*
758 * Allocate memory for incoming message
759 */
760static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
761 struct ceph_msg_header *hdr,
762 int *skip)
763{
764 struct ceph_mon_client *monc = con->private;
765 int type = le16_to_cpu(hdr->type);
766 int front_len = le32_to_cpu(hdr->front_len);
767 struct ceph_msg *m = NULL;
768
769 *skip = 0;
770
771 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
774 break;
775 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
777 break;
778 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
780 break;
781 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL);
785 break;
786 }
787
788 if (!m) {
789 pr_info("alloc_msg unknown type %d\n", type);
790 *skip = 1;
791 }
792 return m;
793}
794
795/*
796 * If the monitor connection resets, pick a new monitor and resubmit
797 * any pending requests.
798 */
799static void mon_fault(struct ceph_connection *con)
800{
801 struct ceph_mon_client *monc = con->private;
802
803 if (!monc)
804 return;
805
806 dout("mon_fault\n");
807 mutex_lock(&monc->mutex);
808 if (!con->private)
809 goto out;
810
811 if (monc->con && !monc->hunting)
812 pr_info("mon%d %s session lost, "
813 "hunting for new mon\n", monc->cur_mon,
814 pr_addr(&monc->con->peer_addr.in_addr));
815
816 __close_session(monc);
817 if (!monc->hunting) {
818 /* start hunting */
819 monc->hunting = true;
820 __open_session(monc);
821 } else {
822 /* already hunting, let's wait a bit */
823 __schedule_delayed(monc);
824 }
825out:
826 mutex_unlock(&monc->mutex);
827}
828
829static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get,
831 .put = ceph_con_put,
832 .dispatch = dispatch,
833 .fault = mon_fault,
834 .alloc_msg = mon_alloc_msg,
835};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..b958ad5afa06
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/rbtree.h>
6
7#include "messenger.h"
8#include "msgpool.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_statfs_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * statfs() is done a bit differently because we need to get data back
44 * to the caller
45 */
46struct ceph_mon_statfs_request {
47 u64 tid;
48 struct rb_node node;
49 int result;
50 struct ceph_statfs *buf;
51 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */
54};
55
56struct ceph_mon_client {
57 struct ceph_client *client;
58 struct ceph_monmap *monmap;
59
60 struct mutex mutex;
61 struct delayed_work delayed_work;
62
63 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth;
65 int pending_auth;
66
67 bool hunting;
68 int cur_mon; /* last monitor I contacted */
69 unsigned long sub_sent, sub_renew_after;
70 struct ceph_connection *con;
71 bool have_fsid;
72
73 /* msg pools */
74 struct ceph_msgpool msgpool_subscribe_ack;
75 struct ceph_msgpool msgpool_statfs_reply;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid;
82
83 /* mds/osd map */
84 int want_next_osdmap; /* 1 = want, 2 = want+asked */
85 u32 have_osdmap, have_mdsmap;
86
87#ifdef CONFIG_DEBUG_FS
88 struct dentry *debugfs_file;
89#endif
90};
91
92extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
93extern int ceph_monmap_contains(struct ceph_monmap *m,
94 struct ceph_entity_addr *addr);
95
96extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
97extern void ceph_monc_stop(struct ceph_mon_client *monc);
98
99/*
100 * The model here is to indicate that we need a new map of at least
101 * epoch @want, and also call in when we receive a map. We will
102 * periodically rerequest the map from the monitor cluster until we
103 * get what we want.
104 */
105extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
106extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
107
108extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
109
110extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
111 struct ceph_statfs *buf);
112
113extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116
117
118
119#endif
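
Putting the declarations above together, a consumer follows a two-call pattern for maps, with statfs as the synchronous exception. A hedged sketch, using the call sites in osd_client.c later in this patch as the template:

        struct ceph_statfs st;
        int err;

        /* ask for a newer osdmap, then acknowledge whatever we installed */
        ceph_monc_request_next_osdmap(&client->monc);
        /* ... later, after decoding and installing the new map ... */
        ceph_monc_got_osdmap(&client->monc, osdmap->epoch);

        /* statfs returns data directly to the caller and blocks for it */
        err = ceph_monc_do_statfs(&client->monc, &st);
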
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..ca3b44a89f2d
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10/*
11 * We use msg pools to preallocate memory for messages we expect to
12 * receive over the wire, to avoid getting ourselves into OOM
13 * conditions at unexpected times. We use a few different
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31
32
33/*
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{
38 struct ceph_msg *msg;
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61}
62
63int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking)
65{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs);
72 pool->num = 0;
73 pool->min = min;
74 pool->blocking = blocking;
75 init_waitqueue_head(&pool->wait);
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81}
82
83void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{
85 dout("msgpool_destroy %p\n", pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90}
91
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
93{
94 int ret;
95
96 spin_lock(&pool->lock);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1);
113
114 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 }
129
130 while (1) {
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163}
164
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{
167 spin_lock(&pool->lock);
168 if (pool->num < pool->min) {
169 /* reset msg front_len; user may have changed it */
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172
173 kref_set(&msg->kref, 1); /* retake a single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
1#ifndef _FS_CEPH_MSGPOOL_H
2#define _FS_CEPH_MSGPOOL_H
3
4#include "messenger.h"
5
6/*
7 * we use memory pools for preallocating messages we may receive, to
8 * avoid unexpected OOM conditions.
9 */
10struct ceph_msgpool {
11 spinlock_t lock;
12 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17};
18
19extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int min, bool blocking);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
26
27#endif
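
A hedged sketch of the pool lifecycle, following the pattern ceph_osdc_init() uses for its op pools later in this patch (the sizes here are illustrative):

        struct ceph_msgpool pool;
        struct ceph_msg *msg;
        int err;

        /* keep 10 messages with 512-byte fronts on hand; block when empty */
        err = ceph_msgpool_init(&pool, 512, 10, true);
        if (err < 0)
                return err;

        msg = ceph_msgpool_get(&pool, 0);  /* 0 means "use the pool front_len" */
        /* ... fill in, send, and consume the message ... */
        ceph_msgpool_put(&pool, msg);      /* refill the pool, or free if full */

        ceph_msgpool_destroy(&pool);
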
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..8aaab414f3f8
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * TCP connection banner. It includes a protocol version, and should be
21 * adjusted whenever the wire protocol changes. Try to keep this string
22 * length constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
30 * Comparator returns -1, 0, or 1.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
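
A worked example of the rollover behavior: the subtraction is evaluated in 32 bits, so a sequence number just past the wrap still compares as newer (values illustrative):

        ceph_seq_cmp(1, 0xffffffff);   /* == 2: seq 1 is newer across the wrap */
        ceph_seq_cmp(0xffffffff, 1);   /* == -2: seq 0xffffffff is older */
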
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20
55
56#define CEPH_ENTITY_TYPE_ANY 0xFF
57
58extern const char *ceph_entity_type_name(int type);
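
ceph_entity_type_name() maps the type to a short string such as "mon" or "osd"; a hypothetical helper (not part of this patch) showing how a logical name like 'osd3' would be rendered:

        static void format_entity_name(const struct ceph_entity_name *n,
                                       char *buf, size_t len)
        {
                snprintf(buf, len, "%s%llu", ceph_entity_type_name(n->type),
                         (unsigned long long)le64_to_cpu(n->num));
        }
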
59
60/*
61 * entity_addr -- network address
62 */
63struct ceph_entity_addr {
64 __le32 type;
65 __le32 nonce; /* unique id for process (e.g. pid) */
66 struct sockaddr_storage in_addr;
67} __attribute__ ((packed));
68
69struct ceph_entity_inst {
70 struct ceph_entity_name name;
71 struct ceph_entity_addr addr;
72} __attribute__ ((packed));
73
74
75/* used by message exchange protocol */
76#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
77#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
78#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
79 incoming connection */
80#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
81 with higher cseq */
82#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
83 with higher gseq */
84#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
85#define CEPH_MSGR_TAG_MSG 7 /* message */
86#define CEPH_MSGR_TAG_ACK 8 /* message ack */
87#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
88#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
89#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
90#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
91
92
93/*
94 * connection negotiation
95 */
96struct ceph_msg_connect {
97 __le64 features; /* supported feature bits */
98 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
99 __le32 global_seq; /* count connections initiated by this host */
100 __le32 connect_seq; /* count connections initiated in this session */
101 __le32 protocol_version;
102 __le32 authorizer_protocol;
103 __le32 authorizer_len;
104 __u8 flags; /* CEPH_MSG_CONNECT_* */
105} __attribute__ ((packed));
106
107struct ceph_msg_connect_reply {
108 __u8 tag;
109 __le64 features; /* feature bits for this session */
110 __le32 global_seq;
111 __le32 connect_seq;
112 __le32 protocol_version;
113 __le32 authorizer_len;
114 __u8 flags;
115} __attribute__ ((packed));
116
117#define CEPH_MSG_CONNECT_LOSSY 1 /* messages I send may be safely dropped */
118
119
120/*
121 * message header
122 */
123struct ceph_msg_header {
124 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */
126 __le16 type; /* message type */
127 __le16 priority; /* priority. higher value == higher priority */
128 __le16 version; /* version of message encoding */
129
130 __le32 front_len; /* bytes in main payload */
131 __le32 middle_len;/* bytes in middle payload */
132 __le32 data_len; /* bytes of data payload */
133 __le16 data_off; /* sender: include full offset;
134 receiver: mask against ~PAGE_MASK */
135
136 struct ceph_entity_inst src, orig_src;
137 __le32 reserved;
138 __le32 crc; /* header crc32c */
139} __attribute__ ((packed));
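
The trailing crc presumably covers every field before it; assuming the messenger uses the kernel's crc32c library, the computation would look roughly like:

        /* sketch: crc32c over the header fields preceding the crc itself */
        hdr->crc = cpu_to_le32(crc32c(0, hdr,
                                      offsetof(struct ceph_msg_header, crc)));
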
140
141#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196
144#define CEPH_MSG_PRIO_HIGHEST 255
145
146/*
147 * follows data payload
148 */
149struct ceph_msg_footer {
150 __le32 front_crc, middle_crc, data_crc;
151 __u8 flags;
152} __attribute__ ((packed));
153
154#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
155#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
156
157
158#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..c7b4dedaace6
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1550 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
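
A worked example of the object naming used above, with illustrative values: the oid is the hex inode number, a dot, and the zero-padded object (block) number.

        char oid[40];

        sprintf(oid, "%llx.%08llx", 0x10000000001ULL, 2ULL);
        /* oid == "10000000001.00000002", the file's third object */
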
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
168 if (IS_ERR(msg)) {
169 ceph_osdc_put_request(req);
170 return ERR_CAST(msg);
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
182 if (IS_ERR(msg)) {
183 ceph_osdc_put_request(req);
184 return ERR_CAST(msg);
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref))
365 kfree(osd);
366}
367
368/*
369 * remove an osd from our map
370 */
371static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
372{
373 dout("__remove_osd %p\n", osd);
374 BUG_ON(!list_empty(&osd->o_requests));
375 rb_erase(&osd->o_node, &osdc->osds);
376 list_del_init(&osd->o_osd_lru);
377 ceph_con_close(&osd->o_con);
378 put_osd(osd);
379}
380
381static void __move_osd_to_lru(struct ceph_osd_client *osdc,
382 struct ceph_osd *osd)
383{
384 dout("__move_osd_to_lru %p\n", osd);
385 BUG_ON(!list_empty(&osd->o_osd_lru));
386 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
387 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
388}
389
390static void __remove_osd_from_lru(struct ceph_osd *osd)
391{
392 dout("__remove_osd_from_lru %p\n", osd);
393 if (!list_empty(&osd->o_osd_lru))
394 list_del_init(&osd->o_osd_lru);
395}
396
397static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
398{
399 struct ceph_osd *osd, *nosd;
400
401 dout("remove_old_osds %p\n", osdc);
402 mutex_lock(&osdc->request_mutex);
403 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
404 if (!remove_all && time_before(jiffies, osd->lru_ttl))
405 break;
406 __remove_osd(osdc, osd);
407 }
408 mutex_unlock(&osdc->request_mutex);
409}
410
411/*
412 * reset osd connect
413 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{
416 struct ceph_osd_request *req;
417 int ret = 0;
418
419 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
420 if (list_empty(&osd->o_requests)) {
421 __remove_osd(osdc, osd);
422 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
423 &osd->o_con.peer_addr,
424 sizeof(osd->o_con.peer_addr)) == 0 &&
425 !ceph_con_opened(&osd->o_con)) {
426 dout(" osd addr hasn't changed and connection never opened,"
427 " letting msgr retry\n");
428 /* touch each r_stamp for handle_timeout()'s benefit */
429 list_for_each_entry(req, &osd->o_requests, r_osd_item)
430 req->r_stamp = jiffies;
431 ret = -EAGAIN;
432 } else {
433 ceph_con_close(&osd->o_con);
434 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
435 osd->o_incarnation++;
436 }
437 return ret;
438}
439
440static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
441{
442 struct rb_node **p = &osdc->osds.rb_node;
443 struct rb_node *parent = NULL;
444 struct ceph_osd *osd = NULL;
445
446 while (*p) {
447 parent = *p;
448 osd = rb_entry(parent, struct ceph_osd, o_node);
449 if (new->o_osd < osd->o_osd)
450 p = &(*p)->rb_left;
451 else if (new->o_osd > osd->o_osd)
452 p = &(*p)->rb_right;
453 else
454 BUG();
455 }
456
457 rb_link_node(&new->o_node, parent, p);
458 rb_insert_color(&new->o_node, &osdc->osds);
459}
460
461static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
462{
463 struct ceph_osd *osd;
464 struct rb_node *n = osdc->osds.rb_node;
465
466 while (n) {
467 osd = rb_entry(n, struct ceph_osd, o_node);
468 if (o < osd->o_osd)
469 n = n->rb_left;
470 else if (o > osd->o_osd)
471 n = n->rb_right;
472 else
473 return osd;
474 }
475 return NULL;
476}
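
get_osd()'s atomic_inc_not_zero makes lookups safe against a racing put_osd() dropping the last reference. A hedged sketch of the resulting lookup-and-ref pattern (the helper name is invented):

        static struct ceph_osd *lookup_and_get_osd(struct ceph_osd_client *osdc,
                                                   int o)
        {
                struct ceph_osd *osd;

                mutex_lock(&osdc->request_mutex);
                osd = __lookup_osd(osdc, o);
                if (osd && !get_osd(osd))
                        osd = NULL;     /* the last ref was already dropped */
                mutex_unlock(&osdc->request_mutex);
                return osd;
        }
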
477
478static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
479{
480 schedule_delayed_work(&osdc->timeout_work,
481 osdc->client->mount_args->osd_keepalive_timeout * HZ);
482}
483
484static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
485{
486 cancel_delayed_work(&osdc->timeout_work);
487}
488
489/*
490 * Register request, assign tid. If this is the first request, set up
491 * the timeout event.
492 */
493static void register_request(struct ceph_osd_client *osdc,
494 struct ceph_osd_request *req)
495{
496 mutex_lock(&osdc->request_mutex);
497 req->r_tid = ++osdc->last_tid;
498 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
499 INIT_LIST_HEAD(&req->r_req_lru_item);
500
501 dout("register_request %p tid %lld\n", req, req->r_tid);
502 __insert_request(osdc, req);
503 ceph_osdc_get_request(req);
504 osdc->num_requests++;
505
506 if (osdc->num_requests == 1) {
507 dout(" first request, scheduling timeout\n");
508 __schedule_osd_timeout(osdc);
509 }
510 mutex_unlock(&osdc->request_mutex);
511}
512
513/*
514 * called under osdc->request_mutex
515 */
516static void __unregister_request(struct ceph_osd_client *osdc,
517 struct ceph_osd_request *req)
518{
519 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
520 rb_erase(&req->r_node, &osdc->requests);
521 osdc->num_requests--;
522
523 if (req->r_osd) {
524 /* make sure the original request isn't in flight. */
525 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
526
527 list_del_init(&req->r_osd_item);
528 if (list_empty(&req->r_osd->o_requests))
529 __move_osd_to_lru(osdc, req->r_osd);
530 req->r_osd = NULL;
531 }
532
533 ceph_osdc_put_request(req);
534
535 list_del_init(&req->r_req_lru_item);
536 if (osdc->num_requests == 0) {
537 dout(" no requests, canceling timeout\n");
538 __cancel_osd_timeout(osdc);
539 }
540}
541
542/*
543 * Cancel a previously queued request message
544 */
545static void __cancel_request(struct ceph_osd_request *req)
546{
547 if (req->r_sent) {
548 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
549 req->r_sent = 0;
550 }
551 list_del_init(&req->r_req_lru_item);
552}
553
554/*
555 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
556 * (as needed), and set the request r_osd appropriately. If there is
557 * no up osd, set r_osd to NULL.
558 *
559 * Return 0 if unchanged, 1 if changed, or negative on error.
560 *
561 * Caller should hold map_sem for read and request_mutex.
562 */
563static int __map_osds(struct ceph_osd_client *osdc,
564 struct ceph_osd_request *req)
565{
566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
567 struct ceph_pg pgid;
568 int o = -1;
569 int err;
570
571 dout("map_osds %p tid %lld\n", req, req->r_tid);
572 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
573 &req->r_file_layout, osdc->osdmap);
574 if (err)
575 return err;
576 pgid = reqhead->layout.ol_pgid;
577 req->r_pgid = pgid;
578
579 o = ceph_calc_pg_primary(osdc->osdmap, pgid);
580
581 if ((req->r_osd && req->r_osd->o_osd == o &&
582 req->r_sent >= req->r_osd->o_incarnation) ||
583 (req->r_osd == NULL && o == -1))
584 return 0; /* no change */
585
586 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
587 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
588 req->r_osd ? req->r_osd->o_osd : -1);
589
590 if (req->r_osd) {
591 __cancel_request(req);
592 list_del_init(&req->r_osd_item);
593 req->r_osd = NULL;
594 }
595
596 req->r_osd = __lookup_osd(osdc, o);
597 if (!req->r_osd && o >= 0) {
598 err = -ENOMEM;
599 req->r_osd = create_osd(osdc);
600 if (!req->r_osd)
601 goto out;
602
603 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
604 req->r_osd->o_osd = o;
605 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
606 __insert_osd(osdc, req->r_osd);
607
608 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
609 }
610
611 if (req->r_osd) {
612 __remove_osd_from_lru(req->r_osd);
613 list_add(&req->r_osd_item, &req->r_osd->o_requests);
614 }
615 err = 1; /* osd changed */
616
617out:
618 return err;
619}
620
621/*
622 * caller should hold map_sem (for read) and request_mutex
623 */
624static int __send_request(struct ceph_osd_client *osdc,
625 struct ceph_osd_request *req)
626{
627 struct ceph_osd_request_head *reqhead;
628 int err;
629
630 err = __map_osds(osdc, req);
631 if (err < 0)
632 return err;
633 if (req->r_osd == NULL) {
634 dout("send_request %p no up osds in pg\n", req);
635 ceph_monc_request_next_osdmap(&osdc->client->monc);
636 return 0;
637 }
638
639 dout("send_request %p tid %llu to osd%d flags %d\n",
640 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
641
642 reqhead = req->r_request->front.iov_base;
643 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
644 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
645 reqhead->reassert_version = req->r_reassert_version;
646
647 req->r_stamp = jiffies;
648 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
649
650 ceph_msg_get(req->r_request); /* send consumes a ref */
651 ceph_con_send(&req->r_osd->o_con, req->r_request);
652 req->r_sent = req->r_osd->o_incarnation;
653 return 0;
654}
655
656/*
657 * Timeout callback, called every N seconds when 1 or more osd
658 * requests have been active for more than N seconds. When this
659 * happens, we ping all OSDs whose requests have timed out, to
660 * ensure any communications channel reset is detected. Reset the
661 * request timeouts another N seconds in the future as we go.
662 * Reschedule the timeout event another N seconds in future (unless
663 * there are no open requests).
664 */
665static void handle_timeout(struct work_struct *work)
666{
667 struct ceph_osd_client *osdc =
668 container_of(work, struct ceph_osd_client, timeout_work.work);
669 struct ceph_osd_request *req, *last_req = NULL;
670 struct ceph_osd *osd;
671 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
672 unsigned long keepalive =
673 osdc->client->mount_args->osd_keepalive_timeout * HZ;
674 unsigned long last_stamp = 0;
675 struct rb_node *p;
676 struct list_head slow_osds;
677
678 dout("timeout\n");
679 down_read(&osdc->map_sem);
680
681 ceph_monc_request_next_osdmap(&osdc->client->monc);
682
683 mutex_lock(&osdc->request_mutex);
684 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
685 req = rb_entry(p, struct ceph_osd_request, r_node);
686
687 if (req->r_resend) {
688 int err;
689
690 dout("osdc resending prev failed %lld\n", req->r_tid);
691 err = __send_request(osdc, req);
692 if (err)
693 dout("osdc failed again on %lld\n", req->r_tid);
694 else
695 req->r_resend = false;
696 continue;
697 }
698 }
699
700 /*
701 * reset osds that appear to be _really_ unresponsive. this
702 * is a failsafe measure; we really shouldn't be getting to
703 * this point if the system is working properly. the monitors
704 * should mark the osd as failed and we should find out about
705 * it from an updated osd map.
706 */
707 while (!list_empty(&osdc->req_lru)) {
708 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
709 r_req_lru_item);
710
711 if (time_before(jiffies, req->r_stamp + timeout))
712 break;
713
714 BUG_ON(req == last_req && req->r_stamp == last_stamp);
715 last_req = req;
716 last_stamp = req->r_stamp;
717
718 osd = req->r_osd;
719 BUG_ON(!osd);
720 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
721 req->r_tid, osd->o_osd);
722 __kick_requests(osdc, osd);
723 }
724
725 /*
726 * ping osds that are a bit slow. this ensures that if there
727 * is a break in the TCP connection we will notice, and reopen
728 * a connection with that osd (from the fault callback).
729 */
730 INIT_LIST_HEAD(&slow_osds);
731 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
732 if (time_before(jiffies, req->r_stamp + keepalive))
733 break;
734
735 osd = req->r_osd;
736 BUG_ON(!osd);
737 dout(" tid %llu is slow, will send keepalive on osd%d\n",
738 req->r_tid, osd->o_osd);
739 list_move_tail(&osd->o_keepalive_item, &slow_osds);
740 }
741 while (!list_empty(&slow_osds)) {
742 osd = list_entry(slow_osds.next, struct ceph_osd,
743 o_keepalive_item);
744 list_del_init(&osd->o_keepalive_item);
745 ceph_con_keepalive(&osd->o_con);
746 }
747
748 __schedule_osd_timeout(osdc);
749 mutex_unlock(&osdc->request_mutex);
750
751 up_read(&osdc->map_sem);
752}
753
754static void handle_osds_timeout(struct work_struct *work)
755{
756 struct ceph_osd_client *osdc =
757 container_of(work, struct ceph_osd_client,
758 osds_timeout_work.work);
759 unsigned long delay =
760 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
761
762 dout("osds timeout\n");
763 down_read(&osdc->map_sem);
764 remove_old_osds(osdc, 0);
765 up_read(&osdc->map_sem);
766
767 schedule_delayed_work(&osdc->osds_timeout_work,
768 round_jiffies_relative(delay));
769}
770
771/*
772 * handle osd op reply. either call the callback if it is specified,
773 * or do the completion to wake up the waiting thread.
774 */
775static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
776 struct ceph_connection *con)
777{
778 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
779 struct ceph_osd_request *req;
780 u64 tid;
781 int numops, object_len, flags;
782
783 tid = le64_to_cpu(msg->hdr.tid);
784 if (msg->front.iov_len < sizeof(*rhead))
785 goto bad;
786 numops = le32_to_cpu(rhead->num_ops);
787 object_len = le32_to_cpu(rhead->object_len);
788 if (msg->front.iov_len != sizeof(*rhead) + object_len +
789 numops * sizeof(struct ceph_osd_op))
790 goto bad;
791 dout("handle_reply %p tid %llu\n", msg, tid);
792
793 /* lookup */
794 mutex_lock(&osdc->request_mutex);
795 req = __lookup_request(osdc, tid);
796 if (req == NULL) {
797 dout("handle_reply tid %llu dne\n", tid);
798 mutex_unlock(&osdc->request_mutex);
799 return;
800 }
801 ceph_osdc_get_request(req);
802 flags = le32_to_cpu(rhead->flags);
803
804 /*
805 * if this connection filled our message, drop our reference now, to
806 * avoid a (safe but slower) revoke later.
807 */
808 if (req->r_con_filling_msg == con && req->r_reply == msg) {
809 dout(" dropping con_filling_msg ref %p\n", con);
810 req->r_con_filling_msg = NULL;
811 ceph_con_put(con);
812 }
813
814 if (!req->r_got_reply) {
815 unsigned bytes;
816
817 req->r_result = le32_to_cpu(rhead->result);
818 bytes = le32_to_cpu(msg->hdr.data_len);
819 dout("handle_reply result %d bytes %d\n", req->r_result,
820 bytes);
821 if (req->r_result == 0)
822 req->r_result = bytes;
823
824 /* in case this is a write and we need to replay, */
825 req->r_reassert_version = rhead->reassert_version;
826
827 req->r_got_reply = 1;
828 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
829 dout("handle_reply tid %llu dup ack\n", tid);
830 mutex_unlock(&osdc->request_mutex);
831 goto done;
832 }
833
834 dout("handle_reply tid %llu flags %d\n", tid, flags);
835
836 /* either this is a read, or we got the safe response */
837 if ((flags & CEPH_OSD_FLAG_ONDISK) ||
838 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
839 __unregister_request(osdc, req);
840
841 mutex_unlock(&osdc->request_mutex);
842
843 if (req->r_callback)
844 req->r_callback(req, msg);
845 else
846 complete(&req->r_completion);
847
848 if (flags & CEPH_OSD_FLAG_ONDISK) {
849 if (req->r_safe_callback)
850 req->r_safe_callback(req, msg);
851 complete(&req->r_safe_completion); /* fsync waiter */
852 }
853
854done:
855 ceph_osdc_put_request(req);
856 return;
857
858bad:
859 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
860 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
861 (int)sizeof(*rhead));
862 ceph_msg_dump(msg);
863}
864
865
866static int __kick_requests(struct ceph_osd_client *osdc,
867 struct ceph_osd *kickosd)
868{
869 struct ceph_osd_request *req;
870 struct rb_node *p, *n;
871 int needmap = 0;
872 int err;
873
874 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
875 if (kickosd) {
876 err = __reset_osd(osdc, kickosd);
877 if (err == -EAGAIN)
878 return 1;
879 } else {
880 for (p = rb_first(&osdc->osds); p; p = n) {
881 struct ceph_osd *osd =
882 rb_entry(p, struct ceph_osd, o_node);
883
884 n = rb_next(p);
885 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
886 memcmp(&osd->o_con.peer_addr,
887 ceph_osd_addr(osdc->osdmap,
888 osd->o_osd),
889 sizeof(struct ceph_entity_addr)) != 0)
890 __reset_osd(osdc, osd);
891 }
892 }
893
894 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
895 req = rb_entry(p, struct ceph_osd_request, r_node);
896
897 if (req->r_resend) {
898 dout(" r_resend set on tid %llu\n", req->r_tid);
899 __cancel_request(req);
900 goto kick;
901 }
902 if (req->r_osd && kickosd == req->r_osd) {
903 __cancel_request(req);
904 goto kick;
905 }
906
907 err = __map_osds(osdc, req);
908 if (err == 0)
909 continue; /* no change */
910 if (err < 0) {
911 /*
912 * FIXME: really, we should set the request
913 * error and fail if this isn't a 'nofail'
914 * request, but that's a fair bit more
915 * complicated to do. So retry!
916 */
917 dout(" setting r_resend on %llu\n", req->r_tid);
918 req->r_resend = true;
919 continue;
920 }
921 if (req->r_osd == NULL) {
922 dout("tid %llu maps to no valid osd\n", req->r_tid);
923 needmap++; /* request a newer map */
924 continue;
925 }
926
927kick:
928 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
929 req->r_osd ? req->r_osd->o_osd : -1);
930 req->r_flags |= CEPH_OSD_FLAG_RETRY;
931 err = __send_request(osdc, req);
932 if (err) {
933 dout(" setting r_resend on %llu\n", req->r_tid);
934 req->r_resend = true;
935 }
936 }
937
938 return needmap;
939}
940
941/*
942 * Resubmit osd requests whose osd or osd address has changed. Request
943 * a new osd map if osds are down, or we are otherwise unable to determine
944 * how to direct a request.
945 *
946 * Close connections to down osds.
947 *
948 * If @kickosd is specified, resubmit requests for that specific osd.
949 *
950 * Caller should hold map_sem for read and request_mutex.
951 */
952static void kick_requests(struct ceph_osd_client *osdc,
953 struct ceph_osd *kickosd)
954{
955 int needmap;
956
957 mutex_lock(&osdc->request_mutex);
958 needmap = __kick_requests(osdc, kickosd);
959 mutex_unlock(&osdc->request_mutex);
960
961 if (needmap) {
962 dout("%d requests for down osds, need new map\n", needmap);
963 ceph_monc_request_next_osdmap(&osdc->client->monc);
964 }
965
966}
967/*
968 * Process updated osd map.
969 *
970 * The message contains any number of incremental and full maps, normally
971 * indicating some sort of topology change in the cluster. Kick requests
972 * off to different OSDs as needed.
973 */
974void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
975{
976 void *p, *end, *next;
977 u32 nr_maps, maplen;
978 u32 epoch;
979 struct ceph_osdmap *newmap = NULL, *oldmap;
980 int err;
981 struct ceph_fsid fsid;
982
983 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
984 p = msg->front.iov_base;
985 end = p + msg->front.iov_len;
986
987 /* verify fsid */
988 ceph_decode_need(&p, end, sizeof(fsid), bad);
989 ceph_decode_copy(&p, &fsid, sizeof(fsid));
990 if (ceph_check_fsid(osdc->client, &fsid) < 0)
991 return;
992
993 down_write(&osdc->map_sem);
994
995 /* incremental maps */
996 ceph_decode_32_safe(&p, end, nr_maps, bad);
997 dout(" %d inc maps\n", nr_maps);
998 while (nr_maps > 0) {
999 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1000 epoch = ceph_decode_32(&p);
1001 maplen = ceph_decode_32(&p);
1002 ceph_decode_need(&p, end, maplen, bad);
1003 next = p + maplen;
1004 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1005 dout("applying incremental map %u len %d\n",
1006 epoch, maplen);
1007 newmap = osdmap_apply_incremental(&p, next,
1008 osdc->osdmap,
1009 osdc->client->msgr);
1010 if (IS_ERR(newmap)) {
1011 err = PTR_ERR(newmap);
1012 goto bad;
1013 }
1014 BUG_ON(!newmap);
1015 if (newmap != osdc->osdmap) {
1016 ceph_osdmap_destroy(osdc->osdmap);
1017 osdc->osdmap = newmap;
1018 }
1019 } else {
1020 dout("ignoring incremental map %u len %d\n",
1021 epoch, maplen);
1022 }
1023 p = next;
1024 nr_maps--;
1025 }
1026 if (newmap)
1027 goto done;
1028
1029 /* full maps */
1030 ceph_decode_32_safe(&p, end, nr_maps, bad);
1031 dout(" %d full maps\n", nr_maps);
1032 while (nr_maps) {
1033 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1034 epoch = ceph_decode_32(&p);
1035 maplen = ceph_decode_32(&p);
1036 ceph_decode_need(&p, end, maplen, bad);
1037 if (nr_maps > 1) {
1038 dout("skipping non-latest full map %u len %d\n",
1039 epoch, maplen);
1040 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1041 dout("skipping full map %u len %d, "
1042 "older than our %u\n", epoch, maplen,
1043 osdc->osdmap->epoch);
1044 } else {
1045 dout("taking full map %u len %d\n", epoch, maplen);
1046 newmap = osdmap_decode(&p, p+maplen);
1047 if (IS_ERR(newmap)) {
1048 err = PTR_ERR(newmap);
1049 goto bad;
1050 }
1051 BUG_ON(!newmap);
1052 oldmap = osdc->osdmap;
1053 osdc->osdmap = newmap;
1054 if (oldmap)
1055 ceph_osdmap_destroy(oldmap);
1056 }
1057 p += maplen;
1058 nr_maps--;
1059 }
1060
1061done:
1062 downgrade_write(&osdc->map_sem);
1063 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1064 if (newmap)
1065 kick_requests(osdc, NULL);
1066 up_read(&osdc->map_sem);
1067 return;
1068
1069bad:
1070 pr_err("osdc handle_map corrupt msg\n");
1071 ceph_msg_dump(msg);
1072 up_write(&osdc->map_sem);
1073 return;
1074}
1075
1076
1077/*
1078 * A read request prepares specific pages that data is to be read into.
1079 * When a message is being read off the wire, we call __prepare_pages to
1080 * find those pages.
1081 * Returns 0 on success, or -1 on failure.
1082 */
1083static int __prepare_pages(struct ceph_connection *con,
1084 struct ceph_msg_header *hdr,
1085 struct ceph_osd_request *req,
1086 u64 tid,
1087 struct ceph_msg *m)
1088{
1089 struct ceph_osd *osd = con->private;
1090 struct ceph_osd_client *osdc;
1091 int ret = -1;
1092 int data_len = le32_to_cpu(hdr->data_len);
1093 unsigned data_off = le16_to_cpu(hdr->data_off);
1094
1095 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1096
1097 if (!osd)
1098 return -1;
1099
1100 osdc = osd->o_osdc;
1101
1102 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1103 tid, req->r_num_pages, want);
1104 if (unlikely(req->r_num_pages < want))
1105 goto out;
1106 m->pages = req->r_pages;
1107 m->nr_pages = req->r_num_pages;
1108 ret = 0; /* success */
1109out:
1110 BUG_ON(ret < 0 || m->nr_pages < want);
1111
1112 return ret;
1113}
1114
1115/*
1116 * Register request, send initial attempt.
1117 */
1118int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1119 struct ceph_osd_request *req,
1120 bool nofail)
1121{
1122 int rc = 0;
1123
1124 req->r_request->pages = req->r_pages;
1125 req->r_request->nr_pages = req->r_num_pages;
1126
1127 register_request(osdc, req);
1128
1129 down_read(&osdc->map_sem);
1130 mutex_lock(&osdc->request_mutex);
1131 /*
1132 * a racing kick_requests() may have sent the message for us
1133 * while we dropped request_mutex above, so only send now if
1134 * the request hasn't been touched yet.
1135 */
1136 if (req->r_sent == 0) {
1137 rc = __send_request(osdc, req);
1138 if (rc) {
1139 if (nofail) {
1140 dout("osdc_start_request failed send, "
1141 " marking %lld\n", req->r_tid);
1142 req->r_resend = true;
1143 rc = 0;
1144 } else {
1145 __unregister_request(osdc, req);
1146 }
1147 }
1148 }
1149 mutex_unlock(&osdc->request_mutex);
1150 up_read(&osdc->map_sem);
1151 return rc;
1152}
1153
1154/*
1155 * wait for a request to complete
1156 */
1157int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1158 struct ceph_osd_request *req)
1159{
1160 int rc;
1161
1162 rc = wait_for_completion_interruptible(&req->r_completion);
1163 if (rc < 0) {
1164 mutex_lock(&osdc->request_mutex);
1165 __cancel_request(req);
1166 __unregister_request(osdc, req);
1167 mutex_unlock(&osdc->request_mutex);
1168 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1169 return rc;
1170 }
1171
1172 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1173 return req->r_result;
1174}
1175
1176/*
1177 * sync - wait for all in-flight write requests to flush; the tid-ordered walk avoids starvation by not waiting on writes submitted after we start.
1178 */
1179void ceph_osdc_sync(struct ceph_osd_client *osdc)
1180{
1181 struct ceph_osd_request *req;
1182 u64 last_tid, next_tid = 0;
1183
1184 mutex_lock(&osdc->request_mutex);
1185 last_tid = osdc->last_tid;
1186 while (1) {
1187 req = __lookup_request_ge(osdc, next_tid);
1188 if (!req)
1189 break;
1190 if (req->r_tid > last_tid)
1191 break;
1192
1193 next_tid = req->r_tid + 1;
1194 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1195 continue;
1196
1197 ceph_osdc_get_request(req);
1198 mutex_unlock(&osdc->request_mutex);
1199 dout("sync waiting on tid %llu (last is %llu)\n",
1200 req->r_tid, last_tid);
1201 wait_for_completion(&req->r_safe_completion);
1202 mutex_lock(&osdc->request_mutex);
1203 ceph_osdc_put_request(req);
1204 }
1205 mutex_unlock(&osdc->request_mutex);
1206 dout("sync done (thru tid %llu)\n", last_tid);
1207}
1208
1209/*
1210 * init, shutdown
1211 */
1212int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1213{
1214 int err;
1215
1216 dout("init\n");
1217 osdc->client = client;
1218 osdc->osdmap = NULL;
1219 init_rwsem(&osdc->map_sem);
1220 init_completion(&osdc->map_waiters);
1221 osdc->last_requested_map = 0;
1222 mutex_init(&osdc->request_mutex);
1223 osdc->last_tid = 0;
1224 osdc->osds = RB_ROOT;
1225 INIT_LIST_HEAD(&osdc->osd_lru);
1226 osdc->requests = RB_ROOT;
1227 INIT_LIST_HEAD(&osdc->req_lru);
1228 osdc->num_requests = 0;
1229 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1230 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1231
1232 schedule_delayed_work(&osdc->osds_timeout_work,
1233 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1234
1235 err = -ENOMEM;
1236 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1237 sizeof(struct ceph_osd_request));
1238 if (!osdc->req_mempool)
1239 goto out;
1240
1241 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
1242 if (err < 0)
1243 goto out_mempool;
1244 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1245 OSD_OPREPLY_FRONT_LEN, 10, true);
1246 if (err < 0)
1247 goto out_msgpool;
1248 return 0;
1249
1250out_msgpool:
1251 ceph_msgpool_destroy(&osdc->msgpool_op);
1252out_mempool:
1253 mempool_destroy(osdc->req_mempool);
1254out:
1255 return err;
1256}
1257
1258void ceph_osdc_stop(struct ceph_osd_client *osdc)
1259{
1260 cancel_delayed_work_sync(&osdc->timeout_work);
1261 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1262 if (osdc->osdmap) {
1263 ceph_osdmap_destroy(osdc->osdmap);
1264 osdc->osdmap = NULL;
1265 }
1266 remove_old_osds(osdc, 1);
1267 mempool_destroy(osdc->req_mempool);
1268 ceph_msgpool_destroy(&osdc->msgpool_op);
1269 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1270}
1271
1272/*
1273 * Read some contiguous pages. If we cross a stripe boundary, shorten
1274 * *plen. Return number of bytes read, or error.
1275 */
1276int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1277 struct ceph_vino vino, struct ceph_file_layout *layout,
1278 u64 off, u64 *plen,
1279 u32 truncate_seq, u64 truncate_size,
1280 struct page **pages, int num_pages)
1281{
1282 struct ceph_osd_request *req;
1283 int rc = 0;
1284
1285 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1286 vino.snap, off, *plen);
1287 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1288 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1289 NULL, 0, truncate_seq, truncate_size, NULL,
1290 false, 1);
1291 if (IS_ERR(req))
1292 return PTR_ERR(req);
1293
1294 /* it may be a short read due to an object boundary */
1295 req->r_pages = pages;
1296 num_pages = calc_pages_for(off, *plen);
1297 req->r_num_pages = num_pages;
1298
1299 dout("readpages final extent is %llu~%llu (%d pages)\n",
1300 off, *plen, req->r_num_pages);
1301
1302 rc = ceph_osdc_start_request(osdc, req, false);
1303 if (!rc)
1304 rc = ceph_osdc_wait_request(osdc, req);
1305
1306 ceph_osdc_put_request(req);
1307 dout("readpages result %d\n", rc);
1308 return rc;
1309}
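
A hedged sketch of a caller reading one page of file data. ceph_vino(), ci->i_layout, and the i_truncate_* fields are assumed from the rest of the ceph tree, not shown in this hunk:

        u64 off = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 len = PAGE_CACHE_SIZE;
        int rc;

        rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
                                 off, &len, ci->i_truncate_seq,
                                 ci->i_truncate_size, &page, 1);
        if (rc < 0)
                return rc;      /* otherwise rc is the number of bytes read */
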
1310
1311/*
1312 * do a synchronous write on N pages
1313 */
1314int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1315 struct ceph_file_layout *layout,
1316 struct ceph_snap_context *snapc,
1317 u64 off, u64 len,
1318 u32 truncate_seq, u64 truncate_size,
1319 struct timespec *mtime,
1320 struct page **pages, int num_pages,
1321 int flags, int do_sync, bool nofail)
1322{
1323 struct ceph_osd_request *req;
1324 int rc = 0;
1325
1326 BUG_ON(vino.snap != CEPH_NOSNAP);
1327 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1328 CEPH_OSD_OP_WRITE,
1329 flags | CEPH_OSD_FLAG_ONDISK |
1330 CEPH_OSD_FLAG_WRITE,
1331 snapc, do_sync,
1332 truncate_seq, truncate_size, mtime,
1333 nofail, 1);
1334 if (IS_ERR(req))
1335 return PTR_ERR(req);
1336
1337 /* it may be a short write due to an object boundary */
1338 req->r_pages = pages;
1339 req->r_num_pages = calc_pages_for(off, len);
1340 dout("writepages %llu~%llu (%d pages)\n", off, len,
1341 req->r_num_pages);
1342
1343 rc = ceph_osdc_start_request(osdc, req, nofail);
1344 if (!rc)
1345 rc = ceph_osdc_wait_request(osdc, req);
1346
1347 ceph_osdc_put_request(req);
1348 if (rc == 0)
1349 rc = len;
1350 dout("writepages result %d\n", rc);
1351 return rc;
1352}
1353
1354/*
1355 * handle incoming message
1356 */
1357static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1358{
1359 struct ceph_osd *osd = con->private;
1360 struct ceph_osd_client *osdc;
1361 int type = le16_to_cpu(msg->hdr.type);
1362
1363 if (!osd)
1364 return;
1365 osdc = osd->o_osdc;
1366
1367 switch (type) {
1368 case CEPH_MSG_OSD_MAP:
1369 ceph_osdc_handle_map(osdc, msg);
1370 break;
1371 case CEPH_MSG_OSD_OPREPLY:
1372 handle_reply(osdc, msg, con);
1373 break;
1374
1375 default:
1376 pr_err("received unknown message type %d %s\n", type,
1377 ceph_msg_type_name(type));
1378 }
1379 ceph_msg_put(msg);
1380}
1381
1382/*
1383 * lookup and return message for incoming reply
1384 */
1385static struct ceph_msg *get_reply(struct ceph_connection *con,
1386 struct ceph_msg_header *hdr,
1387 int *skip)
1388{
1389 struct ceph_osd *osd = con->private;
1390 struct ceph_osd_client *osdc = osd->o_osdc;
1391 struct ceph_msg *m;
1392 struct ceph_osd_request *req;
1393 int front = le32_to_cpu(hdr->front_len);
1394 int data_len = le32_to_cpu(hdr->data_len);
1395 u64 tid;
1396 int err;
1397
1398 tid = le64_to_cpu(hdr->tid);
1399 mutex_lock(&osdc->request_mutex);
1400 req = __lookup_request(osdc, tid);
1401 if (!req) {
1402 *skip = 1;
1403 m = NULL;
1404 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1405 osd->o_osd);
1406 goto out;
1407 }
1408
1409 if (req->r_con_filling_msg) {
1410 dout("get_reply revoking msg %p from old con %p\n",
1411 req->r_reply, req->r_con_filling_msg);
1412 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1413 ceph_con_put(req->r_con_filling_msg);
1414 }
1415
1416 if (front > req->r_reply->front.iov_len) {
1417 pr_warning("get_reply front %d > preallocated %d\n",
1418 front, (int)req->r_reply->front.iov_len);
1419 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
1420 if (IS_ERR(m))
1421 goto out;
1422 ceph_msg_put(req->r_reply);
1423 req->r_reply = m;
1424 }
1425 m = ceph_msg_get(req->r_reply);
1426
1427 if (data_len > 0) {
1428 err = __prepare_pages(con, hdr, req, tid, m);
1429 if (err < 0) {
1430 *skip = 1;
1431 ceph_msg_put(m);
1432 m = ERR_PTR(err); goto out;
1433 }
1434 }
1435 *skip = 0;
1436 req->r_con_filling_msg = ceph_con_get(con);
1437 dout("get_reply tid %lld %p\n", tid, m);
1438
1439out:
1440 mutex_unlock(&osdc->request_mutex);
1441 return m;
1442
1443}
1444
1445static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1446 struct ceph_msg_header *hdr,
1447 int *skip)
1448{
1449 struct ceph_osd *osd = con->private;
1450 int type = le16_to_cpu(hdr->type);
1451 int front = le32_to_cpu(hdr->front_len);
1452
1453 switch (type) {
1454 case CEPH_MSG_OSD_MAP:
1455 return ceph_msg_new(type, front, 0, 0, NULL);
1456 case CEPH_MSG_OSD_OPREPLY:
1457 return get_reply(con, hdr, skip);
1458 default:
1459 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1460 osd->o_osd);
1461 *skip = 1;
1462 return NULL;
1463 }
1464}
1465
1466/*
1467 * Wrappers to refcount containing ceph_osd struct
1468 */
1469static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1470{
1471 struct ceph_osd *osd = con->private;
1472 if (get_osd(osd))
1473 return con;
1474 return NULL;
1475}
1476
1477static void put_osd_con(struct ceph_connection *con)
1478{
1479 struct ceph_osd *osd = con->private;
1480 put_osd(osd);
1481}
1482
1483/*
1484 * authentication
1485 */
1486static int get_authorizer(struct ceph_connection *con,
1487 void **buf, int *len, int *proto,
1488 void **reply_buf, int *reply_len, int force_new)
1489{
1490 struct ceph_osd *o = con->private;
1491 struct ceph_osd_client *osdc = o->o_osdc;
1492 struct ceph_auth_client *ac = osdc->client->monc.auth;
1493 int ret = 0;
1494
1495 if (force_new && o->o_authorizer) {
1496 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1497 o->o_authorizer = NULL;
1498 }
1499 if (o->o_authorizer == NULL) {
1500 ret = ac->ops->create_authorizer(
1501 ac, CEPH_ENTITY_TYPE_OSD,
1502 &o->o_authorizer,
1503 &o->o_authorizer_buf,
1504 &o->o_authorizer_buf_len,
1505 &o->o_authorizer_reply_buf,
1506 &o->o_authorizer_reply_buf_len);
1507 if (ret)
1508 return ret;
1509 }
1510
1511 *proto = ac->protocol;
1512 *buf = o->o_authorizer_buf;
1513 *len = o->o_authorizer_buf_len;
1514 *reply_buf = o->o_authorizer_reply_buf;
1515 *reply_len = o->o_authorizer_reply_buf_len;
1516 return 0;
1517}
1518
1519
1520static int verify_authorizer_reply(struct ceph_connection *con, int len)
1521{
1522 struct ceph_osd *o = con->private;
1523 struct ceph_osd_client *osdc = o->o_osdc;
1524 struct ceph_auth_client *ac = osdc->client->monc.auth;
1525
1526 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1527}
1528
1529static int invalidate_authorizer(struct ceph_connection *con)
1530{
1531 struct ceph_osd *o = con->private;
1532 struct ceph_osd_client *osdc = o->o_osdc;
1533 struct ceph_auth_client *ac = osdc->client->monc.auth;
1534
1535 if (ac->ops->invalidate_authorizer)
1536 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1537
1538 return ceph_monc_validate_auth(&osdc->client->monc);
1539}
1540
1541static const struct ceph_connection_operations osd_con_ops = {
1542 .get = get_osd_con,
1543 .put = put_osd_con,
1544 .dispatch = dispatch,
1545 .get_authorizer = get_authorizer,
1546 .verify_authorizer_reply = verify_authorizer_reply,
1547 .invalidate_authorizer = invalidate_authorizer,
1548 .alloc_msg = alloc_msg,
1549 .fault = osd_reset,
1550};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..b0759911e7c3
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,166 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51
52 struct ceph_connection *r_con_filling_msg;
53
54 struct ceph_msg *r_request, *r_reply;
55 int r_result;
56 int r_flags; /* any additional flags for the osd */
57 u32 r_sent; /* >0 if r_request is sending/sent */
58 int r_got_reply;
59
60 struct ceph_osd_client *r_osdc;
61 struct kref r_kref;
62 bool r_mempool;
63 struct completion r_completion, r_safe_completion;
64 ceph_osdc_callback_t r_callback, r_safe_callback;
65 struct ceph_eversion r_reassert_version;
66 struct list_head r_unsafe_item;
67
68 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70
71 char r_oid[40]; /* object name */
72 int r_oid_len;
73 unsigned long r_stamp; /* send OR check time */
74 bool r_resend; /* msg send failed, needs retry */
75
76 struct ceph_file_layout r_file_layout;
77 struct ceph_snap_context *r_snapc; /* snap context for writes */
78 unsigned r_num_pages; /* size of page array (follows) */
79 struct page **r_pages; /* pages for data payload */
80 int r_pages_from_pool;
81	int r_own_pages;              /* if true, I own page list */
82};
83
84struct ceph_osd_client {
85 struct ceph_client *client;
86
87 struct ceph_osdmap *osdmap; /* current map */
88 struct rw_semaphore map_sem;
89 struct completion map_waiters;
90 u64 last_requested_map;
91
92 struct mutex request_mutex;
93 struct rb_root osds; /* osds */
94 struct list_head osd_lru; /* idle osds */
95 u64 timeout_tid; /* tid of timeout triggering rq */
96 u64 last_tid; /* tid of last request */
97 struct rb_root requests; /* pending requests */
98 struct list_head req_lru; /* pending requests lru */
99 int num_requests;
100 struct delayed_work timeout_work;
101 struct delayed_work osds_timeout_work;
102#ifdef CONFIG_DEBUG_FS
103 struct dentry *debugfs_file;
104#endif
105
106 mempool_t *req_mempool;
107
108 struct ceph_msgpool msgpool_op;
109 struct ceph_msgpool msgpool_op_reply;
110};
111
112extern int ceph_osdc_init(struct ceph_osd_client *osdc,
113 struct ceph_client *client);
114extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
115
116extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
117 struct ceph_msg *msg);
118extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
119 struct ceph_msg *msg);
120
121extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
122 struct ceph_file_layout *layout,
123 struct ceph_vino vino,
124 u64 offset, u64 *len, int op, int flags,
125 struct ceph_snap_context *snapc,
126 int do_sync, u32 truncate_seq,
127 u64 truncate_size,
128 struct timespec *mtime,
129 bool use_mempool, int num_reply);
130
131static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
132{
133 kref_get(&req->r_kref);
134}
135extern void ceph_osdc_release_request(struct kref *kref);
136static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
137{
138 kref_put(&req->r_kref, ceph_osdc_release_request);
139}
140
141extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
142 struct ceph_osd_request *req,
143 bool nofail);
144extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
145 struct ceph_osd_request *req);
146extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
147
148extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
149 struct ceph_vino vino,
150 struct ceph_file_layout *layout,
151 u64 off, u64 *plen,
152 u32 truncate_seq, u64 truncate_size,
153 struct page **pages, int nr_pages);
154
155extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
156 struct ceph_vino vino,
157 struct ceph_file_layout *layout,
158 struct ceph_snap_context *sc,
159 u64 off, u64 len,
160 u32 truncate_seq, u64 truncate_size,
161 struct timespec *mtime,
162 struct page **pages, int nr_pages,
163 int flags, int do_sync, bool nofail);
164
165#endif
166
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..2e2c15eed82a
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1062 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
26 if (state & CEPH_OSD_UP) {
27			snprintf(str + strlen(str), len - strlen(str),
28				 "%s%s", (flag ? ", " : ""), "up");
29 flag = 1;
30 }
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
37
38/* maps */
39
40static int calc_bits_of(unsigned t)
41{
42 int b = 0;
43 while (t) {
44 t = t >> 1;
45 b++;
46 }
47 return b;
48}
49
50/*
51 * the foo_mask is 2^n-1, where 2^n is the smallest power of two >= foo
52 * (so when foo is itself a power of two, foo_mask == foo-1).
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
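
This mask arithmetic is easy to sanity-check outside the kernel; a minimal userspace sketch of the same calculation (sample pg_num values invented for illustration):

	#include <stdio.h>

	/* userspace copy of the calc_bits_of()/calc_pg_masks() arithmetic */
	static int calc_bits_of(unsigned t)
	{
		int b = 0;

		while (t) {
			t >>= 1;
			b++;
		}
		return b;
	}

	int main(void)
	{
		unsigned pg_num[] = { 12, 16, 123 };	/* sample values */
		unsigned i;

		for (i = 0; i < sizeof(pg_num) / sizeof(pg_num[0]); i++)
			printf("pg_num %3u -> mask %3u\n", pg_num[i],
			       (1u << calc_bits_of(pg_num[i] - 1)) - 1);
		/* prints: 12 -> 15, 16 -> 15, 123 -> 127 */
		return 0;
	}
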
63
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
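
The decode pattern above (ceph_decode_need() to bounds-check, then ceph_decode_32() and friends to read and advance *p, jumping to a "bad" label on short input) comes from decode.h. A minimal userspace sketch of the idiom, with the helpers inlined and a little-endian host assumed for brevity:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	/* bounds-check, read, and advance *p, like ceph_decode_32_safe() */
	static int decode_u32(void **p, void *end, uint32_t *v)
	{
		if ((char *)end - (char *)*p < 4)
			return -1;		/* the "goto bad" case */
		memcpy(v, *p, 4);		/* wire format is little-endian;
						 * assume an LE host here */
		*p = (char *)*p + 4;
		return 0;
	}

	int main(void)
	{
		unsigned char buf[8] = { 1, 0, 0, 0, 2, 0, 0, 0 };
		void *p = buf, *end = buf + sizeof(buf);
		uint32_t a, b;

		if (!decode_u32(&p, end, &a) && !decode_u32(&p, end, &b))
			printf("a=%u b=%u\n", a, b);	/* a=1 b=2 */
		return 0;
	}
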
316
317/*
318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
319 * to a set of osds)
320 */
321static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
322{
323 u64 a = *(u64 *)&l;
324 u64 b = *(u64 *)&r;
325
326 if (a < b)
327 return -1;
328 if (a > b)
329 return 1;
330 return 0;
331}
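
The raw-u64 comparison works because struct ceph_pg (defined in rados.h, later in this patch set) is packed into exactly eight bytes, so its bit pattern gives an arbitrary but consistent sort key, which is all the rbtree needs. A userspace sketch of that assumption:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	/* stand-in for struct ceph_pg: packed to exactly 8 bytes */
	struct pg {
		uint16_t preferred;
		uint16_t ps;
		uint32_t pool;
	} __attribute__((packed));

	int main(void)
	{
		struct pg a = { 0, 1, 2 }, b = { 0, 1, 3 };
		uint64_t ua, ub;

		memcpy(&ua, &a, sizeof(ua));	/* same trick as pgid_cmp() */
		memcpy(&ub, &b, sizeof(ub));
		printf("sizeof(struct pg) = %zu, a %s b\n", sizeof(struct pg),
		       ua < ub ? "<" : (ua > ub ? ">" : "=="));
		/* whether a < b or a > b depends on host endianness;
		 * only consistency matters for the tree ordering */
		return 0;
	}
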
332
333static int __insert_pg_mapping(struct ceph_pg_mapping *new,
334 struct rb_root *root)
335{
336 struct rb_node **p = &root->rb_node;
337 struct rb_node *parent = NULL;
338 struct ceph_pg_mapping *pg = NULL;
339 int c;
340
341 while (*p) {
342 parent = *p;
343 pg = rb_entry(parent, struct ceph_pg_mapping, node);
344 c = pgid_cmp(new->pgid, pg->pgid);
345 if (c < 0)
346 p = &(*p)->rb_left;
347 else if (c > 0)
348 p = &(*p)->rb_right;
349 else
350 return -EEXIST;
351 }
352
353 rb_link_node(&new->node, parent, p);
354 rb_insert_color(&new->node, root);
355 return 0;
356}
357
358static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
359 struct ceph_pg pgid)
360{
361 struct rb_node *n = root->rb_node;
362 struct ceph_pg_mapping *pg;
363 int c;
364
365 while (n) {
366 pg = rb_entry(n, struct ceph_pg_mapping, node);
367 c = pgid_cmp(pgid, pg->pgid);
368 if (c < 0)
369 n = n->rb_left;
370 else if (c > 0)
371 n = n->rb_right;
372 else
373 return pg;
374 }
375 return NULL;
376}
377
378/*
379 * rbtree of pg pool info
380 */
381static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
382{
383 struct rb_node **p = &root->rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_pg_pool_info *pi = NULL;
386
387 while (*p) {
388 parent = *p;
389 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
390 if (new->id < pi->id)
391 p = &(*p)->rb_left;
392 else if (new->id > pi->id)
393 p = &(*p)->rb_right;
394 else
395 return -EEXIST;
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, root);
400 return 0;
401}
402
403static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
404{
405 struct ceph_pg_pool_info *pi;
406 struct rb_node *n = root->rb_node;
407
408 while (n) {
409 pi = rb_entry(n, struct ceph_pg_pool_info, node);
410 if (id < pi->id)
411 n = n->rb_left;
412 else if (id > pi->id)
413 n = n->rb_right;
414 else
415 return pi;
416 }
417 return NULL;
418}
419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
427static void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
428{
429 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
430 calc_pg_masks(pi);
431 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
433}
434
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
436{
437 struct ceph_pg_pool_info *pi;
438 u32 num, len, pool;
439
440 ceph_decode_32_safe(p, end, num, bad);
441 dout(" %d pool names\n", num);
442 while (num--) {
443 ceph_decode_32_safe(p, end, pool, bad);
444 ceph_decode_32_safe(p, end, len, bad);
445 dout(" pool %d len %d\n", pool, len);
446 pi = __lookup_pg_pool(&map->pg_pools, pool);
447 if (pi) {
448 kfree(pi->name);
449 pi->name = kmalloc(len + 1, GFP_NOFS);
450 if (pi->name) {
451 memcpy(pi->name, *p, len);
452 pi->name[len] = '\0';
453 dout(" name is %s\n", pi->name);
454 }
455 }
456 *p += len;
457 }
458 return 0;
459
460bad:
461 return -EINVAL;
462}
463
464/*
465 * osd map
466 */
467void ceph_osdmap_destroy(struct ceph_osdmap *map)
468{
469 dout("osdmap_destroy %p\n", map);
470 if (map->crush)
471 crush_destroy(map->crush);
472 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
473 struct ceph_pg_mapping *pg =
474 rb_entry(rb_first(&map->pg_temp),
475 struct ceph_pg_mapping, node);
476 rb_erase(&pg->node, &map->pg_temp);
477 kfree(pg);
478 }
479 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
480 struct ceph_pg_pool_info *pi =
481 rb_entry(rb_first(&map->pg_pools),
482 struct ceph_pg_pool_info, node);
483 __remove_pg_pool(&map->pg_pools, pi);
484 }
485 kfree(map->osd_state);
486 kfree(map->osd_weight);
487 kfree(map->osd_addr);
488 kfree(map);
489}
490
491/*
492 * adjust max osd value. reallocate arrays.
493 */
494static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
495{
496 u8 *state;
497 struct ceph_entity_addr *addr;
498 u32 *weight;
499
500 state = kcalloc(max, sizeof(*state), GFP_NOFS);
501 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
502 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
503 if (state == NULL || addr == NULL || weight == NULL) {
504 kfree(state);
505 kfree(addr);
506 kfree(weight);
507 return -ENOMEM;
508 }
509
510 /* copy old? */
511 if (map->osd_state) {
512 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
513 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
514 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
515 kfree(map->osd_state);
516 kfree(map->osd_addr);
517 kfree(map->osd_weight);
518 }
519
520 map->osd_state = state;
521 map->osd_weight = weight;
522 map->osd_addr = addr;
523 map->max_osd = max;
524 return 0;
525}
526
527/*
528 * decode a full map.
529 */
530struct ceph_osdmap *osdmap_decode(void **p, void *end)
531{
532 struct ceph_osdmap *map;
533 u16 version;
534 u32 len, max, i;
535 u8 ev;
536 int err = -EINVAL;
537 void *start = *p;
538 struct ceph_pg_pool_info *pi;
539
540 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
541
542 map = kzalloc(sizeof(*map), GFP_NOFS);
543 if (map == NULL)
544 return ERR_PTR(-ENOMEM);
545 map->pg_temp = RB_ROOT;
546
547 ceph_decode_16_safe(p, end, version, bad);
548 if (version > CEPH_OSDMAP_VERSION) {
549 pr_warning("got unknown v %d > %d of osdmap\n", version,
550 CEPH_OSDMAP_VERSION);
551 goto bad;
552 }
553
554 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
555 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
556 map->epoch = ceph_decode_32(p);
557 ceph_decode_copy(p, &map->created, sizeof(map->created));
558 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
559
560 ceph_decode_32_safe(p, end, max, bad);
561 while (max--) {
562 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
563 pi = kzalloc(sizeof(*pi), GFP_NOFS);
564 if (!pi)
565 goto bad;
566 pi->id = ceph_decode_32(p);
567 ev = ceph_decode_8(p); /* encoding version */
568 if (ev > CEPH_PG_POOL_VERSION) {
569 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
570 ev, CEPH_PG_POOL_VERSION);
571 goto bad;
572 }
573 __decode_pool(p, pi);
574 __insert_pg_pool(&map->pg_pools, pi);
575 }
576
577 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
578 goto bad;
579
580 ceph_decode_32_safe(p, end, map->pool_max, bad);
581
582 ceph_decode_32_safe(p, end, map->flags, bad);
583
584	ceph_decode_32_safe(p, end, max, bad);
585
586 /* (re)alloc osd arrays */
587 err = osdmap_set_max_osd(map, max);
588 if (err < 0)
589 goto bad;
590 dout("osdmap_decode max_osd = %d\n", map->max_osd);
591
592 /* osds */
593 err = -EINVAL;
594 ceph_decode_need(p, end, 3*sizeof(u32) +
595 map->max_osd*(1 + sizeof(*map->osd_weight) +
596 sizeof(*map->osd_addr)), bad);
597 *p += 4; /* skip length field (should match max) */
598 ceph_decode_copy(p, map->osd_state, map->max_osd);
599
600 *p += 4; /* skip length field (should match max) */
601 for (i = 0; i < map->max_osd; i++)
602 map->osd_weight[i] = ceph_decode_32(p);
603
604 *p += 4; /* skip length field (should match max) */
605 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
606 for (i = 0; i < map->max_osd; i++)
607 ceph_decode_addr(&map->osd_addr[i]);
608
609 /* pg_temp */
610 ceph_decode_32_safe(p, end, len, bad);
611 for (i = 0; i < len; i++) {
612 int n, j;
613 struct ceph_pg pgid;
614 struct ceph_pg_mapping *pg;
615
616 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
617 ceph_decode_copy(p, &pgid, sizeof(pgid));
618 n = ceph_decode_32(p);
619 ceph_decode_need(p, end, n * sizeof(u32), bad);
620 err = -ENOMEM;
621 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
622 if (!pg)
623 goto bad;
624 pg->pgid = pgid;
625 pg->len = n;
626 for (j = 0; j < n; j++)
627 pg->osds[j] = ceph_decode_32(p);
628
629 err = __insert_pg_mapping(pg, &map->pg_temp);
630 if (err)
631 goto bad;
632		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, n);
633 }
634
635 /* crush */
636 ceph_decode_32_safe(p, end, len, bad);
637 dout("osdmap_decode crush len %d from off 0x%x\n", len,
638 (int)(*p - start));
639 ceph_decode_need(p, end, len, bad);
640 map->crush = crush_decode(*p, end);
641 *p += len;
642 if (IS_ERR(map->crush)) {
643 err = PTR_ERR(map->crush);
644 map->crush = NULL;
645 goto bad;
646 }
647
648 /* ignore the rest of the map */
649 *p = end;
650
651 dout("osdmap_decode done %p %p\n", *p, end);
652 return map;
653
654bad:
655 dout("osdmap_decode fail\n");
656 ceph_osdmap_destroy(map);
657 return ERR_PTR(err);
658}
659
660/*
661 * decode and apply an incremental map update.
662 */
663struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
664 struct ceph_osdmap *map,
665 struct ceph_messenger *msgr)
666{
667 struct crush_map *newcrush = NULL;
668 struct ceph_fsid fsid;
669 u32 epoch = 0;
670 struct ceph_timespec modified;
671 u32 len, pool;
672 __s32 new_pool_max, new_flags, max;
673 void *start = *p;
674 int err = -EINVAL;
675 u16 version;
676 struct rb_node *rbp;
677
678 ceph_decode_16_safe(p, end, version, bad);
679 if (version > CEPH_OSDMAP_INC_VERSION) {
680 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
681 CEPH_OSDMAP_INC_VERSION);
682 goto bad;
683 }
684
685 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
686 bad);
687 ceph_decode_copy(p, &fsid, sizeof(fsid));
688 epoch = ceph_decode_32(p);
689 BUG_ON(epoch != map->epoch+1);
690 ceph_decode_copy(p, &modified, sizeof(modified));
691 new_pool_max = ceph_decode_32(p);
692 new_flags = ceph_decode_32(p);
693
694 /* full map? */
695 ceph_decode_32_safe(p, end, len, bad);
696 if (len > 0) {
697 dout("apply_incremental full map len %d, %p to %p\n",
698 len, *p, end);
699 return osdmap_decode(p, min(*p+len, end));
700 }
701
702 /* new crush? */
703 ceph_decode_32_safe(p, end, len, bad);
704 if (len > 0) {
705 dout("apply_incremental new crush map len %d, %p to %p\n",
706 len, *p, end);
707 newcrush = crush_decode(*p, min(*p+len, end));
708 if (IS_ERR(newcrush))
709 return ERR_PTR(PTR_ERR(newcrush));
710 }
711
712 /* new flags? */
713 if (new_flags >= 0)
714 map->flags = new_flags;
715 if (new_pool_max >= 0)
716 map->pool_max = new_pool_max;
717
718 ceph_decode_need(p, end, 5*sizeof(u32), bad);
719
720 /* new max? */
721 max = ceph_decode_32(p);
722 if (max >= 0) {
723 err = osdmap_set_max_osd(map, max);
724 if (err < 0)
725 goto bad;
726 }
727
728 map->epoch++;
729	map->modified = modified;
730 if (newcrush) {
731 if (map->crush)
732 crush_destroy(map->crush);
733 map->crush = newcrush;
734 newcrush = NULL;
735 }
736
737 /* new_pool */
738 ceph_decode_32_safe(p, end, len, bad);
739 while (len--) {
740 __u8 ev;
741 struct ceph_pg_pool_info *pi;
742
743 ceph_decode_32_safe(p, end, pool, bad);
744 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
745 ev = ceph_decode_8(p); /* encoding version */
746 if (ev > CEPH_PG_POOL_VERSION) {
747 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
748 ev, CEPH_PG_POOL_VERSION);
749 goto bad;
750 }
751 pi = __lookup_pg_pool(&map->pg_pools, pool);
752 if (!pi) {
753 pi = kzalloc(sizeof(*pi), GFP_NOFS);
754 if (!pi) {
755 err = -ENOMEM;
756 goto bad;
757 }
758 pi->id = pool;
759 __insert_pg_pool(&map->pg_pools, pi);
760 }
761 __decode_pool(p, pi);
762 }
763 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
764 goto bad;
765
766 /* old_pool */
767 ceph_decode_32_safe(p, end, len, bad);
768 while (len--) {
769 struct ceph_pg_pool_info *pi;
770
771 ceph_decode_32_safe(p, end, pool, bad);
772 pi = __lookup_pg_pool(&map->pg_pools, pool);
773 if (pi)
774 __remove_pg_pool(&map->pg_pools, pi);
775 }
776
777 /* new_up */
778 err = -EINVAL;
779 ceph_decode_32_safe(p, end, len, bad);
780 while (len--) {
781 u32 osd;
782 struct ceph_entity_addr addr;
783 ceph_decode_32_safe(p, end, osd, bad);
784 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
785 ceph_decode_addr(&addr);
786 pr_info("osd%d up\n", osd);
787 BUG_ON(osd >= map->max_osd);
788 map->osd_state[osd] |= CEPH_OSD_UP;
789 map->osd_addr[osd] = addr;
790 }
791
792 /* new_down */
793 ceph_decode_32_safe(p, end, len, bad);
794 while (len--) {
795 u32 osd;
796 ceph_decode_32_safe(p, end, osd, bad);
797 (*p)++; /* clean flag */
798 pr_info("osd%d down\n", osd);
799 if (osd < map->max_osd)
800 map->osd_state[osd] &= ~CEPH_OSD_UP;
801 }
802
803 /* new_weight */
804 ceph_decode_32_safe(p, end, len, bad);
805 while (len--) {
806 u32 osd, off;
807 ceph_decode_need(p, end, sizeof(u32)*2, bad);
808 osd = ceph_decode_32(p);
809 off = ceph_decode_32(p);
810 pr_info("osd%d weight 0x%x %s\n", osd, off,
811 off == CEPH_OSD_IN ? "(in)" :
812 (off == CEPH_OSD_OUT ? "(out)" : ""));
813 if (osd < map->max_osd)
814 map->osd_weight[osd] = off;
815 }
816
817 /* new_pg_temp */
818 rbp = rb_first(&map->pg_temp);
819 ceph_decode_32_safe(p, end, len, bad);
820 while (len--) {
821 struct ceph_pg_mapping *pg;
822 int j;
823 struct ceph_pg pgid;
824 u32 pglen;
825 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
826 ceph_decode_copy(p, &pgid, sizeof(pgid));
827 pglen = ceph_decode_32(p);
828
829 /* remove any? */
830 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
831 node)->pgid, pgid) <= 0) {
832 struct rb_node *cur = rbp;
833 rbp = rb_next(rbp);
834 dout(" removed pg_temp %llx\n",
835 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
836 node)->pgid);
837 rb_erase(cur, &map->pg_temp);
838 }
839
840 if (pglen) {
841 /* insert */
842 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
843 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
844 if (!pg) {
845 err = -ENOMEM;
846 goto bad;
847 }
848 pg->pgid = pgid;
849 pg->len = pglen;
850 for (j = 0; j < pglen; j++)
851 pg->osds[j] = ceph_decode_32(p);
852 err = __insert_pg_mapping(pg, &map->pg_temp);
853 if (err)
854 goto bad;
855 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
856 pglen);
857 }
858 }
859 while (rbp) {
860 struct rb_node *cur = rbp;
861 rbp = rb_next(rbp);
862 dout(" removed pg_temp %llx\n",
863 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
864 node)->pgid);
865 rb_erase(cur, &map->pg_temp);
866 }
867
868 /* ignore the rest */
869 *p = end;
870 return map;
871
872bad:
873 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
874 epoch, (int)(*p - start), *p, start, end);
875 print_hex_dump(KERN_DEBUG, "osdmap: ",
876 DUMP_PREFIX_OFFSET, 16, 1,
877 start, end - start, true);
878 if (newcrush)
879 crush_destroy(newcrush);
880 return ERR_PTR(err);
881}
882
883
884
885
886/*
887 * calculate file layout from given offset, length.
888 * fill in correct oid, logical length, and object extent
889 * offset, length.
890 *
891 * for now, we write only a single su, until we can
892 * pass a stride back to the caller.
893 */
894void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
895 u64 off, u64 *plen,
896 u64 *ono,
897 u64 *oxoff, u64 *oxlen)
898{
899 u32 osize = le32_to_cpu(layout->fl_object_size);
900 u32 su = le32_to_cpu(layout->fl_stripe_unit);
901 u32 sc = le32_to_cpu(layout->fl_stripe_count);
902 u32 bl, stripeno, stripepos, objsetno;
903 u32 su_per_object;
904 u64 t, su_offset;
905
906 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
907 osize, su);
908 su_per_object = osize / su;
909 dout("osize %u / su %u = su_per_object %u\n", osize, su,
910 su_per_object);
911
912 BUG_ON((su & ~PAGE_MASK) != 0);
913 /* bl = *off / su; */
914 t = off;
915 do_div(t, su);
916 bl = t;
917 dout("off %llu / su %u = bl %u\n", off, su, bl);
918
919 stripeno = bl / sc;
920 stripepos = bl % sc;
921 objsetno = stripeno / su_per_object;
922
923 *ono = objsetno * sc + stripepos;
924 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
925
926 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
927 t = off;
928 su_offset = do_div(t, su);
929 *oxoff = su_offset + (stripeno % su_per_object) * su;
930
931 /*
932 * Calculate the length of the extent being written to the selected
933 * object. This is the minimum of the full length requested (plen) or
934 * the remainder of the current stripe being written to.
935 */
936 *oxlen = min_t(u64, *plen, su - su_offset);
937 *plen = *oxlen;
938
939 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
940}
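
To make the striping arithmetic concrete, here is a userspace transcription of the mapping above, with do_div() replaced by plain 64-bit division (the layout values, 64 KB stripe unit, 4 stripes, 256 KB objects, are invented for illustration):

	#include <stdio.h>
	#include <stdint.h>
	#include <inttypes.h>

	int main(void)
	{
		/* invented layout: 64 KB stripe unit, 4 stripes, 256 KB objects */
		uint32_t su = 65536, sc = 4, osize = 262144;
		uint64_t off = 1000000, plen = 200000;

		uint32_t su_per_object = osize / su;
		uint64_t bl = off / su;			/* which stripe unit */
		uint32_t stripeno = bl / sc;
		uint32_t stripepos = bl % sc;
		uint32_t objsetno = stripeno / su_per_object;
		uint64_t ono = (uint64_t)objsetno * sc + stripepos;
		uint64_t su_offset = off % su;		/* offset within the su */
		uint64_t oxoff = su_offset +
			(uint64_t)(stripeno % su_per_object) * su;
		uint64_t left = su - su_offset;
		uint64_t oxlen = plen < left ? plen : left;

		printf("off %" PRIu64 " -> object %" PRIu64
		       ", extent %" PRIu64 "~%" PRIu64 "\n",
		       off, ono, oxoff, oxlen);
		return 0;
	}

With these values, byte offset 1000000 lands in object 3 at object offset 213568, and the extent is clipped to the 48576 bytes remaining in that stripe unit, matching the single-su-at-a-time behavior noted in the comment above.
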
941
942/*
943 * calculate an object layout (i.e. pgid) from an oid,
944 * file_layout, and osdmap
945 */
946int ceph_calc_object_layout(struct ceph_object_layout *ol,
947 const char *oid,
948 struct ceph_file_layout *fl,
949 struct ceph_osdmap *osdmap)
950{
951 unsigned num, num_mask;
952 struct ceph_pg pgid;
953 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
954 int poolid = le32_to_cpu(fl->fl_pg_pool);
955 struct ceph_pg_pool_info *pool;
956 unsigned ps;
957
958 BUG_ON(!osdmap);
959
960 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
961 if (!pool)
962 return -EIO;
963 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
964 if (preferred >= 0) {
965 ps += preferred;
966 num = le32_to_cpu(pool->v.lpg_num);
967 num_mask = pool->lpg_num_mask;
968 } else {
969 num = le32_to_cpu(pool->v.pg_num);
970 num_mask = pool->pg_num_mask;
971 }
972
973 pgid.ps = cpu_to_le16(ps);
974 pgid.preferred = cpu_to_le16(preferred);
975 pgid.pool = fl->fl_pg_pool;
976 if (preferred >= 0)
977 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
978 (int)preferred);
979 else
980 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
981
982 ol->ol_pgid = pgid;
983 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
984 return 0;
985}
986
987/*
988 * Calculate raw osd vector for the given pgid. Return pointer to osd
989 * array, or NULL on failure.
990 */
991static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
992 int *osds, int *num)
993{
994 struct ceph_pg_mapping *pg;
995 struct ceph_pg_pool_info *pool;
996 int ruleno;
997 unsigned poolid, ps, pps;
998 int preferred;
999
1000 /* pg_temp? */
1001 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1002 if (pg) {
1003 *num = pg->len;
1004 return pg->osds;
1005 }
1006
1007 /* crush */
1008 poolid = le32_to_cpu(pgid.pool);
1009 ps = le16_to_cpu(pgid.ps);
1010 preferred = (s16)le16_to_cpu(pgid.preferred);
1011
1012 /* don't forcefeed bad device ids to crush */
1013 if (preferred >= osdmap->max_osd ||
1014 preferred >= osdmap->crush->max_devices)
1015 preferred = -1;
1016
1017 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1018 if (!pool)
1019 return NULL;
1020 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1021 pool->v.type, pool->v.size);
1022 if (ruleno < 0) {
1023 pr_err("no crush rule pool %d type %d size %d\n",
1024 poolid, pool->v.type, pool->v.size);
1025 return NULL;
1026 }
1027
1028 if (preferred >= 0)
1029 pps = ceph_stable_mod(ps,
1030 le32_to_cpu(pool->v.lpgp_num),
1031 pool->lpgp_num_mask);
1032 else
1033 pps = ceph_stable_mod(ps,
1034 le32_to_cpu(pool->v.pgp_num),
1035 pool->pgp_num_mask);
1036 pps += poolid;
1037 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1038 min_t(int, pool->v.size, *num),
1039 preferred, osdmap->osd_weight);
1040 return osds;
1041}
1042
1043/*
1044 * Return primary osd for given pgid, or -1 if none.
1045 */
1046int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1047{
1048 int rawosds[10], *osds;
1049 int i, num = ARRAY_SIZE(rawosds);
1050
1051 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1052 if (!osds)
1053 return -1;
1054
1055 /* primary is first up osd */
1056	for (i = 0; i < num; i++)
1057		if (ceph_osd_is_up(osdmap, osds[i]))
1058			return osds[i];
1061 return -1;
1062}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..8bc9f1e4f562
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,126 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
27};
28
29struct ceph_pg_mapping {
30 struct rb_node node;
31 struct ceph_pg pgid;
32 int len;
33 int osds[];
34};
35
36struct ceph_osdmap {
37 struct ceph_fsid fsid;
38 u32 epoch;
39 u32 mkfs_epoch;
40 struct ceph_timespec created, modified;
41
42 u32 flags; /* CEPH_OSDMAP_* */
43
44 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
45 u8 *osd_state; /* CEPH_OSD_* */
46 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
47 struct ceph_entity_addr *osd_addr;
48
49 struct rb_root pg_temp;
50 struct rb_root pg_pools;
51 u32 pool_max;
52
53 /* the CRUSH map specifies the mapping of placement groups to
54 * the list of osds that store+replicate them. */
55 struct crush_map *crush;
56};
57
58/*
59 * file layout helpers
60 */
61#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
62#define ceph_file_layout_stripe_count(l) \
63 ((__s32)le32_to_cpu((l).fl_stripe_count))
64#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
65#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
66#define ceph_file_layout_object_su(l) \
67 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
68#define ceph_file_layout_pg_preferred(l) \
69 ((__s32)le32_to_cpu((l).fl_pg_preferred))
70#define ceph_file_layout_pg_pool(l) \
71 ((__s32)le32_to_cpu((l).fl_pg_pool))
72
73static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
74{
75 return le32_to_cpu(l->fl_stripe_unit) *
76 le32_to_cpu(l->fl_stripe_count);
77}
78
79/* "period" == bytes before I start on a new set of objects */
80static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
81{
82 return le32_to_cpu(l->fl_object_size) *
83 le32_to_cpu(l->fl_stripe_count);
84}
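
For the same invented layout as the striping example earlier (64 KB stripe unit, 4 stripes, 256 KB objects), stripe width and period work out as follows; a one-liner to check:

	#include <stdio.h>

	int main(void)
	{
		/* invented layout: 64 KB su, 4 stripes, 256 KB objects */
		unsigned su = 65536, sc = 4, osize = 262144;

		printf("stripe width %u bytes, period %u bytes\n",
		       su * sc, osize * sc);	/* 262144 and 1048576 */
		return 0;
	}
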
85
86
87static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
88{
89 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
90}
91
92static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
93{
94 return map && (map->flags & flag);
95}
96
97extern char *ceph_osdmap_state_str(char *str, int len, int state);
98
99static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
100 int osd)
101{
102 if (osd >= map->max_osd)
103 return NULL;
104 return &map->osd_addr[osd];
105}
106
107extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
108extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
109 struct ceph_osdmap *map,
110 struct ceph_messenger *msgr);
111extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
112
113/* calculate mapping of a file extent to an object */
114extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
115 u64 off, u64 *plen,
116				  u64 *ono, u64 *oxoff, u64 *oxlen);
117
118/* calculate mapping of object to a placement group */
119extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
120 const char *oid,
121 struct ceph_file_layout *fl,
122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
124 struct ceph_pg pgid);
125
126#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..5f8dbf7c745a
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,55 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8int ceph_pagelist_release(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail);
12 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page,
14 lru);
15 list_del(&page->lru);
16 __free_page(page);
17 }
18 return 0;
19}
20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{
23 struct page *page = alloc_page(GFP_NOFS);
24 if (!page)
25 return -ENOMEM;
26 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail);
30 pl->mapped_tail = kmap(page);
31 return 0;
32}
33
34int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
35{
36 while (pl->room < len) {
37 size_t bit = pl->room;
38 int ret;
39
40 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
41 buf, bit);
42 pl->length += bit;
43 pl->room -= bit;
44 buf += bit;
45 len -= bit;
46 ret = ceph_pagelist_addpage(pl);
47 if (ret)
48 return ret;
49 }
50
51 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
52 pl->length += len;
53 pl->room -= len;
54 return 0;
55}
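
The encode helpers in pagelist.h (below) append fixed-width little-endian values via ceph_pagelist_append(). A userspace sketch of the bytes ceph_pagelist_encode_string() would emit for the string "ceph" (illustrative only, not the kernel code itself):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	int main(void)
	{
		const char *s = "ceph";
		uint32_t len = (uint32_t)strlen(s);
		unsigned char buf[32];
		size_t i, n = 0;

		/* __le32 length first, least-significant byte first ... */
		for (i = 0; i < 4; i++)
			buf[n++] = (len >> (8 * i)) & 0xff;
		/* ... then the raw string bytes */
		memcpy(buf + n, s, len);
		n += len;

		for (i = 0; i < n; i++)
			printf("%02x ", buf[i]);
		printf("\n");	/* 04 00 00 00 63 65 70 68 */
		return 0;
	}
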
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..a1fc1d017b58
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,376 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
18
19/*
20 * fs id
21 */
22struct ceph_fsid {
23 unsigned char fsid[16];
24};
25
26static inline int ceph_fsid_compare(const struct ceph_fsid *a,
27 const struct ceph_fsid *b)
28{
29 return memcmp(a, b, sizeof(*a));
30}
31
32/*
33 * ino, object, etc.
34 */
35typedef __le64 ceph_snapid_t;
36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
39
40struct ceph_timespec {
41 __le32 tv_sec;
42 __le32 tv_nsec;
43} __attribute__ ((packed));
44
45
46/*
47 * object layout - how objects are mapped into PGs
48 */
49#define CEPH_OBJECT_LAYOUT_HASH 1
50#define CEPH_OBJECT_LAYOUT_LINEAR 2
51#define CEPH_OBJECT_LAYOUT_HASHINO 3
52
53/*
54 * pg layout -- how PGs are mapped onto (sets of) OSDs
55 */
56#define CEPH_PG_LAYOUT_CRUSH 0
57#define CEPH_PG_LAYOUT_HASH 1
58#define CEPH_PG_LAYOUT_LINEAR 2
59#define CEPH_PG_LAYOUT_HYBRID 3
60
61
62/*
63 * placement group.
64 * we encode this into one __le64.
65 */
66struct ceph_pg {
67 __le16 preferred; /* preferred primary osd */
68 __le16 ps; /* placement seed */
69 __le32 pool; /* object pool */
70} __attribute__ ((packed));
71
72/*
73 * pg_pool is a set of pgs storing a pool of objects
74 *
75 * pg_num -- base number of pseudorandomly placed pgs
76 *
77 * pgp_num -- effective number when calculating pg placement. this
78 * is used for pg_num increases. new pgs result in data being "split"
79 * into new pgs. for this to proceed smoothly, new pgs are initially
80 * colocated with their parents; that is, pgp_num doesn't increase
81 * until the new pgs have successfully split. only _then_ are the new
82 * pgs placed independently.
83 *
84 * lpg_num -- localized pg count (per device). replicas are randomly
85 * selected.
86 *
87 * lpgp_num -- as above.
88 */
89#define CEPH_PG_TYPE_REP 1
90#define CEPH_PG_TYPE_RAID4 2
91#define CEPH_PG_POOL_VERSION 2
92struct ceph_pg_pool {
93 __u8 type; /* CEPH_PG_TYPE_* */
94 __u8 size; /* number of osds in each pg */
95 __u8 crush_ruleset; /* crush placement rule */
96 __u8 object_hash; /* hash mapping object name to ps */
97 __le32 pg_num, pgp_num; /* number of pg's */
98 __le32 lpg_num, lpgp_num; /* number of localized pg's */
99 __le32 last_change; /* most recent epoch changed */
100 __le64 snap_seq; /* seq for per-pool snapshot */
101 __le32 snap_epoch; /* epoch of last snap */
102 __le32 num_snaps;
103 __le32 num_removed_snap_intervals;
104 __le64 uid;
105} __attribute__ ((packed));
106
107/*
108 * stable_mod func is used to control number of placement groups.
109 * similar to straight-up modulo, but produces a stable mapping as b
110 * increases over time. b is the number of bins, and bmask is the
111 * containing power of 2 minus 1.
112 *
113 * b <= bmask+1 and bmask=(2**n)-1 (when b is a power of 2, bmask == b-1)
114 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
115 */
116static inline int ceph_stable_mod(int x, int b, int bmask)
117{
118 if ((x & bmask) < b)
119 return x & bmask;
120 else
121 return x & (bmask >> 1);
122}
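
A userspace sketch of the "stable" property: growing b from 12 to 13 (with bmask still 15) remaps only inputs whose low four bits equal 12, where a plain x % b would reshuffle nearly every bin:

	#include <stdio.h>

	/* userspace copy of ceph_stable_mod() above */
	static int stable_mod(int x, int b, int bmask)
	{
		if ((x & bmask) < b)
			return x & bmask;
		else
			return x & (bmask >> 1);
	}

	int main(void)
	{
		int x;

		for (x = 0; x < 16; x++)
			printf("x=%2d  b=12: %2d  b=13: %2d\n",
			       x, stable_mod(x, 12, 15),
			       stable_mod(x, 13, 15));
		/* only x == 12 moves (bin 4 -> bin 12); all others stay put */
		return 0;
	}
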
123
124/*
125 * object layout - how a given object should be stored.
126 */
127struct ceph_object_layout {
128 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
129 __le32 ol_stripe_unit; /* for per-object parity, if any */
130} __attribute__ ((packed));
131
132/*
133 * compound epoch+version, used by storage layer to serialize mutations
134 */
135struct ceph_eversion {
136 __le32 epoch;
137 __le64 version;
138} __attribute__ ((packed));
139
140/*
141 * osd map bits
142 */
143
144/* status bits */
145#define CEPH_OSD_EXISTS 1
146#define CEPH_OSD_UP 2
147
148/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
149#define CEPH_OSD_IN 0x10000
150#define CEPH_OSD_OUT 0
151
152
153/*
154 * osd map flag bits
155 */
156#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
157#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
158#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
159#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
160#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
161
162/*
163 * osd ops
164 */
165#define CEPH_OSD_OP_MODE 0xf000
166#define CEPH_OSD_OP_MODE_RD 0x1000
167#define CEPH_OSD_OP_MODE_WR 0x2000
168#define CEPH_OSD_OP_MODE_RMW 0x3000
169#define CEPH_OSD_OP_MODE_SUB 0x4000
170
171#define CEPH_OSD_OP_TYPE 0x0f00
172#define CEPH_OSD_OP_TYPE_LOCK 0x0100
173#define CEPH_OSD_OP_TYPE_DATA 0x0200
174#define CEPH_OSD_OP_TYPE_ATTR 0x0300
175#define CEPH_OSD_OP_TYPE_EXEC 0x0400
176#define CEPH_OSD_OP_TYPE_PG 0x0500
177
178enum {
179 /** data **/
180 /* read */
181 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
182 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
183
184 /* fancy read */
185 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
186
187 /* write */
188 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
189 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
190 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
191 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
192 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
193
194 /* fancy write */
195 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
196 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
197 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
198 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
199
200 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
201 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
202 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
203
204 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
205
206 /** attrs **/
207 /* read */
208 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
209 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
210
211 /* write */
212 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
213 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
214 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
215 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
216
217 /** subop **/
218 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
219 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
220 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
221 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
222 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
223
224 /** lock **/
225 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
226 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
227 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
228 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
229 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
230 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
231
232 /** exec **/
233 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
234
235 /** pg **/
236 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
237};
238
239static inline int ceph_osd_op_type_lock(int op)
240{
241 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
242}
243static inline int ceph_osd_op_type_data(int op)
244{
245 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
246}
247static inline int ceph_osd_op_type_attr(int op)
248{
249 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
250}
251static inline int ceph_osd_op_type_exec(int op)
252{
253 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
254}
255static inline int ceph_osd_op_type_pg(int op)
256{
257 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
258}
259
260static inline int ceph_osd_op_mode_subop(int op)
261{
262 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
263}
264static inline int ceph_osd_op_mode_read(int op)
265{
266 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
267}
268static inline int ceph_osd_op_mode_modify(int op)
269{
270 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
271}
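
These predicates simply test the mode/type bit fields. A userspace sketch of how a composite op code decomposes, using CEPH_OSD_OP_WRITE as an example (mask values copied from the defines above):

	#include <stdio.h>

	/* mask values copied from CEPH_OSD_OP_MODE / CEPH_OSD_OP_TYPE above */
	#define OP_MODE 0xf000
	#define OP_TYPE 0x0f00

	int main(void)
	{
		int op = 0x2000 | 0x0200 | 1;	/* CEPH_OSD_OP_WRITE */

		printf("op 0x%04x: mode 0x%x, type 0x%x, num %d\n",
		       op, (op & OP_MODE) >> 12, (op & OP_TYPE) >> 8,
		       op & 0xff);
		/* prints: op 0x2201: mode 0x2, type 0x2, num 1 */
		return 0;
	}
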
272
273#define CEPH_OSD_TMAP_HDR 'h'
274#define CEPH_OSD_TMAP_SET 's'
275#define CEPH_OSD_TMAP_RM 'r'
276
277extern const char *ceph_osd_op_name(int op);
278
279
280/*
281 * osd op flags
282 *
283 * An op may be READ, WRITE, or READ|WRITE.
284 */
285enum {
286 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
287 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
288 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
289 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
290 CEPH_OSD_FLAG_READ = 16, /* op may read */
291 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
292 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
293 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
294 CEPH_OSD_FLAG_BALANCE_READS = 256,
295 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
296 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
297 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
298};
299
300enum {
301 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
302};
303
304#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
305#define EBLACKLISTED ESHUTDOWN /* blacklisted */
306
307/*
308 * an individual object operation. each may be accompanied by some data
309 * payload
310 */
311struct ceph_osd_op {
312 __le16 op; /* CEPH_OSD_OP_* */
313 __le32 flags; /* CEPH_OSD_FLAG_* */
314 union {
315 struct {
316 __le64 offset, length;
317 __le64 truncate_size;
318 __le32 truncate_seq;
319 } __attribute__ ((packed)) extent;
320 struct {
321 __le32 name_len;
322 __le32 value_len;
323 } __attribute__ ((packed)) xattr;
324 struct {
325 __u8 class_len;
326 __u8 method_len;
327 __u8 argc;
328 __le32 indata_len;
329 } __attribute__ ((packed)) cls;
330 struct {
331 __le64 cookie, count;
332 } __attribute__ ((packed)) pgls;
333 };
334 __le32 payload_len;
335} __attribute__ ((packed));
336
337/*
338 * osd request message header. each request may include multiple
339 * ceph_osd_op object operations.
340 */
341struct ceph_osd_request_head {
342 __le32 client_inc; /* client incarnation */
343 struct ceph_object_layout layout; /* pgid */
344 __le32 osdmap_epoch; /* client's osdmap epoch */
345
346 __le32 flags;
347
348 struct ceph_timespec mtime; /* for mutations only */
349 struct ceph_eversion reassert_version; /* if we are replaying op */
350
351 __le32 object_len; /* length of object name */
352
353 __le64 snapid; /* snapid to read */
354 __le64 snap_seq; /* writer's snap context */
355 __le32 num_snaps;
356
357 __le16 num_ops;
358 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
359} __attribute__ ((packed));
360
361struct ceph_osd_reply_head {
362 __le32 client_inc; /* client incarnation */
363 __le32 flags;
364 struct ceph_object_layout layout;
365 __le32 osdmap_epoch;
366 struct ceph_eversion reassert_version; /* for replaying uncommitted */
367
368 __le32 result; /* result code */
369
370 __le32 object_len; /* length of object name */
371 __le32 num_ops;
372 struct ceph_osd_op ops[0]; /* ops[], object */
373} __attribute__ ((packed));
374
375
376#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..2b881262ef67
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,907 @@
1#include "ceph_debug.h"
2
3#include <linux/sort.h>
4#include <linux/slab.h>
5
6#include "super.h"
7#include "decode.h"
8
9/*
10 * Snapshots in ceph are driven in large part by cooperation from the
11 * client. In contrast to local file systems or file servers that
12 * implement snapshots at a single point in the system, ceph's
13 * distributed access to storage requires clients to help decide
14 * whether a write logically occurs before or after a recently created
15 * snapshot.
16 *
17 * This provides a perfect instantaneous client-wide snapshot. Between
18 * clients, however, snapshots may appear to be applied at slightly
19 * different points in time, depending on delays in delivering the
20 * snapshot notification.
21 *
22 * Snapshots are _not_ file system-wide. Instead, each snapshot
23 * applies to the subdirectory nested beneath some directory. This
24 * effectively divides the hierarchy into multiple "realms," where all
25 * of the files contained by each realm share the same set of
26 * snapshots. An individual realm's snap set contains snapshots
27 * explicitly created on that realm, as well as any snaps in its
28 * parent's snap set _after_ the point at which the parent became its
29 * parent (due to, say, a rename). Similarly, snaps from prior parents
30 * are included for the intervals during which they were the parent.
31 *
32 * The client is spared most of this detail, fortunately... it need only
33 * maintain a hierarchy of realms reflecting the current parent/child
34 * realm relationship, and for each realm has an explicit list of snaps
35 * inherited from prior parents.
36 *
37 * A snap_realm struct is maintained for realms containing every inode
38 * with an open cap in the system. (The needed snap realm information is
39 * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
40 * version number is used to ensure that as realm parameters change (new
41 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
42 *
43 * The realm hierarchy drives the generation of a 'snap context' for each
44 * realm, which simply lists the resulting set of snaps for the realm. This
45 * is attached to any writes sent to OSDs.
46 */
47/*
48 * Unfortunately error handling is a bit mixed here. If we get a snap
49 * update, but don't have enough memory to update our realm hierarchy,
50 * it's not clear what we can do about it (besides complaining to the
51 * console).
52 */
53
54
55/*
56 * increase ref count for the realm
57 *
58 * caller must hold snap_rwsem for write.
59 */
60void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
61 struct ceph_snap_realm *realm)
62{
63 dout("get_realm %p %d -> %d\n", realm,
64 atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
65 /*
66 * since we _only_ increment realm refs or empty the empty
67 * list with snap_rwsem held, adjusting the empty list here is
68 * safe. we do need to protect against concurrent empty list
69 * additions, however.
70 */
71 if (atomic_read(&realm->nref) == 0) {
72 spin_lock(&mdsc->snap_empty_lock);
73 list_del_init(&realm->empty_item);
74 spin_unlock(&mdsc->snap_empty_lock);
75 }
76
77 atomic_inc(&realm->nref);
78}
79
80static void __insert_snap_realm(struct rb_root *root,
81 struct ceph_snap_realm *new)
82{
83 struct rb_node **p = &root->rb_node;
84 struct rb_node *parent = NULL;
85 struct ceph_snap_realm *r = NULL;
86
87 while (*p) {
88 parent = *p;
89 r = rb_entry(parent, struct ceph_snap_realm, node);
90 if (new->ino < r->ino)
91 p = &(*p)->rb_left;
92 else if (new->ino > r->ino)
93 p = &(*p)->rb_right;
94 else
95 BUG();
96 }
97
98 rb_link_node(&new->node, parent, p);
99 rb_insert_color(&new->node, root);
100}
101
102/*
103 * create the realm rooted at @ino (the tree itself takes no reference).
104 *
105 * caller must hold snap_rwsem for write.
106 */
107static struct ceph_snap_realm *ceph_create_snap_realm(
108 struct ceph_mds_client *mdsc,
109 u64 ino)
110{
111 struct ceph_snap_realm *realm;
112
113 realm = kzalloc(sizeof(*realm), GFP_NOFS);
114 if (!realm)
115 return ERR_PTR(-ENOMEM);
116
117 atomic_set(&realm->nref, 0); /* tree does not take a ref */
118 realm->ino = ino;
119 INIT_LIST_HEAD(&realm->children);
120 INIT_LIST_HEAD(&realm->child_item);
121 INIT_LIST_HEAD(&realm->empty_item);
122 INIT_LIST_HEAD(&realm->inodes_with_caps);
123 spin_lock_init(&realm->inodes_with_caps_lock);
124 __insert_snap_realm(&mdsc->snap_realms, realm);
125 dout("create_snap_realm %llx %p\n", realm->ino, realm);
126 return realm;
127}
128
129/*
130 * lookup the realm rooted at @ino.
131 *
132 * caller must hold snap_rwsem for write.
133 */
134struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
135 u64 ino)
136{
137 struct rb_node *n = mdsc->snap_realms.rb_node;
138 struct ceph_snap_realm *r;
139
140 while (n) {
141 r = rb_entry(n, struct ceph_snap_realm, node);
142 if (ino < r->ino)
143 n = n->rb_left;
144 else if (ino > r->ino)
145 n = n->rb_right;
146 else {
147 dout("lookup_snap_realm %llx %p\n", r->ino, r);
148 return r;
149 }
150 }
151 return NULL;
152}
153
154static void __put_snap_realm(struct ceph_mds_client *mdsc,
155 struct ceph_snap_realm *realm);
156
157/*
158 * called with snap_rwsem (write)
159 */
160static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
161 struct ceph_snap_realm *realm)
162{
163 dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
164
165 rb_erase(&realm->node, &mdsc->snap_realms);
166
167 if (realm->parent) {
168 list_del_init(&realm->child_item);
169 __put_snap_realm(mdsc, realm->parent);
170 }
171
172 kfree(realm->prior_parent_snaps);
173 kfree(realm->snaps);
174 ceph_put_snap_context(realm->cached_context);
175 kfree(realm);
176}
177
178/*
179 * caller holds snap_rwsem (write)
180 */
181static void __put_snap_realm(struct ceph_mds_client *mdsc,
182 struct ceph_snap_realm *realm)
183{
184 dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
185 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
186 if (atomic_dec_and_test(&realm->nref))
187 __destroy_snap_realm(mdsc, realm);
188}
189
190/*
191 * caller needn't hold any locks
192 */
193void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
194 struct ceph_snap_realm *realm)
195{
196 dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
197 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
198 if (!atomic_dec_and_test(&realm->nref))
199 return;
200
201 if (down_write_trylock(&mdsc->snap_rwsem)) {
202 __destroy_snap_realm(mdsc, realm);
203 up_write(&mdsc->snap_rwsem);
204 } else {
205 spin_lock(&mdsc->snap_empty_lock);
206 list_add(&mdsc->snap_empty, &realm->empty_item);
207 spin_unlock(&mdsc->snap_empty_lock);
208 }
209}
210
211/*
212 * Clean up any realms whose ref counts have dropped to zero. Note
213 * that this does not include realms that were created but not yet
214 * used.
215 *
216 * Called under snap_rwsem (write)
217 */
218static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
219{
220 struct ceph_snap_realm *realm;
221
222 spin_lock(&mdsc->snap_empty_lock);
223 while (!list_empty(&mdsc->snap_empty)) {
224 realm = list_first_entry(&mdsc->snap_empty,
225 struct ceph_snap_realm, empty_item);
226 list_del(&realm->empty_item);
227 spin_unlock(&mdsc->snap_empty_lock);
228 __destroy_snap_realm(mdsc, realm);
229 spin_lock(&mdsc->snap_empty_lock);
230 }
231 spin_unlock(&mdsc->snap_empty_lock);
232}
233
234void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
235{
236 down_write(&mdsc->snap_rwsem);
237 __cleanup_empty_realms(mdsc);
238 up_write(&mdsc->snap_rwsem);
239}
240
241/*
242 * adjust the parent realm of a given @realm. adjust the child list,
243 * parent pointers, and ref counts appropriately.
244 *
245 * return 1 if the parent was changed, 0 if unchanged, <0 on error.
246 *
247 * caller must hold snap_rwsem for write.
248 */
249static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
250 struct ceph_snap_realm *realm,
251 u64 parentino)
252{
253 struct ceph_snap_realm *parent;
254
255 if (realm->parent_ino == parentino)
256 return 0;
257
258 parent = ceph_lookup_snap_realm(mdsc, parentino);
259 if (!parent) {
260 parent = ceph_create_snap_realm(mdsc, parentino);
261 if (IS_ERR(parent))
262 return PTR_ERR(parent);
263 }
264 dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
265 realm->ino, realm, realm->parent_ino, realm->parent,
266 parentino, parent);
267 if (realm->parent) {
268 list_del_init(&realm->child_item);
269 ceph_put_snap_realm(mdsc, realm->parent);
270 }
271 realm->parent_ino = parentino;
272 realm->parent = parent;
273 ceph_get_snap_realm(mdsc, parent);
274 list_add(&realm->child_item, &parent->children);
275 return 1;
276}
277
278
279static int cmpu64_rev(const void *a, const void *b)
280{
281 if (*(u64 *)a < *(u64 *)b)
282 return 1;
283 if (*(u64 *)a > *(u64 *)b)
284 return -1;
285 return 0;
286}
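/*
 * A quick sketch of the comparator above: sort()ing {1, 4, 3} with
 * cmpu64_rev yields {4, 3, 1}, i.e. snap ids in descending order,
 * which is how build_snap_context() below lays out snapc->snaps.
 */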
287
288/*
289 * build the snap context for a given realm.
290 */
291static int build_snap_context(struct ceph_snap_realm *realm)
292{
293 struct ceph_snap_realm *parent = realm->parent;
294 struct ceph_snap_context *snapc;
295 int err = 0;
296 int i;
297 int num = realm->num_prior_parent_snaps + realm->num_snaps;
298
299 /*
300 * build parent context, if it hasn't been built.
301 * conservatively estimate that all parent snaps might be
302 * included by us.
303 */
304 if (parent) {
305 if (!parent->cached_context) {
306 err = build_snap_context(parent);
307 if (err)
308 goto fail;
309 }
310 num += parent->cached_context->num_snaps;
311 }
312
313 /* do i actually need to update? not if my context seq
314 matches the realm seq, and my parent's does too. (this works
315 because rebuild_snap_realms() works _downward_ in the
316 hierarchy after each update.) */
317 if (realm->cached_context &&
318 realm->cached_context->seq == realm->seq &&
319 (!parent ||
320 realm->cached_context->seq >= parent->cached_context->seq)) {
321 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
322 " (unchanged)\n",
323 realm->ino, realm, realm->cached_context,
324 realm->cached_context->seq,
325 realm->cached_context->num_snaps);
326 return 0;
327 }
328
329 /* alloc new snap context */
330 err = -ENOMEM;
331 if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
332 goto fail;
333 snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
334 if (!snapc)
335 goto fail;
336 atomic_set(&snapc->nref, 1);
337
338 /* build (reverse sorted) snap vector */
339 num = 0;
340 snapc->seq = realm->seq;
341 if (parent) {
342 /* include any of parent's snaps occurring _after_ my
343 parent became my parent */
344 for (i = 0; i < parent->cached_context->num_snaps; i++)
345 if (parent->cached_context->snaps[i] >=
346 realm->parent_since)
347 snapc->snaps[num++] =
348 parent->cached_context->snaps[i];
349 if (parent->cached_context->seq > snapc->seq)
350 snapc->seq = parent->cached_context->seq;
351 }
352 memcpy(snapc->snaps + num, realm->snaps,
353 sizeof(u64)*realm->num_snaps);
354 num += realm->num_snaps;
355 memcpy(snapc->snaps + num, realm->prior_parent_snaps,
356 sizeof(u64)*realm->num_prior_parent_snaps);
357 num += realm->num_prior_parent_snaps;
358
359 sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
360 snapc->num_snaps = num;
361 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
362 realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
363
364 if (realm->cached_context)
365 ceph_put_snap_context(realm->cached_context);
366 realm->cached_context = snapc;
367 return 0;
368
369fail:
370 /*
371 * if we fail, clear old (incorrect) cached_context... hopefully
372 * we'll have better luck building it later
373 */
374 if (realm->cached_context) {
375 ceph_put_snap_context(realm->cached_context);
376 realm->cached_context = NULL;
377 }
378 pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
379 realm, err);
380 return err;
381}
382
383/*
384 * rebuild snap context for the given realm and all of its children.
385 */
386static void rebuild_snap_realms(struct ceph_snap_realm *realm)
387{
388 struct ceph_snap_realm *child;
389
390 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
391 build_snap_context(realm);
392
393 list_for_each_entry(child, &realm->children, child_item)
394 rebuild_snap_realms(child);
395}
396
397
398/*
399 * helper to allocate and decode an array of snapids. free prior
400 * instance, if any.
401 */
402static int dup_array(u64 **dst, __le64 *src, int num)
403{
404 int i;
405
406 kfree(*dst);
407 if (num) {
408 *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
409 if (!*dst)
410 return -ENOMEM;
411 for (i = 0; i < num; i++)
412 (*dst)[i] = get_unaligned_le64(src + i);
413 } else {
414 *dst = NULL;
415 }
416 return 0;
417}
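/*
 * For example (a sketch): given an on-wire __le64 array encoding
 * {5, 2}, dup_array() leaves *dst pointing at a freshly allocated
 * host-order u64 array {5, 2}, regardless of host endianness; with
 * num == 0 it simply frees any old array and sets *dst to NULL.
 */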
418
419
420/*
421 * When a snapshot is applied, the size/mtime inode metadata is queued
422 * in a ceph_cap_snap (one for each snapshot) until writeback
423 * completes and the metadata can be flushed back to the MDS.
424 *
425 * However, if a (sync) write is currently in-progress when we apply
426 * the snapshot, we have to wait until the write succeeds or fails
427 * (and a final size/mtime is known). In this case we set
428 * cap_snap->writing = 1, and the cap_snap is said to be "pending."
429 * When the write finishes, we call __ceph_finish_cap_snap().
430 *
431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
432 * change).
433 */
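/*
 * Timeline sketch of the "pending" case described above:
 *
 *   sync write in flight --> snapshot arrives:
 *                              ceph_queue_cap_snap() sets capsnap->writing
 *   write completes      --> __ceph_finish_cap_snap() settles size/mtime
 *                              and queues the capsnap for flush
 */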
434void ceph_queue_cap_snap(struct ceph_inode_info *ci)
435{
436 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap;
438 int used;
439
440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
441 if (!capsnap) {
442 pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
443 return;
444 }
445
446 spin_lock(&inode->i_lock);
447 used = __ceph_caps_used(ci);
448 if (__ceph_have_pending_cap_snap(ci)) {
449 /* there is no point in queuing multiple "pending" cap_snaps,
450 as no new writes are allowed to start when pending, so any
451 writes in progress now were started before the previous
452 cap_snap. lucky us. */
453 dout("queue_cap_snap %p already pending\n", inode);
454 kfree(capsnap);
455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc;
457
458 igrab(inode);
459
460 atomic_set(&capsnap->nref, 1);
461 capsnap->ci = ci;
462 INIT_LIST_HEAD(&capsnap->ci_item);
463 INIT_LIST_HEAD(&capsnap->flushing_item);
464
465 capsnap->follows = snapc->seq - 1;
466 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci);
468
469 capsnap->mode = inode->i_mode;
470 capsnap->uid = inode->i_uid;
471 capsnap->gid = inode->i_gid;
472
473 /* fixme? */
474 capsnap->xattr_blob = NULL;
475 capsnap->xattr_len = 0;
476
477 /* dirty page count moved from _head to this cap_snap;
478 all subsequent page dirties occur _after_ this
479 snapshot. */
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0;
482 capsnap->context = snapc;
483 ci->i_head_snapc = NULL;
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485
486 if (used & CEPH_CAP_FILE_WR) {
487 dout("queue_cap_snap %p cap_snap %p snapc %p"
488 " seq %llu used WR, now pending\n", inode,
489 capsnap, snapc, snapc->seq);
490 capsnap->writing = 1;
491 } else {
492 /* note mtime, size NOW. */
493 __ceph_finish_cap_snap(ci, capsnap);
494 }
495 } else {
496 dout("queue_cap_snap %p nothing dirty|writing\n", inode);
497 kfree(capsnap);
498 }
499
500 spin_unlock(&inode->i_lock);
501}
502
503/*
504 * Finalize the size and mtime for a cap_snap; that is, settle on the
505 * final values to be used for the snapshot, to be flushed back to the MDS.
506 *
507 * If capsnap can now be flushed, add to snap_flush list, and return 1.
508 *
509 * Caller must hold i_lock.
510 */
511int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap)
513{
514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
516
517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size;
519 capsnap->mtime = inode->i_mtime;
520 capsnap->atime = inode->i_atime;
521 capsnap->ctime = inode->i_ctime;
522 capsnap->time_warp_seq = ci->i_time_warp_seq;
523 if (capsnap->dirty_pages) {
524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
525 "still has %d dirty pages\n", inode, capsnap,
526 capsnap->context, capsnap->context->seq,
527 ceph_cap_string(capsnap->dirty), capsnap->size,
528 capsnap->dirty_pages);
529 return 0;
530 }
531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
532 inode, capsnap, capsnap->context,
533 capsnap->context->seq, ceph_cap_string(capsnap->dirty),
534 capsnap->size);
535
536 spin_lock(&mdsc->snap_flush_lock);
537 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
538 spin_unlock(&mdsc->snap_flush_lock);
539 return 1; /* caller may want to ceph_flush_snaps */
540}
541
542
543/*
544 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
545 * the snap realm parameters from a given realm and all of its ancestors,
546 * up to the root.
547 *
548 * Caller must hold snap_rwsem for write.
549 */
550int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
551 void *p, void *e, bool deletion)
552{
553 struct ceph_mds_snap_realm *ri; /* encoded */
554 __le64 *snaps; /* encoded */
555 __le64 *prior_parent_snaps; /* encoded */
556 struct ceph_snap_realm *realm;
557 int invalidate = 0;
558 int err = -ENOMEM;
559
560 dout("update_snap_trace deletion=%d\n", deletion);
561more:
562 ceph_decode_need(&p, e, sizeof(*ri), bad);
563 ri = p;
564 p += sizeof(*ri);
565 ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
566 le32_to_cpu(ri->num_prior_parent_snaps)), bad);
567 snaps = p;
568 p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
569 prior_parent_snaps = p;
570 p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
571
572 realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
573 if (!realm) {
574 realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
575 if (IS_ERR(realm)) {
576 err = PTR_ERR(realm);
577 goto fail;
578 }
579 }
580
581 if (le64_to_cpu(ri->seq) > realm->seq) {
582 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
583 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
584 /*
585 * if the realm seq has changed, queue a cap_snap for every
586 * inode with open caps. we do this _before_ we update
587 * the realm info so that we prepare for writeback under the
588 * _previous_ snap context.
589 *
590 * ...unless it's a snap deletion!
591 */
592 if (!deletion) {
593 struct ceph_inode_info *ci;
594 struct inode *lastinode = NULL;
595
596 spin_lock(&realm->inodes_with_caps_lock);
597 list_for_each_entry(ci, &realm->inodes_with_caps,
598 i_snap_realm_item) {
599 struct inode *inode = igrab(&ci->vfs_inode);
600 if (!inode)
601 continue;
602 spin_unlock(&realm->inodes_with_caps_lock);
603 if (lastinode)
604 iput(lastinode);
605 lastinode = inode;
606 ceph_queue_cap_snap(ci);
607 spin_lock(&realm->inodes_with_caps_lock);
608 }
609 spin_unlock(&realm->inodes_with_caps_lock);
610 if (lastinode)
611 iput(lastinode);
612 dout("update_snap_trace cap_snaps queued\n");
613 }
614
615 } else {
616 dout("update_snap_trace %llx %p seq %lld unchanged\n",
617 realm->ino, realm, realm->seq);
618 }
619
620 /* ensure the parent is correct */
621 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
622 if (err < 0)
623 goto fail;
624 invalidate += err;
625
626 if (le64_to_cpu(ri->seq) > realm->seq) {
627 /* update realm parameters, snap lists */
628 realm->seq = le64_to_cpu(ri->seq);
629 realm->created = le64_to_cpu(ri->created);
630 realm->parent_since = le64_to_cpu(ri->parent_since);
631
632 realm->num_snaps = le32_to_cpu(ri->num_snaps);
633 err = dup_array(&realm->snaps, snaps, realm->num_snaps);
634 if (err < 0)
635 goto fail;
636
637 realm->num_prior_parent_snaps =
638 le32_to_cpu(ri->num_prior_parent_snaps);
639 err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
640 realm->num_prior_parent_snaps);
641 if (err < 0)
642 goto fail;
643
644 invalidate = 1;
645 } else if (!realm->cached_context) {
646 invalidate = 1;
647 }
648
649 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
650 realm, invalidate, p, e);
651
652 if (p < e)
653 goto more;
654
655 /* invalidate when we reach the _end_ (root) of the trace */
656 if (invalidate)
657 rebuild_snap_realms(realm);
658
659 __cleanup_empty_realms(mdsc);
660 return 0;
661
662bad:
663 err = -EINVAL;
664fail:
665 pr_err("update_snap_trace error %d\n", err);
666 return err;
667}
668
669
670/*
671 * Send any cap_snaps that are queued for flush. Try to carry
672 * s_mutex across multiple snap flushes to avoid locking overhead.
673 *
674 * Caller holds no locks.
675 */
676static void flush_snaps(struct ceph_mds_client *mdsc)
677{
678 struct ceph_inode_info *ci;
679 struct inode *inode;
680 struct ceph_mds_session *session = NULL;
681
682 dout("flush_snaps\n");
683 spin_lock(&mdsc->snap_flush_lock);
684 while (!list_empty(&mdsc->snap_flush_list)) {
685 ci = list_first_entry(&mdsc->snap_flush_list,
686 struct ceph_inode_info, i_snap_flush_item);
687 inode = &ci->vfs_inode;
688 igrab(inode);
689 spin_unlock(&mdsc->snap_flush_lock);
690 spin_lock(&inode->i_lock);
691 __ceph_flush_snaps(ci, &session);
692 spin_unlock(&inode->i_lock);
693 iput(inode);
694 spin_lock(&mdsc->snap_flush_lock);
695 }
696 spin_unlock(&mdsc->snap_flush_lock);
697
698 if (session) {
699 mutex_unlock(&session->s_mutex);
700 ceph_put_mds_session(session);
701 }
702 dout("flush_snaps done\n");
703}
704
705
706/*
707 * Handle a snap notification from the MDS.
708 *
709 * This can take two basic forms: the simplest is just a snap creation
710 * or deletion notification on an existing realm. This should update the
711 * realm and its children.
712 *
713 * The more difficult case is realm creation, due to snap creation at a
714 * new point in the file hierarchy, or due to a rename that moves a file or
715 * directory into another realm.
716 */
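/*
 * Message layout, as decoded below (a sketch; the split arrays are
 * consumed only for CEPH_SNAP_OP_SPLIT):
 *
 *   struct ceph_mds_snap_head h;
 *   __le64 split_inos[h.num_split_inos];
 *   __le64 split_realms[h.num_split_realms];
 *   <snap trace>   (consumed by ceph_update_snap_trace)
 */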
717void ceph_handle_snap(struct ceph_mds_client *mdsc,
718 struct ceph_mds_session *session,
719 struct ceph_msg *msg)
720{
721 struct super_block *sb = mdsc->client->sb;
722 int mds = session->s_mds;
723 u64 split;
724 int op;
725 int trace_len;
726 struct ceph_snap_realm *realm = NULL;
727 void *p = msg->front.iov_base;
728 void *e = p + msg->front.iov_len;
729 struct ceph_mds_snap_head *h;
730 int num_split_inos, num_split_realms;
731 __le64 *split_inos = NULL, *split_realms = NULL;
732 int i;
733 int locked_rwsem = 0;
734
735 /* decode */
736 if (msg->front.iov_len < sizeof(*h))
737 goto bad;
738 h = p;
739 op = le32_to_cpu(h->op);
740 split = le64_to_cpu(h->split); /* non-zero if we are splitting an
741 * existing realm */
742 num_split_inos = le32_to_cpu(h->num_split_inos);
743 num_split_realms = le32_to_cpu(h->num_split_realms);
744 trace_len = le32_to_cpu(h->trace_len);
745 p += sizeof(*h);
746
747 dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
748 ceph_snap_op_name(op), split, trace_len);
749
750 mutex_lock(&session->s_mutex);
751 session->s_seq++;
752 mutex_unlock(&session->s_mutex);
753
754 down_write(&mdsc->snap_rwsem);
755 locked_rwsem = 1;
756
757 if (op == CEPH_SNAP_OP_SPLIT) {
758 struct ceph_mds_snap_realm *ri;
759
760 /*
761 * A "split" breaks part of an existing realm off into
762 * a new realm. The MDS provides a list of inodes
763 * (with caps) and child realms that belong to the new
764 * child.
765 */
766 split_inos = p;
767 p += sizeof(u64) * num_split_inos;
768 split_realms = p;
769 p += sizeof(u64) * num_split_realms;
770 ceph_decode_need(&p, e, sizeof(*ri), bad);
771 /* we will peek at realm info here, but will _not_
772 * advance p, as the realm update will occur below in
773 * ceph_update_snap_trace. */
774 ri = p;
775
776 realm = ceph_lookup_snap_realm(mdsc, split);
777 if (!realm) {
778 realm = ceph_create_snap_realm(mdsc, split);
779 if (IS_ERR(realm))
780 goto out;
781 }
782 ceph_get_snap_realm(mdsc, realm);
783
784 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
785 for (i = 0; i < num_split_inos; i++) {
786 struct ceph_vino vino = {
787 .ino = le64_to_cpu(split_inos[i]),
788 .snap = CEPH_NOSNAP,
789 };
790 struct inode *inode = ceph_find_inode(sb, vino);
791 struct ceph_inode_info *ci;
792
793 if (!inode)
794 continue;
795 ci = ceph_inode(inode);
796
797 spin_lock(&inode->i_lock);
798 if (!ci->i_snap_realm)
799 goto skip_inode;
800 /*
801 * If this inode belongs to a realm that was
802 * created after our new realm, we experienced
803 * a race (due to another split notification
804 * arriving from a different MDS). So skip
805 * this inode.
806 */
807 if (ci->i_snap_realm->created >
808 le64_to_cpu(ri->created)) {
809 dout(" leaving %p in newer realm %llx %p\n",
810 inode, ci->i_snap_realm->ino,
811 ci->i_snap_realm);
812 goto skip_inode;
813 }
814 dout(" will move %p to split realm %llx %p\n",
815 inode, realm->ino, realm);
816 /*
817 * Remove the inode from the realm's inode
818 * list, but don't add it to the new realm
819 * yet. We don't want the cap_snap to be
820 * queued (again) by ceph_update_snap_trace()
821 * below. Queue it _now_, under the old context.
822 */
823 spin_lock(&realm->inodes_with_caps_lock);
824 list_del_init(&ci->i_snap_realm_item);
825 spin_unlock(&realm->inodes_with_caps_lock);
826 spin_unlock(&inode->i_lock);
827
828 ceph_queue_cap_snap(ci);
829
830 iput(inode);
831 continue;
832
833skip_inode:
834 spin_unlock(&inode->i_lock);
835 iput(inode);
836 }
837
838 /* we may have taken some of the old realm's children. */
839 for (i = 0; i < num_split_realms; i++) {
840 struct ceph_snap_realm *child =
841 ceph_lookup_snap_realm(mdsc,
842 le64_to_cpu(split_realms[i]));
843 if (!child)
844 continue;
845 adjust_snap_realm_parent(mdsc, child, realm->ino);
846 }
847 }
848
849 /*
850 * update using the provided snap trace. if we are deleting a
851 * snap, we can avoid queueing cap_snaps.
852 */
853 ceph_update_snap_trace(mdsc, p, e,
854 op == CEPH_SNAP_OP_DESTROY);
855
856 if (op == CEPH_SNAP_OP_SPLIT) {
857 /*
858 * ok, _now_ add the inodes into the new realm.
859 */
860 for (i = 0; i < num_split_inos; i++) {
861 struct ceph_vino vino = {
862 .ino = le64_to_cpu(split_inos[i]),
863 .snap = CEPH_NOSNAP,
864 };
865 struct inode *inode = ceph_find_inode(sb, vino);
866 struct ceph_inode_info *ci;
867
868 if (!inode)
869 continue;
870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock);
872 if (!ci->i_snap_realm)
873 goto split_skip_inode;
874 ceph_put_snap_realm(mdsc, ci->i_snap_realm);
875 spin_lock(&realm->inodes_with_caps_lock);
876 list_add(&ci->i_snap_realm_item,
877 &realm->inodes_with_caps);
878 ci->i_snap_realm = realm;
879 spin_unlock(&realm->inodes_with_caps_lock);
880 ceph_get_snap_realm(mdsc, realm);
881split_skip_inode:
882 spin_unlock(&inode->i_lock);
883 iput(inode);
884 }
885
886 /* we took a reference when we created the realm, above */
887 ceph_put_snap_realm(mdsc, realm);
888 }
889
890 __cleanup_empty_realms(mdsc);
891
892 up_write(&mdsc->snap_rwsem);
893
894 flush_snaps(mdsc);
895 return;
896
897bad:
898 pr_err("corrupt snap message from mds%d\n", mds);
899 ceph_msg_dump(msg);
900out:
901 if (locked_rwsem)
902 up_write(&mdsc->snap_rwsem);
903 return;
904}
905
906
907
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..75d02eaa1279
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1031 @@
1
2#include "ceph_debug.h"
3
4#include <linux/backing-dev.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/seq_file.h>
14#include <linux/slab.h>
15#include <linux/statfs.h>
16#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19
20#include "decode.h"
21#include "super.h"
22#include "mon_client.h"
23#include "auth.h"
24
25/*
26 * Ceph superblock operations
27 *
28 * Handle the basics of mounting, unmounting.
29 */
30
31
32/*
33 * find filename portion of a path (/foo/bar/baz -> baz)
34 */
35const char *ceph_file_part(const char *s, int len)
36{
37 const char *e = s + len;
38
39 while (e != s && *(e-1) != '/')
40 e--;
41 return e;
42}
43
44
45/*
46 * super ops
47 */
48static void ceph_put_super(struct super_block *s)
49{
50 struct ceph_client *cl = ceph_client(s);
51
52 dout("put_super\n");
53 ceph_mdsc_close_sessions(&cl->mdsc);
54 return;
55}
56
57static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
58{
59 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
60 struct ceph_monmap *monmap = client->monc.monmap;
61 struct ceph_statfs st;
62 u64 fsid;
63 int err;
64
65 dout("statfs\n");
66 err = ceph_monc_do_statfs(&client->monc, &st);
67 if (err < 0)
68 return err;
69
70 /* fill in kstatfs */
71 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
72
73 /*
74 * express utilization in terms of large blocks to avoid
75 * overflow on 32-bit machines.
76 */
77 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
78 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
79 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
80 (CEPH_BLOCK_SHIFT-10);
81 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
82
83 buf->f_files = le64_to_cpu(st.num_objects);
84 buf->f_ffree = -1;
85 buf->f_namelen = PATH_MAX;
86 buf->f_frsize = PAGE_CACHE_SIZE;
87
88 /* leave fsid little-endian, regardless of host endianness */
89 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
90 buf->f_fsid.val[0] = fsid & 0xffffffff;
91 buf->f_fsid.val[1] = fsid >> 32;
92
93 return 0;
94}
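/*
 * Worked example for the large-block scaling above (a sketch): with
 * CEPH_BLOCK_SHIFT = 20, f_bsize is 1MB and st.kb >> 10 converts KB
 * to 1MB blocks, so a 16TB cluster reports f_blocks = 16777216
 * rather than a kilobyte count that would overflow 32 bits.
 */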
95
96
97static int ceph_syncfs(struct super_block *sb, int wait)
98{
99 dout("sync_fs %d\n", wait);
100 ceph_osdc_sync(&ceph_client(sb)->osdc);
101 ceph_mdsc_sync(&ceph_client(sb)->mdsc);
102 dout("sync_fs %d done\n", wait);
103 return 0;
104}
105
106
107/**
108 * ceph_show_options - Show mount options in /proc/mounts
109 * @m: seq_file to write to
110 * @mnt: mount descriptor
111 */
112static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
113{
114 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
115 struct ceph_mount_args *args = client->mount_args;
116
117 if (args->flags & CEPH_OPT_FSID)
118 seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
119 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
120 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
121 if (args->flags & CEPH_OPT_NOSHARE)
122 seq_puts(m, ",noshare");
123 if (args->flags & CEPH_OPT_DIRSTAT)
124 seq_puts(m, ",dirstat");
125 if ((args->flags & CEPH_OPT_RBYTES) == 0)
126 seq_puts(m, ",norbytes");
127 if (args->flags & CEPH_OPT_NOCRC)
128 seq_puts(m, ",nocrc");
129 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
130 seq_puts(m, ",noasyncreaddir");
131 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
132 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
133 if (args->name)
134 seq_printf(m, ",name=%s", args->name);
135 if (args->secret)
136 seq_puts(m, ",secret=<hidden>");
137 return 0;
138}
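/*
 * With hypothetical values, the options appended by ceph_show_options
 * might render a /proc/mounts line like:
 *
 *   1.2.3.4:6789:/ /mnt/ceph ceph rw,dirstat,name=admin,secret=<hidden> 0 0
 */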
139
140/*
141 * caches
142 */
143struct kmem_cache *ceph_inode_cachep;
144struct kmem_cache *ceph_cap_cachep;
145struct kmem_cache *ceph_dentry_cachep;
146struct kmem_cache *ceph_file_cachep;
147
148static void ceph_inode_init_once(void *foo)
149{
150 struct ceph_inode_info *ci = foo;
151 inode_init_once(&ci->vfs_inode);
152}
153
154static int default_congestion_kb(void)
155{
156 int congestion_kb;
157
158 /*
159 * Copied from NFS
160 *
161 * congestion size, scale with available memory.
162 *
163 * 64MB: 8192k
164 * 128MB: 11585k
165 * 256MB: 16384k
166 * 512MB: 23170k
167 * 1GB: 32768k
168 * 2GB: 46340k
169 * 4GB: 65536k
170 * 8GB: 92681k
171 * 16GB: 131072k
172 *
173 * This allows larger machines to have larger/more transfers.
174 * Limit the default to 256M
175 */
176 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
177 if (congestion_kb > 256*1024)
178 congestion_kb = 256*1024;
179
180 return congestion_kb;
181}
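/*
 * Worked example of the formula above (a sketch, assuming 4k pages):
 * with 1GB of RAM, totalram_pages = 262144, int_sqrt() = 512, and
 * (16*512) << (12-10) = 32768k -- matching the table.
 */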
182
183static int __init init_caches(void)
184{
185 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
186 sizeof(struct ceph_inode_info),
187 __alignof__(struct ceph_inode_info),
188 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
189 ceph_inode_init_once);
190 if (ceph_inode_cachep == NULL)
191 return -ENOMEM;
192
193 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
194 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
195 if (ceph_cap_cachep == NULL)
196 goto bad_cap;
197
198 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
199 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
200 if (ceph_dentry_cachep == NULL)
201 goto bad_dentry;
202
203 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
204 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
205 if (ceph_file_cachep == NULL)
206 goto bad_file;
207
208 return 0;
209
210bad_file:
211 kmem_cache_destroy(ceph_dentry_cachep);
212bad_dentry:
213 kmem_cache_destroy(ceph_cap_cachep);
214bad_cap:
215 kmem_cache_destroy(ceph_inode_cachep);
216 return -ENOMEM;
217}
218
219static void destroy_caches(void)
220{
221 kmem_cache_destroy(ceph_inode_cachep);
222 kmem_cache_destroy(ceph_cap_cachep);
223 kmem_cache_destroy(ceph_dentry_cachep);
224 kmem_cache_destroy(ceph_file_cachep);
225}
226
227
228/*
229 * ceph_umount_begin - initiate forced umount. Tear down the
230 * mount, skipping steps that may hang while waiting for server(s).
231 */
232static void ceph_umount_begin(struct super_block *sb)
233{
234 struct ceph_client *client = ceph_sb_to_client(sb);
235
236 dout("ceph_umount_begin - starting forced umount\n");
237 if (!client)
238 return;
239 client->mount_state = CEPH_MOUNT_SHUTDOWN;
240 return;
241}
242
243static const struct super_operations ceph_super_ops = {
244 .alloc_inode = ceph_alloc_inode,
245 .destroy_inode = ceph_destroy_inode,
246 .write_inode = ceph_write_inode,
247 .sync_fs = ceph_syncfs,
248 .put_super = ceph_put_super,
249 .show_options = ceph_show_options,
250 .statfs = ceph_statfs,
251 .umount_begin = ceph_umount_begin,
252};
253
254
255const char *ceph_msg_type_name(int type)
256{
257 switch (type) {
258 case CEPH_MSG_SHUTDOWN: return "shutdown";
259 case CEPH_MSG_PING: return "ping";
260 case CEPH_MSG_AUTH: return "auth";
261 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
262 case CEPH_MSG_MON_MAP: return "mon_map";
263 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
264 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
265 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
266 case CEPH_MSG_STATFS: return "statfs";
267 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
268 case CEPH_MSG_MDS_MAP: return "mds_map";
269 case CEPH_MSG_CLIENT_SESSION: return "client_session";
270 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
271 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
272 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
273 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
274 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
275 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
276 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
277 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
278 case CEPH_MSG_OSD_MAP: return "osd_map";
279 case CEPH_MSG_OSD_OP: return "osd_op";
280 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
281 default: return "unknown";
282 }
283}
284
285
286/*
287 * mount options
288 */
289enum {
290 Opt_fsidmajor,
291 Opt_fsidminor,
292 Opt_monport,
293 Opt_wsize,
294 Opt_rsize,
295 Opt_osdtimeout,
296 Opt_osdkeepalivetimeout,
297 Opt_mount_timeout,
298 Opt_osd_idle_ttl,
299 Opt_caps_wanted_delay_min,
300 Opt_caps_wanted_delay_max,
301 Opt_readdir_max_entries,
302 Opt_congestion_kb,
303 Opt_last_int,
304 /* int args above */
305 Opt_snapdirname,
306 Opt_name,
307 Opt_secret,
308 Opt_last_string,
309 /* string args above */
310 Opt_ip,
311 Opt_noshare,
312 Opt_dirstat,
313 Opt_nodirstat,
314 Opt_rbytes,
315 Opt_norbytes,
316 Opt_nocrc,
317 Opt_noasyncreaddir,
318};
319
320static match_table_t arg_tokens = {
321 {Opt_fsidmajor, "fsidmajor=%ld"},
322 {Opt_fsidminor, "fsidminor=%ld"},
323 {Opt_monport, "monport=%d"},
324 {Opt_wsize, "wsize=%d"},
325 {Opt_rsize, "rsize=%d"},
326 {Opt_osdtimeout, "osdtimeout=%d"},
327 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
328 {Opt_mount_timeout, "mount_timeout=%d"},
329 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
330 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
331 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
332 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
333 {Opt_congestion_kb, "write_congestion_kb=%d"},
334 /* int args above */
335 {Opt_snapdirname, "snapdirname=%s"},
336 {Opt_name, "name=%s"},
337 {Opt_secret, "secret=%s"},
338 /* string args above */
339 {Opt_ip, "ip=%s"},
340 {Opt_noshare, "noshare"},
341 {Opt_dirstat, "dirstat"},
342 {Opt_nodirstat, "nodirstat"},
343 {Opt_rbytes, "rbytes"},
344 {Opt_norbytes, "norbytes"},
345 {Opt_nocrc, "nocrc"},
346 {Opt_noasyncreaddir, "noasyncreaddir"},
347 {-1, NULL}
348};
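/*
 * A hypothetical invocation exercising these tokens:
 *
 *   mount -t ceph 1.2.3.4:6789,1.2.3.5:6789:/ /mnt/ceph \
 *        -o name=admin,secret=<key>,rsize=524288,norbytes
 */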
349
350
351static struct ceph_mount_args *parse_mount_args(int flags, char *options,
352 const char *dev_name,
353 const char **path)
354{
355 struct ceph_mount_args *args;
356 const char *c;
357 int err = -ENOMEM;
358 substring_t argstr[MAX_OPT_ARGS];
359
360 args = kzalloc(sizeof(*args), GFP_KERNEL);
361 if (!args)
362 return ERR_PTR(-ENOMEM);
363 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
364 GFP_KERNEL);
365 if (!args->mon_addr)
366 goto out;
367
368 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
369
370 /* start with defaults */
371 args->sb_flags = flags;
372 args->flags = CEPH_OPT_DEFAULT;
373 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
374 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
375 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
376 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
377 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
378 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
379 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
380 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
381 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
382 args->max_readdir = 1024;
383 args->congestion_kb = default_congestion_kb();
384
385 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
386 err = -EINVAL;
387 if (!dev_name)
388 goto out;
389 *path = strstr(dev_name, ":/");
390 if (*path == NULL) {
391 pr_err("device name is missing path (no :/ in %s)\n",
392 dev_name);
393 goto out;
394 }
395
396 /* get mon ip(s) */
397 err = ceph_parse_ips(dev_name, *path, args->mon_addr,
398 CEPH_MAX_MON, &args->num_mon);
399 if (err < 0)
400 goto out;
401
402 /* path on server */
403 *path += 2;
404 dout("server path '%s'\n", *path);
405
406 /* parse mount options */
407 while ((c = strsep(&options, ",")) != NULL) {
408 int token, intval, ret;
409 if (!*c)
410 continue;
411 err = -EINVAL;
412 token = match_token((char *)c, arg_tokens, argstr);
413 if (token < 0) {
414 pr_err("bad mount option at '%s'\n", c);
415 goto out;
416 }
417 if (token < Opt_last_int) {
418 ret = match_int(&argstr[0], &intval);
419 if (ret < 0) {
420 pr_err("bad mount option arg (not int) "
421 "at '%s'\n", c);
422 continue;
423 }
424 dout("got int token %d val %d\n", token, intval);
425 } else if (token > Opt_last_int && token < Opt_last_string) {
426 dout("got string token %d val %s\n", token,
427 argstr[0].from);
428 } else {
429 dout("got token %d\n", token);
430 }
431 switch (token) {
432 case Opt_fsidmajor:
433 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
434 break;
435 case Opt_fsidminor:
436 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
437 break;
438 case Opt_ip:
439 err = ceph_parse_ips(argstr[0].from,
440 argstr[0].to,
441 &args->my_addr,
442 1, NULL);
443 if (err < 0)
444 goto out;
445 args->flags |= CEPH_OPT_MYIP;
446 break;
447
448 case Opt_snapdirname:
449 kfree(args->snapdir_name);
450 args->snapdir_name = kstrndup(argstr[0].from,
451 argstr[0].to-argstr[0].from,
452 GFP_KERNEL);
453 break;
454 case Opt_name:
455 args->name = kstrndup(argstr[0].from,
456 argstr[0].to-argstr[0].from,
457 GFP_KERNEL);
458 break;
459 case Opt_secret:
460 args->secret = kstrndup(argstr[0].from,
461 argstr[0].to-argstr[0].from,
462 GFP_KERNEL);
463 break;
464
465 /* misc */
466 case Opt_wsize:
467 args->wsize = intval;
468 break;
469 case Opt_rsize:
470 args->rsize = intval;
471 break;
472 case Opt_osdtimeout:
473 args->osd_timeout = intval;
474 break;
475 case Opt_osdkeepalivetimeout:
476 args->osd_keepalive_timeout = intval;
477 break;
478 case Opt_mount_timeout:
479 args->mount_timeout = intval;
480 break;
case Opt_osd_idle_ttl:
args->osd_idle_ttl = intval;
break;
481 case Opt_caps_wanted_delay_min:
482 args->caps_wanted_delay_min = intval;
483 break;
484 case Opt_caps_wanted_delay_max:
485 args->caps_wanted_delay_max = intval;
486 break;
487 case Opt_readdir_max_entries:
488 args->max_readdir = intval;
489 break;
490 case Opt_congestion_kb:
491 args->congestion_kb = intval;
492 break;
493
494 case Opt_noshare:
495 args->flags |= CEPH_OPT_NOSHARE;
496 break;
497
498 case Opt_dirstat:
499 args->flags |= CEPH_OPT_DIRSTAT;
500 break;
501 case Opt_nodirstat:
502 args->flags &= ~CEPH_OPT_DIRSTAT;
503 break;
504 case Opt_rbytes:
505 args->flags |= CEPH_OPT_RBYTES;
506 break;
507 case Opt_norbytes:
508 args->flags &= ~CEPH_OPT_RBYTES;
509 break;
510 case Opt_nocrc:
511 args->flags |= CEPH_OPT_NOCRC;
512 break;
513 case Opt_noasyncreaddir:
514 args->flags |= CEPH_OPT_NOASYNCREADDIR;
515 break;
516
517 default:
518 BUG_ON(token);
519 }
520 }
521 return args;
522
523out:
524 kfree(args->mon_addr);
525 kfree(args);
526 return ERR_PTR(err);
527}
528
529static void destroy_mount_args(struct ceph_mount_args *args)
530{
531 dout("destroy_mount_args %p\n", args);
532 kfree(args->snapdir_name);
533 args->snapdir_name = NULL;
534 kfree(args->name);
535 args->name = NULL;
536 kfree(args->secret);
537 args->secret = NULL;
538 kfree(args);
539}
540
541/*
542 * create a fresh client instance
543 */
544static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
545{
546 struct ceph_client *client;
547 int err = -ENOMEM;
548
549 client = kzalloc(sizeof(*client), GFP_KERNEL);
550 if (client == NULL)
551 return ERR_PTR(-ENOMEM);
552
553 mutex_init(&client->mount_mutex);
554
555 init_waitqueue_head(&client->auth_wq);
556
557 client->sb = NULL;
558 client->mount_state = CEPH_MOUNT_MOUNTING;
559 client->mount_args = args;
560
561 client->msgr = NULL;
562
563 client->auth_err = 0;
564 atomic_long_set(&client->writeback_count, 0);
565
566 err = bdi_init(&client->backing_dev_info);
567 if (err < 0)
568 goto fail;
569
570 err = -ENOMEM;
571 client->wb_wq = create_workqueue("ceph-writeback");
572 if (client->wb_wq == NULL)
573 goto fail_bdi;
574 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
575 if (client->pg_inv_wq == NULL)
576 goto fail_wb_wq;
577 client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
578 if (client->trunc_wq == NULL)
579 goto fail_pg_inv_wq;
580
581 /* set up mempools */
582 err = -ENOMEM;
583 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
584 client->mount_args->wsize >> PAGE_CACHE_SHIFT);
585 if (!client->wb_pagevec_pool)
586 goto fail_trunc_wq;
587
588 /* caps */
589 client->min_caps = args->max_readdir;
590 ceph_adjust_min_caps(client->min_caps);
591
592 /* subsystems */
593 err = ceph_monc_init(&client->monc, client);
594 if (err < 0)
595 goto fail_mempool;
596 err = ceph_osdc_init(&client->osdc, client);
597 if (err < 0)
598 goto fail_monc;
599 err = ceph_mdsc_init(&client->mdsc, client);
600 if (err < 0)
601 goto fail_osdc;
602 return client;
603
604fail_osdc:
605 ceph_osdc_stop(&client->osdc);
606fail_monc:
607 ceph_monc_stop(&client->monc);
608fail_mempool:
609 mempool_destroy(client->wb_pagevec_pool);
610fail_trunc_wq:
611 destroy_workqueue(client->trunc_wq);
612fail_pg_inv_wq:
613 destroy_workqueue(client->pg_inv_wq);
614fail_wb_wq:
615 destroy_workqueue(client->wb_wq);
616fail_bdi:
617 bdi_destroy(&client->backing_dev_info);
618fail:
619 kfree(client);
620 return ERR_PTR(err);
621}
622
623static void ceph_destroy_client(struct ceph_client *client)
624{
625 dout("destroy_client %p\n", client);
626
627 /* unmount */
628 ceph_mdsc_stop(&client->mdsc);
629 ceph_monc_stop(&client->monc);
630 ceph_osdc_stop(&client->osdc);
631
632 ceph_adjust_min_caps(-client->min_caps);
633
634 ceph_debugfs_client_cleanup(client);
635 destroy_workqueue(client->wb_wq);
636 destroy_workqueue(client->pg_inv_wq);
637 destroy_workqueue(client->trunc_wq);
638
639 if (client->msgr)
640 ceph_messenger_destroy(client->msgr);
641 mempool_destroy(client->wb_pagevec_pool);
642
643 destroy_mount_args(client->mount_args);
644
645 kfree(client);
646 dout("destroy_client %p done\n", client);
647}
648
649/*
650 * Initially learn our fsid, or verify an fsid matches.
651 */
652int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
653{
654 if (client->have_fsid) {
655 if (ceph_fsid_compare(&client->fsid, fsid)) {
656 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
657 PR_FSID(&client->fsid), PR_FSID(fsid));
658 return -1;
659 }
660 } else {
661 pr_info("client%lld fsid " FSID_FORMAT "\n",
662 client->monc.auth->global_id, PR_FSID(fsid));
663 memcpy(&client->fsid, fsid, sizeof(*fsid));
664 ceph_debugfs_client_init(client);
665 client->have_fsid = true;
666 }
667 return 0;
668}
669
670/*
671 * true if we have the mon map (and have thus joined the cluster)
672 */
673static int have_mon_map(struct ceph_client *client)
674{
675 return client->monc.monmap && client->monc.monmap->epoch;
676}
677
678/*
679 * Bootstrap mount by opening the root directory. Note the mount
680 * @started time from caller, and time out if this takes too long.
681 */
682static struct dentry *open_root_dentry(struct ceph_client *client,
683 const char *path,
684 unsigned long started)
685{
686 struct ceph_mds_client *mdsc = &client->mdsc;
687 struct ceph_mds_request *req = NULL;
688 int err;
689 struct dentry *root;
690
691 /* open dir */
692 dout("open_root_inode opening '%s'\n", path);
693 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
694 if (IS_ERR(req))
695 return ERR_CAST(req);
696 req->r_path1 = kstrdup(path, GFP_NOFS);
697 req->r_ino1.ino = CEPH_INO_ROOT;
698 req->r_ino1.snap = CEPH_NOSNAP;
699 req->r_started = started;
700 req->r_timeout = client->mount_args->mount_timeout * HZ;
701 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
702 req->r_num_caps = 2;
703 err = ceph_mdsc_do_request(mdsc, NULL, req);
704 if (err == 0) {
705 dout("open_root_inode success\n");
706 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
707 client->sb->s_root == NULL)
708 root = d_alloc_root(req->r_target_inode);
709 else
710 root = d_obtain_alias(req->r_target_inode);
711 req->r_target_inode = NULL;
712 dout("open_root_inode success, root dentry is %p\n", root);
713 } else {
714 root = ERR_PTR(err);
715 }
716 ceph_mdsc_put_request(req);
717 return root;
718}
719
720/*
721 * mount: join the ceph cluster, and open root directory.
722 */
723static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
724 const char *path)
725{
726 struct ceph_entity_addr *myaddr = NULL;
727 int err;
728 unsigned long timeout = client->mount_args->mount_timeout * HZ;
729 unsigned long started = jiffies; /* note the start time */
730 struct dentry *root;
731
732 dout("mount start\n");
733 mutex_lock(&client->mount_mutex);
734
735 /* initialize the messenger */
736 if (client->msgr == NULL) {
737 if (ceph_test_opt(client, MYIP))
738 myaddr = &client->mount_args->my_addr;
739 client->msgr = ceph_messenger_create(myaddr);
740 if (IS_ERR(client->msgr)) {
741 err = PTR_ERR(client->msgr);
742 client->msgr = NULL;
743 goto out;
744 }
745 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
746 }
747
748 /* open session, and wait for mon, mds, and osd maps */
749 err = ceph_monc_open_session(&client->monc);
750 if (err < 0)
751 goto out;
752
753 while (!have_mon_map(client)) {
754 err = -EIO;
755 if (timeout && time_after_eq(jiffies, started + timeout))
756 goto out;
757
758 /* wait */
759 dout("mount waiting for mon_map\n");
760 err = wait_event_interruptible_timeout(client->auth_wq,
761 have_mon_map(client) || (client->auth_err < 0),
762 timeout);
763 if (err == -EINTR || err == -ERESTARTSYS)
764 goto out;
765 if (client->auth_err < 0) {
766 err = client->auth_err;
767 goto out;
768 }
769 }
770
771 dout("mount opening root\n");
772 root = open_root_dentry(client, "", started);
773 if (IS_ERR(root)) {
774 err = PTR_ERR(root);
775 goto out;
776 }
777 if (client->sb->s_root)
778 dput(root);
779 else
780 client->sb->s_root = root;
781
782 if (path[0] == 0) {
783 dget(root);
784 } else {
785 dout("mount opening base mountpoint\n");
786 root = open_root_dentry(client, path, started);
787 if (IS_ERR(root)) {
788 err = PTR_ERR(root);
789 dput(client->sb->s_root);
790 client->sb->s_root = NULL;
791 goto out;
792 }
793 }
794
795 mnt->mnt_root = root;
796 mnt->mnt_sb = client->sb;
797
798 client->mount_state = CEPH_MOUNT_MOUNTED;
799 dout("mount success\n");
800 err = 0;
801
802out:
803 mutex_unlock(&client->mount_mutex);
804 return err;
805}
806
807static int ceph_set_super(struct super_block *s, void *data)
808{
809 struct ceph_client *client = data;
810 int ret;
811
812 dout("set_super %p data %p\n", s, data);
813
814 s->s_flags = client->mount_args->sb_flags;
815 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
816
817 s->s_fs_info = client;
818 client->sb = s;
819
820 s->s_op = &ceph_super_ops;
821 s->s_export_op = &ceph_export_ops;
822
823 s->s_time_gran = 1000; /* 1000 ns == 1 us */
824
825 ret = set_anon_super(s, NULL); /* what is that second arg for? */
826 if (ret != 0)
827 goto fail;
828
829 return ret;
830
831fail:
832 s->s_fs_info = NULL;
833 client->sb = NULL;
834 return ret;
835}
836
837/*
838 * share superblock if same fs AND options
839 */
840static int ceph_compare_super(struct super_block *sb, void *data)
841{
842 struct ceph_client *new = data;
843 struct ceph_mount_args *args = new->mount_args;
844 struct ceph_client *other = ceph_sb_to_client(sb);
845 int i;
846
847 dout("ceph_compare_super %p\n", sb);
848 if (args->flags & CEPH_OPT_FSID) {
849 if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
850 dout("fsid doesn't match\n");
851 return 0;
852 }
853 } else {
854 /* do we share (a) monitor? */
855 for (i = 0; i < new->monc.monmap->num_mon; i++)
856 if (ceph_monmap_contains(other->monc.monmap,
857 &new->monc.monmap->mon_inst[i].addr))
858 break;
859 if (i == new->monc.monmap->num_mon) {
860 dout("mon ip not part of monmap\n");
861 return 0;
862 }
863 dout("mon ip matches existing sb %p\n", sb);
864 }
865 if (args->sb_flags != other->mount_args->sb_flags) {
866 dout("flags differ\n");
867 return 0;
868 }
869 return 1;
870}
871
872/*
873 * construct our own bdi so we can control readahead, etc.
874 */
875static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
876{
877 int err;
878
879 sb->s_bdi = &client->backing_dev_info;
880
881 /* set ra_pages based on rsize mount option? */
882 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
883 client->backing_dev_info.ra_pages =
884 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
885 >> PAGE_SHIFT;
886 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
887 return err;
888}
889
890static int ceph_get_sb(struct file_system_type *fs_type,
891 int flags, const char *dev_name, void *data,
892 struct vfsmount *mnt)
893{
894 struct super_block *sb;
895 struct ceph_client *client;
896 int err;
897 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
898 const char *path = NULL;
899 struct ceph_mount_args *args;
900
901 dout("ceph_get_sb\n");
902 args = parse_mount_args(flags, data, dev_name, &path);
903 if (IS_ERR(args)) {
904 err = PTR_ERR(args);
905 goto out_final;
906 }
907
908 /* create client (which we may/may not use) */
909 client = ceph_create_client(args);
910 if (IS_ERR(client)) {
911 err = PTR_ERR(client);
912 goto out_final;
913 }
914
915 if (client->mount_args->flags & CEPH_OPT_NOSHARE)
916 compare_super = NULL;
917 sb = sget(fs_type, compare_super, ceph_set_super, client);
918 if (IS_ERR(sb)) {
919 err = PTR_ERR(sb);
920 goto out;
921 }
922
923 if (ceph_client(sb) != client) {
924 ceph_destroy_client(client);
925 client = ceph_client(sb);
926 dout("get_sb got existing client %p\n", client);
927 } else {
928 dout("get_sb using new client %p\n", client);
929 err = ceph_register_bdi(sb, client);
930 if (err < 0)
931 goto out_splat;
932 }
933
934 err = ceph_mount(client, mnt, path);
935 if (err < 0)
936 goto out_splat;
937 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
938 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
939 return 0;
940
941out_splat:
942 ceph_mdsc_close_sessions(&client->mdsc);
943 up_write(&sb->s_umount);
944 deactivate_super(sb);
945 goto out_final;
946
947out:
948 ceph_destroy_client(client);
949out_final:
950 dout("ceph_get_sb fail %d\n", err);
951 return err;
952}
953
954static void ceph_kill_sb(struct super_block *s)
955{
956 struct ceph_client *client = ceph_sb_to_client(s);
957 dout("kill_sb %p\n", s);
958 ceph_mdsc_pre_umount(&client->mdsc);
959 kill_anon_super(s); /* will call put_super after sb is r/o */
960 if (s->s_bdi == &client->backing_dev_info)
961 bdi_unregister(&client->backing_dev_info);
962 bdi_destroy(&client->backing_dev_info);
963 ceph_destroy_client(client);
964}
965
966static struct file_system_type ceph_fs_type = {
967 .owner = THIS_MODULE,
968 .name = "ceph",
969 .get_sb = ceph_get_sb,
970 .kill_sb = ceph_kill_sb,
971 .fs_flags = FS_RENAME_DOES_D_MOVE,
972};
973
974#define _STRINGIFY(x) #x
975#define STRINGIFY(x) _STRINGIFY(x)
976
977static int __init init_ceph(void)
978{
979 int ret = 0;
980
981 ret = ceph_debugfs_init();
982 if (ret < 0)
983 goto out;
984
985 ret = ceph_msgr_init();
986 if (ret < 0)
987 goto out_debugfs;
988
989 ret = init_caches();
990 if (ret)
991 goto out_msgr;
992
993 ceph_caps_init();
994
995 ret = register_filesystem(&ceph_fs_type);
996 if (ret)
997 goto out_icache;
998
999 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
1000 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
1001 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
1002 return 0;
1003
1004out_icache:
1005 destroy_caches();
1006out_msgr:
1007 ceph_msgr_exit();
1008out_debugfs:
1009 ceph_debugfs_cleanup();
1010out:
1011 return ret;
1012}
1013
1014static void __exit exit_ceph(void)
1015{
1016 dout("exit_ceph\n");
1017 unregister_filesystem(&ceph_fs_type);
1018 ceph_caps_finalize();
1019 destroy_caches();
1020 ceph_msgr_exit();
1021 ceph_debugfs_cleanup();
1022}
1023
1024module_init(init_ceph);
1025module_exit(exit_ceph);
1026
1027MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1028MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1029MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1030MODULE_DESCRIPTION("Ceph filesystem for Linux");
1031MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..e30dfbb056c3
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,901 @@
1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15#include <linux/slab.h>
16
17#include "types.h"
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24
25/* f_type in struct statfs */
26#define CEPH_SUPER_MAGIC 0x00c36400
27
28/* large granularity for statfs utilization stats to facilitate
29 * large volume sizes on 32-bit machines. */
30#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32
33/*
34 * mount options
35 */
36#define CEPH_OPT_FSID (1<<0)
37#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
38#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
39#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
40#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
41#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
42#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
43
44#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
45
46#define ceph_set_opt(client, opt) \
47 (client)->mount_args->flags |= CEPH_OPT_##opt;
48#define ceph_test_opt(client, opt) \
49 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
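/*
 * e.g. ceph_test_opt(client, NOCRC) expands to
 * (!!((client)->mount_args->flags & CEPH_OPT_NOCRC)).
 */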
50
51
52struct ceph_mount_args {
53 int sb_flags;
54 int num_mon;
55 struct ceph_entity_addr *mon_addr;
56 int flags;
57 int mount_timeout;
58 int osd_idle_ttl;
59 int caps_wanted_delay_min, caps_wanted_delay_max;
60 struct ceph_fsid fsid;
61 struct ceph_entity_addr my_addr;
62 int wsize;
63 int rsize; /* max readahead */
64 int max_readdir; /* max readdir size */
65 int congestion_kb; /* max writeback in flight (kb) */
66 int osd_timeout;
67 int osd_keepalive_timeout;
68 char *snapdir_name; /* default ".snap" */
69 char *name;
70 char *secret;
71 int cap_release_safety;
72};
73
74/*
75 * defaults
76 */
77#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
78#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
79#define CEPH_OSD_KEEPALIVE_DEFAULT 5
80#define CEPH_OSD_IDLE_TTL_DEFAULT 60
81#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
82
83#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
84#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
85
86#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
87#define CEPH_AUTH_NAME_DEFAULT "guest"
88
89/*
90 * Delay telling the MDS we no longer want caps, in case we reopen
91 * the file. Delay a minimum amount of time, even if we send a cap
92 * message for some other reason. Otherwise, take the opportunity to
93 * update the mds to avoid sending another message later.
94 */
95#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
96#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
97
98
99/* mount state */
100enum {
101 CEPH_MOUNT_MOUNTING,
102 CEPH_MOUNT_MOUNTED,
103 CEPH_MOUNT_UNMOUNTING,
104 CEPH_MOUNT_UNMOUNTED,
105 CEPH_MOUNT_SHUTDOWN,
106};
107
108/*
109 * subtract jiffies
110 */
111static inline unsigned long time_sub(unsigned long a, unsigned long b)
112{
113 BUG_ON(time_after(b, a));
114 return (long)a - (long)b;
115}
116
117/*
118 * per-filesystem client state
119 *
120 * possibly shared by multiple mount points, if they are
121 * mounting the same ceph filesystem/cluster.
122 */
123struct ceph_client {
124 struct ceph_fsid fsid;
125 bool have_fsid;
126
127 struct mutex mount_mutex; /* serialize mount attempts */
128 struct ceph_mount_args *mount_args;
129
130 struct super_block *sb;
131
132 unsigned long mount_state;
133 wait_queue_head_t auth_wq;
134
135 int auth_err;
136
137 int min_caps; /* min caps i added */
138
139 struct ceph_messenger *msgr; /* messenger instance */
140 struct ceph_mon_client monc;
141 struct ceph_mds_client mdsc;
142 struct ceph_osd_client osdc;
143
144 /* writeback */
145 mempool_t *wb_pagevec_pool;
146 struct workqueue_struct *wb_wq;
147 struct workqueue_struct *pg_inv_wq;
148 struct workqueue_struct *trunc_wq;
149 atomic_long_t writeback_count;
150
151 struct backing_dev_info backing_dev_info;
152
153#ifdef CONFIG_DEBUG_FS
154 struct dentry *debugfs_monmap;
155 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
156 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
157 struct dentry *debugfs_congestion_kb;
158 struct dentry *debugfs_bdi;
159#endif
160};
161
162static inline struct ceph_client *ceph_client(struct super_block *sb)
163{
164 return sb->s_fs_info;
165}
166
167
168/*
169 * File i/o capability. This tracks shared state with the metadata
170 * server that allows us to cache or writeback attributes or to read
171 * and write data. For any given inode, we should have one or more
172 * capabilities, one issued by each metadata server, and our
173 * cumulative access is the OR of all issued capabilities.
174 *
175 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
176 * session capability lists.
177 */
178struct ceph_cap {
179 struct ceph_inode_info *ci;
180 struct rb_node ci_node; /* per-ci cap tree */
181 struct ceph_mds_session *session;
182 struct list_head session_caps; /* per-session caplist */
183 int mds;
184 u64 cap_id; /* unique cap id (mds provided) */
185 int issued; /* latest, from the mds */
186 int implemented; /* implemented superset of issued (for revocation) */
187 int mds_wanted;
188 u32 seq, issue_seq, mseq;
189 u32 cap_gen; /* active/stale cycle */
190 unsigned long last_used;
191 struct list_head caps_item;
192};
193
194#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
195#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
196#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
197
198/*
199 * Snapped cap state that is pending flush to mds. When a snapshot occurs,
200 * we first complete any in-process sync writes and writeback any dirty
201 * data before flushing the snapped state (tracked here) back to the MDS.
202 */
203struct ceph_cap_snap {
204 atomic_t nref;
205 struct ceph_inode_info *ci;
206 struct list_head ci_item, flushing_item;
207
208 u64 follows, flush_tid;
209 int issued, dirty;
210 struct ceph_snap_context *context;
211
212 mode_t mode;
213 uid_t uid;
214 gid_t gid;
215
216 void *xattr_blob;
217 int xattr_len;
218 u64 xattr_version;
219
220 u64 size;
221 struct timespec mtime, atime, ctime;
222 u64 time_warp_seq;
223 int writing; /* a sync write is still in progress */
224 int dirty_pages; /* dirty pages awaiting writeback */
225};
226
227static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
228{
229 if (atomic_dec_and_test(&capsnap->nref))
230 kfree(capsnap);
231}
232
233/*
234 * The frag tree describes how a directory is fragmented, potentially across
235 * multiple metadata servers. It is also used to indicate points where
236 * metadata authority is delegated, and whether/where metadata is replicated.
237 *
238 * A _leaf_ frag will be present in the i_fragtree IFF there is
239 * delegation info. That is, if mds >= 0 || ndist > 0.
240 */
241#define CEPH_MAX_DIRFRAG_REP 4
242
243struct ceph_inode_frag {
244 struct rb_node node;
245
246 /* fragtree state */
247 u32 frag;
248 int split_by; /* i.e. 2^(split_by) children */
249
250 /* delegation and replication info */
251 int mds; /* -1 if same authority as parent */
252 int ndist; /* >0 if replicated */
253 int dist[CEPH_MAX_DIRFRAG_REP];
254};
255
256/*
257 * We cache inode xattrs as an encoded blob until they are first used,
258 * at which point we parse them into an rbtree.
259 */
260struct ceph_inode_xattr {
261 struct rb_node node;
262
263 const char *name;
264 int name_len;
265 const char *val;
266 int val_len;
267 int dirty;
268
269 int should_free_name;
270 int should_free_val;
271};
272
273struct ceph_inode_xattrs_info {
274 /*
275 * (still encoded) xattr blob. we avoid the overhead of parsing
276 * this until someone actually calls getxattr, etc.
277 *
278 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
279 * NULL means we don't know.
280 */
281 struct ceph_buffer *blob, *prealloc_blob;
282
283 struct rb_root index;
284 bool dirty;
285 int count;
286 int names_size;
287 int vals_size;
288 u64 version, index_version;
289};
290
291/*
292 * Ceph inode.
293 */
294#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
295#define CEPH_I_NODELAY 4 /* do not delay cap release */
296#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
297#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
298
299struct ceph_inode_info {
300 struct ceph_vino i_vino; /* ceph ino + snap */
301
302 u64 i_version;
303 u32 i_time_warp_seq;
304
305 unsigned i_ceph_flags;
306 unsigned long i_release_count;
307
308 struct ceph_file_layout i_layout;
309 char *i_symlink;
310
311 /* for dirs */
312 struct timespec i_rctime;
313 u64 i_rbytes, i_rfiles, i_rsubdirs;
314 u64 i_files, i_subdirs;
315 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
316
317 struct rb_root i_fragtree;
318 struct mutex i_fragtree_mutex;
319
320 struct ceph_inode_xattrs_info i_xattrs;
321
322 /* capabilities. protected _both_ by i_lock and cap->session's
323 * s_mutex. */
324 struct rb_root i_caps; /* cap list */
325 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
326 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
327 struct list_head i_dirty_item, i_flushing_item;
328 u64 i_cap_flush_seq;
329 /* we need to track cap writeback on a per-cap-bit basis, to allow
330 * overlapping, pipelined cap flushes to the mds. we can probably
331 * reduce the tid to 8 bits if we're concerned about inode size. */
332 u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
333 wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
334 unsigned long i_hold_caps_min; /* jiffies */
335 unsigned long i_hold_caps_max; /* jiffies */
336 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
337 int i_cap_exporting_mds; /* to handle cap migration between */
338 unsigned i_cap_exporting_mseq; /* mds's. */
339 unsigned i_cap_exporting_issued;
340 struct ceph_cap_reservation i_cap_migration_resv;
341 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
342 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
343 unsigned i_snap_caps; /* cap bits for snapped files */
344
345 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
346
347 u32 i_truncate_seq; /* last truncate to smaller size */
348 u64 i_truncate_size; /* and the size we last truncated down to */
349 int i_truncate_pending; /* still need to call vmtruncate */
350
351 u64 i_max_size; /* max file size authorized by mds */
352 u64 i_reported_size; /* (max_)size reported to or requested of mds */
354 u64 i_wanted_max_size; /* offset we'd like to write to */
354 u64 i_requested_max_size; /* max_size we've requested */
355
356 /* held references to caps */
357 int i_pin_ref;
358 int i_rd_ref, i_rdcache_ref, i_wr_ref;
359 int i_wrbuffer_ref, i_wrbuffer_ref_head;
360 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
361 u32 i_rdcache_gen; /* we increment this each time we get
362 FILE_CACHE. If it's non-zero, we
363 _may_ have cached pages. */
364 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
365
366 struct list_head i_unsafe_writes; /* uncommitted sync writes */
367 struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
368 spinlock_t i_unsafe_lock;
369
370 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
371 int i_snap_realm_counter; /* snap realm (if caps) */
372 struct list_head i_snap_realm_item;
373 struct list_head i_snap_flush_item;
374
375 struct work_struct i_wb_work; /* writeback work */
376 struct work_struct i_pg_inv_work; /* page invalidation work */
377
378 struct work_struct i_vmtruncate_work;
379
380 struct inode vfs_inode; /* at end */
381};
382
383static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
384{
385 return container_of(inode, struct ceph_inode_info, vfs_inode);
386}
387
388static inline void ceph_i_clear(struct inode *inode, unsigned mask)
389{
390 struct ceph_inode_info *ci = ceph_inode(inode);
391
392 spin_lock(&inode->i_lock);
393 ci->i_ceph_flags &= ~mask;
394 spin_unlock(&inode->i_lock);
395}
396
397static inline void ceph_i_set(struct inode *inode, unsigned mask)
398{
399 struct ceph_inode_info *ci = ceph_inode(inode);
400
401 spin_lock(&inode->i_lock);
402 ci->i_ceph_flags |= mask;
403 spin_unlock(&inode->i_lock);
404}
405
406static inline bool ceph_i_test(struct inode *inode, unsigned mask)
407{
408 struct ceph_inode_info *ci = ceph_inode(inode);
409 bool r;
410
411 smp_mb();
412 r = (ci->i_ceph_flags & mask) == mask;
413 return r;
414}
415
416
417/* find a specific frag @f */
418extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
419 u32 f);
420
421/*
422 * choose fragment for value @v. copy frag content to pfrag, if leaf
423 * exists
424 */
425extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
426 struct ceph_inode_frag *pfrag,
427 int *found);
428
429/*
430 * Ceph dentry state
431 */
432struct ceph_dentry_info {
433 struct ceph_mds_session *lease_session;
434 u32 lease_gen, lease_shared_gen;
435 u32 lease_seq;
436 unsigned long lease_renew_after, lease_renew_from;
437 struct list_head lru;
438 struct dentry *dentry;
439 u64 time;
440 u64 offset;
441};
442
443static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
444{
445 return (struct ceph_dentry_info *)dentry->d_fsdata;
446}
447
448static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
449{
450 return ((loff_t)frag << 32) | (loff_t)off;
451}
452
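For instance, ceph_make_fpos(0x2, 5) yields 0x0000000200000005: the fragment occupies the high 32 bits of the file position and the within-frag offset the low 32.
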
453/*
454 * ino_t is <64 bits on many architectures, blech.
455 *
456 * don't include snap in ino hash, at least for now.
457 */
458static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
459{
460 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
461#if BITS_PER_LONG == 32
462 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
463 if (!ino)
464 ino = 1;
465#endif
466 return ino;
467}
468
469static inline int ceph_set_ino_cb(struct inode *inode, void *data)
470{
471 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
472 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
473 return 0;
474}
475
476static inline struct ceph_vino ceph_vino(struct inode *inode)
477{
478 return ceph_inode(inode)->i_vino;
479}
480
481/* for printf-style formatting */
482#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
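Since ceph_vinop expands to two comma-separated values, it pairs with two conversions in the format string, as the dout calls elsewhere in this patch do:

	dout("inode %p %llx.%llx\n", inode, ceph_vinop(inode));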
483
484static inline u64 ceph_ino(struct inode *inode)
485{
486 return ceph_inode(inode)->i_vino.ino;
487}
488static inline u64 ceph_snap(struct inode *inode)
489{
490 return ceph_inode(inode)->i_vino.snap;
491}
492
493static inline int ceph_ino_compare(struct inode *inode, void *data)
494{
495 struct ceph_vino *pvino = (struct ceph_vino *)data;
496 struct ceph_inode_info *ci = ceph_inode(inode);
497 return ci->i_vino.ino == pvino->ino &&
498 ci->i_vino.snap == pvino->snap;
499}
500
501static inline struct inode *ceph_find_inode(struct super_block *sb,
502 struct ceph_vino vino)
503{
504 ino_t t = ceph_vino_to_ino(vino);
505 return ilookup5(sb, t, ceph_ino_compare, &vino);
506}
507
508
509/*
510 * caps helpers
511 */
512static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
513{
514 return !RB_EMPTY_ROOT(&ci->i_caps);
515}
516
517extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
518extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
519extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
520 struct ceph_cap *cap);
521
522static inline int ceph_caps_issued(struct ceph_inode_info *ci)
523{
524 int issued;
525 spin_lock(&ci->vfs_inode.i_lock);
526 issued = __ceph_caps_issued(ci, NULL);
527 spin_unlock(&ci->vfs_inode.i_lock);
528 return issued;
529}
530
531static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
532 int touch)
533{
534 int r;
535 spin_lock(&ci->vfs_inode.i_lock);
536 r = __ceph_caps_issued_mask(ci, mask, touch);
537 spin_unlock(&ci->vfs_inode.i_lock);
538 return r;
539}
540
541static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
542{
543 return ci->i_dirty_caps | ci->i_flushing_caps;
544}
545extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
546
547extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
548extern int __ceph_caps_used(struct ceph_inode_info *ci);
549
550extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
551
552/*
553 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
554 */
555static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
556{
557 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
558 if (w & CEPH_CAP_FILE_BUFFER)
559 w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
560 return w;
561}
562
563/* what the mds thinks we want */
564extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
565
566extern void ceph_caps_init(void);
567extern void ceph_caps_finalize(void);
568extern void ceph_adjust_min_caps(int delta);
569extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
570extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
571extern void ceph_reservation_status(struct ceph_client *client,
572 int *total, int *avail, int *used,
573 int *reserved, int *min);
574
575static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
576{
577 return (struct ceph_client *)inode->i_sb->s_fs_info;
578}
579
580static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
581{
582 return (struct ceph_client *)sb->s_fs_info;
583}
584
585
586/*
587 * we keep buffered readdir results attached to file->private_data
588 */
589struct ceph_file_info {
590 int fmode; /* initialized on open */
591
592 /* readdir: position within the dir */
593 u32 frag;
594 struct ceph_mds_request *last_readdir;
595 int at_end;
596
597 /* readdir: position within a frag */
598 unsigned offset; /* offset of last chunk, adjusted for . and .. */
599 u64 next_offset; /* offset of next chunk (last_name's + 1) */
600 char *last_name; /* last entry in previous chunk */
601 struct dentry *dentry; /* next dentry (for dcache readdir) */
602 unsigned long dir_release_count;
603
604 /* used for -o dirstat read() on a directory */
605 char *dir_info;
606 int dir_info_len;
607};
608
609
610
611/*
612 * snapshots
613 */
614
615/*
616 * A "snap context" is the set of existing snapshots when we
617 * write data. It is used by the OSD to guide its COW behavior.
618 *
619 * The ceph_snap_context is refcounted, and attached to each dirty
620 * page, indicating which context the dirty data belonged to when it
621 * was dirtied.
622 */
623struct ceph_snap_context {
624 atomic_t nref;
625 u64 seq;
626 int num_snaps;
627 u64 snaps[];
628};
629
630static inline struct ceph_snap_context *
631ceph_get_snap_context(struct ceph_snap_context *sc)
632{
633 /*
634 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
635 atomic_read(&sc->nref)+1);
636 */
637 if (sc)
638 atomic_inc(&sc->nref);
639 return sc;
640}
641
642static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
643{
644 if (!sc)
645 return;
646 /*
647 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
648 atomic_read(&sc->nref)-1);
649 */
650 if (atomic_dec_and_test(&sc->nref)) {
651 /*printk(" deleting snap_context %p\n", sc);*/
652 kfree(sc);
653 }
654}
655
656/*
657 * A "snap realm" describes a subset of the file hierarchy sharing
658 * the same set of snapshots that apply to it. The realms themselves
659 * are organized into a hierarchy, such that children inherit (some of)
660 * the snapshots of their parents.
661 *
662 * All inodes within the realm that have capabilities are linked into a
663 * per-realm list.
664 */
665struct ceph_snap_realm {
666 u64 ino;
667 atomic_t nref;
668 struct rb_node node;
669
670 u64 created, seq;
671 u64 parent_ino;
672 u64 parent_since; /* snapid when our current parent became so */
673
674 u64 *prior_parent_snaps; /* snaps inherited from any parents we */
675 int num_prior_parent_snaps; /* had prior to parent_since */
676 u64 *snaps; /* snaps specific to this realm */
677 int num_snaps;
678
679 struct ceph_snap_realm *parent;
680 struct list_head children; /* list of child realms */
681 struct list_head child_item;
682
683 struct list_head empty_item; /* if i have ref==0 */
684
685 /* the current set of snaps for this realm */
686 struct ceph_snap_context *cached_context;
687
688 struct list_head inodes_with_caps;
689 spinlock_t inodes_with_caps_lock;
690};
691
692
693
694/*
695 * calculate the number of pages a given length and offset map onto,
696 * if we align the data.
697 */
698static inline int calc_pages_for(u64 off, u64 len)
699{
700 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
701 (off >> PAGE_CACHE_SHIFT);
702}
703
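For example, with 4 KB pages (PAGE_CACHE_SHIFT == 12), calc_pages_for(1000, 8192) covers bytes [1000, 9192) and returns ((1000+8192+4095) >> 12) - (1000 >> 12) = 3 - 0 = 3, i.e. the range touches pages 0, 1 and 2.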
704
705
706/* snap.c */
707struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
708 u64 ino);
709extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
710 struct ceph_snap_realm *realm);
711extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
712 struct ceph_snap_realm *realm);
713extern int ceph_update_snap_trace(struct ceph_mds_client *m,
714 void *p, void *e, bool deletion);
715extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
716 struct ceph_mds_session *session,
717 struct ceph_msg *msg);
718extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
719extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
720 struct ceph_cap_snap *capsnap);
721extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
722
723/*
724 * a cap_snap is "pending" if it is still awaiting an in-progress
725 * sync write (that may/may not still update size, mtime, etc.).
726 */
727static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
728{
729 return !list_empty(&ci->i_cap_snaps) &&
730 list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
731 ci_item)->writing;
732}
733
734
735/* super.c */
736extern struct kmem_cache *ceph_inode_cachep;
737extern struct kmem_cache *ceph_cap_cachep;
738extern struct kmem_cache *ceph_dentry_cachep;
739extern struct kmem_cache *ceph_file_cachep;
740
741extern const char *ceph_msg_type_name(int type);
742extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
743
744#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
745 "%02x%02x%02x%02x%02x%02x"
746#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
747 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
748 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
749 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
750
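The two macros pair up: FSID_FORMAT supplies sixteen %02x conversions and PR_FSID expands to the sixteen bytes of the fsid. An illustrative call (assuming struct ceph_fsid wraps a 16-byte fsid[] array, as PR_FSID implies):

	pr_info("ceph: using fsid " FSID_FORMAT "\n", PR_FSID(&client->fsid));
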
751/* inode.c */
752extern const struct inode_operations ceph_file_iops;
753
754extern struct inode *ceph_alloc_inode(struct super_block *sb);
755extern void ceph_destroy_inode(struct inode *inode);
756
757extern struct inode *ceph_get_inode(struct super_block *sb,
758 struct ceph_vino vino);
759extern struct inode *ceph_get_snapdir(struct inode *parent);
760extern int ceph_fill_file_size(struct inode *inode, int issued,
761 u32 truncate_seq, u64 truncate_size, u64 size);
762extern void ceph_fill_file_time(struct inode *inode, int issued,
763 u64 time_warp_seq, struct timespec *ctime,
764 struct timespec *mtime, struct timespec *atime);
765extern int ceph_fill_trace(struct super_block *sb,
766 struct ceph_mds_request *req,
767 struct ceph_mds_session *session);
768extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
769 struct ceph_mds_session *session);
770
771extern int ceph_inode_holds_cap(struct inode *inode, int mask);
772
773extern int ceph_inode_set_size(struct inode *inode, loff_t size);
774extern void __ceph_do_pending_vmtruncate(struct inode *inode);
775extern void ceph_queue_vmtruncate(struct inode *inode);
776
777extern void ceph_queue_invalidate(struct inode *inode);
778extern void ceph_queue_writeback(struct inode *inode);
779
780extern int ceph_do_getattr(struct inode *inode, int mask);
781extern int ceph_permission(struct inode *inode, int mask);
782extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
783extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
784 struct kstat *stat);
785
786/* xattr.c */
787extern int ceph_setxattr(struct dentry *, const char *, const void *,
788 size_t, int);
789extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
790extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
791extern int ceph_removexattr(struct dentry *, const char *);
792extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
793extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
794
795/* caps.c */
796extern const char *ceph_cap_string(int c);
797extern void ceph_handle_caps(struct ceph_mds_session *session,
798 struct ceph_msg *msg);
799extern int ceph_add_cap(struct inode *inode,
800 struct ceph_mds_session *session, u64 cap_id,
801 int fmode, unsigned issued, unsigned wanted,
802 unsigned cap, unsigned seq, u64 realmino, int flags,
803 struct ceph_cap_reservation *caps_reservation);
804extern void __ceph_remove_cap(struct ceph_cap *cap);
805static inline void ceph_remove_cap(struct ceph_cap *cap)
806{
807 struct inode *inode = &cap->ci->vfs_inode;
808 spin_lock(&inode->i_lock);
809 __ceph_remove_cap(cap);
810 spin_unlock(&inode->i_lock);
811}
812extern void ceph_put_cap(struct ceph_cap *cap);
813
814extern void ceph_queue_caps_release(struct inode *inode);
815extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
816extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
817extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
818 struct ceph_mds_session *session);
819extern int ceph_get_cap_mds(struct inode *inode);
820extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
821extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
822extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
823 struct ceph_snap_context *snapc);
824extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
825 struct ceph_mds_session **psession);
826extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
827 struct ceph_mds_session *session);
828extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
829extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
830
831extern int ceph_encode_inode_release(void **p, struct inode *inode,
832 int mds, int drop, int unless, int force);
833extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
834 int mds, int drop, int unless);
835
836extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
837 int *got, loff_t endoff);
838
839/* for counting open files by mode */
840static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
841{
842 ci->i_nr_by_mode[mode]++;
843}
844extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
845
846/* addr.c */
847extern const struct address_space_operations ceph_aops;
848extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
849
850/* file.c */
851extern const struct file_operations ceph_file_fops;
852extern const struct address_space_operations ceph_aops;
853extern int ceph_open(struct inode *inode, struct file *file);
854extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
855 struct nameidata *nd, int mode,
856 int locked_dir);
857extern int ceph_release(struct inode *inode, struct file *filp);
858extern void ceph_release_page_vector(struct page **pages, int num_pages);
859
860/* dir.c */
861extern const struct file_operations ceph_dir_fops;
862extern const struct inode_operations ceph_dir_iops;
863extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
864 ceph_snapdir_dentry_ops;
865
866extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
867extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
868 struct dentry *dentry, int err);
869
870extern void ceph_dentry_lru_add(struct dentry *dn);
871extern void ceph_dentry_lru_touch(struct dentry *dn);
872extern void ceph_dentry_lru_del(struct dentry *dn);
873
874/*
875 * our d_ops vary depending on whether the inode is live,
876 * snapshotted (read-only), or a virtual ".snap" directory.
877 */
878int ceph_init_dentry(struct dentry *dentry);
879
880
881/* ioctl.c */
882extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
883
884/* export.c */
885extern const struct export_operations ceph_export_ops;
886
887/* debugfs.c */
888extern int ceph_debugfs_init(void);
889extern void ceph_debugfs_cleanup(void);
890extern int ceph_debugfs_client_init(struct ceph_client *client);
891extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
892
893static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
894{
895 if (dentry && dentry->d_parent)
896 return dentry->d_parent->d_inode;
897
898 return NULL;
899}
900
901#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..2845422907fc
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,845 @@
1#include "ceph_debug.h"
2#include "super.h"
3#include "decode.h"
4
5#include <linux/xattr.h>
6#include <linux/slab.h>
7
8static bool ceph_is_valid_xattr(const char *name)
9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
14}
15
16/*
17 * These define virtual xattrs exposing the recursive directory
18 * statistics and layout metadata.
19 */
20struct ceph_vxattr_cb {
21 bool readonly;
22 char *name;
23 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
24 size_t size);
25};
26
27/* directories */
28
29static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
30 size_t size)
31{
32 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
33}
34
35static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
36 size_t size)
37{
38 return snprintf(val, size, "%lld", ci->i_files);
39}
40
41static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
42 size_t size)
43{
44 return snprintf(val, size, "%lld", ci->i_subdirs);
45}
46
47static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
48 size_t size)
49{
50 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
51}
52
53static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
54 size_t size)
55{
56 return snprintf(val, size, "%lld", ci->i_rfiles);
57}
58
59static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
60 size_t size)
61{
62 return snprintf(val, size, "%lld", ci->i_rsubdirs);
63}
64
65static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
66 size_t size)
67{
68 return snprintf(val, size, "%lld", ci->i_rbytes);
69}
70
71static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
72 size_t size)
73{
74 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
75 (long)ci->i_rctime.tv_nsec);
76}
77
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL }
88};
89
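From userspace these read as ordinary extended attributes. A minimal sketch (the mount path is hypothetical):

	#include <stdio.h>
	#include <sys/xattr.h>

	int main(void)
	{
		char buf[64];
		ssize_t n;

		/* recursive byte count of everything under the directory */
		n = getxattr("/mnt/ceph/mydir", "user.ceph.dir.rbytes",
			     buf, sizeof(buf) - 1);
		if (n < 0) {
			perror("getxattr");
			return 1;
		}
		buf[n] = '\0';
		printf("rbytes=%s\n", buf);
		return 0;
	}
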
90/* files */
91
92static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
93 size_t size)
94{
95 int ret;
96
97 ret = snprintf(val, size,
98 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
99 (unsigned long long)ceph_file_layout_su(ci->i_layout),
100 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
101 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
102 if (ceph_file_layout_pg_preferred(ci->i_layout))
103 ret += snprintf(val + ret, size - ret, "preferred_osd=%llu\n",
104 (unsigned long long)ceph_file_layout_pg_preferred(
105 ci->i_layout));
106 return ret;
107}
108
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout},
111 { true, NULL, NULL }
112};
113
114static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
115{
116 if (S_ISDIR(inode->i_mode))
117 return ceph_dir_vxattrs;
118 else if (S_ISREG(inode->i_mode))
119 return ceph_file_vxattrs;
120 return NULL;
121}
122
123static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
124 const char *name)
125{
126 do {
127 if (strcmp(vxattr->name, name) == 0)
128 return vxattr;
129 vxattr++;
130 } while (vxattr->name);
131 return NULL;
132}
133
134static int __set_xattr(struct ceph_inode_info *ci,
135 const char *name, int name_len,
136 const char *val, int val_len,
137 int dirty,
138 int should_free_name, int should_free_val,
139 struct ceph_inode_xattr **newxattr)
140{
141 struct rb_node **p;
142 struct rb_node *parent = NULL;
143 struct ceph_inode_xattr *xattr = NULL;
144 int c;
145 int new = 0;
146
147 p = &ci->i_xattrs.index.rb_node;
148 while (*p) {
149 parent = *p;
150 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
151 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
152 if (c < 0)
153 p = &(*p)->rb_left;
154 else if (c > 0)
155 p = &(*p)->rb_right;
156 else {
157 if (name_len == xattr->name_len)
158 break;
159 else if (name_len < xattr->name_len)
160 p = &(*p)->rb_left;
161 else
162 p = &(*p)->rb_right;
163 }
164 xattr = NULL;
165 }
166
167 if (!xattr) {
168 new = 1;
169 xattr = *newxattr;
170 xattr->name = name;
171 xattr->name_len = name_len;
172 xattr->should_free_name = should_free_name;
173
174 ci->i_xattrs.count++;
175 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
176 } else {
177 kfree(*newxattr);
178 *newxattr = NULL;
179 if (xattr->should_free_val)
180 kfree((void *)xattr->val);
181
182 if (should_free_name) {
183 kfree((void *)name);
184 name = xattr->name;
185 }
186 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len;
188 }
189 if (!xattr) {
190 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s\n",
191 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode),
192 name);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len;
197 if (val)
198 xattr->val = val;
199 else
200 xattr->val = "";
201
202 xattr->val_len = val_len;
203 xattr->dirty = dirty;
204 xattr->should_free_val = (val && should_free_val);
205
206 if (new) {
207 rb_link_node(&xattr->node, parent, p);
208 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
209 dout("__set_xattr_val p=%p\n", p);
210 }
211
212 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
213 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
214
215 return 0;
216}
217
218static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
219 const char *name)
220{
221 struct rb_node **p;
222 struct rb_node *parent = NULL;
223 struct ceph_inode_xattr *xattr = NULL;
224 int c;
225
226 p = &ci->i_xattrs.index.rb_node;
227 while (*p) {
228 parent = *p;
229 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
230 c = strncmp(name, xattr->name, xattr->name_len);
231 if (c < 0)
232 p = &(*p)->rb_left;
233 else if (c > 0)
234 p = &(*p)->rb_right;
235 else {
236 dout("__get_xattr %s: found %.*s\n", name,
237 xattr->val_len, xattr->val);
238 return xattr;
239 }
240 }
241
242 dout("__get_xattr %s: not found\n", name);
243
244 return NULL;
245}
246
247static void __free_xattr(struct ceph_inode_xattr *xattr)
248{
249 BUG_ON(!xattr);
250
251 if (xattr->should_free_name)
252 kfree((void *)xattr->name);
253 if (xattr->should_free_val)
254 kfree((void *)xattr->val);
255
256 kfree(xattr);
257}
258
259static int __remove_xattr(struct ceph_inode_info *ci,
260 struct ceph_inode_xattr *xattr)
261{
262 if (!xattr)
263 return -EOPNOTSUPP;
264
265 rb_erase(&xattr->node, &ci->i_xattrs.index);
266
267 if (xattr->should_free_name)
268 kfree((void *)xattr->name);
269 if (xattr->should_free_val)
270 kfree((void *)xattr->val);
271
272 ci->i_xattrs.names_size -= xattr->name_len;
273 ci->i_xattrs.vals_size -= xattr->val_len;
274 ci->i_xattrs.count--;
275 kfree(xattr);
276
277 return 0;
278}
279
280static int __remove_xattr_by_name(struct ceph_inode_info *ci,
281 const char *name)
282{
283 struct rb_node **p;
284 struct ceph_inode_xattr *xattr;
285 int err;
286
287 p = &ci->i_xattrs.index.rb_node;
288 xattr = __get_xattr(ci, name);
289 err = __remove_xattr(ci, xattr);
290 return err;
291}
292
293static char *__copy_xattr_names(struct ceph_inode_info *ci,
294 char *dest)
295{
296 struct rb_node *p;
297 struct ceph_inode_xattr *xattr = NULL;
298
299 p = rb_first(&ci->i_xattrs.index);
300 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
301
302 while (p) {
303 xattr = rb_entry(p, struct ceph_inode_xattr, node);
304 memcpy(dest, xattr->name, xattr->name_len);
305 dest[xattr->name_len] = '\0';
306
307 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
308 xattr->name_len, ci->i_xattrs.names_size);
309
310 dest += xattr->name_len + 1;
311 p = rb_next(p);
312 }
313
314 return dest;
315}
316
317void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
318{
319 struct rb_node *p, *tmp;
320 struct ceph_inode_xattr *xattr = NULL;
321
322 p = rb_first(&ci->i_xattrs.index);
323
324 dout("__ceph_destroy_xattrs p=%p\n", p);
325
326 while (p) {
327 xattr = rb_entry(p, struct ceph_inode_xattr, node);
328 tmp = p;
329 p = rb_next(tmp);
330 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
331 xattr->name_len, xattr->name);
332 rb_erase(tmp, &ci->i_xattrs.index);
333
334 __free_xattr(xattr);
335 }
336
337 ci->i_xattrs.names_size = 0;
338 ci->i_xattrs.vals_size = 0;
339 ci->i_xattrs.index_version = 0;
340 ci->i_xattrs.count = 0;
341 ci->i_xattrs.index = RB_ROOT;
342}
343
344static int __build_xattrs(struct inode *inode)
345{
346 u32 namelen;
347 u32 numattr = 0;
348 void *p, *end;
349 u32 len;
350 const char *name, *val;
351 struct ceph_inode_info *ci = ceph_inode(inode);
352 int xattr_version;
353 struct ceph_inode_xattr **xattrs = NULL;
354 int err = 0;
355 int i;
356
357 dout("__build_xattrs() len=%d\n",
358 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
359
360 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
361 return 0; /* already built */
362
363 __ceph_destroy_xattrs(ci);
364
365start:
366 /* update the internal xattr rb tree */
367 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
368 p = ci->i_xattrs.blob->vec.iov_base;
369 end = p + ci->i_xattrs.blob->vec.iov_len;
370 ceph_decode_32_safe(&p, end, numattr, bad);
371 xattr_version = ci->i_xattrs.version;
372 spin_unlock(&inode->i_lock);
373
374 /* kcalloc zeroes the array, so no memset is needed */
375 xattrs = kcalloc(numattr, sizeof(*xattrs), GFP_NOFS);
376 err = -ENOMEM;
377 if (!xattrs)
378 goto bad_lock;
379
380 for (i = 0; i < numattr; i++) {
381 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
382 GFP_NOFS);
383 if (!xattrs[i])
384 goto bad_lock;
385 }
386
387 spin_lock(&inode->i_lock);
388 if (ci->i_xattrs.version != xattr_version) {
389 /* lost a race, retry */
390 for (i = 0; i < numattr; i++)
391 kfree(xattrs[i]);
392 kfree(xattrs);
393 goto start;
394 }
395 err = -EIO;
396 while (numattr--) {
397 ceph_decode_32_safe(&p, end, len, bad);
398 namelen = len;
399 name = p;
400 p += len;
401 ceph_decode_32_safe(&p, end, len, bad);
402 val = p;
403 p += len;
404
405 err = __set_xattr(ci, name, namelen, val, len,
406 0, 0, 0, &xattrs[numattr]);
407
408 if (err < 0)
409 goto bad;
410 }
411 kfree(xattrs);
412 }
413 ci->i_xattrs.index_version = ci->i_xattrs.version;
414 ci->i_xattrs.dirty = false;
415
416 return err;
417bad_lock:
418 spin_lock(&inode->i_lock);
419bad:
420 if (xattrs) {
421 for (i = 0; i < numattr; i++)
422 kfree(xattrs[i]);
423 kfree(xattrs);
424 }
425 ci->i_xattrs.names_size = 0;
426 return err;
427}
428
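The unlock/allocate/relock/recheck shape of __build_xattrs above (and of ceph_setxattr below) is a common kernel pattern: drop the spinlock around a blocking allocation, then revalidate a version counter before installing the result. Reduced to its essentials over a made-up state structure (illustrative only):

	struct demo_state {
		spinlock_t lock;
		u64 version;
		void *buf;
	};

	/* fill s->buf with a len-byte buffer; caller holds s->lock */
	static int demo_fill(struct demo_state *s, size_t len)
	{
		void *buf;
		u64 ver;

	retry:
		ver = s->version;
		spin_unlock(&s->lock);
		buf = kmalloc(len, GFP_NOFS);	/* may sleep */
		spin_lock(&s->lock);
		if (!buf)
			return -ENOMEM;
		if (s->version != ver) {	/* raced with an update */
			kfree(buf);
			goto retry;
		}
		kfree(s->buf);
		s->buf = buf;
		return 0;
	}
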
429static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
430 int val_size)
431{
432 /*
433 * 4 bytes for the xattr count, plus a 4-byte length prefix for
434 * each name and each value
435 */
436 int size = 4 + ci->i_xattrs.count*(4 + 4) +
437 ci->i_xattrs.names_size +
438 ci->i_xattrs.vals_size;
439 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
440 ci->i_xattrs.count, ci->i_xattrs.names_size,
441 ci->i_xattrs.vals_size);
442
443 if (name_size)
444 size += 4 + 4 + name_size + val_size;
445
446 return size;
447}
448
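For example, an inode whose only xattr is "user.x" = "ok" has count == 1, names_size == 6 and vals_size == 2, so the blob needs 4 + (4 + 4) + 6 + 2 = 20 bytes: the count word, a length word each for the name and the value, then the bytes themselves, which is exactly the layout __ceph_build_xattrs_blob() emits below.
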
449/*
450 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
451 * and swap into place.
452 */
453void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
454{
455 struct rb_node *p;
456 struct ceph_inode_xattr *xattr = NULL;
457 void *dest;
458
459 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
460 if (ci->i_xattrs.dirty) {
461 int need = __get_required_blob_size(ci, 0, 0);
462
463 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
464
465 p = rb_first(&ci->i_xattrs.index);
466 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
467
468 ceph_encode_32(&dest, ci->i_xattrs.count);
469 while (p) {
470 xattr = rb_entry(p, struct ceph_inode_xattr, node);
471
472 ceph_encode_32(&dest, xattr->name_len);
473 memcpy(dest, xattr->name, xattr->name_len);
474 dest += xattr->name_len;
475 ceph_encode_32(&dest, xattr->val_len);
476 memcpy(dest, xattr->val, xattr->val_len);
477 dest += xattr->val_len;
478
479 p = rb_next(p);
480 }
481
482 /* adjust buffer len; it may be larger than we need */
483 ci->i_xattrs.prealloc_blob->vec.iov_len =
484 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
485
486 if (ci->i_xattrs.blob)
487 ceph_buffer_put(ci->i_xattrs.blob);
488 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
489 ci->i_xattrs.prealloc_blob = NULL;
490 ci->i_xattrs.dirty = false;
491 }
492}
493
494ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
495 size_t size)
496{
497 struct inode *inode = dentry->d_inode;
498 struct ceph_inode_info *ci = ceph_inode(inode);
499 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
500 int err;
501 struct ceph_inode_xattr *xattr;
502 struct ceph_vxattr_cb *vxattr = NULL;
503
504 if (!ceph_is_valid_xattr(name))
505 return -ENODATA;
506
507 /* let's see if a virtual xattr was requested */
508 if (vxattrs)
509 vxattr = ceph_match_vxattr(vxattrs, name);
510
511 spin_lock(&inode->i_lock);
512 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
513 ci->i_xattrs.version, ci->i_xattrs.index_version);
514
515 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
516 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
517 goto get_xattr;
518 } else {
519 spin_unlock(&inode->i_lock);
520 /* get xattrs from mds (if we don't already have them) */
521 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
522 if (err)
523 return err;
524 }
525
526 spin_lock(&inode->i_lock);
527
528 if (vxattr && vxattr->readonly) {
529 err = vxattr->getxattr_cb(ci, value, size);
530 goto out;
531 }
532
533 err = __build_xattrs(inode);
534 if (err < 0)
535 goto out;
536
537get_xattr:
538 err = -ENODATA; /* == ENOATTR */
539 xattr = __get_xattr(ci, name);
540 if (!xattr) {
541 if (vxattr)
542 err = vxattr->getxattr_cb(ci, value, size);
543 goto out;
544 }
545
546 err = -ERANGE;
547 if (size && size < xattr->val_len)
548 goto out;
549
550 err = xattr->val_len;
551 if (size == 0)
552 goto out;
553
554 memcpy(value, xattr->val, xattr->val_len);
555
556out:
557 spin_unlock(&inode->i_lock);
558 return err;
559}
560
561ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
562{
563 struct inode *inode = dentry->d_inode;
564 struct ceph_inode_info *ci = ceph_inode(inode);
565 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
566 u32 vir_namelen = 0;
567 u32 namelen;
568 int err;
569 u32 len;
570 int i;
571
572 spin_lock(&inode->i_lock);
573 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
574 ci->i_xattrs.version, ci->i_xattrs.index_version);
575
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr;
579 } else {
580 spin_unlock(&inode->i_lock);
581 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
582 if (err)
583 return err;
584 }
585
586 spin_lock(&inode->i_lock);
587
588 err = __build_xattrs(inode);
589 if (err < 0)
590 goto out;
591
592list_xattr:
593 vir_namelen = 0;
594 /* include virtual dir xattrs */
595 if (vxattrs)
596 for (i = 0; vxattrs[i].name; i++)
597 vir_namelen += strlen(vxattrs[i].name) + 1;
598 /* add 1 byte per name for the null terminator */
599 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
600 err = -ERANGE;
601 if (size && namelen > size)
602 goto out;
603
604 err = namelen;
605 if (size == 0)
606 goto out;
607
608 names = __copy_xattr_names(ci, names);
609
610 /* virtual xattr names, too */
611 if (vxattrs)
612 for (i = 0; vxattrs[i].name; i++) {
613 len = sprintf(names, "%s", vxattrs[i].name);
614 names += len + 1;
615 }
616
617out:
618 spin_unlock(&inode->i_lock);
619 return err;
620}
621
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags)
624{
625 struct ceph_client *client = ceph_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode;
629 struct ceph_mds_request *req;
630 struct ceph_mds_client *mdsc = &client->mdsc;
631 int err;
632 int i, nr_pages;
633 struct page **pages = NULL;
634 void *kaddr;
635
636 /* copy value into some pages */
637 nr_pages = calc_pages_for(0, size);
638 if (nr_pages) {
639 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
640 if (!pages)
641 return -ENOMEM;
642 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS);
645 if (!pages[i]) {
646 nr_pages = i;
647 goto out;
648 }
649 kaddr = kmap(pages[i]);
650 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
651 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
652 kunmap(pages[i]);
653 }
654 }
654
655 dout("setxattr value=%.*s\n", (int)size, value);
656
657 /* do request */
658 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
659 USE_AUTH_MDS);
660 if (IS_ERR(req)) {
661 err = PTR_ERR(req);
662 goto out;
663 }
664 req->r_inode = igrab(inode);
665 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
666 req->r_num_caps = 1;
667 req->r_args.setxattr.flags = cpu_to_le32(flags);
668 req->r_path2 = kstrdup(name, GFP_NOFS);
669
670 req->r_pages = pages;
671 req->r_num_pages = nr_pages;
672 req->r_data_len = size;
673
674 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
675 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
676 ceph_mdsc_put_request(req);
677 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
678
679out:
680 if (pages) {
681 for (i = 0; i < nr_pages; i++)
682 __free_page(pages[i]);
683 kfree(pages);
684 }
685 return err;
686}
687
688int ceph_setxattr(struct dentry *dentry, const char *name,
689 const void *value, size_t size, int flags)
690{
691 struct inode *inode = dentry->d_inode;
692 struct ceph_inode_info *ci = ceph_inode(inode);
693 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
694 int err;
695 int name_len = strlen(name);
696 int val_len = size;
697 char *newname = NULL;
698 char *newval = NULL;
699 struct ceph_inode_xattr *xattr = NULL;
700 int issued;
701 int required_blob_size;
702
703 if (ceph_snap(inode) != CEPH_NOSNAP)
704 return -EROFS;
705
706 if (!ceph_is_valid_xattr(name))
707 return -EOPNOTSUPP;
708
709 if (vxattrs) {
710 struct ceph_vxattr_cb *vxattr =
711 ceph_match_vxattr(vxattrs, name);
712 if (vxattr && vxattr->readonly)
713 return -EOPNOTSUPP;
714 }
715
716 /* preallocate memory for xattr name, value, index node */
717 err = -ENOMEM;
718 newname = kmalloc(name_len + 1, GFP_NOFS);
719 if (!newname)
720 goto out;
721 memcpy(newname, name, name_len + 1);
722
723 if (val_len) {
724 newval = kmalloc(val_len + 1, GFP_NOFS);
725 if (!newval)
726 goto out;
727 memcpy(newval, value, val_len);
728 newval[val_len] = '\0';
729 }
730
731 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
732 if (!xattr)
733 goto out;
734
735 spin_lock(&inode->i_lock);
736retry:
737 issued = __ceph_caps_issued(ci, NULL);
738 if (!(issued & CEPH_CAP_XATTR_EXCL))
739 goto do_sync;
740 __build_xattrs(inode);
741
742 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
743
744 if (!ci->i_xattrs.prealloc_blob ||
745 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
746 struct ceph_buffer *blob = NULL;
747
748 spin_unlock(&inode->i_lock);
749 dout(" preaallocating new blob size=%d\n", required_blob_size);
750 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
751 if (!blob)
752 goto out;
753 spin_lock(&inode->i_lock);
754 if (ci->i_xattrs.prealloc_blob)
755 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
756 ci->i_xattrs.prealloc_blob = blob;
757 goto retry;
758 }
759
760 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
761 err = __set_xattr(ci, newname, name_len, newval,
762 val_len, 1, 1, 1, &xattr);
763 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
764 ci->i_xattrs.dirty = true;
765 inode->i_ctime = CURRENT_TIME;
766 spin_unlock(&inode->i_lock);
767
768 return err;
769
770do_sync:
771 spin_unlock(&inode->i_lock);
772 err = ceph_sync_setxattr(dentry, name, value, size, flags);
773out:
774 kfree(newname);
775 kfree(newval);
776 kfree(xattr);
777 return err;
778}
779
780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{
782 struct ceph_client *client = ceph_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode;
786 struct ceph_mds_request *req;
787 int err;
788
789 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
790 USE_AUTH_MDS);
791 if (IS_ERR(req))
792 return PTR_ERR(req);
793 req->r_inode = igrab(inode);
794 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
795 req->r_num_caps = 1;
796 req->r_path2 = kstrdup(name, GFP_NOFS);
797
798 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
799 ceph_mdsc_put_request(req);
800 return err;
801}
802
803int ceph_removexattr(struct dentry *dentry, const char *name)
804{
805 struct inode *inode = dentry->d_inode;
806 struct ceph_inode_info *ci = ceph_inode(inode);
807 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
808 int issued;
809 int err;
810
811 if (ceph_snap(inode) != CEPH_NOSNAP)
812 return -EROFS;
813
814 if (!ceph_is_valid_xattr(name))
815 return -EOPNOTSUPP;
816
817 if (vxattrs) {
818 struct ceph_vxattr_cb *vxattr =
819 ceph_match_vxattr(vxattrs, name);
820 if (vxattr && vxattr->readonly)
821 return -EOPNOTSUPP;
822 }
823
824 spin_lock(&inode->i_lock);
825 __build_xattrs(inode);
826 issued = __ceph_caps_issued(ci, NULL);
827 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
828
829 if (!(issued & CEPH_CAP_XATTR_EXCL))
830 goto do_sync;
831
832 err = __remove_xattr_by_name(ceph_inode(inode), name);
833 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
834 ci->i_xattrs.dirty = true;
835 inode->i_ctime = CURRENT_TIME;
836
837 spin_unlock(&inode->i_lock);
838
839 return err;
840do_sync:
841 spin_unlock(&inode->i_lock);
842 err = ceph_send_removexattr(dentry, name);
843 return err;
844}
845
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b1d61d0bdfc7..78e4d2a3a68b 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -15,6 +15,7 @@
 #include <linux/dcache.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/vfs.h>
 #include <linux/fs.h>
 #include "cifsglob.h"
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 8ec7736ce954..310d12f69a92 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -20,6 +20,7 @@
  */
 
 #include <linux/list.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <keys/user-type.h>
 #include <linux/key-type.h>
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 714a542cbafc..d07676bd76d2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -19,6 +19,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "cifs_unicode.h"
 #include "cifs_uniupr.h"
 #include "cifspdu.h"
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7dfe0842a6f6..9b716d044bbd 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsacl.h"
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7efe1745494d..fbe986430d0c 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -20,6 +20,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifs_debug.h"
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c6a03627176..ded66be6597c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -312,6 +312,7 @@ cifs_alloc_inode(struct super_block *sb)
 	cifs_inode->clientCanCacheRead = false;
 	cifs_inode->clientCanCacheAll = false;
 	cifs_inode->delete_pending = false;
+	cifs_inode->invalid_mapping = false;
 	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
 	cifs_inode->server_eof = 0;
 
@@ -638,7 +639,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 	   setting the revalidate time to zero */
 	CIFS_I(file->f_path.dentry->d_inode)->time = 0;
 
-	retval = cifs_revalidate(file->f_path.dentry);
+	retval = cifs_revalidate_file(file);
 	if (retval < 0)
 		return (loff_t)retval;
 	}
@@ -807,6 +808,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 	.release = cifs_close,
 	.fsync = cifs_fsync,
 	.flush = cifs_flush,
+	.mmap = cifs_file_mmap,
 	.splice_read = generic_file_splice_read,
 #ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl	= cifs_ioctl,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 78c1b86d55f6..7aa57ecdc437 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,8 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
 extern int cifs_rmdir(struct inode *, struct dentry *);
 extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
 		       struct dentry *);
-extern int cifs_revalidate(struct dentry *);
+extern int cifs_revalidate_file(struct file *filp);
+extern int cifs_revalidate_dentry(struct dentry *);
 extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int cifs_setattr(struct dentry *, struct iattr *);
 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a1c817eb291a..ecf0ffbe2b64 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
  */
 #include <linux/in.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <linux/slow-work.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
@@ -389,6 +390,7 @@ struct cifsInodeInfo {
 	bool clientCanCacheRead:1;	/* read oplock */
 	bool clientCanCacheAll:1;	/* read and writebehind oplock */
 	bool delete_pending:1;		/* DELETE_ON_CLOSE is set */
+	bool invalid_mapping:1;		/* pagecache is invalid */
 	u64  server_eof;		/* current file size on server */
 	u64  uniqueid;			/* server inode number */
 	struct inode vfs_inode;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 88e2bc44ac58..39e47f46dea5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -104,10 +104,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
 extern struct inode *cifs_iget(struct super_block *sb,
 			       struct cifs_fattr *fattr);
 
+extern int cifs_get_file_info(struct file *filp);
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
 			FILE_ALL_INFO *pfile_info,
 			struct super_block *sb, int xid, const __u16 *pfid);
+extern int cifs_get_file_info_unix(struct file *filp);
 extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, int xid);
@@ -142,6 +144,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
 			const __u16 search_handle);
 
+extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+			u16 netfid, FILE_ALL_INFO *pFindData);
 extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
 			FILE_ALL_INFO *findData,
@@ -152,6 +156,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
 			FILE_ALL_INFO *findData,
 			const struct nls_table *nls_codepage, int remap);
 
+extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+			u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
 extern int CIFSSMBUnixQPathInfo(const int xid,
 			struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 611835899844..5d3f29fef532 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -30,6 +30,7 @@
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/vfs.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <asm/uaccess.h>
 #include "cifspdu.h"
@@ -500,7 +501,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	} else if (pSMBr->hdr.WordCount == 13) {
 		cERROR(1, ("mount failed, cifs module not built "
 			   "with CIFS_WEAK_PW_HASH support"));
-			rc = -EOPNOTSUPP;
+		rc = -EOPNOTSUPP;
 #endif /* WEAK_PW_HASH */
 		goto neg_err_exit;
 	} else if (pSMBr->hdr.WordCount != 17) {
@@ -1430,6 +1431,8 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 	__u32 bytes_sent;
 	__u16 byte_count;
 
+	*nbytes = 0;
+
 	/* cFYI(1, ("write at %lld %d bytes", offset, count));*/
 	if (tcon->ses == NULL)
 		return -ECONNABORTED;
@@ -1512,11 +1515,18 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 	cifs_stats_inc(&tcon->num_writes);
 	if (rc) {
 		cFYI(1, ("Send error in write = %d", rc));
-		*nbytes = 0;
 	} else {
 		*nbytes = le16_to_cpu(pSMBr->CountHigh);
 		*nbytes = (*nbytes) << 16;
 		*nbytes += le16_to_cpu(pSMBr->Count);
+
+		/*
+		 * Mask off high 16 bits when bytes written as returned by the
+		 * server is greater than bytes requested by the client. Some
+		 * OS/2 servers are known to set incorrect CountHigh values.
+		 */
+		if (*nbytes > count)
+			*nbytes &= 0xFFFF;
 	}
 
 	cifs_buf_release(pSMB);
@@ -1605,6 +1615,14 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 		*nbytes = le16_to_cpu(pSMBr->CountHigh);
 		*nbytes = (*nbytes) << 16;
 		*nbytes += le16_to_cpu(pSMBr->Count);
+
+		/*
+		 * Mask off high 16 bits when bytes written as returned by the
+		 * server is greater than bytes requested by the client. OS/2
+		 * servers are known to set incorrect CountHigh values.
+		 */
+		if (*nbytes > count)
+			*nbytes &= 0xFFFF;
 	}
 
 /*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
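The CountHigh workaround added in both write paths above is easy to model in isolation. A minimal, self-contained sketch of the same check (the helper name and userspace types are illustrative, not the kernel code):

    #include <stdint.h>

    static uint32_t smb_bytes_written(uint16_t count, uint16_t count_high,
                                      uint32_t requested)
    {
        uint32_t nbytes = ((uint32_t)count_high << 16) | count;

        /*
         * Some OS/2 servers set a bogus CountHigh; if the combined value
         * exceeds what we asked to write, trust only the low 16 bits.
         */
        if (nbytes > requested)
            nbytes &= 0xFFFF;
        return nbytes;
    }

For example, a 4096-byte write answered with Count = 0x1000 but a bogus CountHigh = 0x0001 would combine to 0x11000; since that exceeds the request, the mask recovers the plausible 0x1000.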
@@ -1793,8 +1811,21 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	}
 	parm_data = (struct cifs_posix_lock *)
 			((char *)&pSMBr->hdr.Protocol + data_offset);
-	if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
+	if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK))
 		pLockData->fl_type = F_UNLCK;
+	else {
+		if (parm_data->lock_type ==
+				__constant_cpu_to_le16(CIFS_RDLCK))
+			pLockData->fl_type = F_RDLCK;
+		else if (parm_data->lock_type ==
+				__constant_cpu_to_le16(CIFS_WRLCK))
+			pLockData->fl_type = F_WRLCK;
+
+		pLockData->fl_start = parm_data->start;
+		pLockData->fl_end = parm_data->start +
+					parm_data->length - 1;
+		pLockData->fl_pid = parm_data->pid;
+	}
 	}
 
 plk_err_exit:
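The decode added above fills in the caller's file_lock from the server's reply; the one subtle point is that POSIX lock ranges are inclusive, so the end offset is start + length - 1. A standalone model, with simplified stand-ins for the wire struct and the lock-type constants (not the kernel definitions):

    #include <stdint.h>

    enum cifs_lock_type { CIFS_RDLCK = 1, CIFS_WRLCK = 2, CIFS_UNLCK = 3 };
    enum posix_lock_type { PL_RDLCK, PL_WRLCK, PL_UNLCK };

    struct posix_lock_reply {
        uint16_t lock_type;
        uint64_t start, length;
        uint32_t pid;
    };

    struct lock_result {
        enum posix_lock_type type;
        uint64_t start, end;
        uint32_t pid;
    };

    static void decode_lock_reply(const struct posix_lock_reply *r,
                                  struct lock_result *out)
    {
        if (r->lock_type == CIFS_UNLCK) {
            out->type = PL_UNLCK;   /* tested range is free */
            return;
        }
        out->type = (r->lock_type == CIFS_RDLCK) ? PL_RDLCK : PL_WRLCK;
        out->start = r->start;
        out->end = r->start + r->length - 1; /* POSIX end is inclusive */
        out->pid = r->pid;
    }

With start = 0 and length = 10, for instance, the reported range is [0, 9].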
@@ -3230,8 +3261,72 @@ QInfRetry:
 	return rc;
 }
 
+int
+CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+		 u16 netfid, FILE_ALL_INFO *pFindData)
+{
+	struct smb_t2_qfi_req *pSMB = NULL;
+	struct smb_t2_qfi_rsp *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	__u16 params, byte_count;
+
+QFileInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2 /* level */ + 2 /* fid */;
+	pSMB->t2.TotalDataCount = 0;
+	pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+	pSMB->t2.MaxSetupCount = 0;
+	pSMB->t2.Reserved = 0;
+	pSMB->t2.Flags = 0;
+	pSMB->t2.Timeout = 0;
+	pSMB->t2.Reserved2 = 0;
+	pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+					       Fid) - 4);
+	pSMB->t2.DataCount = 0;
+	pSMB->t2.DataOffset = 0;
+	pSMB->t2.SetupCount = 1;
+	pSMB->t2.Reserved3 = 0;
+	pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+	pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
+	pSMB->Pad = 0;
+	pSMB->Fid = netfid;
+	pSMB->hdr.smb_buf_length += byte_count;
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cFYI(1, ("Send error in QFileInfo = %d", rc));
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
+		if (rc)		/* BB add auto retry on EOPNOTSUPP? */
+			rc = -EIO;
+		else if (pSMBr->ByteCount < 40)
+			rc = -EIO;	/* bad smb */
+		else if (pFindData) {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset, sizeof(FILE_ALL_INFO));
+		} else
+			rc = -ENOMEM;
+	}
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto QFileInfoRetry;
 
+	return rc;
+}
 
 int
 CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
@@ -3335,6 +3430,75 @@ QPathInfoRetry:
 }
 
 int
+CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+		     u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
+{
+	struct smb_t2_qfi_req *pSMB = NULL;
+	struct smb_t2_qfi_rsp *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	__u16 params, byte_count;
+
+UnixQFileInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2 /* level */ + 2 /* fid */;
+	pSMB->t2.TotalDataCount = 0;
+	pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+	pSMB->t2.MaxSetupCount = 0;
+	pSMB->t2.Reserved = 0;
+	pSMB->t2.Flags = 0;
+	pSMB->t2.Timeout = 0;
+	pSMB->t2.Reserved2 = 0;
+	pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+					       Fid) - 4);
+	pSMB->t2.DataCount = 0;
+	pSMB->t2.DataOffset = 0;
+	pSMB->t2.SetupCount = 1;
+	pSMB->t2.Reserved3 = 0;
+	pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+	pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
+	pSMB->Pad = 0;
+	pSMB->Fid = netfid;
+	pSMB->hdr.smb_buf_length += byte_count;
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cFYI(1, ("Send error in UnixQFileInfo = %d", rc));
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+			cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
+				   "Unix Extensions can be disabled on mount "
+				   "by specifying the nosfu mount option."));
+			rc = -EIO;	/* bad smb */
+		} else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset,
+			       sizeof(FILE_UNIX_BASIC_INFO));
+		}
+	}
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto UnixQFileInfoRetry;
+
+	return rc;
+}
+
+int
 CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
 		     const unsigned char *searchName,
 		     FILE_UNIX_BASIC_INFO *pFindData,
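Both new fid-based query calls follow this file's established retry idiom: on -EAGAIN (typically after a session reconnect) the request buffer has already been released, so control loops back to the label and re-marshals the SMB from scratch. A compilable toy version of the pattern (the helper is a stub, not a cifs API):

    #include <errno.h>

    static int attempts;

    /* stub transport call: fails once with -EAGAIN, then succeeds */
    static int build_and_send_request(void)
    {
        return (attempts++ == 0) ? -EAGAIN : 0;
    }

    static int query_with_retry(void)
    {
        int rc;
    retry:
        rc = build_and_send_request(); /* request is rebuilt each pass */
        if (rc == -EAGAIN)
            goto retry;
        return rc;
    }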
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 45eb6cba793f..d9566bf8f917 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6ccf7262d1b7..e9f7ecc2714b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -739,7 +739,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 	int isValid = 1;
 
 	if (direntry->d_inode) {
-		if (cifs_revalidate(direntry))
+		if (cifs_revalidate_dentry(direntry))
 			return 0;
 	} else {
 		cFYI(1, ("neg dentry 0x%p name = %s",
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 87948147d7ec..6f8a0e3fb25b 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -23,6 +23,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/slab.h>
 #include <keys/user-type.h>
 #include "dns_resolve.h"
 #include "cifsglob.h"
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3d8f8a96f5a3..9b11a8f56f3a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -31,6 +31,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/delay.h>
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
@@ -219,8 +220,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 			cFYI(1, ("inode unchanged on server"));
 	} else {
 		if (file->f_path.dentry->d_inode->i_mapping) {
 			/* BB no need to lock inode until after invalidate
 			   since namei code should already have it locked? */
 			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
 			if (rc != 0)
 				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
@@ -838,8 +839,32 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 
 	} else {
 		/* if rc == ERR_SHARING_VIOLATION ? */
-		rc = 0;	/* do not change lock type to unlock
-			   since range in use */
+		rc = 0;
+
+		if (lockType & LOCKING_ANDX_SHARED_LOCK) {
+			pfLock->fl_type = F_WRLCK;
+		} else {
+			rc = CIFSSMBLock(xid, tcon, netfid, length,
+				pfLock->fl_start, 0, 1,
+				lockType | LOCKING_ANDX_SHARED_LOCK,
+				0 /* wait flag */);
+			if (rc == 0) {
+				rc = CIFSSMBLock(xid, tcon, netfid,
+						 length, pfLock->fl_start, 1, 0,
+						 lockType |
+						 LOCKING_ANDX_SHARED_LOCK,
+						 0 /* wait flag */);
+				pfLock->fl_type = F_RDLCK;
+				if (rc != 0)
+					cERROR(1, ("Error unlocking "
+						   "previously locked range %d "
+						   "during test of lock", rc));
+				rc = 0;
+			} else {
+				pfLock->fl_type = F_WRLCK;
+				rc = 0;
+			}
+		}
 	}
 
 	FreeXid(xid);
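The cifs_lock() change above classifies a conflicting lock when a lock test fails: if a shared (read) lock can still be taken on the range, the conflict must come from readers, so F_RDLCK is reported after undoing the probe; if even the shared lock is refused, a writer holds the range and F_WRLCK is reported. A standalone model of that probe, with hypothetical stubs in place of the two CIFSSMBLock() calls:

    #include <stdbool.h>

    enum conflict_type { CONFLICT_READ, CONFLICT_WRITE };

    /* stubs standing in for the shared-lock probe and its undo */
    static bool take_shared_lock(void) { return true; /* probe succeeded */ }
    static void drop_shared_lock(void) { }

    static enum conflict_type classify_conflict(void)
    {
        if (take_shared_lock()) {
            /* readers tolerate another shared lock: conflict was a read */
            drop_shared_lock();
            return CONFLICT_READ;
        }
        /* even a shared lock is refused: a writer holds the range */
        return CONFLICT_WRITE;
    }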
@@ -1890,11 +1915,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 
 int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct dentry *dentry = file->f_path.dentry;
 	int rc, xid;
 
 	xid = GetXid();
-	rc = cifs_revalidate(dentry);
+	rc = cifs_revalidate_file(file);
 	if (rc) {
 		cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
 		FreeXid(xid);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8bdbc818164c..35ec11716213 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -20,6 +20,7 @@
  */
 #include <linux/fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
25#include "cifsfs.h" 26#include "cifsfs.h"
@@ -77,6 +78,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 	}
 }
 
+/* check inode attributes against fattr. If they don't match, tag the
+ * inode for cache invalidation
+ */
+static void
+cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
+{
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid));
+
+	if (inode->i_state & I_NEW) {
+		cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid));
+		return;
+	}
+
+	/* don't bother with revalidation if we have an oplock */
+	if (cifs_i->clientCanCacheRead) {
+		cFYI(1, ("%s: inode %llu is oplocked", __func__,
+			 cifs_i->uniqueid));
+		return;
+	}
+
+	/* revalidate if mtime or size have changed */
+	if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+	    cifs_i->server_eof == fattr->cf_eof) {
+		cFYI(1, ("%s: inode %llu is unchanged", __func__,
+			 cifs_i->uniqueid));
+		return;
+	}
+
+	cFYI(1, ("%s: invalidating inode %llu mapping", __func__,
+		 cifs_i->uniqueid));
+	cifs_i->invalid_mapping = true;
+}
+
 /* populate an inode with info from a cifs_fattr struct */
 void
 cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
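cifs_revalidate_cache() above boils down to a single comparison: cached pages stay valid only while both the mtime and the server's end-of-file are unchanged. A minimal model of the test (userspace types standing in for the kernel's):

    #include <stdbool.h>
    #include <stdint.h>

    struct snapshot {
        int64_t mtime_ns;   /* last modification time */
        uint64_t eof;       /* file size as the server reports it */
    };

    static bool pagecache_still_valid(const struct snapshot *cached,
                                      const struct snapshot *server)
    {
        /* unchanged mtime and size: keep cached pages, else invalidate */
        return cached->mtime_ns == server->mtime_ns &&
               cached->eof == server->eof;
    }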
@@ -85,6 +121,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	unsigned long oldtime = cifs_i->time;
 
+	cifs_revalidate_cache(inode, fattr);
+
 	inode->i_atime = fattr->cf_atime;
 	inode->i_mtime = fattr->cf_mtime;
 	inode->i_ctime = fattr->cf_ctime;
@@ -231,6 +269,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 	fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
 }
 
+int cifs_get_file_info_unix(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_UNIX_BASIC_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (!rc) {
+		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	}
+
+	cifs_fattr_to_inode(inode, &fattr);
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info_unix(struct inode **pinode,
 			     const unsigned char *full_path,
 			     struct super_block *sb, int xid)
@@ -432,6 +495,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_gid = cifs_sb->mnt_gid;
 }
 
+int cifs_get_file_info(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_ALL_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (rc == -EOPNOTSUPP || rc == -EINVAL) {
+		/*
+		 * FIXME: legacy server -- fall back to path-based call?
+		 * for now, just skip revalidating and mark inode for
+		 * immediate reval.
+		 */
+		rc = 0;
+		CIFS_I(inode)->time = 0;
+		goto cgfi_exit;
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	} else if (rc)
+		goto cgfi_exit;
+
+	/*
+	 * don't bother with SFU junk here -- just mark inode as needing
+	 * revalidation.
+	 */
+	cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
+	fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
+	fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
+	cifs_fattr_to_inode(inode, &fattr);
+cgfi_exit:
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info(struct inode **pinode,
 	const unsigned char *full_path, FILE_ALL_INFO *pfindData,
 	struct super_block *sb, int xid, const __u16 *pfid)
@@ -1389,135 +1493,103 @@ cifs_rename_exit:
 	return rc;
 }
 
-int cifs_revalidate(struct dentry *direntry)
+static bool
+cifs_inode_needs_reval(struct inode *inode)
 {
-	int xid;
-	int rc = 0, wbrc = 0;
-	char *full_path;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsInodeInfo *cifsInode;
-	loff_t local_size;
-	struct timespec local_mtime;
-	bool invalidate_inode = false;
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 
-	if (direntry->d_inode == NULL)
-		return -ENOENT;
+	if (cifs_i->clientCanCacheRead)
+		return false;
 
-	cifsInode = CIFS_I(direntry->d_inode);
+	if (!lookupCacheEnabled)
+		return true;
 
-	if (cifsInode == NULL)
-		return -ENOENT;
+	if (cifs_i->time == 0)
+		return true;
 
-	/* no sense revalidating inode info on file that no one can write */
-	if (CIFS_I(direntry->d_inode)->clientCanCacheRead)
-		return rc;
+	/* FIXME: the actimeo should be tunable */
+	if (time_after_eq(jiffies, cifs_i->time + HZ))
+		return true;
+
+	return false;
+}
+
+/* check invalid_mapping flag and zap the cache if it's set */
+static void
+cifs_invalidate_mapping(struct inode *inode)
+{
+	int rc;
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cifs_i->invalid_mapping = false;
+
+	/* write back any cached data */
+	if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
+		rc = filemap_write_and_wait(inode->i_mapping);
+		if (rc)
+			cifs_i->write_behind_rc = rc;
+	}
+	invalidate_remote_inode(inode);
+}
+
+int cifs_revalidate_file(struct file *filp)
+{
+	int rc = 0;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+	if (!cifs_inode_needs_reval(inode))
+		goto check_inval;
+
+	if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
+		rc = cifs_get_file_info_unix(filp);
+	else
+		rc = cifs_get_file_info(filp);
+
+check_inval:
+	if (CIFS_I(inode)->invalid_mapping)
+		cifs_invalidate_mapping(inode);
+
+	return rc;
+}
+
+/* revalidate a dentry's inode attributes */
+int cifs_revalidate_dentry(struct dentry *dentry)
+{
+	int xid;
+	int rc = 0;
+	char *full_path = NULL;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dentry->d_sb;
+
+	if (inode == NULL)
+		return -ENOENT;
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(direntry->d_sb);
+	if (!cifs_inode_needs_reval(inode))
+		goto check_inval;
 
 	/* can not safely grab the rename sem here if rename calls revalidate
 	   since that would deadlock */
-	full_path = build_path_from_dentry(direntry);
+	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
-	}
-	cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
-		 "jiffies %ld", full_path, direntry->d_inode,
-		 direntry->d_inode->i_count.counter, direntry,
-		 direntry->d_time, jiffies));
-
-	if (cifsInode->time == 0) {
-		/* was set to zero previously to force revalidate */
-	} else if (time_before(jiffies, cifsInode->time + HZ) &&
-		   lookupCacheEnabled) {
-		if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
-		    (direntry->d_inode->i_nlink == 1)) {
-			kfree(full_path);
-			FreeXid(xid);
-			return rc;
-		} else {
-			cFYI(1, ("Have to revalidate file due to hardlinks"));
-		}
-	}
-
-	/* save mtime and size */
-	local_mtime = direntry->d_inode->i_mtime;
-	local_size = direntry->d_inode->i_size;
-
-	if (cifs_sb->tcon->unix_ext) {
-		rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
-					      direntry->d_sb, xid);
-		if (rc) {
-			cFYI(1, ("error on getting revalidate info %d", rc));
-/*			if (rc != -ENOENT)
-				rc = 0; */	/* BB should we cache info on
-						   certain errors? */
-		}
-	} else {
-		rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
-					 direntry->d_sb, xid, NULL);
-		if (rc) {
-			cFYI(1, ("error on getting revalidate info %d", rc));
-/*			if (rc != -ENOENT)
-				rc = 0; */	/* BB should we cache info on
-						   certain errors? */
-		}
+		goto check_inval;
 	}
-	/* should we remap certain errors, access denied?, to zero */
 
-	/* if not oplocked, we invalidate inode pages if mtime or file size
-	   had changed on server */
+	cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
+		 "jiffies %ld", full_path, inode, inode->i_count.counter,
+		 dentry, dentry->d_time, jiffies));
 
-	if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) &&
-	    (local_size == direntry->d_inode->i_size)) {
-		cFYI(1, ("cifs_revalidate - inode unchanged"));
-	} else {
-		/* file may have changed on server */
-		if (cifsInode->clientCanCacheRead) {
-			/* no need to invalidate inode pages since we were the
-			   only ones who could have modified the file and the
-			   server copy is staler than ours */
-		} else {
-			invalidate_inode = true;
-		}
-	}
+	if (CIFS_SB(sb)->tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
+	else
+		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
+					 xid, NULL);
 
-	/* can not grab this sem since kernel filesys locking documentation
-	   indicates i_mutex may be taken by the kernel on lookup and rename
-	   which could deadlock if we grab the i_mutex here as well */
-/*	mutex_lock(&direntry->d_inode->i_mutex);*/
-	/* need to write out dirty pages here */
-	if (direntry->d_inode->i_mapping) {
-		/* do we need to lock inode until after invalidate completes
-		   below? */
-		wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
-		if (wbrc)
-			CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
-	}
-	if (invalidate_inode) {
-	/* shrink_dcache not necessary now that cifs dentry ops
-	   are exported for negative dentries */
-/*		if (S_ISDIR(direntry->d_inode->i_mode))
-			shrink_dcache_parent(direntry); */
-		if (S_ISREG(direntry->d_inode->i_mode)) {
-			if (direntry->d_inode->i_mapping) {
-				wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
-				if (wbrc)
-					CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
-			}
-			/* may eventually have to do this for open files too */
-			if (list_empty(&(cifsInode->openFileList))) {
-				/* changed on server - flush read ahead pages */
-				cFYI(1, ("Invalidating read ahead data on "
-					 "closed file"));
-				invalidate_remote_inode(direntry->d_inode);
-			}
-		}
-	}
-/*	mutex_unlock(&direntry->d_inode->i_mutex); */
+check_inval:
+	if (CIFS_I(inode)->invalid_mapping)
+		cifs_invalidate_mapping(inode);
 
 	kfree(full_path);
 	FreeXid(xid);
@@ -1527,7 +1599,7 @@ int cifs_revalidate(struct dentry *direntry)
 int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		 struct kstat *stat)
 {
-	int err = cifs_revalidate(dentry);
+	int err = cifs_revalidate_dentry(dentry);
 	if (!err) {
 		generic_fillattr(dentry->d_inode, stat);
 		stat->blksize = CIFS_MAX_MSGSIZE;
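Taken together, the inode.c changes split the old monolithic cifs_revalidate() into three parts: a cheap needs-revalidation check, attribute fetching (path-based for dentries, fid-based for open files), and a separate invalidation step driven by the invalid_mapping flag. A simplified, self-contained model of the resulting control flow (all names are stand-ins, not the cifs API):

    #include <stdbool.h>

    struct inode_model {
        bool oplocked;          /* server granted a read oplock */
        bool invalid_mapping;   /* set when fresh attrs show a change */
        unsigned long age, max_age;
    };

    static int fetch_attrs(struct inode_model *m) { (void)m; return 0; } /* stub */
    static void invalidate_mapping(struct inode_model *m)
    {
        m->invalid_mapping = false; /* write back, then drop cached pages */
    }

    static bool needs_reval(const struct inode_model *m)
    {
        if (m->oplocked)
            return false;             /* server promised cache exclusivity */
        return m->age >= m->max_age;  /* attribute cache timed out */
    }

    static int revalidate(struct inode_model *m)
    {
        int rc = 0;

        if (needs_reval(m))
            rc = fetch_attrs(m);      /* may set invalid_mapping */
        if (m->invalid_mapping)
            invalidate_mapping(m);
        return rc;
    }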
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fc1e0487eaee..c1a9d4236a8c 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -20,6 +20,7 @@
  */
 #include <linux/fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c343b14ba2d3..18e0bc1fb593 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -22,6 +22,7 @@
  */
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index aaa9c1c5a5bd..7c3fd7463f44 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,6 +29,7 @@
29#include "ntlmssp.h" 29#include "ntlmssp.h"
30#include "nterr.h" 30#include "nterr.h"
31#include <linux/utsname.h> 31#include <linux/utsname.h>
32#include <linux/slab.h>
32#include "cifs_spnego.h" 33#include "cifs_spnego.h"
33 34
34extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 35extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 93fb09a99c69..192ea51af20f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -24,6 +24,7 @@
 */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 07b8e71544ee..ad081fe7eb18 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/list.h>
+#include <linux/gfp.h>
 #include <linux/wait.h>
 #include <linux/net.h>
 #include <linux/delay.h>
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 3e2ef0de1209..f555ce077d4f 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -21,6 +21,7 @@
 
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4bb9d0a5decc..ccd98b0f2b0b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/time.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ffd42815fda1..4c813f2cdc52 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 #include <linux/coda.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 830f51abb971..a1695dcadd99 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -18,6 +18,7 @@
 #include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/vfs.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c274d949179d..f09c5ed76f6c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -26,6 +26,7 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
diff --git a/fs/compat.c b/fs/compat.c
index 030602d453b7..4b6ed03cc478 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -49,6 +49,7 @@
 #include <linux/mm.h>
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6d55b61bfa79..c32a1b6a856b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,6 @@
 #include <linux/ioctl.h>
 #include <linux/if.h>
 #include <linux/if_bridge.h>
-#include <linux/slab.h>
 #include <linux/raid/md_u.h>
 #include <linux/kd.h>
 #include <linux/route.h>
@@ -60,6 +59,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
 #include <linux/atalk.h>
+#include <linux/gfp.h>
 
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci.h>
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a2f746066c5d..c8af2d91174b 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -34,6 +34,7 @@
 #include <linux/capability.h>
 #include <linux/sched.h>
 #include <linux/lockdep.h>
+#include <linux/slab.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8421cea7d8c7..8c8d64230c2d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -29,6 +29,7 @@
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 32a5f46b1157..0f3eb41d9201 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 049d6c36da09..30a87b3dbcac 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,6 +27,7 @@
 #include <linux/fsnotify.h>
 #include <linux/string.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8882ecc0f1bf..0120247b41c0 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/tty.h>
 #include <linux/mutex.h>
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0df243850818..b54bca03d92f 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
+#include <linux/slab.h>
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <net/ipv6.h>
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 29d6139c35fc..c6cf25158746 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
+#include <linux/slab.h>
 
 #include "dlm_internal.h"
 #include "lock.h"
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 46ffd3eeaaf7..17903b491298 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -56,6 +56,7 @@
    L: receive_xxxx_reply()   <-  R: send_xxxx_reply()
 */
 #include <linux/types.h>
+#include <linux/slab.h>
 #include "dlm_internal.h"
 #include <linux/dlm_device.h>
 #include "memory.h"
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 52cab160893c..c0d35c620526 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -51,6 +51,7 @@
 #include <linux/file.h>
 #include <linux/mutex.h>
 #include <linux/sctp.h>
+#include <linux/slab.h>
 #include <net/sctp/user.h>
 #include <net/ipv6.h>
 
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 052095cd592f..2c6ad518100d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -9,6 +9,7 @@
 #include <net/genetlink.h>
 #include <linux/dlm.h>
 #include <linux/dlm_netlink.h>
+#include <linux/gfp.h>
 
 #include "dlm_internal.h"
 
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index b5f89aef3b29..d45c02db6943 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -11,6 +11,7 @@
 #include <linux/poll.h>
 #include <linux/dlm.h>
 #include <linux/dlm_plock.h>
+#include <linux/slab.h>
 
 #include "dlm_internal.h"
 #include "lockspace.h"
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index a4bfd31ac45b..8b6e73c47435 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
+#include <linux/slab.h>
 
 #include "dlm_internal.h"
 #include "lockspace.h"
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 7cb0a59f4b9d..1cc087635a5e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
 #include <linux/crypto.h>
 #include <linux/file.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
@@ -381,8 +382,8 @@ out:
 static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num,
 					     struct ecryptfs_crypt_stat *crypt_stat)
 {
-	(*offset) = (crypt_stat->num_header_bytes_at_front
-		     + (crypt_stat->extent_size * extent_num));
+	(*offset) = ecryptfs_lower_header_size(crypt_stat)
+		    + (crypt_stat->extent_size * extent_num);
 }
 
 /**
@@ -834,13 +835,13 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
 	set_extent_mask_and_shift(crypt_stat);
 	crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-		crypt_stat->num_header_bytes_at_front = 0;
+		crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 	else {
 		if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
-			crypt_stat->num_header_bytes_at_front =
-				ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
+			crypt_stat->metadata_size =
+				ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 		else
-			crypt_stat->num_header_bytes_at_front = PAGE_CACHE_SIZE;
+			crypt_stat->metadata_size = PAGE_CACHE_SIZE;
 	}
 }
 
846 847
@@ -1107,9 +1108,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written)
 	(*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
 }
 
-static void
-write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat,
-		     size_t *written)
+void ecryptfs_write_crypt_stat_flags(char *page_virt,
+				     struct ecryptfs_crypt_stat *crypt_stat,
+				     size_t *written)
 {
 	u32 flags = 0;
 	int i;
@@ -1237,8 +1238,7 @@ ecryptfs_write_header_metadata(char *virt,
 
 	header_extent_size = (u32)crypt_stat->extent_size;
 	num_header_extents_at_front =
-		(u16)(crypt_stat->num_header_bytes_at_front
-		      / crypt_stat->extent_size);
+		(u16)(crypt_stat->metadata_size / crypt_stat->extent_size);
 	put_unaligned_be32(header_extent_size, virt);
 	virt += 4;
 	put_unaligned_be16(num_header_extents_at_front, virt);
@@ -1291,7 +1291,8 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
 	offset = ECRYPTFS_FILE_SIZE_BYTES;
 	write_ecryptfs_marker((page_virt + offset), &written);
 	offset += written;
-	write_ecryptfs_flags((page_virt + offset), crypt_stat, &written);
+	ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat,
+					&written);
 	offset += written;
 	ecryptfs_write_header_metadata((page_virt + offset), crypt_stat,
 				       &written);
@@ -1381,7 +1382,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 		rc = -EINVAL;
 		goto out;
 	}
-	virt_len = crypt_stat->num_header_bytes_at_front;
+	virt_len = crypt_stat->metadata_size;
 	order = get_order(virt_len);
 	/* Released in this function */
 	virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
@@ -1427,16 +1428,15 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
 	header_extent_size = get_unaligned_be32(virt);
 	virt += sizeof(__be32);
 	num_header_extents_at_front = get_unaligned_be16(virt);
-	crypt_stat->num_header_bytes_at_front =
-		(((size_t)num_header_extents_at_front
-		  * (size_t)header_extent_size));
+	crypt_stat->metadata_size = (((size_t)num_header_extents_at_front
+				      * (size_t)header_extent_size));
 	(*bytes_read) = (sizeof(__be32) + sizeof(__be16));
 	if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
-	    && (crypt_stat->num_header_bytes_at_front
+	    && (crypt_stat->metadata_size
 		< ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
 		rc = -EINVAL;
 		printk(KERN_WARNING "Invalid header size: [%zd]\n",
-		       crypt_stat->num_header_bytes_at_front);
+		       crypt_stat->metadata_size);
 	}
 	return rc;
 }
@@ -1451,8 +1451,7 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
  */
 static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
 {
-	crypt_stat->num_header_bytes_at_front =
-		ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
+	crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 }
 
 /**
@@ -1606,6 +1605,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 						ecryptfs_dentry,
 						ECRYPTFS_VALIDATE_HEADER_SIZE);
 	if (rc) {
+		memset(page_virt, 0, PAGE_CACHE_SIZE);
 		rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
 		if (rc) {
 			printk(KERN_DEBUG "Valid eCryptfs headers not found in "
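After the rename from num_header_bytes_at_front to metadata_size, the offset math in ecryptfs_lower_offset_for_extent() reads directly: data extent N starts after whatever header the lower file actually carries, and when the metadata lives in an xattr the header contributes nothing. A worked sketch (illustrative, not the kernel helper):

    #include <stddef.h>

    /* header_size is 0 when the metadata lives in an xattr instead */
    static size_t lower_offset_for_extent(size_t header_size,
                                          size_t extent_size,
                                          size_t extent_num)
    {
        return header_size + extent_size * extent_num;
    }
    /* e.g. header 8192, extents of 4096: extent 0 -> 8192, extent 2 -> 16384 */

The memset added above serves the fallback path: the scratch page is cleared between the failed header parse and the xattr retry, so stale bytes from the first attempt cannot be misread as valid metadata.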
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8f006a0d6076..906e803f7f79 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -26,6 +26,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/fs_stack.h>
+#include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
 /**
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 542f625312f3..bc7115403f38 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -273,7 +273,7 @@ struct ecryptfs_crypt_stat {
 	u32 flags;
 	unsigned int file_version;
 	size_t iv_bytes;
-	size_t num_header_bytes_at_front;
+	size_t metadata_size;
 	size_t extent_size; /* Data extent size; default is 4096 */
 	size_t key_size;
 	size_t extent_shift;
@@ -464,6 +464,14 @@ struct ecryptfs_daemon {
 
 extern struct mutex ecryptfs_daemon_hash_mux;
 
+static inline size_t
+ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
+		return 0;
+	return crypt_stat->metadata_size;
+}
+
 static inline struct ecryptfs_file_info *
 ecryptfs_file_to_private(struct file *file)
 {
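The new inline centralizes the "does the header occupy lower-file bytes" decision. A hedged sketch of the way upper_size_to_lower_size (changed later in this diff) consumes it: the lower file size is the header plus the upper size rounded up to whole encrypted extents.

    #include <stdint.h>

    /* header_size comes from something like ecryptfs_lower_header_size() */
    static uint64_t lower_size_for(uint64_t upper_size, uint64_t extent_size,
                                   uint64_t header_size)
    {
        uint64_t lower = header_size;

        if (upper_size) {
            uint64_t extents = (upper_size + extent_size - 1) / extent_size;
            lower += extents * extent_size; /* round up to whole extents */
        }
        return lower;
    }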
@@ -651,6 +659,9 @@ int ecryptfs_decrypt_page(struct page *page);
 int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry);
 int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
 int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
+void ecryptfs_write_crypt_stat_flags(char *page_virt,
+				     struct ecryptfs_crypt_stat *crypt_stat,
+				     size_t *written);
 int ecryptfs_read_and_validate_header_region(char *data,
 					     struct inode *ecryptfs_inode);
 int ecryptfs_read_and_validate_xattr_region(char *page_virt,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 678172b61be2..e7440a6f5ebf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -25,6 +25,7 @@
 
 #include <linux/file.h>
 #include <linux/poll.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/security.h>
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4a430ab4115c..e2d4418affac 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/mount.h>
 #include <linux/crypto.h>
 #include <linux/fs_stack.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
@@ -323,6 +324,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 	rc = ecryptfs_read_and_validate_header_region(page_virt,
 						      ecryptfs_dentry->d_inode);
 	if (rc) {
+		memset(page_virt, 0, PAGE_CACHE_SIZE);
 		rc = ecryptfs_read_and_validate_xattr_region(page_virt,
 							     ecryptfs_dentry);
 		if (rc) {
@@ -335,7 +337,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 			ecryptfs_dentry->d_sb)->mount_crypt_stat;
 	if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
 		if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-			file_size = (crypt_stat->num_header_bytes_at_front
+			file_size = (crypt_stat->metadata_size
 				     + i_size_read(lower_dentry->d_inode));
 		else
 			file_size = i_size_read(lower_dentry->d_inode);
@@ -387,9 +389,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
-		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
-		       "lower_dentry = [%s]\n", __func__, rc,
-		       ecryptfs_dentry->d_name.name);
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+				"[%d] on lower_dentry = [%s]\n", __func__, rc,
+				encrypted_and_encoded_name);
 		goto out_d_drop;
 	}
 	if (lower_dentry->d_inode)
@@ -416,9 +418,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
-		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
-		       "lower_dentry = [%s]\n", __func__, rc,
-		       encrypted_and_encoded_name);
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+				"[%d] on lower_dentry = [%s]\n", __func__, rc,
+				encrypted_and_encoded_name);
 		goto out_d_drop;
 	}
 lookup_and_interpose:
@@ -455,8 +457,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
 	if (rc)
 		goto out_lock;
-	fsstack_copy_attr_times(dir, lower_new_dentry->d_inode);
-	fsstack_copy_inode_size(dir, lower_new_dentry->d_inode);
+	fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
+	fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
 	old_dentry->d_inode->i_nlink =
 		ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
 	i_size_write(new_dentry->d_inode, file_size_save);
@@ -647,38 +649,17 @@ out_lock:
 	return rc;
 }
 
-static int
-ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
+				   size_t *bufsiz)
 {
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	char *lower_buf;
-	size_t lower_bufsiz;
-	struct dentry *lower_dentry;
-	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
-	char *plaintext_name;
-	size_t plaintext_name_size;
+	size_t lower_bufsiz = PATH_MAX;
 	mm_segment_t old_fs;
 	int rc;
 
-	lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	if (!lower_dentry->d_inode->i_op->readlink) {
-		rc = -EINVAL;
-		goto out;
-	}
-	mount_crypt_stat = &ecryptfs_superblock_to_private(
-						dentry->d_sb)->mount_crypt_stat;
-	/*
-	 * If the lower filename is encrypted, it will result in a significantly
-	 * longer name. If needed, truncate the name after decode and decrypt.
-	 */
-	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
-		lower_bufsiz = PATH_MAX;
-	else
-		lower_bufsiz = bufsiz;
-	/* Released in this function */
 	lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
-	if (lower_buf == NULL) {
-		printk(KERN_ERR "%s: Out of memory whilst attempting to "
-		       "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
+	if (!lower_buf) {
 		rc = -ENOMEM;
 		goto out;
 	}
@@ -688,29 +669,31 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 						   (char __user *)lower_buf,
 						   lower_bufsiz);
 	set_fs(old_fs);
-	if (rc >= 0) {
-		rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
-							  &plaintext_name_size,
-							  dentry, lower_buf,
-							  rc);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to decode and "
-			       "decrypt filename; rc = [%d]\n", __func__,
-			       rc);
-			goto out_free_lower_buf;
-		}
-		/* Check for bufsiz <= 0 done in sys_readlinkat() */
-		rc = copy_to_user(buf, plaintext_name,
-				  min((size_t) bufsiz, plaintext_name_size));
-		if (rc)
-			rc = -EFAULT;
-		else
-			rc = plaintext_name_size;
-		kfree(plaintext_name);
-		fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
-	}
-out_free_lower_buf:
+	if (rc < 0)
+		goto out;
+	lower_bufsiz = rc;
+	rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
+						  lower_buf, lower_bufsiz);
+out:
 	kfree(lower_buf);
+	return rc;
+}
+
+static int
+ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+	char *kbuf;
+	size_t kbufsiz, copied;
+	int rc;
+
+	rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
+	if (rc)
+		goto out;
+	copied = min_t(size_t, bufsiz, kbufsiz);
+	rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
+	kfree(kbuf);
+	fsstack_copy_attr_atime(dentry->d_inode,
+				ecryptfs_dentry_to_lower(dentry)->d_inode);
 out:
 	return rc;
 }
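The rewrite above separates target recovery from user copying: ecryptfs_readlink_lower() returns a kernel buffer holding the decoded, decrypted target, and ecryptfs_readlink() only truncates to the caller's buffer and copies out. This is also what lets the new ecryptfs_getattr_link() (further down) size symlink targets without any user buffer. A compilable userspace model of the split (strdup and memcpy stand in for the kernel's decode/decrypt step and copy_to_user):

    #include <stdlib.h>
    #include <string.h>

    /* stand-in for the lower readlink + decode + decrypt step */
    static int readlink_lower(char **buf, size_t *bufsiz)
    {
        *buf = strdup("decrypted-target");
        if (!*buf)
            return -1;
        *bufsiz = strlen(*buf);
        return 0;
    }

    static long do_readlink(char *user_buf, size_t user_bufsiz)
    {
        char *kbuf;
        size_t kbufsiz, copied;

        if (readlink_lower(&kbuf, &kbufsiz))
            return -1;
        copied = kbufsiz < user_bufsiz ? kbufsiz : user_bufsiz; /* min_t */
        memcpy(user_buf, kbuf, copied); /* kernel uses copy_to_user here */
        free(kbuf);
        return (long)copied;
    }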
@@ -768,7 +751,7 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
 {
 	loff_t lower_size;
 
-	lower_size = crypt_stat->num_header_bytes_at_front;
+	lower_size = ecryptfs_lower_header_size(crypt_stat);
 	if (upper_size != 0) {
 		loff_t num_extents;
 
@@ -1015,6 +998,28 @@ out:
 	return rc;
 }
 
+int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
+			  struct kstat *stat)
+{
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+	int rc = 0;
+
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+						dentry->d_sb)->mount_crypt_stat;
+	generic_fillattr(dentry->d_inode, stat);
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+		char *target;
+		size_t targetsiz;
+
+		rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz);
+		if (!rc) {
+			kfree(target);
+			stat->size = targetsiz;
+		}
+	}
+	return rc;
+}
+
 int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		     struct kstat *stat)
 {
@@ -1039,7 +1044,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	if (!lower_dentry->d_inode->i_op->setxattr) {
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1057,7 +1062,7 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
 	int rc = 0;
 
 	if (!lower_dentry->d_inode->i_op->getxattr) {
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1084,7 +1089,7 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	if (!lower_dentry->d_inode->i_op->listxattr) {
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1101,7 +1106,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	if (!lower_dentry->d_inode->i_op->removexattr) {
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
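The four hunks above return -EOPNOTSUPP instead of -ENOSYS when the lower filesystem provides no xattr operation: the syscall itself exists, the operation is just unsupported on that filesystem. A hedged sketch of the dispatch pattern (structure names are illustrative):

#include <errno.h>
#include <stdio.h>

struct demo_iops {
	int (*setxattr)(const char *name);	/* may be NULL */
};

static int demo_setxattr(const struct demo_iops *ops, const char *name)
{
	if (!ops->setxattr)
		return -EOPNOTSUPP;	/* fs lacks the op; the syscall still exists */
	return ops->setxattr(name);
}

int main(void)
{
	struct demo_iops none = { 0 };

	printf("%d\n", demo_setxattr(&none, "user.demo"));	/* -EOPNOTSUPP */
	return 0;
}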
@@ -1132,6 +1137,7 @@ const struct inode_operations ecryptfs_symlink_iops = {
 	.put_link = ecryptfs_put_link,
 	.permission = ecryptfs_permission,
 	.setattr = ecryptfs_setattr,
+	.getattr = ecryptfs_getattr_link,
 	.setxattr = ecryptfs_setxattr,
 	.getxattr = ecryptfs_getxattr,
 	.listxattr = ecryptfs_listxattr,
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index a0a7847567e9..89c5476506ef 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -32,6 +32,7 @@
 #include <linux/random.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
 /**
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e14cf7e588db..d8c3a373aafa 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -22,6 +22,7 @@
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/mount.h>
 #include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index ea2f92101dfe..af1a8f01ebac 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
 #include <linux/key.h>
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
+#include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
 /**
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index f1c17e87c5fb..2d8dbce9d485 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -20,6 +20,7 @@
  * 02111-1307, USA.
  */
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/user_namespace.h>
 #include <linux/nsproxy.h>
 #include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 4ec8f61ccf5a..3745f612bcd4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -24,6 +24,7 @@
 #include <linux/random.h>
 #include <linux/miscdevice.h>
 #include <linux/poll.h>
+#include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/module.h>
 #include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index df4ce99d0597..2ee9a3a7b68c 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
@@ -82,6 +83,19 @@ out:
 	return rc;
 }
 
+static void strip_xattr_flag(char *page_virt,
+			     struct ecryptfs_crypt_stat *crypt_stat)
+{
+	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
+		size_t written;
+
+		crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR;
+		ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat,
+						&written);
+		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
+	}
+}
+
 /**
  * Header Extent:
  *   Octets 0-7:        Unencrypted file size (big-endian)
@@ -97,19 +111,6 @@ out:
  *                     (big-endian)
  *   Octet  26:        Begin RFC 2440 authentication token packet set
  */
-static void set_header_info(char *page_virt,
-			    struct ecryptfs_crypt_stat *crypt_stat)
-{
-	size_t written;
-	size_t save_num_header_bytes_at_front =
-		crypt_stat->num_header_bytes_at_front;
-
-	crypt_stat->num_header_bytes_at_front =
-		ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
-	ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written);
-	crypt_stat->num_header_bytes_at_front =
-		save_num_header_bytes_at_front;
-}
 
 /**
  * ecryptfs_copy_up_encrypted_with_header
@@ -135,8 +136,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 					   * num_extents_per_page)
 					  + extent_num_in_page);
 	size_t num_header_extents_at_front =
-		(crypt_stat->num_header_bytes_at_front
-		 / crypt_stat->extent_size);
+		(crypt_stat->metadata_size / crypt_stat->extent_size);
 
 	if (view_extent_num < num_header_extents_at_front) {
 		/* This is a header extent */
@@ -146,9 +146,14 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 		memset(page_virt, 0, PAGE_CACHE_SIZE);
 		/* TODO: Support more than one header extent */
 		if (view_extent_num == 0) {
+			size_t written;
+
 			rc = ecryptfs_read_xattr_region(
 				page_virt, page->mapping->host);
-			set_header_info(page_virt, crypt_stat);
+			strip_xattr_flag(page_virt + 16, crypt_stat);
+			ecryptfs_write_header_metadata(page_virt + 20,
+						       crypt_stat,
+						       &written);
 		}
 		kunmap_atomic(page_virt, KM_USER0);
 		flush_dcache_page(page);
@@ -161,7 +166,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 		/* This is an encrypted data extent */
 		loff_t lower_offset =
 			((view_extent_num * crypt_stat->extent_size)
-			 - crypt_stat->num_header_bytes_at_front);
+			 - crypt_stat->metadata_size);
 
 		rc = ecryptfs_read_lower_page_segment(
 			page, (lower_offset >> PAGE_CACHE_SHIFT),
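With metadata kept in an xattr, the first extents of the read "view" are synthesized header extents, and every real data extent lives metadata_size bytes earlier in the lower file. A sketch of the offset math (sizes illustrative):

#include <stdio.h>

int main(void)
{
	long long extent_size = 4096, metadata_size = 8192;	/* illustrative */
	long long num_header_extents = metadata_size / extent_size;	/* 2 */
	long long view_extent_num = 5;

	if (view_extent_num >= num_header_extents) {
		long long lower_offset =
			view_extent_num * extent_size - metadata_size;
		printf("view extent %lld -> lower offset %lld\n",
		       view_extent_num, lower_offset);	/* 12288 */
	}
	return 0;
}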
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..278743c7716a 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -26,6 +26,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/key.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/smp_lock.h>
 #include <linux/file.h>
@@ -85,7 +86,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
 		if (lower_dentry->d_inode) {
 			fput(inode_info->lower_file);
 			inode_info->lower_file = NULL;
-			d_drop(lower_dentry);
 		}
 	}
 	ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 7758cc382ef0..6bd3f76fdf88 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -11,6 +11,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a17e4b733e35..76d2a79ef93e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -31,6 +31,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  */
 
+#include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
 #include <scsi/scsi_device.h>
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5293bc411d17..4337cad7777b 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -22,6 +22,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  */
 
+#include <linux/slab.h>
 #include <scsi/scsi_device.h>
 #include <asm/div64.h>
 
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 6cf5e4e84d61..18e57ea1e5b4 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -37,6 +37,7 @@
 #include <linux/vfs.h>
 #include <linux/random.h>
 #include <linux/exportfs.h>
+#include <linux/slab.h>
 
 #include "exofs.h"
 
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 1d081f0cfec2..3cf038c055d7 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -13,6 +13,7 @@
 
 #include "ext2.h"
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 4e2426e22bbe..565cf817bbf1 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -32,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = page_follow_link_light,
 	.put_link = page_put_link,
+	.setattr = ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr = generic_setxattr,
 	.getxattr = generic_getxattr,
@@ -43,6 +44,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
 const struct inode_operations ext2_fast_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = ext2_follow_link,
+	.setattr = ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr = generic_setxattr,
 	.getxattr = generic_getxattr,
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c8155845ac05..b118c6383c6d 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/ext2_fs.h>
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 161da2d3f890..a177122a1b25 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -14,6 +14,7 @@
 #include <linux/time.h>
 #include <linux/capability.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/jbd.h>
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index ef9008b885b5..0d0e97ed3ff6 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -582,7 +582,9 @@ got:
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
 
-	ei->i_state = EXT3_STATE_NEW;
+	ei->i_state_flags = 0;
+	ext3_set_inode_state(inode, EXT3_STATE_NEW);
+
 	ei->i_extra_isize =
 		(EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
 		sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 7f920b7263a4..ea33bdf0a300 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2811,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
 	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
 
-	ei->i_state = 0;
+	ei->i_state_flags = 0;
 	ei->i_dir_start_lookup = 0;
 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
 	/* We now have enough fields to check if the inode was active or not.
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index ff7b4ccd8983..7c4898207776 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = page_follow_link_light,
 	.put_link = page_put_link,
+	.setattr = ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
 	.setxattr = generic_setxattr,
 	.getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
 const struct inode_operations ext3_fast_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = ext3_follow_link,
+	.setattr = ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
 	.setxattr = generic_setxattr,
 	.getxattr = generic_getxattr,
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 474348788dd9..3af91f476dff 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/ext3_jbd.h>
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 983f0e127493..538c48655084 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -18,6 +18,7 @@
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include "ext4.h"
 
 struct ext4_system_zone {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 361c0b9962a8..57f6eef6ccd6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -263,7 +263,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 		ext4_group_t f;
 
 		f = ext4_flex_group(sbi, block_group);
-		atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+		atomic_dec(&sbi->s_flex_groups[f].used_dirs);
 	}
 
 }
@@ -773,7 +773,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t f = ext4_flex_group(sbi, group);
 
-		atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+		atomic_inc(&sbi->s_flex_groups[f].used_dirs);
 	}
 }
 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986120f30066..5381802d6052 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,6 +39,7 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -1035,7 +1036,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 					 sector_t lblock)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
+	sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
 	int blk_bits;
 
 	if (lblock < EXT4_NDIR_BLOCKS)
@@ -1050,7 +1051,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 	}
 	ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
 	ei->i_da_metadata_calc_len = 1;
-	blk_bits = roundup_pow_of_two(lblock + 1);
+	blk_bits = order_base_2(lblock);
 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 54df209d2eed..bde9d0b170c2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,6 +23,7 @@
 
 #include "mballoc.h"
 #include <linux/debugfs.h>
+#include <linux/slab.h>
 #include <trace/events/ext4.h>
 
 /*
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 8b87bd0eac95..34dcfc52ef44 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index aa5fe28d180f..d1fc662cc311 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -15,6 +15,7 @@
 
 #include <linux/fs.h>
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 #include "ext4.h"
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ba191dae8730..e14d22c170d5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
 static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
+static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data, struct vfsmount *mnt);
 
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext3",
+	.get_sb		= ext4_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#else
+#define IS_EXT3_SB(sb) (0)
+#endif
 
 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 			       struct ext4_group_desc *bg)
@@ -2539,7 +2553,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * enable delayed allocation by default
 	 * Use -o nodelalloc to turn it off
 	 */
-	set_opt(sbi->s_mount_opt, DELALLOC);
+	if (!IS_EXT3_SB(sb))
+		set_opt(sbi->s_mount_opt, DELALLOC);
 
 	if (!parse_options((char *) data, sb, &journal_devnum,
 			   &journal_ioprio, NULL, 0))
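IS_EXT3_SB() works by an identity-tag trick: a mount made through the compat "ext3" type records that type's address as the bdev holder, so later code can tell how the device was claimed and skip delayed allocation for ext3-style mounts. A hedged userspace sketch of the same pattern (all names illustrative):

#include <stdio.h>

struct fs_type { const char *name; };

static struct fs_type ext3_compat = { "ext3" };
static struct fs_type ext4_native = { "ext4" };

struct blockdev { const void *holder; };	/* stand-in for bd_holder */

static int is_ext3_mount(const struct blockdev *bdev)
{
	return bdev->holder == &ext3_compat;	/* compare identity, not name */
}

int main(void)
{
	struct blockdev a = { &ext3_compat }, b = { &ext4_native };

	printf("%d %d\n", is_ext3_mount(&a), is_ext3_mount(&b));	/* 1 0 */
	return 0;
}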
@@ -4068,7 +4083,7 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
-#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext2",
@@ -4095,15 +4110,7 @@ static inline void register_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
 #endif
 
-#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext3_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ext3",
-	.get_sb		= ext4_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static inline void register_as_ext3(void)
 {
 	int err = register_filesystem(&ext3_fs_type);
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 983c253999a7..8b145e98df07 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -7,6 +7,7 @@
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/security.h>
+#include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "xattr.h"
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 923990e4f16e..113f0a1e565d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include "fat.h"
 
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c1ef50154868..6fcc7e71fbaa 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 {
 	struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
 	wchar_t *ip, *ext_start, *end, *name_start;
-	unsigned char base[9], ext[4], buf[8], *p;
+	unsigned char base[9], ext[4], buf[5], *p;
 	unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
 	int chl, chi;
 	int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 		return 0;
 	}
 
-	i = jiffies & 0xffff;
+	i = jiffies;
 	sz = (jiffies >> 16) & 0x7;
 	if (baselen > 2) {
 		baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 	name_res[baselen + 4] = '~';
 	name_res[baselen + 5] = '1' + sz;
 	while (1) {
-		sprintf(buf, "%04X", i);
+		snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
 		memcpy(&name_res[baselen], buf, 4);
 		if (vfat_find_form(dir, name_res) < 0)
 			break;
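The shortname fix bounds the hex tail: masking to 16 bits guarantees at most four digits, and snprintf() can never write past the 5-byte buffer. A standalone check:

#include <stdio.h>

int main(void)
{
	unsigned long jiffies_like = 0x12345678UL;	/* illustrative counter */
	char buf[5];	/* "%04X" of a 16-bit value: 4 digits + NUL */

	snprintf(buf, sizeof(buf), "%04X", (unsigned)(jiffies_like & 0xffff));
	printf("%s\n", buf);	/* 5678 */
	return 0;
}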
diff --git a/fs/fifo.c b/fs/fifo.c
index f8f97b8b6d44..5d6606ffc2d2 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/pipe_fs_i.h>
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a24c58e181db..68ba492d8eef 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -10,10 +10,10 @@
 #include <linux/fs.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 /*
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index ed8f0b0dd880..1429f3ae1e86 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -33,7 +33,6 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 
 #include "vxfs_extern.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76fc4d594acb..4b37f7cea4dd 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
@@ -553,108 +554,85 @@ select_queue:
 	return ret;
 }
 
-static void unpin_sb_for_writeback(struct super_block **psb)
+static void unpin_sb_for_writeback(struct super_block *sb)
 {
-	struct super_block *sb = *psb;
-
-	if (sb) {
-		up_read(&sb->s_umount);
-		put_super(sb);
-		*psb = NULL;
-	}
+	up_read(&sb->s_umount);
+	put_super(sb);
 }
 
+enum sb_pin_state {
+	SB_PINNED,
+	SB_NOT_PINNED,
+	SB_PIN_FAILED
+};
+
 /*
  * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
  * before calling writeback. So make sure that we do pin it, so it doesn't
  * go away while we are writing inodes from it.
- *
- * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
- * 1 if we failed.
  */
-static int pin_sb_for_writeback(struct writeback_control *wbc,
-				struct inode *inode, struct super_block **psb)
+static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
+					      struct super_block *sb)
 {
-	struct super_block *sb = inode->i_sb;
-
-	/*
-	 * If this sb is already pinned, nothing more to do. If not and
-	 * *psb is non-NULL, unpin the old one first
-	 */
-	if (sb == *psb)
-		return 0;
-	else if (*psb)
-		unpin_sb_for_writeback(psb);
-
 	/*
 	 * Caller must already hold the ref for this
 	 */
 	if (wbc->sync_mode == WB_SYNC_ALL) {
 		WARN_ON(!rwsem_is_locked(&sb->s_umount));
-		return 0;
+		return SB_NOT_PINNED;
 	}
-
 	spin_lock(&sb_lock);
 	sb->s_count++;
 	if (down_read_trylock(&sb->s_umount)) {
 		if (sb->s_root) {
 			spin_unlock(&sb_lock);
-			goto pinned;
+			return SB_PINNED;
 		}
 		/*
 		 * umounted, drop rwsem again and fall through to failure
 		 */
 		up_read(&sb->s_umount);
 	}
-
 	sb->s_count--;
 	spin_unlock(&sb_lock);
-	return 1;
-pinned:
-	*psb = sb;
-	return 0;
+	return SB_PIN_FAILED;
 }
 
-static void writeback_inodes_wb(struct bdi_writeback *wb,
-				struct writeback_control *wbc)
+/*
+ * Write a portion of b_io inodes which belong to @sb.
+ * If @wbc->sb != NULL, then find and write all such
+ * inodes. Otherwise write only ones which go sequentially
+ * in reverse order.
+ * Return 1, if the caller writeback routine should be
+ * interrupted. Otherwise return 0.
+ */
+static int writeback_sb_inodes(struct super_block *sb,
+			       struct bdi_writeback *wb,
+			       struct writeback_control *wbc)
 {
-	struct super_block *sb = wbc->sb, *pin_sb = NULL;
-	const unsigned long start = jiffies; /* livelock avoidance */
-
-	spin_lock(&inode_lock);
-
-	if (!wbc->for_kupdate || list_empty(&wb->b_io))
-		queue_io(wb, wbc->older_than_this);
-
 	while (!list_empty(&wb->b_io)) {
-		struct inode *inode = list_entry(wb->b_io.prev,
-						struct inode, i_list);
 		long pages_skipped;
-
-		/*
-		 * super block given and doesn't match, skip this inode
-		 */
-		if (sb && sb != inode->i_sb) {
+		struct inode *inode = list_entry(wb->b_io.prev,
+						 struct inode, i_list);
+		if (wbc->sb && sb != inode->i_sb) {
+			/* super block given and doesn't
+			   match, skip this inode */
 			redirty_tail(inode);
 			continue;
 		}
-
+		if (sb != inode->i_sb)
+			/* finish with this superblock */
+			return 0;
 		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
 		}
-
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
 		 */
-		if (inode_dirtied_after(inode, start))
-			break;
-
-		if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
-			requeue_io(inode);
-			continue;
-		}
+		if (inode_dirtied_after(inode, wbc->wb_start))
+			return 1;
 
 		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
 		__iget(inode);
@@ -673,14 +651,50 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		spin_lock(&inode_lock);
 		if (wbc->nr_to_write <= 0) {
 			wbc->more_io = 1;
-			break;
+			return 1;
 		}
 		if (!list_empty(&wb->b_more_io))
 			wbc->more_io = 1;
 	}
+	/* b_io is empty */
+	return 1;
+}
+
+static void writeback_inodes_wb(struct bdi_writeback *wb,
+				struct writeback_control *wbc)
+{
+	int ret = 0;
 
-	unpin_sb_for_writeback(&pin_sb);
+	wbc->wb_start = jiffies; /* livelock avoidance */
+	spin_lock(&inode_lock);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
+
+	while (!list_empty(&wb->b_io)) {
+		struct inode *inode = list_entry(wb->b_io.prev,
+						 struct inode, i_list);
+		struct super_block *sb = inode->i_sb;
+		enum sb_pin_state state;
+
+		if (wbc->sb && sb != wbc->sb) {
+			/* super block given and doesn't
+			   match, skip this inode */
+			redirty_tail(inode);
+			continue;
+		}
+		state = pin_sb_for_writeback(wbc, sb);
+
+		if (state == SB_PIN_FAILED) {
+			requeue_io(inode);
+			continue;
+		}
+		ret = writeback_sb_inodes(sb, wb, wbc);
 
+		if (state == SB_PINNED)
+			unpin_sb_for_writeback(sb);
+		if (ret)
+			break;
+	}
 	spin_unlock(&inode_lock);
 	/* Leave any unwritten inodes on b_io */
 }
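The refactor splits one loop into an outer loop that pins a superblock once and an inner writeback_sb_inodes() pass that consumes the run of inodes belonging to it. A toy userspace model of that control flow (everything here is illustrative, no kernel API):

#include <stdio.h>

enum sb_pin_state { SB_PINNED, SB_NOT_PINNED, SB_PIN_FAILED };

static enum sb_pin_state pin_sb(int sb)
{
	return sb >= 0 ? SB_PINNED : SB_PIN_FAILED;	/* stand-in for the trylock */
}

int main(void)
{
	int queue[] = { 1, 1, 2, 2, 2, 1 };	/* inode index -> sb id */
	int n = sizeof(queue) / sizeof(queue[0]), i = 0;

	while (i < n) {
		int sb = queue[i];
		enum sb_pin_state state = pin_sb(sb);

		if (state == SB_PIN_FAILED) {
			i++;			/* requeue_io() analogue */
			continue;
		}
		while (i < n && queue[i] == sb)	/* writeback_sb_inodes() analogue */
			printf("write inode %d of sb %d\n", i++, sb);
		if (state == SB_PINNED)
			printf("unpin sb %d\n", sb);
	}
	return 0;
}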
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 3221a0c7944e..1e1f286dd70e 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -12,6 +12,7 @@
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/key.h>
 #include <keys/user-type.h>
 #include "internal.h"
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8e..0b589a9b4ffc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
 static void fscache_object_slow_work_put_ref(struct slow_work *);
 static int fscache_object_slow_work_get_ref(struct slow_work *);
 static void fscache_object_slow_work_execute(struct slow_work *);
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
 #endif
 static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
 	.get_ref	= fscache_object_slow_work_get_ref,
 	.put_ref	= fscache_object_slow_work_put_ref,
 	.execute	= fscache_object_slow_work_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	.desc		= fscache_object_slow_work_desc,
 #endif
 };
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
 /*
  * describe an object for slow-work debugging
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 static void fscache_object_slow_work_desc(struct slow_work *work,
 					  struct seq_file *m)
 {
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a14266..f17cecafae44 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -14,6 +14,7 @@
 #define FSCACHE_DEBUG_LEVEL OPERATION
 #include <linux/module.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 atomic_t fscache_op_debug_id;
@@ -500,7 +501,7 @@ static void fscache_op_execute(struct slow_work *work)
 /*
  * describe an operation for slow-work debugging
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
 {
 	struct fscache_operation *op =
@@ -517,7 +518,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
 	.get_ref	= fscache_op_get_ref,
 	.put_ref	= fscache_op_put_ref,
 	.execute	= fscache_op_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	.desc		= fscache_op_desc,
 #endif
 };
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..47aefd376e54 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -14,6 +14,7 @@
 #include <linux/fscache-cache.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 /*
@@ -881,6 +882,7 @@ submit_failed:
 	goto nobufs;
 
 nobufs_unlock_obj:
+	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 nobufs:
 	spin_unlock(&cookie->lock);
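The fscache fix adds the missing stores_lock release on the nobufs_unlock_obj unwind path. The general rule: each error label must release exactly the locks held at that point, in reverse acquisition order. A small pthread sketch of the pattern (assumes POSIX threads):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;	/* object->lock analogue */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;	/* stores_lock analogue */

static int do_work(int fail)
{
	pthread_mutex_lock(&outer);
	pthread_mutex_lock(&inner);
	if (fail)
		goto unlock_both;	/* forgetting 'inner' here is the bug class */
	/* ... normal work ... */
unlock_both:
	pthread_mutex_unlock(&inner);
	pthread_mutex_unlock(&outer);
	return fail ? -1 : 0;
}

int main(void)
{
	printf("%d %d\n", do_work(0), do_work(1));	/* 0 -1 */
	return 0;
}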
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 46435f3aae68..4765190d537f 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -165,8 +165,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_object_lookups),
 		   atomic_read(&fscache_n_object_lookups_negative),
 		   atomic_read(&fscache_n_object_lookups_positive),
-		   atomic_read(&fscache_n_object_lookups_timed_out),
-		   atomic_read(&fscache_n_object_created));
+		   atomic_read(&fscache_n_object_created),
+		   atomic_read(&fscache_n_object_lookups_timed_out));
 
 	seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
 		   atomic_read(&fscache_n_updates),
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de792dcf3274..e1f8171278bd 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -44,6 +44,7 @@
 #include <linux/magic.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/stat.h>
 
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 55458031e501..fe5df5457656 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/generic_acl.h>
 #include <linux/posix_acl.h>
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 4dcddf83326f..a47b43107112 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,7 +8,6 @@ config GFS2_FS
 	select FS_POSIX_ACL
 	select CRC32
 	select SLOW_WORK
-	select QUOTA
 	select QUOTACTL
 	help
 	  A cluster filesystem.
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 583e823307ae..5e411d5f4697 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 91beddadd388..bb7907bde3d8 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d15876e9aa26..c22c21174833 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a6abbae8a278..e6dd2aec6f82 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -640,7 +640,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
-	if (__mandatory_lock(&ip->i_inode))
+	if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if (cmd == F_CANCELLK) {
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 38e3749d476c..49f97d3bb690 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b8025e51cabf..3aac46f6853e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -616,7 +616,7 @@ struct gfs2_sbd {
 	unsigned int sd_log_blks_reserved;
 	unsigned int sd_log_commited_buf;
 	unsigned int sd_log_commited_databuf;
-	unsigned int sd_log_commited_revoke;
+	int sd_log_commited_revoke;
 
 	unsigned int sd_log_num_buf;
 	unsigned int sd_log_num_revoke;
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 569b46240f61..0e0470ed34c2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -9,6 +9,7 @@
 
 #include <linux/fs.h>
 #include <linux/dlm.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/gfs2_ondisk.h>
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4511b08fc451..e5bf4b59d46e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -417,7 +417,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp)
 	databufhdrs_needed = (sdp->sd_log_commited_databuf +
 			      (dbuf_limit - 1)) / dbuf_limit;
 
-	if (sdp->sd_log_commited_revoke)
+	if (sdp->sd_log_commited_revoke > 0)
 		revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
 					  sizeof(u64));
 
@@ -790,7 +790,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
 			     (((int)sdp->sd_log_commited_databuf) >= 0));
 	sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
-	gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
 	reserved = calc_reserved(sdp);
 	gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
 	unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
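sd_log_commited_revoke becomes signed and is tested with "> 0" because the running balance can legitimately dip negative while a transaction is in flight; with an unsigned type that state wraps to a huge value and a plain truth test fires. A two-line demonstration:

#include <stdio.h>

int main(void)
{
	unsigned int u = 0;
	int s = 0;

	u -= 1;	/* wraps to UINT_MAX */
	s -= 1;	/* -1, visibly negative */

	printf("unsigned truthy: %d, signed > 0: %d\n", u != 0, s > 0);	/* 1, 0 */
	return 0;
}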
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4106ddaaa98..f07119d89557 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -10,6 +10,8 @@
 #ifndef __RGRP_DOT_H__
 #define __RGRP_DOT_H__
 
+#include <linux/slab.h>
+
 struct gfs2_rgrpd;
 struct gfs2_sbd;
 struct gfs2_holder;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 419042f7f0b6..54fd98425991 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -8,7 +8,6 @@
  */
 
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 226f2bfbf16a..53511291fe36 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 0d200068d0af..cdb41a1f6a64 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 
 #include "btree.h"
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 052f214ea6f0..38a0a9917d7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/log2.h>
 
 #include "btree.h"
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8bbe03c3f6d5..86428f5ac991 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -11,6 +11,7 @@
 #include <linux/cdrom.h>
 #include <linux/genhd.h>
 #include <linux/nls.h>
+#include <linux/slab.h>
 
 #include "hfs_fs.h"
 #include "btree.h"
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 5ed7252b7b23..0a81eb7111f3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
 #include <linux/nls.h>
 #include <linux/parser.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/vfs.h>
 
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 3fcbb0e1f6fc..572628b4b07d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -15,6 +15,7 @@
 #include <linux/nls.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include "hfsplus_fs.h"
 
 enum {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 032604e5ef2c..3a029d8f4cf1 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -11,6 +11,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/statfs.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include "hostfs.h"
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index b6fca543544c..eac5f96323e3 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -6,6 +6,7 @@
  *  general buffer i/o
  */
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "hpfs_fn.h"
 
 void hpfs_lock_creation(struct super_block *s)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 26e3964a4b8c..2338130cceba 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "hpfs_fn.h"
 
 static int hpfs_dir_release(struct inode *inode, struct file *filp)
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index ff90affb94e1..1042a9bc97f3 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "hpfs_fn.h"
 
 void hpfs_init_inode(struct inode *i)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index cadc4ce48656..aa53842c599c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
 #include <linux/bitmap.h>
+#include <linux/slab.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c7c0b28d7d21..748cfb92dcc6 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -19,6 +19,7 @@
  * See also Documentation/block/ioprio.txt
  *
  */
+#include <linux/gfp.h>
 #include <linux/kernel.h>
 #include <linux/ioprio.h>
 #include <linux/blkdev.h>
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 8ba5441063be..b9ab69b3a482 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -11,6 +11,7 @@
  *  isofs directory handling functions
  */
 #include <linux/smp_lock.h>
+#include <linux/gfp.h>
 #include "isofs.h"
 
 int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index eaa831311c9c..ab438beb867c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/gfp.h>
 #include "isofs.h"
 
 /*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 2c90e3ef625f..ecb44c94ba8d 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -17,7 +17,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index cb1a49ae605e..54c9bc9e1b17 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,7 +20,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #endif
 
 /*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 73063285b13f..049281b7cb89 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -20,7 +20,6 @@
 #include <linux/fs.h>
 #include <linux/jbd2.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/crc32.h>
 #endif
 
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 90cb60d09787..cd02acafde8a 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -11,7 +11,6 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
 #include <linux/lzo.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index cfd301a5edfc..b46661a42758 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -14,7 +14,6 @@
 #endif
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/zlib.h>
 #include <linux/zutil.h>
 #include "nodelist.h"
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 5544d31c066b..ec3538413926 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -15,6 +15,7 @@
 #include <linux/crc32.h>
 #include <linux/jffs2.h>
 #include <linux/mtd/mtd.h>
+#include <linux/slab.h>
 #include "nodelist.h"
 #include "debug.h"
 
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index b7b74e299142..e7291c161a19 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/time.h>
 #include <linux/pagemap.h>
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 87c6f555e1a0..af02bd138469 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -15,7 +15,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/rbtree.h>
 #include <linux/crc32.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include "nodelist.h"
 
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 21a052915aa9..191359dde4e1 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/mtd/mtd.h>
 #include <linux/compiler.h>
 #include <linux/sched.h> /* For cond_resched() */
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index e22de8397b74..d32ee9412cb9 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -567,7 +567,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
                                 else BUG();
                         }
                 }
-        list->rb_node = NULL;
+        *list = RB_ROOT;
 }
 
 static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd)
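The readinode.c hunk above swaps a hand-rolled root reset for the RB_ROOT initializer. A minimal sketch of the idiom (not from the patch; assumes only <linux/rbtree.h>):

    #include <linux/rbtree.h>

    /* Resetting an rbtree after freeing all nodes: assigning RB_ROOT is
     * the idiomatic equivalent of zeroing the root pointer, and stays
     * correct if struct rb_root ever grows additional fields. */
    static void reset_tree(struct rb_root *root)
    {
            *root = RB_ROOT;  /* today: same effect as root->rb_node = NULL */
    }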
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 4ec11e8bda8c..b955626071c2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include "nodelist.h"
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index ca29440e9435..c819eb0e982d 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/crc32.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/mtd/mtd.h>
 #include "nodelist.h"
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 213169780b6c..1057a4998e4e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
 #include "jfs_incore.h"
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9dd126276c9f..ed9ba6fe04f5 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -61,7 +61,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
                 inode->i_op = &page_symlink_inode_operations;
                 inode->i_mapping->a_ops = &jfs_aops;
         } else {
-                inode->i_op = &jfs_symlink_inode_operations;
+                inode->i_op = &jfs_fast_symlink_inode_operations;
                 /*
                  * The inline data should be null-terminated, but
                  * don't let on-disk corruption crash the kernel
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d13b93043b04..c92ea3b3ea5e 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_dmap.h"
@@ -195,7 +196,7 @@ int dbMount(struct inode *ipbmap)
         bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
         bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
         bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
-        bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth);
+        bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
         bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
         bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
         bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
@@ -287,7 +288,7 @@ int dbSync(struct inode *ipbmap)
         dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
         dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
         dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
-        dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth);
+        dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight);
         dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
         dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
         dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
@@ -1440,7 +1441,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
          * tree index of this allocation group within the control page.
          */
         agperlev =
-            (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth;
+            (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
         ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
 
         /* dmap control page trees fan-out by 4 and a single allocation
@@ -1459,7 +1460,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
          * the subtree to find the leftmost leaf that describes this
          * free space.
          */
-        for (k = bmp->db_agheigth; k > 0; k--) {
+        for (k = bmp->db_agheight; k > 0; k--) {
                 for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
                         if (l2nb <= dcp->stree[m + n]) {
                                 ti = m + n;
@@ -3606,7 +3607,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
         }
 
         /*
-         * compute db_aglevel, db_agheigth, db_width, db_agstart:
+         * compute db_aglevel, db_agheight, db_width, db_agstart:
          * an ag is covered in aglevel dmapctl summary tree,
          * at agheight level height (from leaf) with agwidth number of nodes
          * each, which starts at agstart index node of the smmary tree node
@@ -3615,9 +3616,9 @@ void dbFinalizeBmap(struct inode *ipbmap)
         bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
         l2nl =
             bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
-        bmp->db_agheigth = l2nl >> 1;
-        bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1));
-        for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0;
+        bmp->db_agheight = l2nl >> 1;
+        bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1));
+        for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0;
              i--) {
                 bmp->db_agstart += n;
                 n <<= 2;
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 1a6eb41569bc..6dcb906c55d8 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -210,7 +210,7 @@ struct dbmap_disk {
         __le32 dn_maxag;        /* 4: max active alloc group number     */
         __le32 dn_agpref;       /* 4: preferred alloc group (hint)      */
         __le32 dn_aglevel;      /* 4: dmapctl level holding the AG      */
-        __le32 dn_agheigth;     /* 4: height in dmapctl of the AG       */
+        __le32 dn_agheight;     /* 4: height in dmapctl of the AG       */
         __le32 dn_agwidth;      /* 4: width in dmapctl of the AG        */
         __le32 dn_agstart;      /* 4: start tree index at AG height     */
         __le32 dn_agl2size;     /* 4: l2 num of blks per alloc group    */
@@ -229,7 +229,7 @@ struct dbmap {
         int dn_maxag;           /* max active alloc group number        */
         int dn_agpref;          /* preferred alloc group (hint)         */
         int dn_aglevel;         /* dmapctl level holding the AG         */
-        int dn_agheigth;        /* height in dmapctl of the AG          */
+        int dn_agheight;        /* height in dmapctl of the AG          */
         int dn_agwidth;         /* width in dmapctl of the AG           */
         int dn_agstart;         /* start tree index at AG height        */
         int dn_agl2size;        /* l2 num of blks per alloc group       */
@@ -255,7 +255,7 @@ struct bmap {
 #define db_agsize       db_bmap.dn_agsize
 #define db_agl2size     db_bmap.dn_agl2size
 #define db_agwidth      db_bmap.dn_agwidth
-#define db_agheigth     db_bmap.dn_agheigth
+#define db_agheight     db_bmap.dn_agheight
 #define db_agstart      db_bmap.dn_agstart
 #define db_numag        db_bmap.dn_numag
 #define db_maxlevel     db_bmap.dn_maxlevel
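The agheigth/agheight change is a pure spelling fix: the on-disk field keeps its offset and width, so the disk format is unchanged. A hypothetical compile-time check (not in the patch; would have to live inside a function such as dbMount()) that would catch an accidental layout change:

    /* Sketch: dn_agheight must still sit right after dn_aglevel and
     * stay 4 bytes wide, or the on-disk dbmap would be misread. */
    BUILD_BUG_ON(offsetof(struct dbmap_disk, dn_agheight) !=
                 offsetof(struct dbmap_disk, dn_aglevel) + sizeof(__le32));
    BUILD_BUG_ON(sizeof(((struct dbmap_disk *)0)->dn_agheight) != 4);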
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0e4623be70ce..9197a1b0d02d 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -102,6 +102,7 @@
 
 #include <linux/fs.h>
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_filsys.h"
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0fc30407f039..f8332dc8eeb2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -45,6 +45,7 @@
 #include <linux/buffer_head.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 
 #include "jfs_incore.h"
 #include "jfs_inode.h"
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 79e2c79661df..9e6bda30a6e8 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -48,5 +48,6 @@ extern const struct file_operations jfs_dir_operations;
 extern const struct inode_operations jfs_file_inode_operations;
 extern const struct file_operations jfs_file_operations;
 extern const struct inode_operations jfs_symlink_inode_operations;
+extern const struct inode_operations jfs_fast_symlink_inode_operations;
 extern const struct dentry_operations jfs_ci_dentry_operations;
 #endif  /* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 335c4de6552d..c51af2a14516 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -70,6 +70,7 @@
 #include <linux/delay.h>
 #include <linux/mutex.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
 #include "jfs_metapage.h"
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 07b6c5dfb4b6..48b44bd8267b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/buffer_head.h>
 #include <linux/mempool.h>
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 3fbb3a225590..8f0f02cb6ca6 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -19,6 +19,7 @@
 #ifndef _H_JFS_UNICODE
 #define _H_JFS_UNICODE
 
+#include <linux/slab.h>
 #include <asm/byteorder.h>
 #include "jfs_types.h"
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4a3e9f39c21d..a9cf8e8675be 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -956,7 +956,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
          */
 
         if (ssize <= IDATASIZE) {
-                ip->i_op = &jfs_symlink_inode_operations;
+                ip->i_op = &jfs_fast_symlink_inode_operations;
 
                 i_fastsymlink = JFS_IP(ip)->i_inline;
                 memcpy(i_fastsymlink, name, ssize);
@@ -978,7 +978,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
         else {
                 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
 
-                ip->i_op = &page_symlink_inode_operations;
+                ip->i_op = &jfs_symlink_inode_operations;
                 ip->i_mapping->a_ops = &jfs_aops;
 
                 /*
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 7f24a0bb08ca..1aba0039f1c9 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -81,6 +81,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
         struct inode *iplist[1];
         struct jfs_superblock *j_sb, *j_sb2;
         uint old_agsize;
+        int agsizechanged = 0;
         struct buffer_head *bh, *bh2;
 
         /* If the volume hasn't grown, get out now */
@@ -333,6 +334,9 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
          */
         if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
                 goto error_out;
+
+        agsizechanged |= (bmp->db_agsize != old_agsize);
+
         /*
          * the map now has extended to cover additional nblocks:
          * dn_mapsize = oldMapsize + nblocks;
@@ -432,7 +436,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
          * will correctly identify the new ag);
          */
         /* if new AG size the same as old AG size, done! */
-        if (bmp->db_agsize != old_agsize) {
+        if (agsizechanged) {
                 if ((rc = diExtendFS(ipimap, ipbmap)))
                         goto error_out;
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 266699deb1c6..157382fa6256 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -30,6 +30,7 @@
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 #include <linux/smp_lock.h>
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 4af1a05aad0a..205b946d8e0d 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -29,9 +29,21 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
         return NULL;
 }
 
-const struct inode_operations jfs_symlink_inode_operations = {
+const struct inode_operations jfs_fast_symlink_inode_operations = {
         .readlink       = generic_readlink,
         .follow_link    = jfs_follow_link,
+        .setattr        = jfs_setattr,
+        .setxattr       = jfs_setxattr,
+        .getxattr       = jfs_getxattr,
+        .listxattr      = jfs_listxattr,
+        .removexattr    = jfs_removexattr,
+};
+
+const struct inode_operations jfs_symlink_inode_operations = {
+        .readlink       = generic_readlink,
+        .follow_link    = page_follow_link_light,
+        .put_link       = page_put_link,
+        .setattr        = jfs_setattr,
         .setxattr       = jfs_setxattr,
         .getxattr       = jfs_getxattr,
         .listxattr      = jfs_listxattr,
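With the split above, short targets (ssize <= IDATASIZE in jfs_symlink()) resolve from the inode's inline area, while long targets go through the generic page-cache helpers. A sketch of the fast path, assuming jfs_follow_link() reads the inline buffer as the context suggests:

    /* Fast symlink resolution: the target string lives in the inode
     * itself, so there is no page-cache read and nothing to release
     * afterwards (hence no ->put_link in the fast ops). */
    static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
    {
            char *s = JFS_IP(dentry->d_inode)->i_inline;

            nd_set_link(nd, s);
            return NULL;    /* no cookie for ->put_link() */
    }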
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 1f594ab21895..fa96bbb26343 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
 #include <linux/quotaops.h>
 #include <linux/security.h>
 #include "jfs_incore.h"
diff --git a/fs/libfs.c b/fs/libfs.c
index 9e50bcf55857..ea9a6cc9b35c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -5,6 +5,7 @@
 
 #include <linux/module.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
 #include <linux/mutex.h>
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index fc9032dc8862..64fd427c993c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/types.h>
+#include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/nfs_fs.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c81249fef11f..7932c399fab4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index fefa4df3f005..e3015464fbab 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
 #include <linux/utsname.h>
 #include <linux/kernel.h>
 #include <linux/ktime.h>
+#include <linux/slab.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/xprtsock.h>
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 7d150517ddf0..f1bacf1a0391 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -21,7 +21,6 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/uio.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/mutex.h>
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a7966eed3c17..031c6569a134 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
 
 #include <linux/types.h>
 #include <linux/time.h>
-#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d1001790fa9a..84055d31bfc5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 56c9519d900a..0f2ab741ae7c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
 
 #include <linux/types.h>
 #include <linux/time.h>
-#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index ad478da7ca63..d0ef94cfb3da 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -10,6 +10,7 @@
 #include <linux/string.h>
 #include <linux/time.h>
 #include <linux/in.h>
+#include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9718c22f186d..243c00071f76 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -9,6 +9,7 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 
 #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
 
@@ -80,6 +81,7 @@ static void writeseg_end_io(struct bio *bio, int err)
                 prefetchw(&bvec->bv_page->flags);
 
                 end_page_writeback(page);
+                page_cache_release(page);
         } while (bvec >= bio->bi_io_vec);
         bio_put(bio);
         if (atomic_dec_and_test(&super->s_pending_writes))
@@ -97,8 +99,10 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
         unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
         int i;
 
+        if (max_pages > BIO_MAX_PAGES)
+                max_pages = BIO_MAX_PAGES;
         bio = bio_alloc(GFP_NOFS, max_pages);
-        BUG_ON(!bio); /* FIXME: handle this */
+        BUG_ON(!bio);
 
         for (i = 0; i < nr_pages; i++) {
                 if (i >= max_pages) {
@@ -191,8 +195,10 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
         unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
         int i;
 
+        if (max_pages > BIO_MAX_PAGES)
+                max_pages = BIO_MAX_PAGES;
         bio = bio_alloc(GFP_NOFS, max_pages);
-        BUG_ON(!bio); /* FIXME: handle this */
+        BUG_ON(!bio);
 
         for (i = 0; i < nr_pages; i++) {
                 if (i >= max_pages) {
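The two clamps above matter because bio_alloc() cannot build a bio with more than BIO_MAX_PAGES vectors; asking for more returns NULL, which the BUG_ON then turns into a crash on devices with large hardware queues. The same pattern, sketched as an equivalent one-liner:

    /* Cap the request before allocating; min_t() keeps the types honest. */
    max_pages = min_t(unsigned int, max_pages, BIO_MAX_PAGES);
    bio = bio_alloc(GFP_NOFS, max_pages);   /* now a size bio_alloc can satisfy */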
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 56a8bfbb0120..2396a85c0f55 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -6,7 +6,7 @@
  * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
  */
 #include "logfs.h"
-
+#include <linux/slab.h>
 
 /*
  * Atomic dir operations
@@ -303,12 +303,12 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
                                 (filler_t *)logfs_readpage, NULL);
                 if (IS_ERR(page))
                         return PTR_ERR(page);
-                dd = kmap_atomic(page, KM_USER0);
+                dd = kmap(page);
                 BUG_ON(dd->namelen == 0);
 
                 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
                                 pos, be64_to_cpu(dd->ino), dd->type);
-                kunmap_atomic(dd, KM_USER0);
+                kunmap(page);
                 page_cache_release(page);
                 if (full)
                         break;
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index 92949f95a901..76c242fbe1b0 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -7,6 +7,7 @@
  */
 #include "logfs.h"
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 /*
  * Wear leveling needs to kick in when the difference between low erase
@@ -458,6 +459,14 @@ static void __logfs_gc_pass(struct super_block *sb, int target)
         struct logfs_block *block;
         int round, progress, last_progress = 0;
 
+        /*
+         * Doing too many changes to the segfile at once would result
+         * in a large number of aliases.  Write the journal before
+         * things get out of hand.
+         */
+        if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
+                logfs_write_anchor(sb);
+
         if (no_free_segments(sb) >= target &&
             super->s_no_object_aliases < MAX_OBJ_ALIASES)
                 return;
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 33ec1aeaeec4..14ed27274da2 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -6,6 +6,7 @@
  * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
  */
 #include "logfs.h"
+#include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 6ad30a4c9052..fb0a613f885b 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -6,6 +6,7 @@
  * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
  */
 #include "logfs.h"
+#include <linux/slab.h>
 
 static void logfs_calc_free(struct super_block *sb)
 {
@@ -388,7 +389,10 @@ static void journal_get_erase_count(struct logfs_area *area)
 static int journal_erase_segment(struct logfs_area *area)
 {
         struct super_block *sb = area->a_sb;
-        struct logfs_segment_header sh;
+        union {
+                struct logfs_segment_header sh;
+                unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
+        } u;
         u64 ofs;
         int err;
 
@@ -396,20 +400,21 @@ static int journal_erase_segment(struct logfs_area *area)
         if (err)
                 return err;
 
-        sh.pad = 0;
-        sh.type = SEG_JOURNAL;
-        sh.level = 0;
-        sh.segno = cpu_to_be32(area->a_segno);
-        sh.ec = cpu_to_be32(area->a_erase_count);
-        sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
-        sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
+        memset(&u, 0, sizeof(u));
+        u.sh.pad = 0;
+        u.sh.type = SEG_JOURNAL;
+        u.sh.level = 0;
+        u.sh.segno = cpu_to_be32(area->a_segno);
+        u.sh.ec = cpu_to_be32(area->a_erase_count);
+        u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
+        u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);
 
         /* This causes a bug in segment.c. Not yet. */
         //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
 
         ofs = dev_ofs(sb, area->a_segno, 0);
-        area->a_used_bytes = ALIGN(sizeof(sh), 16);
-        logfs_buf_write(area, ofs, &sh, sizeof(sh));
+        area->a_used_bytes = sizeof(u);
+        logfs_buf_write(area, ofs, &u, sizeof(u));
         return 0;
 }
 
@@ -493,6 +498,8 @@ static void account_shadows(struct super_block *sb)
 
         btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
         btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
+        btree_grim_visitor32(&tree->segment_map, 0, NULL);
+        tree->no_shadowed_segments = 0;
 
         if (li->li_block) {
                 /*
@@ -606,9 +613,9 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
         if (len == 0)
                 return logfs_write_header(super, header, 0, type);
 
+        BUG_ON(len > sb->s_blocksize);
         compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
         if (compr_len < 0 || type == JE_ANCHOR) {
-                BUG_ON(len > sb->s_blocksize);
                 memcpy(data, buf, len);
                 compr_len = len;
                 compr = COMPR_NONE;
@@ -660,6 +667,7 @@ static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
         if (ofs < 0)
                 return ofs;
         logfs_buf_write(area, ofs, super->s_compressed_je, len);
+        BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
         super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
         return 0;
 }
@@ -800,6 +808,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
 {
         struct logfs_super *super = logfs_super(sb);
         struct logfs_area *area = super->s_journal_area;
+        struct btree_head32 *head = &super->s_reserved_segments;
         u32 segno, ec;
         int i, err;
 
@@ -807,6 +816,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
         /* Drop old segments */
         journal_for_each(i)
                 if (super->s_journal_seg[i]) {
+                        btree_remove32(head, super->s_journal_seg[i]);
                         logfs_set_segment_unreserved(sb,
                                         super->s_journal_seg[i],
                                         super->s_journal_ec[i]);
@@ -819,8 +829,13 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
                 super->s_journal_seg[i] = segno;
                 super->s_journal_ec[i] = ec;
                 logfs_set_segment_reserved(sb, segno);
+                err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
+                BUG_ON(err); /* mempool should prevent this */
+                err = logfs_erase_segment(sb, segno, 1);
+                BUG_ON(err); /* FIXME: remount-ro would be nicer */
         }
         /* Manually move journal_area */
+        freeseg(sb, area->a_segno);
         area->a_segno = super->s_journal_seg[0];
         area->a_is_open = 0;
         area->a_used_bytes = 0;
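The union in journal_erase_segment() exists because the write path accounts in 16-byte-aligned chunks (note a_used_bytes = sizeof(u)); writing the bare header would let whatever stack bytes follow it reach the media. The idiom in isolation, a sketch assuming ALIGN from <linux/kernel.h>:

    /* Reserve the header plus its alignment padding as one object and
     * zero the whole thing, so the padding is written as 0x00 rather
     * than uninitialized stack contents. */
    union {
            struct logfs_segment_header sh;
            unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
    } u;

    memset(&u, 0, sizeof(u));   /* all padding bytes now defined */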
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 129779431373..0a3df1a0c936 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -257,10 +257,14 @@ struct logfs_shadow {
  * struct shadow_tree
  * @new:                        shadows where old_ofs==0, indexed by new_ofs
  * @old:                        shadows where old_ofs!=0, indexed by old_ofs
+ * @segment_map:                bitfield of segments containing shadows
+ * @no_shadowed_segment:        number of segments containing shadows
  */
 struct shadow_tree {
         struct btree_head64 new;
         struct btree_head64 old;
+        struct btree_head32 segment_map;
+        int no_shadowed_segments;
 };
 
 struct object_alias_item {
@@ -305,13 +309,14 @@ typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
                              level_t level, int child_no, __be64 val);
 struct logfs_block_ops {
         void    (*write_block)(struct logfs_block *block);
-        gc_level_t      (*block_level)(struct logfs_block *block);
         void    (*free_block)(struct super_block *sb, struct logfs_block*block);
         int     (*write_alias)(struct super_block *sb,
                         struct logfs_block *block,
                         write_alias_t *write_one_alias);
 };
 
+#define MAX_JOURNAL_ENTRIES 256
+
 struct logfs_super {
         struct mtd_info *s_mtd;                 /* underlying device */
         struct block_device *s_bdev;            /* underlying device */
@@ -378,7 +383,7 @@ struct logfs_super {
         u32 s_journal_ec[LOGFS_JOURNAL_SEGS];   /* journal erasecounts */
         u64 s_last_version;
         struct logfs_area *s_journal_area;      /* open journal segment */
-        __be64 s_je_array[64];
+        __be64 s_je_array[MAX_JOURNAL_ENTRIES];
         int s_no_je;
 
         int s_sum_index;                        /* for the 12 summaries */
@@ -587,6 +592,7 @@ void move_page_to_btree(struct page *page);
 int logfs_init_mapping(struct super_block *sb);
 void logfs_sync_area(struct logfs_area *area);
 void logfs_sync_segments(struct super_block *sb);
+void freeseg(struct super_block *sb, u32 segno);
 
 /* area handling */
 int logfs_init_areas(struct super_block *sb);
@@ -721,4 +727,10 @@ static inline struct logfs_area *get_area(struct super_block *sb,
         return logfs_super(sb)->s_area[(__force u8)gc_level];
 }
 
+static inline void logfs_mempool_destroy(mempool_t *pool)
+{
+        if (pool)
+                mempool_destroy(pool);
+}
+
 #endif
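logfs_mempool_destroy() is a NULL-tolerant wrapper: mempool_destroy() in this era oopses when handed a NULL pool, and several logfs teardown paths run before every pool has been allocated. A usage sketch on a partially-initialized superblock (pool names taken from this patch):

    /* Safe teardown even when earlier setup steps failed and left
     * some pools unallocated (NULL). */
    logfs_mempool_destroy(super->s_block_pool);     /* may be NULL */
    logfs_mempool_destroy(super->s_shadow_pool);    /* may be NULL */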
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 7a23b3e7c0a7..3159db6958e5 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -18,6 +18,7 @@
  */
 #include "logfs.h"
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 static u64 adjust_bix(u64 bix, level_t level)
 {
@@ -429,25 +430,6 @@ static void inode_write_block(struct logfs_block *block)
         }
 }
 
-static gc_level_t inode_block_level(struct logfs_block *block)
-{
-        BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
-        return GC_LEVEL(LOGFS_MAX_LEVELS);
-}
-
-static gc_level_t indirect_block_level(struct logfs_block *block)
-{
-        struct page *page;
-        struct inode *inode;
-        u64 bix;
-        level_t level;
-
-        page = block->page;
-        inode = page->mapping->host;
-        logfs_unpack_index(page->index, &bix, &level);
-        return expand_level(inode->i_ino, level);
-}
-
 /*
  * This silences a false, yet annoying gcc warning. I hate it when my editor
  * jumps into bitops.h each time I recompile this file.
@@ -586,14 +568,12 @@
 
 static struct logfs_block_ops inode_block_ops = {
         .write_block = inode_write_block,
-        .block_level = inode_block_level,
         .free_block = inode_free_block,
         .write_alias = inode_write_alias,
 };
 
 struct logfs_block_ops indirect_block_ops = {
         .write_block = indirect_write_block,
-        .block_level = indirect_block_level,
         .free_block = indirect_free_block,
         .write_alias = indirect_write_alias,
 };
@@ -1240,6 +1220,18 @@ static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
         mempool_free(shadow, super->s_shadow_pool);
 }
 
+static void mark_segment(struct shadow_tree *tree, u32 segno)
+{
+        int err;
+
+        if (!btree_lookup32(&tree->segment_map, segno)) {
+                err = btree_insert32(&tree->segment_map, segno, (void *)1,
+                                GFP_NOFS);
+                BUG_ON(err);
+                tree->no_shadowed_segments++;
+        }
+}
+
 /**
  * fill_shadow_tree - Propagate shadow tree changes due to a write
  * @inode:      Inode owning the page
@@ -1287,6 +1279,8 @@ static void fill_shadow_tree(struct inode *inode, struct page *page,
 
                 super->s_dirty_used_bytes += shadow->new_len;
                 super->s_dirty_free_bytes += shadow->old_len;
+                mark_segment(tree, shadow->old_ofs >> super->s_segshift);
+                mark_segment(tree, shadow->new_ofs >> super->s_segshift);
         }
 }
 
@@ -1594,7 +1588,6 @@ int logfs_delete(struct inode *inode, pgoff_t index,
         return ret;
 }
 
-/* Rewrite cannot mark the inode dirty but has to write it immediatly. */
 int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
                 gc_level_t gc_level, long flags)
 {
@@ -1611,6 +1604,18 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
                 if (level != 0)
                         alloc_indirect_block(inode, page, 0);
                 err = logfs_write_buf(inode, page, flags);
+                if (!err && shrink_level(gc_level) == 0) {
+                        /* Rewrite cannot mark the inode dirty but has to
+                         * write it immediatly.
+                         * Q: Can't we just create an alias for the inode
+                         * instead?  And if not, why not?
+                         */
+                        if (inode->i_ino == LOGFS_INO_MASTER)
+                                logfs_write_anchor(inode->i_sb);
+                        else {
+                                err = __logfs_write_inode(inode, flags);
+                        }
+                }
         }
         logfs_put_write_page(page);
         return err;
@@ -1833,19 +1838,37 @@ static int __logfs_truncate(struct inode *inode, u64 size)
         return logfs_truncate_direct(inode, size);
 }
 
-int logfs_truncate(struct inode *inode, u64 size)
+/*
+ * Truncate, by changing the segment file, can consume a fair amount
+ * of resources.  So back off from time to time and do some GC.
+ * 8 or 2048 blocks should be well within safety limits even if
+ * every single block resided in a different segment.
+ */
+#define TRUNCATE_STEP   (8 * 1024 * 1024)
+int logfs_truncate(struct inode *inode, u64 target)
 {
         struct super_block *sb = inode->i_sb;
-        int err;
+        u64 size = i_size_read(inode);
+        int err = 0;
 
-        logfs_get_wblocks(sb, NULL, 1);
-        err = __logfs_truncate(inode, size);
-        if (!err)
-                err = __logfs_write_inode(inode, 0);
-        logfs_put_wblocks(sb, NULL, 1);
+        size = ALIGN(size, TRUNCATE_STEP);
+        while (size > target) {
+                if (size > TRUNCATE_STEP)
+                        size -= TRUNCATE_STEP;
+                else
+                        size = 0;
+                if (size < target)
+                        size = target;
+
+                logfs_get_wblocks(sb, NULL, 1);
+                err = __logfs_truncate(inode, target);
+                if (!err)
+                        err = __logfs_write_inode(inode, 0);
+                logfs_put_wblocks(sb, NULL, 1);
+        }
 
         if (!err)
-                err = vmtruncate(inode, size);
+                err = vmtruncate(inode, target);
 
         /* I don't trust error recovery yet. */
         WARN_ON(err);
@@ -2239,8 +2262,6 @@ void logfs_cleanup_rw(struct super_block *sb)
         struct logfs_super *super = logfs_super(sb);
 
         destroy_meta_inode(super->s_segfile_inode);
-        if (super->s_block_pool)
-                mempool_destroy(super->s_block_pool);
-        if (super->s_shadow_pool)
-                mempool_destroy(super->s_shadow_pool);
+        logfs_mempool_destroy(super->s_block_pool);
+        logfs_mempool_destroy(super->s_shadow_pool);
 }
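The stepped logfs_truncate() bounds how much work a single wblocks transaction does, letting GC run between passes. A hypothetical userspace walk-through (illustration only, not kernel code) of the size sequence for a 20 MiB file truncated to zero with the 8 MiB step:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long step = 8ULL << 20;           /* TRUNCATE_STEP */
            unsigned long long size = 20ULL << 20, target = 0;

            size = (size + step - 1) / step * step;         /* ALIGN -> 24 MiB */
            while (size > target) {
                    size = size > step ? size - step : 0;
                    if (size < target)
                            size = target;
                    /* one bounded truncate pass per iteration */
                    printf("pass down to %llu MiB\n", size >> 20);
            }
            return 0;       /* prints 16, 8, 0 */
    }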
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 1a14f9910d55..f77ce2b470ba 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -10,6 +10,7 @@
  * three kinds of objects: inodes, dentries and blocks, both data and indirect.
  */
 #include "logfs.h"
+#include <linux/slab.h>
 
 static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
 {
@@ -93,50 +94,58 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
         } while (len);
 }
 
-/*
- * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
- */
-static void pad_wbuf(struct logfs_area *area, int final)
+static void pad_partial_page(struct logfs_area *area)
 {
         struct super_block *sb = area->a_sb;
-        struct logfs_super *super = logfs_super(sb);
         struct page *page;
         u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
         pgoff_t index = ofs >> PAGE_SHIFT;
         long offset = ofs & (PAGE_SIZE-1);
         u32 len = PAGE_SIZE - offset;
 
-        if (len == PAGE_SIZE) {
-                /* The math in this function can surely use some love */
-                len = 0;
-        }
-        if (len) {
-                BUG_ON(area->a_used_bytes >= super->s_segsize);
-
-                page = get_mapping_page(area->a_sb, index, 0);
+        if (len % PAGE_SIZE) {
+                page = get_mapping_page(sb, index, 0);
                 BUG_ON(!page); /* FIXME: reserve a pool */
                 memset(page_address(page) + offset, 0xff, len);
                 SetPagePrivate(page);
                 page_cache_release(page);
         }
+}
 
-        if (!final)
-                return;
+static void pad_full_pages(struct logfs_area *area)
+{
+        struct super_block *sb = area->a_sb;
+        struct logfs_super *super = logfs_super(sb);
+        u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+        u32 len = super->s_segsize - area->a_used_bytes;
+        pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
+        pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
+        struct page *page;
 
-        area->a_used_bytes += len;
-        for ( ; area->a_used_bytes < super->s_segsize;
-                        area->a_used_bytes += PAGE_SIZE) {
-                /* Memset another page */
-                index++;
-                page = get_mapping_page(area->a_sb, index, 0);
+        while (no_indizes) {
+                page = get_mapping_page(sb, index, 0);
                 BUG_ON(!page); /* FIXME: reserve a pool */
-                memset(page_address(page), 0xff, PAGE_SIZE);
+                SetPageUptodate(page);
+                memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
                 SetPagePrivate(page);
                 page_cache_release(page);
+                index++;
+                no_indizes--;
         }
 }
 
 /*
+ * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
+ * Also make sure we allocate (and memset) all pages for final writeout.
+ */
+static void pad_wbuf(struct logfs_area *area, int final)
+{
+        pad_partial_page(area);
+        if (final)
+                pad_full_pages(area);
+}
+
+/*
  * We have to be careful with the alias tree. Since lookup is done by bix,
  * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
  * indirect blocks. So always use it through accessor functions.
@@ -174,14 +183,8 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
         return 0;
 }
 
-static gc_level_t btree_block_level(struct logfs_block *block)
-{
-        return expand_level(block->ino, block->level);
-}
-
 static struct logfs_block_ops btree_block_ops = {
         .write_block    = btree_write_block,
-        .block_level    = btree_block_level,
         .free_block     = __free_block,
         .write_alias    = btree_write_alias,
 };
@@ -683,7 +686,7 @@ int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
         return 0;
 }
 
-static void freeseg(struct super_block *sb, u32 segno)
+void freeseg(struct super_block *sb, u32 segno)
 {
         struct logfs_super *super = logfs_super(sb);
         struct address_space *mapping = super->s_mapping_inode->i_mapping;
@@ -910,7 +913,7 @@ err:
         for (i--; i >= 0; i--)
                 free_area(super->s_area[i]);
         free_area(super->s_journal_area);
-        mempool_destroy(super->s_alias_pool);
+        logfs_mempool_destroy(super->s_alias_pool);
         return -ENOMEM;
 }
 
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index c66beab78dee..5866ee6e1327 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -11,6 +11,8 @@
  */
 #include "logfs.h"
 #include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
 #include <linux/mtd/mtd.h>
 #include <linux/statfs.h>
 #include <linux/buffer_head.h>
@@ -136,6 +138,10 @@ static int logfs_sb_set(struct super_block *sb, void *_super)
         sb->s_fs_info = super;
         sb->s_mtd = super->s_mtd;
         sb->s_bdev = super->s_bdev;
+        if (sb->s_bdev)
+                sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
+        if (sb->s_mtd)
+                sb->s_bdi = sb->s_mtd->backing_dev_info;
         return 0;
 }
 
@@ -277,7 +283,7 @@ static int logfs_recover_sb(struct super_block *sb)
         }
         if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
                 printk(KERN_INFO"Superblocks don't match - fixing.\n");
-                return write_one_sb(sb, super->s_devops->find_last_sb);
+                return logfs_write_sb(sb);
         }
         /* If neither is valid now, something's wrong.  Didn't we properly
          * check them before?!? */
@@ -289,6 +295,10 @@ static int logfs_make_writeable(struct super_block *sb)
 {
         int err;
 
+        err = logfs_open_segfile(sb);
+        if (err)
+                return err;
+
         /* Repair any broken superblock copies */
         err = logfs_recover_sb(sb);
         if (err)
@@ -299,10 +309,6 @@ static int logfs_make_writeable(struct super_block *sb)
         if (err)
                 return err;
 
-        err = logfs_open_segfile(sb);
-        if (err)
-                return err;
-
         /* Do one GC pass before any data gets dirtied */
         logfs_gc_pass(sb);
 
@@ -328,7 +334,7 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
 
         sb->s_root = d_alloc_root(rootdir);
         if (!sb->s_root)
-                goto fail;
+                goto fail2;
 
         super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
         if (!super->s_erase_page)
@@ -451,6 +457,8 @@ static int logfs_read_sb(struct super_block *sb, int read_only)
 
         btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
         btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
+        btree_init_mempool32(&super->s_shadow_tree.segment_map,
+                        super->s_btree_pool);
 
         ret = logfs_init_mapping(sb);
         if (ret)
@@ -515,8 +523,8 @@ static void logfs_kill_sb(struct super_block *sb)
         if (super->s_erase_page)
                 __free_page(super->s_erase_page);
         super->s_devops->put_device(sb);
-        mempool_destroy(super->s_btree_pool);
-        mempool_destroy(super->s_alias_pool);
+        logfs_mempool_destroy(super->s_btree_pool);
+        logfs_mempool_destroy(super->s_alias_pool);
         kfree(super);
         log_super("LogFS: Finished unmounting\n");
 }
@@ -572,8 +580,7 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
         return 0;
 
 err1:
-        up_write(&sb->s_umount);
-        deactivate_super(sb);
+        deactivate_locked_super(sb);
         return err;
 err0:
         kfree(super);
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 82d6554b02fe..282e15ad8cd8 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -1,4 +1,5 @@
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include "minix.h"
 
 enum {DEPTH = 3, DIRECT = 7};   /* Only double indirect */
diff --git a/fs/mpage.c b/fs/mpage.c
index 598d54e200eb..fd56ca2ea556 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/kdev_t.h>
+#include <linux/gfp.h>
 #include <linux/bio.h>
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/namei.c b/fs/namei.c
index 1c0fca6e899e..a7dce91a7e42 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1610,8 +1610,7 @@ exit:
1610 1610
1611static struct file *do_last(struct nameidata *nd, struct path *path, 1611static struct file *do_last(struct nameidata *nd, struct path *path,
1612 int open_flag, int acc_mode, 1612 int open_flag, int acc_mode,
1613 int mode, const char *pathname, 1613 int mode, const char *pathname)
1614 int *want_dir)
1615{ 1614{
1616 struct dentry *dir = nd->path.dentry; 1615 struct dentry *dir = nd->path.dentry;
1617 struct file *filp; 1616 struct file *filp;
@@ -1642,7 +1641,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1642 if (nd->last.name[nd->last.len]) { 1641 if (nd->last.name[nd->last.len]) {
1643 if (open_flag & O_CREAT) 1642 if (open_flag & O_CREAT)
1644 goto exit; 1643 goto exit;
1645 *want_dir = 1; 1644 nd->flags |= LOOKUP_DIRECTORY;
1646 } 1645 }
1647 1646
1648 /* just plain open? */ 1647 /* just plain open? */
@@ -1656,8 +1655,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1656 if (path->dentry->d_inode->i_op->follow_link) 1655 if (path->dentry->d_inode->i_op->follow_link)
1657 return NULL; 1656 return NULL;
1658 error = -ENOTDIR; 1657 error = -ENOTDIR;
1659 if (*want_dir && !path->dentry->d_inode->i_op->lookup) 1658 if (nd->flags & LOOKUP_DIRECTORY) {
1660 goto exit_dput; 1659 if (!path->dentry->d_inode->i_op->lookup)
1660 goto exit_dput;
1661 }
1661 path_to_nameidata(path, nd); 1662 path_to_nameidata(path, nd);
1662 audit_inode(pathname, nd->path.dentry); 1663 audit_inode(pathname, nd->path.dentry);
1663 goto ok; 1664 goto ok;
@@ -1766,7 +1767,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
1766 int count = 0; 1767 int count = 0;
1767 int flag = open_to_namei_flags(open_flag); 1768 int flag = open_to_namei_flags(open_flag);
1768 int force_reval = 0; 1769 int force_reval = 0;
1769 int want_dir = open_flag & O_DIRECTORY;
1770 1770
1771 if (!(open_flag & O_CREAT)) 1771 if (!(open_flag & O_CREAT))
1772 mode = 0; 1772 mode = 0;
@@ -1828,7 +1828,9 @@ reval:
1828 if (open_flag & O_EXCL) 1828 if (open_flag & O_EXCL)
1829 nd.flags |= LOOKUP_EXCL; 1829 nd.flags |= LOOKUP_EXCL;
1830 } 1830 }
1831 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir); 1831 if (open_flag & O_DIRECTORY)
1832 nd.flags |= LOOKUP_DIRECTORY;
1833 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1832 while (unlikely(!filp)) { /* trailing symlink */ 1834 while (unlikely(!filp)) { /* trailing symlink */
1833 struct path holder; 1835 struct path holder;
1834 struct inode *inode = path.dentry->d_inode; 1836 struct inode *inode = path.dentry->d_inode;
@@ -1866,7 +1868,7 @@ reval:
1866 } 1868 }
1867 holder = path; 1869 holder = path;
1868 nd.flags &= ~LOOKUP_PARENT; 1870 nd.flags &= ~LOOKUP_PARENT;
1869 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir); 1871 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1870 if (inode->i_op->put_link) 1872 if (inode->i_op->put_link)
1871 inode->i_op->put_link(holder.dentry, &nd, cookie); 1873 inode->i_op->put_link(holder.dentry, &nd, cookie);
1872 path_put(&holder); 1874 path_put(&holder);
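The fs/namei.c hunks above replace do_last()'s ad-hoc want_dir out-parameter with the existing LOOKUP_DIRECTORY bit in nd->flags, so the directory requirement is set once in do_filp_open() and survives the trailing-symlink retry loop unchanged. A minimal standalone sketch of that flag-propagation pattern; the struct and flag values are simplified stand-ins, not the kernel's definitions:

/* Flag propagation instead of an out-parameter; LOOKUP_DIRECTORY and
 * O_DIRECTORY values here are illustrative, not the kernel's. */
#include <stdio.h>

#define LOOKUP_DIRECTORY 0x0001	/* lookup must end at a directory */
#define O_DIRECTORY      0x0002	/* open(2) caller demanded a directory */

struct nd { unsigned int flags; };

/* do_last() reads the requirement from nd->flags; no want_dir pointer. */
static void do_last(struct nd *nd)
{
	if (nd->flags & LOOKUP_DIRECTORY)
		printf("must resolve to a directory\n");
}

int main(void)
{
	struct nd nd = { .flags = 0 };
	unsigned int open_flag = O_DIRECTORY;

	if (open_flag & O_DIRECTORY)	/* set once, before the first call */
		nd.flags |= LOOKUP_DIRECTORY;
	do_last(&nd);			/* same state persists... */
	do_last(&nd);			/* ...on the symlink retry */
	return 0;
}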
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b8b5b30d53f0..7edfcd4d5e52 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -15,7 +15,6 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/mm.h> 19#include <linux/mm.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6a7d901f1936..1daabb90e0a5 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -15,7 +15,6 @@
15#include <linux/fcntl.h> 15#include <linux/fcntl.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/sched.h> 19#include <linux/sched.h>
21#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index ec8f45f12e05..60a5e2864ea8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -15,6 +15,7 @@
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/slab.h>
18#include <linux/highuid.h> 19#include <linux/highuid.h>
19#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
20#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 15458decdb8a..56f5b3a0e1ee 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -9,12 +9,12 @@
9#include <linux/stat.h> 9#include <linux/stat.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/gfp.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/shm.h> 14#include <linux/shm.h>
14#include <linux/errno.h> 15#include <linux/errno.h>
15#include <linux/mman.h> 16#include <linux/mman.h>
16#include <linux/string.h> 17#include <linux/string.h>
17#include <linux/slab.h>
18#include <linux/fcntl.h> 18#include <linux/fcntl.h>
19#include <linux/ncp_fs.h> 19#include <linux/ncp_fs.h>
20 20
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index e37df8d5fe70..c7ff6c700a6e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/netdevice.h> 22#include <linux/netdevice.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/slab.h>
24#include <net/scm.h> 25#include <net/scm.h>
25#include <net/sock.h> 26#include <net/sock.h>
26#include <linux/ipx.h> 27#include <linux/ipx.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e3d26c1bd105..c634fd17b337 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -27,6 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/ncp_fs.h> 28#include <linux/ncp_fs.h>
29#include <linux/time.h> 29#include <linux/time.h>
30#include <linux/slab.h>
30#include <linux/mm.h> 31#include <linux/mm.h>
31#include <linux/stat.h> 32#include <linux/stat.h>
32#include "ncplib_kernel.h" 33#include "ncplib_kernel.h"
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index b4ffd0146ea6..84690319e625 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -10,6 +10,7 @@
10#include <linux/moduleparam.h> 10#include <linux/moduleparam.h>
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/slab.h>
13#include <linux/sunrpc/cache.h> 14#include <linux/sunrpc/cache.h>
14#include <linux/sunrpc/rpc_pipe_fs.h> 15#include <linux/sunrpc/rpc_pipe_fs.h>
15 16
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 84761b5bb8e2..a08770a7e857 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8#include <linux/nfs4.h> 8#include <linux/nfs4.h>
9#include <linux/nfs_fs.h> 9#include <linux/nfs_fs.h>
10#include <linux/slab.h>
10#include "nfs4_fs.h" 11#include "nfs4_fs.h"
11#include "callback.h" 12#include "callback.h"
12#include "delegation.h" 13#include "delegation.h"
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index db30c0b398b5..05af212f0edf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,7 @@
9#include <linux/sunrpc/svc.h> 9#include <linux/sunrpc/svc.h>
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/slab.h>
12#include "nfs4_fs.h" 13#include "nfs4_fs.h"
13#include "callback.h" 14#include "callback.h"
14 15
@@ -782,6 +783,7 @@ struct svc_version nfs4_callback_version1 = {
782 .vs_proc = nfs4_callback_procedures1, 783 .vs_proc = nfs4_callback_procedures1,
783 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, 784 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
784 .vs_dispatch = NULL, 785 .vs_dispatch = NULL,
786 .vs_hidden = 1,
785}; 787};
786 788
787struct svc_version nfs4_callback_version4 = { 789struct svc_version nfs4_callback_version4 = {
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2274f1737336..a8766c4ef2e0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -35,6 +35,7 @@
35#include <linux/vfs.h> 35#include <linux/vfs.h>
36#include <linux/inet.h> 36#include <linux/inet.h>
37#include <linux/in6.h> 37#include <linux/in6.h>
38#include <linux/slab.h>
38#include <net/ipv6.h> 39#include <net/ipv6.h>
39#include <linux/nfs_xdr.h> 40#include <linux/nfs_xdr.h>
40#include <linux/sunrpc/bc_xprt.h> 41#include <linux/sunrpc/bc_xprt.h>
@@ -1293,7 +1294,8 @@ static int nfs4_init_server(struct nfs_server *server,
1293 1294
1294 /* Initialise the client representation from the mount data */ 1295 /* Initialise the client representation from the mount data */
1295 server->flags = data->flags; 1296 server->flags = data->flags;
1296 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; 1297 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
1298 NFS_CAP_POSIX_LOCK;
1297 server->options = data->options; 1299 server->options = data->options;
1298 1300
1299 /* Get a client record */ 1301 /* Get a client record */
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 2563bebc4c67..15671245c6ee 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h>
13#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
14#include <linux/spinlock.h> 15#include <linux/spinlock.h>
15 16
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 944b627ec6e1..69e7b8140122 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -71,4 +71,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode)
71} 71}
72#endif 72#endif
73 73
74static inline int nfs_have_delegated_attributes(struct inode *inode)
75{
76 return nfs_have_delegation(inode, FMODE_READ) &&
77 !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
78}
79
74#endif 80#endif
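The new nfs_have_delegated_attributes() helper folds the NFS_INO_REVAL_FORCED check into the old "do we hold a read delegation" test; the fs/nfs/dir.c and fs/nfs/inode.c hunks below switch their callers to it. A standalone model of the helper's logic, using stand-in types and a made-up flag value:

#include <stdbool.h>
#include <stdio.h>

#define NFS_INO_REVAL_FORCED 0x1	/* stand-in flag value */

struct inode_model {
	bool have_read_delegation;	/* nfs_have_delegation(inode, FMODE_READ) */
	unsigned long cache_validity;
};

static bool nfs_have_delegated_attributes(const struct inode_model *i)
{
	/* Attributes may be served from cache only while a read delegation
	 * is held and no forced revalidation is pending. */
	return i->have_read_delegation &&
	       !(i->cache_validity & NFS_INO_REVAL_FORCED);
}

int main(void)
{
	struct inode_model i = {
		.have_read_delegation = true,
		.cache_validity = NFS_INO_REVAL_FORCED,
	};
	/* prints 0: a forced revalidation overrides the delegation */
	printf("%d\n", nfs_have_delegated_attributes(&i));
	return 0;
}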
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a1f6b4438fb1..be46f26c9a56 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1025,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1025 res = NULL; 1025 res = NULL;
1026 goto out; 1026 goto out;
1027 /* This turned out not to be a regular file */ 1027 /* This turned out not to be a regular file */
1028 case -EISDIR:
1028 case -ENOTDIR: 1029 case -ENOTDIR:
1029 goto no_open; 1030 goto no_open;
1030 case -ELOOP: 1031 case -ELOOP:
1031 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1032 if (!(nd->intent.open.flags & O_NOFOLLOW))
1032 goto no_open; 1033 goto no_open;
1033 /* case -EISDIR: */
1034 /* case -EINVAL: */ 1034 /* case -EINVAL: */
1035 default: 1035 default:
1036 goto out; 1036 goto out;
@@ -1789,7 +1789,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
1789 cache = nfs_access_search_rbtree(inode, cred); 1789 cache = nfs_access_search_rbtree(inode, cred);
1790 if (cache == NULL) 1790 if (cache == NULL)
1791 goto out; 1791 goto out;
1792 if (!nfs_have_delegation(inode, FMODE_READ) && 1792 if (!nfs_have_delegated_attributes(inode) &&
1793 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 1793 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
1794 goto out_stale; 1794 goto out_stale;
1795 res->jiffies = cache->jiffies; 1795 res->jiffies = cache->jiffies;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0d289823e856..ad4cd31d6050 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -44,6 +44,7 @@
44#include <linux/file.h> 44#include <linux/file.h>
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h>
47 48
48#include <linux/nfs_fs.h> 49#include <linux/nfs_fs.h>
49#include <linux/nfs_page.h> 50#include <linux/nfs_page.h>
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 3f0cd4dfddaf..76fd235d0024 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -9,6 +9,7 @@
9#include <linux/hash.h> 9#include <linux/hash.h>
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/kmod.h> 11#include <linux/kmod.h>
12#include <linux/slab.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/socket.h> 14#include <linux/socket.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ae8d02294e46..8d965bddb87e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -24,9 +24,9 @@
24#include <linux/nfs_fs.h> 24#include <linux/nfs_fs.h>
25#include <linux/nfs_mount.h> 25#include <linux/nfs_mount.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/slab.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
29#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/gfp.h>
30 30
31#include <asm/uaccess.h> 31#include <asm/uaccess.h>
32#include <asm/system.h> 32#include <asm/system.h>
@@ -491,7 +491,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
491{ 491{
492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
493 493
494 if (gfp & __GFP_WAIT) 494 /* Only do I/O if gfp is a superset of GFP_KERNEL */
495 if ((gfp & GFP_KERNEL) == GFP_KERNEL)
495 nfs_wb_page(page->mapping->host, page); 496 nfs_wb_page(page->mapping->host, page);
496 /* If PagePrivate() is set, then the page is not freeable */ 497 /* If PagePrivate() is set, then the page is not freeable */
497 if (PagePrivate(page)) 498 if (PagePrivate(page))
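The nfs_release_page() hunk tightens the "may we start writeback" test: a single __GFP_WAIT bit no longer suffices; the allocation context must carry every bit of GFP_KERNEL (sleeping, block I/O, and filesystem re-entry). A compilable illustration of why the superset test is stricter, with illustrative flag values rather than the kernel's:

#include <stdio.h>

/* Illustrative flag values, not the kernel's. */
#define __GFP_WAIT 0x10u	/* may sleep */
#define __GFP_IO   0x40u	/* may start block I/O */
#define __GFP_FS   0x80u	/* may re-enter the filesystem */
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)

static int may_write_back(unsigned int gfp)
{
	/* I/O is allowed only if gfp is a superset of GFP_KERNEL. */
	return (gfp & GFP_KERNEL) == GFP_KERNEL;
}

int main(void)
{
	unsigned int gfp_noio = __GFP_WAIT;	/* may sleep, must not do I/O */

	/* old test: nonzero, so writeback would wrongly have been started */
	printf("old: %u\n", gfp_noio & __GFP_WAIT);
	/* new test: 0, writeback correctly refused in a no-I/O context */
	printf("new: %d\n", may_write_back(gfp_noio));
	return 0;
}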
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 237874f1af23..a6b16ed93229 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -17,6 +17,7 @@
17#include <linux/nfs_fs_sb.h> 17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h> 18#include <linux/in6.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/slab.h>
20 21
21#include "internal.h" 22#include "internal.h"
22#include "iostat.h" 23#include "iostat.h"
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 657201acda84..50a56edca0b5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h>
39 40
40#include <asm/system.h> 41#include <asm/system.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -622,10 +623,10 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
622 list_for_each_entry(pos, &nfsi->open_files, list) { 623 list_for_each_entry(pos, &nfsi->open_files, list) {
623 if (cred != NULL && pos->cred != cred) 624 if (cred != NULL && pos->cred != cred)
624 continue; 625 continue;
625 if ((pos->mode & mode) == mode) { 626 if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
626 ctx = get_nfs_open_context(pos); 627 continue;
627 break; 628 ctx = get_nfs_open_context(pos);
628 } 629 break;
629 } 630 }
630 spin_unlock(&inode->i_lock); 631 spin_unlock(&inode->i_lock);
631 return ctx; 632 return ctx;
@@ -729,7 +730,7 @@ int nfs_attribute_timeout(struct inode *inode)
729{ 730{
730 struct nfs_inode *nfsi = NFS_I(inode); 731 struct nfs_inode *nfsi = NFS_I(inode);
731 732
732 if (nfs_have_delegation(inode, FMODE_READ)) 733 if (nfs_have_delegated_attributes(inode))
733 return 0; 734 return 0;
734 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 735 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
735} 736}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 40c766782891..7888cf36022d 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/dcache.h> 10#include <linux/dcache.h>
11#include <linux/gfp.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/namei.h> 13#include <linux/namei.h>
13#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 7bc2da8efd4a..81cf14257916 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -12,7 +12,6 @@
12#include <linux/param.h> 12#include <linux/param.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/errno.h> 15#include <linux/errno.h>
17#include <linux/string.h> 16#include <linux/string.h>
18#include <linux/in.h> 17#include <linux/in.h>
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index bac60515a4b3..d150ae0c5ecd 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -1,4 +1,5 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/gfp.h>
2#include <linux/nfs.h> 3#include <linux/nfs.h>
3#include <linux/nfs3.h> 4#include <linux/nfs3.h>
4#include <linux/nfs_fs.h> 5#include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 24992f0a29f2..e701002694e5 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -10,6 +10,7 @@
10#include <linux/errno.h> 10#include <linux/errno.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/sunrpc/clnt.h> 12#include <linux/sunrpc/clnt.h>
13#include <linux/slab.h>
13#include <linux/nfs.h> 14#include <linux/nfs.h>
14#include <linux/nfs3.h> 15#include <linux/nfs3.h>
15#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 5fe5492fbd29..56a86f6ac8b5 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -9,7 +9,6 @@
9#include <linux/param.h> 9#include <linux/param.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
14#include <linux/string.h> 13#include <linux/string.h>
15#include <linux/in.h> 14#include <linux/in.h>
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index fa3408f20112..f071d12c613b 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
14#include <linux/slab.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
16#include <linux/vfs.h> 17#include <linux/vfs.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index eda74c42d552..638067007c65 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -39,6 +39,7 @@
39#include <linux/delay.h> 39#include <linux/delay.h>
40#include <linux/errno.h> 40#include <linux/errno.h>
41#include <linux/string.h> 41#include <linux/string.h>
42#include <linux/slab.h>
42#include <linux/sunrpc/clnt.h> 43#include <linux/sunrpc/clnt.h>
43#include <linux/nfs.h> 44#include <linux/nfs.h>
44#include <linux/nfs4.h> 45#include <linux/nfs4.h>
@@ -1522,6 +1523,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1522 nfs_post_op_update_inode(dir, o_res->dir_attr); 1523 nfs_post_op_update_inode(dir, o_res->dir_attr);
1523 } else 1524 } else
1524 nfs_refresh_inode(dir, o_res->dir_attr); 1525 nfs_refresh_inode(dir, o_res->dir_attr);
1526 if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
1527 server->caps &= ~NFS_CAP_POSIX_LOCK;
1525 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { 1528 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
1526 status = _nfs4_proc_open_confirm(data); 1529 status = _nfs4_proc_open_confirm(data);
1527 if (status != 0) 1530 if (status != 0)
@@ -1663,7 +1666,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1663 status = PTR_ERR(state); 1666 status = PTR_ERR(state);
1664 if (IS_ERR(state)) 1667 if (IS_ERR(state))
1665 goto err_opendata_put; 1668 goto err_opendata_put;
1666 if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0) 1669 if (server->caps & NFS_CAP_POSIX_LOCK)
1667 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); 1670 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
1668 nfs4_opendata_put(opendata); 1671 nfs4_opendata_put(opendata);
1669 nfs4_put_state_owner(sp); 1672 nfs4_put_state_owner(sp);
@@ -2067,8 +2070,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
2067 case -EDQUOT: 2070 case -EDQUOT:
2068 case -ENOSPC: 2071 case -ENOSPC:
2069 case -EROFS: 2072 case -EROFS:
2070 lookup_instantiate_filp(nd, (struct dentry *)state, NULL); 2073 return PTR_ERR(state);
2071 return 1;
2072 default: 2074 default:
2073 goto out_drop; 2075 goto out_drop;
2074 } 2076 }
@@ -5107,6 +5109,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
5107 res = kzalloc(sizeof(*res), GFP_KERNEL); 5109 res = kzalloc(sizeof(*res), GFP_KERNEL);
5108 if (!args || !res) { 5110 if (!args || !res) {
5109 kfree(args); 5111 kfree(args);
5112 kfree(res);
5110 nfs_put_client(clp); 5113 nfs_put_client(clp);
5111 return -ENOMEM; 5114 return -ENOMEM;
5112 } 5115 }
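The nfs41_proc_async_sequence() hunk plugs a leak: on a partial allocation failure only args was freed, so a successfully allocated res leaked whenever args was the allocation to fail. kfree(NULL), like free(NULL), is a defined no-op, so freeing both unconditionally is safe. A userspace model of the pattern:

#include <stdlib.h>

struct args { int a; };
struct res  { int r; };

static int alloc_pair(struct args **ap, struct res **rp)
{
	struct args *args = calloc(1, sizeof(*args));
	struct res *res = calloc(1, sizeof(*res));

	if (!args || !res) {
		free(args);	/* no-op if NULL, exactly like kfree() */
		free(res);	/* the added line: frees res when args
				 * failed, instead of leaking it */
		return -1;	/* -ENOMEM in the kernel */
	}
	*ap = args;
	*rp = res;
	return 0;
}

int main(void)
{
	struct args *a;
	struct res *r;

	if (alloc_pair(&a, &r) == 0) {
		free(a);
		free(r);
	}
	return 0;
}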
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4d338be492cb..38f3b582e7c2 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -38,7 +38,6 @@
38#include <linux/param.h> 38#include <linux/param.h>
39#include <linux/time.h> 39#include <linux/time.h>
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/slab.h>
42#include <linux/errno.h> 41#include <linux/errno.h>
43#include <linux/string.h> 42#include <linux/string.h>
44#include <linux/in.h> 43#include <linux/in.h>
@@ -5552,6 +5551,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
5552 if (status != 0) 5551 if (status != 0)
5553 goto out; 5552 goto out;
5554 status = decode_delegreturn(&xdr); 5553 status = decode_delegreturn(&xdr);
5554 if (status != 0)
5555 goto out;
5555 decode_getfattr(&xdr, res->fattr, res->server, 5556 decode_getfattr(&xdr, res->fattr, res->server,
5556 !RPC_IS_ASYNC(rqstp->rq_task)); 5557 !RPC_IS_ASYNC(rqstp->rq_task));
5557out: 5558out:
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a12c45b65dd4..29d9d36cd5f4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -112,12 +112,10 @@ void nfs_unlock_request(struct nfs_page *req)
112 */ 112 */
113int nfs_set_page_tag_locked(struct nfs_page *req) 113int nfs_set_page_tag_locked(struct nfs_page *req)
114{ 114{
115 struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
116
117 if (!nfs_lock_request_dontget(req)) 115 if (!nfs_lock_request_dontget(req))
118 return 0; 116 return 0;
119 if (req->wb_page != NULL) 117 if (req->wb_page != NULL)
120 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 118 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
121 return 1; 119 return 1;
122} 120}
123 121
@@ -126,10 +124,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
126 */ 124 */
127void nfs_clear_page_tag_locked(struct nfs_page *req) 125void nfs_clear_page_tag_locked(struct nfs_page *req)
128{ 126{
129 struct inode *inode = req->wb_context->path.dentry->d_inode;
130 struct nfs_inode *nfsi = NFS_I(inode);
131
132 if (req->wb_page != NULL) { 127 if (req->wb_page != NULL) {
128 struct inode *inode = req->wb_context->path.dentry->d_inode;
129 struct nfs_inode *nfsi = NFS_I(inode);
130
133 spin_lock(&inode->i_lock); 131 spin_lock(&inode->i_lock);
134 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 132 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
135 nfs_unlock_request(req); 133 nfs_unlock_request(req);
@@ -142,16 +140,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
142 * nfs_clear_request - Free up all resources allocated to the request 140 * nfs_clear_request - Free up all resources allocated to the request
143 * @req: 141 * @req:
144 * 142 *
145 * Release page resources associated with a write request after it 143 * Release page and open context resources associated with a read/write
146 * has completed. 144 * request after it has completed.
147 */ 145 */
148void nfs_clear_request(struct nfs_page *req) 146void nfs_clear_request(struct nfs_page *req)
149{ 147{
150 struct page *page = req->wb_page; 148 struct page *page = req->wb_page;
149 struct nfs_open_context *ctx = req->wb_context;
150
151 if (page != NULL) { 151 if (page != NULL) {
152 page_cache_release(page); 152 page_cache_release(page);
153 req->wb_page = NULL; 153 req->wb_page = NULL;
154 } 154 }
155 if (ctx != NULL) {
156 put_nfs_open_context(ctx);
157 req->wb_context = NULL;
158 }
155} 159}
156 160
157 161
@@ -165,9 +169,8 @@ static void nfs_free_request(struct kref *kref)
165{ 169{
166 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); 170 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
167 171
168 /* Release struct file or cached credential */ 172 /* Release struct file and open context */
169 nfs_clear_request(req); 173 nfs_clear_request(req);
170 put_nfs_open_context(req->wb_context);
171 nfs_page_free(req); 174 nfs_page_free(req);
172} 175}
173 176
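The fs/nfs/pagelist.c hunks consolidate resource teardown: nfs_clear_request() now releases the open context as well as the page, and NULLs each pointer so the function is idempotent, which lets nfs_free_request() drop its separate put_nfs_open_context() call. A standalone model with stand-in types:

#include <stdlib.h>

struct request {
	void *page;	/* stands in for req->wb_page */
	void *ctx;	/* stands in for req->wb_context */
};

static void clear_request(struct request *req)
{
	if (req->page) {
		free(req->page);
		req->page = NULL;
	}
	if (req->ctx) {		/* newly folded into the teardown */
		free(req->ctx);
		req->ctx = NULL;
	}
}

static void free_request(struct request *req)
{
	clear_request(req);	/* no separate context put needed here */
	free(req);
}

int main(void)
{
	struct request *req = calloc(1, sizeof(*req));

	req->page = malloc(16);
	req->ctx = malloc(16);
	free_request(req);	/* releases page, context, then the request */
	return 0;
}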
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c752d944fe9e..0288be80444f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/param.h> 31#include <linux/param.h>
32#include <linux/slab.h>
33#include <linux/time.h> 32#include <linux/time.h>
34#include <linux/mm.h> 33#include <linux/mm.h>
35#include <linux/errno.h> 34#include <linux/errno.h>
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1afee4eea77..e01637240eeb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -48,6 +48,7 @@
48#include <linux/vfs.h> 48#include <linux/vfs.h>
49#include <linux/inet.h> 49#include <linux/inet.h>
50#include <linux/in6.h> 50#include <linux/in6.h>
51#include <linux/slab.h>
51#include <net/ipv6.h> 52#include <net/ipv6.h>
52#include <linux/netdevice.h> 53#include <linux/netdevice.h>
53#include <linux/nfs_xdr.h> 54#include <linux/nfs_xdr.h>
@@ -2214,7 +2215,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2214 } else { 2215 } else {
2215 error = nfs_bdi_register(server); 2216 error = nfs_bdi_register(server);
2216 if (error) 2217 if (error)
2217 goto error_splat_super; 2218 goto error_splat_bdi;
2218 } 2219 }
2219 2220
2220 if (!s->s_root) { 2221 if (!s->s_root) {
@@ -2256,6 +2257,9 @@ out_err_nosb:
2256error_splat_root: 2257error_splat_root:
2257 dput(mntroot); 2258 dput(mntroot);
2258error_splat_super: 2259error_splat_super:
2260 if (server && !s->s_root)
2261 bdi_unregister(&server->backing_dev_info);
2262error_splat_bdi:
2259 deactivate_locked_super(s); 2263 deactivate_locked_super(s);
2260 goto out; 2264 goto out;
2261} 2265}
@@ -2326,7 +2330,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
2326 } else { 2330 } else {
2327 error = nfs_bdi_register(server); 2331 error = nfs_bdi_register(server);
2328 if (error) 2332 if (error)
2329 goto error_splat_super; 2333 goto error_splat_bdi;
2330 } 2334 }
2331 2335
2332 if (!s->s_root) { 2336 if (!s->s_root) {
@@ -2363,6 +2367,9 @@ out_err_noserver:
2363 return error; 2367 return error;
2364 2368
2365error_splat_super: 2369error_splat_super:
2370 if (server && !s->s_root)
2371 bdi_unregister(&server->backing_dev_info);
2372error_splat_bdi:
2366 deactivate_locked_super(s); 2373 deactivate_locked_super(s);
2367 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); 2374 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
2368 return error; 2375 return error;
@@ -2578,7 +2585,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2578 } else { 2585 } else {
2579 error = nfs_bdi_register(server); 2586 error = nfs_bdi_register(server);
2580 if (error) 2587 if (error)
2581 goto error_splat_super; 2588 goto error_splat_bdi;
2582 } 2589 }
2583 2590
2584 if (!s->s_root) { 2591 if (!s->s_root) {
@@ -2616,6 +2623,9 @@ out_free:
2616error_splat_root: 2623error_splat_root:
2617 dput(mntroot); 2624 dput(mntroot);
2618error_splat_super: 2625error_splat_super:
2626 if (server && !s->s_root)
2627 bdi_unregister(&server->backing_dev_info);
2628error_splat_bdi:
2619 deactivate_locked_super(s); 2629 deactivate_locked_super(s);
2620 goto out; 2630 goto out;
2621} 2631}
@@ -2811,7 +2821,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
2811 } else { 2821 } else {
2812 error = nfs_bdi_register(server); 2822 error = nfs_bdi_register(server);
2813 if (error) 2823 if (error)
2814 goto error_splat_super; 2824 goto error_splat_bdi;
2815 } 2825 }
2816 2826
2817 if (!s->s_root) { 2827 if (!s->s_root) {
@@ -2847,6 +2857,9 @@ out_err_noserver:
2847 return error; 2857 return error;
2848 2858
2849error_splat_super: 2859error_splat_super:
2860 if (server && !s->s_root)
2861 bdi_unregister(&server->backing_dev_info);
2862error_splat_bdi:
2850 deactivate_locked_super(s); 2863 deactivate_locked_super(s);
2851 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); 2864 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
2852 return error; 2865 return error;
@@ -2893,7 +2906,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2893 } else { 2906 } else {
2894 error = nfs_bdi_register(server); 2907 error = nfs_bdi_register(server);
2895 if (error) 2908 if (error)
2896 goto error_splat_super; 2909 goto error_splat_bdi;
2897 } 2910 }
2898 2911
2899 if (!s->s_root) { 2912 if (!s->s_root) {
@@ -2929,6 +2942,9 @@ out_err_noserver:
2929 return error; 2942 return error;
2930 2943
2931error_splat_super: 2944error_splat_super:
2945 if (server && !s->s_root)
2946 bdi_unregister(&server->backing_dev_info);
2947error_splat_bdi:
2932 deactivate_locked_super(s); 2948 deactivate_locked_super(s);
2933 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); 2949 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
2934 return error; 2950 return error;
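Each error_splat_bdi hunk in fs/nfs/super.c adds one rung to the unwind ladder: if nfs_bdi_register() succeeded for a freshly created superblock (!s->s_root) but a later step failed, the BDI must be unregistered before the superblock is deactivated. A standalone sketch of the layered goto-unwind pattern, with stub setup steps in place of the real ones:

#include <stdio.h>

static int register_bdi(void)	{ return 0; }
static void unregister_bdi(void){ puts("bdi unregistered"); }
static int fill_super(void)	{ return -1; }	/* force the failure path */
static void drop_super(void)	{ puts("super deactivated"); }

static int get_sb(void)
{
	int err = register_bdi();
	if (err)
		goto error_splat_bdi;	/* bdi never registered: skip undo */

	err = fill_super();
	if (err)
		goto error_splat_super;

	return 0;

error_splat_super:
	unregister_bdi();		/* undo the step that did succeed */
error_splat_bdi:
	drop_super();
	return err;
}

int main(void)
{
	return get_sb() ? 1 : 0;
}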
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 2ea9e5c27e55..05c9e02f4153 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -19,7 +19,6 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/stat.h> 20#include <linux/stat.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/slab.h>
23#include <linux/string.h> 22#include <linux/string.h>
24#include <linux/namei.h> 23#include <linux/namei.h>
25 24
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53ff70e23993..de38d63aa920 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -201,6 +201,7 @@ static int nfs_set_page_writeback(struct page *page)
201 struct inode *inode = page->mapping->host; 201 struct inode *inode = page->mapping->host;
202 struct nfs_server *nfss = NFS_SERVER(inode); 202 struct nfs_server *nfss = NFS_SERVER(inode);
203 203
204 page_cache_get(page);
204 if (atomic_long_inc_return(&nfss->writeback) > 205 if (atomic_long_inc_return(&nfss->writeback) >
205 NFS_CONGESTION_ON_THRESH) { 206 NFS_CONGESTION_ON_THRESH) {
206 set_bdi_congested(&nfss->backing_dev_info, 207 set_bdi_congested(&nfss->backing_dev_info,
@@ -216,6 +217,7 @@ static void nfs_end_page_writeback(struct page *page)
216 struct nfs_server *nfss = NFS_SERVER(inode); 217 struct nfs_server *nfss = NFS_SERVER(inode);
217 218
218 end_page_writeback(page); 219 end_page_writeback(page);
220 page_cache_release(page);
219 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) 221 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
220 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); 222 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
221} 223}
@@ -421,6 +423,7 @@ static void
421nfs_mark_request_dirty(struct nfs_page *req) 423nfs_mark_request_dirty(struct nfs_page *req)
422{ 424{
423 __set_page_dirty_nobuffers(req->wb_page); 425 __set_page_dirty_nobuffers(req->wb_page);
426 __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
424} 427}
425 428
426#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 429#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -660,9 +663,11 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
660 req = nfs_setup_write_request(ctx, page, offset, count); 663 req = nfs_setup_write_request(ctx, page, offset, count);
661 if (IS_ERR(req)) 664 if (IS_ERR(req))
662 return PTR_ERR(req); 665 return PTR_ERR(req);
666 nfs_mark_request_dirty(req);
663 /* Update file length */ 667 /* Update file length */
664 nfs_grow_file(page, offset, count); 668 nfs_grow_file(page, offset, count);
665 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); 669 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
670 nfs_mark_request_dirty(req);
666 nfs_clear_page_tag_locked(req); 671 nfs_clear_page_tag_locked(req);
667 return 0; 672 return 0;
668} 673}
@@ -739,8 +744,6 @@ int nfs_updatepage(struct file *file, struct page *page,
739 status = nfs_writepage_setup(ctx, page, offset, count); 744 status = nfs_writepage_setup(ctx, page, offset, count);
740 if (status < 0) 745 if (status < 0)
741 nfs_set_pageerror(page); 746 nfs_set_pageerror(page);
742 else
743 __set_page_dirty_nobuffers(page);
744 747
745 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", 748 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
746 status, (long long)i_size_read(inode)); 749 status, (long long)i_size_read(inode));
@@ -749,13 +752,12 @@ int nfs_updatepage(struct file *file, struct page *page,
749 752
750static void nfs_writepage_release(struct nfs_page *req) 753static void nfs_writepage_release(struct nfs_page *req)
751{ 754{
755 struct page *page = req->wb_page;
752 756
753 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) { 757 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req))
754 nfs_end_page_writeback(req->wb_page);
755 nfs_inode_remove_request(req); 758 nfs_inode_remove_request(req);
756 } else
757 nfs_end_page_writeback(req->wb_page);
758 nfs_clear_page_tag_locked(req); 759 nfs_clear_page_tag_locked(req);
760 nfs_end_page_writeback(page);
759} 761}
760 762
761static int flush_task_priority(int how) 763static int flush_task_priority(int how)
@@ -779,7 +781,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
779 int how) 781 int how)
780{ 782{
781 struct inode *inode = req->wb_context->path.dentry->d_inode; 783 struct inode *inode = req->wb_context->path.dentry->d_inode;
782 int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
783 int priority = flush_task_priority(how); 784 int priority = flush_task_priority(how);
784 struct rpc_task *task; 785 struct rpc_task *task;
785 struct rpc_message msg = { 786 struct rpc_message msg = {
@@ -794,9 +795,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
794 .callback_ops = call_ops, 795 .callback_ops = call_ops,
795 .callback_data = data, 796 .callback_data = data,
796 .workqueue = nfsiod_workqueue, 797 .workqueue = nfsiod_workqueue,
797 .flags = flags, 798 .flags = RPC_TASK_ASYNC,
798 .priority = priority, 799 .priority = priority,
799 }; 800 };
801 int ret = 0;
800 802
801 /* Set up the RPC argument and reply structs 803 /* Set up the RPC argument and reply structs
802 * NB: take care not to mess about with data->commit et al. */ 804 * NB: take care not to mess about with data->commit et al. */
@@ -835,10 +837,18 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
835 (unsigned long long)data->args.offset); 837 (unsigned long long)data->args.offset);
836 838
837 task = rpc_run_task(&task_setup_data); 839 task = rpc_run_task(&task_setup_data);
838 if (IS_ERR(task)) 840 if (IS_ERR(task)) {
839 return PTR_ERR(task); 841 ret = PTR_ERR(task);
842 goto out;
843 }
844 if (how & FLUSH_SYNC) {
845 ret = rpc_wait_for_completion_task(task);
846 if (ret == 0)
847 ret = task->tk_status;
848 }
840 rpc_put_task(task); 849 rpc_put_task(task);
841 return 0; 850out:
851 return ret;
842} 852}
843 853
844/* If a nfs_flush_* function fails, it should remove reqs from @head and 854/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -847,9 +857,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
847 */ 857 */
848static void nfs_redirty_request(struct nfs_page *req) 858static void nfs_redirty_request(struct nfs_page *req)
849{ 859{
860 struct page *page = req->wb_page;
861
850 nfs_mark_request_dirty(req); 862 nfs_mark_request_dirty(req);
851 nfs_end_page_writeback(req->wb_page);
852 nfs_clear_page_tag_locked(req); 863 nfs_clear_page_tag_locked(req);
864 nfs_end_page_writeback(page);
853} 865}
854 866
855/* 867/*
@@ -1084,16 +1096,15 @@ static void nfs_writeback_release_full(void *calldata)
1084 if (nfs_write_need_commit(data)) { 1096 if (nfs_write_need_commit(data)) {
1085 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); 1097 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1086 nfs_mark_request_commit(req); 1098 nfs_mark_request_commit(req);
1087 nfs_end_page_writeback(page);
1088 dprintk(" marked for commit\n"); 1099 dprintk(" marked for commit\n");
1089 goto next; 1100 goto next;
1090 } 1101 }
1091 dprintk(" OK\n"); 1102 dprintk(" OK\n");
1092remove_request: 1103remove_request:
1093 nfs_end_page_writeback(page);
1094 nfs_inode_remove_request(req); 1104 nfs_inode_remove_request(req);
1095 next: 1105 next:
1096 nfs_clear_page_tag_locked(req); 1106 nfs_clear_page_tag_locked(req);
1107 nfs_end_page_writeback(page);
1097 } 1108 }
1098 nfs_writedata_release(calldata); 1109 nfs_writedata_release(calldata);
1099} 1110}
@@ -1207,7 +1218,6 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1207{ 1218{
1208 struct nfs_page *first = nfs_list_entry(head->next); 1219 struct nfs_page *first = nfs_list_entry(head->next);
1209 struct inode *inode = first->wb_context->path.dentry->d_inode; 1220 struct inode *inode = first->wb_context->path.dentry->d_inode;
1210 int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
1211 int priority = flush_task_priority(how); 1221 int priority = flush_task_priority(how);
1212 struct rpc_task *task; 1222 struct rpc_task *task;
1213 struct rpc_message msg = { 1223 struct rpc_message msg = {
@@ -1222,7 +1232,7 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1222 .callback_ops = &nfs_commit_ops, 1232 .callback_ops = &nfs_commit_ops,
1223 .callback_data = data, 1233 .callback_data = data,
1224 .workqueue = nfsiod_workqueue, 1234 .workqueue = nfsiod_workqueue,
1225 .flags = flags, 1235 .flags = RPC_TASK_ASYNC,
1226 .priority = priority, 1236 .priority = priority,
1227 }; 1237 };
1228 1238
@@ -1252,6 +1262,8 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1252 task = rpc_run_task(&task_setup_data); 1262 task = rpc_run_task(&task_setup_data);
1253 if (IS_ERR(task)) 1263 if (IS_ERR(task))
1254 return PTR_ERR(task); 1264 return PTR_ERR(task);
1265 if (how & FLUSH_SYNC)
1266 rpc_wait_for_completion_task(task);
1255 rpc_put_task(task); 1267 rpc_put_task(task);
1256 return 0; 1268 return 0;
1257} 1269}
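The fs/nfs/write.c hunks stop asking the RPC layer for a synchronous task: writes and commits are always dispatched RPC_TASK_ASYNC, and a FLUSH_SYNC caller now waits explicitly via rpc_wait_for_completion_task(), picking up task->tk_status afterwards. A userspace model of "always async, join only when synchronous behaviour was requested", with pthreads standing in for the RPC engine (build with -lpthread):

#include <pthread.h>
#include <stdio.h>

#define FLUSH_SYNC 0x1

static void *rpc_task(void *arg)
{
	(void)arg;
	puts("write RPC ran");
	return NULL;
}

static int write_rpcsetup(int how)
{
	pthread_t task;
	int ret = pthread_create(&task, NULL, rpc_task, NULL);

	if (ret)
		return ret;			/* like IS_ERR(task) */
	if (how & FLUSH_SYNC)
		ret = pthread_join(task, NULL);	/* wait for completion */
	else
		pthread_detach(task);		/* fire and forget */
	return ret;
}

int main(void)
{
	return write_rpcsetup(FLUSH_SYNC);
}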
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 04133aacb1e5..fc1c52571c03 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/gfp.h>
25#include <linux/sunrpc/xdr.h> 26#include <linux/sunrpc/xdr.h>
26#include <linux/nfsacl.h> 27#include <linux/nfsacl.h>
27#include <linux/nfs3.h> 28#include <linux/nfs3.h>
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a0c4016413f1..872a5ef550c7 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -12,6 +12,7 @@
12 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> 12 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
13 */ 13 */
14 14
15#include <linux/slab.h>
15#include <linux/namei.h> 16#include <linux/namei.h>
16#include <linux/module.h> 17#include <linux/module.h>
17#include <linux/exportfs.h> 18#include <linux/exportfs.h>
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index f20589d2ae27..6aa5590c3679 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -7,6 +7,7 @@
7#include "nfsd.h" 7#include "nfsd.h"
8/* FIXME: nfsacl.h is a broken header */ 8/* FIXME: nfsacl.h is a broken header */
9#include <linux/nfsacl.h> 9#include <linux/nfsacl.h>
10#include <linux/gfp.h>
10#include "cache.h" 11#include "cache.h"
11#include "xdr3.h" 12#include "xdr3.h"
12#include "vfs.h" 13#include "vfs.h"
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index e0c4846bad92..a596e9d987e4 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -7,6 +7,7 @@
7#include "nfsd.h" 7#include "nfsd.h"
8/* FIXME: nfsacl.h is a broken header */ 8/* FIXME: nfsacl.h is a broken header */
9#include <linux/nfsacl.h> 9#include <linux/nfsacl.h>
10#include <linux/gfp.h>
10#include "cache.h" 11#include "cache.h"
11#include "xdr3.h" 12#include "xdr3.h"
12#include "vfs.h" 13#include "vfs.h"
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 88150685df34..e48052615159 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -34,6 +34,7 @@
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36 36
37#include <linux/slab.h>
37#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
38#include <linux/nfs4_acl.h> 39#include <linux/nfs4_acl.h>
39 40
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4bc22c763de7..7e32bd394e86 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,7 @@
32 */ 32 */
33 33
34#include <linux/sunrpc/clnt.h> 34#include <linux/sunrpc/clnt.h>
35#include <linux/slab.h>
35#include "nfsd.h" 36#include "nfsd.h"
36#include "state.h" 37#include "state.h"
37 38
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6e2983b27f3c..c78dbf493424 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
36#include <linux/nfsd_idmap.h> 36#include <linux/nfsd_idmap.h>
37#include <linux/seq_file.h> 37#include <linux/seq_file.h>
38#include <linux/sched.h> 38#include <linux/sched.h>
39#include <linux/slab.h>
39 40
40/* 41/*
41 * Cache entry 42 * Cache entry
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 37514c469846..2ab9e8501bfe 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -33,6 +33,7 @@
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */ 34 */
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/slab.h>
36 37
37#include "cache.h" 38#include "cache.h"
38#include "xdr4.h" 39#include "xdr4.h"
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 98fb98e330b4..7a9ae3254a4b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,6 +32,7 @@
32*/ 32*/
33 33
34#include <linux/file.h> 34#include <linux/file.h>
35#include <linux/slab.h>
35#include <linux/namei.h> 36#include <linux/namei.h>
36#include <linux/crypto.h> 37#include <linux/crypto.h>
37#include <linux/sched.h> 38#include <linux/sched.h>
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c97fddbd17db..6a8fedaa4f55 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -34,6 +34,7 @@
34 34
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/slab.h>
37#include <linux/namei.h> 38#include <linux/namei.h>
38#include <linux/swap.h> 39#include <linux/swap.h>
39#include <linux/sunrpc/svcauth_gss.h> 40#include <linux/sunrpc/svcauth_gss.h>
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c47b4d7bafa7..e1703175ee28 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -40,6 +40,7 @@
40 * at the end of nfs4svc_decode_compoundargs. 40 * at the end of nfs4svc_decode_compoundargs.
41 */ 41 */
42 42
43#include <linux/slab.h>
43#include <linux/namei.h> 44#include <linux/namei.h>
44#include <linux/statfs.h> 45#include <linux/statfs.h>
45#include <linux/utsname.h> 46#include <linux/utsname.h>
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index da08560c4818..4666a209678a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -8,6 +8,8 @@
8 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 8 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
9 */ 9 */
10 10
11#include <linux/slab.h>
12
11#include "nfsd.h" 13#include "nfsd.h"
12#include "cache.h" 14#include "cache.h"
13 15
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 0f0e77f2012f..e3591073098f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -4,6 +4,7 @@
4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
5 */ 5 */
6 6
7#include <linux/slab.h>
7#include <linux/namei.h> 8#include <linux/namei.h>
8#include <linux/ctype.h> 9#include <linux/ctype.h>
9 10
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a11b0e8678ee..6dd5f1970e01 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -25,6 +25,7 @@
25#include <linux/xattr.h> 25#include <linux/xattr.h>
26#include <linux/jhash.h> 26#include <linux/jhash.h>
27#include <linux/ima.h> 27#include <linux/ima.h>
28#include <linux/slab.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include <linux/exportfs.h> 30#include <linux/exportfs.h>
30#include <linux/writeback.h> 31#include <linux/writeback.h>
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 3f959f1879d8..7cfb87e692da 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -26,6 +26,7 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/bitops.h> 28#include <linux/bitops.h>
29#include <linux/slab.h>
29#include "mdt.h" 30#include "mdt.h"
30#include "alloc.h" 31#include "alloc.h"
31 32
@@ -425,7 +426,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
425 bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); 426 bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
426 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), 427 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
427 group_offset, bitmap)) 428 group_offset, bitmap))
428 printk(KERN_WARNING "%s: entry numer %llu already freed\n", 429 printk(KERN_WARNING "%s: entry number %llu already freed\n",
429 __func__, (unsigned long long)req->pr_entry_nr); 430 __func__, (unsigned long long)req->pr_entry_nr);
430 431
431 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 432 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index f4543ac4f560..5cccf874d692 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -42,7 +42,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
42 const struct buffer_head *, void *); 42 const struct buffer_head *, void *);
43 43
44/** 44/**
45 * nilfs_palloc_req - persistent alloctor request and reply 45 * nilfs_palloc_req - persistent allocator request and reply
46 * @pr_entry_nr: entry number (vblocknr or inode number) 46 * @pr_entry_nr: entry number (vblocknr or inode number)
47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors 47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors
48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap 48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 471e269536ae..447ce47a3306 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -27,6 +27,7 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/gfp.h>
30#include "nilfs.h" 31#include "nilfs.h"
31#include "mdt.h" 32#include "mdt.h"
32#include "dat.h" 33#include "dat.h"
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 7cdd98b8d514..76c38e3e19d2 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1879,7 +1879,7 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1879 struct nilfs_btree_path *path, 1879 struct nilfs_btree_path *path,
1880 int level, struct buffer_head *bh) 1880 int level, struct buffer_head *bh)
1881{ 1881{
1882 int maxlevel, ret; 1882 int maxlevel = 0, ret;
1883 struct nilfs_btree_node *parent; 1883 struct nilfs_btree_node *parent;
1884 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1884 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
1885 __u64 ptr; 1885 __u64 ptr;
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 9d1e5de91afb..013146755683 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -288,7 +288,7 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
288 * @vblocknrs and @nitems. 288 * @vblocknrs and @nitems.
289 * 289 *
290 * Return Value: On success, 0 is returned. On error, one of the following 290 * Return Value: On success, 0 is returned. On error, one of the following
291 * nagative error codes is returned. 291 * negative error codes is returned.
292 * 292 *
293 * %-EIO - I/O error. 293 * %-EIO - I/O error.
294 * 294 *
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 0092840492ee..85c89dfc71f0 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -396,7 +396,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
396 /* next page is past the blocks we've got */ 396 /* next page is past the blocks we've got */
397 if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { 397 if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
398 nilfs_error(dir->i_sb, __func__, 398 nilfs_error(dir->i_sb, __func__,
399 "dir %lu size %lld exceeds block cout %llu", 399 "dir %lu size %lld exceeds block count %llu",
400 dir->i_ino, dir->i_size, 400 dir->i_ino, dir->i_size,
401 (unsigned long long)dir->i_blocks); 401 (unsigned long long)dir->i_blocks);
402 goto out; 402 goto out;
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e16a6664dfa2..145f03cd7d3e 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,10 +28,10 @@
28 * gcinodes), and this file provides lookup function of the dummy 28 * gcinodes), and this file provides lookup function of the dummy
29 * inodes and their buffer read function. 29 * inodes and their buffer read function.
30 * 30 *
31 * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it 31 * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
32 * has to treat blocks that belong to a same file but have different 32 * has to treat blocks that belong to a same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy 33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separatly from actual inodes, and their lookup 34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) is designed to be specified with a 35 * function (nilfs_gc_iget) is designed to be specified with a
36 * checkpoint number argument as well as an inode number. 36 * checkpoint number argument as well as an inode number.
37 * 37 *
@@ -45,6 +45,7 @@
45#include <linux/buffer_head.h> 45#include <linux/buffer_head.h>
46#include <linux/mpage.h> 46#include <linux/mpage.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/slab.h>
48#include <linux/swap.h> 49#include <linux/swap.h>
49#include "nilfs.h" 50#include "nilfs.h"
50#include "page.h" 51#include "page.h"
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7868cc122ac7..0957b58f909d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/buffer_head.h> 24#include <linux/buffer_head.h>
25#include <linux/gfp.h>
25#include <linux/mpage.h> 26#include <linux/mpage.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
27#include <linux/uio.h> 28#include <linux/uio.h>
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 313d0a21da48..f90a33d9a5b0 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -23,6 +23,7 @@
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/wait.h> 24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */ 25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/slab.h>
26#include <linux/capability.h> /* capable() */ 27#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 28#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
@@ -648,7 +649,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
648long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 649long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
649{ 650{
650 struct inode *inode = filp->f_dentry->d_inode; 651 struct inode *inode = filp->f_dentry->d_inode;
651 void __user *argp = (void * __user *)arg; 652 void __user *argp = (void __user *)arg;
652 653
653 switch (cmd) { 654 switch (cmd) {
654 case NILFS_IOCTL_CHANGE_CPMODE: 655 case NILFS_IOCTL_CHANGE_CPMODE:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 06713ffcc7f2..024be8c35bb6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -26,6 +26,7 @@
26#include <linux/writeback.h> 26#include <linux/writeback.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/swap.h> 28#include <linux/swap.h>
29#include <linux/slab.h>
29#include "nilfs.h" 30#include "nilfs.h"
30#include "segment.h" 31#include "segment.h"
31#include "page.h" 32#include "page.h"
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a2692bbc7b50..8de3e1e48130 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -29,6 +29,7 @@
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/pagevec.h> 31#include <linux/pagevec.h>
32#include <linux/gfp.h>
32#include "nilfs.h" 33#include "nilfs.h"
33#include "page.h" 34#include "page.h"
34#include "mdt.h" 35#include "mdt.h"
@@ -292,7 +293,7 @@ void nilfs_free_private_page(struct page *page)
292 * @src: source page 293 * @src: source page
293 * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. 294 * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
294 * 295 *
295 * This fuction is for both data pages and btnode pages. The dirty flag 296 * This function is for both data pages and btnode pages. The dirty flag
296 * should be treated by caller. The page must not be under i/o. 297 * should be treated by caller. The page must not be under i/o.
297 * Both src and dst page must be locked 298 * Both src and dst page must be locked
298 */ 299 */
@@ -388,7 +389,7 @@ repeat:
388} 389}
389 390
390/** 391/**
391 * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache 392 * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
392 * @dmap: destination page cache 393 * @dmap: destination page cache
393 * @smap: source page cache 394 * @smap: source page cache
394 * 395 *
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 017bedc761a0..ba43146f3c30 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -23,6 +23,7 @@
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/swap.h> 25#include <linux/swap.h>
26#include <linux/slab.h>
26#include <linux/crc32.h> 27#include <linux/crc32.h>
27#include "nilfs.h" 28#include "nilfs.h"
28#include "segment.h" 29#include "segment.h"
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index ab56fe44e377..17851f77f739 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -25,6 +25,7 @@
25#include <linux/writeback.h> 25#include <linux/writeback.h>
26#include <linux/crc32.h> 26#include <linux/crc32.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/slab.h>
28#include "page.h" 29#include "page.h"
29#include "segbuf.h" 30#include "segbuf.h"
30 31
@@ -32,7 +33,7 @@
32struct nilfs_write_info { 33struct nilfs_write_info {
33 struct the_nilfs *nilfs; 34 struct the_nilfs *nilfs;
34 struct bio *bio; 35 struct bio *bio;
35 int start, end; /* The region to be submitted */ 36 int start, end; /* The region to be submitted */
36 int rest_blocks; 37 int rest_blocks;
37 int max_pages; 38 int max_pages;
38 int nr_vecs; 39 int nr_vecs;
@@ -174,7 +175,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
174} 175}
175 176
176/* 177/*
177 * Setup segument summary 178 * Setup segment summary
178 */ 179 */
179void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) 180void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
180{ 181{
@@ -323,14 +324,14 @@ int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
323int nilfs_wait_on_logs(struct list_head *logs) 324int nilfs_wait_on_logs(struct list_head *logs)
324{ 325{
325 struct nilfs_segment_buffer *segbuf; 326 struct nilfs_segment_buffer *segbuf;
326 int err; 327 int err, ret = 0;
327 328
328 list_for_each_entry(segbuf, logs, sb_list) { 329 list_for_each_entry(segbuf, logs, sb_list) {
329 err = nilfs_segbuf_wait(segbuf); 330 err = nilfs_segbuf_wait(segbuf);
330 if (err) 331 if (err && !ret)
331 return err; 332 ret = err;
332 } 333 }
333 return 0; 334 return ret;
334} 335}
335 336
336/* 337/*
@@ -470,8 +471,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
470 * 471 *
471 * %-ENOMEM - Insufficient memory available. 472 * %-ENOMEM - Insufficient memory available.
472 */ 473 */
473int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, 474static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
474 struct the_nilfs *nilfs) 475 struct the_nilfs *nilfs)
475{ 476{
476 struct nilfs_write_info wi; 477 struct nilfs_write_info wi;
477 struct buffer_head *bh; 478 struct buffer_head *bh;
@@ -514,7 +515,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
514 * 515 *
515 * %-EIO - I/O error 516 * %-EIO - I/O error
516 */ 517 */
517int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) 518static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
518{ 519{
519 int err = 0; 520 int err = 0;
520 521
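The nilfs_wait_on_logs() hunk above fixes an early return that could leave later segment buffers with I/O still in flight: the loop now waits on every buffer and reports only the first error seen. A standalone model of the wait-them-all pattern (the -5 is a stand-in for -EIO):

#include <stdio.h>

static int wait_one(int i)
{
	return i == 1 ? -5 : 0;	/* buffer #1 fails with a stand-in -EIO */
}

static int wait_on_logs(int n)
{
	int err, ret = 0;
	int i;

	for (i = 0; i < n; i++) {
		err = wait_one(i);
		if (err && !ret)
			ret = err;	/* record the first error, keep going */
	}
	return ret;		/* every buffer has been waited on */
}

int main(void)
{
	printf("ret=%d\n", wait_on_logs(3));	/* -5, after all three waits */
	return 0;
}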
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index ada2f1b947a3..6a7dbd8451db 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -32,6 +32,7 @@
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/pagevec.h> 34#include <linux/pagevec.h>
35#include <linux/slab.h>
35#include "nilfs.h" 36#include "nilfs.h"
36#include "btnode.h" 37#include "btnode.h"
37#include "page.h" 38#include "page.h"
@@ -141,7 +142,7 @@ int nilfs_init_transaction_cache(void)
141} 142}
142 143
143/** 144/**
144 * nilfs_detroy_transaction_cache - destroy the cache for transaction info 145 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 * 146 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct 147 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info. 148 * nilfs_transaction_info.
@@ -201,7 +202,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
201 * This function allocates a nilfs_transaction_info struct to keep context 202 * This function allocates a nilfs_transaction_info struct to keep context
202 * information on it. It is initialized and hooked onto the current task in 203 * information on it. It is initialized and hooked onto the current task in
203 * the outermost call. If a pre-allocated struct is given to @ti, it is used 204 * the outermost call. If a pre-allocated struct is given to @ti, it is used
204 * instead; othewise a new struct is assigned from a slab. 205 * instead; otherwise a new struct is assigned from a slab.
205 * 206 *
206 * When @vacancy_check flag is set, this function will check the amount of 207 * When @vacancy_check flag is set, this function will check the amount of
207 * free space, and will wait for the GC to reclaim disk space if low capacity. 208 * free space, and will wait for the GC to reclaim disk space if low capacity.
@@ -1510,6 +1511,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1510 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) 1511 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1511 break; 1512 break;
1512 1513
1514 nilfs_clear_logs(&sci->sc_segbufs);
1515
1516 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1517 if (unlikely(err))
1518 return err;
1519
1513 if (sci->sc_stage.flags & NILFS_CF_SUFREED) { 1520 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1514 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, 1521 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1515 sci->sc_freesegs, 1522 sci->sc_freesegs,
@@ -1517,12 +1524,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1517 NULL); 1524 NULL);
1518 WARN_ON(err); /* do not happen */ 1525 WARN_ON(err); /* do not happen */
1519 } 1526 }
1520 nilfs_clear_logs(&sci->sc_segbufs);
1521
1522 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1523 if (unlikely(err))
1524 return err;
1525
1526 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); 1527 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1527 sci->sc_stage = prev_stage; 1528 sci->sc_stage = prev_stage;
1528 } 1529 }
@@ -1897,8 +1898,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1897 1898
1898 list_splice_tail_init(&sci->sc_write_logs, &logs); 1899 list_splice_tail_init(&sci->sc_write_logs, &logs);
1899 ret = nilfs_wait_on_logs(&logs); 1900 ret = nilfs_wait_on_logs(&logs);
1900 if (ret) 1901 nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err);
1901 nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
1902 1902
1903 list_splice_tail_init(&sci->sc_segbufs, &logs); 1903 list_splice_tail_init(&sci->sc_segbufs, &logs);
1904 nilfs_cancel_segusage(&logs, nilfs->ns_sufile); 1904 nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -2214,7 +2214,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2214} 2214}
2215 2215
2216/** 2216/**
2217 * nilfs_secgtor_start_timer - set timer of background write 2217 * nilfs_segctor_start_timer - set timer of background write
2218 * @sci: nilfs_sc_info 2218 * @sci: nilfs_sc_info
2219 * 2219 *
2220 * If the timer has already been set, it ignores the new request. 2220 * If the timer has already been set, it ignores the new request.
@@ -2854,7 +2854,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2854 * @sbi: nilfs_sb_info 2854 * @sbi: nilfs_sb_info
2855 * 2855 *
2856 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2856 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2857 * initilizes it, and starts the segment constructor. 2857 * initializes it, and starts the segment constructor.
2858 * 2858 *
2859 * Return Value: On success, 0 is returned. On error, one of the following 2859 * Return Value: On success, 0 is returned. On error, one of the following
2860 * negative error code is returned. 2860 * negative error code is returned.
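
The nilfs_segctor_abort_construction() hunk relies on the GNU C conditional with an omitted middle operand: "a ?: b" evaluates a once and yields it when nonzero, otherwise b, so "ret ? : err" forwards the wait status when set and falls back to the construction error. A small sketch of the operator (a GCC/Clang extension, shown in userspace C):

        #include <stdio.h>

        int main(void)
        {
                int ret = 0, err = -12;

                /* "ret ?: err" is shorthand for "ret ? ret : err",
                 * with ret evaluated only once. */
                printf("%d\n", ret ?: err); /* prints -12: ret is zero */
                ret = -5;
                printf("%d\n", ret ?: err); /* prints -5: ret wins */
                return 0;
        }
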
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 3155e0c7f415..82dfd6a686b9 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -30,7 +30,7 @@
30#include "sb.h" 30#include "sb.h"
31 31
32/** 32/**
33 * struct nilfs_recovery_info - Recovery infomation 33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status 34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root 35 * @ri_super_root: Block number of the last super root
36 * @ri_ri_cno: Number of the last checkpoint 36 * @ri_ri_cno: Number of the last checkpoint
@@ -71,7 +71,7 @@ struct nilfs_recovery_info {
71 */ 71 */
72struct nilfs_cstage { 72struct nilfs_cstage {
73 int scnt; 73 int scnt;
74 unsigned flags; 74 unsigned flags;
75 struct nilfs_inode_info *dirty_file_ptr; 75 struct nilfs_inode_info *dirty_file_ptr;
76 struct nilfs_inode_info *gc_inode_ptr; 76 struct nilfs_inode_info *gc_inode_ptr;
77}; 77};
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index b6c36d0cc331..3c6cc6005c2e 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,7 +18,7 @@
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 * 19 *
20 * Written by Koji Sato <koji@osrg.net>. 20 * Written by Koji Sato <koji@osrg.net>.
21 * Rivised by Ryusuke Konishi <ryusuke@osrg.net>. 21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
22 */ 22 */
23 23
24#include <linux/kernel.h> 24#include <linux/kernel.h>
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 92579cc4c935..0cdbc5e7655a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -436,7 +436,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
436 /* 436 /*
437 * Compute the overhead 437 * Compute the overhead
438 * 438 *
439 * When distributing meta data blocks outside semgent structure, 439 * When distributing meta data blocks outside segment structure,
440 * We must count them as the overhead. 440 * We must count them as the overhead.
441 */ 441 */
442 overhead = 0; 442 overhead = 0;
@@ -866,7 +866,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
866 if ((*flags & MS_RDONLY) && 866 if ((*flags & MS_RDONLY) &&
867 sbi->s_snapshot_cno != old_opts.snapshot_cno) { 867 sbi->s_snapshot_cno != old_opts.snapshot_cno) {
868 printk(KERN_WARNING "NILFS (device %s): couldn't " 868 printk(KERN_WARNING "NILFS (device %s): couldn't "
869 "remount to a different snapshot. \n", 869 "remount to a different snapshot.\n",
870 sb->s_id); 870 sb->s_id);
871 err = -EINVAL; 871 err = -EINVAL;
872 goto restore_opts; 872 goto restore_opts;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 92733d5651d2..33871f7e4f01 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -386,7 +386,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
386 386
387 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); 387 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
388 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { 388 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
389 printk(KERN_ERR "NILFS: too short segment. \n"); 389 printk(KERN_ERR "NILFS: too short segment.\n");
390 return -EINVAL; 390 return -EINVAL;
391 } 391 }
392 392
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index e9795f1724d7..1ab974533697 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/slab.h>
32#include "sb.h" 33#include "sb.h"
33 34
34/* the_nilfs struct */ 35/* the_nilfs struct */
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 037e878e03fc..fcc2f064af83 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/dcache.h> 19#include <linux/dcache.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/gfp.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/srcu.h> 24#include <linux/srcu.h>
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3165d85aada2..0399bcbe09c8 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -87,7 +87,6 @@
87#include <linux/kernel.h> 87#include <linux/kernel.h>
88#include <linux/module.h> 88#include <linux/module.h>
89#include <linux/mutex.h> 89#include <linux/mutex.h>
90#include <linux/slab.h>
91#include <linux/spinlock.h> 90#include <linux/spinlock.h>
92#include <linux/writeback.h> /* for inode_lock */ 91#include <linux/writeback.h> /* for inode_lock */
93 92
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfce53cb65d7..c3c2c7ac9020 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/gfp.h>
26#include <linux/mm.h> 27#include <linux/mm.h>
27#include <linux/pagemap.h> 28#include <linux/pagemap.h>
28#include <linux/swap.h> 29#include <linux/swap.h>
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 50d3b0c258e3..f5094ee224c1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
27 28
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 08f7530e9341..6551c7cbad92 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,7 @@
25#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/slab.h>
28 29
29#include "attrib.h" 30#include "attrib.h"
30#include "inode.h" 31#include "inode.h"
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 9173e82a45d1..fe44d3feee4a 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24 25
25#include "dir.h" 26#include "dir.h"
26#include "aops.h" 27#include "aops.h"
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index b681c71d7069..8804f093ba75 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/gfp.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/pagevec.h> 25#include <linux/pagevec.h>
25#include <linux/sched.h> 26#include <linux/sched.h>
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 2194eff49743..096c135691ae 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -19,6 +19,8 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/slab.h>
23
22#include "aops.h" 24#include "aops.h"
23#include "collate.h" 25#include "collate.h"
24#include "debug.h" 26#include "debug.h"
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 1caa0ef0b2bb..b572b6727181 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24#include <linux/swap.h> 25#include <linux/swap.h>
25 26
26#include "attrib.h" 27#include "attrib.h"
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2ca00153b6ec..358273e59ade 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -23,6 +23,7 @@
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/exportfs.h> 24#include <linux/exportfs.h>
25#include <linux/security.h> 25#include <linux/security.h>
26#include <linux/slab.h>
26 27
27#include "attrib.h" 28#include "attrib.h"
28#include "debug.h" 29#include "debug.h"
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 1cf39dfaee7a..0de1db6cddbf 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -31,6 +31,7 @@
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
34#include <linux/bitmap.h>
34 35
35#include "sysctl.h" 36#include "sysctl.h"
36#include "logfile.h" 37#include "logfile.h"
@@ -2458,7 +2459,6 @@ static void ntfs_put_super(struct super_block *sb)
2458static s64 get_nr_free_clusters(ntfs_volume *vol) 2459static s64 get_nr_free_clusters(ntfs_volume *vol)
2459{ 2460{
2460 s64 nr_free = vol->nr_clusters; 2461 s64 nr_free = vol->nr_clusters;
2461 u32 *kaddr;
2462 struct address_space *mapping = vol->lcnbmp_ino->i_mapping; 2462 struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
2463 struct page *page; 2463 struct page *page;
2464 pgoff_t index, max_index; 2464 pgoff_t index, max_index;
@@ -2477,7 +2477,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2477 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", 2477 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
2478 max_index, PAGE_CACHE_SIZE / 4); 2478 max_index, PAGE_CACHE_SIZE / 4);
2479 for (index = 0; index < max_index; index++) { 2479 for (index = 0; index < max_index; index++) {
2480 unsigned int i; 2480 unsigned long *kaddr;
2481
2481 /* 2482 /*
2482 * Read the page from page cache, getting it from backing store 2483 * Read the page from page cache, getting it from backing store
2483 * if necessary, and increment the use count. 2484 * if necessary, and increment the use count.
@@ -2490,16 +2491,16 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2490 nr_free -= PAGE_CACHE_SIZE * 8; 2491 nr_free -= PAGE_CACHE_SIZE * 8;
2491 continue; 2492 continue;
2492 } 2493 }
2493 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2494 kaddr = kmap_atomic(page, KM_USER0);
2494 /* 2495 /*
2495 * For each 4 bytes, subtract the number of set bits. If this 2496 * Subtract the number of set bits. If this
2496 * is the last page and it is partial we don't really care as 2497 * is the last page and it is partial we don't really care as
2497 * it just means we do a little extra work but it won't affect 2498 * it just means we do a little extra work but it won't affect
2498 * the result as all out of range bytes are set to zero by 2499 * the result as all out of range bytes are set to zero by
2499 * ntfs_readpage(). 2500 * ntfs_readpage().
2500 */ 2501 */
2501 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2502 nr_free -= bitmap_weight(kaddr,
2502 nr_free -= (s64)hweight32(kaddr[i]); 2503 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2503 kunmap_atomic(kaddr, KM_USER0); 2504 kunmap_atomic(kaddr, KM_USER0);
2504 page_cache_release(page); 2505 page_cache_release(page);
2505 } 2506 }
@@ -2538,7 +2539,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2538static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, 2539static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2539 s64 nr_free, const pgoff_t max_index) 2540 s64 nr_free, const pgoff_t max_index)
2540{ 2541{
2541 u32 *kaddr;
2542 struct address_space *mapping = vol->mftbmp_ino->i_mapping; 2542 struct address_space *mapping = vol->mftbmp_ino->i_mapping;
2543 struct page *page; 2543 struct page *page;
2544 pgoff_t index; 2544 pgoff_t index;
@@ -2548,7 +2548,8 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2548 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " 2548 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
2549 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); 2549 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
2550 for (index = 0; index < max_index; index++) { 2550 for (index = 0; index < max_index; index++) {
2551 unsigned int i; 2551 unsigned long *kaddr;
2552
2552 /* 2553 /*
2553 * Read the page from page cache, getting it from backing store 2554 * Read the page from page cache, getting it from backing store
2554 * if necessary, and increment the use count. 2555 * if necessary, and increment the use count.
@@ -2561,16 +2562,16 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2561 nr_free -= PAGE_CACHE_SIZE * 8; 2562 nr_free -= PAGE_CACHE_SIZE * 8;
2562 continue; 2563 continue;
2563 } 2564 }
2564 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2565 kaddr = kmap_atomic(page, KM_USER0);
2565 /* 2566 /*
2566 * For each 4 bytes, subtract the number of set bits. If this 2567 * Subtract the number of set bits. If this
2567 * is the last page and it is partial we don't really care as 2568 * is the last page and it is partial we don't really care as
2568 * it just means we do a little extra work but it won't affect 2569 * it just means we do a little extra work but it won't affect
2569 * the result as all out of range bytes are set to zero by 2570 * the result as all out of range bytes are set to zero by
2570 * ntfs_readpage(). 2571 * ntfs_readpage().
2571 */ 2572 */
2572 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2573 nr_free -= bitmap_weight(kaddr,
2573 nr_free -= (s64)hweight32(kaddr[i]); 2574 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2574 kunmap_atomic(kaddr, KM_USER0); 2575 kunmap_atomic(kaddr, KM_USER0);
2575 page_cache_release(page); 2576 page_cache_release(page);
2576 } 2577 }
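
Both ntfs/super.c hunks collapse a per-u32 hweight32() loop into one bitmap_weight() call over the whole page, treating the mapped page as an unsigned long bitmap. A userspace model of the equivalence, with a hand-rolled weight function standing in for bitmap_weight() (illustrative only, not kernel code):

        #include <stdio.h>

        #define NBYTES 16 /* stand-in for a page of bitmap data */

        /* Counts set bits over an arbitrary bit range, as bitmap_weight()
         * does; summing hweight32() per word gives the same total. */
        static unsigned int buf_weight(const unsigned char *p, unsigned int nbits)
        {
                unsigned int i, w = 0;

                for (i = 0; i < nbits; i++)
                        w += (p[i / 8] >> (i % 8)) & 1;
                return w;
        }

        int main(void)
        {
                unsigned char bitmap[NBYTES] = { 0xff, 0x0f, 0x01 };

                /* One call over NBYTES * 8 bits replaces the old loop. */
                printf("set bits: %u\n", buf_weight(bitmap, NBYTES * 8)); /* 13 */
                return 0;
        }
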
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0501974bedd0..e13fc9e8fcdc 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h>
24#include <linux/string.h> 25#include <linux/string.h>
25 26
26#define MLOG_MASK_PREFIX ML_INODE 27#define MLOG_MASK_PREFIX ML_INODE
@@ -30,6 +31,8 @@
30#include "alloc.h" 31#include "alloc.h"
31#include "dlmglue.h" 32#include "dlmglue.h"
32#include "file.h" 33#include "file.h"
34#include "inode.h"
35#include "journal.h"
33#include "ocfs2_fs.h" 36#include "ocfs2_fs.h"
34 37
35#include "xattr.h" 38#include "xattr.h"
@@ -166,6 +169,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
166} 169}
167 170
168/* 171/*
172 * Helper function to set i_mode in memory and disk. Some call paths
173 * will not have di_bh or a journal handle to pass, in which case it
174 * will create its own.

175 */
176static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
177 handle_t *handle, umode_t new_mode)
178{
179 int ret, commit_handle = 0;
180 struct ocfs2_dinode *di;
181
182 if (di_bh == NULL) {
183 ret = ocfs2_read_inode_block(inode, &di_bh);
184 if (ret) {
185 mlog_errno(ret);
186 goto out;
187 }
188 } else
189 get_bh(di_bh);
190
191 if (handle == NULL) {
192 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
193 OCFS2_INODE_UPDATE_CREDITS);
194 if (IS_ERR(handle)) {
195 ret = PTR_ERR(handle);
196 mlog_errno(ret);
197 goto out_brelse;
198 }
199
200 commit_handle = 1;
201 }
202
203 di = (struct ocfs2_dinode *)di_bh->b_data;
204 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
205 OCFS2_JOURNAL_ACCESS_WRITE);
206 if (ret) {
207 mlog_errno(ret);
208 goto out_commit;
209 }
210
211 inode->i_mode = new_mode;
212 di->i_mode = cpu_to_le16(inode->i_mode);
213
214 ocfs2_journal_dirty(handle, di_bh);
215
216out_commit:
217 if (commit_handle)
218 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
219out_brelse:
220 brelse(di_bh);
221out:
222 return ret;
223}
224
225/*
169 * Set the access or default ACL of an inode. 226 * Set the access or default ACL of an inode.
170 */ 227 */
171static int ocfs2_set_acl(handle_t *handle, 228static int ocfs2_set_acl(handle_t *handle,
@@ -193,9 +250,14 @@ static int ocfs2_set_acl(handle_t *handle,
193 if (ret < 0) 250 if (ret < 0)
194 return ret; 251 return ret;
195 else { 252 else {
196 inode->i_mode = mode;
197 if (ret == 0) 253 if (ret == 0)
198 acl = NULL; 254 acl = NULL;
255
256 ret = ocfs2_acl_set_mode(inode, di_bh,
257 handle, mode);
258 if (ret)
259 return ret;
260
199 } 261 }
200 } 262 }
201 break; 263 break;
@@ -283,6 +345,7 @@ int ocfs2_init_acl(handle_t *handle,
283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
284 struct posix_acl *acl = NULL; 346 struct posix_acl *acl = NULL;
285 int ret = 0; 347 int ret = 0;
348 mode_t mode;
286 349
287 if (!S_ISLNK(inode->i_mode)) { 350 if (!S_ISLNK(inode->i_mode)) {
288 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 351 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -291,12 +354,17 @@ int ocfs2_init_acl(handle_t *handle,
291 if (IS_ERR(acl)) 354 if (IS_ERR(acl))
292 return PTR_ERR(acl); 355 return PTR_ERR(acl);
293 } 356 }
294 if (!acl) 357 if (!acl) {
295 inode->i_mode &= ~current_umask(); 358 mode = inode->i_mode & ~current_umask();
359 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
360 if (ret) {
361 mlog_errno(ret);
362 goto cleanup;
363 }
364 }
296 } 365 }
297 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 366 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
298 struct posix_acl *clone; 367 struct posix_acl *clone;
299 mode_t mode;
300 368
301 if (S_ISDIR(inode->i_mode)) { 369 if (S_ISDIR(inode->i_mode)) {
302 ret = ocfs2_set_acl(handle, inode, di_bh, 370 ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -313,7 +381,7 @@ int ocfs2_init_acl(handle_t *handle,
313 mode = inode->i_mode; 381 mode = inode->i_mode;
314 ret = posix_acl_create_masq(clone, &mode); 382 ret = posix_acl_create_masq(clone, &mode);
315 if (ret >= 0) { 383 if (ret >= 0) {
316 inode->i_mode = mode; 384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
317 if (ret > 0) { 385 if (ret > 0) {
318 ret = ocfs2_set_acl(handle, inode, 386 ret = ocfs2_set_acl(handle, inode,
319 di_bh, ACL_TYPE_ACCESS, 387 di_bh, ACL_TYPE_ACCESS,
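
The new ocfs2_acl_set_mode() helper follows a "bring your own resources" convention: callers that already hold di_bh and a transaction handle pass them in, and the helper only reads the inode block or starts (and commits) a transaction when given NULL. A userspace model of that ownership rule, with hypothetical start_txn()/commit_txn() stand-ins:

        #include <stdio.h>
        #include <stdlib.h>

        struct handle { int id; };

        static struct handle *start_txn(void) { return malloc(sizeof(struct handle)); }
        static void commit_txn(struct handle *h) { free(h); }

        /* Borrow the caller's handle when given one; otherwise create, use
         * and commit a private one, mirroring ocfs2_acl_set_mode(). */
        static int do_update(struct handle *h)
        {
                int started = 0;

                if (h == NULL) {
                        h = start_txn();
                        if (h == NULL)
                                return -12; /* -ENOMEM */
                        started = 1;
                }

                /* ... the journalled i_mode update would happen here ... */

                if (started)
                        commit_txn(h); /* commit only what we started */
                return 0;
        }

        int main(void)
        {
                struct handle *h = start_txn();

                do_update(h);    /* borrowed: caller keeps ownership */
                commit_txn(h);
                do_update(NULL); /* helper creates and commits its own */
                return 0;
        }
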
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 21c808f752d8..ecebb2276790 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 5c9890006708..41d5f1f92d56 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,7 @@
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/debugfs.h> 36#include <linux/debugfs.h>
37#include <linux/slab.h>
37 38
38#include "heartbeat.h" 39#include "heartbeat.h"
39#include "tcp.h" 40#include "tcp.h"
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index c81142e3ef84..ed0c9f367fed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
19 * Boston, MA 021110-1307, USA. 19 * Boston, MA 021110-1307, USA.
20 */ 20 */
21 21
22#include <linux/slab.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/configfs.h> 25#include <linux/configfs.h>
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 639024033fce..cf3e16696216 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
44 * and if they're the last, they fire off the decision. 44 * and if they're the last, they fire off the decision.
45 */ 45 */
46#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/slab.h>
48#include <linux/workqueue.h> 47#include <linux/workqueue.h>
49#include <linux/reboot.h> 48#include <linux/reboot.h>
50 49
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index dccc439fa087..a795eb91f4ea 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index f283bce776b4..90803b47cd8c 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a659606dcb95..9289b4357d27 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1875,7 +1875,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1875ok: 1875ok:
1876 spin_unlock(&res->spinlock); 1876 spin_unlock(&res->spinlock);
1877 } 1877 }
1878 spin_unlock(&dlm->spinlock);
1879 1878
1880 // mlog(0, "woo! got an assert_master from node %u!\n", 1879 // mlog(0, "woo! got an assert_master from node %u!\n",
1881 // assert->node_idx); 1880 // assert->node_idx);
@@ -1926,7 +1925,6 @@ ok:
1926 /* master is known, detach if not already detached. 1925 /* master is known, detach if not already detached.
1927 * ensures that only one assert_master call will happen 1926 * ensures that only one assert_master call will happen
1928 * on this mle. */ 1927 * on this mle. */
1929 spin_lock(&dlm->spinlock);
1930 spin_lock(&dlm->master_lock); 1928 spin_lock(&dlm->master_lock);
1931 1929
1932 rr = atomic_read(&mle->mle_refs.refcount); 1930 rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1957,6 @@ ok:
1959 __dlm_put_mle(mle); 1957 __dlm_put_mle(mle);
1960 } 1958 }
1961 spin_unlock(&dlm->master_lock); 1959 spin_unlock(&dlm->master_lock);
1962 spin_unlock(&dlm->spinlock);
1963 } else if (res) { 1960 } else if (res) {
1964 if (res->owner != assert->node_idx) { 1961 if (res->owner != assert->node_idx) {
1965 mlog(0, "assert_master from %u, but current " 1962 mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1964,7 @@ ok:
1967 res->owner, namelen, name); 1964 res->owner, namelen, name);
1968 } 1965 }
1969 } 1966 }
1967 spin_unlock(&dlm->spinlock);
1970 1968
1971done: 1969done:
1972 ret = 0; 1970 ret = 0;
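
The dlmmaster.c hunks fix lock ordering in dlm_assert_master_handler(): rather than dropping dlm->spinlock early and re-taking it around dlm->master_lock, the handler now holds the outer lock across the whole section, so the inner lock always nests inside it and there is a single release point. A pthread model of the resulting nesting (not the dlm code itself):

        #include <pthread.h>

        static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* dlm->spinlock */
        static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER; /* dlm->master_lock */

        /* After the fix: inner always nests inside outer, and outer has a
         * single release point at the end of the section. */
        static void assert_master_path(int have_mle)
        {
                pthread_mutex_lock(&outer);
                if (have_mle) {
                        pthread_mutex_lock(&inner);
                        /* ... detach the mle ... */
                        pthread_mutex_unlock(&inner);
                } else {
                        /* ... check the resource owner ... */
                }
                pthread_mutex_unlock(&outer); /* no drop-and-retake */
        }

        int main(void)
        {
                assert_master_path(1);
                assert_master_path(0);
                return 0;
        }
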
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..11a6d1fd1d35 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 49e29ecd0201..b47c1b92b82b 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c562a7581cf9..09e3fdfa6d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,6 +24,7 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/slab.h>
27#include <linux/types.h> 28#include <linux/types.h>
28#include <linux/fiemap.h> 29#include <linux/fiemap.h>
29 30
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db868..1aa863dd901f 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h> 29#include <linux/highmem.h>
31 30
32#define MLOG_MASK_PREFIX ML_SUPER 31#define MLOG_MASK_PREFIX ML_SUPER
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 278a223aae14..07cc8bb68b6d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/quotaops.h> 30#include <linux/quotaops.h>
@@ -891,6 +890,21 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
891 /* Do some basic inode verification... */ 890 /* Do some basic inode verification... */
892 di = (struct ocfs2_dinode *) di_bh->b_data; 891 di = (struct ocfs2_dinode *) di_bh->b_data;
893 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { 892 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
893 /*
894 * Inodes in the orphan dir must have ORPHANED_FL. The only
895 * inodes that come back out of the orphan dir are reflink
896 * targets. A reflink target may be moved out of the orphan
897 * dir between the time we scan the directory and the time we
898 * process it. This would lead to HAS_REFCOUNT_FL being set but
899 * ORPHANED_FL not.
900 */
901 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
902 mlog(0, "Reflinked inode %llu is no longer orphaned. "
903 "it shouldn't be deleted\n",
904 (unsigned long long)oi->ip_blkno);
905 goto bail;
906 }
907
894 /* for lack of a better error? */ 908 /* for lack of a better error? */
895 status = -EEXIST; 909 status = -EEXIST;
896 mlog(ML_ERROR, 910 mlog(ML_ERROR,
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ca992d91f511..c983715d8d8c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -872,8 +872,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
872 (unsigned long long)la_start_blk, 872 (unsigned long long)la_start_blk,
873 (unsigned long long)blkno); 873 (unsigned long long)blkno);
874 874
875 status = ocfs2_free_clusters(handle, main_bm_inode, 875 status = ocfs2_release_clusters(handle,
876 main_bm_bh, blkno, count); 876 main_bm_inode,
877 main_bm_bh, blkno,
878 count);
877 if (status < 0) { 879 if (status < 0) {
878 mlog_errno(status); 880 mlog_errno(status);
879 goto bail; 881 goto bail;
@@ -984,8 +986,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
984 } 986 }
985 987
986retry_enospc: 988retry_enospc:
987 (*ac)->ac_bits_wanted = osb->local_alloc_bits; 989 (*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
988
989 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 990 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
990 if (status == -ENOSPC) { 991 if (status == -ENOSPC) {
991 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == 992 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1061,6 +1062,7 @@ retry_enospc:
1061 OCFS2_LA_DISABLED) 1062 OCFS2_LA_DISABLED)
1062 goto bail; 1063 goto bail;
1063 1064
1065 ac->ac_bits_wanted = osb->local_alloc_default_bits;
1064 status = ocfs2_claim_clusters(osb, handle, ac, 1066 status = ocfs2_claim_clusters(osb, handle, ac,
1065 osb->local_alloc_bits, 1067 osb->local_alloc_bits,
1066 &cluster_off, 1068 &cluster_off,
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
133 133
134 if (!(fl->fl_flags & FL_POSIX)) 134 if (!(fl->fl_flags & FL_POSIX))
135 return -ENOLCK; 135 return -ENOLCK;
136 if (__mandatory_lock(inode)) 136 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
137 return -ENOLCK; 137 return -ENOLCK;
138 138
139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); 139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
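
The one-line locks.c change lets unlock requests through the mandatory-locking guard: refusing F_UNLCK with -ENOLCK would strand locks granted before mandatory locking took effect on the file. A tiny model of the relaxed check (stand-in return value, not the VFS path):

        #include <stdio.h>
        #include <fcntl.h>

        /* Refuse mandatory-locked files, except for unlock requests,
         * which must always be allowed through. */
        static int check(int mandatory, int fl_type)
        {
                if (mandatory && fl_type != F_UNLCK)
                        return -37; /* -ENOLCK */
                return 0;
        }

        int main(void)
        {
                printf("%d\n", check(1, F_RDLCK)); /* -37: new lock refused */
                printf("%d\n", check(1, F_UNLCK)); /* 0: unlock still works */
                return 0;
        }
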
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424a..7898bd3a99f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/uio.h> 30#include <linux/uio.h>
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d9cd4e373a53..b1eb50ae4097 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
84static int ocfs2_orphan_add(struct ocfs2_super *osb, 84static int ocfs2_orphan_add(struct ocfs2_super *osb,
85 handle_t *handle, 85 handle_t *handle,
86 struct inode *inode, 86 struct inode *inode,
87 struct ocfs2_dinode *fe, 87 struct buffer_head *fe_bh,
88 char *name, 88 char *name,
89 struct ocfs2_dir_lookup_result *lookup, 89 struct ocfs2_dir_lookup_result *lookup,
90 struct inode *orphan_dir_inode); 90 struct inode *orphan_dir_inode);
@@ -879,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir,
879 fe = (struct ocfs2_dinode *) fe_bh->b_data; 879 fe = (struct ocfs2_dinode *) fe_bh->b_data;
880 880
881 if (inode_is_unlinkable(inode)) { 881 if (inode_is_unlinkable(inode)) {
882 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 882 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
883 &orphan_insert, orphan_dir); 883 &orphan_insert, orphan_dir);
884 if (status < 0) { 884 if (status < 0) {
885 mlog_errno(status); 885 mlog_errno(status);
@@ -1300,7 +1300,7 @@ static int ocfs2_rename(struct inode *old_dir,
1300 if (S_ISDIR(new_inode->i_mode) || 1300 if (S_ISDIR(new_inode->i_mode) ||
1301 (ocfs2_read_links_count(newfe) == 1)) { 1301 (ocfs2_read_links_count(newfe) == 1)) {
1302 status = ocfs2_orphan_add(osb, handle, new_inode, 1302 status = ocfs2_orphan_add(osb, handle, new_inode,
1303 newfe, orphan_name, 1303 newfe_bh, orphan_name,
1304 &orphan_insert, orphan_dir); 1304 &orphan_insert, orphan_dir);
1305 if (status < 0) { 1305 if (status < 0) {
1306 mlog_errno(status); 1306 mlog_errno(status);
@@ -1911,7 +1911,7 @@ leave:
1911static int ocfs2_orphan_add(struct ocfs2_super *osb, 1911static int ocfs2_orphan_add(struct ocfs2_super *osb,
1912 handle_t *handle, 1912 handle_t *handle,
1913 struct inode *inode, 1913 struct inode *inode,
1914 struct ocfs2_dinode *fe, 1914 struct buffer_head *fe_bh,
1915 char *name, 1915 char *name,
1916 struct ocfs2_dir_lookup_result *lookup, 1916 struct ocfs2_dir_lookup_result *lookup,
1917 struct inode *orphan_dir_inode) 1917 struct inode *orphan_dir_inode)
@@ -1919,6 +1919,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1919 struct buffer_head *orphan_dir_bh = NULL; 1919 struct buffer_head *orphan_dir_bh = NULL;
1920 int status = 0; 1920 int status = 0;
1921 struct ocfs2_dinode *orphan_fe; 1921 struct ocfs2_dinode *orphan_fe;
1922 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1922 1923
1923 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1924 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1924 1925
@@ -1959,6 +1960,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1959 goto leave; 1960 goto leave;
1960 } 1961 }
1961 1962
1963 /*
1964 * We're going to journal the change of i_flags and i_orphaned_slot.
1965 * It's safe anyway, though some callers may duplicate the journaling.
1966 * Journaling within the func just make the logic look more
1967 * straightforward.
1968 */
1969 status = ocfs2_journal_access_di(handle,
1970 INODE_CACHE(inode),
1971 fe_bh,
1972 OCFS2_JOURNAL_ACCESS_WRITE);
1973 if (status < 0) {
1974 mlog_errno(status);
1975 goto leave;
1976 }
1977
1962 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 1978 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
1963 1979
1964 /* Record which orphan dir our inode now resides 1980 /* Record which orphan dir our inode now resides
@@ -1966,6 +1982,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1966 * dir to lock. */ 1982 * dir to lock. */
1967 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 1983 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
1968 1984
1985 ocfs2_journal_dirty(handle, fe_bh);
1986
1969 mlog(0, "Inode %llu orphaned in slot %d\n", 1987 mlog(0, "Inode %llu orphaned in slot %d\n",
1970 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 1988 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
1971 1989
@@ -2123,7 +2141,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2123 } 2141 }
2124 2142
2125 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2143 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2126 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, 2144 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2127 &orphan_insert, orphan_dir); 2145 &orphan_insert, orphan_dir);
2128 if (status < 0) { 2146 if (status < 0) {
2129 mlog_errno(status); 2147 mlog_errno(status);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1238b491db90..adf5e2ebc2c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -763,8 +763,18 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
764} 764}
765 765
766#define ocfs2_set_bit ext2_set_bit 766static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
767#define ocfs2_clear_bit ext2_clear_bit 767{
768 ext2_set_bit(bit, bitmap);
769}
770#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
771
772static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
773{
774 ext2_clear_bit(bit, bitmap);
775}
776#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
777
768#define ocfs2_test_bit ext2_test_bit 778#define ocfs2_test_bit ext2_test_bit
769#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 779#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
770#define ocfs2_find_next_bit ext2_find_next_bit 780#define ocfs2_find_next_bit ext2_find_next_bit
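
The ocfs2.h hunk converts ocfs2_set_bit/ocfs2_clear_bit from bare macro aliases into static inline wrappers plus a casting macro, so the compiler type-checks the bitmap argument at exactly one audited cast site instead of silently forwarding any pointer. A userspace model of the idiom, with ext2_style_set_bit() standing in for ext2_set_bit():

        #include <stdio.h>

        /* Stand-in for ext2_set_bit(); little-endian bit addressing elided. */
        static void ext2_style_set_bit(unsigned int bit, unsigned long *bitmap)
        {
                bitmap[bit / (8 * sizeof(long))] |= 1UL << (bit % (8 * sizeof(long)));
        }

        /* The inline wrapper gives the compiler a real prototype to check... */
        static inline void my_set_bit(unsigned int bit, unsigned long *bitmap)
        {
                ext2_style_set_bit(bit, bitmap);
        }
        /* ...and the macro funnels every caller through one explicit cast. */
        #define set_bit_any(bit, addr) my_set_bit((bit), (unsigned long *)(addr))

        int main(void)
        {
                unsigned long map[2] = { 0, 0 };
                unsigned char *raw = (unsigned char *)map; /* e.g. bg->bg_bitmap */

                set_bit_any(3, raw);
                printf("%lx\n", map[0]); /* prints 8 */
                return 0;
        }
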
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 355f41d1d520..ab42a74c7539 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h> 9#include <linux/dqblk_qtree.h>
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index a6467f3d262e..9ad49305f450 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,6 +3,7 @@
3 */ 3 */
4 4
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/module.h> 9#include <linux/module.h>
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9e96921dffda..bd96f6c7877e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/gfp.h>
41#include <linux/slab.h> 40#include <linux/slab.h>
42#include <linux/writeback.h> 41#include <linux/writeback.h>
43#include <linux/pagevec.h> 42#include <linux/pagevec.h>
@@ -4075,6 +4074,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
4075 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; 4074 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4076 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 4075 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4077 i_size_write(t_inode, size); 4076 i_size_write(t_inode, size);
4077 t_inode->i_blocks = s_inode->i_blocks;
4078 4078
4079 di->i_xattr_inline_size = s_di->i_xattr_inline_size; 4079 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4080 di->i_clusters = s_di->i_clusters; 4080 di->i_clusters = s_di->i_clusters;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 7020e1253ffa..0d3049f696c5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/slab.h>
22#include <linux/module.h> 23#include <linux/module.h>
23 24
24/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ 25/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 5ae8812b2864..2dc57bca0688 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
25#include <linux/reboot.h> 26#include <linux/reboot.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c3c60bc3e072..19ba00f28547 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -95,13 +95,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
95 struct buffer_head *group_bh, 95 struct buffer_head *group_bh,
96 unsigned int bit_off, 96 unsigned int bit_off,
97 unsigned int num_bits); 97 unsigned int num_bits);
98static inline int ocfs2_block_group_clear_bits(handle_t *handle,
99 struct inode *alloc_inode,
100 struct ocfs2_group_desc *bg,
101 struct buffer_head *group_bh,
102 unsigned int bit_off,
103 unsigned int num_bits);
104
105static int ocfs2_relink_block_group(handle_t *handle, 98static int ocfs2_relink_block_group(handle_t *handle,
106 struct inode *alloc_inode, 99 struct inode *alloc_inode,
107 struct buffer_head *fe_bh, 100 struct buffer_head *fe_bh,
@@ -152,7 +145,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
152 145
153#define do_error(fmt, ...) \ 146#define do_error(fmt, ...) \
154 do{ \ 147 do{ \
155 if (clean_error) \ 148 if (resize) \
156 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 149 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
157 else \ 150 else \
158 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 151 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
160 153
161static int ocfs2_validate_gd_self(struct super_block *sb, 154static int ocfs2_validate_gd_self(struct super_block *sb,
162 struct buffer_head *bh, 155 struct buffer_head *bh,
163 int clean_error) 156 int resize)
164{ 157{
165 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 158 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
166 159
@@ -211,7 +204,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
211static int ocfs2_validate_gd_parent(struct super_block *sb, 204static int ocfs2_validate_gd_parent(struct super_block *sb,
212 struct ocfs2_dinode *di, 205 struct ocfs2_dinode *di,
213 struct buffer_head *bh, 206 struct buffer_head *bh,
214 int clean_error) 207 int resize)
215{ 208{
216 unsigned int max_bits; 209 unsigned int max_bits;
217 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 210 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +226,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
233 return -EINVAL; 226 return -EINVAL;
234 } 227 }
235 228
236 if (le16_to_cpu(gd->bg_chain) >= 229 /* During resize, we may encounter the case bg_chain == cl_next_free_rec. */
237 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 230 if ((le16_to_cpu(gd->bg_chain) >
231 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
232 ((le16_to_cpu(gd->bg_chain) ==
233 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
238 do_error("Group descriptor #%llu has bad chain %u", 234 do_error("Group descriptor #%llu has bad chain %u",
239 (unsigned long long)bh->b_blocknr, 235 (unsigned long long)bh->b_blocknr,
240 le16_to_cpu(gd->bg_chain)); 236 le16_to_cpu(gd->bg_chain));
@@ -1975,18 +1971,18 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1975 bits_wanted, cluster_start, num_clusters); 1971 bits_wanted, cluster_start, num_clusters);
1976} 1972}
1977 1973
1978static inline int ocfs2_block_group_clear_bits(handle_t *handle, 1974static int ocfs2_block_group_clear_bits(handle_t *handle,
1979 struct inode *alloc_inode, 1975 struct inode *alloc_inode,
1980 struct ocfs2_group_desc *bg, 1976 struct ocfs2_group_desc *bg,
1981 struct buffer_head *group_bh, 1977 struct buffer_head *group_bh,
1982 unsigned int bit_off, 1978 unsigned int bit_off,
1983 unsigned int num_bits) 1979 unsigned int num_bits,
1980 void (*undo_fn)(unsigned int bit,
1981 unsigned long *bmap))
1984{ 1982{
1985 int status; 1983 int status;
1986 unsigned int tmp; 1984 unsigned int tmp;
1987 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1988 struct ocfs2_group_desc *undo_bg = NULL; 1985 struct ocfs2_group_desc *undo_bg = NULL;
1989 int cluster_bitmap = 0;
1990 1986
1991 mlog_entry_void(); 1987 mlog_entry_void();
1992 1988
@@ -1996,20 +1992,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1996 1992
1997 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 1993 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1998 1994
1999 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1995 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2000 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
2001
2002 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1996 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2003 group_bh, journal_type); 1997 group_bh,
1998 undo_fn ?
1999 OCFS2_JOURNAL_ACCESS_UNDO :
2000 OCFS2_JOURNAL_ACCESS_WRITE);
2004 if (status < 0) { 2001 if (status < 0) {
2005 mlog_errno(status); 2002 mlog_errno(status);
2006 goto bail; 2003 goto bail;
2007 } 2004 }
2008 2005
2009 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2006 if (undo_fn) {
2010 cluster_bitmap = 1;
2011
2012 if (cluster_bitmap) {
2013 jbd_lock_bh_state(group_bh); 2007 jbd_lock_bh_state(group_bh);
2014 undo_bg = (struct ocfs2_group_desc *) 2008 undo_bg = (struct ocfs2_group_desc *)
2015 bh2jh(group_bh)->b_committed_data; 2009 bh2jh(group_bh)->b_committed_data;
@@ -2020,13 +2014,13 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
2020 while(tmp--) { 2014 while(tmp--) {
2021 ocfs2_clear_bit((bit_off + tmp), 2015 ocfs2_clear_bit((bit_off + tmp),
2022 (unsigned long *) bg->bg_bitmap); 2016 (unsigned long *) bg->bg_bitmap);
2023 if (cluster_bitmap) 2017 if (undo_fn)
2024 ocfs2_set_bit(bit_off + tmp, 2018 undo_fn(bit_off + tmp,
2025 (unsigned long *) undo_bg->bg_bitmap); 2019 (unsigned long *) undo_bg->bg_bitmap);
2026 } 2020 }
2027 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2021 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2028 2022
2029 if (cluster_bitmap) 2023 if (undo_fn)
2030 jbd_unlock_bh_state(group_bh); 2024 jbd_unlock_bh_state(group_bh);
2031 2025
2032 status = ocfs2_journal_dirty(handle, group_bh); 2026 status = ocfs2_journal_dirty(handle, group_bh);
@@ -2039,12 +2033,14 @@ bail:
2039/* 2033/*
2040 * expects the suballoc inode to already be locked. 2034 * expects the suballoc inode to already be locked.
2041 */ 2035 */
2042int ocfs2_free_suballoc_bits(handle_t *handle, 2036static int _ocfs2_free_suballoc_bits(handle_t *handle,
2043 struct inode *alloc_inode, 2037 struct inode *alloc_inode,
2044 struct buffer_head *alloc_bh, 2038 struct buffer_head *alloc_bh,
2045 unsigned int start_bit, 2039 unsigned int start_bit,
2046 u64 bg_blkno, 2040 u64 bg_blkno,
2047 unsigned int count) 2041 unsigned int count,
2042 void (*undo_fn)(unsigned int bit,
2043 unsigned long *bitmap))
2048{ 2044{
2049 int status = 0; 2045 int status = 0;
2050 u32 tmp_used; 2046 u32 tmp_used;
@@ -2079,7 +2075,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
2079 2075
2080 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2076 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2081 group, group_bh, 2077 group, group_bh,
2082 start_bit, count); 2078 start_bit, count, undo_fn);
2083 if (status < 0) { 2079 if (status < 0) {
2084 mlog_errno(status); 2080 mlog_errno(status);
2085 goto bail; 2081 goto bail;
@@ -2110,6 +2106,17 @@ bail:
2110 return status; 2106 return status;
2111} 2107}
2112 2108
2109int ocfs2_free_suballoc_bits(handle_t *handle,
2110 struct inode *alloc_inode,
2111 struct buffer_head *alloc_bh,
2112 unsigned int start_bit,
2113 u64 bg_blkno,
2114 unsigned int count)
2115{
2116 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2117 start_bit, bg_blkno, count, NULL);
2118}
2119
2113int ocfs2_free_dinode(handle_t *handle, 2120int ocfs2_free_dinode(handle_t *handle,
2114 struct inode *inode_alloc_inode, 2121 struct inode *inode_alloc_inode,
2115 struct buffer_head *inode_alloc_bh, 2122 struct buffer_head *inode_alloc_bh,
@@ -2123,11 +2130,13 @@ int ocfs2_free_dinode(handle_t *handle,
2123 inode_alloc_bh, bit, bg_blkno, 1); 2130 inode_alloc_bh, bit, bg_blkno, 1);
2124} 2131}
2125 2132
2126int ocfs2_free_clusters(handle_t *handle, 2133static int _ocfs2_free_clusters(handle_t *handle,
2127 struct inode *bitmap_inode, 2134 struct inode *bitmap_inode,
2128 struct buffer_head *bitmap_bh, 2135 struct buffer_head *bitmap_bh,
2129 u64 start_blk, 2136 u64 start_blk,
2130 unsigned int num_clusters) 2137 unsigned int num_clusters,
2138 void (*undo_fn)(unsigned int bit,
2139 unsigned long *bitmap))
2131{ 2140{
2132 int status; 2141 int status;
2133 u16 bg_start_bit; 2142 u16 bg_start_bit;
@@ -2154,9 +2163,9 @@ int ocfs2_free_clusters(handle_t *handle,
2154 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2163 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2155 (unsigned long long)bg_blkno, bg_start_bit); 2164 (unsigned long long)bg_blkno, bg_start_bit);
2156 2165
2157 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2166 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2158 bg_start_bit, bg_blkno, 2167 bg_start_bit, bg_blkno,
2159 num_clusters); 2168 num_clusters, undo_fn);
2160 if (status < 0) { 2169 if (status < 0) {
2161 mlog_errno(status); 2170 mlog_errno(status);
2162 goto out; 2171 goto out;
@@ -2170,6 +2179,32 @@ out:
2170 return status; 2179 return status;
2171} 2180}
2172 2181
2182int ocfs2_free_clusters(handle_t *handle,
2183 struct inode *bitmap_inode,
2184 struct buffer_head *bitmap_bh,
2185 u64 start_blk,
2186 unsigned int num_clusters)
2187{
2188 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2189 start_blk, num_clusters,
2190 _ocfs2_set_bit);
2191}
2192
2193/*
2194 * Give never-used clusters back to the global bitmap. We don't need
2195 * to protect these bits in the undo buffer.
2196 */
2197int ocfs2_release_clusters(handle_t *handle,
2198 struct inode *bitmap_inode,
2199 struct buffer_head *bitmap_bh,
2200 u64 start_blk,
2201 unsigned int num_clusters)
2202{
2203 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2204 start_blk, num_clusters,
2205 _ocfs2_clear_bit);
2206}
2207
2173static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 2208static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2174{ 2209{
2175 printk("Block Group:\n"); 2210 printk("Block Group:\n");
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index fa60723c43e8..e0f46df357e6 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -127,6 +127,11 @@ int ocfs2_free_clusters(handle_t *handle,
127 struct buffer_head *bitmap_bh, 127 struct buffer_head *bitmap_bh,
128 u64 start_blk, 128 u64 start_blk,
129 unsigned int num_clusters); 129 unsigned int num_clusters);
130int ocfs2_release_clusters(handle_t *handle,
131 struct inode *bitmap_inode,
132 struct buffer_head *bitmap_bh,
133 u64 start_blk,
134 unsigned int num_clusters);
130 135
131static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) 136static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
132{ 137{
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948c..bfe7190cdbf1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#define MLOG_MASK_PREFIX ML_INODE 30#define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1b0d386f6d1..3e7773089b96 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1622,7 +1622,7 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
1622 /* Now tell xh->xh_entries about it */ 1622 /* Now tell xh->xh_entries about it */
1623 for (i = 0; i < count; i++) { 1623 for (i = 0; i < count; i++) {
1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); 1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1625 if (offset < namevalue_offset) 1625 if (offset <= namevalue_offset)
1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset, 1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
1627 namevalue_size); 1627 namevalue_size);
1628 } 1628 }
@@ -6528,13 +6528,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6528 int indexed) 6528 int indexed)
6529{ 6529{
6530 int ret; 6530 int ret;
6531 struct ocfs2_alloc_context *meta_ac;
6532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6531 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6533 struct ocfs2_xattr_set_ctxt ctxt = { 6532 struct ocfs2_xattr_set_ctxt ctxt;
6534 .meta_ac = meta_ac,
6535 };
6536 6533
6537 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 6534 memset(&ctxt, 0, sizeof(ctxt));
6535 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
6538 if (ret < 0) { 6536 if (ret < 0) {
6539 mlog_errno(ret); 6537 mlog_errno(ret);
6540 return ret; 6538 return ret;
@@ -6556,7 +6554,7 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6556 6554
6557 ocfs2_commit_trans(osb, ctxt.handle); 6555 ocfs2_commit_trans(osb, ctxt.handle);
6558out: 6556out:
6559 ocfs2_free_alloc_context(meta_ac); 6557 ocfs2_free_alloc_context(ctxt.meta_ac);
6560 return ret; 6558 return ret;
6561} 6559}
6562 6560
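
The final xattr.c hunk fixes a subtle initializer bug: ".meta_ac = meta_ac" copies the current, uninitialized value of the local into the struct rather than linking the two, so the later reservation filled the local while ctxt.meta_ac kept garbage. A minimal userspace reproduction of the pattern (the reserve step is hypothetical and elided; compile with -Wuninitialized to see the warning):

        #include <stdio.h>
        #include <string.h>

        struct ctxt { void *meta_ac; };

        int main(void)
        {
                void *meta_ac; /* never initialized */

                /* The initializer copies meta_ac's current garbage value; it
                 * does not alias the local, so filling meta_ac later changes
                 * nothing in the struct. */
                struct ctxt bad = { .meta_ac = meta_ac };

                struct ctxt good;

                memset(&good, 0, sizeof(good)); /* what the fix does instead */
                /* later: a reserve step fills good.meta_ac in place */

                (void)bad;
                printf("good.meta_ac = %p\n", good.meta_ac); /* (nil) */
                return 0;
        }
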
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 2ff33ea5cb34..b44bb835e8ea 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -5,6 +5,7 @@
5 */ 5 */
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/slab.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/vfs.h> 10#include <linux/vfs.h>
10#include <linux/parser.h> 11#include <linux/parser.h>
diff --git a/fs/open.c b/fs/open.c
index e17f54454b50..74e5cd9f718e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -10,7 +10,6 @@
10#include <linux/fdtable.h> 10#include <linux/fdtable.h>
11#include <linux/fsnotify.h> 11#include <linux/fsnotify.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h>
14#include <linux/tty.h> 13#include <linux/tty.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
16#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
@@ -20,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/vfs.h>
 #include <linux/fcntl.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/fs.h>
 #include <linux/personality.h>
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e8865c11777f..e238ab23a9e7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 49cfd5f54238..91babdae7587 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -95,6 +95,7 @@
  ************************************************************/
 #include <linux/crc32.h>
 #include <linux/math64.h>
+#include <linux/slab.h>
 #include "check.h"
 #include "efi.h"
 
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..90be97f1f5a8 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
  */
 #include <asm/unaligned.h>
 
-#define SYS_IND(p)	(get_unaligned(&p->sys_ind))
-#define NR_SECTS(p)	({ __le32 __a = get_unaligned(&p->nr_sects); \
-				le32_to_cpu(__a); \
-			})
+#define SYS_IND(p)	get_unaligned(&p->sys_ind)
 
-#define START_SECT(p)	({ __le32 __a = get_unaligned(&p->start_sect); \
-				le32_to_cpu(__a); \
-			})
+static inline sector_t nr_sects(struct partition *p)
+{
+	return (sector_t)get_unaligned_le32(&p->nr_sects);
+}
+
+static inline sector_t start_sect(struct partition *p)
+{
+	return (sector_t)get_unaligned_le32(&p->start_sect);
+}
 
 static inline int is_extended_partition(struct partition *p)
 {
@@ -104,13 +107,13 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
 
 static void
 parse_extended(struct parsed_partitions *state, struct block_device *bdev,
-	       u32 first_sector, u32 first_size)
+	       sector_t first_sector, sector_t first_size)
 {
 	struct partition *p;
 	Sector sect;
 	unsigned char *data;
-	u32 this_sector, this_size;
-	int sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t this_sector, this_size;
+	sector_t sector_size = bdev_logical_block_size(bdev) / 512;
 	int loopct = 0;		/* number of links followed
 				   without finding a data partition */
 	int i;
@@ -145,14 +148,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 	 * First process the data partition(s)
 	 */
 	for (i=0; i<4; i++, p++) {
-		u32 offs, size, next;
-		if (!NR_SECTS(p) || is_extended_partition(p))
+		sector_t offs, size, next;
+		if (!nr_sects(p) || is_extended_partition(p))
 			continue;
 
 		/* Check the 3rd and 4th entries -
 		   these sometimes contain random garbage */
-		offs = START_SECT(p)*sector_size;
-		size = NR_SECTS(p)*sector_size;
+		offs = start_sect(p)*sector_size;
+		size = nr_sects(p)*sector_size;
 		next = this_sector + offs;
 		if (i >= 2) {
 			if (offs + size > this_size)
@@ -179,13 +182,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 	 */
 	p -= 4;
 	for (i=0; i<4; i++, p++)
-		if (NR_SECTS(p) && is_extended_partition(p))
+		if (nr_sects(p) && is_extended_partition(p))
 			break;
 	if (i == 4)
 		goto done;	 /* nothing left to do */
 
-	this_sector = first_sector + START_SECT(p) * sector_size;
-	this_size = NR_SECTS(p) * sector_size;
+	this_sector = first_sector + start_sect(p) * sector_size;
+	this_size = nr_sects(p) * sector_size;
 	put_dev_sector(sect);
 	}
 done:
@@ -197,7 +200,7 @@ done:
 
 static void
 parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
-		  u32 offset, u32 size, int origin)
+		  sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_SOLARIS_X86_PARTITION
 	Sector sect;
@@ -244,7 +247,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
  */
 static void
 parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
-	  u32 offset, u32 size, int origin, char *flavour,
+	  sector_t offset, sector_t size, int origin, char *flavour,
 	  int max_partitions)
 {
 	Sector sect;
@@ -263,7 +266,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 	if (le16_to_cpu(l->d_npartitions) < max_partitions)
 		max_partitions = le16_to_cpu(l->d_npartitions);
 	for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
-		u32 bsd_start, bsd_size;
+		sector_t bsd_start, bsd_size;
 
 		if (state->next == state->limit)
 			break;
@@ -290,7 +293,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
-	      u32 offset, u32 size, int origin)
+	      sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -300,7 +303,7 @@ parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
-	     u32 offset, u32 size, int origin)
+	     sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -310,7 +313,7 @@ parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
-	      u32 offset, u32 size, int origin)
+	      sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -324,7 +327,7 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
  */
 static void
 parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
-	       u32 offset, u32 size, int origin)
+	       sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_UNIXWARE_DISKLABEL
 	Sector sect;
@@ -348,7 +351,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
 
 		if (p->s_label != UNIXWARE_FS_UNUSED)
 			put_partition(state, state->next++,
-				      START_SECT(p), NR_SECTS(p));
+				      le32_to_cpu(p->start_sect),
+				      le32_to_cpu(p->nr_sects));
 		p++;
 	}
 	put_dev_sector(sect);
@@ -363,7 +367,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
  */
 static void
 parse_minix(struct parsed_partitions *state, struct block_device *bdev,
-	    u32 offset, u32 size, int origin)
+	    sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_MINIX_SUBPARTITION
 	Sector sect;
@@ -390,7 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 		/* add each partition in use */
 		if (SYS_IND(p) == MINIX_PARTITION)
 			put_partition(state, state->next++,
-				      START_SECT(p), NR_SECTS(p));
+				      start_sect(p), nr_sects(p));
 	}
 	printk(" >\n");
 	}
@@ -401,7 +405,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 static struct {
 	unsigned char id;
 	void (*parse)(struct parsed_partitions *, struct block_device *,
-		      u32, u32, int);
+		      sector_t, sector_t, int);
 } subtypes[] = {
 	{FREEBSD_PARTITION, parse_freebsd},
 	{NETBSD_PARTITION, parse_netbsd},
@@ -415,7 +419,7 @@ static struct {
 
 int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
-	int sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t sector_size = bdev_logical_block_size(bdev) / 512;
 	Sector sect;
 	unsigned char *data;
 	struct partition *p;
@@ -483,14 +487,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 	state->next = 5;
 	for (slot = 1 ; slot <= 4 ; slot++, p++) {
-		u32 start = START_SECT(p)*sector_size;
-		u32 size = NR_SECTS(p)*sector_size;
+		sector_t start = start_sect(p)*sector_size;
+		sector_t size = nr_sects(p)*sector_size;
 		if (!size)
 			continue;
 		if (is_extended_partition(p)) {
-			/* prevent someone doing mkfs or mkswap on an
-			   extended partition, but leave room for LILO */
-			put_partition(state, slot, start, size == 1 ? 1 : 2);
+			/*
+			 * prevent someone doing mkfs or mkswap on an
+			 * extended partition, but leave room for LILO
+			 * FIXME: this uses one logical sector for > 512b
+			 * sector, although it may not be enough/proper.
+			 */
+			sector_t n = 2;
+			n = min(size, max(sector_size, n));
+			put_partition(state, slot, start, n);
+
 			printk(" <");
 			parse_extended(state, bdev, start, size);
 			printk(" >");
@@ -513,7 +524,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 		unsigned char id = SYS_IND(p);
 		int n;
 
-		if (!NR_SECTS(p))
+		if (!nr_sects(p))
 			continue;
 
 		for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +532,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 		if (!subtypes[n].parse)
 			continue;
-		subtypes[n].parse(state, bdev, START_SECT(p)*sector_size,
-				  NR_SECTS(p)*sector_size, slot);
+		subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
+				  nr_sects(p)*sector_size, slot);
 	}
 	put_dev_sector(sect);
 	return 1;
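
The common thread in the msdos.c hunks is that u32 sector arithmetic wraps once start * sector_size exceeds 2^32, i.e. past 2 TiB with 512-byte sectors, while sector_t is 64-bit on configurations built for large block devices. A standalone sketch of the wraparound (values illustrative):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	uint32_t start_sect = 0xFFFFFF00u;	/* near the 32-bit limit */
    	uint32_t sector_size = 8;		/* 4096-byte logical blocks / 512 */

    	uint32_t wrapped = start_sect * sector_size;		/* truncated to 32 bits */
    	uint64_t correct = (uint64_t)start_sect * sector_size;	/* sector_t-style math */

    	printf("u32: %u\nu64: %llu\n", wrapped, (unsigned long long)correct);
    	return 0;
    }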
diff --git a/fs/proc/array.c b/fs/proc/array.c
index aa8637b81028..e51f2ec2c5e5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -68,7 +68,6 @@
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/signal.h>
 #include <linux/highmem.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a7310841c831..7621db800a74 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,6 +81,7 @@
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 /* NOTE:
@@ -442,12 +443,13 @@ static const struct file_operations proc_lstats_operations = {
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
-	unsigned long points;
+	unsigned long points = 0;
 	struct timespec uptime;
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
-	points = badness(task->group_leader, uptime.tv_sec);
+	if (pid_alive(task))
+		points = badness(task, uptime.tv_sec);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
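
The proc_oom_score() change combines two defensive moves: the result gets a defined value up front, and the score is only computed while the task is verifiably alive. A compilable sketch of that shape (the types and compute_badness() are stand-ins, not the kernel API):

    #include <stdio.h>

    struct task { int alive; int weight; };

    static unsigned long compute_badness(const struct task *t)
    {
    	return (unsigned long)t->weight * 10;
    }

    static unsigned long oom_score(const struct task *t)
    {
    	unsigned long points = 0;	/* defined even if the task is gone */

    	if (t->alive)			/* guard before touching task state */
    		points = compute_badness(t);
    	return points;
    }

    int main(void)
    {
    	struct task dead = { 0, 7 }, live = { 1, 7 };

    	printf("%lu %lu\n", oom_score(&dead), oom_score(&live));
    	return 0;
    }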
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 08f4d71dacd7..43c127490606 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -13,6 +13,7 @@
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/idr.h>
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 445a02bcaab3..d35b23238fb1 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/smp_lock.h>
 #include <linux/sysctl.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..19979a2ce272 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -19,6 +19,7 @@
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <linux/list.h>
@@ -490,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 	}
 	read_unlock(&kclist_lock);
 
-	if (m == NULL) {
+	if (&m->list == &kclist_head) {
 		if (clear_user(buffer, tsz))
 			return -EFAULT;
 	} else if (is_vmalloc_or_module_addr((void *)start)) {
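
The read_kcore() fix relies on a property of list_for_each_entry(): after a walk completes without a break, the cursor is not NULL, it is the (bogus) container of the list head, so "nothing matched" must be tested by comparing the embedded node against the head. A hand-rolled userspace equivalent of the idiom:

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *next; };
    struct kcore_ent { unsigned long addr; struct list_head list; };

    #define container_of(ptr, type, member) \
    	((type *)((char *)(ptr) - offsetof(type, member)))

    int main(void)
    {
    	struct list_head kclist_head = { &kclist_head };	/* empty ring */
    	struct kcore_ent *m = NULL;
    	struct list_head *pos;

    	for (pos = kclist_head.next; pos != &kclist_head; pos = pos->next) {
    		m = container_of(pos, struct kcore_ent, list);
    		if (m->addr == 0x1000)
    			break;
    	}
    	/* 'm' may be stale (or never assigned); the loop cursor is the only
    	 * reliable witness -- exactly what '&m->list == &kclist_head' tests. */
    	if (pos == &kclist_head)
    		printf("address not covered by any entry\n");
    	return 0;
    }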
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 9fe7d7ebe115..b1822dde55c2 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
 #include <linux/mmzone.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index f8650dce74fb..ce94801f48ca 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -12,6 +12,7 @@
 #include <linux/string.h>
 #include <linux/of.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <asm/prom.h>
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270f1c38..9020ac15baaa 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -14,6 +14,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/module.h>
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b9b7aad2003d..bf31b03fc275 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,6 +1,5 @@
 #include <linux/cpumask.h>
 #include <linux/fs.h>
-#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 183f8ff5f400..070553427dd5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/swap.h>
@@ -406,6 +407,7 @@ static int show_smap(struct seq_file *m, void *v)
 
 	memset(&mss, 0, sizeof mss);
 	mss.vma = vma;
+	/* mmap_sem is held in m_start */
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 
@@ -552,7 +554,8 @@ const struct file_operations proc_clear_refs_operations = {
 };
 
 struct pagemapread {
-	u64 __user *out, *end;
+	int pos, len;
+	u64 *buffer;
 };
 
 #define PM_ENTRY_BYTES sizeof(u64)
@@ -575,10 +578,8 @@ struct pagemapread {
 static int add_to_pagemap(unsigned long addr, u64 pfn,
 			  struct pagemapread *pm)
 {
-	if (put_user(pfn, pm->out))
-		return -EFAULT;
-	pm->out++;
-	if (pm->out >= pm->end)
+	pm->buffer[pm->pos++] = pfn;
+	if (pm->pos >= pm->len)
 		return PM_END_OF_BUFFER;
 	return 0;
 }
@@ -661,31 +662,18 @@ static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
 	return pme;
 }
 
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
-				 unsigned long end, struct mm_walk *walk)
+/* This function walks within one hugetlb entry in the single call */
+static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+				 unsigned long addr, unsigned long end,
+				 struct mm_walk *walk)
 {
-	struct vm_area_struct *vma;
 	struct pagemapread *pm = walk->private;
-	struct hstate *hs = NULL;
 	int err = 0;
+	u64 pfn;
 
-	vma = find_vma(walk->mm, addr);
-	if (vma)
-		hs = hstate_vma(vma);
 	for (; addr != end; addr += PAGE_SIZE) {
-		u64 pfn = PM_NOT_PRESENT;
-
-		if (vma && (addr >= vma->vm_end)) {
-			vma = find_vma(walk->mm, addr);
-			if (vma)
-				hs = hstate_vma(vma);
-		}
-
-		if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
-			/* calculate pfn of the "raw" page in the hugepage. */
-			int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
-			pfn = huge_pte_to_pagemap_entry(*pte, offset);
-		}
+		int offset = (addr & ~hmask) >> PAGE_SHIFT;
+		pfn = huge_pte_to_pagemap_entry(*pte, offset);
 		err = add_to_pagemap(addr, pfn, pm);
 		if (err)
 			return err;
@@ -720,21 +708,20 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
  * determine which areas of memory are actually mapped and llseek to
  * skip over unmapped regions.
  */
+#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
 static ssize_t pagemap_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-	struct page **pages, *page;
-	unsigned long uaddr, uend;
 	struct mm_struct *mm;
 	struct pagemapread pm;
-	int pagecount;
 	int ret = -ESRCH;
 	struct mm_walk pagemap_walk = {};
 	unsigned long src;
 	unsigned long svpfn;
 	unsigned long start_vaddr;
 	unsigned long end_vaddr;
+	int copied = 0;
 
 	if (!task)
 		goto out;
@@ -757,35 +744,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!mm)
 		goto out_task;
 
-
-	uaddr = (unsigned long)buf & PAGE_MASK;
-	uend = (unsigned long)(buf + count);
-	pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
-	ret = 0;
-	if (pagecount == 0)
-		goto out_mm;
-	pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+	pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
 	ret = -ENOMEM;
-	if (!pages)
+	if (!pm.buffer)
 		goto out_mm;
 
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(current, current->mm, uaddr, pagecount,
-			     1, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-
-	if (ret < 0)
-		goto out_free;
-
-	if (ret != pagecount) {
-		pagecount = ret;
-		ret = -EFAULT;
-		goto out_pages;
-	}
-
-	pm.out = (u64 __user *)buf;
-	pm.end = (u64 __user *)(buf + count);
-
 	pagemap_walk.pmd_entry = pagemap_pte_range;
 	pagemap_walk.pte_hole = pagemap_pte_hole;
 	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -807,23 +771,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	 * user buffer is tracked in "pm", and the walk
 	 * will stop when we hit the end of the buffer.
 	 */
-	ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk);
-	if (ret == PM_END_OF_BUFFER)
-		ret = 0;
-	/* don't need mmap_sem for these, but this looks cleaner */
-	*ppos += (char __user *)pm.out - buf;
-	if (!ret)
-		ret = (char __user *)pm.out - buf;
-
-out_pages:
-	for (; pagecount; pagecount--) {
-		page = pages[pagecount-1];
-		if (!PageReserved(page))
-			SetPageDirty(page);
-		page_cache_release(page);
+	ret = 0;
+	while (count && (start_vaddr < end_vaddr)) {
+		int len;
+		unsigned long end;
+
+		pm.pos = 0;
+		end = start_vaddr + PAGEMAP_WALK_SIZE;
+		/* overflow ? */
+		if (end < start_vaddr || end > end_vaddr)
+			end = end_vaddr;
+		down_read(&mm->mmap_sem);
+		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
+		up_read(&mm->mmap_sem);
+		start_vaddr = end;
+
+		len = min(count, PM_ENTRY_BYTES * pm.pos);
+		if (copy_to_user(buf, pm.buffer, len)) {
+			ret = -EFAULT;
+			goto out_free;
+		}
+		copied += len;
+		buf += len;
+		count -= len;
 	}
+	*ppos += copied;
+	if (!ret || ret == PM_END_OF_BUFFER)
+		ret = copied;
+
 out_free:
-	kfree(pages);
+	kfree(pm.buffer);
 out_mm:
 	mmput(mm);
 out_task:
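
The pagemap_read() rewrite above swaps pinning the caller's pages with get_user_pages() for a small kernel-side bounce buffer, filled one PMD-sized chunk at a time and copied out between walks. The control flow, reduced to standalone C (memcpy stands in for copy_to_user(), produce() for walk_page_range(); sizes illustrative):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define CHUNK_ENTRIES 512		/* per-walk bounce buffer */
    #define TOTAL_ENTRIES 2000

    static size_t produce(uint64_t *dst, size_t max, uint64_t *next)
    {
    	size_t n;

    	for (n = 0; n < max && *next < TOTAL_ENTRIES; n++)
    		dst[n] = (*next)++;
    	return n;
    }

    int main(void)
    {
    	static uint64_t user_buf[TOTAL_ENTRIES];	/* stands in for the user buffer */
    	uint64_t bounce[CHUNK_ENTRIES];
    	uint64_t next = 0;
    	size_t copied = 0, n;

    	while ((n = produce(bounce, CHUNK_ENTRIES, &next)) > 0) {
    		memcpy(user_buf + copied, bounce, n * sizeof(*bounce));
    		copied += n;	/* advance, then refill the small buffer */
    	}
    	printf("copied %zu entries in %d-entry chunks\n", copied, CHUNK_ENTRIES);
    	return 0;
    }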
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d9fd64ef81a..46d4b5d72bd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -5,6 +5,7 @@
 #include <linux/fs_struct.h>
 #include <linux/mount.h>
 #include <linux/ptrace.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include "internal.h"
 
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0872afa58d39..9fbc99ec799a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
 #include <linux/user.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index dad7fb247ddc..3e21b1e2ad3a 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -33,6 +33,14 @@ config PRINT_QUOTA_WARNING
 	  Note that this behavior is currently deprecated and may go away in
 	  future. Please use notification via netlink socket instead.
 
+config QUOTA_DEBUG
+	bool "Additional quota sanity checks"
+	depends on QUOTA
+	default n
+	help
+	  If you say Y here, quota subsystem will perform some additional
+	  sanity checks of quota internal structures. If unsure, say N.
+
 # Generic support for tree structured quota files. Selected when needed.
 config QUOTA_TREE
 	tristate
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index e0b870f4749f..788b5802a7ce 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -80,8 +80,6 @@
 
 #include <asm/uaccess.h>
 
-#define __DQUOT_PARANOIA
-
 /*
  * There are three quota SMP locks. dq_list_lock protects all lists with quotas
  * and quota formats, dqstats structure containing statistics about the lists
@@ -695,7 +693,7 @@ void dqput(struct dquot *dquot)
 
 	if (!dquot)
 		return;
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	if (!atomic_read(&dquot->dq_count)) {
 		printk("VFS: dqput: trying to free free dquot\n");
 		printk("VFS: device %s, dquot of %s %d\n",
@@ -748,7 +746,7 @@ we_slept:
 		goto we_slept;
 	}
 	atomic_dec(&dquot->dq_count);
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	/* sanity check */
 	BUG_ON(!list_empty(&dquot->dq_free));
 #endif
@@ -845,7 +843,7 @@ we_slept:
 		dquot = NULL;
 		goto out;
 	}
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	BUG_ON(!dquot->dq_sb);	/* Has somebody invalidated entry under us? */
 #endif
 out:
@@ -874,14 +872,18 @@ static int dqinit_needed(struct inode *inode, int type)
 static void add_dquot_ref(struct super_block *sb, int type)
 {
 	struct inode *inode, *old_inode = NULL;
+#ifdef CONFIG_QUOTA_DEBUG
 	int reserved = 0;
+#endif
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
 			continue;
+#ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
 			reserved = 1;
+#endif
 		if (!atomic_read(&inode->i_writecount))
 			continue;
 		if (!dqinit_needed(inode, type))
@@ -903,11 +905,13 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	spin_unlock(&inode_lock);
 	iput(old_inode);
 
+#ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
 		printk(KERN_WARNING "VFS (%s): Writes happened before quota"
 			" was turned on thus quota information is probably "
 			"inconsistent. Please run quotacheck(8).\n", sb->s_id);
 	}
+#endif
 }
 
 /*
@@ -934,7 +938,7 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
 	inode->i_dquot[type] = NULL;
 	if (dquot) {
 		if (dqput_blocks(dquot)) {
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 			if (atomic_read(&dquot->dq_count) != 1)
 				printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
 #endif
@@ -2322,34 +2326,34 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	if (di->dqb_valid & QIF_SPACE) {
 		dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace;
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_BLIMITS) {
 		dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
 		dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_INODES) {
 		dm->dqb_curinodes = di->dqb_curinodes;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_ILIMITS) {
 		dm->dqb_isoftlimit = di->dqb_isoftlimit;
 		dm->dqb_ihardlimit = di->dqb_ihardlimit;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_BTIME) {
 		dm->dqb_btime = di->dqb_btime;
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_ITIME) {
 		dm->dqb_itime = di->dqb_itime;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
 	}
 
 	if (check_blim) {
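
Two patterns run through the dquot.c hunks. The always-on __DQUOT_PARANOIA macro becomes a user-selectable CONFIG_QUOTA_DEBUG Kconfig symbol, so the debug-only state and checks compile away entirely unless requested; and __set_bit() becomes set_bit(), presumably because dq_flags can be updated concurrently and needs the atomic variant. A minimal sketch of the Kconfig-gating shape (build with and without -DCONFIG_QUOTA_DEBUG to see the difference):

    #include <stdio.h>

    #ifdef CONFIG_QUOTA_DEBUG
    static int reserved;		/* debug-only state, absent otherwise */
    #endif

    static void account(long rsv)
    {
    #ifdef CONFIG_QUOTA_DEBUG
    	if (rsv > 0)
    		reserved = 1;	/* extra sanity bookkeeping */
    #endif
    	(void)rsv;
    }

    int main(void)
    {
    	account(4096);
    #ifdef CONFIG_QUOTA_DEBUG
    	if (reserved)
    		printf("writes happened before quota was enabled\n");
    #endif
    	return 0;
    }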
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 2663ed90fb03..d67908b407d9 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/quotaops.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/netlink.h>
 #include <net/genetlink.h>
 
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 1739a4aba25f..5ea4ad81a429 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -21,6 +21,7 @@
 #include <linux/pagevec.h>
 #include <linux/mman.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a6090aa1a7c1..c94853473ca9 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -35,6 +35,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
diff --git a/fs/read_write.c b/fs/read_write.c
index b7f4a1f94d48..113386d6fd2d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -258,6 +258,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
 
 	for (;;) {
 		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +314,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
 
 	for (;;) {
 		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
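
The do_sync_read()/do_sync_write() hunks prime both byte counters of the synchronous kiocb from len; leaving ki_nbytes at zero could make an ->aio_read/->aio_write implementation that consults it see an empty request. A trivial sketch of the setup, with field names following the diff (the struct itself is a stand-in, not the kernel definition):

    #include <stddef.h>
    #include <stdio.h>

    struct kiocb_like { long long ki_pos; size_t ki_left; size_t ki_nbytes; };

    int main(void)
    {
    	struct kiocb_like kiocb = { 0 };
    	size_t len = 4096;

    	kiocb.ki_left = len;	/* bytes remaining */
    	kiocb.ki_nbytes = len;	/* total request size: must agree */
    	printf("%zu %zu\n", kiocb.ki_left, kiocb.ki_nbytes);
    	return 0;
    }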
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index c094f58c7448..f8a6075abf50 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -8,6 +8,7 @@
 #include <linux/reiserfs_fs.h>
 #include <linux/stat.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 extern const struct reiserfs_key MIN_KEY;
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6591cb21edf6..1e4250bc3a6f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -35,6 +35,7 @@
  **/
 
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d1da94b82d8f..dc2c65e04853 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,6 +11,7 @@
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ba98546fabbd..19fbc810e8e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -50,6 +50,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 
@@ -2217,6 +2218,15 @@ static int journal_read_transaction(struct super_block *sb,
 		brelse(d_bh);
 		return 1;
 	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
+		brelse(c_bh);
+		brelse(d_bh);
+		return -EROFS;
+	}
+
 	trans_id = get_desc_trans_id(desc);
 	/* now we know we've got a good transaction, and it was inside the valid time ranges */
 	log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2459,12 +2469,6 @@ static int journal_read(struct super_block *sb)
 		goto start_log_replay;
 	}
 
-	if (continue_replay && bdev_read_only(sb->s_bdev)) {
-		reiserfs_warning(sb, "clm-2076",
-				 "device is readonly, unable to replay log");
-		return -1;
-	}
-
 	/* ok, there are transactions that need to be replayed.  start with the first log block, find
 	 ** all the valid transactions, and pick out the oldest.
 	 */
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 96e4cbbfaa18..d0c43cb99ffc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -13,6 +13,7 @@
 
 #include <linux/time.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 04bf5d791bda..59125fb36d42 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
 #include <asm/uaccess.h>
@@ -1618,10 +1619,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	save_mount_options(s, data);
 
 	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		errval = -ENOMEM;
-		goto error_alloc;
-	}
+	if (!sbi)
+		return -ENOMEM;
 	s->s_fs_info = sbi;
 	/* Set default values for options: non-aggressive tails, RO on errors */
 	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
@@ -1878,12 +1877,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	return (0);
 
 error:
-	reiserfs_write_unlock(s);
-error_alloc:
 	if (jinit_done) {	/* kill the commit thread, free journal ram */
 		journal_release_error(NULL, s);
 	}
 
+	reiserfs_write_unlock(s);
+
 	reiserfs_free_bitmap_cache(s);
 	if (SB_BUFFER_WITH_SB(s))
 		brelse(SB_BUFFER_WITH_SB(s));
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 37d034ca7d99..4f9586bb7631 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -38,6 +38,7 @@
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/errno.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index dd20a7883f0f..9cdb759645a9 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -5,6 +5,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/reiserfs_acl.h>
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index d8b5bfcbdd30..7271a477c041 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/security.h>
 #include <asm/uaccess.h>
@@ -76,7 +77,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 		return error;
 	}
 
-	if (sec->length) {
+	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
 		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
 			 reiserfs_xattr_nblocks(inode, sec->length);
 		/* We don't want to count the directories twice if we have
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1dabe4ee02fe..f329849ce3c0 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/list.h>
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 92d5e8ffb639..dbf6548bbf06 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -13,7 +13,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/net.h>
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 6bd9b691a463..0e39a924f10a 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -12,7 +12,6 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 00b2909bd469..54350b59046b 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,6 +15,7 @@
 #include <linux/pagemap.h>
 #include <linux/net.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 39208663aaf1..9313b6124a2e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -30,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 #include <linux/security.h>
+#include <linux/gfp.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index e80be2022a7f..32b911f4ee39 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -33,7 +33,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
 
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 4dd70e04333b..15a03d0fb9f3 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -24,6 +24,7 @@
 
 #include <linux/mutex.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <linux/zlib.h>
 
 #include "squashfs_fs.h"
diff --git a/fs/sync.c b/fs/sync.c
index f557d71cb097..fc5c3d75cf3c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 082daaecac1b..a4a0a9419711 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/security.h>
 #include "sysfs.h"
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 0cb10884a2fc..776137828dca 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -18,6 +18,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 
 #include "sysfs.h"
 
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1b9a3a1e8a17..b93ec51fa7ac 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/kobject.h>
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 1bfc95ad5f71..98158de91d24 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/time.h>
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 4775af401167..37fa7ed062d8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -45,6 +45,7 @@
 
 #include <linux/freezer.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 90492327b383..c2a68baa782f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,6 +34,7 @@
 #include <linux/moduleparam.h>
 #include <linux/debugfs.h>
 #include <linux/math64.h>
+#include <linux/slab.h>
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e26c02ab6cd5..5692cf72b807 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -52,6 +52,7 @@
52#include "ubifs.h" 52#include "ubifs.h"
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/slab.h>
55 56
56static int read_block(struct inode *inode, void *addr, unsigned int block, 57static int read_block(struct inode *inode, void *addr, unsigned int block,
57 struct ubifs_data_node *dn) 58 struct ubifs_data_node *dn)
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index e5a3d8e96bb7..918d1582ca05 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -53,6 +53,7 @@
  * good, and GC takes extra care when moving them.
  */
 
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/list_sort.h>
 #include "ubifs.h"
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e589fedaf1ef..77d5cf4a7547 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -51,6 +51,7 @@
  */
 
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index b2792e84d245..ad7f67b827ea 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -46,6 +46,7 @@
46#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h> 47#include <linux/crc16.h>
48#include <linux/math64.h> 48#include <linux/math64.h>
49#include <linux/slab.h>
49 50
50/** 51/**
51 * do_calc_lpt_geom - calculate sizes for the LPT area. 52 * do_calc_lpt_geom - calculate sizes for the LPT area.
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 8cbfb8248025..13cb7a4237bf 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -26,6 +26,7 @@
  */
 
 #include <linux/crc16.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 868a55ee080f..109c6ea03bb5 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 57085e43320f..96cb62c8a9dd 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -27,6 +27,7 @@
  */
 
 #include "ubifs.h"
+#include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/math64.h>
 
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e5b1a7d00fa0..2194915220e5 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2d976366a46..bd2542dad014 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -28,6 +28,7 @@
 #include <linux/fs.h>
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 195830f47569..c74400f88fe0 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
  */
 
 #include "ubifs.h"
+#include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
 
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 19626e2491c4..9a9378b4eb5a 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -125,9 +125,8 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 
 	mutex_lock(&sbi->s_alloc_mutex);
 	partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
-	if (bloc->logicalBlockNum < 0 ||
-	    (bloc->logicalBlockNum + count) >
-		partmap->s_partition_len) {
+	if (bloc->logicalBlockNum + count < count ||
+	    (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
 		udf_debug("%d < %d || %d + %d > %d\n",
 			  bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
 			  count, partmap->s_partition_len);
@@ -393,9 +392,8 @@ static void udf_table_free_blocks(struct super_block *sb,
 
 	mutex_lock(&sbi->s_alloc_mutex);
 	partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
-	if (bloc->logicalBlockNum < 0 ||
-	    (bloc->logicalBlockNum + count) >
-		partmap->s_partition_len) {
+	if (bloc->logicalBlockNum + count < count ||
+	    (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
 		udf_debug("%d < %d || %d + %d > %d\n",
 			  bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
 			  partmap->s_partition_len);
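The two udf_bitmap_free_blocks()/udf_table_free_blocks() hunks above replace a check that could never fire (logicalBlockNum is an unsigned 32-bit field, so "< 0" is always false) with a test for unsigned wraparound of the block range. A standalone sketch of the idiom (range_overflows() is an illustrative name, not UDF code):

#include <assert.h>
#include <stdint.h>

/* start + count wraps past 2^32 exactly when the sum lands below count */
static int range_overflows(uint32_t start, uint32_t count)
{
	return start + count < count;
}

int main(void)
{
	assert(!range_overflows(100, 28));		/* in range */
	assert(range_overflows(UINT32_MAX - 3, 8));	/* wraps */
	return 0;
}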
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1eb06774ed90..4b6a46ccbf46 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -218,7 +218,7 @@ const struct file_operations udf_file_operations = {
 	.llseek			= generic_file_llseek,
 };
 
-static int udf_setattr(struct dentry *dentry, struct iattr *iattr)
+int udf_setattr(struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index bb863fe579ac..8a3fbd177cab 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1314,7 +1314,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		break;
 	case ICBTAG_FILE_TYPE_SYMLINK:
 		inode->i_data.a_ops = &udf_symlink_aops;
-		inode->i_op = &page_symlink_inode_operations;
+		inode->i_op = &udf_symlink_inode_operations;
 		inode->i_mode = S_IFLNK | S_IRWXUGO;
 		break;
 	case ICBTAG_FILE_TYPE_MAIN:
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index db423ab078b1..75816025f95f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -925,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	iinfo = UDF_I(inode);
 	inode->i_mode = S_IFLNK | S_IRWXUGO;
 	inode->i_data.a_ops = &udf_symlink_aops;
-	inode->i_op = &page_symlink_inode_operations;
+	inode->i_op = &udf_symlink_inode_operations;
 
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
 		struct kernel_lb_addr eloc;
@@ -1393,6 +1393,7 @@ const struct export_operations udf_export_ops = {
 const struct inode_operations udf_dir_inode_operations = {
 	.lookup		= udf_lookup,
 	.create		= udf_create,
+	.setattr	= udf_setattr,
 	.link		= udf_link,
 	.unlink		= udf_unlink,
 	.symlink	= udf_symlink,
@@ -1401,3 +1402,9 @@ const struct inode_operations udf_dir_inode_operations = {
 	.mknod		= udf_mknod,
 	.rename		= udf_rename,
 };
+const struct inode_operations udf_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.setattr	= udf_setattr,
+};
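The symlink hunks move UDF from the VFS-generic page_symlink_inode_operations to a private table; the generic table provides no .setattr hook, so this appears to be what lets attribute changes on symlinks reach udf_setattr(), just as they now do for directories. A stripped-down sketch of the same pattern for a hypothetical filesystem (the examplefs names are illustrative):

#include <linux/fs.h>
#include <linux/namei.h>

/* A pagecache-backed symlink needs only the three generic VFS helpers;
 * .setattr would be the sole filesystem-specific entry in the table. */
static const struct inode_operations examplefs_symlink_iops = {
	.readlink	= generic_readlink,
	.follow_link	= page_follow_link_light,
	.put_link	= page_put_link,
	/* .setattr	= examplefs_setattr,	filesystem-specific hook */
};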
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4b540ee632d5..745eb209be0c 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
 
 #include <linux/fs.h>
 #include <linux/string.h>
-#include <linux/slab.h>
 #include <linux/buffer_head.h>
 
 uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 852e91845688..16064787d2b7 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -26,7 +26,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 4223ac855da9..702a1148e702 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -76,6 +76,7 @@ extern const struct inode_operations udf_dir_inode_operations;
 extern const struct file_operations udf_dir_operations;
 extern const struct inode_operations udf_file_inode_operations;
 extern const struct file_operations udf_file_operations;
+extern const struct inode_operations udf_symlink_inode_operations;
 extern const struct address_space_operations udf_aops;
 extern const struct address_space_operations udf_adinicb_aops;
 extern const struct address_space_operations udf_symlink_aops;
@@ -131,7 +132,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
 /* file.c */
 extern int udf_ioctl(struct inode *, struct file *, unsigned int,
 		     unsigned long);
-
+extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
 /* inode.c */
 extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
 extern int udf_sync_inode(struct inode *);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index cefa8c8913e6..d03a90b6ad69 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -24,6 +24,7 @@
 #include <linux/string.h>	/* for memset */
 #include <linux/nls.h>
 #include <linux/crc-itu-t.h>
+#include <linux/slab.h>
 
 #include "udf_sb.h"
 
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 05ac0fe9c4d3..8d5a506c82eb 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -6,9 +6,9 @@
  */
 
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/gfp.h>
 
 
 /*
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index bc7405585def..666c9db48eb6 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -17,6 +17,7 @@
  */
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index bf85bbe4a9ae..a7bc925c4d60 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -22,6 +22,7 @@
22#include "xfs_inode.h" 22#include "xfs_inode.h"
23#include "xfs_vnodeops.h" 23#include "xfs_vnodeops.h"
24#include "xfs_trace.h" 24#include "xfs_trace.h"
25#include <linux/slab.h>
25#include <linux/xattr.h> 26#include <linux/xattr.h>
26#include <linux/posix_acl_xattr.h> 27#include <linux/posix_acl_xattr.h>
27 28
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 9083357f9e44..0f8b9968a803 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -40,6 +40,7 @@
40#include "xfs_vnodeops.h" 40#include "xfs_vnodeops.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include <linux/gfp.h>
43#include <linux/mpage.h> 44#include <linux/mpage.h>
44#include <linux/pagevec.h> 45#include <linux/pagevec.h>
45#include <linux/writeback.h> 46#include <linux/writeback.h>
@@ -932,6 +933,9 @@ xfs_aops_discard_page(
 	if (!xfs_is_delayed_page(page, IOMAP_DELAY))
 		goto out_invalidate;
 
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		goto out_invalidate;
+
 	xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
 		"page discard on page %p, inode 0x%llx, offset %llu.",
 			page, ip->i_ino, offset);
@@ -964,8 +968,10 @@ xfs_aops_discard_page(
 
 		if (error) {
 			/* something screwed, just bail */
-			xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
-			"page discard failed delalloc mapping lookup.");
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+			"page discard failed delalloc mapping lookup.");
+			}
 			break;
 		}
 		if (!nimaps) {
@@ -991,8 +997,10 @@ xfs_aops_discard_page(
 		ASSERT(!flist.xbf_count && !flist.xbf_first);
 		if (error) {
 			/* something screwed, just bail */
-			xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
 			"page discard unable to remove delalloc mapping.");
+			}
 			break;
 		}
 next_buffer:
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 6f76ba85f193..44c2b0ef9a41 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -18,7 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/slab.h> 21#include <linux/gfp.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
@@ -168,75 +168,6 @@ test_page_region(
 }
 
 /*
- * Mapping of multi-page buffers into contiguous virtual space
- */
-
-typedef struct a_list {
-	void		*vm_addr;
-	struct a_list	*next;
-} a_list_t;
-
-static a_list_t		*as_free_head;
-static int		as_list_len;
-static DEFINE_SPINLOCK(as_lock);
-
-/*
- * Try to batch vunmaps because they are costly.
- */
-STATIC void
-free_address(
-	void		*addr)
-{
-	a_list_t	*aentry;
-
-#ifdef CONFIG_XEN
-	/*
-	 * Xen needs to be able to make sure it can get an exclusive
-	 * RO mapping of pages it wants to turn into a pagetable. If
-	 * a newly allocated page is also still being vmap()ed by xfs,
-	 * it will cause pagetable construction to fail. This is a
-	 * quick workaround to always eagerly unmap pages so that Xen
-	 * is happy.
-	 */
-	vunmap(addr);
-	return;
-#endif
-
-	aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
-	if (likely(aentry)) {
-		spin_lock(&as_lock);
-		aentry->next = as_free_head;
-		aentry->vm_addr = addr;
-		as_free_head = aentry;
-		as_list_len++;
-		spin_unlock(&as_lock);
-	} else {
-		vunmap(addr);
-	}
-}
-
-STATIC void
-purge_addresses(void)
-{
-	a_list_t	*aentry, *old;
-
-	if (as_free_head == NULL)
-		return;
-
-	spin_lock(&as_lock);
-	aentry = as_free_head;
-	as_free_head = NULL;
-	as_list_len = 0;
-	spin_unlock(&as_lock);
-
-	while ((old = aentry) != NULL) {
-		vunmap(aentry->vm_addr);
-		aentry = aentry->next;
-		kfree(old);
-	}
-}
-
-/*
  * Internal xfs_buf_t object manipulation
  */
 
@@ -337,7 +268,8 @@ xfs_buf_free(
 	uint		i;
 
 	if (xfs_buf_is_vmapped(bp))
-		free_address(bp->b_addr - bp->b_offset);
+		vm_unmap_ram(bp->b_addr - bp->b_offset,
+				bp->b_page_count);
 
 	for (i = 0; i < bp->b_page_count; i++) {
 		struct page	*page = bp->b_pages[i];
@@ -457,10 +389,8 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-		if (as_list_len > 64)
-			purge_addresses();
-		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
-					VM_MAP, PAGE_KERNEL);
+		bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+					-1, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
 			return -ENOMEM;
 		bp->b_addr += bp->b_offset;
@@ -1955,9 +1885,6 @@ xfsbufd(
 			xfs_buf_iostrategy(bp);
 			count++;
 		}
-
-		if (as_list_len > 0)
-			purge_addresses();
 		if (count)
 			blk_run_address_space(target->bt_mapping);
 
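The xfs_buf.c hunks delete roughly seventy lines of hand-rolled vmap() batching: free_address()/purge_addresses() existed only because vunmap() is costly and had to be deferred and batched by the caller. vm_map_ram()/vm_unmap_ram() perform that batching (and lazy TLB flushing) inside the vmalloc core instead. A minimal sketch of the replacement API, assuming the caller already holds an array of pages (example_map()/example_unmap() are illustrative names):

#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *example_map(struct page **pages, unsigned int count)
{
	/* -1 selects any NUMA node; replaces vmap(..., VM_MAP, PAGE_KERNEL) */
	return vm_map_ram(pages, count, -1, PAGE_KERNEL);
}

static void example_unmap(void *addr, unsigned int count)
{
	/* must be passed the same page count as the matching map call */
	vm_unmap_ram(addr, count);
}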
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4ea1ee18aded..7b26cc2fd284 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -58,6 +58,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/exportfs.h>
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0bf6d61f0528..593c05b4df8d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -18,6 +18,7 @@
 #include <linux/compat.h>
 #include <linux/ioctl.h>
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include "xfs.h"
 #include "xfs_fs.h"
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 61a99608731e..e65a7937f3a4 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -56,6 +56,7 @@
 #include <linux/security.h>
 #include <linux/falloc.h>
 #include <linux/fiemap.h>
+#include <linux/slab.h>
 
 /*
  * Bring the timestamps in the XFS inode uptodate.
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 71345a370d9f..52e06b487ced 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -61,6 +61,7 @@
 
 #include <linux/namei.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/mempool.h>
 #include <linux/writeback.h>
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 05cd85317f6f..fd9698215759 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -820,10 +820,10 @@ xfs_reclaim_inode(
 	 * call into reclaim to find it in a clean state instead of waiting for
 	 * it now. We also don't return errors here - if the error is transient
 	 * then the next reclaim pass will flush the inode, and if the error
-	 * is permanent then the next sync reclaim will relcaim the inode and
+	 * is permanent then the next sync reclaim will reclaim the inode and
 	 * pass on the error.
 	 */
-	if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
 			"inode 0x%llx background reclaim flush failed with %d",
 			(long long)ip->i_ino, error);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e8fba92d7cd9..2be019136287 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -745,9 +745,16 @@ xfs_log_move_tail(xfs_mount_t *mp,
 
 /*
  * Determine if we have a transaction that has gone to disk
- * that needs to be covered. Log activity needs to be idle (no AIL and
- * nothing in the iclogs). And, we need to be in the right state indicating
- * something has gone out.
+ * that needs to be covered. To begin the transition to the idle state
+ * firstly the log needs to be idle (no AIL and nothing in the iclogs).
+ * If we are then in a state where covering is needed, the caller is informed
+ * that dummy transactions are required to move the log into the idle state.
+ *
+ * Because this is called as part of the sync process, we should also indicate
+ * that dummy transactions should be issued in anything but the covered or
+ * idle states. This ensures that the log tail is accurately reflected in
+ * the log at the end of the sync, hence if a crash occurs it avoids replay
+ * of transactions where the metadata is already on disk.
  */
 int
 xfs_log_need_covered(xfs_mount_t *mp)
@@ -759,17 +766,24 @@ xfs_log_need_covered(xfs_mount_t *mp)
 		return 0;
 
 	spin_lock(&log->l_icloglock);
-	if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
-		(log->l_covered_state == XLOG_STATE_COVER_NEED2))
-			&& !xfs_trans_ail_tail(log->l_ailp)
-			&& xlog_iclogs_empty(log)) {
-		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
-			log->l_covered_state = XLOG_STATE_COVER_DONE;
-		else {
-			ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2);
-			log->l_covered_state = XLOG_STATE_COVER_DONE2;
+	switch (log->l_covered_state) {
+	case XLOG_STATE_COVER_DONE:
+	case XLOG_STATE_COVER_DONE2:
+	case XLOG_STATE_COVER_IDLE:
+		break;
+	case XLOG_STATE_COVER_NEED:
+	case XLOG_STATE_COVER_NEED2:
+		if (!xfs_trans_ail_tail(log->l_ailp) &&
+		    xlog_iclogs_empty(log)) {
+			if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+				log->l_covered_state = XLOG_STATE_COVER_DONE;
+			else
+				log->l_covered_state = XLOG_STATE_COVER_DONE2;
 		}
+		/* FALLTHRU */
+	default:
 		needed = 1;
+		break;
 	}
 	spin_unlock(&log->l_icloglock);
 	return needed;
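For reference, the covered states walked by the rewritten switch form a small two-round machine: a dummy transaction advances NEED to DONE (and NEED2 to DONE2), and cleaning the log then advances DONE to NEED2 and DONE2 to IDLE, so two sync passes are needed before the log is fully covered. An illustrative userspace model of that progression (state names abbreviated from xfs_log_priv.h; a sketch, not the kernel logic):

#include <stdio.h>

enum cover { NEED, DONE, NEED2, DONE2, IDLE };

static enum cover after_dummy(enum cover s)	/* dummy tx committed */
{
	return s == NEED ? DONE : s == NEED2 ? DONE2 : s;
}

static enum cover after_clean(enum cover s)	/* iclogs cleaned */
{
	return s == DONE ? NEED2 : s == DONE2 ? IDLE : s;
}

int main(void)
{
	enum cover s = NEED;
	for (int i = 0; i < 2; i++)	/* two passes: NEED -> ... -> IDLE */
		s = after_clean(after_dummy(s));
	printf("covered: %s\n", s == IDLE ? "yes" : "no");
	return 0;
}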