author	NeilBrown <neilb@suse.de>	2010-05-21 18:31:36 -0400
committer	NeilBrown <neilb@suse.de>	2010-05-21 18:31:36 -0400
commit	19fdb9eefb21b72edbc365b838502780c392bad6 (patch)
tree	deae04c48532d6eab64ed4b0396737bb854b5506 /fs
parent	be6800a73aa2f3dc14744c3b80e676d189789f04 (diff)
parent	3ff195b011d7decf501a4d55aeed312731094796 (diff)
Merge commit '3ff195b011d7decf501a4d55aeed312731094796' into for-linus
Conflicts:
	drivers/md/md.c

- Resolved conflict in md_update_sb
- Added extra 'NULL' arg to new instance of sysfs_get_dirent.

Signed-off-by: NeilBrown <neilb@suse.de>
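A note on the sysfs_get_dirent() change mentioned above: the sysfs work merged
in here gave sysfs_get_dirent() an extra namespace-tag parameter, so a call
site written against the old two-argument form has to pass NULL through the
new argument. A minimal sketch of the shape of that resolution follows; the
"sync_action" attribute name is illustrative, not quoted from md.c:

	/* before the merge: old two-argument form */
	sd = sysfs_get_dirent(mddev->kobj.sd, "sync_action");

	/* after the merge: untagged sysfs entries pass a NULL namespace tag */
	sd = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");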
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/cache.c | 1
-rw-r--r--  fs/9p/fid.c | 13
-rw-r--r--  fs/9p/v9fs.c | 32
-rw-r--r--  fs/9p/v9fs.h | 9
-rw-r--r--  fs/9p/vfs_dentry.c | 1
-rw-r--r--  fs/9p/vfs_dir.c | 14
-rw-r--r--  fs/9p/vfs_file.c | 4
-rw-r--r--  fs/9p/vfs_inode.c | 10
-rw-r--r--  fs/9p/vfs_super.c | 5
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/Makefile | 1
-rw-r--r--  fs/adfs/super.c | 1
-rw-r--r--  fs/affs/bitmap.c | 3
-rw-r--r--  fs/affs/inode.c | 1
-rw-r--r--  fs/affs/super.c | 1
-rw-r--r--  fs/afs/cache.c | 1
-rw-r--r--  fs/afs/cmservice.c | 1
-rw-r--r--  fs/afs/dir.c | 1
-rw-r--r--  fs/afs/file.c | 2
-rw-r--r--  fs/afs/fsclient.c | 1
-rw-r--r--  fs/afs/inode.c | 1
-rw-r--r--  fs/afs/internal.h | 2
-rw-r--r--  fs/afs/mntpt.c | 26
-rw-r--r--  fs/afs/rxrpc.c | 1
-rw-r--r--  fs/afs/security.c | 5
-rw-r--r--  fs/afs/super.c | 1
-rw-r--r--  fs/afs/vlclient.c | 1
-rw-r--r--  fs/afs/vlocation.c | 1
-rw-r--r--  fs/afs/vnode.c | 1
-rw-r--r--  fs/afs/volume.c | 7
-rw-r--r--  fs/anon_inodes.c | 2
-rw-r--r--  fs/autofs/root.c | 1
-rw-r--r--  fs/autofs4/dev-ioctl.c | 1
-rw-r--r--  fs/autofs4/root.c | 6
-rw-r--r--  fs/befs/datastream.c | 1
-rw-r--r--  fs/binfmt_aout.c | 16
-rw-r--r--  fs/binfmt_elf_fdpic.c | 11
-rw-r--r--  fs/binfmt_em86.c | 1
-rw-r--r--  fs/binfmt_flat.c | 2
-rw-r--r--  fs/binfmt_script.c | 1
-rw-r--r--  fs/bio-integrity.c | 1
-rw-r--r--  fs/bio.c | 11
-rw-r--r--  fs/block_dev.c | 22
-rw-r--r--  fs/btrfs/acl.c | 1
-rw-r--r--  fs/btrfs/async-thread.c | 1
-rw-r--r--  fs/btrfs/btrfs_inode.h | 5
-rw-r--r--  fs/btrfs/compression.c | 23
-rw-r--r--  fs/btrfs/ctree.c | 5
-rw-r--r--  fs/btrfs/ctree.h | 15
-rw-r--r--  fs/btrfs/delayed-ref.c | 1
-rw-r--r--  fs/btrfs/disk-io.c | 44
-rw-r--r--  fs/btrfs/export.c | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 55
-rw-r--r--  fs/btrfs/extent_io.c | 99
-rw-r--r--  fs/btrfs/extent_io.h | 10
-rw-r--r--  fs/btrfs/extent_map.c | 3
-rw-r--r--  fs/btrfs/file-item.c | 1
-rw-r--r--  fs/btrfs/file.c | 24
-rw-r--r--  fs/btrfs/free-space-cache.c | 5
-rw-r--r--  fs/btrfs/inode.c | 199
-rw-r--r--  fs/btrfs/ioctl.c | 715
-rw-r--r--  fs/btrfs/ioctl.h | 111
-rw-r--r--  fs/btrfs/locking.c | 1
-rw-r--r--  fs/btrfs/ordered-data.c | 48
-rw-r--r--  fs/btrfs/ordered-data.h | 9
-rw-r--r--  fs/btrfs/ref-cache.c | 1
-rw-r--r--  fs/btrfs/ref-cache.h | 2
-rw-r--r--  fs/btrfs/relocation.c | 9
-rw-r--r--  fs/btrfs/super.c | 259
-rw-r--r--  fs/btrfs/sysfs.c | 4
-rw-r--r--  fs/btrfs/transaction.c | 120
-rw-r--r--  fs/btrfs/tree-log.c | 3
-rw-r--r--  fs/btrfs/volumes.c | 56
-rw-r--r--  fs/buffer.c | 15
-rw-r--r--  fs/cachefiles/interface.c | 1
-rw-r--r--  fs/cachefiles/internal.h | 1
-rw-r--r--  fs/cachefiles/namei.c | 99
-rw-r--r--  fs/cachefiles/rdwr.c | 1
-rw-r--r--  fs/cachefiles/security.c | 4
-rw-r--r--  fs/cachefiles/xattr.c | 1
-rw-r--r--  fs/ceph/Kconfig | 27
-rw-r--r--  fs/ceph/Makefile | 39
-rw-r--r--  fs/ceph/README | 20
-rw-r--r--  fs/ceph/addr.c | 1187
-rw-r--r--  fs/ceph/armor.c | 99
-rw-r--r--  fs/ceph/auth.c | 259
-rw-r--r--  fs/ceph/auth.h | 84
-rw-r--r--  fs/ceph/auth_none.c | 122
-rw-r--r--  fs/ceph/auth_none.h | 30
-rw-r--r--  fs/ceph/auth_x.c | 668
-rw-r--r--  fs/ceph/auth_x.h | 49
-rw-r--r--  fs/ceph/auth_x_protocol.h | 90
-rw-r--r--  fs/ceph/buffer.c | 81
-rw-r--r--  fs/ceph/buffer.h | 39
-rw-r--r--  fs/ceph/caps.c | 2960
-rw-r--r--  fs/ceph/ceph_debug.h | 37
-rw-r--r--  fs/ceph/ceph_frag.c | 21
-rw-r--r--  fs/ceph/ceph_frag.h | 109
-rw-r--r--  fs/ceph/ceph_fs.c | 74
-rw-r--r--  fs/ceph/ceph_fs.h | 650
-rw-r--r--  fs/ceph/ceph_hash.c | 118
-rw-r--r--  fs/ceph/ceph_hash.h | 13
-rw-r--r--  fs/ceph/ceph_strings.c | 176
-rw-r--r--  fs/ceph/crush/crush.c | 151
-rw-r--r--  fs/ceph/crush/crush.h | 180
-rw-r--r--  fs/ceph/crush/hash.c | 149
-rw-r--r--  fs/ceph/crush/hash.h | 17
-rw-r--r--  fs/ceph/crush/mapper.c | 596
-rw-r--r--  fs/ceph/crush/mapper.h | 20
-rw-r--r--  fs/ceph/crypto.c | 409
-rw-r--r--  fs/ceph/crypto.h | 48
-rw-r--r--  fs/ceph/debugfs.c | 484
-rw-r--r--  fs/ceph/decode.h | 194
-rw-r--r--  fs/ceph/dir.c | 1233
-rw-r--r--  fs/ceph/export.c | 224
-rw-r--r--  fs/ceph/file.c | 939
-rw-r--r--  fs/ceph/inode.c | 1782
-rw-r--r--  fs/ceph/ioctl.c | 160
-rw-r--r--  fs/ceph/ioctl.h | 40
-rw-r--r--  fs/ceph/mds_client.c | 3047
-rw-r--r--  fs/ceph/mds_client.h | 335
-rw-r--r--  fs/ceph/mdsmap.c | 174
-rw-r--r--  fs/ceph/mdsmap.h | 54
-rw-r--r--  fs/ceph/messenger.c | 2284
-rw-r--r--  fs/ceph/messenger.h | 256
-rw-r--r--  fs/ceph/mon_client.c | 835
-rw-r--r--  fs/ceph/mon_client.h | 119
-rw-r--r--  fs/ceph/msgpool.c | 186
-rw-r--r--  fs/ceph/msgpool.h | 27
-rw-r--r--  fs/ceph/msgr.h | 158
-rw-r--r--  fs/ceph/osd_client.c | 1564
-rw-r--r--  fs/ceph/osd_client.h | 167
-rw-r--r--  fs/ceph/osdmap.c | 1081
-rw-r--r--  fs/ceph/osdmap.h | 128
-rw-r--r--  fs/ceph/pagelist.c | 55
-rw-r--r--  fs/ceph/pagelist.h | 54
-rw-r--r--  fs/ceph/rados.h | 377
-rw-r--r--  fs/ceph/snap.c | 911
-rw-r--r--  fs/ceph/super.c | 1041
-rw-r--r--  fs/ceph/super.h | 902
-rw-r--r--  fs/ceph/types.h | 29
-rw-r--r--  fs/ceph/xattr.c | 845
-rw-r--r--  fs/cifs/asn1.c | 105
-rw-r--r--  fs/cifs/cifs_debug.c | 48
-rw-r--r--  fs/cifs/cifs_debug.h | 42
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 37
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 3
-rw-r--r--  fs/cifs/cifs_spnego.c | 7
-rw-r--r--  fs/cifs/cifs_unicode.c | 6
-rw-r--r--  fs/cifs/cifsacl.c | 77
-rw-r--r--  fs/cifs/cifsencrypt.c | 11
-rw-r--r--  fs/cifs/cifsfs.c | 177
-rw-r--r--  fs/cifs/cifsfs.h | 5
-rw-r--r--  fs/cifs/cifsglob.h | 12
-rw-r--r--  fs/cifs/cifsproto.h | 36
-rw-r--r--  fs/cifs/cifssmb.c | 601
-rw-r--r--  fs/cifs/connect.c | 640
-rw-r--r--  fs/cifs/dir.c | 93
-rw-r--r--  fs/cifs/dns_resolve.c | 17
-rw-r--r--  fs/cifs/export.c | 2
-rw-r--r--  fs/cifs/file.c | 254
-rw-r--r--  fs/cifs/inode.c | 422
-rw-r--r--  fs/cifs/ioctl.c | 10
-rw-r--r--  fs/cifs/link.c | 11
-rw-r--r--  fs/cifs/misc.c | 81
-rw-r--r--  fs/cifs/netmisc.c | 16
-rw-r--r--  fs/cifs/readdir.c | 86
-rw-r--r--  fs/cifs/sess.c | 82
-rw-r--r--  fs/cifs/smbencrypt.c | 1
-rw-r--r--  fs/cifs/transport.c | 93
-rw-r--r--  fs/cifs/xattr.c | 41
-rw-r--r--  fs/coda/dir.c | 1
-rw-r--r--  fs/coda/file.c | 1
-rw-r--r--  fs/coda/inode.c | 9
-rw-r--r--  fs/coda/upcall.c | 1
-rw-r--r--  fs/compat.c | 21
-rw-r--r--  fs/compat_ioctl.c | 5
-rw-r--r--  fs/configfs/dir.c | 4
-rw-r--r--  fs/configfs/inode.c | 1
-rw-r--r--  fs/configfs/mount.c | 1
-rw-r--r--  fs/configfs/symlink.c | 1
-rw-r--r--  fs/debugfs/inode.c | 1
-rw-r--r--  fs/devpts/inode.c | 1
-rw-r--r--  fs/dlm/config.c | 1
-rw-r--r--  fs/dlm/debug_fs.c | 1
-rw-r--r--  fs/dlm/lock.c | 6
-rw-r--r--  fs/dlm/lockspace.c | 2
-rw-r--r--  fs/dlm/lowcomms.c | 1
-rw-r--r--  fs/dlm/member.c | 2
-rw-r--r--  fs/dlm/netlink.c | 1
-rw-r--r--  fs/dlm/plock.c | 1
-rw-r--r--  fs/dlm/user.c | 89
-rw-r--r--  fs/ecryptfs/crypto.c | 38
-rw-r--r--  fs/ecryptfs/dentry.c | 1
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 15
-rw-r--r--  fs/ecryptfs/file.c | 1
-rw-r--r--  fs/ecryptfs/inode.c | 130
-rw-r--r--  fs/ecryptfs/keystore.c | 1
-rw-r--r--  fs/ecryptfs/kthread.c | 1
-rw-r--r--  fs/ecryptfs/main.c | 11
-rw-r--r--  fs/ecryptfs/messaging.c | 1
-rw-r--r--  fs/ecryptfs/miscdev.c | 1
-rw-r--r--  fs/ecryptfs/mmap.c | 39
-rw-r--r--  fs/ecryptfs/super.c | 3
-rw-r--r--  fs/eventfd.c | 1
-rw-r--r--  fs/eventpoll.c | 3
-rw-r--r--  fs/exec.c | 2
-rw-r--r--  fs/exofs/exofs.h | 2
-rw-r--r--  fs/exofs/inode.c | 1
-rw-r--r--  fs/exofs/ios.c | 1
-rw-r--r--  fs/exofs/super.c | 9
-rw-r--r--  fs/ext2/balloc.c | 1
-rw-r--r--  fs/ext2/symlink.c | 2
-rw-r--r--  fs/ext2/xattr_security.c | 1
-rw-r--r--  fs/ext3/balloc.c | 1
-rw-r--r--  fs/ext3/ialloc.c | 4
-rw-r--r--  fs/ext3/inode.c | 2
-rw-r--r--  fs/ext3/super.c | 2
-rw-r--r--  fs/ext3/symlink.c | 2
-rw-r--r--  fs/ext3/xattr_security.c | 1
-rw-r--r--  fs/ext4/block_validity.c | 1
-rw-r--r--  fs/ext4/extents.c | 1
-rw-r--r--  fs/ext4/ialloc.c | 4
-rw-r--r--  fs/ext4/inode.c | 8
-rw-r--r--  fs/ext4/mballoc.c | 24
-rw-r--r--  fs/ext4/migrate.c | 1
-rw-r--r--  fs/ext4/move_extent.c | 1
-rw-r--r--  fs/ext4/super.c | 33
-rw-r--r--  fs/ext4/xattr_security.c | 1
-rw-r--r--  fs/fat/cache.c | 1
-rw-r--r--  fs/fat/inode.c | 2
-rw-r--r--  fs/fat/namei_vfat.c | 33
-rw-r--r--  fs/fcntl.c | 66
-rw-r--r--  fs/fifo.c | 1
-rw-r--r--  fs/filesystems.c | 2
-rw-r--r--  fs/freevxfs/vxfs_subr.c | 1
-rw-r--r--  fs/fs-writeback.c | 134
-rw-r--r--  fs/fscache/Kconfig | 1
-rw-r--r--  fs/fscache/object-list.c | 1
-rw-r--r--  fs/fscache/object.c | 6
-rw-r--r--  fs/fscache/operation.c | 5
-rw-r--r--  fs/fscache/page.c | 2
-rw-r--r--  fs/fscache/stats.c | 4
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/fuse/inode.c | 2
-rw-r--r--  fs/generic_acl.c | 1
-rw-r--r--  fs/gfs2/Kconfig | 1
-rw-r--r--  fs/gfs2/aops.c | 8
-rw-r--r--  fs/gfs2/bmap.c | 18
-rw-r--r--  fs/gfs2/dentry.c | 1
-rw-r--r--  fs/gfs2/dir.c | 2
-rw-r--r--  fs/gfs2/export.c | 3
-rw-r--r--  fs/gfs2/file.c | 2
-rw-r--r--  fs/gfs2/glock.c | 3
-rw-r--r--  fs/gfs2/glops.c | 1
-rw-r--r--  fs/gfs2/incore.h | 13
-rw-r--r--  fs/gfs2/inode.c | 101
-rw-r--r--  fs/gfs2/inode.h | 5
-rw-r--r--  fs/gfs2/lock_dlm.c | 1
-rw-r--r--  fs/gfs2/log.c | 161
-rw-r--r--  fs/gfs2/log.h | 1
-rw-r--r--  fs/gfs2/lops.c | 2
-rw-r--r--  fs/gfs2/main.c | 2
-rw-r--r--  fs/gfs2/meta_io.c | 5
-rw-r--r--  fs/gfs2/ops_fstype.c | 21
-rw-r--r--  fs/gfs2/quota.c | 102
-rw-r--r--  fs/gfs2/rgrp.c | 68
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/super.c | 11
-rw-r--r--  fs/gfs2/sys.c | 11
-rw-r--r--  fs/gfs2/trans.c | 18
-rw-r--r--  fs/gfs2/util.c | 1
-rw-r--r--  fs/hfs/bnode.c | 1
-rw-r--r--  fs/hfs/btree.c | 1
-rw-r--r--  fs/hfs/mdb.c | 1
-rw-r--r--  fs/hfs/super.c | 1
-rw-r--r--  fs/hfsplus/options.c | 1
-rw-r--r--  fs/hostfs/hostfs_kern.c | 1
-rw-r--r--  fs/hpfs/buffer.c | 1
-rw-r--r--  fs/hpfs/dir.c | 1
-rw-r--r--  fs/hpfs/inode.c | 1
-rw-r--r--  fs/hpfs/super.c | 1
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/ioctl.c | 92
-rw-r--r--  fs/ioprio.c | 1
-rw-r--r--  fs/isofs/dir.c | 1
-rw-r--r--  fs/isofs/namei.c | 1
-rw-r--r--  fs/jbd/commit.c | 1
-rw-r--r--  fs/jbd/recovery.c | 1
-rw-r--r--  fs/jbd/transaction.c | 2
-rw-r--r--  fs/jbd2/journal.c | 2
-rw-r--r--  fs/jbd2/recovery.c | 1
-rw-r--r--  fs/jffs2/background.c | 3
-rw-r--r--  fs/jffs2/compr_lzo.c | 1
-rw-r--r--  fs/jffs2/compr_zlib.c | 1
-rw-r--r--  fs/jffs2/debug.c | 1
-rw-r--r--  fs/jffs2/erase.c | 12
-rw-r--r--  fs/jffs2/file.c | 1
-rw-r--r--  fs/jffs2/fs.c | 10
-rw-r--r--  fs/jffs2/gc.c | 17
-rw-r--r--  fs/jffs2/nodelist.c | 1
-rw-r--r--  fs/jffs2/nodelist.h | 10
-rw-r--r--  fs/jffs2/nodemgmt.c | 29
-rw-r--r--  fs/jffs2/os-linux.h | 3
-rw-r--r--  fs/jffs2/readinode.c | 2
-rw-r--r--  fs/jffs2/scan.c | 4
-rw-r--r--  fs/jffs2/super.c | 2
-rw-r--r--  fs/jffs2/symlink.c | 1
-rw-r--r--  fs/jffs2/wbuf.c | 8
-rw-r--r--  fs/jffs2/write.c | 1
-rw-r--r--  fs/jfs/acl.c | 1
-rw-r--r--  fs/jfs/inode.c | 2
-rw-r--r--  fs/jfs/jfs_dmap.c | 19
-rw-r--r--  fs/jfs/jfs_dmap.h | 6
-rw-r--r--  fs/jfs/jfs_dtree.c | 1
-rw-r--r--  fs/jfs/jfs_imap.c | 1
-rw-r--r--  fs/jfs/jfs_inode.h | 1
-rw-r--r--  fs/jfs/jfs_logmgr.c | 1
-rw-r--r--  fs/jfs/jfs_metapage.c | 1
-rw-r--r--  fs/jfs/jfs_unicode.h | 1
-rw-r--r--  fs/jfs/namei.c | 4
-rw-r--r--  fs/jfs/resize.c | 6
-rw-r--r--  fs/jfs/super.c | 14
-rw-r--r--  fs/jfs/symlink.c | 14
-rw-r--r--  fs/jfs/xattr.c | 1
-rw-r--r--  fs/libfs.c | 36
-rw-r--r--  fs/lockd/clntlock.c | 1
-rw-r--r--  fs/lockd/clntproc.c | 1
-rw-r--r--  fs/lockd/mon.c | 1
-rw-r--r--  fs/lockd/svc.c | 1
-rw-r--r--  fs/lockd/svc4proc.c | 1
-rw-r--r--  fs/lockd/svclock.c | 1
-rw-r--r--  fs/lockd/svcproc.c | 1
-rw-r--r--  fs/lockd/svcsubs.c | 1
-rw-r--r--  fs/locks.c | 2
-rw-r--r--  fs/logfs/dev_bdev.c | 16
-rw-r--r--  fs/logfs/dev_mtd.c | 26
-rw-r--r--  fs/logfs/dir.c | 8
-rw-r--r--  fs/logfs/file.c | 16
-rw-r--r--  fs/logfs/gc.c | 58
-rw-r--r--  fs/logfs/inode.c | 7
-rw-r--r--  fs/logfs/journal.c | 44
-rw-r--r--  fs/logfs/logfs.h | 31
-rw-r--r--  fs/logfs/logfs_abi.h | 10
-rw-r--r--  fs/logfs/readwrite.c | 106
-rw-r--r--  fs/logfs/segment.c | 70
-rw-r--r--  fs/logfs/super.c | 45
-rw-r--r--  fs/minix/itree_v1.c | 1
-rw-r--r--  fs/mpage.c | 3
-rw-r--r--  fs/namei.c | 45
-rw-r--r--  fs/namespace.c | 19
-rw-r--r--  fs/ncpfs/dir.c | 1
-rw-r--r--  fs/ncpfs/file.c | 1
-rw-r--r--  fs/ncpfs/inode.c | 8
-rw-r--r--  fs/ncpfs/ioctl.c | 1
-rw-r--r--  fs/ncpfs/mmap.c | 2
-rw-r--r--  fs/ncpfs/sock.c | 1
-rw-r--r--  fs/ncpfs/symlink.c | 1
-rw-r--r--  fs/nfs/cache_lib.c | 1
-rw-r--r--  fs/nfs/callback_proc.c | 1
-rw-r--r--  fs/nfs/callback_xdr.c | 2
-rw-r--r--  fs/nfs/client.c | 61
-rw-r--r--  fs/nfs/delegation.c | 89
-rw-r--r--  fs/nfs/delegation.h | 6
-rw-r--r--  fs/nfs/dir.c | 151
-rw-r--r--  fs/nfs/direct.c | 1
-rw-r--r--  fs/nfs/dns_resolve.c | 1
-rw-r--r--  fs/nfs/file.c | 20
-rw-r--r--  fs/nfs/fscache.c | 4
-rw-r--r--  fs/nfs/getroot.c | 191
-rw-r--r--  fs/nfs/inode.c | 69
-rw-r--r--  fs/nfs/internal.h | 4
-rw-r--r--  fs/nfs/iostat.h | 6
-rw-r--r--  fs/nfs/namespace.c | 21
-rw-r--r--  fs/nfs/nfs2xdr.c | 1
-rw-r--r--  fs/nfs/nfs3acl.c | 24
-rw-r--r--  fs/nfs/nfs3proc.c | 129
-rw-r--r--  fs/nfs/nfs3xdr.c | 3
-rw-r--r--  fs/nfs/nfs4_fs.h | 8
-rw-r--r--  fs/nfs/nfs4namespace.c | 13
-rw-r--r--  fs/nfs/nfs4proc.c | 186
-rw-r--r--  fs/nfs/nfs4state.c | 36
-rw-r--r--  fs/nfs/nfs4xdr.c | 25
-rw-r--r--  fs/nfs/nfsroot.c | 14
-rw-r--r--  fs/nfs/pagelist.c | 37
-rw-r--r--  fs/nfs/proc.c | 145
-rw-r--r--  fs/nfs/read.c | 4
-rw-r--r--  fs/nfs/super.c | 176
-rw-r--r--  fs/nfs/symlink.c | 1
-rw-r--r--  fs/nfs/unlink.c | 4
-rw-r--r--  fs/nfs/write.c | 95
-rw-r--r--  fs/nfs_common/nfsacl.c | 1
-rw-r--r--  fs/nfsd/export.c | 45
-rw-r--r--  fs/nfsd/nfs2acl.c | 1
-rw-r--r--  fs/nfsd/nfs3acl.c | 1
-rw-r--r--  fs/nfsd/nfs4acl.c | 1
-rw-r--r--  fs/nfsd/nfs4callback.c | 141
-rw-r--r--  fs/nfsd/nfs4idmap.c | 1
-rw-r--r--  fs/nfsd/nfs4proc.c | 51
-rw-r--r--  fs/nfsd/nfs4recover.c | 1
-rw-r--r--  fs/nfsd/nfs4state.c | 377
-rw-r--r--  fs/nfsd/nfs4xdr.c | 38
-rw-r--r--  fs/nfsd/nfscache.c | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 65
-rw-r--r--  fs/nfsd/nfsd.h | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 2
-rw-r--r--  fs/nfsd/state.h | 47
-rw-r--r--  fs/nfsd/vfs.c | 9
-rw-r--r--  fs/nfsd/vfs.h | 1
-rw-r--r--  fs/nfsd/xdr4.h | 11
-rw-r--r--  fs/nilfs2/alloc.c | 157
-rw-r--r--  fs/nilfs2/alloc.h | 9
-rw-r--r--  fs/nilfs2/btnode.c | 1
-rw-r--r--  fs/nilfs2/btree.c | 93
-rw-r--r--  fs/nilfs2/btree.h | 23
-rw-r--r--  fs/nilfs2/dat.c | 2
-rw-r--r--  fs/nilfs2/dir.c | 2
-rw-r--r--  fs/nilfs2/gcinode.c | 5
-rw-r--r--  fs/nilfs2/inode.c | 5
-rw-r--r--  fs/nilfs2/ioctl.c | 3
-rw-r--r--  fs/nilfs2/mdt.c | 1
-rw-r--r--  fs/nilfs2/page.c | 5
-rw-r--r--  fs/nilfs2/recovery.c | 3
-rw-r--r--  fs/nilfs2/segbuf.c | 89
-rw-r--r--  fs/nilfs2/segbuf.h | 10
-rw-r--r--  fs/nilfs2/segment.c | 177
-rw-r--r--  fs/nilfs2/segment.h | 10
-rw-r--r--  fs/nilfs2/sufile.c | 2
-rw-r--r--  fs/nilfs2/super.c | 221
-rw-r--r--  fs/nilfs2/the_nilfs.c | 12
-rw-r--r--  fs/nilfs2/the_nilfs.h | 1
-rw-r--r--  fs/notify/fsnotify.c | 1
-rw-r--r--  fs/notify/inode_mark.c | 1
-rw-r--r--  fs/notify/inotify/Kconfig | 1
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 2
-rw-r--r--  fs/notify/inotify/inotify_user.c | 16
-rw-r--r--  fs/ntfs/ChangeLog | 1702
-rw-r--r--  fs/ntfs/aops.c | 1
-rw-r--r--  fs/ntfs/attrib.c | 1
-rw-r--r--  fs/ntfs/compress.c | 1
-rw-r--r--  fs/ntfs/dir.c | 1
-rw-r--r--  fs/ntfs/file.c | 1
-rw-r--r--  fs/ntfs/index.c | 2
-rw-r--r--  fs/ntfs/mft.c | 1
-rw-r--r--  fs/ntfs/namei.c | 1
-rw-r--r--  fs/ntfs/super.c | 25
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/acl.c | 78
-rw-r--r--  fs/ocfs2/alloc.c | 908
-rw-r--r--  fs/ocfs2/alloc.h | 12
-rw-r--r--  fs/ocfs2/aops.c | 3
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 3
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 3
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 1
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 1
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 1
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 7
-rw-r--r--  fs/ocfs2/dir.c | 75
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 14
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 4
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 5
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 28
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 34
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 27
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 17
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 4
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 14
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/extent_map.c | 3
-rw-r--r--  fs/ocfs2/file.c | 247
-rw-r--r--  fs/ocfs2/heartbeat.c | 1
-rw-r--r--  fs/ocfs2/inode.c | 129
-rw-r--r--  fs/ocfs2/inode.h | 4
-rw-r--r--  fs/ocfs2/journal.c | 26
-rw-r--r--  fs/ocfs2/journal.h | 15
-rw-r--r--  fs/ocfs2/localalloc.c | 285
-rw-r--r--  fs/ocfs2/localalloc.h | 3
-rw-r--r--  fs/ocfs2/locks.c | 2
-rw-r--r--  fs/ocfs2/mmap.c | 49
-rw-r--r--  fs/ocfs2/namei.c | 175
-rw-r--r--  fs/ocfs2/ocfs2.h | 36
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 144
-rw-r--r--  fs/ocfs2/quota_global.c | 5
-rw-r--r--  fs/ocfs2/quota_local.c | 51
-rw-r--r--  fs/ocfs2/refcounttree.c | 79
-rw-r--r--  fs/ocfs2/refcounttree.h | 4
-rw-r--r--  fs/ocfs2/reservations.c | 847
-rw-r--r--  fs/ocfs2/reservations.h | 159
-rw-r--r--  fs/ocfs2/resize.c | 19
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 1
-rw-r--r--  fs/ocfs2/stack_user.c | 1
-rw-r--r--  fs/ocfs2/suballoc.c | 817
-rw-r--r--  fs/ocfs2/suballoc.h | 26
-rw-r--r--  fs/ocfs2/super.c | 88
-rw-r--r--  fs/ocfs2/super.h | 7
-rw-r--r--  fs/ocfs2/sysfile.c | 1
-rw-r--r--  fs/ocfs2/xattr.c | 115
-rw-r--r--  fs/omfs/inode.c | 2
-rw-r--r--  fs/open.c | 2
-rw-r--r--  fs/partitions/check.c | 1
-rw-r--r--  fs/partitions/efi.c | 1
-rw-r--r--  fs/partitions/msdos.c | 85
-rw-r--r--  fs/proc/array.c | 4
-rw-r--r--  fs/proc/base.c | 18
-rw-r--r--  fs/proc/generic.c | 1
-rw-r--r--  fs/proc/inode.c | 5
-rw-r--r--  fs/proc/kcore.c | 4
-rw-r--r--  fs/proc/kmsg.c | 1
-rw-r--r--  fs/proc/nommu.c | 1
-rw-r--r--  fs/proc/proc_devtree.c | 1
-rw-r--r--  fs/proc/proc_net.c | 1
-rw-r--r--  fs/proc/stat.c | 1
-rw-r--r--  fs/proc/task_mmu.c | 134
-rw-r--r--  fs/proc/task_nommu.c | 1
-rw-r--r--  fs/proc/vmcore.c | 2
-rw-r--r--  fs/qnx4/inode.c | 3
-rw-r--r--  fs/quota/Kconfig | 8
-rw-r--r--  fs/quota/dquot.c | 28
-rw-r--r--  fs/quota/netlink.c | 1
-rw-r--r--  fs/ramfs/file-nommu.c | 1
-rw-r--r--  fs/ramfs/inode.c | 3
-rw-r--r--  fs/read_write.c | 2
-rw-r--r--  fs/reiserfs/bitmap.c | 2
-rw-r--r--  fs/reiserfs/dir.c | 3
-rw-r--r--  fs/reiserfs/fix_node.c | 1
-rw-r--r--  fs/reiserfs/inode.c | 1
-rw-r--r--  fs/reiserfs/journal.c | 16
-rw-r--r--  fs/reiserfs/namei.c | 1
-rw-r--r--  fs/reiserfs/super.c | 11
-rw-r--r--  fs/reiserfs/xattr.c | 20
-rw-r--r--  fs/reiserfs/xattr_acl.c | 1
-rw-r--r--  fs/reiserfs/xattr_security.c | 3
-rw-r--r--  fs/select.c | 17
-rw-r--r--  fs/signalfd.c | 1
-rw-r--r--  fs/smbfs/file.c | 1
-rw-r--r--  fs/smbfs/inode.c | 8
-rw-r--r--  fs/smbfs/smbiod.c | 1
-rw-r--r--  fs/smbfs/symlink.c | 1
-rw-r--r--  fs/splice.c | 1
-rw-r--r--  fs/squashfs/block.c | 5
-rw-r--r--  fs/squashfs/super.c | 4
-rw-r--r--  fs/squashfs/symlink.c | 1
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 4
-rw-r--r--  fs/super.c | 9
-rw-r--r--  fs/sync.c | 4
-rw-r--r--  fs/sysfs/bin.c | 52
-rw-r--r--  fs/sysfs/dir.c | 222
-rw-r--r--  fs/sysfs/file.c | 64
-rw-r--r--  fs/sysfs/group.c | 6
-rw-r--r--  fs/sysfs/inode.c | 18
-rw-r--r--  fs/sysfs/mount.c | 95
-rw-r--r--  fs/sysfs/symlink.c | 51
-rw-r--r--  fs/sysfs/sysfs.h | 40
-rw-r--r--  fs/sysv/dir.c | 2
-rw-r--r--  fs/timerfd.c | 26
-rw-r--r--  fs/ubifs/commit.c | 1
-rw-r--r--  fs/ubifs/debug.c | 1
-rw-r--r--  fs/ubifs/file.c | 1
-rw-r--r--  fs/ubifs/gc.c | 1
-rw-r--r--  fs/ubifs/io.c | 2
-rw-r--r--  fs/ubifs/lpt.c | 1
-rw-r--r--  fs/ubifs/lpt_commit.c | 1
-rw-r--r--  fs/ubifs/recovery.c | 1
-rw-r--r--  fs/ubifs/sb.c | 1
-rw-r--r--  fs/ubifs/tnc.c | 1
-rw-r--r--  fs/ubifs/ubifs.h | 1
-rw-r--r--  fs/ubifs/xattr.c | 1
-rw-r--r--  fs/udf/balloc.c | 59
-rw-r--r--  fs/udf/file.c | 2
-rw-r--r--  fs/udf/inode.c | 38
-rw-r--r--  fs/udf/namei.c | 9
-rw-r--r--  fs/udf/partition.c | 1
-rw-r--r--  fs/udf/symlink.c | 1
-rw-r--r--  fs/udf/udfdecl.h | 3
-rw-r--r--  fs/udf/unicode.c | 1
-rw-r--r--  fs/ufs/super.c | 3
-rw-r--r--  fs/ufs/ufs_fs.h | 15
-rw-r--r--  fs/xattr_acl.c | 2
-rw-r--r--  fs/xfs/linux-2.6/kmem.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 245
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 110
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 16
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 207
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 150
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 193
-rw-r--r--  fs/xfs/quota/xfs_dquot.h | 35
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c | 30
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 609
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 23
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.c | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 155
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h | 102
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c | 29
-rw-r--r--  fs/xfs/xfs_ag.h | 1
-rw-r--r--  fs/xfs/xfs_bmap.c | 2
-rw-r--r--  fs/xfs/xfs_buf_item.c | 55
-rw-r--r--  fs/xfs/xfs_buf_item.h | 2
-rw-r--r--  fs/xfs/xfs_dfrag.c | 22
-rw-r--r--  fs/xfs/xfs_error.c | 30
-rw-r--r--  fs/xfs/xfs_error.h | 9
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 18
-rw-r--r--  fs/xfs/xfs_inode.c | 2
-rw-r--r--  fs/xfs/xfs_inode_item.c | 21
-rw-r--r--  fs/xfs/xfs_iomap.c | 123
-rw-r--r--  fs/xfs/xfs_iomap.h | 47
-rw-r--r--  fs/xfs/xfs_log.c | 740
-rw-r--r--  fs/xfs/xfs_log.h | 13
-rw-r--r--  fs/xfs/xfs_log_priv.h | 12
-rw-r--r--  fs/xfs/xfs_log_recover.c | 311
-rw-r--r--  fs/xfs/xfs_mount.c | 7
-rw-r--r--  fs/xfs/xfs_mount.h | 1
-rw-r--r--  fs/xfs/xfs_quota.h | 3
-rw-r--r--  fs/xfs/xfs_trans.c | 760
-rw-r--r--  fs/xfs/xfs_trans.h | 14
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 187
625 files changed, 40541 insertions(+), 10026 deletions(-)
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index e777961939f3..0dbe0d139ac2 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -22,6 +22,7 @@
 
 #include <linux/jiffies.h>
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 08b2eb157048..7317b39b2815 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -110,7 +111,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
 	int i, n, l, clone, any, access;
 	u32 uid;
-	struct p9_fid *fid;
+	struct p9_fid *fid, *old_fid = NULL;
 	struct dentry *d, *ds;
 	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
@@ -183,10 +184,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 		l = min(n - i, P9_MAXWELEM);
 		fid = p9_client_walk(fid, l, &wnames[i], clone);
 		if (IS_ERR(fid)) {
+			if (old_fid) {
+				/*
+				 * If we fail, clunk fid which are mapping
+				 * to path component and not the last component
+				 * of the path.
+				 */
+				p9_client_clunk(old_fid);
+			}
 			kfree(wnames);
 			return fid;
 		}
-
+		old_fid = fid;
 		i += l;
 		clone = 0;
 	}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6c7f6a251115..f8b86e92cd66 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
@@ -237,11 +238,18 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+	if (rc) {
+		__putname(v9ses->aname);
+		__putname(v9ses->uname);
+		return ERR_PTR(rc);
+	}
+
 	spin_lock(&v9fs_sessionlist_lock);
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER;
+	v9ses->flags = V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
@@ -262,8 +270,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		goto error;
 	}
 
-	if (!p9_is_proto_dotu(v9ses->clnt))
-		v9ses->flags &= ~V9FS_PROTO_2000U;
+	if (p9_is_proto_dotl(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000L;
+	else if (p9_is_proto_dotu(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000U;
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
@@ -298,6 +308,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	return fid;
 
 error:
+	bdi_destroy(&v9ses->bdi);
 	return ERR_PTR(retval);
 }
 
@@ -323,6 +334,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
 
+	bdi_destroy(&v9ses->bdi);
+
 	spin_lock(&v9fs_sessionlist_lock);
 	list_del(&v9ses->slist);
 	spin_unlock(&v9fs_sessionlist_lock);
@@ -340,6 +353,19 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 	p9_client_disconnect(v9ses->clnt);
 }
 
+/**
+ * v9fs_session_begin_cancel - Begin terminate of a session
+ * @v9ses: session to terminate
+ *
+ * After this call we don't allow any request other than clunk.
+ */
+
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
+{
+	P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+	p9_client_begin_disconnect(v9ses->clnt);
+}
+
 extern int v9fs_error_init(void);
 
 static struct kobject *v9fs_kobj;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 79000bf62491..bec4d0bcb458 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,11 +20,12 @@
  * Boston, MA  02111-1301  USA
  *
  */
+#include <linux/backing-dev.h>
 
 /**
  * enum p9_session_flags - option flags for each 9P session
  * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
- * @V9FS_PROTO_2010L: whether or not to use 9P2010.l extensions
+ * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
  * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
  * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
  * @V9FS_ACCESS_ANY: use a single attach for all users
@@ -34,7 +35,7 @@
  */
 enum p9_session_flags {
 	V9FS_PROTO_2000U	= 0x01,
-	V9FS_PROTO_2010L	= 0x02,
+	V9FS_PROTO_2000L	= 0x02,
 	V9FS_ACCESS_SINGLE	= 0x04,
 	V9FS_ACCESS_USER	= 0x08,
 	V9FS_ACCESS_ANY		= 0x0C,
@@ -102,12 +103,14 @@ struct v9fs_session_info {
 	u32 uid;		/* if ACCESS_SINGLE, the uid that has access */
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist;	/* list of sessions registered with v9fs */
+	struct backing_dev_info bdi;
 };
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 							char *);
 void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
 
 #define V9FS_MAGIC 0x01021997
 
@@ -130,5 +133,5 @@ static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
 
 static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 {
-	return v9ses->flags & V9FS_PROTO_2010L;
+	return v9ses->flags & V9FS_PROTO_2000L;
 }
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d74325295b1e..cbf4e50f3933 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 6580aa449541..0adfd64dfcee 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -32,6 +32,7 @@
 #include <linux/sched.h>
 #include <linux/inet.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -76,6 +77,15 @@ static inline int dt_type(struct p9_wstat *mistat)
 	return rettype;
 }
 
+static void p9stat_init(struct p9_wstat *stbuf)
+{
+	stbuf->name  = NULL;
+	stbuf->uid   = NULL;
+	stbuf->gid   = NULL;
+	stbuf->muid  = NULL;
+	stbuf->extension = NULL;
+}
+
 /**
  * v9fs_dir_readdir - read a directory
  * @filp: opened file structure
@@ -121,6 +131,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	rdir = (struct p9_rdir *) fid->rdir;
 
 	err = mutex_lock_interruptible(&rdir->mutex);
+	if (err)
+		return err;
 	while (err == 0) {
 		if (rdir->tail == rdir->head) {
 			err = v9fs_file_readn(filp, rdir->buf, NULL,
@@ -131,8 +143,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			rdir->head = 0;
 			rdir->tail = err;
 		}
-
 		while (rdir->head < rdir->tail) {
+			p9stat_init(&st);
 			err = p9stat_read(rdir->buf + rdir->head,
 					buflen - rdir->head, &st,
 					fid->clnt->proto_version);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 36122683fae8..df52d488d2a6 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -114,7 +114,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
 	/* No mandatory locks */
-	if (__mandatory_lock(inode))
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
@@ -215,7 +215,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	struct p9_fid *fid;
 	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	int origin = *offset;
+	loff_t origin = *offset;
 	unsigned long pg_start, pg_end;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5fe45d692c9f..f2434fc9d2c4 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -431,6 +432,7 @@ error:
 
 static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
+	int retval;
 	struct inode *file_inode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *v9fid;
@@ -444,7 +446,10 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 	if (IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
 
-	return p9_client_remove(v9fid);
+	retval = p9_client_remove(v9fid);
+	if (!retval)
+		drop_nlink(file_inode);
+	return retval;
 }
 
 static int
@@ -656,6 +661,9 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
 		dir, dentry->d_name.name, dentry, nameidata);
 
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
 	dfid = v9fs_fid_lookup(dentry->d_parent);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 69357c0d9899..806da5d3b3a0 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,7 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -76,6 +77,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	sb->s_blocksize = 1 << sb->s_blocksize_bits;
 	sb->s_magic = V9FS_MAGIC;
 	sb->s_op = &v9fs_super_ops;
+	sb->s_bdi = &v9ses->bdi;
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 	    MS_NOATIME;
@@ -193,6 +195,7 @@ static void v9fs_kill_super(struct super_block *s)
 
 	kill_anon_super(s);
 
+	v9fs_session_cancel(v9ses);
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
 	s->s_fs_info = NULL;
@@ -205,7 +208,7 @@ v9fs_umount_begin(struct super_block *sb)
 	struct v9fs_session_info *v9ses;
 
 	v9ses = sb->s_fs_info;
-	v9fs_session_cancel(v9ses);
+	v9fs_session_begin_cancel(v9ses);
 }
 
 static const struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index 7405f071be67..5f85b5947613 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config NFS_COMMON
 
 source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index c3633aa46911..97f340f14ba2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
 obj-$(CONFIG_EXOFS_FS)		+= exofs/
+obj-$(CONFIG_CEPH_FS)		+= ceph/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 6910a98bd73c..4a3af7075c1d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/parser.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/statfs.h>
 #include "adfs.h"
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index dc5ef14bdc1c..3e262711ae06 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -7,6 +7,7 @@
  * block allocation, deallocation, calculation of free space.
  */
 
+#include <linux/slab.h>
 #include "affs.h"
 
 /* This is, of course, shamelessly stolen from fs/minix */
@@ -128,7 +129,7 @@ err_range:
 /*
  * Allocate a block in the given allocation zone.
  * Since we have to byte-swap the bitmap on little-endian
- * machines, this is rather expensive. Therefor we will
+ * machines, this is rather expensive. Therefore we will
  * preallocate up to 16 blocks from the same word, if
  * possible. We are not doing preallocations in the
  * header zone, though.
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index c9744d771d98..f4b2a4ee4f91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -10,6 +10,7 @@
  * (C) 1991  Linus Torvalds - minix filesystem
  */
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include "affs.h"
 
 extern const struct inode_operations affs_symlink_inode_operations;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d41e9673cd97..16a3e4765f68 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index e2b1d3f16519..0fb315dd4d2a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/slab.h>
 #include <linux/sched.h>
 #include "internal.h"
 
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index eb765489164f..a3bcec75c54a 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/ip.h>
 #include "internal.h"
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 88067f36e5e7..adc1cb771b57 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 39b301662f22..0df9bc2b724d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -12,10 +12,10 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 023b95b0d9d7..4bd0218473a9 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/circ_buf.h>
 #include "internal.h"
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c048f0658751..d00b312e3110 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -16,7 +16,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/sched.h>
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index c54dad4e6063..a10f2582844f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -19,6 +19,7 @@
 #include <linux/workqueue.h>
 #include <linux/sched.h>
 #include <linux/fscache.h>
+#include <linux/backing-dev.h>
 
 #include "afs.h"
 #include "afs_vl.h"
@@ -313,6 +314,7 @@ struct afs_volume {
 	unsigned short		rjservers;	/* number of servers discarded due to -ENOMEDIUM */
 	struct afs_server	*servers[8];	/* servers on which volume resides (ordered) */
 	struct rw_semaphore	server_sem;	/* lock for accessing current server */
+	struct backing_dev_info	bdi;
 };
 
 /*
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5ffb570cd3a8..b3feddc4f7d6 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -12,11 +12,11 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 
@@ -138,9 +138,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 {
 	struct afs_super_info *super;
 	struct vfsmount *mnt;
-	struct page *page = NULL;
+	struct page *page;
 	size_t size;
-	char *buf, *devname = NULL, *options = NULL;
+	char *buf, *devname, *options;
 	int ret;
 
 	_enter("{%s}", mntpt->d_name.name);
@@ -150,22 +150,22 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	ret = -EINVAL;
 	size = mntpt->d_inode->i_size;
 	if (size > PAGE_SIZE - 1)
-		goto error;
+		goto error_no_devname;
 
 	ret = -ENOMEM;
 	devname = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!devname)
-		goto error;
+		goto error_no_devname;
 
 	options = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!options)
-		goto error;
+		goto error_no_options;
 
 	/* read the contents of the AFS special symlink */
 	page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
-		goto error;
+		goto error_no_page;
 	}
 
 	ret = -EIO;
@@ -196,12 +196,12 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	return mnt;
 
 error:
-	if (page)
-		page_cache_release(page);
-	if (devname)
-		free_page((unsigned long) devname);
-	if (options)
-		free_page((unsigned long) options);
+	page_cache_release(page);
+error_no_page:
+	free_page((unsigned long) options);
+error_no_options:
+	free_page((unsigned long) devname);
+error_no_devname:
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bde3f19c0995..67cf810e0fd6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/slab.h>
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include <rxrpc/packet.h>
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
 	if (!permits)
 		goto out_unlock;
 
-	memcpy(permits->permits, xpermits->permits,
-	       count * sizeof(struct afs_permit));
+	if (xpermits)
+		memcpy(permits->permits, xpermits->permits,
+		       count * sizeof(struct afs_permit));
 
 	_debug("key %x access %x",
 	       key_serial(key), vnode->status.caller_access);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 14f6431598ad..e932e5a3a0c1 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -311,6 +311,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
 	sb->s_magic		= AFS_FS_MAGIC;
 	sb->s_op		= &afs_super_ops;
 	sb->s_fs_info		= as;
+	sb->s_bdi		= &as->volume->bdi;
 
 	/* allocate the root inode and dentry */
 	fid.vid		= as->volume->vid;
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 36c1306e09e0..340afd0cd182 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 6e689208def2..9ac260d1361d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 2f05c4fc2a70..25cf4c3f4ff7 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index a353e69e2391..401eeb21869f 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,6 +106,10 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	volume->cell		= params->cell;
 	volume->vid		= vlocation->vldb.vid[params->type];
 
+	ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
+	if (ret)
+		goto error_bdi;
+
 	init_rwsem(&volume->server_sem);
 
 	/* look up all the applicable server records */
@@ -151,6 +155,8 @@ error:
 	return ERR_PTR(ret);
 
 error_discard:
+	bdi_destroy(&volume->bdi);
+error_bdi:
 	up_write(&params->cell->vl_sem);
 
 	for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -200,6 +206,7 @@ void afs_put_volume(struct afs_volume *volume)
 	for (loop = volume->nservers - 1; loop >= 0; loop--)
 		afs_put_server(volume->servers[loop]);
 
+	bdi_destroy(&volume->bdi);
 	kfree(volume);
 
 	_leave(" [destroyed]");
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9f0bf13291e5..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -12,7 +12,6 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -209,6 +208,7 @@ static struct inode *anon_inode_mkinode(void)
 	inode->i_mode = S_IRUSR | S_IWUSR;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
+	inode->i_flags |= S_PRIVATE;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	return inode;
 }
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 4a1401cea0a1..8713c7cfbc79 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -13,6 +13,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/smp_lock.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index c8a80dffb455..d29b7f6df862 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -22,6 +22,7 @@
 #include <linux/magic.h>
 #include <linux/dcache.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include "autofs_i.h"
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index a015b49891df..e8e5e63ac950 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -15,6 +15,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include "autofs_i.h"
@@ -176,8 +177,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
 		}
 		/* Trigger mount for path component or follow link */
 	} else if (ino->flags & AUTOFS_INF_PENDING ||
-			autofs4_need_mount(flags) ||
-			current->link_count) {
+			autofs4_need_mount(flags)) {
 		DPRINTK("waiting for mount name=%.*s",
 			dentry->d_name.len, dentry->d_name.name);
 
@@ -261,7 +261,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	spin_unlock(&dcache_lock);
 	spin_unlock(&sbi->fs_lock);
 
-	status = try_to_fill_dentry(dentry, 0);
+	status = try_to_fill_dentry(dentry, nd->flags);
 	if (status)
 		goto out_error;
 
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index e3287d0d1a58..59096b5e0fc7 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -11,7 +11,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/string.h>
 
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 15d80bb35d6f..f96eff04e11a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -20,11 +20,11 @@
 #include <linux/fcntl.h>
 #include <linux/ptrace.h>
 #include <linux/user.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/personality.h>
 #include <linux/init.h>
 #include <linux/coredump.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -75,14 +75,16 @@ static int aout_core_dump(struct coredump_params *cprm)
 	struct file *file = cprm->file;
 	mm_segment_t fs;
 	int has_dumped = 0;
-	unsigned long dump_start, dump_size;
+	void __user *dump_start;
+	int dump_size;
 	struct user dump;
 #ifdef __alpha__
-#       define START_DATA(u)	(u.start_data)
+#       define START_DATA(u)	((void __user *)u.start_data)
 #else
-#	define START_DATA(u)	((u.u_tsize << PAGE_SHIFT) + u.start_code)
+#	define START_DATA(u)	((void __user *)((u.u_tsize << PAGE_SHIFT) + \
+				 u.start_code))
 #endif
-# define START_STACK(u)   (u.start_stack)
+# define START_STACK(u)   ((void __user *)u.start_stack)
 
 	fs = get_fs();
 	set_fs(KERNEL_DS);
@@ -104,9 +106,9 @@ static int aout_core_dump(struct coredump_params *cprm)
 
 /* make sure we actually have a data and stack area to dump */
 	set_fs(USER_DS);
-	if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
 		dump.u_dsize = 0;
-	if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
 		dump.u_ssize = 0;
 
 	set_fs(KERNEL_DS);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 6d6a16c5e9bb..2c5f9a0e5d72 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1005,15 +1005,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 			}
 		} else if (!mm->start_data) {
 			mm->start_data = seg->addr;
-#ifndef CONFIG_MMU
 			mm->end_data = seg->addr + phdr->p_memsz;
-#endif
 		}
-
-#ifdef CONFIG_MMU
-		if (seg->addr + phdr->p_memsz > mm->end_data)
-			mm->end_data = seg->addr + phdr->p_memsz;
-#endif
 	}
 
 	seg++;
@@ -1374,7 +1367,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
 
 /*
  * fill up all the fields in prstatus from the given task struct, except
- * registers which need to be filled up seperately.
+ * registers which need to be filled up separately.
  */
 static void fill_prstatus(struct elf_prstatus *prstatus,
 			  struct task_struct *p, long signr)
@@ -1590,7 +1583,7 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
 	struct vm_area_struct *vma;
 	size_t size = 0;
 
-	for (vma = current->mm->mmap; vma; vma->vm_next)
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next)
 		if (maydump(vma, mm_flags))
 			size += vma->vm_end - vma->vm_start;
 	return size;
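
Note: the elf_core_vma_data_size() change is a genuine bug fix. The old third for-clause read "vma->vm_next", which evaluates the pointer but never assigns it, so the walk never advanced. A runnable model of the corrected iteration, with toy types standing in for the kernel's:

	#include <stdio.h>
	#include <stddef.h>

	struct vma {
		size_t vm_start, vm_end;
		struct vma *vm_next;
	};

	static size_t vma_data_size(const struct vma *mmap)
	{
		const struct vma *vma;
		size_t size = 0;

		/* fixed form: the third clause assigns, so the cursor moves */
		for (vma = mmap; vma; vma = vma->vm_next)
			size += vma->vm_end - vma->vm_start;
		return size;
	}

	int main(void)
	{
		struct vma c = { 0x3000, 0x4000, NULL };
		struct vma b = { 0x1000, 0x3000, &c };
		struct vma a = { 0x0000, 0x1000, &b };

		printf("%zu\n", vma_data_size(&a));	/* prints 16384 */
		return 0;
	}
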
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 32fb00b52cd0..b8e8b0acf9bd 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -11,7 +11,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/elf.h>
 #include <linux/init.h>
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e0e769bdca59..49566c1687d8 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -355,7 +355,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
 
 	if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
 		printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
-		       (int) r,(int)(start_brk-start_code),(int)text_len);
+		       (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
 		goto failed;
 	}
 
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 08343505e184..aca9d55afb22 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -8,7 +8,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/init.h>
 #include <linux/file.h>
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index a16f29e888cd..612a5c38d3c1 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -24,6 +24,7 @@
 #include <linux/mempool.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
+#include <linux/slab.h>
 
 struct integrity_slab {
 	struct kmem_cache *slab;
diff --git a/fs/bio.c b/fs/bio.c
index dc17afd672e3..e7bf6ca64dcf 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -264,13 +264,12 @@ EXPORT_SYMBOL(bio_init);
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask: the GFP_ mask given to the slab allocator
  * @nr_iovecs: number of iovecs to pre-allocate
- * @bs:	the bio_set to allocate from. If %NULL, just use kmalloc
+ * @bs:	the bio_set to allocate from.
  *
  * Description:
- *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
+ *   bio_alloc_bioset will try its own mempool to satisfy the allocation.
  *   If %__GFP_WAIT is set then we will block on the internal pool waiting
- *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
- *   fall back to just using @kmalloc to allocate the required memory.
+ *   for a &struct bio to become free.
  *
  *   Note that the caller must set ->bi_destructor on successful return
  *   of a bio, to do the appropriate freeing of the bio once the reference
@@ -555,7 +554,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		.bi_rw = bio->bi_rw,
 	};
 
-	if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
 		prev->bv_len -= len;
 		return 0;
 	}
@@ -608,7 +607,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 * merge_bvec_fn() returns number of bytes it can accept
 		 * at this offset
 		 */
-		if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
+		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
 			bvec->bv_page = NULL;
 			bvec->bv_len = 0;
 			bvec->bv_offset = 0;
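
Note: both __bio_add_page() hunks tighten the contract with the driver's merge_bvec_fn(): once the tail biovec has been speculatively grown, the callback must accept the whole resulting segment, so its return value is now compared with the bvec's full bv_len rather than only the bytes being added. A compilable sketch of the corrected check, with simplified types and the queue plumbing omitted:

	struct bio_vec_lite {
		unsigned int bv_len;
	};

	/* 'accepted' is what merge_bvec_fn() said it can take at this offset;
	 * 'len' is how many bytes were speculatively added to prev->bv_len. */
	static int keep_merged_segment(unsigned int accepted,
				       struct bio_vec_lite *prev,
				       unsigned int len)
	{
		if (accepted < prev->bv_len) {	/* was: accepted < len */
			prev->bv_len -= len;	/* roll back the grow */
			return 0;
		}
		return 1;
	}

Comparing against len alone could keep a merge that the driver had only partially accepted.
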
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d11d0289f3d2..6dcee88c2e5d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -404,20 +404,28 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
  * NULL first argument is nfsd_sync_dir() and that's not a directory.
  */
 
-static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
 {
-	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+	struct inode *bd_inode = filp->f_mapping->host;
+	struct block_device *bdev = I_BDEV(bd_inode);
 	int error;
 
-	error = sync_blockdev(bdev);
-	if (error)
-		return error;
-
+	/*
+	 * There is no need to serialise calls to blkdev_issue_flush with
+	 * i_mutex and doing so causes performance issues with concurrent
+	 * O_SYNC writers to a block device.
+	 */
+	mutex_unlock(&bd_inode->i_mutex);
+
 	error = blkdev_issue_flush(bdev, NULL);
 	if (error == -EOPNOTSUPP)
 		error = 0;
+
+	mutex_lock(&bd_inode->i_mutex);
+
 	return error;
 }
+EXPORT_SYMBOL(blkdev_fsync);
 
 /*
  * pseudo-fs
@@ -1481,7 +1489,7 @@ const struct file_operations def_blk_fops = {
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= blkdev_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= block_fsync,
+	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
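
Note: blkdev_fsync() is entered with i_mutex held by the VFS, and the comment added in the hunk records why it is safe to drop it around the potentially slow flush. The shape of that gap, as a user-space pthread stand-in (illustrative only, not the kernel locking API):

	#include <pthread.h>

	static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;

	static int issue_flush(void)
	{
		return 0;	/* stands in for blkdev_issue_flush() */
	}

	/* called with i_mutex held; must return with it held */
	static int fsync_like(void)
	{
		int error;

		pthread_mutex_unlock(&i_mutex);	/* concurrent O_SYNC writers proceed */
		error = issue_flush();
		pthread_mutex_lock(&i_mutex);	/* restore the caller's invariant */
		return error;
	}

The unlock/lock pair brackets only the flush, so the function still returns in exactly the locking state its caller expects.
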
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6df6d6ed74fd..6ef7b26724ec 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include "ctree.h"
 #include "btrfs_inode.h"
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..462859a30141 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3f1f50d9d916..7a4dee199832 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -153,6 +153,11 @@ struct btrfs_inode {
 	unsigned ordered_data_close:1;
 	unsigned dummy_inode:1;
 
+	/*
+	 * always compress this one file
+	 */
+	unsigned force_compress:1;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a11a32058b50..396039b3a8a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,7 +31,7 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
-#include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -445,7 +445,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	unsigned long nr_pages = 0;
 	struct extent_map *em;
 	struct address_space *mapping = inode->i_mapping;
-	struct pagevec pvec;
 	struct extent_map_tree *em_tree;
 	struct extent_io_tree *tree;
 	u64 end;
@@ -461,7 +460,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 
 	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
 
-	pagevec_init(&pvec, 0);
 	while (last_offset < compressed_end) {
 		page_index = last_offset >> PAGE_CACHE_SHIFT;
 
@@ -478,26 +476,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			goto next;
 		}
 
-		page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+		page = __page_cache_alloc(mapping_gfp_mask(mapping) &
+								~__GFP_FS);
 		if (!page)
 			break;
 
-		page->index = page_index;
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (add_to_page_cache(page, mapping,
-				      page->index, GFP_NOFS)) {
+		if (add_to_page_cache_lru(page, mapping, page_index,
+					  GFP_NOFS)) {
 			page_cache_release(page);
 			goto next;
 		}
 
-		/* open coding of lru_cache_add, also not exported */
-		page_cache_get(page);
-		if (!pagevec_add(&pvec, page))
-			__pagevec_lru_add_file(&pvec);
-
 		end = last_offset + PAGE_CACHE_SIZE - 1;
 		/*
 		 * at this point, we have a locked page in the page cache
@@ -551,8 +540,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 next:
 		last_offset += PAGE_CACHE_SIZE;
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	return 0;
 }
 
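Note: both blocks deleted from this file existed only because add_to_page_cache_lru() was not exported to modules; once it is, the readahead insertion collapses to the calls the diff keeps. The resulting shape, reproduced from the hunk above for reading (kernel context, not stand-alone code):

	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (!page)
		break;

	/* one exported helper now does the cache insert plus the LRU add */
	if (add_to_page_cache_lru(page, mapping, page_index, GFP_NOFS)) {
		page_cache_release(page);	/* already cached or racing; skip */
		goto next;
	}
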
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..6795a713b205 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -3040,6 +3041,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 	if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
 		goto err;
 
+	/* the leaf has changed, it now has room.  return now */
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
+		goto err;
+
 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8b5cfdd4bfc1..746a7248678e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
@@ -373,11 +374,13 @@ struct btrfs_super_block {
  * ones specified below then we will fail to mount
  */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(2ULL << 0)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP		\
-	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
+	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |	\
+	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
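
Note on the new flag's value: 2ULL << 0 is simply 2, the same bit as 1ULL << 1, i.e. the next free incompat bit after MIXED_BACKREF. A runnable sketch of how a support mask like this gates mounting (a simplified stand-in for the real superblock check):

	#include <stdio.h>

	#define INCOMPAT_MIXED_BACKREF	(1ULL << 0)
	#define INCOMPAT_DEFAULT_SUBVOL	(2ULL << 0)	/* == 1ULL << 1 */
	#define INCOMPAT_SUPP \
		(INCOMPAT_MIXED_BACKREF | INCOMPAT_DEFAULT_SUBVOL)

	int main(void)
	{
		/* pretend the on-disk superblock carries an unknown bit 5 */
		unsigned long long ondisk = INCOMPAT_MIXED_BACKREF | (1ULL << 5);

		if (ondisk & ~INCOMPAT_SUPP)
			puts("unknown incompat feature, refusing to mount");
		else
			puts("mount allowed");
		return 0;
	}
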
@@ -832,7 +835,6 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
 	unsigned long mount_opt;
-	u64 max_extent;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
@@ -1182,7 +1184,6 @@ struct btrfs_root {
 #define BTRFS_INODE_NOATIME		(1 << 9)
 #define BTRFS_INODE_DIRSYNC		(1 << 10)
 
-
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -1842,7 +1843,7 @@ BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
 			 compat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
-			 compat_flags, 64);
+			 compat_ro_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
 			 incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
@@ -2310,7 +2311,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2335,7 +2337,7 @@ int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root);
+			 struct btrfs_root *root, int *was_new);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2386,7 +2388,6 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 
 /* super.c */
-u64 btrfs_parse_size(char *str);
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..902ce507c4e3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/sort.h>
 #include "ctree.h"
 #include "delayed-ref.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2b59201b955c..feca04197d02 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/crc32c.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -43,8 +44,6 @@ static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 
-static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
-
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
  * complete. This is used during reads to verify checksums, and it is used
@@ -263,13 +262,15 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 static int verify_parent_transid(struct extent_io_tree *io_tree,
 				 struct extent_buffer *eb, u64 parent_transid)
 {
+	struct extent_state *cached_state = NULL;
 	int ret;
 
 	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
 		return 0;
 
-	lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
-	if (extent_buffer_uptodate(io_tree, eb) &&
+	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
+			 0, &cached_state, GFP_NOFS);
+	if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
 	    btrfs_header_generation(eb) == parent_transid) {
 		ret = 0;
 		goto out;
@@ -282,10 +283,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		       (unsigned long long)btrfs_header_generation(eb));
 	}
 	ret = 1;
-	clear_extent_buffer_uptodate(io_tree, eb);
+	clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
 out:
-	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
291 292
@@ -901,7 +902,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
901 root->highest_objectid = 0; 902 root->highest_objectid = 0;
902 root->name = NULL; 903 root->name = NULL;
903 root->in_sysfs = 0; 904 root->in_sysfs = 0;
904 root->inode_tree.rb_node = NULL; 905 root->inode_tree = RB_ROOT;
905 906
906 INIT_LIST_HEAD(&root->dirty_list); 907 INIT_LIST_HEAD(&root->dirty_list);
907 INIT_LIST_HEAD(&root->orphan_list); 908 INIT_LIST_HEAD(&root->orphan_list);
@@ -1372,19 +1373,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
-	bdi->name = "btrfs";
 	bdi->capabilities = BDI_CAP_MAP_COPY;
-	err = bdi_init(bdi);
+	err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
 	if (err)
 		return err;
 
-	err = bdi_register(bdi, NULL, "btrfs-%d",
-			   atomic_inc_return(&btrfs_bdi_num));
-	if (err) {
-		bdi_destroy(bdi);
-		return err;
-	}
-
 	bdi->ra_pages	= default_backing_dev_info.ra_pages;
 	bdi->unplug_io_fn	= btrfs_unplug_io_fn;
 	bdi->unplug_io_data	= info;
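
Note: this hunk and the btrfs_bdi_num removal earlier in the file are one cleanup. bdi_setup_and_register(), introduced around this kernel release, wraps bdi_init() and bdi_register() and generates the unique "btrfs-N" name itself, leaving a single error path. The resulting function, assembled from the diff with comments added:

	static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
	{
		int err;

		bdi->capabilities = BDI_CAP_MAP_COPY;
		/* init + register + unique naming in one call; the helper
		 * tears itself down on failure, so no bdi_destroy() here */
		err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
		if (err)
			return err;

		bdi->ra_pages = default_backing_dev_info.ra_pages;
		bdi->unplug_io_fn = btrfs_unplug_io_fn;
		bdi->unplug_io_data = info;
		return 0;
	}
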
@@ -1632,7 +1625,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	fs_info->sb = sb;
-	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
 
@@ -1673,7 +1665,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	insert_inode_hash(fs_info->btree_inode);
 
 	spin_lock_init(&fs_info->block_group_cache_lock);
-	fs_info->block_group_cache_tree.rb_node = NULL;
+	fs_info->block_group_cache_tree = RB_ROOT;
 
 	extent_io_tree_init(&fs_info->freed_extents[0],
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -1920,7 +1912,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	csum_root->track_dirty = 1;
 
-	btrfs_read_block_groups(extent_root);
+	ret = btrfs_read_block_groups(extent_root);
+	if (ret) {
+		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
+		goto fail_block_groups;
+	}
 
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
@@ -1930,7 +1926,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (IS_ERR(fs_info->cleaner_kthread))
-		goto fail_csum_root;
+		goto fail_block_groups;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
@@ -2018,7 +2014,8 @@ fail_cleaner:
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
-fail_csum_root:
+fail_block_groups:
+	btrfs_free_block_groups(fs_info);
 	free_extent_buffer(csum_root->node);
 	free_extent_buffer(csum_root->commit_root);
 fail_dev_root:
@@ -2497,7 +2494,8 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 	int ret;
 	struct inode *btree_inode = buf->first_page->mapping->host;
 
-	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
+				     NULL);
 	if (!ret)
 		return ret;
 
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ba5c3fd5ab8c..951ef09b82f4 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -95,7 +95,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	inode = btrfs_iget(sb, &key, root);
+	inode = btrfs_iget(sb, &key, root, NULL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto fail;
@@ -223,7 +223,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
 	if (!IS_ERR(dentry))
 		dentry->d_op = &btrfs_dentry_operations;
 	return dentry;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 559f72489b3b..b34d32fdaaec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -2676,6 +2677,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 	INIT_LIST_HEAD(&found->block_groups);
 	init_rwsem(&found->groups_sem);
+	init_waitqueue_head(&found->flush_wait);
+	init_waitqueue_head(&found->allocate_wait);
 	spin_lock_init(&found->lock);
 	found->flags = flags;
 	found->total_bytes = total_bytes;
@@ -2846,7 +2849,7 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
 	}
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
-	BTRFS_I(inode)->reserved_extents--;
+	BTRFS_I(inode)->reserved_extents -= num_items;
 	BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
 
 	if (meta_sinfo->bytes_delalloc < num_bytes) {
@@ -2944,12 +2947,10 @@ static void flush_delalloc(struct btrfs_root *root,
 
 	spin_lock(&info->lock);
 
-	if (!info->flushing) {
+	if (!info->flushing)
 		info->flushing = 1;
-		init_waitqueue_head(&info->flush_wait);
-	} else {
+	else
 		wait = true;
-	}
 
 	spin_unlock(&info->lock);
 
@@ -3011,7 +3012,6 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
 	if (!info->allocating_chunk) {
 		info->force_alloc = 1;
 		info->allocating_chunk = 1;
-		init_waitqueue_head(&info->allocate_wait);
 	} else {
 		wait = true;
 	}
@@ -3111,7 +3111,7 @@ again:
 		return -ENOSPC;
 	}
 
-	BTRFS_I(inode)->reserved_extents++;
+	BTRFS_I(inode)->reserved_extents += num_items;
 	check_force_delalloc(meta_sinfo);
 	spin_unlock(&meta_sinfo->lock);
 
@@ -3235,7 +3235,8 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 			       u64 bytes)
 {
 	struct btrfs_space_info *data_sinfo;
-	int ret = 0, committed = 0;
+	u64 used;
+	int ret = 0, committed = 0, flushed = 0;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3247,12 +3248,21 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 again:
 	/* make sure we have enough space to handle the data first */
 	spin_lock(&data_sinfo->lock);
-	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
-	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
-	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
-	    data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
+	used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
+		data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
+		data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
+		data_sinfo->bytes_super;
+
+	if (used + bytes > data_sinfo->total_bytes) {
 		struct btrfs_trans_handle *trans;
 
+		if (!flushed) {
+			spin_unlock(&data_sinfo->lock);
+			flush_delalloc(root, data_sinfo);
+			flushed = 1;
+			goto again;
+		}
+
 		/*
 		 * if we don't have enough free bytes in this space then we need
 		 * to alloc a new chunk.
@@ -4170,6 +4180,10 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	ins->offset = 0;
 
 	space_info = __find_space_info(root->fs_info, data);
+	if (!space_info) {
+		printk(KERN_ERR "No space info for %d\n", data);
+		return -ENOSPC;
+	}
 
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
@@ -5205,6 +5219,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	next = btrfs_find_tree_block(root, bytenr, blocksize);
 	if (!next) {
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+		if (!next)
+			return -ENOMEM;
 		reada = 1;
 	}
 	btrfs_tree_lock(next);
@@ -5417,7 +5433,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (ret > 0) {
 			path->slots[level]++;
 			continue;
-		}
+		} else if (ret < 0)
+			return ret;
 		level = wc->level;
 	}
 	return 0;
@@ -6561,6 +6578,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 	struct btrfs_key key;
 	struct inode *inode = NULL;
 	struct btrfs_file_extent_item *fi;
+	struct extent_state *cached_state = NULL;
 	u64 num_bytes;
 	u64 skip_objectid = 0;
 	u32 nritems;
@@ -6589,12 +6607,14 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 		}
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
 
-		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			    key.offset + num_bytes - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
+				 key.offset + num_bytes - 1, 0, &cached_state,
+				 GFP_NOFS);
 		btrfs_drop_extent_cache(inode, key.offset,
 					key.offset + num_bytes - 1, 1);
-		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			      key.offset + num_bytes - 1, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
+				     key.offset + num_bytes - 1, &cached_state,
+				     GFP_NOFS);
 		cond_resched();
 	}
 	iput(inode);
@@ -7366,7 +7386,6 @@ static int find_first_block_group(struct btrfs_root *root,
 		}
 		path->slots[0]++;
 	}
-	ret = -ENOENT;
 out:
 	return ret;
 }
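
Note: the btrfs_check_data_free_space() hunk above does two things: it folds the hard-to-read subtraction chain into an explicit "used" sum, and it retries exactly once after flushing delalloc before falling back to chunk allocation or a commit. A stand-alone model of that control flow (toy accounting with made-up numbers):

	#include <stdio.h>

	struct space_info {
		unsigned long long total_bytes;
		unsigned long long used;	/* sum of all consumers */
	};

	static void flush_delalloc(struct space_info *s)
	{
		s->used -= 100;	/* toy: pretend a flush frees 100 bytes */
	}

	static int check_data_free_space(struct space_info *s,
					 unsigned long long bytes)
	{
		int flushed = 0;
	again:
		if (s->used + bytes > s->total_bytes) {
			if (!flushed) {
				flush_delalloc(s);
				flushed = 1;
				goto again;	/* re-sample after the flush */
			}
			return -1;	/* would try chunk alloc, then ENOSPC */
		}
		s->used += bytes;
		return 0;
	}

	int main(void)
	{
		struct space_info s = { 1000, 950 };

		printf("%d\n", check_data_free_space(&s, 100)); /* 0: one flush */
		return 0;
	}
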
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b177ed319612..d2d03684fab2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
 #include <linux/slab.h>
 #include <linux/bio.h>
 #include <linux/mm.h>
-#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/page-flags.h>
 #include <linux/module.h>
@@ -104,8 +103,8 @@ void extent_io_exit(void)
 void extent_io_tree_init(struct extent_io_tree *tree,
 			  struct address_space *mapping, gfp_t mask)
 {
-	tree->state.rb_node = NULL;
-	tree->buffer.rb_node = NULL;
+	tree->state = RB_ROOT;
+	tree->buffer = RB_ROOT;
 	tree->ops = NULL;
 	tree->dirty_bytes = 0;
 	spin_lock_init(&tree->lock);
@@ -513,7 +512,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	u64 last_end;
 	int err;
 	int set = 0;
+	int clear = 0;
 
+	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+		clear = 1;
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
 		prealloc = alloc_extent_state(mask);
@@ -524,14 +526,20 @@ again:
 	spin_lock(&tree->lock);
 	if (cached_state) {
 		cached = *cached_state;
-		*cached_state = NULL;
-		cached_state = NULL;
+
+		if (clear) {
+			*cached_state = NULL;
+			cached_state = NULL;
+		}
+
 		if (cached && cached->tree && cached->start == start) {
-			atomic_dec(&cached->refs);
+			if (clear)
+				atomic_dec(&cached->refs);
 			state = cached;
 			goto hit_next;
 		}
-		free_extent_state(cached);
+		if (clear)
+			free_extent_state(cached);
 	}
 	/*
 	 * this search will find the extents that end after
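
Note: the subtlety in this hunk is that the caller's cached extent_state is surrendered (pointer NULLed, reference dropped) only when the bits being cleared include EXTENT_IOBITS or EXTENT_BOUNDARY, since clearing those can free the state outright; for any other bits the cached pointer stays valid for the caller's next call. In outline, condensed from the hunk above:

	int clear = 0;

	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
	...
	if (clear) {
		*cached_state = NULL;	/* caller must not reuse it */
		cached_state = NULL;
	}
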
@@ -946,11 +954,11 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-			gfp_t mask)
+			struct extent_state **cached_state, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
 			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
-			      0, NULL, NULL, mask);
+			      0, NULL, cached_state, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -984,10 +992,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
-				 u64 end, gfp_t mask)
+				 u64 end, struct extent_state **cached_state,
+				 gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
-				NULL, mask);
+				cached_state, mask);
 }
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1171,7 +1180,8 @@ out:
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
-					u64 *start, u64 *end, u64 max_bytes)
+					u64 *start, u64 *end, u64 max_bytes,
+					struct extent_state **cached_state)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1203,8 +1213,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 			*end = state->end;
 			goto out;
 		}
-		if (!found)
+		if (!found) {
 			*start = state->start;
+			*cached_state = state;
+			atomic_inc(&state->refs);
+		}
 		found++;
 		*end = state->end;
 		cur_start = state->end + 1;
@@ -1336,10 +1349,11 @@ again:
 	delalloc_start = *start;
 	delalloc_end = 0;
 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
-				    max_bytes);
+				    max_bytes, &cached_state);
 	if (!found || delalloc_end <= *start) {
 		*start = delalloc_start;
 		*end = delalloc_end;
+		free_extent_state(cached_state);
 		return found;
 	}
 
@@ -1722,7 +1736,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 	}
 
 	if (!uptodate) {
-		clear_extent_uptodate(tree, start, end, GFP_NOFS);
+		clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
 		ClearPageUptodate(page);
 		SetPageError(page);
 	}
@@ -1750,7 +1764,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 static void end_bio_extent_readpage(struct bio *bio, int err)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
 	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
@@ -1773,7 +1788,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		else
 			whole_page = 0;
 
-		if (--bvec >= bio->bi_io_vec)
+		if (++bvec <= bvec_end)
 			prefetchw(&bvec->bv_page->flags);
 
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
@@ -1818,7 +1833,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		}
 		check_page_locked(tree, page);
 		}
-	} while (bvec >= bio->bi_io_vec);
+	} while (bvec <= bvec_end);
 
 	bio_put(bio);
 }
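
Note: end_bio_extent_readpage() used to walk the completed io_vec backwards from the last element; these hunks flip it to a forward walk between bvec and bvec_end, so pages are handled in submission order. A runnable model of the new cursor movement:

	#include <stdio.h>

	struct bio_vec_toy {
		int page_id;
	};

	int main(void)
	{
		struct bio_vec_toy io_vec[3] = { {0}, {1}, {2} };
		struct bio_vec_toy *bvec = io_vec;	/* was: io_vec + 2 */
		struct bio_vec_toy *bvec_end = io_vec + 2;

		do {
			printf("complete page %d\n", bvec->page_id);
		} while (++bvec <= bvec_end);	/* was: --bvec >= io_vec */
		return 0;
	}
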
@@ -2663,33 +2678,20 @@ int extent_readpages(struct extent_io_tree *tree,
 {
 	struct bio *bio = NULL;
 	unsigned page_idx;
-	struct pagevec pvec;
 	unsigned long bio_flags = 0;
 
-	pagevec_init(&pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_entry(pages->prev, struct page, lru);
 
 		prefetchw(&page->flags);
 		list_del(&page->lru);
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (!add_to_page_cache(page, mapping,
+		if (!add_to_page_cache_lru(page, mapping,
 					page->index, GFP_KERNEL)) {
-
-			/* open coding of lru_cache_add, also not exported */
-			page_cache_get(page);
-			if (!pagevec_add(&pvec, page))
-				__pagevec_lru_add_file(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
 						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
 		submit_one_bio(READ, bio, 0, bio_flags);
@@ -2704,6 +2706,7 @@ int extent_readpages(struct extent_io_tree *tree,
 int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset)
 {
+	struct extent_state *cached_state = NULL;
 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
@@ -2712,12 +2715,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	if (start > end)
 		return 0;
 
-	lock_extent(tree, start, end, GFP_NOFS);
+	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
 	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
 			 EXTENT_DO_ACCOUNTING,
-			 1, 1, NULL, GFP_NOFS);
+			 1, 1, &cached_state, GFP_NOFS);
 	return 0;
 }
 
@@ -2920,16 +2923,17 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 			  get_extent_t *get_extent)
 {
 	struct inode *inode = mapping->host;
+	struct extent_state *cached_state = NULL;
 	u64 start = iblock << inode->i_blkbits;
 	sector_t sector = 0;
 	size_t blksize = (1 << inode->i_blkbits);
 	struct extent_map *em;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		    GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+			 0, &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, start, blksize, 0);
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
+			     start + blksize - 1, &cached_state, GFP_NOFS);
 	if (!em || IS_ERR(em))
 		return 0;
 
@@ -2951,6 +2955,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	u32 flags = 0;
 	u64 disko = 0;
 	struct extent_map *em = NULL;
+	struct extent_state *cached_state = NULL;
 	int end = 0;
 	u64 em_start = 0, em_len = 0;
 	unsigned long emflags;
@@ -2959,8 +2964,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (len == 0)
 		return -EINVAL;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-		    GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
+			 &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, off, max - off, 0);
 	if (!em)
 		goto out;
@@ -3023,8 +3028,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 out_free:
 	free_extent_map(em);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
@@ -3264,7 +3269,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 }
 
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				 struct extent_buffer *eb)
+				 struct extent_buffer *eb,
+				 struct extent_state **cached_state)
 {
 	unsigned long i;
 	struct page *page;
@@ -3274,7 +3280,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			      GFP_NOFS);
+			      cached_state, GFP_NOFS);
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (page)
@@ -3334,7 +3340,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 }
 
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb)
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state)
 {
 	int ret = 0;
 	unsigned long num_pages;
@@ -3346,7 +3353,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			     EXTENT_UPTODATE, 1, NULL);
+			     EXTENT_UPTODATE, 1, cached_state);
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 36de250a7b2b..bbab4813646f 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -163,6 +163,8 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		     int bits, struct extent_state **cached, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+			 struct extent_state **cached, gfp_t mask);
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		    gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
@@ -196,7 +198,7 @@ int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
 				  u64 end, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-			gfp_t mask);
+			struct extent_state **cached_state, gfp_t mask);
 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -281,9 +283,11 @@ int test_extent_buffer_dirty(struct extent_io_tree *tree,
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 			       struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				 struct extent_buffer *eb);
+				 struct extent_buffer *eb,
+				 struct extent_state **cached_state);
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb);
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state);
 int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 		      unsigned long min_len, char **token, char **map,
 		      unsigned long *map_start,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 428fcac45f90..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
 #include <linux/err.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
@@ -35,7 +34,7 @@ void extent_map_exit(void)
  */
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 {
-	tree->map.rb_node = NULL;
+	tree->map = RB_ROOT;
 	rwlock_init(&tree->lock);
 }
 
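Note: "tree->map = RB_ROOT;" is the same substitution made in disk-io.c and free-space-cache.c within this merge. RB_ROOT is the kernel's named initializer for an empty struct rb_root, so the empty-tree representation is spelled in one place instead of being open-coded as ".rb_node = NULL". A minimal mimic of the definition and its use:

	struct rb_node;

	struct rb_root {
		struct rb_node *rb_node;
	};

	#define RB_ROOT	(struct rb_root) { NULL, }

	static void tree_init(struct rb_root *root)
	{
		*root = RB_ROOT;	/* reads as "empty tree", not "NULL field" */
	}
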
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..54a255065aa3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include "ctree.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6ed434ac037f..29ff749ff4ca 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -123,7 +124,8 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	end_of_last_block = start_pos + num_bytes - 1;
-	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+					NULL);
 	if (err)
 		return err;
 
@@ -753,6 +755,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 			 loff_t pos, unsigned long first_index,
 			 unsigned long last_index, size_t write_bytes)
 {
+	struct extent_state *cached_state = NULL;
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = fdentry(file)->d_inode;
@@ -781,16 +784,18 @@ again:
 	}
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(inode)->io_tree,
-			    start_pos, last_pos - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree,
+				 start_pos, last_pos - 1, 0, &cached_state,
+				 GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    last_pos - 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > start_pos &&
 		    ordered->file_offset < last_pos) {
 			btrfs_put_ordered_extent(ordered);
-			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      start_pos, last_pos - 1, GFP_NOFS);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     start_pos, last_pos - 1,
+					     &cached_state, GFP_NOFS);
 			for (i = 0; i < num_pages; i++) {
 				unlock_page(pages[i]);
 				page_cache_release(pages[i]);
@@ -802,12 +807,13 @@ again:
802 if (ordered) 807 if (ordered)
803 btrfs_put_ordered_extent(ordered); 808 btrfs_put_ordered_extent(ordered);
804 809
805 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, 810 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
806 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 811 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
807 EXTENT_DO_ACCOUNTING, 812 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
808 GFP_NOFS); 813 GFP_NOFS);
809 unlock_extent(&BTRFS_I(inode)->io_tree, 814 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
810 start_pos, last_pos - 1, GFP_NOFS); 815 start_pos, last_pos - 1, &cached_state,
816 GFP_NOFS);
811 } 817 }
812 for (i = 0; i < num_pages; i++) { 818 for (i = 0; i < num_pages; i++) {
813 clear_page_dirty_for_io(pages[i]); 819 clear_page_dirty_for_io(pages[i]);
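All of the file.c hunks above convert to the same pattern: lock_extent_bits() hands back the extent_state record it locked through a cached_state out-pointer, and the paired unlock_extent_cached()/clear_extent_bit() consume that record, saving a second search of the io tree on unlock. A minimal sketch of the pattern, using the same btrfs extent-io calls as the hunks above:

	struct extent_state *cached_state = NULL;

	/* lock [start, end] and remember which state record we locked */
	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, end, 0,
			 &cached_state, GFP_NOFS);

	/* ... work on the locked range ... */

	/* unlock via the cached record instead of a fresh tree lookup */
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, end,
			     &cached_state, GFP_NOFS);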
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cb2849f03251..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/math64.h> 22#include <linux/math64.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "free-space-cache.h" 24#include "free-space-cache.h"
@@ -870,7 +871,7 @@ __btrfs_return_cluster_to_free_space(
870 tree_insert_offset(&block_group->free_space_offset, 871 tree_insert_offset(&block_group->free_space_offset,
871 entry->offset, &entry->offset_index, 0); 872 entry->offset, &entry->offset_index, 0);
872 } 873 }
873 cluster->root.rb_node = NULL; 874 cluster->root = RB_ROOT;
874 875
875out: 876out:
876 spin_unlock(&cluster->lock); 877 spin_unlock(&cluster->lock);
@@ -1355,7 +1356,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
1355{ 1356{
1356 spin_lock_init(&cluster->lock); 1357 spin_lock_init(&cluster->lock);
1357 spin_lock_init(&cluster->refill_lock); 1358 spin_lock_init(&cluster->refill_lock);
1358 cluster->root.rb_node = NULL; 1359 cluster->root = RB_ROOT;
1359 cluster->max_size = 0; 1360 cluster->max_size = 0;
1360 cluster->points_to_bitmap = false; 1361 cluster->points_to_bitmap = false;
1361 INIT_LIST_HEAD(&cluster->block_group_list); 1362 INIT_LIST_HEAD(&cluster->block_group_list);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c41db6d45ab6..2bfdc641d4e3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/xattr.h> 36#include <linux/xattr.h>
37#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h>
39#include "compat.h" 40#include "compat.h"
40#include "ctree.h" 41#include "ctree.h"
41#include "disk-io.h" 42#include "disk-io.h"
@@ -379,7 +380,8 @@ again:
379 * change at any time if we discover bad compression ratios. 380 * change at any time if we discover bad compression ratios.
380 */ 381 */
381 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 382 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
382 btrfs_test_opt(root, COMPRESS)) { 383 (btrfs_test_opt(root, COMPRESS) ||
384 (BTRFS_I(inode)->force_compress))) {
383 WARN_ON(pages); 385 WARN_ON(pages);
384 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 386 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
385 387
@@ -483,8 +485,10 @@ again:
483 nr_pages_ret = 0; 485 nr_pages_ret = 0;
484 486
485 /* flag the file so we don't compress in the future */ 487 /* flag the file so we don't compress in the future */
486 if (!btrfs_test_opt(root, FORCE_COMPRESS)) 488 if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
489 !(BTRFS_I(inode)->force_compress)) {
487 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 490 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
491 }
488 } 492 }
489 if (will_compress) { 493 if (will_compress) {
490 *num_added += 1; 494 *num_added += 1;
@@ -570,8 +574,8 @@ retry:
570 unsigned long nr_written = 0; 574 unsigned long nr_written = 0;
571 575
572 lock_extent(io_tree, async_extent->start, 576 lock_extent(io_tree, async_extent->start,
573 async_extent->start + 577 async_extent->start +
574 async_extent->ram_size - 1, GFP_NOFS); 578 async_extent->ram_size - 1, GFP_NOFS);
575 579
576 /* allocate blocks */ 580 /* allocate blocks */
577 ret = cow_file_range(inode, async_cow->locked_page, 581 ret = cow_file_range(inode, async_cow->locked_page,
@@ -793,7 +797,7 @@ static noinline int cow_file_range(struct inode *inode,
793 while (disk_num_bytes > 0) { 797 while (disk_num_bytes > 0) {
794 unsigned long op; 798 unsigned long op;
795 799
796 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 800 cur_alloc_size = disk_num_bytes;
797 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 801 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
798 root->sectorsize, 0, alloc_hint, 802 root->sectorsize, 0, alloc_hint,
799 (u64)-1, &ins, 1); 803 (u64)-1, &ins, 1);
@@ -1211,7 +1215,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1211 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) 1215 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
1212 ret = run_delalloc_nocow(inode, locked_page, start, end, 1216 ret = run_delalloc_nocow(inode, locked_page, start, end,
1213 page_started, 0, nr_written); 1217 page_started, 0, nr_written);
1214 else if (!btrfs_test_opt(root, COMPRESS)) 1218 else if (!btrfs_test_opt(root, COMPRESS) &&
1219 !(BTRFS_I(inode)->force_compress))
1215 ret = cow_file_range(inode, locked_page, start, end, 1220 ret = cow_file_range(inode, locked_page, start, end,
1216 page_started, nr_written, 1); 1221 page_started, nr_written, 1);
1217 else 1222 else
@@ -1223,30 +1228,9 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1223static int btrfs_split_extent_hook(struct inode *inode, 1228static int btrfs_split_extent_hook(struct inode *inode,
1224 struct extent_state *orig, u64 split) 1229 struct extent_state *orig, u64 split)
1225{ 1230{
1226 struct btrfs_root *root = BTRFS_I(inode)->root;
1227 u64 size;
1228
1229 if (!(orig->state & EXTENT_DELALLOC)) 1231 if (!(orig->state & EXTENT_DELALLOC))
1230 return 0; 1232 return 0;
1231 1233
1232 size = orig->end - orig->start + 1;
1233 if (size > root->fs_info->max_extent) {
1234 u64 num_extents;
1235 u64 new_size;
1236
1237 new_size = orig->end - split + 1;
1238 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1239 root->fs_info->max_extent);
1240
1241 /*
1242 * if we break a large extent up then leave outstanding_extents
1243 * be, since we've already accounted for the large extent.
1244 */
1245 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1246 root->fs_info->max_extent) < num_extents)
1247 return 0;
1248 }
1249
1250 spin_lock(&BTRFS_I(inode)->accounting_lock); 1234 spin_lock(&BTRFS_I(inode)->accounting_lock);
1251 BTRFS_I(inode)->outstanding_extents++; 1235 BTRFS_I(inode)->outstanding_extents++;
1252 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1236 spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1264,38 +1248,10 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1264 struct extent_state *new, 1248 struct extent_state *new,
1265 struct extent_state *other) 1249 struct extent_state *other)
1266{ 1250{
1267 struct btrfs_root *root = BTRFS_I(inode)->root;
1268 u64 new_size, old_size;
1269 u64 num_extents;
1270
1271 /* not delalloc, ignore it */ 1251 /* not delalloc, ignore it */
1272 if (!(other->state & EXTENT_DELALLOC)) 1252 if (!(other->state & EXTENT_DELALLOC))
1273 return 0; 1253 return 0;
1274 1254
1275 old_size = other->end - other->start + 1;
1276 if (new->start < other->start)
1277 new_size = other->end - new->start + 1;
1278 else
1279 new_size = new->end - other->start + 1;
1280
1281 /* we're not bigger than the max, unreserve the space and go */
1282 if (new_size <= root->fs_info->max_extent) {
1283 spin_lock(&BTRFS_I(inode)->accounting_lock);
1284 BTRFS_I(inode)->outstanding_extents--;
1285 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1286 return 0;
1287 }
1288
1289 /*
1290 * If we grew by another max_extent, just return, we want to keep that
1291 * reserved amount.
1292 */
1293 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1294 root->fs_info->max_extent);
1295 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1296 root->fs_info->max_extent) > num_extents)
1297 return 0;
1298
1299 spin_lock(&BTRFS_I(inode)->accounting_lock); 1255 spin_lock(&BTRFS_I(inode)->accounting_lock);
1300 BTRFS_I(inode)->outstanding_extents--; 1256 BTRFS_I(inode)->outstanding_extents--;
1301 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1257 spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1324,6 +1280,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1324 BTRFS_I(inode)->outstanding_extents++; 1280 BTRFS_I(inode)->outstanding_extents++;
1325 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1281 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1326 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1282 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1283
1327 spin_lock(&root->fs_info->delalloc_lock); 1284 spin_lock(&root->fs_info->delalloc_lock);
1328 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1285 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1329 root->fs_info->delalloc_bytes += end - start + 1; 1286 root->fs_info->delalloc_bytes += end - start + 1;
@@ -1352,6 +1309,7 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1352 1309
1353 if (bits & EXTENT_DO_ACCOUNTING) { 1310 if (bits & EXTENT_DO_ACCOUNTING) {
1354 spin_lock(&BTRFS_I(inode)->accounting_lock); 1311 spin_lock(&BTRFS_I(inode)->accounting_lock);
1312 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
1355 BTRFS_I(inode)->outstanding_extents--; 1313 BTRFS_I(inode)->outstanding_extents--;
1356 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1314 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1357 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1315 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -1508,12 +1466,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1508 return 0; 1466 return 0;
1509} 1467}
1510 1468
1511int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) 1469int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1470 struct extent_state **cached_state)
1512{ 1471{
1513 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1472 if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1514 WARN_ON(1); 1473 WARN_ON(1);
1515 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1474 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1516 GFP_NOFS); 1475 cached_state, GFP_NOFS);
1517} 1476}
1518 1477
1519/* see btrfs_writepage_start_hook for details on why this is required */ 1478/* see btrfs_writepage_start_hook for details on why this is required */
@@ -1526,6 +1485,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1526{ 1485{
1527 struct btrfs_writepage_fixup *fixup; 1486 struct btrfs_writepage_fixup *fixup;
1528 struct btrfs_ordered_extent *ordered; 1487 struct btrfs_ordered_extent *ordered;
1488 struct extent_state *cached_state = NULL;
1529 struct page *page; 1489 struct page *page;
1530 struct inode *inode; 1490 struct inode *inode;
1531 u64 page_start; 1491 u64 page_start;
@@ -1544,7 +1504,8 @@ again:
1544 page_start = page_offset(page); 1504 page_start = page_offset(page);
1545 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; 1505 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1546 1506
1547 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); 1507 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1508 &cached_state, GFP_NOFS);
1548 1509
1549 /* already ordered? We're done */ 1510 /* already ordered? We're done */
1550 if (PagePrivate2(page)) 1511 if (PagePrivate2(page))
@@ -1552,17 +1513,18 @@ again:
1552 1513
1553 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1514 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1554 if (ordered) { 1515 if (ordered) {
1555 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, 1516 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1556 page_end, GFP_NOFS); 1517 page_end, &cached_state, GFP_NOFS);
1557 unlock_page(page); 1518 unlock_page(page);
1558 btrfs_start_ordered_extent(inode, ordered, 1); 1519 btrfs_start_ordered_extent(inode, ordered, 1);
1559 goto again; 1520 goto again;
1560 } 1521 }
1561 1522
1562 btrfs_set_extent_delalloc(inode, page_start, page_end); 1523 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1563 ClearPageChecked(page); 1524 ClearPageChecked(page);
1564out: 1525out:
1565 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); 1526 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1527 &cached_state, GFP_NOFS);
1566out_page: 1528out_page:
1567 unlock_page(page); 1529 unlock_page(page);
1568 page_cache_release(page); 1530 page_cache_release(page);
@@ -1691,14 +1653,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1691 struct btrfs_trans_handle *trans; 1653 struct btrfs_trans_handle *trans;
1692 struct btrfs_ordered_extent *ordered_extent = NULL; 1654 struct btrfs_ordered_extent *ordered_extent = NULL;
1693 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1655 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1656 struct extent_state *cached_state = NULL;
1694 int compressed = 0; 1657 int compressed = 0;
1695 int ret; 1658 int ret;
1696 1659
1697 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1); 1660 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1661 end - start + 1);
1698 if (!ret) 1662 if (!ret)
1699 return 0; 1663 return 0;
1700
1701 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1702 BUG_ON(!ordered_extent); 1664 BUG_ON(!ordered_extent);
1703 1665
1704 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1666 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
@@ -1713,9 +1675,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1713 goto out; 1675 goto out;
1714 } 1676 }
1715 1677
1716 lock_extent(io_tree, ordered_extent->file_offset, 1678 lock_extent_bits(io_tree, ordered_extent->file_offset,
1717 ordered_extent->file_offset + ordered_extent->len - 1, 1679 ordered_extent->file_offset + ordered_extent->len - 1,
1718 GFP_NOFS); 1680 0, &cached_state, GFP_NOFS);
1719 1681
1720 trans = btrfs_join_transaction(root, 1); 1682 trans = btrfs_join_transaction(root, 1);
1721 1683
@@ -1742,9 +1704,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1742 ordered_extent->len); 1704 ordered_extent->len);
1743 BUG_ON(ret); 1705 BUG_ON(ret);
1744 } 1706 }
1745 unlock_extent(io_tree, ordered_extent->file_offset, 1707 unlock_extent_cached(io_tree, ordered_extent->file_offset,
1746 ordered_extent->file_offset + ordered_extent->len - 1, 1708 ordered_extent->file_offset +
1747 GFP_NOFS); 1709 ordered_extent->len - 1, &cached_state, GFP_NOFS);
1710
1748 add_pending_csums(trans, inode, ordered_extent->file_offset, 1711 add_pending_csums(trans, inode, ordered_extent->file_offset,
1749 &ordered_extent->list); 1712 &ordered_extent->list);
1750 1713
@@ -2153,7 +2116,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2153 found_key.objectid = found_key.offset; 2116 found_key.objectid = found_key.offset;
2154 found_key.type = BTRFS_INODE_ITEM_KEY; 2117 found_key.type = BTRFS_INODE_ITEM_KEY;
2155 found_key.offset = 0; 2118 found_key.offset = 0;
2156 inode = btrfs_iget(root->fs_info->sb, &found_key, root); 2119 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2157 if (IS_ERR(inode)) 2120 if (IS_ERR(inode))
2158 break; 2121 break;
2159 2122
@@ -3081,6 +3044,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3081 struct btrfs_root *root = BTRFS_I(inode)->root; 3044 struct btrfs_root *root = BTRFS_I(inode)->root;
3082 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3045 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3083 struct btrfs_ordered_extent *ordered; 3046 struct btrfs_ordered_extent *ordered;
3047 struct extent_state *cached_state = NULL;
3084 char *kaddr; 3048 char *kaddr;
3085 u32 blocksize = root->sectorsize; 3049 u32 blocksize = root->sectorsize;
3086 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3050 pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -3127,12 +3091,14 @@ again:
3127 } 3091 }
3128 wait_on_page_writeback(page); 3092 wait_on_page_writeback(page);
3129 3093
3130 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 3094 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
3095 GFP_NOFS);
3131 set_page_extent_mapped(page); 3096 set_page_extent_mapped(page);
3132 3097
3133 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3098 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3134 if (ordered) { 3099 if (ordered) {
3135 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3100 unlock_extent_cached(io_tree, page_start, page_end,
3101 &cached_state, GFP_NOFS);
3136 unlock_page(page); 3102 unlock_page(page);
3137 page_cache_release(page); 3103 page_cache_release(page);
3138 btrfs_start_ordered_extent(inode, ordered, 1); 3104 btrfs_start_ordered_extent(inode, ordered, 1);
@@ -3140,13 +3106,15 @@ again:
3140 goto again; 3106 goto again;
3141 } 3107 }
3142 3108
3143 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 3109 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
3144 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3110 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
3145 GFP_NOFS); 3111 0, 0, &cached_state, GFP_NOFS);
3146 3112
3147 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 3113 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
3114 &cached_state);
3148 if (ret) { 3115 if (ret) {
3149 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3116 unlock_extent_cached(io_tree, page_start, page_end,
3117 &cached_state, GFP_NOFS);
3150 goto out_unlock; 3118 goto out_unlock;
3151 } 3119 }
3152 3120
@@ -3159,7 +3127,8 @@ again:
3159 } 3127 }
3160 ClearPageChecked(page); 3128 ClearPageChecked(page);
3161 set_page_dirty(page); 3129 set_page_dirty(page);
3162 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3130 unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
3131 GFP_NOFS);
3163 3132
3164out_unlock: 3133out_unlock:
3165 if (ret) 3134 if (ret)
@@ -3177,6 +3146,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3177 struct btrfs_root *root = BTRFS_I(inode)->root; 3146 struct btrfs_root *root = BTRFS_I(inode)->root;
3178 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3147 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3179 struct extent_map *em; 3148 struct extent_map *em;
3149 struct extent_state *cached_state = NULL;
3180 u64 mask = root->sectorsize - 1; 3150 u64 mask = root->sectorsize - 1;
3181 u64 hole_start = (inode->i_size + mask) & ~mask; 3151 u64 hole_start = (inode->i_size + mask) & ~mask;
3182 u64 block_end = (size + mask) & ~mask; 3152 u64 block_end = (size + mask) & ~mask;
@@ -3192,11 +3162,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3192 struct btrfs_ordered_extent *ordered; 3162 struct btrfs_ordered_extent *ordered;
3193 btrfs_wait_ordered_range(inode, hole_start, 3163 btrfs_wait_ordered_range(inode, hole_start,
3194 block_end - hole_start); 3164 block_end - hole_start);
3195 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3165 lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
3166 &cached_state, GFP_NOFS);
3196 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3167 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
3197 if (!ordered) 3168 if (!ordered)
3198 break; 3169 break;
3199 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3170 unlock_extent_cached(io_tree, hole_start, block_end - 1,
3171 &cached_state, GFP_NOFS);
3200 btrfs_put_ordered_extent(ordered); 3172 btrfs_put_ordered_extent(ordered);
3201 } 3173 }
3202 3174
@@ -3241,7 +3213,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3241 break; 3213 break;
3242 } 3214 }
3243 3215
3244 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3216 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3217 GFP_NOFS);
3245 return err; 3218 return err;
3246} 3219}
3247 3220
@@ -3639,6 +3612,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3639 bi->index_cnt = (u64)-1; 3612 bi->index_cnt = (u64)-1;
3640 bi->last_unlink_trans = 0; 3613 bi->last_unlink_trans = 0;
3641 bi->ordered_data_close = 0; 3614 bi->ordered_data_close = 0;
3615 bi->force_compress = 0;
3642 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3616 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3643 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3617 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3644 inode->i_mapping, GFP_NOFS); 3618 inode->i_mapping, GFP_NOFS);
@@ -3687,7 +3661,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
3687 * Returns in *is_new if the inode was read from disk 3661 * Returns in *is_new if the inode was read from disk
3688 */ 3662 */
3689struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3663struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3690 struct btrfs_root *root) 3664 struct btrfs_root *root, int *new)
3691{ 3665{
3692 struct inode *inode; 3666 struct inode *inode;
3693 3667
@@ -3702,6 +3676,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3702 3676
3703 inode_tree_add(inode); 3677 inode_tree_add(inode);
3704 unlock_new_inode(inode); 3678 unlock_new_inode(inode);
3679 if (new)
3680 *new = 1;
3705 } 3681 }
3706 3682
3707 return inode; 3683 return inode;
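The new fourth argument is an optional out-parameter: callers that do not care whether the inode came from the inode cache pass NULL (as every converted call site below does), while a caller that needs to know can pass an int. A sketch, with sb, location and root standing in for a real caller's values:

	int new = 0;
	struct inode *inode;

	inode = btrfs_iget(sb, &location, root, &new);
	if (!IS_ERR(inode) && new) {
		/* inode was freshly read from disk, not a cache hit */
	}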
@@ -3754,7 +3730,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3754 return NULL; 3730 return NULL;
3755 3731
3756 if (location.type == BTRFS_INODE_ITEM_KEY) { 3732 if (location.type == BTRFS_INODE_ITEM_KEY) {
3757 inode = btrfs_iget(dir->i_sb, &location, root); 3733 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
3758 return inode; 3734 return inode;
3759 } 3735 }
3760 3736
@@ -3769,7 +3745,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3769 else 3745 else
3770 inode = new_simple_dir(dir->i_sb, &location, sub_root); 3746 inode = new_simple_dir(dir->i_sb, &location, sub_root);
3771 } else { 3747 } else {
3772 inode = btrfs_iget(dir->i_sb, &location, sub_root); 3748 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
3773 } 3749 }
3774 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 3750 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
3775 3751
@@ -4501,7 +4477,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4501 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4477 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4502 if (err) { 4478 if (err) {
4503 err = -ENOSPC; 4479 err = -ENOSPC;
4504 goto out_unlock; 4480 goto out_fail;
4505 } 4481 }
4506 4482
4507 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4483 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
@@ -4979,6 +4955,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4979{ 4955{
4980 struct extent_io_tree *tree; 4956 struct extent_io_tree *tree;
4981 struct btrfs_ordered_extent *ordered; 4957 struct btrfs_ordered_extent *ordered;
4958 struct extent_state *cached_state = NULL;
4982 u64 page_start = page_offset(page); 4959 u64 page_start = page_offset(page);
4983 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 4960 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4984 4961
@@ -4997,7 +4974,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4997 btrfs_releasepage(page, GFP_NOFS); 4974 btrfs_releasepage(page, GFP_NOFS);
4998 return; 4975 return;
4999 } 4976 }
5000 lock_extent(tree, page_start, page_end, GFP_NOFS); 4977 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
4978 GFP_NOFS);
5001 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 4979 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
5002 page_offset(page)); 4980 page_offset(page));
5003 if (ordered) { 4981 if (ordered) {
@@ -5008,7 +4986,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5008 clear_extent_bit(tree, page_start, page_end, 4986 clear_extent_bit(tree, page_start, page_end,
5009 EXTENT_DIRTY | EXTENT_DELALLOC | 4987 EXTENT_DIRTY | EXTENT_DELALLOC |
5010 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 4988 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
5011 NULL, GFP_NOFS); 4989 &cached_state, GFP_NOFS);
5012 /* 4990 /*
5013 * whoever cleared the private bit is responsible 4991 * whoever cleared the private bit is responsible
5014 * for the finish_ordered_io 4992 * for the finish_ordered_io
@@ -5018,11 +4996,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5018 page_start, page_end); 4996 page_start, page_end);
5019 } 4997 }
5020 btrfs_put_ordered_extent(ordered); 4998 btrfs_put_ordered_extent(ordered);
5021 lock_extent(tree, page_start, page_end, GFP_NOFS); 4999 cached_state = NULL;
5000 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
5001 GFP_NOFS);
5022 } 5002 }
5023 clear_extent_bit(tree, page_start, page_end, 5003 clear_extent_bit(tree, page_start, page_end,
5024 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 5004 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
5025 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); 5005 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
5026 __btrfs_releasepage(page, GFP_NOFS); 5006 __btrfs_releasepage(page, GFP_NOFS);
5027 5007
5028 ClearPageChecked(page); 5008 ClearPageChecked(page);
@@ -5055,6 +5035,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5055 struct btrfs_root *root = BTRFS_I(inode)->root; 5035 struct btrfs_root *root = BTRFS_I(inode)->root;
5056 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5036 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5057 struct btrfs_ordered_extent *ordered; 5037 struct btrfs_ordered_extent *ordered;
5038 struct extent_state *cached_state = NULL;
5058 char *kaddr; 5039 char *kaddr;
5059 unsigned long zero_start; 5040 unsigned long zero_start;
5060 loff_t size; 5041 loff_t size;
@@ -5093,7 +5074,8 @@ again:
5093 } 5074 }
5094 wait_on_page_writeback(page); 5075 wait_on_page_writeback(page);
5095 5076
5096 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 5077 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
5078 GFP_NOFS);
5097 set_page_extent_mapped(page); 5079 set_page_extent_mapped(page);
5098 5080
5099 /* 5081 /*
@@ -5102,7 +5084,8 @@ again:
5102 */ 5084 */
5103 ordered = btrfs_lookup_ordered_extent(inode, page_start); 5085 ordered = btrfs_lookup_ordered_extent(inode, page_start);
5104 if (ordered) { 5086 if (ordered) {
5105 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5087 unlock_extent_cached(io_tree, page_start, page_end,
5088 &cached_state, GFP_NOFS);
5106 unlock_page(page); 5089 unlock_page(page);
5107 btrfs_start_ordered_extent(inode, ordered, 1); 5090 btrfs_start_ordered_extent(inode, ordered, 1);
5108 btrfs_put_ordered_extent(ordered); 5091 btrfs_put_ordered_extent(ordered);
@@ -5116,13 +5099,15 @@ again:
5116 * is probably a better way to do this, but for now keep consistent with 5099 * is probably a better way to do this, but for now keep consistent with
5117 * prepare_pages in the normal write path. 5100 * prepare_pages in the normal write path.
5118 */ 5101 */
5119 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 5102 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
5120 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 5103 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
5121 GFP_NOFS); 5104 0, 0, &cached_state, GFP_NOFS);
5122 5105
5123 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 5106 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
5107 &cached_state);
5124 if (ret) { 5108 if (ret) {
5125 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5109 unlock_extent_cached(io_tree, page_start, page_end,
5110 &cached_state, GFP_NOFS);
5126 ret = VM_FAULT_SIGBUS; 5111 ret = VM_FAULT_SIGBUS;
5127 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 5112 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5128 goto out_unlock; 5113 goto out_unlock;
@@ -5148,7 +5133,7 @@ again:
5148 BTRFS_I(inode)->last_trans = root->fs_info->generation; 5133 BTRFS_I(inode)->last_trans = root->fs_info->generation;
5149 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 5134 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
5150 5135
5151 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5136 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5152 5137
5153out_unlock: 5138out_unlock:
5154 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 5139 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -5353,7 +5338,6 @@ free:
5353void btrfs_drop_inode(struct inode *inode) 5338void btrfs_drop_inode(struct inode *inode)
5354{ 5339{
5355 struct btrfs_root *root = BTRFS_I(inode)->root; 5340 struct btrfs_root *root = BTRFS_I(inode)->root;
5356
5357 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) 5341 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
5358 generic_delete_inode(inode); 5342 generic_delete_inode(inode);
5359 else 5343 else
@@ -5757,18 +5741,15 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5757 struct btrfs_trans_handle *trans; 5741 struct btrfs_trans_handle *trans;
5758 struct btrfs_root *root = BTRFS_I(inode)->root; 5742 struct btrfs_root *root = BTRFS_I(inode)->root;
5759 struct btrfs_key ins; 5743 struct btrfs_key ins;
5760 u64 alloc_size;
5761 u64 cur_offset = start; 5744 u64 cur_offset = start;
5762 u64 num_bytes = end - start; 5745 u64 num_bytes = end - start;
5763 int ret = 0; 5746 int ret = 0;
5764 u64 i_size; 5747 u64 i_size;
5765 5748
5766 while (num_bytes > 0) { 5749 while (num_bytes > 0) {
5767 alloc_size = min(num_bytes, root->fs_info->max_extent);
5768
5769 trans = btrfs_start_transaction(root, 1); 5750 trans = btrfs_start_transaction(root, 1);
5770 5751
5771 ret = btrfs_reserve_extent(trans, root, alloc_size, 5752 ret = btrfs_reserve_extent(trans, root, num_bytes,
5772 root->sectorsize, 0, alloc_hint, 5753 root->sectorsize, 0, alloc_hint,
5773 (u64)-1, &ins, 1); 5754 (u64)-1, &ins, 1);
5774 if (ret) { 5755 if (ret) {
@@ -5827,6 +5808,7 @@ stop_trans:
5827static long btrfs_fallocate(struct inode *inode, int mode, 5808static long btrfs_fallocate(struct inode *inode, int mode,
5828 loff_t offset, loff_t len) 5809 loff_t offset, loff_t len)
5829{ 5810{
5811 struct extent_state *cached_state = NULL;
5830 u64 cur_offset; 5812 u64 cur_offset;
5831 u64 last_byte; 5813 u64 last_byte;
5832 u64 alloc_start; 5814 u64 alloc_start;
@@ -5865,16 +5847,17 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5865 /* the extent lock is ordered inside the running 5847 /* the extent lock is ordered inside the running
5866 * transaction 5848 * transaction
5867 */ 5849 */
5868 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5850 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
5869 GFP_NOFS); 5851 locked_end, 0, &cached_state, GFP_NOFS);
5870 ordered = btrfs_lookup_first_ordered_extent(inode, 5852 ordered = btrfs_lookup_first_ordered_extent(inode,
5871 alloc_end - 1); 5853 alloc_end - 1);
5872 if (ordered && 5854 if (ordered &&
5873 ordered->file_offset + ordered->len > alloc_start && 5855 ordered->file_offset + ordered->len > alloc_start &&
5874 ordered->file_offset < alloc_end) { 5856 ordered->file_offset < alloc_end) {
5875 btrfs_put_ordered_extent(ordered); 5857 btrfs_put_ordered_extent(ordered);
5876 unlock_extent(&BTRFS_I(inode)->io_tree, 5858 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
5877 alloc_start, locked_end, GFP_NOFS); 5859 alloc_start, locked_end,
5860 &cached_state, GFP_NOFS);
5878 /* 5861 /*
5879 * we can't wait on the range with the transaction 5862 * we can't wait on the range with the transaction
5880 * running or with the extent lock held 5863 * running or with the extent lock held
@@ -5916,8 +5899,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5916 break; 5899 break;
5917 } 5900 }
5918 } 5901 }
5919 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5902 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5920 GFP_NOFS); 5903 &cached_state, GFP_NOFS);
5921 5904
5922 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 5905 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
5923 alloc_end - alloc_start); 5906 alloc_end - alloc_start);
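Taken together, the inode.c hunks above reduce the compression policy to two rules: compress when the inode is not marked NOCOMPRESS and either the compress mount option or the new per-inode force_compress flag asks for it, and only fall back to flagging the inode NOCOMPRESS after a failed attempt when neither FORCE_COMPRESS nor force_compress is in effect. The first rule, summarized as a hypothetical helper (should_compress is illustrative, not part of the patch):

	static int should_compress(struct btrfs_root *root, struct inode *inode)
	{
		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
			return 0;
		return btrfs_test_opt(root, COMPRESS) ||
		       BTRFS_I(inode)->force_compress;
	}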
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 645a17927a8f..97a97839a867 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
39#include <linux/security.h> 39#include <linux/security.h>
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h>
42#include "compat.h" 43#include "compat.h"
43#include "ctree.h" 44#include "ctree.h"
44#include "disk-io.h" 45#include "disk-io.h"
@@ -474,7 +475,79 @@ out_unlock:
474 return error; 475 return error;
475} 476}
476 477
477static int btrfs_defrag_file(struct file *file) 478static int should_defrag_range(struct inode *inode, u64 start, u64 len,
479 int thresh, u64 *last_len, u64 *skip,
480 u64 *defrag_end)
481{
482 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
483 struct extent_map *em = NULL;
484 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
485 int ret = 1;
486
487
488 if (thresh == 0)
489 thresh = 256 * 1024;
490
491 /*
492 * make sure that once we start defragging an extent, we keep on
493 * defragging it
494 */
495 if (start < *defrag_end)
496 return 1;
497
498 *skip = 0;
499
500 /*
501 * hopefully we have this extent in the tree already, try without
502 * the full extent lock
503 */
504 read_lock(&em_tree->lock);
505 em = lookup_extent_mapping(em_tree, start, len);
506 read_unlock(&em_tree->lock);
507
508 if (!em) {
509 /* get the big lock and read metadata off disk */
510 lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
511 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
512 unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
513
514 if (IS_ERR(em))
515 return 0;
516 }
517
518 /* this will cover holes, and inline extents */
519 if (em->block_start >= EXTENT_MAP_LAST_BYTE)
520 ret = 0;
521
522 /*
523 * we hit a real extent; if it is big, don't bother defragging it again
524 */
525 if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
526 ret = 0;
527
528 /*
529 * last_len ends up being a counter of how many bytes we've defragged.
530 * every time we choose not to defrag an extent, we reset *last_len
531 * so that the next tiny extent will force a defrag.
532 *
533 * The end result of this is that tiny extents before a single big
534 * extent will force at least part of that big extent to be defragged.
535 */
536 if (ret) {
537 *last_len += len;
538 *defrag_end = extent_map_end(em);
539 } else {
540 *last_len = 0;
541 *skip = extent_map_end(em);
542 *defrag_end = 0;
543 }
544
545 free_extent_map(em);
546 return ret;
547}
548
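The heuristic in should_defrag_range() is easier to see as a single predicate, restated from the code above (nothing new is assumed):

	skip = (em->len >= thresh) && (*last_len == 0 || *last_len >= thresh)

So a large extent that immediately follows freshly defragged small extents (0 < *last_len < thresh) is defragged as well, which is how a run of tiny extents drags in at least part of the big extent behind them. Once a large extent is skipped, *last_len resets to 0, so the next large extent is skipped too.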
549static int btrfs_defrag_file(struct file *file,
550 struct btrfs_ioctl_defrag_range_args *range)
478{ 551{
479 struct inode *inode = fdentry(file)->d_inode; 552 struct inode *inode = fdentry(file)->d_inode;
480 struct btrfs_root *root = BTRFS_I(inode)->root; 553 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -486,37 +559,96 @@ static int btrfs_defrag_file(struct file *file)
486 unsigned long total_read = 0; 559 unsigned long total_read = 0;
487 u64 page_start; 560 u64 page_start;
488 u64 page_end; 561 u64 page_end;
562 u64 last_len = 0;
563 u64 skip = 0;
564 u64 defrag_end = 0;
489 unsigned long i; 565 unsigned long i;
490 int ret; 566 int ret;
491 567
492 ret = btrfs_check_data_free_space(root, inode, inode->i_size); 568 if (inode->i_size == 0)
493 if (ret) 569 return 0;
494 return -ENOSPC; 570
571 if (range->start + range->len > range->start) {
572 last_index = min_t(u64, inode->i_size - 1,
573 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
574 } else {
575 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
576 }
577
578 i = range->start >> PAGE_CACHE_SHIFT;
579 while (i <= last_index) {
580 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
581 PAGE_CACHE_SIZE,
582 range->extent_thresh,
583 &last_len, &skip,
584 &defrag_end)) {
585 unsigned long next;
586 /*
587 * the should_defrag_range function tells us how much to skip;
588 * bump our counter by the suggested amount
589 */
590 next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
591 i = max(i + 1, next);
592 continue;
593 }
495 594
496 mutex_lock(&inode->i_mutex);
497 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
498 for (i = 0; i <= last_index; i++) {
499 if (total_read % ra_pages == 0) { 595 if (total_read % ra_pages == 0) {
500 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, 596 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
501 min(last_index, i + ra_pages - 1)); 597 min(last_index, i + ra_pages - 1));
502 } 598 }
503 total_read++; 599 total_read++;
600 mutex_lock(&inode->i_mutex);
601 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
602 BTRFS_I(inode)->force_compress = 1;
603
604 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
605 if (ret) {
606 ret = -ENOSPC;
607 break;
608 }
609
610 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
611 if (ret) {
612 btrfs_free_reserved_data_space(root, inode,
613 PAGE_CACHE_SIZE);
614 ret = -ENOSPC;
615 break;
616 }
504again: 617again:
618 if (inode->i_size == 0 ||
619 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
620 ret = 0;
621 goto err_reservations;
622 }
623
505 page = grab_cache_page(inode->i_mapping, i); 624 page = grab_cache_page(inode->i_mapping, i);
506 if (!page) 625 if (!page)
507 goto out_unlock; 626 goto err_reservations;
627
508 if (!PageUptodate(page)) { 628 if (!PageUptodate(page)) {
509 btrfs_readpage(NULL, page); 629 btrfs_readpage(NULL, page);
510 lock_page(page); 630 lock_page(page);
511 if (!PageUptodate(page)) { 631 if (!PageUptodate(page)) {
512 unlock_page(page); 632 unlock_page(page);
513 page_cache_release(page); 633 page_cache_release(page);
514 goto out_unlock; 634 goto err_reservations;
515 } 635 }
516 } 636 }
517 637
638 if (page->mapping != inode->i_mapping) {
639 unlock_page(page);
640 page_cache_release(page);
641 goto again;
642 }
643
518 wait_on_page_writeback(page); 644 wait_on_page_writeback(page);
519 645
646 if (PageDirty(page)) {
647 btrfs_free_reserved_data_space(root, inode,
648 PAGE_CACHE_SIZE);
649 goto loop_unlock;
650 }
651
520 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 652 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
521 page_end = page_start + PAGE_CACHE_SIZE - 1; 653 page_end = page_start + PAGE_CACHE_SIZE - 1;
522 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 654 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -537,18 +669,54 @@ again:
537 * page if it is dirtied again later 669 * page if it is dirtied again later
538 */ 670 */
539 clear_page_dirty_for_io(page); 671 clear_page_dirty_for_io(page);
672 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
673 page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
674 EXTENT_DO_ACCOUNTING, GFP_NOFS);
540 675
541 btrfs_set_extent_delalloc(inode, page_start, page_end); 676 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
677 ClearPageChecked(page);
542 set_page_dirty(page); 678 set_page_dirty(page);
543 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 679 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
680
681loop_unlock:
544 unlock_page(page); 682 unlock_page(page);
545 page_cache_release(page); 683 page_cache_release(page);
684 mutex_unlock(&inode->i_mutex);
685
686 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
546 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 687 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
688 i++;
689 }
690
691 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
692 filemap_flush(inode->i_mapping);
693
694 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
695 /* the filemap_flush will queue IO into the worker threads, but
696 * we have to make sure the IO is actually started and that
697 * ordered extents get created before we return
698 */
699 atomic_inc(&root->fs_info->async_submit_draining);
700 while (atomic_read(&root->fs_info->nr_async_submits) ||
701 atomic_read(&root->fs_info->async_delalloc_pages)) {
702 wait_event(root->fs_info->async_submit_wait,
703 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
704 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
705 }
706 atomic_dec(&root->fs_info->async_submit_draining);
707
708 mutex_lock(&inode->i_mutex);
709 BTRFS_I(inode)->force_compress = 0;
710 mutex_unlock(&inode->i_mutex);
547 } 711 }
548 712
549out_unlock:
550 mutex_unlock(&inode->i_mutex);
551 return 0; 713 return 0;
714
715err_reservations:
716 mutex_unlock(&inode->i_mutex);
717 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719 return ret;
552} 720}
553 721
554static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 722static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
@@ -608,7 +776,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
608 mod = 1; 776 mod = 1;
609 sizestr++; 777 sizestr++;
610 } 778 }
611 new_size = btrfs_parse_size(sizestr); 779 new_size = memparse(sizestr, NULL);
612 if (new_size == 0) { 780 if (new_size == 0) {
613 ret = -EINVAL; 781 ret = -EINVAL;
614 goto out_unlock; 782 goto out_unlock;
@@ -743,6 +911,330 @@ out:
743 return ret; 911 return ret;
744} 912}
745 913
914static noinline int key_in_sk(struct btrfs_key *key,
915 struct btrfs_ioctl_search_key *sk)
916{
917 struct btrfs_key test;
918 int ret;
919
920 test.objectid = sk->min_objectid;
921 test.type = sk->min_type;
922 test.offset = sk->min_offset;
923
924 ret = btrfs_comp_cpu_keys(key, &test);
925 if (ret < 0)
926 return 0;
927
928 test.objectid = sk->max_objectid;
929 test.type = sk->max_type;
930 test.offset = sk->max_offset;
931
932 ret = btrfs_comp_cpu_keys(key, &test);
933 if (ret > 0)
934 return 0;
935 return 1;
936}
937
938static noinline int copy_to_sk(struct btrfs_root *root,
939 struct btrfs_path *path,
940 struct btrfs_key *key,
941 struct btrfs_ioctl_search_key *sk,
942 char *buf,
943 unsigned long *sk_offset,
944 int *num_found)
945{
946 u64 found_transid;
947 struct extent_buffer *leaf;
948 struct btrfs_ioctl_search_header sh;
949 unsigned long item_off;
950 unsigned long item_len;
951 int nritems;
952 int i;
953 int slot;
954 int found = 0;
955 int ret = 0;
956
957 leaf = path->nodes[0];
958 slot = path->slots[0];
959 nritems = btrfs_header_nritems(leaf);
960
961 if (btrfs_header_generation(leaf) > sk->max_transid) {
962 i = nritems;
963 goto advance_key;
964 }
965 found_transid = btrfs_header_generation(leaf);
966
967 for (i = slot; i < nritems; i++) {
968 item_off = btrfs_item_ptr_offset(leaf, i);
969 item_len = btrfs_item_size_nr(leaf, i);
970
971 if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
972 item_len = 0;
973
974 if (sizeof(sh) + item_len + *sk_offset >
975 BTRFS_SEARCH_ARGS_BUFSIZE) {
976 ret = 1;
977 goto overflow;
978 }
979
980 btrfs_item_key_to_cpu(leaf, key, i);
981 if (!key_in_sk(key, sk))
982 continue;
983
984 sh.objectid = key->objectid;
985 sh.offset = key->offset;
986 sh.type = key->type;
987 sh.len = item_len;
988 sh.transid = found_transid;
989
990 /* copy search result header */
991 memcpy(buf + *sk_offset, &sh, sizeof(sh));
992 *sk_offset += sizeof(sh);
993
994 if (item_len) {
995 char *p = buf + *sk_offset;
996 /* copy the item */
997 read_extent_buffer(leaf, p,
998 item_off, item_len);
999 *sk_offset += item_len;
1000 }
1001 found++;
1002
1003 if (*num_found >= sk->nr_items)
1004 break;
1005 }
1006advance_key:
1007 ret = 0;
1008 if (key->offset < (u64)-1 && key->offset < sk->max_offset)
1009 key->offset++;
1010 else if (key->type < (u8)-1 && key->type < sk->max_type) {
1011 key->offset = 0;
1012 key->type++;
1013 } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
1014 key->offset = 0;
1015 key->type = 0;
1016 key->objectid++;
1017 } else
1018 ret = 1;
1019overflow:
1020 *num_found += found;
1021 return ret;
1022}
1023
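The advance_key block treats a btrfs_key as one ordered (objectid, type, offset) value and steps to the lexicographically next key that can still lie inside the search bounds. Stripped of the sk->max_* clamping, the idea looks like this (key_advance is an illustrative helper, not part of the patch):

	static void key_advance(struct btrfs_key *key)
	{
		if (key->offset < (u64)-1) {
			key->offset++;
		} else if (key->type < (u8)-1) {
			key->offset = 0;
			key->type++;
		} else if (key->objectid < (u64)-1) {
			key->offset = 0;
			key->type = 0;
			key->objectid++;
		}
	}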
1024static noinline int search_ioctl(struct inode *inode,
1025 struct btrfs_ioctl_search_args *args)
1026{
1027 struct btrfs_root *root;
1028 struct btrfs_key key;
1029 struct btrfs_key max_key;
1030 struct btrfs_path *path;
1031 struct btrfs_ioctl_search_key *sk = &args->key;
1032 struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
1033 int ret;
1034 int num_found = 0;
1035 unsigned long sk_offset = 0;
1036
1037 path = btrfs_alloc_path();
1038 if (!path)
1039 return -ENOMEM;
1040
1041 if (sk->tree_id == 0) {
1042 /* search the root of the inode that was passed */
1043 root = BTRFS_I(inode)->root;
1044 } else {
1045 key.objectid = sk->tree_id;
1046 key.type = BTRFS_ROOT_ITEM_KEY;
1047 key.offset = (u64)-1;
1048 root = btrfs_read_fs_root_no_name(info, &key);
1049 if (IS_ERR(root)) {
1050 printk(KERN_ERR "could not find root %llu\n",
1051 sk->tree_id);
1052 btrfs_free_path(path);
1053 return -ENOENT;
1054 }
1055 }
1056
1057 key.objectid = sk->min_objectid;
1058 key.type = sk->min_type;
1059 key.offset = sk->min_offset;
1060
1061 max_key.objectid = sk->max_objectid;
1062 max_key.type = sk->max_type;
1063 max_key.offset = sk->max_offset;
1064
1065 path->keep_locks = 1;
1066
1067 while (1) {
1068 ret = btrfs_search_forward(root, &key, &max_key, path, 0,
1069 sk->min_transid);
1070 if (ret != 0) {
1071 if (ret > 0)
1072 ret = 0;
1073 goto err;
1074 }
1075 ret = copy_to_sk(root, path, &key, sk, args->buf,
1076 &sk_offset, &num_found);
1077 btrfs_release_path(root, path);
1078 if (ret || num_found >= sk->nr_items)
1079 break;
1080
1081 }
1082 ret = 0;
1083err:
1084 sk->nr_items = num_found;
1085 btrfs_free_path(path);
1086 return ret;
1087}
1088
1089static noinline int btrfs_ioctl_tree_search(struct file *file,
1090 void __user *argp)
1091{
1092 struct btrfs_ioctl_search_args *args;
1093 struct inode *inode;
1094 int ret;
1095
1096 if (!capable(CAP_SYS_ADMIN))
1097 return -EPERM;
1098
1099 args = kmalloc(sizeof(*args), GFP_KERNEL);
1100 if (!args)
1101 return -ENOMEM;
1102
1103 if (copy_from_user(args, argp, sizeof(*args))) {
1104 kfree(args);
1105 return -EFAULT;
1106 }
1107 inode = fdentry(file)->d_inode;
1108 ret = search_ioctl(inode, args);
1109 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1110 ret = -EFAULT;
1111 kfree(args);
1112 return ret;
1113}
1114
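From userland, the search is driven by filling the key with inclusive (objectid, type, offset) bounds and a transid window. A sketch that fetches up to 64 items belonging to one inode in the subvolume behind fd (assumes the BTRFS_IOC_TREE_SEARCH request number this patch adds to ioctl.h; includes and error handling omitted):

	struct btrfs_ioctl_search_args args;

	memset(&args, 0, sizeof(args));
	args.key.tree_id = 0;			/* tree that fd lives in */
	args.key.min_objectid = ino;
	args.key.max_objectid = ino;
	args.key.max_type = (__u32)-1;		/* any item type */
	args.key.max_offset = (__u64)-1;
	args.key.max_transid = (__u64)-1;
	args.key.nr_items = 64;

	ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args);
	/* on return, args.key.nr_items is the number of search
	   headers (plus item payloads) packed into args.buf */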
1115/*
1116 * Search INODE_REFs to identify the path name of the 'dirid' directory
1117 * in a 'tree_id' tree, and store the path name in 'name'.
1118 */
1119static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1120 u64 tree_id, u64 dirid, char *name)
1121{
1122 struct btrfs_root *root;
1123 struct btrfs_key key;
1124 char *ptr;
1125 int ret = -1;
1126 int slot;
1127 int len;
1128 int total_len = 0;
1129 struct btrfs_inode_ref *iref;
1130 struct extent_buffer *l;
1131 struct btrfs_path *path;
1132
1133 if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
1134 name[0] = '\0';
1135 return 0;
1136 }
1137
1138 path = btrfs_alloc_path();
1139 if (!path)
1140 return -ENOMEM;
1141
1142 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
1143
1144 key.objectid = tree_id;
1145 key.type = BTRFS_ROOT_ITEM_KEY;
1146 key.offset = (u64)-1;
1147 root = btrfs_read_fs_root_no_name(info, &key);
1148 if (IS_ERR(root)) {
1149 printk(KERN_ERR "could not find root %llu\n", tree_id);
1150 ret = -ENOENT;
1151 goto out;
1152 }
1153
1154 key.objectid = dirid;
1155 key.type = BTRFS_INODE_REF_KEY;
1156 key.offset = (u64)-1;
1157
1158 while (1) {
1159 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1160 if (ret < 0)
1161 goto out;
1162
1163 l = path->nodes[0];
1164 slot = path->slots[0];
1165 if (ret > 0 && slot > 0)
1166 slot--;
1167 btrfs_item_key_to_cpu(l, &key, slot);
1168
1169 if (ret > 0 && (key.objectid != dirid ||
1170 key.type != BTRFS_INODE_REF_KEY)) {
1171 ret = -ENOENT;
1172 goto out;
1173 }
1174
1175 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
1176 len = btrfs_inode_ref_name_len(l, iref);
1177 ptr -= len + 1;
1178 total_len += len + 1;
1179 if (ptr < name)
1180 goto out;
1181
1182 *(ptr + len) = '/';
1183 read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
1184
1185 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1186 break;
1187
1188 btrfs_release_path(root, path);
1189 key.objectid = key.offset;
1190 key.offset = (u64)-1;
1191 dirid = key.objectid;
1192
1193 }
1194 if (ptr < name)
1195 goto out;
1196 memcpy(name, ptr, total_len);
1197 name[total_len] = '\0';
1198 ret = 0;
1199out:
1200 btrfs_free_path(path);
1201 return ret;
1202}
1203
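The loop above builds the path right to left: ptr starts at the end of the BTRFS_INO_LOOKUP_PATH_MAX buffer, each INODE_REF prepends one "component/" chunk, and key.offset (the parent directory's objectid) steers the walk upward until the parent is the subvolume root. An illustrative trace for a directory /a/b/c, with dirid set to c's inode number:

	/*
	 * step 1: ref of c, parent b  -> buffer tail holds "c/"
	 * step 2: ref of b, parent a  -> buffer tail holds "b/c/"
	 * step 3: ref of a, parent is
	 *         BTRFS_FIRST_FREE_OBJECTID -> "a/b/c/", loop breaks
	 * finally memcpy() shifts "a/b/c/" to name[0] and NUL-terminates.
	 */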
1204static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1205 void __user *argp)
1206{
1207 struct btrfs_ioctl_ino_lookup_args *args;
1208 struct inode *inode;
1209 int ret;
1210
1211 if (!capable(CAP_SYS_ADMIN))
1212 return -EPERM;
1213
1214 args = kmalloc(sizeof(*args), GFP_KERNEL);
1215 if (!args)
1216 return -ENOMEM;
1217
1218 if (copy_from_user(args, argp, sizeof(*args))) {
1219 kfree(args);
1220 return -EFAULT;
1221 }
1222 inode = fdentry(file)->d_inode;
1223
1224 if (args->treeid == 0)
1225 args->treeid = BTRFS_I(inode)->root->root_key.objectid;
1226
1227 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
1228 args->treeid, args->objectid,
1229 args->name);
1230
1231 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1232 ret = -EFAULT;
1233
1234 kfree(args);
1235 return ret;
1236}
1237
746static noinline int btrfs_ioctl_snap_destroy(struct file *file, 1238static noinline int btrfs_ioctl_snap_destroy(struct file *file,
747 void __user *arg) 1239 void __user *arg)
748{ 1240{
@@ -849,10 +1341,11 @@ out:
849 return err; 1341 return err;
850} 1342}
851 1343
852static int btrfs_ioctl_defrag(struct file *file) 1344static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
853{ 1345{
854 struct inode *inode = fdentry(file)->d_inode; 1346 struct inode *inode = fdentry(file)->d_inode;
855 struct btrfs_root *root = BTRFS_I(inode)->root; 1347 struct btrfs_root *root = BTRFS_I(inode)->root;
1348 struct btrfs_ioctl_defrag_range_args *range;
856 int ret; 1349 int ret;
857 1350
858 ret = mnt_want_write(file->f_path.mnt); 1351 ret = mnt_want_write(file->f_path.mnt);
@@ -873,7 +1366,31 @@ static int btrfs_ioctl_defrag(struct file *file)
873 ret = -EINVAL; 1366 ret = -EINVAL;
874 goto out; 1367 goto out;
875 } 1368 }
876 btrfs_defrag_file(file); 1369
1370 range = kzalloc(sizeof(*range), GFP_KERNEL);
1371 if (!range) {
1372 ret = -ENOMEM;
1373 goto out;
1374 }
1375
1376 if (argp) {
1377 if (copy_from_user(range, argp,
1378 sizeof(*range))) {
1379 ret = -EFAULT;
1380 kfree(range);
1381 goto out;
1382 }
1383 /* compression requires us to start the IO */
1384 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1385 range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
1386 range->extent_thresh = (u32)-1;
1387 }
1388 } else {
1389 /* the rest are all set to zero by kzalloc */
1390 range->len = (u64)-1;
1391 }
1392 btrfs_defrag_file(file, range);
1393 kfree(range);
877 break; 1394 break;
878 } 1395 }
879out: 1396out:
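A userland sketch of the new range interface (assumes the BTRFS_IOC_DEFRAG_RANGE request number and the btrfs_ioctl_defrag_range_args layout this patch adds to ioctl.h; fd is an open file on btrfs, includes and error handling trimmed):

	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;			/* defrag the whole file */
	range.extent_thresh = 256 * 1024;	/* leave extents >= 256KiB alone */
	range.flags = BTRFS_DEFRAG_RANGE_START_IO;

	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");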
@@ -964,12 +1481,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
964 ret = -EBADF; 1481 ret = -EBADF;
965 goto out_drop_write; 1482 goto out_drop_write;
966 } 1483 }
1484
967 src = src_file->f_dentry->d_inode; 1485 src = src_file->f_dentry->d_inode;
968 1486
969 ret = -EINVAL; 1487 ret = -EINVAL;
970 if (src == inode) 1488 if (src == inode)
971 goto out_fput; 1489 goto out_fput;
972 1490
1491 /* the src must be open for reading */
1492 if (!(src_file->f_mode & FMODE_READ))
1493 goto out_fput;
1494
973 ret = -EISDIR; 1495 ret = -EISDIR;
974 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) 1496 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
975 goto out_fput; 1497 goto out_fput;
@@ -1274,6 +1796,157 @@ out:
1274 return ret; 1796 return ret;
1275} 1797}
1276 1798
1799static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1800{
1801 struct inode *inode = fdentry(file)->d_inode;
1802 struct btrfs_root *root = BTRFS_I(inode)->root;
1803 struct btrfs_root *new_root;
1804 struct btrfs_dir_item *di;
1805 struct btrfs_trans_handle *trans;
1806 struct btrfs_path *path;
1807 struct btrfs_key location;
1808 struct btrfs_disk_key disk_key;
1809 struct btrfs_super_block *disk_super;
1810 u64 features;
1811 u64 objectid = 0;
1812 u64 dir_id;
1813
1814 if (!capable(CAP_SYS_ADMIN))
1815 return -EPERM;
1816
1817 if (copy_from_user(&objectid, argp, sizeof(objectid)))
1818 return -EFAULT;
1819
1820 if (!objectid)
1821 objectid = root->root_key.objectid;
1822
1823 location.objectid = objectid;
1824 location.type = BTRFS_ROOT_ITEM_KEY;
1825 location.offset = (u64)-1;
1826
1827 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
1828 if (IS_ERR(new_root))
1829 return PTR_ERR(new_root);
1830
1831 if (btrfs_root_refs(&new_root->root_item) == 0)
1832 return -ENOENT;
1833
1834 path = btrfs_alloc_path();
1835 if (!path)
1836 return -ENOMEM;
1837 path->leave_spinning = 1;
1838
1839 trans = btrfs_start_transaction(root, 1);
1840 if (!trans) {
1841 btrfs_free_path(path);
1842 return -ENOMEM;
1843 }
1844
1845 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1846 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1847 dir_id, "default", 7, 1);
1848 if (!di) {
1849 btrfs_free_path(path);
1850 btrfs_end_transaction(trans, root);
1851 printk(KERN_ERR "Umm, you don't have the default dir item, "
1852 "this isn't going to work\n");
1853 return -ENOENT;
1854 }
1855
1856 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
1857 btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
1858 btrfs_mark_buffer_dirty(path->nodes[0]);
1859 btrfs_free_path(path);
1860
1861 disk_super = &root->fs_info->super_copy;
1862 features = btrfs_super_incompat_flags(disk_super);
1863 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
1864 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
1865 btrfs_set_super_incompat_flags(disk_super, features);
1866 }
1867 btrfs_end_transaction(trans, root);
1868
1869 return 0;
1870}
1871
1872long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1873{
1874 struct btrfs_ioctl_space_args space_args;
1875 struct btrfs_ioctl_space_info space;
1876 struct btrfs_ioctl_space_info *dest;
1877 struct btrfs_ioctl_space_info *dest_orig;
1878 struct btrfs_ioctl_space_info *user_dest;
1879 struct btrfs_space_info *info;
1880 int alloc_size;
1881 int ret = 0;
1882 int slot_count = 0;
1883
1884 if (copy_from_user(&space_args,
1885 (struct btrfs_ioctl_space_args __user *)arg,
1886 sizeof(space_args)))
1887 return -EFAULT;
1888
1889 /* first we count slots */
1890 rcu_read_lock();
1891 list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
1892 slot_count++;
1893 rcu_read_unlock();
1894
1895 /* space_slots == 0 means they are asking for a count */
1896 if (space_args.space_slots == 0) {
1897 space_args.total_spaces = slot_count;
1898 goto out;
1899 }
1900 alloc_size = sizeof(*dest) * slot_count;
1901 /* we generally have at most 6 or so space infos, one for each raid
1902 * level. So, a whole page should be more than enough for everyone
1903 */
1904 if (alloc_size > PAGE_CACHE_SIZE)
1905 return -ENOMEM;
1906
1907 space_args.total_spaces = 0;
1908 dest = kmalloc(alloc_size, GFP_NOFS);
1909 if (!dest)
1910 return -ENOMEM;
1911 dest_orig = dest;
1912
1913 /* now we have a buffer to copy into */
1914 rcu_read_lock();
1915 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
1916 /* make sure we don't copy more than we allocated
1917 * in our buffer
1918 */
1919 if (slot_count == 0)
1920 break;
1921 slot_count--;
1922
1923 /* make sure userland has enough room in their buffer */
1924 if (space_args.total_spaces >= space_args.space_slots)
1925 break;
1926
1927 space.flags = info->flags;
1928 space.total_bytes = info->total_bytes;
1929 space.used_bytes = info->bytes_used;
1930 memcpy(dest, &space, sizeof(space));
1931 dest++;
1932 space_args.total_spaces++;
1933 }
1934 rcu_read_unlock();
1935
1936 user_dest = (struct btrfs_ioctl_space_info *)
1937 (arg + sizeof(struct btrfs_ioctl_space_args));
1938
1939 if (copy_to_user(user_dest, dest_orig, alloc_size))
1940 ret = -EFAULT;
1941
1942 kfree(dest_orig);
1943out:
1944 if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
1945 ret = -EFAULT;
1946
1947 return ret;
1948}
1949
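The function above defines a two-call protocol: ask for a slot count first, then pass a buffer sized to match. A hedged userspace sketch, with the helper name and error handling purely illustrative:

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* struct btrfs_ioctl_space_args and friends */

	static int print_space_info(int fd)
	{
		struct btrfs_ioctl_space_args count = { .space_slots = 0 };
		struct btrfs_ioctl_space_args *args;
		unsigned long long i;

		/* pass 1: space_slots == 0 only asks how many slots exist */
		if (ioctl(fd, BTRFS_IOC_SPACE_INFO, &count) < 0)
			return -1;

		/* pass 2: allocate room for that many slots and fetch them */
		args = calloc(1, sizeof(*args) +
			      count.total_spaces * sizeof(args->spaces[0]));
		if (!args)
			return -1;
		args->space_slots = count.total_spaces;
		if (ioctl(fd, BTRFS_IOC_SPACE_INFO, args) < 0) {
			free(args);
			return -1;
		}
		for (i = 0; i < args->total_spaces; i++)
			printf("flags 0x%llx total %llu used %llu\n",
			       (unsigned long long)args->spaces[i].flags,
			       (unsigned long long)args->spaces[i].total_bytes,
			       (unsigned long long)args->spaces[i].used_bytes);
		free(args);
		return 0;
	}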
1277/* 1950/*
1278 * there are many ways the trans_start and trans_end ioctls can lead 1951 * there are many ways the trans_start and trans_end ioctls can lead
1279 * to deadlocks. They should only be used by applications that 1952 * to deadlocks. They should only be used by applications that
@@ -1320,8 +1993,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1320 return btrfs_ioctl_snap_create(file, argp, 1); 1993 return btrfs_ioctl_snap_create(file, argp, 1);
1321 case BTRFS_IOC_SNAP_DESTROY: 1994 case BTRFS_IOC_SNAP_DESTROY:
1322 return btrfs_ioctl_snap_destroy(file, argp); 1995 return btrfs_ioctl_snap_destroy(file, argp);
1996 case BTRFS_IOC_DEFAULT_SUBVOL:
1997 return btrfs_ioctl_default_subvol(file, argp);
1323 case BTRFS_IOC_DEFRAG: 1998 case BTRFS_IOC_DEFRAG:
1324 return btrfs_ioctl_defrag(file); 1999 return btrfs_ioctl_defrag(file, NULL);
2000 case BTRFS_IOC_DEFRAG_RANGE:
2001 return btrfs_ioctl_defrag(file, argp);
1325 case BTRFS_IOC_RESIZE: 2002 case BTRFS_IOC_RESIZE:
1326 return btrfs_ioctl_resize(root, argp); 2003 return btrfs_ioctl_resize(root, argp);
1327 case BTRFS_IOC_ADD_DEV: 2004 case BTRFS_IOC_ADD_DEV:
@@ -1338,6 +2015,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1338 return btrfs_ioctl_trans_start(file); 2015 return btrfs_ioctl_trans_start(file);
1339 case BTRFS_IOC_TRANS_END: 2016 case BTRFS_IOC_TRANS_END:
1340 return btrfs_ioctl_trans_end(file); 2017 return btrfs_ioctl_trans_end(file);
2018 case BTRFS_IOC_TREE_SEARCH:
2019 return btrfs_ioctl_tree_search(file, argp);
2020 case BTRFS_IOC_INO_LOOKUP:
2021 return btrfs_ioctl_ino_lookup(file, argp);
2022 case BTRFS_IOC_SPACE_INFO:
2023 return btrfs_ioctl_space_info(root, argp);
1341 case BTRFS_IOC_SYNC: 2024 case BTRFS_IOC_SYNC:
1342 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2025 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1343 return 0; 2026 return 0;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914475eb..424694aa517f 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,12 +30,114 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_INO_LOOKUP_PATH_MAX 4080
34struct btrfs_ioctl_ino_lookup_args {
35 __u64 treeid;
36 __u64 objectid;
37 char name[BTRFS_INO_LOOKUP_PATH_MAX];
38};
39
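A hedged fragment of how userspace might fill these lookup args; fd and ino are assumed to come from the caller:

	struct btrfs_ioctl_ino_lookup_args args;

	memset(&args, 0, sizeof(args));
	args.treeid = 0;	/* 0 is treated as "the fd's own subvolume" */
	args.objectid = ino;	/* inode number to resolve back to a path */
	if (ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args) == 0)
		printf("%s\n", args.name);	/* path below the subvolume root */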
40struct btrfs_ioctl_search_key {
41 /* which root are we searching. 0 is the tree of tree roots */
42 __u64 tree_id;
43
44 /* keys returned will be >= min and <= max */
45 __u64 min_objectid;
46 __u64 max_objectid;
47
48 /* keys returned will be >= min and <= max */
49 __u64 min_offset;
50 __u64 max_offset;
51
52 /* max and min transids to search for */
53 __u64 min_transid;
54 __u64 max_transid;
55
56 /* keys returned will be >= min and <= max */
57 __u32 min_type;
58 __u32 max_type;
59
60 /*
61 * how many items did userland ask for, and how many are we
62 * returning
63 */
64 __u32 nr_items;
65
66 /* align to 64 bits */
67 __u32 unused;
68
69 /* some extra for later */
70 __u64 unused1;
71 __u64 unused2;
72 __u64 unused3;
73 __u64 unused4;
74};
75
76struct btrfs_ioctl_search_header {
77 __u64 transid;
78 __u64 objectid;
79 __u64 offset;
80 __u32 type;
81 __u32 len;
82};
83
84#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
85/*
86 * the buf is an array of search headers where
87 * each header is followed by the actual item
88 * the type field is expanded to 32 bits for alignment
89 */
90struct btrfs_ioctl_search_args {
91 struct btrfs_ioctl_search_key key;
92 char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
93};
94
33struct btrfs_ioctl_clone_range_args { 95struct btrfs_ioctl_clone_range_args {
34 __s64 src_fd; 96 __s64 src_fd;
35 __u64 src_offset, src_length; 97 __u64 src_offset, src_length;
36 __u64 dest_offset; 98 __u64 dest_offset;
37}; 99};
38 100
101/* flags for the defrag range ioctl */
102#define BTRFS_DEFRAG_RANGE_COMPRESS 1
103#define BTRFS_DEFRAG_RANGE_START_IO 2
104
105struct btrfs_ioctl_defrag_range_args {
106 /* start of the defrag operation */
107 __u64 start;
108
109 /* number of bytes to defrag, use (u64)-1 to say all */
110 __u64 len;
111
112 /*
113 * flags for the operation, which can include turning
114 * on compression for this one defrag
115 */
116 __u64 flags;
117
118 /*
119 * any extent bigger than this will be considered
120 * already defragged. Use 0 to take the kernel default
121 * Use 1 to say every single extent must be rewritten
122 */
123 __u32 extent_thresh;
124
125 /* spare for later */
126 __u32 unused[5];
127};
128
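To exercise the range interface above from userspace, a fragment like the following would be the natural shape (fd is assumed, and the flag choice is just an example):

	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;				/* the whole file */
	range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;	/* recompress while rewriting */
	range.extent_thresh = 0;			/* accept the kernel default */
	ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);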
129struct btrfs_ioctl_space_info {
130 __u64 flags;
131 __u64 total_bytes;
132 __u64 used_bytes;
133};
134
135struct btrfs_ioctl_space_args {
136 __u64 space_slots;
137 __u64 total_spaces;
138 struct btrfs_ioctl_space_info spaces[0];
139};
140
39#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 141#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
40 struct btrfs_ioctl_vol_args) 142 struct btrfs_ioctl_vol_args)
41#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 143#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -67,4 +169,13 @@ struct btrfs_ioctl_clone_range_args {
67 struct btrfs_ioctl_vol_args) 169 struct btrfs_ioctl_vol_args)
68#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ 170#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
69 struct btrfs_ioctl_vol_args) 171 struct btrfs_ioctl_vol_args)
172#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
173 struct btrfs_ioctl_defrag_range_args)
174#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
175 struct btrfs_ioctl_search_args)
176#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
177 struct btrfs_ioctl_ino_lookup_args)
178#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
179#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
180 struct btrfs_ioctl_space_args)
70#endif 181#endif
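To show how the search half of this header is meant to be consumed, a minimal sketch: widen every min/max pair to cover the keyspace, issue the ioctl, then walk the packed header-plus-item records in buf. The tree_id value follows the struct comment above (0 for the tree of tree roots); the fd and function name are assumptions of the example.

	#include <string.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* local copy of the structures above */

	static int dump_tree_keys(int fd)
	{
		struct btrfs_ioctl_search_args args;
		struct btrfs_ioctl_search_header *sh;
		unsigned long off = 0;
		__u32 i;

		memset(&args, 0, sizeof(args));	/* all min_* fields start at 0 */
		args.key.tree_id = 0;		/* per the comment: tree of tree roots */
		args.key.max_objectid = (__u64)-1;
		args.key.max_offset = (__u64)-1;
		args.key.max_transid = (__u64)-1;
		args.key.max_type = (__u32)-1;
		args.key.nr_items = 4096;	/* upper bound we are asking for */

		if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0)
			return -1;

		/* on return nr_items holds the number of records delivered */
		for (i = 0; i < args.key.nr_items; i++) {
			sh = (struct btrfs_ioctl_search_header *)(args.buf + off);
			/* sh->objectid, sh->type, sh->offset name the key;
			 * the item body follows the header, sh->len bytes long */
			off += sizeof(*sh) + sh->len;
		}
		return 0;
	}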
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/spinlock.h> 20#include <linux/spinlock.h>
22#include <linux/page-flags.h> 21#include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c2a9e78a949..a127c0ebb2dc 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/gfp.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
@@ -174,7 +173,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 if (!entry) 173 if (!entry)
175 return -ENOMEM; 174 return -ENOMEM;
176 175
177 mutex_lock(&tree->mutex);
178 entry->file_offset = file_offset; 176 entry->file_offset = file_offset;
179 entry->start = start; 177 entry->start = start;
180 entry->len = len; 178 entry->len = len;
@@ -190,16 +188,17 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
190 INIT_LIST_HEAD(&entry->list); 188 INIT_LIST_HEAD(&entry->list);
191 INIT_LIST_HEAD(&entry->root_extent_list); 189 INIT_LIST_HEAD(&entry->root_extent_list);
192 190
191 spin_lock(&tree->lock);
193 node = tree_insert(&tree->tree, file_offset, 192 node = tree_insert(&tree->tree, file_offset,
194 &entry->rb_node); 193 &entry->rb_node);
195 BUG_ON(node); 194 BUG_ON(node);
195 spin_unlock(&tree->lock);
196 196
197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
198 list_add_tail(&entry->root_extent_list, 198 list_add_tail(&entry->root_extent_list,
199 &BTRFS_I(inode)->root->fs_info->ordered_extents); 199 &BTRFS_I(inode)->root->fs_info->ordered_extents);
200 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 200 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
201 201
202 mutex_unlock(&tree->mutex);
203 BUG_ON(node); 202 BUG_ON(node);
204 return 0; 203 return 0;
205} 204}
@@ -216,9 +215,9 @@ int btrfs_add_ordered_sum(struct inode *inode,
216 struct btrfs_ordered_inode_tree *tree; 215 struct btrfs_ordered_inode_tree *tree;
217 216
218 tree = &BTRFS_I(inode)->ordered_tree; 217 tree = &BTRFS_I(inode)->ordered_tree;
219 mutex_lock(&tree->mutex); 218 spin_lock(&tree->lock);
220 list_add_tail(&sum->list, &entry->list); 219 list_add_tail(&sum->list, &entry->list);
221 mutex_unlock(&tree->mutex); 220 spin_unlock(&tree->lock);
222 return 0; 221 return 0;
223} 222}
224 223
@@ -232,15 +231,16 @@ int btrfs_add_ordered_sum(struct inode *inode,
232 * to make sure this function only returns 1 once for a given ordered extent. 231 * to make sure this function only returns 1 once for a given ordered extent.
233 */ 232 */
234int btrfs_dec_test_ordered_pending(struct inode *inode, 233int btrfs_dec_test_ordered_pending(struct inode *inode,
234 struct btrfs_ordered_extent **cached,
235 u64 file_offset, u64 io_size) 235 u64 file_offset, u64 io_size)
236{ 236{
237 struct btrfs_ordered_inode_tree *tree; 237 struct btrfs_ordered_inode_tree *tree;
238 struct rb_node *node; 238 struct rb_node *node;
239 struct btrfs_ordered_extent *entry; 239 struct btrfs_ordered_extent *entry = NULL;
240 int ret; 240 int ret;
241 241
242 tree = &BTRFS_I(inode)->ordered_tree; 242 tree = &BTRFS_I(inode)->ordered_tree;
243 mutex_lock(&tree->mutex); 243 spin_lock(&tree->lock);
244 node = tree_search(tree, file_offset); 244 node = tree_search(tree, file_offset);
245 if (!node) { 245 if (!node) {
246 ret = 1; 246 ret = 1;
@@ -264,7 +264,11 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
264 else 264 else
265 ret = 1; 265 ret = 1;
266out: 266out:
267 mutex_unlock(&tree->mutex); 267 if (!ret && cached && entry) {
268 *cached = entry;
269 atomic_inc(&entry->refs);
270 }
271 spin_unlock(&tree->lock);
268 return ret == 0; 272 return ret == 0;
269} 273}
270 274
@@ -291,13 +295,14 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
291 295
292/* 296/*
293 * remove an ordered extent from the tree. No references are dropped 297 * remove an ordered extent from the tree. No references are dropped
294 * and you must wake_up entry->wait. You must hold the tree mutex 298 * and you must wake_up entry->wait. You must hold the tree lock
295 * while you call this function. 299 * while you call this function.
296 */ 300 */
297static int __btrfs_remove_ordered_extent(struct inode *inode, 301static int __btrfs_remove_ordered_extent(struct inode *inode,
298 struct btrfs_ordered_extent *entry) 302 struct btrfs_ordered_extent *entry)
299{ 303{
300 struct btrfs_ordered_inode_tree *tree; 304 struct btrfs_ordered_inode_tree *tree;
305 struct btrfs_root *root = BTRFS_I(inode)->root;
301 struct rb_node *node; 306 struct rb_node *node;
302 307
303 tree = &BTRFS_I(inode)->ordered_tree; 308 tree = &BTRFS_I(inode)->ordered_tree;
@@ -307,12 +312,13 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
307 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
308 313
309 spin_lock(&BTRFS_I(inode)->accounting_lock); 314 spin_lock(&BTRFS_I(inode)->accounting_lock);
315 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
310 BTRFS_I(inode)->outstanding_extents--; 316 BTRFS_I(inode)->outstanding_extents--;
311 spin_unlock(&BTRFS_I(inode)->accounting_lock); 317 spin_unlock(&BTRFS_I(inode)->accounting_lock);
312 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, 318 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
313 inode, 1); 319 inode, 1);
314 320
315 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 321 spin_lock(&root->fs_info->ordered_extent_lock);
316 list_del_init(&entry->root_extent_list); 322 list_del_init(&entry->root_extent_list);
317 323
318 /* 324 /*
@@ -324,7 +330,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
324 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 330 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
325 list_del_init(&BTRFS_I(inode)->ordered_operations); 331 list_del_init(&BTRFS_I(inode)->ordered_operations);
326 } 332 }
327 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 333 spin_unlock(&root->fs_info->ordered_extent_lock);
328 334
329 return 0; 335 return 0;
330} 336}
@@ -340,9 +346,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
340 int ret; 346 int ret;
341 347
342 tree = &BTRFS_I(inode)->ordered_tree; 348 tree = &BTRFS_I(inode)->ordered_tree;
343 mutex_lock(&tree->mutex); 349 spin_lock(&tree->lock);
344 ret = __btrfs_remove_ordered_extent(inode, entry); 350 ret = __btrfs_remove_ordered_extent(inode, entry);
345 mutex_unlock(&tree->mutex); 351 spin_unlock(&tree->lock);
346 wake_up(&entry->wait); 352 wake_up(&entry->wait);
347 353
348 return ret; 354 return ret;
@@ -567,7 +573,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
567 struct btrfs_ordered_extent *entry = NULL; 573 struct btrfs_ordered_extent *entry = NULL;
568 574
569 tree = &BTRFS_I(inode)->ordered_tree; 575 tree = &BTRFS_I(inode)->ordered_tree;
570 mutex_lock(&tree->mutex); 576 spin_lock(&tree->lock);
571 node = tree_search(tree, file_offset); 577 node = tree_search(tree, file_offset);
572 if (!node) 578 if (!node)
573 goto out; 579 goto out;
@@ -578,7 +584,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
578 if (entry) 584 if (entry)
579 atomic_inc(&entry->refs); 585 atomic_inc(&entry->refs);
580out: 586out:
581 mutex_unlock(&tree->mutex); 587 spin_unlock(&tree->lock);
582 return entry; 588 return entry;
583} 589}
584 590
@@ -594,7 +600,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
594 struct btrfs_ordered_extent *entry = NULL; 600 struct btrfs_ordered_extent *entry = NULL;
595 601
596 tree = &BTRFS_I(inode)->ordered_tree; 602 tree = &BTRFS_I(inode)->ordered_tree;
597 mutex_lock(&tree->mutex); 603 spin_lock(&tree->lock);
598 node = tree_search(tree, file_offset); 604 node = tree_search(tree, file_offset);
599 if (!node) 605 if (!node)
600 goto out; 606 goto out;
@@ -602,7 +608,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
602 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 608 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
603 atomic_inc(&entry->refs); 609 atomic_inc(&entry->refs);
604out: 610out:
605 mutex_unlock(&tree->mutex); 611 spin_unlock(&tree->lock);
606 return entry; 612 return entry;
607} 613}
608 614
@@ -629,7 +635,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
629 else 635 else
630 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); 636 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
631 637
632 mutex_lock(&tree->mutex); 638 spin_lock(&tree->lock);
633 disk_i_size = BTRFS_I(inode)->disk_i_size; 639 disk_i_size = BTRFS_I(inode)->disk_i_size;
634 640
635 /* truncate file */ 641 /* truncate file */
@@ -735,7 +741,7 @@ out:
735 */ 741 */
736 if (ordered) 742 if (ordered)
737 __btrfs_remove_ordered_extent(inode, ordered); 743 __btrfs_remove_ordered_extent(inode, ordered);
738 mutex_unlock(&tree->mutex); 744 spin_unlock(&tree->lock);
739 if (ordered) 745 if (ordered)
740 wake_up(&ordered->wait); 746 wake_up(&ordered->wait);
741 return ret; 747 return ret;
@@ -762,7 +768,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
762 if (!ordered) 768 if (!ordered)
763 return 1; 769 return 1;
764 770
765 mutex_lock(&tree->mutex); 771 spin_lock(&tree->lock);
766 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 772 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
767 if (disk_bytenr >= ordered_sum->bytenr) { 773 if (disk_bytenr >= ordered_sum->bytenr) {
768 num_sectors = ordered_sum->len / sectorsize; 774 num_sectors = ordered_sum->len / sectorsize;
@@ -777,7 +783,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
777 } 783 }
778 } 784 }
779out: 785out:
780 mutex_unlock(&tree->mutex); 786 spin_unlock(&tree->lock);
781 btrfs_put_ordered_extent(ordered); 787 btrfs_put_ordered_extent(ordered);
782 return ret; 788 return ret;
783} 789}
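The new cached argument saves callers a second tree walk. A hedged kernel-side fragment; inode, start and len are assumed from the surrounding write-completion path:

	struct btrfs_ordered_extent *ordered = NULL;

	/* returns nonzero only once every byte of the extent is written;
	 * on that path *cached comes back holding an extra reference,
	 * taken under tree->lock, so it stays valid after the unlock */
	if (btrfs_dec_test_ordered_pending(inode, &ordered, start, len)) {
		/* ... finish checksums, i_size updates, and so on ... */
		btrfs_put_ordered_extent(ordered);	/* drop the cached ref */
	}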
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 1fe1282ef47c..c82f76a9f040 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -21,7 +21,7 @@
21 21
22/* one of these per inode */ 22/* one of these per inode */
23struct btrfs_ordered_inode_tree { 23struct btrfs_ordered_inode_tree {
24 struct mutex mutex; 24 spinlock_t lock;
25 struct rb_root tree; 25 struct rb_root tree;
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
@@ -128,8 +128,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
128static inline void 128static inline void
129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) 129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
130{ 130{
131 mutex_init(&t->mutex); 131 spin_lock_init(&t->lock);
132 t->tree.rb_node = NULL; 132 t->tree = RB_ROOT;
133 t->last = NULL; 133 t->last = NULL;
134} 134}
135 135
@@ -137,7 +137,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
137int btrfs_remove_ordered_extent(struct inode *inode, 137int btrfs_remove_ordered_extent(struct inode *inode,
138 struct btrfs_ordered_extent *entry); 138 struct btrfs_ordered_extent *entry);
139int btrfs_dec_test_ordered_pending(struct inode *inode, 139int btrfs_dec_test_ordered_pending(struct inode *inode,
140 u64 file_offset, u64 io_size); 140 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size);
141int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
142 u64 start, u64 len, u64 disk_len, int type); 143 u64 start, u64 len, u64 disk_len, int type);
143int btrfs_add_ordered_sum(struct inode *inode, 144int btrfs_add_ordered_sum(struct inode *inode,
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "ref-cache.h" 23#include "ref-cache.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index bc283ad2db73..e2a55cb2072b 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -52,7 +52,7 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents)
52 52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) 53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{ 54{
55 tree->root.rb_node = NULL; 55 tree->root = RB_ROOT;
56 INIT_LIST_HEAD(&tree->list); 56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock); 57 spin_lock_init(&tree->lock);
58} 58}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ab7ab5318745..e558dd941ded 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "disk-io.h" 26#include "disk-io.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -170,14 +171,14 @@ struct async_merge {
170 171
171static void mapping_tree_init(struct mapping_tree *tree) 172static void mapping_tree_init(struct mapping_tree *tree)
172{ 173{
173 tree->rb_root.rb_node = NULL; 174 tree->rb_root = RB_ROOT;
174 spin_lock_init(&tree->lock); 175 spin_lock_init(&tree->lock);
175} 176}
176 177
177static void backref_cache_init(struct backref_cache *cache) 178static void backref_cache_init(struct backref_cache *cache)
178{ 179{
179 int i; 180 int i;
180 cache->rb_root.rb_node = NULL; 181 cache->rb_root = RB_ROOT;
181 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 182 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182 INIT_LIST_HEAD(&cache->pending[i]); 183 INIT_LIST_HEAD(&cache->pending[i]);
183 spin_lock_init(&cache->lock); 184 spin_lock_init(&cache->lock);
@@ -2659,7 +2660,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2659 EXTENT_BOUNDARY, GFP_NOFS); 2660 EXTENT_BOUNDARY, GFP_NOFS);
2660 nr++; 2661 nr++;
2661 } 2662 }
2662 btrfs_set_extent_delalloc(inode, page_start, page_end); 2663 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2663 2664
2664 set_page_dirty(page); 2665 set_page_dirty(page);
2665 dirty_page++; 2666 dirty_page++;
@@ -3487,7 +3488,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3487 key.objectid = objectid; 3488 key.objectid = objectid;
3488 key.type = BTRFS_INODE_ITEM_KEY; 3489 key.type = BTRFS_INODE_ITEM_KEY;
3489 key.offset = 0; 3490 key.offset = 0;
3490 inode = btrfs_iget(root->fs_info->sb, &key, root); 3491 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
3491 BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); 3492 BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3492 BTRFS_I(inode)->index_cnt = group->key.objectid; 3493 BTRFS_I(inode)->index_cnt = group->key.objectid;
3493 3494
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8a1ea6e64575..1866dff0538e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -63,22 +64,21 @@ static void btrfs_put_super(struct super_block *sb)
63} 64}
64 65
65enum { 66enum {
66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 67 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, 69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
69 Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, 70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
70 Opt_flushoncommit,
71 Opt_discard, Opt_err, 71 Opt_discard, Opt_err,
72}; 72};
73 73
74static match_table_t tokens = { 74static match_table_t tokens = {
75 {Opt_degraded, "degraded"}, 75 {Opt_degraded, "degraded"},
76 {Opt_subvol, "subvol=%s"}, 76 {Opt_subvol, "subvol=%s"},
77 {Opt_subvolid, "subvolid=%d"},
77 {Opt_device, "device=%s"}, 78 {Opt_device, "device=%s"},
78 {Opt_nodatasum, "nodatasum"}, 79 {Opt_nodatasum, "nodatasum"},
79 {Opt_nodatacow, "nodatacow"}, 80 {Opt_nodatacow, "nodatacow"},
80 {Opt_nobarrier, "nobarrier"}, 81 {Opt_nobarrier, "nobarrier"},
81 {Opt_max_extent, "max_extent=%s"},
82 {Opt_max_inline, "max_inline=%s"}, 82 {Opt_max_inline, "max_inline=%s"},
83 {Opt_alloc_start, "alloc_start=%s"}, 83 {Opt_alloc_start, "alloc_start=%s"},
84 {Opt_thread_pool, "thread_pool=%d"}, 84 {Opt_thread_pool, "thread_pool=%d"},
@@ -95,31 +95,6 @@ static match_table_t tokens = {
95 {Opt_err, NULL}, 95 {Opt_err, NULL},
96}; 96};
97 97
98u64 btrfs_parse_size(char *str)
99{
100 u64 res;
101 int mult = 1;
102 char *end;
103 char last;
104
105 res = simple_strtoul(str, &end, 10);
106
107 last = end[0];
108 if (isalpha(last)) {
109 last = tolower(last);
110 switch (last) {
111 case 'g':
112 mult *= 1024;
113 case 'm':
114 mult *= 1024;
115 case 'k':
116 mult *= 1024;
117 }
118 res = res * mult;
119 }
120 return res;
121}
122
123/* 98/*
124 * Regular mount options parser. Everything that is needed only when 99 * Regular mount options parser. Everything that is needed only when
125 * reading in a new superblock is parsed here. 100 * reading in a new superblock is parsed here.
@@ -128,7 +103,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
128{ 103{
129 struct btrfs_fs_info *info = root->fs_info; 104 struct btrfs_fs_info *info = root->fs_info;
130 substring_t args[MAX_OPT_ARGS]; 105 substring_t args[MAX_OPT_ARGS];
131 char *p, *num; 106 char *p, *num, *orig;
132 int intarg; 107 int intarg;
133 int ret = 0; 108 int ret = 0;
134 109
@@ -143,6 +118,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
143 if (!options) 118 if (!options)
144 return -ENOMEM; 119 return -ENOMEM;
145 120
121 orig = options;
146 122
147 while ((p = strsep(&options, ",")) != NULL) { 123 while ((p = strsep(&options, ",")) != NULL) {
148 int token; 124 int token;
@@ -156,6 +132,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
156 btrfs_set_opt(info->mount_opt, DEGRADED); 132 btrfs_set_opt(info->mount_opt, DEGRADED);
157 break; 133 break;
158 case Opt_subvol: 134 case Opt_subvol:
135 case Opt_subvolid:
159 case Opt_device: 136 case Opt_device:
160 /* 137 /*
161 * These are parsed by btrfs_parse_early_options 138 * These are parsed by btrfs_parse_early_options
@@ -210,22 +187,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
210 info->thread_pool_size); 187 info->thread_pool_size);
211 } 188 }
212 break; 189 break;
213 case Opt_max_extent:
214 num = match_strdup(&args[0]);
215 if (num) {
216 info->max_extent = btrfs_parse_size(num);
217 kfree(num);
218
219 info->max_extent = max_t(u64,
220 info->max_extent, root->sectorsize);
221 printk(KERN_INFO "btrfs: max_extent at %llu\n",
222 (unsigned long long)info->max_extent);
223 }
224 break;
225 case Opt_max_inline: 190 case Opt_max_inline:
226 num = match_strdup(&args[0]); 191 num = match_strdup(&args[0]);
227 if (num) { 192 if (num) {
228 info->max_inline = btrfs_parse_size(num); 193 info->max_inline = memparse(num, NULL);
229 kfree(num); 194 kfree(num);
230 195
231 if (info->max_inline) { 196 if (info->max_inline) {
@@ -240,7 +205,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
240 case Opt_alloc_start: 205 case Opt_alloc_start:
241 num = match_strdup(&args[0]); 206 num = match_strdup(&args[0]);
242 if (num) { 207 if (num) {
243 info->alloc_start = btrfs_parse_size(num); 208 info->alloc_start = memparse(num, NULL);
244 kfree(num); 209 kfree(num);
245 printk(KERN_INFO 210 printk(KERN_INFO
246 "btrfs: allocations start at %llu\n", 211 "btrfs: allocations start at %llu\n",
@@ -280,7 +245,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
280 } 245 }
281 } 246 }
282out: 247out:
283 kfree(options); 248 kfree(orig);
284 return ret; 249 return ret;
285} 250}
286 251
@@ -291,12 +256,13 @@ out:
291 * only when we need to allocate a new super block. 256 * only when we need to allocate a new super block.
292 */ 257 */
293static int btrfs_parse_early_options(const char *options, fmode_t flags, 258static int btrfs_parse_early_options(const char *options, fmode_t flags,
294 void *holder, char **subvol_name, 259 void *holder, char **subvol_name, u64 *subvol_objectid,
295 struct btrfs_fs_devices **fs_devices) 260 struct btrfs_fs_devices **fs_devices)
296{ 261{
297 substring_t args[MAX_OPT_ARGS]; 262 substring_t args[MAX_OPT_ARGS];
298 char *opts, *p; 263 char *opts, *p;
299 int error = 0; 264 int error = 0;
265 int intarg;
300 266
301 if (!options) 267 if (!options)
302 goto out; 268 goto out;
@@ -319,6 +285,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
319 case Opt_subvol: 285 case Opt_subvol:
320 *subvol_name = match_strdup(&args[0]); 286 *subvol_name = match_strdup(&args[0]);
321 break; 287 break;
288 case Opt_subvolid:
289 intarg = 0;
290 error = match_int(&args[0], &intarg);
291 if (!error) {
292 /* we want the original fs_tree */
293 if (!intarg)
294 *subvol_objectid =
295 BTRFS_FS_TREE_OBJECTID;
296 else
297 *subvol_objectid = intarg;
298 }
299 break;
322 case Opt_device: 300 case Opt_device:
323 error = btrfs_scan_one_device(match_strdup(&args[0]), 301 error = btrfs_scan_one_device(match_strdup(&args[0]),
324 flags, holder, fs_devices); 302 flags, holder, fs_devices);
@@ -346,6 +324,110 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
346 return error; 324 return error;
347} 325}
348 326
327static struct dentry *get_default_root(struct super_block *sb,
328 u64 subvol_objectid)
329{
330 struct btrfs_root *root = sb->s_fs_info;
331 struct btrfs_root *new_root;
332 struct btrfs_dir_item *di;
333 struct btrfs_path *path;
334 struct btrfs_key location;
335 struct inode *inode;
336 struct dentry *dentry;
337 u64 dir_id;
338 int new = 0;
339
340 /*
341 * We have a specific subvol we want to mount, just setup location and
342 * go look up the root.
343 */
344 if (subvol_objectid) {
345 location.objectid = subvol_objectid;
346 location.type = BTRFS_ROOT_ITEM_KEY;
347 location.offset = (u64)-1;
348 goto find_root;
349 }
350
351 path = btrfs_alloc_path();
352 if (!path)
353 return ERR_PTR(-ENOMEM);
354 path->leave_spinning = 1;
355
356 /*
357 * Find the "default" dir item which points to the root item that we
358 * will mount by default if we haven't been given a specific subvolume
359 * to mount.
360 */
361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (!di) {
364 /*
365 * Ok the default dir item isn't there. This is weird since
366 * it's always been there, but don't freak out, just try and
367 * mount to root most subvolume.
368 */
369 btrfs_free_path(path);
370 dir_id = BTRFS_FIRST_FREE_OBJECTID;
371 new_root = root->fs_info->fs_root;
372 goto setup_root;
373 }
374
375 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
376 btrfs_free_path(path);
377
378find_root:
379 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
380 if (IS_ERR(new_root))
381 return ERR_PTR(PTR_ERR(new_root));
382
383 if (btrfs_root_refs(&new_root->root_item) == 0)
384 return ERR_PTR(-ENOENT);
385
386 dir_id = btrfs_root_dirid(&new_root->root_item);
387setup_root:
388 location.objectid = dir_id;
389 location.type = BTRFS_INODE_ITEM_KEY;
390 location.offset = 0;
391
392 inode = btrfs_iget(sb, &location, new_root, &new);
393 if (!inode)
394 return ERR_PTR(-ENOMEM);
395
396 /*
397 * If we're just mounting the root most subvol put the inode and return
398 * a reference to the dentry. We will have already gotten a reference
399 * to the inode in btrfs_fill_super so we're good to go.
400 */
401 if (!new && sb->s_root->d_inode == inode) {
402 iput(inode);
403 return dget(sb->s_root);
404 }
405
406 if (new) {
407 const struct qstr name = { .name = "/", .len = 1 };
408
409 /*
410 * New inode, we need to make the dentry a sibling of s_root so
411 * everything gets cleaned up properly on unmount.
412 */
413 dentry = d_alloc(sb->s_root, &name);
414 if (!dentry) {
415 iput(inode);
416 return ERR_PTR(-ENOMEM);
417 }
418 d_splice_alias(inode, dentry);
419 } else {
420 /*
421 * We found the inode in cache, just find a dentry for it and
422 * put the reference to the inode we just got.
423 */
424 dentry = d_find_alias(inode);
425 iput(inode);
426 }
427
428 return dentry;
429}
430
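Taken together with the option parsing above, a mount such as "mount -t btrfs -o subvolid=256 /dev/sdb /mnt" (device and id purely illustrative) reaches get_default_root() with subvol_objectid set and jumps straight to find_root, while a plain mount follows the "default" dir item, which is exactly the item BTRFS_IOC_DEFAULT_SUBVOL repoints.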
349static int btrfs_fill_super(struct super_block *sb, 431static int btrfs_fill_super(struct super_block *sb,
350 struct btrfs_fs_devices *fs_devices, 432 struct btrfs_fs_devices *fs_devices,
351 void *data, int silent) 433 void *data, int silent)
@@ -379,7 +461,7 @@ static int btrfs_fill_super(struct super_block *sb,
379 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 461 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
380 key.type = BTRFS_INODE_ITEM_KEY; 462 key.type = BTRFS_INODE_ITEM_KEY;
381 key.offset = 0; 463 key.offset = 0;
382 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); 464 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
383 if (IS_ERR(inode)) { 465 if (IS_ERR(inode)) {
384 err = PTR_ERR(inode); 466 err = PTR_ERR(inode);
385 goto fail_close; 467 goto fail_close;
@@ -391,12 +473,6 @@ static int btrfs_fill_super(struct super_block *sb,
391 err = -ENOMEM; 473 err = -ENOMEM;
392 goto fail_close; 474 goto fail_close;
393 } 475 }
394#if 0
395 /* this does the super kobj at the same time */
396 err = btrfs_sysfs_add_super(tree_root->fs_info);
397 if (err)
398 goto fail_close;
399#endif
400 476
401 sb->s_root = root_dentry; 477 sb->s_root = root_dentry;
402 478
@@ -440,9 +516,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
440 seq_puts(seq, ",nodatacow"); 516 seq_puts(seq, ",nodatacow");
441 if (btrfs_test_opt(root, NOBARRIER)) 517 if (btrfs_test_opt(root, NOBARRIER))
442 seq_puts(seq, ",nobarrier"); 518 seq_puts(seq, ",nobarrier");
443 if (info->max_extent != (u64)-1)
444 seq_printf(seq, ",max_extent=%llu",
445 (unsigned long long)info->max_extent);
446 if (info->max_inline != 8192 * 1024) 519 if (info->max_inline != 8192 * 1024)
447 seq_printf(seq, ",max_inline=%llu", 520 seq_printf(seq, ",max_inline=%llu",
448 (unsigned long long)info->max_inline); 521 (unsigned long long)info->max_inline);
@@ -488,19 +561,22 @@ static int btrfs_test_super(struct super_block *s, void *data)
488static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 561static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
489 const char *dev_name, void *data, struct vfsmount *mnt) 562 const char *dev_name, void *data, struct vfsmount *mnt)
490{ 563{
491 char *subvol_name = NULL;
492 struct block_device *bdev = NULL; 564 struct block_device *bdev = NULL;
493 struct super_block *s; 565 struct super_block *s;
494 struct dentry *root; 566 struct dentry *root;
495 struct btrfs_fs_devices *fs_devices = NULL; 567 struct btrfs_fs_devices *fs_devices = NULL;
496 fmode_t mode = FMODE_READ; 568 fmode_t mode = FMODE_READ;
569 char *subvol_name = NULL;
570 u64 subvol_objectid = 0;
497 int error = 0; 571 int error = 0;
572 int found = 0;
498 573
499 if (!(flags & MS_RDONLY)) 574 if (!(flags & MS_RDONLY))
500 mode |= FMODE_WRITE; 575 mode |= FMODE_WRITE;
501 576
502 error = btrfs_parse_early_options(data, mode, fs_type, 577 error = btrfs_parse_early_options(data, mode, fs_type,
503 &subvol_name, &fs_devices); 578 &subvol_name, &subvol_objectid,
579 &fs_devices);
504 if (error) 580 if (error)
505 return error; 581 return error;
506 582
@@ -529,6 +605,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
529 goto error_close_devices; 605 goto error_close_devices;
530 } 606 }
531 607
608 found = 1;
532 btrfs_close_devices(fs_devices); 609 btrfs_close_devices(fs_devices);
533 } else { 610 } else {
534 char b[BDEVNAME_SIZE]; 611 char b[BDEVNAME_SIZE];
@@ -546,25 +623,35 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
546 s->s_flags |= MS_ACTIVE; 623 s->s_flags |= MS_ACTIVE;
547 } 624 }
548 625
549 if (!strcmp(subvol_name, ".")) 626 root = get_default_root(s, subvol_objectid);
550 root = dget(s->s_root); 627 if (IS_ERR(root)) {
551 else { 628 error = PTR_ERR(root);
552 mutex_lock(&s->s_root->d_inode->i_mutex); 629 deactivate_locked_super(s);
553 root = lookup_one_len(subvol_name, s->s_root, 630 goto error;
631 }
632 /* if they gave us a subvolume name bind mount into that */
633 if (strcmp(subvol_name, ".")) {
634 struct dentry *new_root;
635 mutex_lock(&root->d_inode->i_mutex);
636 new_root = lookup_one_len(subvol_name, root,
554 strlen(subvol_name)); 637 strlen(subvol_name));
555 mutex_unlock(&s->s_root->d_inode->i_mutex); 638 mutex_unlock(&root->d_inode->i_mutex);
556 639
557 if (IS_ERR(root)) { 640 if (IS_ERR(new_root)) {
558 deactivate_locked_super(s); 641 deactivate_locked_super(s);
559 error = PTR_ERR(root); 642 error = PTR_ERR(new_root);
560 goto error_free_subvol_name; 643 dput(root);
644 goto error_close_devices;
561 } 645 }
562 if (!root->d_inode) { 646 if (!new_root->d_inode) {
563 dput(root); 647 dput(root);
648 dput(new_root);
564 deactivate_locked_super(s); 649 deactivate_locked_super(s);
565 error = -ENXIO; 650 error = -ENXIO;
566 goto error_free_subvol_name; 651 goto error_close_devices;
567 } 652 }
653 dput(root);
654 root = new_root;
568 } 655 }
569 656
570 mnt->mnt_sb = s; 657 mnt->mnt_sb = s;
@@ -579,6 +666,7 @@ error_close_devices:
579 btrfs_close_devices(fs_devices); 666 btrfs_close_devices(fs_devices);
580error_free_subvol_name: 667error_free_subvol_name:
581 kfree(subvol_name); 668 kfree(subvol_name);
669error:
582 return error; 670 return error;
583} 671}
584 672
@@ -623,14 +711,37 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
623{ 711{
624 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 712 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
625 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 713 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
714 struct list_head *head = &root->fs_info->space_info;
715 struct btrfs_space_info *found;
716 u64 total_used = 0;
717 u64 data_used = 0;
626 int bits = dentry->d_sb->s_blocksize_bits; 718 int bits = dentry->d_sb->s_blocksize_bits;
627 __be32 *fsid = (__be32 *)root->fs_info->fsid; 719 __be32 *fsid = (__be32 *)root->fs_info->fsid;
628 720
721 rcu_read_lock();
722 list_for_each_entry_rcu(found, head, list) {
723 if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
724 BTRFS_BLOCK_GROUP_RAID10|
725 BTRFS_BLOCK_GROUP_RAID1)) {
726 total_used += found->bytes_used;
727 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
728 data_used += found->bytes_used;
729 else
730 data_used += found->total_bytes;
731 }
732
733 total_used += found->bytes_used;
734 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
735 data_used += found->bytes_used;
736 else
737 data_used += found->total_bytes;
738 }
739 rcu_read_unlock();
740
629 buf->f_namelen = BTRFS_NAME_LEN; 741 buf->f_namelen = BTRFS_NAME_LEN;
630 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 742 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
631 buf->f_bfree = buf->f_blocks - 743 buf->f_bfree = buf->f_blocks - (total_used >> bits);
632 (btrfs_super_bytes_used(disk_super) >> bits); 744 buf->f_bavail = buf->f_blocks - (data_used >> bits);
633 buf->f_bavail = buf->f_bfree;
634 buf->f_bsize = dentry->d_sb->s_blocksize; 745 buf->f_bsize = dentry->d_sb->s_blocksize;
635 buf->f_type = BTRFS_SUPER_MAGIC; 746 buf->f_type = BTRFS_SUPER_MAGIC;
636 747
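A rough worked example of the new accounting, assuming a two-device RAID1 filesystem with 2 TiB of raw space holding 100 GiB of data: the RAID1 branch adds bytes_used once and the unconditional tail adds it again, so total_used comes to about 200 GiB of raw consumption, and f_bfree reports the remaining raw blocks (roughly 1.8 TiB worth) rather than the old replication-blind btrfs_super_bytes_used() figure.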
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a240b6fa81df..4ce16ef702a3 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -164,12 +164,12 @@ static void btrfs_root_release(struct kobject *kobj)
164 complete(&root->kobj_unregister); 164 complete(&root->kobj_unregister);
165} 165}
166 166
167static struct sysfs_ops btrfs_super_attr_ops = { 167static const struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show, 168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store, 169 .store = btrfs_super_attr_store,
170}; 170};
171 171
172static struct sysfs_ops btrfs_root_attr_ops = { 172static const struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show, 173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store, 174 .store = btrfs_root_attr_store,
175}; 175};
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b2acc79f1b34..2cb116099b90 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
@@ -69,7 +70,7 @@ static noinline int join_transaction(struct btrfs_root *root)
69 cur_trans->commit_done = 0; 70 cur_trans->commit_done = 0;
70 cur_trans->start_time = get_seconds(); 71 cur_trans->start_time = get_seconds();
71 72
72 cur_trans->delayed_refs.root.rb_node = NULL; 73 cur_trans->delayed_refs.root = RB_ROOT;
73 cur_trans->delayed_refs.num_entries = 0; 74 cur_trans->delayed_refs.num_entries = 0;
74 cur_trans->delayed_refs.num_heads_ready = 0; 75 cur_trans->delayed_refs.num_heads_ready = 0;
75 cur_trans->delayed_refs.num_heads = 0; 76 cur_trans->delayed_refs.num_heads = 0;
@@ -147,18 +148,13 @@ static void wait_current_trans(struct btrfs_root *root)
147 while (1) { 148 while (1) {
148 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 149 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
149 TASK_UNINTERRUPTIBLE); 150 TASK_UNINTERRUPTIBLE);
150 if (cur_trans->blocked) { 151 if (!cur_trans->blocked)
151 mutex_unlock(&root->fs_info->trans_mutex);
152 schedule();
153 mutex_lock(&root->fs_info->trans_mutex);
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 } else {
157 finish_wait(&root->fs_info->transaction_wait,
158 &wait);
159 break; 152 break;
160 } 153 mutex_unlock(&root->fs_info->trans_mutex);
154 schedule();
155 mutex_lock(&root->fs_info->trans_mutex);
161 } 156 }
157 finish_wait(&root->fs_info->transaction_wait, &wait);
162 put_transaction(cur_trans); 158 put_transaction(cur_trans);
163 } 159 }
164} 160}
@@ -760,10 +756,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
760 struct btrfs_root_item *new_root_item; 756 struct btrfs_root_item *new_root_item;
761 struct btrfs_root *tree_root = fs_info->tree_root; 757 struct btrfs_root *tree_root = fs_info->tree_root;
762 struct btrfs_root *root = pending->root; 758 struct btrfs_root *root = pending->root;
759 struct btrfs_root *parent_root;
760 struct inode *parent_inode;
763 struct extent_buffer *tmp; 761 struct extent_buffer *tmp;
764 struct extent_buffer *old; 762 struct extent_buffer *old;
765 int ret; 763 int ret;
766 u64 objectid; 764 u64 objectid;
765 int namelen;
766 u64 index = 0;
767
768 parent_inode = pending->dentry->d_parent->d_inode;
769 parent_root = BTRFS_I(parent_inode)->root;
767 770
768 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 771 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
769 if (!new_root_item) { 772 if (!new_root_item) {
@@ -774,79 +777,59 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
774 if (ret) 777 if (ret)
775 goto fail; 778 goto fail;
776 779
777 record_root_in_trans(trans, root);
778 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
779 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
780
781 key.objectid = objectid; 780 key.objectid = objectid;
782 /* record when the snapshot was created in key.offset */ 781 /* record when the snapshot was created in key.offset */
783 key.offset = trans->transid; 782 key.offset = trans->transid;
784 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 783 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
785 784
786 old = btrfs_lock_root_node(root);
787 btrfs_cow_block(trans, root, old, NULL, 0, &old);
788 btrfs_set_lock_blocking(old);
789
790 btrfs_copy_root(trans, root, old, &tmp, objectid);
791 btrfs_tree_unlock(old);
792 free_extent_buffer(old);
793
794 btrfs_set_root_node(new_root_item, tmp);
795 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
796 new_root_item);
797 btrfs_tree_unlock(tmp);
798 free_extent_buffer(tmp);
799 if (ret)
800 goto fail;
801
802 key.offset = (u64)-1;
803 memcpy(&pending->root_key, &key, sizeof(key)); 785 memcpy(&pending->root_key, &key, sizeof(key));
804fail: 786 pending->root_key.offset = (u64)-1;
805 kfree(new_root_item);
806 return ret;
807}
808
809static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
810 struct btrfs_pending_snapshot *pending)
811{
812 int ret;
813 int namelen;
814 u64 index = 0;
815 struct btrfs_trans_handle *trans;
816 struct inode *parent_inode;
817 struct btrfs_root *parent_root;
818
819 parent_inode = pending->dentry->d_parent->d_inode;
820 parent_root = BTRFS_I(parent_inode)->root;
821 trans = btrfs_join_transaction(parent_root, 1);
822 787
788 record_root_in_trans(trans, parent_root);
823 /* 789 /*
824 * insert the directory item 790 * insert the directory item
825 */ 791 */
826 namelen = strlen(pending->name); 792 namelen = strlen(pending->name);
827 ret = btrfs_set_inode_index(parent_inode, &index); 793 ret = btrfs_set_inode_index(parent_inode, &index);
794 BUG_ON(ret);
828 ret = btrfs_insert_dir_item(trans, parent_root, 795 ret = btrfs_insert_dir_item(trans, parent_root,
829 pending->name, namelen, 796 pending->name, namelen,
830 parent_inode->i_ino, 797 parent_inode->i_ino,
831 &pending->root_key, BTRFS_FT_DIR, index); 798 &pending->root_key, BTRFS_FT_DIR, index);
832 799 BUG_ON(ret);
833 if (ret)
834 goto fail;
835 800
836 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); 801 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
837 ret = btrfs_update_inode(trans, parent_root, parent_inode); 802 ret = btrfs_update_inode(trans, parent_root, parent_inode);
838 BUG_ON(ret); 803 BUG_ON(ret);
839 804
805 record_root_in_trans(trans, root);
806 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
807 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
808
809 old = btrfs_lock_root_node(root);
810 btrfs_cow_block(trans, root, old, NULL, 0, &old);
811 btrfs_set_lock_blocking(old);
812
813 btrfs_copy_root(trans, root, old, &tmp, objectid);
814 btrfs_tree_unlock(old);
815 free_extent_buffer(old);
816
817 btrfs_set_root_node(new_root_item, tmp);
818 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
819 new_root_item);
820 BUG_ON(ret);
821 btrfs_tree_unlock(tmp);
822 free_extent_buffer(tmp);
823
840 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 824 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
841 pending->root_key.objectid, 825 pending->root_key.objectid,
842 parent_root->root_key.objectid, 826 parent_root->root_key.objectid,
843 parent_inode->i_ino, index, pending->name, 827 parent_inode->i_ino, index, pending->name,
844 namelen); 828 namelen);
845
846 BUG_ON(ret); 829 BUG_ON(ret);
847 830
848fail: 831fail:
849 btrfs_end_transaction(trans, fs_info->fs_root); 832 kfree(new_root_item);
850 return ret; 833 return ret;
851} 834}
852 835
@@ -867,25 +850,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
867 return 0; 850 return 0;
868} 851}
869 852
870static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
871 struct btrfs_fs_info *fs_info)
872{
873 struct btrfs_pending_snapshot *pending;
874 struct list_head *head = &trans->transaction->pending_snapshots;
875 int ret;
876
877 while (!list_empty(head)) {
878 pending = list_entry(head->next,
879 struct btrfs_pending_snapshot, list);
880 ret = finish_pending_snapshot(fs_info, pending);
881 BUG_ON(ret);
882 list_del(&pending->list);
883 kfree(pending->name);
884 kfree(pending);
885 }
886 return 0;
887}
888
889static void update_super_roots(struct btrfs_root *root) 853static void update_super_roots(struct btrfs_root *root)
890{ 854{
891 struct btrfs_root_item *root_item; 855 struct btrfs_root_item *root_item;
@@ -997,13 +961,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
997 961
998 mutex_unlock(&root->fs_info->trans_mutex); 962 mutex_unlock(&root->fs_info->trans_mutex);
999 963
1000 if (flush_on_commit) { 964 if (flush_on_commit || snap_pending) {
1001 btrfs_start_delalloc_inodes(root, 1); 965 btrfs_start_delalloc_inodes(root, 1);
1002 ret = btrfs_wait_ordered_extents(root, 0, 1); 966 ret = btrfs_wait_ordered_extents(root, 0, 1);
1003 BUG_ON(ret); 967 BUG_ON(ret);
1004 } else if (snap_pending) {
1005 ret = btrfs_wait_ordered_extents(root, 0, 1);
1006 BUG_ON(ret);
1007 } 968 }
1008 969
1009 /* 970 /*
@@ -1100,9 +1061,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1100 1061
1101 btrfs_finish_extent_commit(trans, root); 1062 btrfs_finish_extent_commit(trans, root);
1102 1063
1103 /* do the directory inserts of any pending snapshot creations */
1104 finish_pending_snapshots(trans, root->fs_info);
1105
1106 mutex_lock(&root->fs_info->trans_mutex); 1064 mutex_lock(&root->fs_info->trans_mutex);
1107 1065
1108 cur_trans->commit_done = 1; 1066 cur_trans->commit_done = 1;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a9434b622ec..af57dd2b43d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "disk-io.h" 23#include "disk-io.h"
@@ -445,7 +446,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
445 key.objectid = objectid; 446 key.objectid = objectid;
446 key.type = BTRFS_INODE_ITEM_KEY; 447 key.type = BTRFS_INODE_ITEM_KEY;
447 key.offset = 0; 448 key.offset = 0;
448 inode = btrfs_iget(root->fs_info->sb, &key, root); 449 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
449 if (IS_ERR(inode)) { 450 if (IS_ERR(inode)) {
450 inode = NULL; 451 inode = NULL;
451 } else if (is_bad_inode(inode)) { 452 } else if (is_bad_inode(inode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 41ecbb2347f2..8db7b14bbae8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/random.h> 23#include <linux/random.h>
@@ -256,13 +257,13 @@ loop_lock:
256 wake_up(&fs_info->async_submit_wait); 257 wake_up(&fs_info->async_submit_wait);
257 258
258 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 259 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
259 submit_bio(cur->bi_rw, cur);
260 num_run++;
261 batch_run++;
262 260
263 if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) 261 if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
264 num_sync_run++; 262 num_sync_run++;
265 263
264 submit_bio(cur->bi_rw, cur);
265 num_run++;
266 batch_run++;
266 if (need_resched()) { 267 if (need_resched()) {
267 if (num_sync_run) { 268 if (num_sync_run) {
268 blk_run_backing_dev(bdi, NULL); 269 blk_run_backing_dev(bdi, NULL);
@@ -325,16 +326,6 @@ loop_lock:
325 num_sync_run = 0; 326 num_sync_run = 0;
326 blk_run_backing_dev(bdi, NULL); 327 blk_run_backing_dev(bdi, NULL);
327 } 328 }
328
329 cond_resched();
330 if (again)
331 goto loop;
332
333 spin_lock(&device->io_lock);
334 if (device->pending_bios.head || device->pending_sync_bios.head)
335 goto loop_lock;
336 spin_unlock(&device->io_lock);
337
338 /* 329 /*
339 * IO has already been through a long path to get here. Checksumming, 330 * IO has already been through a long path to get here. Checksumming,
340 * async helper threads, perhaps compression. We've done a pretty 331 * async helper threads, perhaps compression. We've done a pretty
@@ -346,6 +337,16 @@ loop_lock:
346 * cared about found its way down here. 337 * cared about found its way down here.
347 */ 338 */
348 blk_run_backing_dev(bdi, NULL); 339 blk_run_backing_dev(bdi, NULL);
340
341 cond_resched();
342 if (again)
343 goto loop;
344
345 spin_lock(&device->io_lock);
346 if (device->pending_bios.head || device->pending_sync_bios.head)
347 goto loop_lock;
348 spin_unlock(&device->io_lock);
349
349done: 350done:
350 return 0; 351 return 0;
351} 352}
@@ -365,6 +366,7 @@ static noinline int device_list_add(const char *path,
365 struct btrfs_device *device; 366 struct btrfs_device *device;
366 struct btrfs_fs_devices *fs_devices; 367 struct btrfs_fs_devices *fs_devices;
367 u64 found_transid = btrfs_super_generation(disk_super); 368 u64 found_transid = btrfs_super_generation(disk_super);
369 char *name;
368 370
369 fs_devices = find_fsid(disk_super->fsid); 371 fs_devices = find_fsid(disk_super->fsid);
370 if (!fs_devices) { 372 if (!fs_devices) {
@@ -411,6 +413,12 @@ static noinline int device_list_add(const char *path,
411 413
412 device->fs_devices = fs_devices; 414 device->fs_devices = fs_devices;
413 fs_devices->num_devices++; 415 fs_devices->num_devices++;
416 } else if (strcmp(device->name, path)) {
417 name = kstrdup(path, GFP_NOFS);
418 if (!name)
419 return -ENOMEM;
420 kfree(device->name);
421 device->name = name;
414 } 422 }
415 423
416 if (found_transid > fs_devices->latest_trans) { 424 if (found_transid > fs_devices->latest_trans) {
@@ -592,7 +600,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
592 goto error_close; 600 goto error_close;
593 601
594 disk_super = (struct btrfs_super_block *)bh->b_data; 602 disk_super = (struct btrfs_super_block *)bh->b_data;
595 devid = le64_to_cpu(disk_super->dev_item.devid); 603 devid = btrfs_stack_device_id(&disk_super->dev_item);
596 if (devid != device->devid) 604 if (devid != device->devid)
597 goto error_brelse; 605 goto error_brelse;
598 606
@@ -694,7 +702,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
694 goto error_close; 702 goto error_close;
695 } 703 }
696 disk_super = (struct btrfs_super_block *)bh->b_data; 704 disk_super = (struct btrfs_super_block *)bh->b_data;
697 devid = le64_to_cpu(disk_super->dev_item.devid); 705 devid = btrfs_stack_device_id(&disk_super->dev_item);
698 transid = btrfs_super_generation(disk_super); 706 transid = btrfs_super_generation(disk_super);
699 if (disk_super->label[0]) 707 if (disk_super->label[0])
700 printk(KERN_INFO "device label %s ", disk_super->label); 708 printk(KERN_INFO "device label %s ", disk_super->label);
@@ -1187,7 +1195,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1187 goto error_close; 1195 goto error_close;
1188 } 1196 }
1189 disk_super = (struct btrfs_super_block *)bh->b_data; 1197 disk_super = (struct btrfs_super_block *)bh->b_data;
1190 devid = le64_to_cpu(disk_super->dev_item.devid); 1198 devid = btrfs_stack_device_id(&disk_super->dev_item);
1191 dev_uuid = disk_super->dev_item.uuid; 1199 dev_uuid = disk_super->dev_item.uuid;
1192 device = btrfs_find_device(root, devid, dev_uuid, 1200 device = btrfs_find_device(root, devid, dev_uuid,
1193 disk_super->fsid); 1201 disk_super->fsid);
@@ -2191,9 +2199,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2191 min_stripes = 2; 2199 min_stripes = 2;
2192 } 2200 }
2193 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2201 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2194 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2202 if (fs_devices->rw_devices < 2)
2195 if (num_stripes < 2)
2196 return -ENOSPC; 2203 return -ENOSPC;
2204 num_stripes = 2;
2197 min_stripes = 2; 2205 min_stripes = 2;
2198 } 2206 }
2199 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2207 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -2237,8 +2245,16 @@ again:
2237 do_div(calc_size, stripe_len); 2245 do_div(calc_size, stripe_len);
2238 calc_size *= stripe_len; 2246 calc_size *= stripe_len;
2239 } 2247 }
2248
2240 /* we don't want tiny stripes */ 2249 /* we don't want tiny stripes */
2241 calc_size = max_t(u64, min_stripe_size, calc_size); 2250 if (!looped)
2251 calc_size = max_t(u64, min_stripe_size, calc_size);
2252
2253 /*
2254 * we're about to do_div by the stripe_len so lets make sure
2255 * we end up with something bigger than a stripe
2256 */
2257 calc_size = max_t(u64, calc_size, stripe_len * 4);
2242 2258
2243 do_div(calc_size, stripe_len); 2259 do_div(calc_size, stripe_len);
2244 calc_size *= stripe_len; 2260 calc_size *= stripe_len;
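With btrfs's usual 64 KiB stripe_len (an assumption of this example), the new clamp keeps calc_size at 256 KiB or more before the do_div()/multiply pair rounds it down to a stripe multiple, so the retry pass, where the min_stripe_size floor is deliberately skipped, can no longer round a small allocation down to a zero-length stripe.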
@@ -3382,6 +3398,8 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3382 key.type = 0; 3398 key.type = 0;
3383again: 3399again:
3384 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3400 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3401 if (ret < 0)
3402 goto error;
3385 while (1) { 3403 while (1) {
3386 leaf = path->nodes[0]; 3404 leaf = path->nodes[0];
3387 slot = path->slots[0]; 3405 slot = path->slots[0];
diff --git a/fs/buffer.c b/fs/buffer.c
index 6fa530256bfd..c9c266db0624 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2893,7 +2893,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2893 2893
2894 /* 2894 /*
2895 * The page straddles i_size. It must be zeroed out on each and every 2895 * The page straddles i_size. It must be zeroed out on each and every
2896 * writepage invokation because it may be mmapped. "A file is mapped 2896 * writepage invocation because it may be mmapped. "A file is mapped
2897 * in multiples of the page size. For a file that is not a multiple of 2897 * in multiples of the page size. For a file that is not a multiple of
2898 * the page size, the remaining memory is zeroed when mapped, and 2898 * the page size, the remaining memory is zeroed when mapped, and
2899 * writes to that region are not written out to the file." 2899 * writes to that region are not written out to the file."
@@ -3265,7 +3265,7 @@ static void recalc_bh_state(void)
3265 3265
3266struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3266struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3267{ 3267{
3268 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); 3268 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3269 if (ret) { 3269 if (ret) {
3270 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3270 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3271 get_cpu_var(bh_accounting).nr++; 3271 get_cpu_var(bh_accounting).nr++;
@@ -3352,15 +3352,6 @@ int bh_submit_read(struct buffer_head *bh)
3352} 3352}
3353EXPORT_SYMBOL(bh_submit_read); 3353EXPORT_SYMBOL(bh_submit_read);
3354 3354
3355static void
3356init_buffer_head(void *data)
3357{
3358 struct buffer_head *bh = data;
3359
3360 memset(bh, 0, sizeof(*bh));
3361 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3362}
3363
3364void __init buffer_init(void) 3355void __init buffer_init(void)
3365{ 3356{
3366 int nrpages; 3357 int nrpages;
@@ -3369,7 +3360,7 @@ void __init buffer_init(void)
3369 sizeof(struct buffer_head), 0, 3360 sizeof(struct buffer_head), 0,
3370 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 3361 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3371 SLAB_MEM_SPREAD), 3362 SLAB_MEM_SPREAD),
3372 init_buffer_head); 3363 NULL);
3373 3364
3374 /* 3365 /*
3375 * Limit the bh occupancy to 10% of ZONE_NORMAL 3366 * Limit the bh occupancy to 10% of ZONE_NORMAL
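[Editorial note] The buffer.c hunk above trades the slab constructor for kmem_cache_zalloc(). A slab constructor runs only when the cache acquires a fresh page, not on every allocation, so objects had to be returned to their constructed state on free; zeroing at allocation time removes that subtlety. A minimal sketch of what the zalloc variant amounts to (illustrative, not the mm implementation):

static inline void *zalloc_sketch(struct kmem_cache *cache, gfp_t gfp)
{
	void *obj = kmem_cache_alloc(cache, gfp);

	if (obj)
		memset(obj, 0, kmem_cache_size(cache)); /* zero every alloc */
	return obj;
}

The per-allocation INIT_LIST_HEAD() stays in alloc_buffer_head(), which is all the removed constructor did beyond its memset.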
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 27089311fbea..37fe101a4e0d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <linux/mount.h> 13#include <linux/mount.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
14#include "internal.h" 15#include "internal.h"
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index f7c255f9c624..a8cd821226da 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -34,6 +34,7 @@ struct cachefiles_object {
34 loff_t i_size; /* object size */ 34 loff_t i_size; /* object size */
35 unsigned long flags; 35 unsigned long flags;
36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ 36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
37#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */
37 atomic_t usage; /* object usage count */ 38 atomic_t usage; /* object usage count */
38 uint8_t type; /* object type */ 39 uint8_t type; /* object type */
39 uint8_t new; /* T if object new */ 40 uint8_t new; /* T if object new */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index eeb4986ea7db..f4a7840bf42c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -19,6 +19,7 @@
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/namei.h> 20#include <linux/namei.h>
21#include <linux/security.h> 21#include <linux/security.h>
22#include <linux/slab.h>
22#include "internal.h" 23#include "internal.h"
23 24
24#define CACHEFILES_KEYBUF_SIZE 512 25#define CACHEFILES_KEYBUF_SIZE 512
@@ -92,6 +93,59 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object,
92} 93}
93 94
94/* 95/*
96 * mark the owner of a dentry, if there is one, to indicate that that dentry
97 * has been preemptively deleted
98 * - the caller must hold the i_mutex on the dentry's parent as required to
99 * call vfs_unlink(), vfs_rmdir() or vfs_rename()
100 */
101static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
102 struct dentry *dentry)
103{
104 struct cachefiles_object *object;
105 struct rb_node *p;
106
107 _enter(",'%*.*s'",
108 dentry->d_name.len, dentry->d_name.len, dentry->d_name.name);
109
110 write_lock(&cache->active_lock);
111
112 p = cache->active_nodes.rb_node;
113 while (p) {
114 object = rb_entry(p, struct cachefiles_object, active_node);
115 if (object->dentry > dentry)
116 p = p->rb_left;
117 else if (object->dentry < dentry)
118 p = p->rb_right;
119 else
120 goto found_dentry;
121 }
122
123 write_unlock(&cache->active_lock);
124 _leave(" [no owner]");
125 return;
126
 127 /* found the dentry for this object */
128found_dentry:
129 kdebug("preemptive burial: OBJ%x [%s] %p",
130 object->fscache.debug_id,
131 fscache_object_states[object->fscache.state],
132 dentry);
133
134 if (object->fscache.state < FSCACHE_OBJECT_DYING) {
135 printk(KERN_ERR "\n");
136 printk(KERN_ERR "CacheFiles: Error:"
137 " Can't preemptively bury live object\n");
138 cachefiles_printk_object(object, NULL);
139 } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
140 printk(KERN_ERR "CacheFiles: Error:"
141 " Object already preemptively buried\n");
142 }
143
144 write_unlock(&cache->active_lock);
145 _leave(" [owner marked]");
146}
147
148/*
95 * record the fact that an object is now active 149 * record the fact that an object is now active
96 */ 150 */
97static int cachefiles_mark_object_active(struct cachefiles_cache *cache, 151static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
@@ -218,7 +272,8 @@ requeue:
218 */ 272 */
219static int cachefiles_bury_object(struct cachefiles_cache *cache, 273static int cachefiles_bury_object(struct cachefiles_cache *cache,
220 struct dentry *dir, 274 struct dentry *dir,
221 struct dentry *rep) 275 struct dentry *rep,
276 bool preemptive)
222{ 277{
223 struct dentry *grave, *trap; 278 struct dentry *grave, *trap;
224 char nbuffer[8 + 8 + 1]; 279 char nbuffer[8 + 8 + 1];
@@ -228,11 +283,16 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
228 dir->d_name.len, dir->d_name.len, dir->d_name.name, 283 dir->d_name.len, dir->d_name.len, dir->d_name.name,
229 rep->d_name.len, rep->d_name.len, rep->d_name.name); 284 rep->d_name.len, rep->d_name.len, rep->d_name.name);
230 285
286 _debug("remove %p from %p", rep, dir);
287
231 /* non-directories can just be unlinked */ 288 /* non-directories can just be unlinked */
232 if (!S_ISDIR(rep->d_inode->i_mode)) { 289 if (!S_ISDIR(rep->d_inode->i_mode)) {
233 _debug("unlink stale object"); 290 _debug("unlink stale object");
234 ret = vfs_unlink(dir->d_inode, rep); 291 ret = vfs_unlink(dir->d_inode, rep);
235 292
293 if (preemptive)
294 cachefiles_mark_object_buried(cache, rep);
295
236 mutex_unlock(&dir->d_inode->i_mutex); 296 mutex_unlock(&dir->d_inode->i_mutex);
237 297
238 if (ret == -EIO) 298 if (ret == -EIO)
@@ -324,6 +384,9 @@ try_again:
324 if (ret != 0 && ret != -ENOMEM) 384 if (ret != 0 && ret != -ENOMEM)
325 cachefiles_io_error(cache, "Rename failed with error %d", ret); 385 cachefiles_io_error(cache, "Rename failed with error %d", ret);
326 386
387 if (preemptive)
388 cachefiles_mark_object_buried(cache, rep);
389
327 unlock_rename(cache->graveyard, dir); 390 unlock_rename(cache->graveyard, dir);
328 dput(grave); 391 dput(grave);
329 _leave(" = 0"); 392 _leave(" = 0");
@@ -339,7 +402,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
339 struct dentry *dir; 402 struct dentry *dir;
340 int ret; 403 int ret;
341 404
342 _enter(",{%p}", object->dentry); 405 _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry);
343 406
344 ASSERT(object->dentry); 407 ASSERT(object->dentry);
345 ASSERT(object->dentry->d_inode); 408 ASSERT(object->dentry->d_inode);
@@ -349,15 +412,25 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
349 412
350 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 413 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
351 414
352 /* we need to check that our parent is _still_ our parent - it may have 415 if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
353 * been renamed */ 416 /* object allocation for the same key preemptively deleted this
354 if (dir == object->dentry->d_parent) { 417 * object's file so that it could create its own file */
355 ret = cachefiles_bury_object(cache, dir, object->dentry); 418 _debug("object preemptively buried");
356 } else {
357 /* it got moved, presumably by cachefilesd culling it, so it's
358 * no longer in the key path and we can ignore it */
359 mutex_unlock(&dir->d_inode->i_mutex); 419 mutex_unlock(&dir->d_inode->i_mutex);
360 ret = 0; 420 ret = 0;
421 } else {
422 /* we need to check that our parent is _still_ our parent - it
423 * may have been renamed */
424 if (dir == object->dentry->d_parent) {
425 ret = cachefiles_bury_object(cache, dir,
426 object->dentry, false);
427 } else {
428 /* it got moved, presumably by cachefilesd culling it,
429 * so it's no longer in the key path and we can ignore
430 * it */
431 mutex_unlock(&dir->d_inode->i_mutex);
432 ret = 0;
433 }
361 } 434 }
362 435
363 dput(dir); 436 dput(dir);
@@ -380,7 +453,9 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
380 const char *name; 453 const char *name;
381 int ret, nlen; 454 int ret, nlen;
382 455
383 _enter("{%p},,%s,", parent->dentry, key); 456 _enter("OBJ%x{%p},OBJ%x,%s,",
457 parent->fscache.debug_id, parent->dentry,
458 object->fscache.debug_id, key);
384 459
385 cache = container_of(parent->fscache.cache, 460 cache = container_of(parent->fscache.cache,
386 struct cachefiles_cache, cache); 461 struct cachefiles_cache, cache);
@@ -508,7 +583,7 @@ lookup_again:
508 * mutex) */ 583 * mutex) */
509 object->dentry = NULL; 584 object->dentry = NULL;
510 585
511 ret = cachefiles_bury_object(cache, dir, next); 586 ret = cachefiles_bury_object(cache, dir, next, true);
512 dput(next); 587 dput(next);
513 next = NULL; 588 next = NULL;
514 589
@@ -827,7 +902,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
827 /* actually remove the victim (drops the dir mutex) */ 902 /* actually remove the victim (drops the dir mutex) */
828 _debug("bury"); 903 _debug("bury");
829 904
830 ret = cachefiles_bury_object(cache, dir, victim); 905 ret = cachefiles_bury_object(cache, dir, victim, false);
831 if (ret < 0) 906 if (ret < 0)
832 goto error; 907 goto error;
833 908
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 1d8332563863..0f0d41fbb03f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/mount.h> 12#include <linux/mount.h>
13#include <linux/slab.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include "internal.h" 15#include "internal.h"
15 16
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index b5808cdb2232..039b5011d83b 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -77,6 +77,8 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
77/* 77/*
78 * check the security details of the on-disk cache 78 * check the security details of the on-disk cache
79 * - must be called with security override in force 79 * - must be called with security override in force
80 * - must return with a security override in force - even in the case of an
81 * error
80 */ 82 */
81int cachefiles_determine_cache_security(struct cachefiles_cache *cache, 83int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
82 struct dentry *root, 84 struct dentry *root,
@@ -99,6 +101,8 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
99 * which create files */ 101 * which create files */
100 ret = set_create_files_as(new, root->d_inode); 102 ret = set_create_files_as(new, root->d_inode);
101 if (ret < 0) { 103 if (ret < 0) {
104 abort_creds(new);
105 cachefiles_begin_secure(cache, _saved_cred);
102 _leave(" = %d [cfa]", ret); 106 _leave(" = %d [cfa]", ret);
103 return ret; 107 return ret;
104 } 108 }
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index f3e7a0bf068b..e18b183b47e1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -16,6 +16,7 @@
16#include <linux/fsnotify.h> 16#include <linux/fsnotify.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/xattr.h> 18#include <linux/xattr.h>
19#include <linux/slab.h>
19#include "internal.h" 20#include "internal.h"
20 21
21static const char cachefiles_xattr_cache[] = 22static const char cachefiles_xattr_cache[] =
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..04b8280582a9
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
 5 select CRYPTO_AES
6 help
7 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely
9 scalable file system designed to provide high performance,
10 reliable access to petabytes of storage.
11
12 More information at http://ceph.newdream.net/.
13
14 If unsure, say N.
15
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
 22 line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o
8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
 22# Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..a9005d862ed4
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1187 @@
1#include "ceph_debug.h"
2
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/slab.h>
9#include <linux/pagevec.h>
10#include <linux/task_io_accounting_ops.h>
11
12#include "super.h"
13#include "osd_client.h"
14
15/*
16 * Ceph address space ops.
17 *
18 * There are a few funny things going on here.
19 *
20 * The page->private field is used to reference a struct
21 * ceph_snap_context for _every_ dirty page. This indicates which
22 * snapshot the page was logically dirtied in, and thus which snap
23 * context needs to be associated with the osd write during writeback.
24 *
25 * Similarly, struct ceph_inode_info maintains a set of counters to
 26 * count dirty pages on the inode. In the absence of snapshots,
27 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
28 *
29 * When a snapshot is taken (that is, when the client receives
30 * notification that a snapshot was taken), each inode with caps and
31 * with dirty pages (dirty pages implies there is a cap) gets a new
32 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
33 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
34 * moved to capsnap->dirty. (Unless a sync write is currently in
35 * progress. In that case, the capsnap is said to be "pending", new
36 * writes cannot start, and the capsnap isn't "finalized" until the
37 * write completes (or fails) and a final size/mtime for the inode for
38 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
39 *
40 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
41 * we look for the first capsnap in i_cap_snaps and write out pages in
42 * that snap context _only_. Then we move on to the next capsnap,
43 * eventually reaching the "live" or "head" context (i.e., pages that
44 * are not yet snapped) and are writing the most recently dirtied
45 * pages.
46 *
47 * Invalidate and so forth must take care to ensure the dirty page
48 * accounting is preserved.
49 */
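/*
 * Hedged illustration of the invariant above (editorial, not part of
 * the file): with no snapshots outstanding, the two counters track the
 * same set of pages, so a debug assertion could read:
 *
 *	if (list_empty(&ci->i_cap_snaps))
 *		WARN_ON(ci->i_wrbuffer_ref != ci->i_wrbuffer_ref_head);
 */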
50
51#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
52#define CONGESTION_OFF_THRESH(congestion_kb) \
53 (CONGESTION_ON_THRESH(congestion_kb) - \
54 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
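/*
 * Worked example (values assumed): with 4 KiB pages, PAGE_SHIFT is 12,
 * so for congestion_kb == 8192:
 *
 *	CONGESTION_ON_THRESH  = 8192 >> 2     = 2048 pages
 *	CONGESTION_OFF_THRESH = 2048 - 512    = 1536 pages
 *
 * i.e. the bdi is marked congested at 2048 in-flight dirty pages and
 * is uncongested again only below 1536, giving ~25% hysteresis.
 */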
55
56
57
58/*
59 * Dirty a page. Optimistically adjust accounting, on the assumption
60 * that we won't race with invalidate. If we do, readjust.
61 */
62static int ceph_set_page_dirty(struct page *page)
63{
64 struct address_space *mapping = page->mapping;
65 struct inode *inode;
66 struct ceph_inode_info *ci;
67 int undo = 0;
68 struct ceph_snap_context *snapc;
69
70 if (unlikely(!mapping))
71 return !TestSetPageDirty(page);
72
73 if (TestSetPageDirty(page)) {
74 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
75 mapping->host, page, page->index);
76 return 0;
77 }
78
79 inode = mapping->host;
80 ci = ceph_inode(inode);
81
82 /*
83 * Note that we're grabbing a snapc ref here without holding
84 * any locks!
85 */
86 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
87
88 /* dirty the head */
89 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0)
91 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0)
94 igrab(inode);
95 ++ci->i_wrbuffer_ref;
96 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
97 "snapc %p seq %lld (%d snaps)\n",
98 mapping->host, page, page->index,
99 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
100 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
101 snapc, snapc->seq, snapc->num_snaps);
102 spin_unlock(&inode->i_lock);
103
104 /* now adjust page */
105 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page));
108
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY);
117
118 /*
119 * Reference snap context in page->private. Also set
120 * PagePrivate so that we get invalidatepage callback.
121 */
122 page->private = (unsigned long)snapc;
123 SetPagePrivate(page);
124 } else {
125 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
126 undo = 1;
127 }
128
129 spin_unlock_irq(&mapping->tree_lock);
130
131 if (undo)
132 /* whoops, we failed to dirty the page */
133 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
134
135 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
136
137 BUG_ON(!PageDirty(page));
138 return 1;
139}
140
141/*
142 * If we are truncating the full page (i.e. offset == 0), adjust the
143 * dirty page counters appropriately. Only called if there is private
144 * data on the page.
145 */
146static void ceph_invalidatepage(struct page *page, unsigned long offset)
147{
148 struct inode *inode;
149 struct ceph_inode_info *ci;
150 struct ceph_snap_context *snapc = (void *)page->private;
151
152 BUG_ON(!PageLocked(page));
153 BUG_ON(!page->private);
154 BUG_ON(!PagePrivate(page));
155 BUG_ON(!page->mapping);
156
157 inode = page->mapping->host;
158
159 /*
160 * We can get non-dirty pages here due to races between
161 * set_page_dirty and truncate_complete_page; just spit out a
162 * warning, in case we end up with accounting problems later.
163 */
164 if (!PageDirty(page))
165 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
166
167 if (offset == 0)
168 ClearPageChecked(page);
169
170 ci = ceph_inode(inode);
171 if (offset == 0) {
172 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
173 inode, page, page->index, offset);
174 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
175 ceph_put_snap_context(snapc);
176 page->private = 0;
177 ClearPagePrivate(page);
178 } else {
179 dout("%p invalidatepage %p idx %lu partial dirty page\n",
180 inode, page, page->index);
181 }
182}
183
184/* just a sanity check */
185static int ceph_releasepage(struct page *page, gfp_t g)
186{
187 struct inode *inode = page->mapping ? page->mapping->host : NULL;
188 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
189 WARN_ON(PageDirty(page));
190 WARN_ON(page->private);
191 WARN_ON(PagePrivate(page));
192 return 0;
193}
194
195/*
196 * read a single page, without unlocking it.
197 */
198static int readpage_nounlock(struct file *filp, struct page *page)
199{
200 struct inode *inode = filp->f_dentry->d_inode;
201 struct ceph_inode_info *ci = ceph_inode(inode);
202 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
203 int err = 0;
204 u64 len = PAGE_CACHE_SIZE;
205
206 dout("readpage inode %p file %p page %p index %lu\n",
207 inode, filp, page, page->index);
208 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
209 page->index << PAGE_CACHE_SHIFT, &len,
210 ci->i_truncate_seq, ci->i_truncate_size,
211 &page, 1);
212 if (err == -ENOENT)
213 err = 0;
214 if (err < 0) {
215 SetPageError(page);
216 goto out;
217 } else if (err < PAGE_CACHE_SIZE) {
218 /* zero fill remainder of page */
219 zero_user_segment(page, err, PAGE_CACHE_SIZE);
220 }
221 SetPageUptodate(page);
222
223out:
224 return err < 0 ? err : 0;
225}
226
227static int ceph_readpage(struct file *filp, struct page *page)
228{
229 int r = readpage_nounlock(filp, page);
230 unlock_page(page);
231 return r;
232}
233
234/*
235 * Build a vector of contiguous pages from the provided page list.
236 */
237static struct page **page_vector_from_list(struct list_head *page_list,
238 unsigned *nr_pages)
239{
240 struct page **pages;
241 struct page *page;
242 int next_index, contig_pages = 0;
243
244 /* build page vector */
245 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
246 if (!pages)
247 return ERR_PTR(-ENOMEM);
248
249 BUG_ON(list_empty(page_list));
250 next_index = list_entry(page_list->prev, struct page, lru)->index;
251 list_for_each_entry_reverse(page, page_list, lru) {
252 if (page->index == next_index) {
253 dout("readpages page %d %p\n", contig_pages, page);
254 pages[contig_pages] = page;
255 contig_pages++;
256 next_index++;
257 } else {
258 break;
259 }
260 }
261 *nr_pages = contig_pages;
262 return pages;
263}
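/*
 * Hedged example (indices invented): readpages hands us the list with
 * the lowest index at the tail, so for pages 3, 4, 5, 9 the walk
 * starts at index 3, stops at the gap before 9, and returns the
 * vector {3, 4, 5} with *nr_pages trimmed to 3.
 */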
264
265/*
266 * Read multiple pages. Leave pages we don't read + unlock in page_list;
267 * the caller (VM) cleans them up.
268 */
269static int ceph_readpages(struct file *file, struct address_space *mapping,
270 struct list_head *page_list, unsigned nr_pages)
271{
272 struct inode *inode = file->f_dentry->d_inode;
273 struct ceph_inode_info *ci = ceph_inode(inode);
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0;
276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset;
279 u64 len;
280
281 dout("readpages %p file %p nr_pages %d\n",
282 inode, file, nr_pages);
283
284 pages = page_vector_from_list(page_list, &nr_pages);
285 if (IS_ERR(pages))
286 return PTR_ERR(pages);
287
288 /* guess read extent */
289 offset = pages[0]->index << PAGE_CACHE_SHIFT;
290 len = nr_pages << PAGE_CACHE_SHIFT;
291 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
292 offset, &len,
293 ci->i_truncate_seq, ci->i_truncate_size,
294 pages, nr_pages);
295 if (rc == -ENOENT)
296 rc = 0;
297 if (rc < 0)
298 goto out;
299
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page =
305 list_entry(page_list->prev, struct page, lru);
306
307 list_del(&page->lru);
308
309 if (rc < (int)PAGE_CACHE_SIZE) {
310 /* zero (remainder of) page */
311 int s = rc < 0 ? 0 : rc;
312 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 }
314
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page);
319 continue;
320 }
321 dout("readpages %p adding %p idx %lu\n", inode, page,
322 page->index);
323 flush_dcache_page(page);
324 SetPageUptodate(page);
325 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0)
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0;
331
332out:
333 kfree(pages);
334 return rc;
335}
336
337/*
338 * Get ref for the oldest snapc for an inode with dirty data... that is, the
339 * only snap context we are allowed to write back.
340 */
341static struct ceph_snap_context *get_oldest_context(struct inode *inode,
342 u64 *snap_size)
343{
344 struct ceph_inode_info *ci = ceph_inode(inode);
345 struct ceph_snap_context *snapc = NULL;
346 struct ceph_cap_snap *capsnap = NULL;
347
348 spin_lock(&inode->i_lock);
349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
351 capsnap->context, capsnap->dirty_pages);
352 if (capsnap->dirty_pages) {
353 snapc = ceph_get_snap_context(capsnap->context);
354 if (snap_size)
355 *snap_size = capsnap->size;
356 break;
357 }
358 }
359 if (!snapc && ci->i_head_snapc) {
360 snapc = ceph_get_snap_context(ci->i_head_snapc);
361 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head);
363 }
364 spin_unlock(&inode->i_lock);
365 return snapc;
366}
367
368/*
369 * Write a single page, but leave the page locked.
370 *
371 * If we get a write error, set the page error bit, but still adjust the
372 * dirty page accounting (i.e., page is no longer dirty).
373 */
374static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
375{
376 struct inode *inode;
377 struct ceph_inode_info *ci;
378 struct ceph_client *client;
379 struct ceph_osd_client *osdc;
380 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
381 int len = PAGE_CACHE_SIZE;
382 loff_t i_size;
383 int err = 0;
384 struct ceph_snap_context *snapc, *oldest;
385 u64 snap_size = 0;
386 long writeback_stat;
387
388 dout("writepage %p idx %lu\n", page, page->index);
389
390 if (!page->mapping || !page->mapping->host) {
391 dout("writepage %p - no mapping\n", page);
392 return -EFAULT;
393 }
394 inode = page->mapping->host;
395 ci = ceph_inode(inode);
396 client = ceph_inode_to_client(inode);
397 osdc = &client->osdc;
398
399 /* verify this is a writeable snap context */
400 snapc = (void *)page->private;
401 if (snapc == NULL) {
402 dout("writepage %p page %p not dirty?\n", inode, page);
403 goto out;
404 }
405 oldest = get_oldest_context(inode, &snap_size);
406 if (snapc->seq > oldest->seq) {
407 dout("writepage %p page %p snapc %p not writeable - noop\n",
408 inode, page, (void *)page->private);
409 /* we should only noop if called by kswapd */
410 WARN_ON((current->flags & PF_MEMALLOC) == 0);
411 ceph_put_snap_context(oldest);
412 goto out;
413 }
414 ceph_put_snap_context(oldest);
415
416 /* is this a partial page at end of file? */
417 if (snap_size)
418 i_size = snap_size;
419 else
420 i_size = i_size_read(inode);
421 if (i_size < page_off + len)
422 len = i_size - page_off;
423
424 dout("writepage %p page %p index %lu on %llu~%u\n",
425 inode, page, page->index, page_off, len);
426
427 writeback_stat = atomic_long_inc_return(&client->writeback_count);
428 if (writeback_stat >
429 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
430 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
431
432 set_page_writeback(page);
433 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
434 &ci->i_layout, snapc,
435 page_off, len,
436 ci->i_truncate_seq, ci->i_truncate_size,
437 &inode->i_mtime,
438 &page, 1, 0, 0, true);
439 if (err < 0) {
440 dout("writepage setting page/mapping error %d %p\n", err, page);
441 SetPageError(page);
442 mapping_set_error(&inode->i_data, err);
443 if (wbc)
444 wbc->pages_skipped++;
445 } else {
446 dout("writepage cleaned page %p\n", page);
447 err = 0; /* vfs expects us to return 0 */
448 }
449 page->private = 0;
450 ClearPagePrivate(page);
451 end_page_writeback(page);
452 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
453 ceph_put_snap_context(snapc); /* page's reference */
454out:
455 return err;
456}
457
458static int ceph_writepage(struct page *page, struct writeback_control *wbc)
459{
460 int err;
461 struct inode *inode = page->mapping->host;
462 BUG_ON(!inode);
463 igrab(inode);
464 err = writepage_nounlock(page, wbc);
465 unlock_page(page);
466 iput(inode);
467 return err;
468}
469
470
471/*
472 * lame release_pages helper. release_pages() isn't exported to
473 * modules.
474 */
475static void ceph_release_pages(struct page **pages, int num)
476{
477 struct pagevec pvec;
478 int i;
479
480 pagevec_init(&pvec, 0);
481 for (i = 0; i < num; i++) {
482 if (pagevec_add(&pvec, pages[i]) == 0)
483 pagevec_release(&pvec);
484 }
485 pagevec_release(&pvec);
486}
487
488
489/*
490 * async writeback completion handler.
491 *
492 * If we get an error, set the mapping error bit, but not the individual
493 * page error bits.
494 */
495static void writepages_finish(struct ceph_osd_request *req,
496 struct ceph_msg *msg)
497{
498 struct inode *inode = req->r_inode;
499 struct ceph_osd_reply_head *replyhead;
500 struct ceph_osd_op *op;
501 struct ceph_inode_info *ci = ceph_inode(inode);
502 unsigned wrote;
503 struct page *page;
504 int i;
505 struct ceph_snap_context *snapc = req->r_snapc;
506 struct address_space *mapping = inode->i_mapping;
507 __s32 rc = -EIO;
508 u64 bytes = 0;
509 struct ceph_client *client = ceph_inode_to_client(inode);
510 long writeback_stat;
511 unsigned issued = ceph_caps_issued(ci);
512
513 /* parse reply */
514 replyhead = msg->front.iov_base;
515 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
516 op = (void *)(replyhead + 1);
517 rc = le32_to_cpu(replyhead->result);
518 bytes = le64_to_cpu(op->extent.length);
519
520 if (rc >= 0) {
521 /*
522 * Assume we wrote the pages we originally sent. The
523 * osd might reply with fewer pages if our writeback
524 * raced with a truncation and was adjusted at the osd,
525 * so don't believe the reply.
526 */
527 wrote = req->r_num_pages;
528 } else {
529 wrote = 0;
530 mapping_set_error(mapping, rc);
531 }
532 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
533 inode, rc, bytes, wrote);
534
535 /* clean all pages */
536 for (i = 0; i < req->r_num_pages; i++) {
537 page = req->r_pages[i];
538 BUG_ON(!page);
539 WARN_ON(!PageUptodate(page));
540
541 writeback_stat =
542 atomic_long_dec_return(&client->writeback_count);
543 if (writeback_stat <
544 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
545 clear_bdi_congested(&client->backing_dev_info,
546 BLK_RW_ASYNC);
547
548 ceph_put_snap_context((void *)page->private);
549 page->private = 0;
550 ClearPagePrivate(page);
551 dout("unlocking %d %p\n", i, page);
552 end_page_writeback(page);
553
554 /*
555 * We lost the cache cap, need to truncate the page before
556 * it is unlocked, otherwise we'd truncate it later in the
557 * page truncation thread, possibly losing some data that
558 * raced its way in
559 */
560 if ((issued & CEPH_CAP_FILE_CACHE) == 0)
561 generic_error_remove_page(inode->i_mapping, page);
562
563 unlock_page(page);
564 }
565 dout("%p wrote+cleaned %d pages\n", inode, wrote);
566 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
567
568 ceph_release_pages(req->r_pages, req->r_num_pages);
569 if (req->r_pages_from_pool)
570 mempool_free(req->r_pages,
571 ceph_client(inode->i_sb)->wb_pagevec_pool);
572 else
573 kfree(req->r_pages);
574 ceph_osdc_put_request(req);
575}
576
577/*
 578 * allocate a page vec, either directly, or if necessary, via the
579 * mempool. we avoid the mempool if we can because req->r_num_pages
580 * may be less than the maximum write size.
581 */
582static void alloc_page_vec(struct ceph_client *client,
583 struct ceph_osd_request *req)
584{
585 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
586 GFP_NOFS);
587 if (!req->r_pages) {
588 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
589 req->r_pages_from_pool = 1;
590 WARN_ON(!req->r_pages);
591 }
592}
593
594/*
595 * initiate async writeback
596 */
597static int ceph_writepages_start(struct address_space *mapping,
598 struct writeback_control *wbc)
599{
600 struct inode *inode = mapping->host;
601 struct backing_dev_info *bdi = mapping->backing_dev_info;
602 struct ceph_inode_info *ci = ceph_inode(inode);
603 struct ceph_client *client;
604 pgoff_t index, start, end;
605 int range_whole = 0;
606 int should_loop = 1;
607 pgoff_t max_pages = 0, max_pages_ever = 0;
608 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
609 struct pagevec pvec;
610 int done = 0;
611 int rc = 0;
612 unsigned wsize = 1 << inode->i_blkbits;
613 struct ceph_osd_request *req = NULL;
614 int do_sync;
615 u64 snap_size = 0;
616
617 /*
618 * Include a 'sync' in the OSD request if this is a data
619 * integrity write (e.g., O_SYNC write or fsync()), or if our
620 * cap is being revoked.
621 */
622 do_sync = wbc->sync_mode == WB_SYNC_ALL;
623 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
624 do_sync = 1;
625 dout("writepages_start %p dosync=%d (mode=%s)\n",
626 inode, do_sync,
627 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
628 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
629
630 client = ceph_inode_to_client(inode);
631 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
632 pr_warning("writepage_start %p on forced umount\n", inode);
633 return -EIO; /* we're in a forced umount, don't write! */
634 }
635 if (client->mount_args->wsize && client->mount_args->wsize < wsize)
636 wsize = client->mount_args->wsize;
637 if (wsize < PAGE_CACHE_SIZE)
638 wsize = PAGE_CACHE_SIZE;
639 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
640
641 pagevec_init(&pvec, 0);
642
 643 /* bail if the caller won't block and the bdi is congested */
644 if (wbc->nonblocking && bdi_write_congested(bdi)) {
645 dout(" writepages congested\n");
646 wbc->encountered_congestion = 1;
647 goto out_final;
648 }
649
650 /* where to start/end? */
651 if (wbc->range_cyclic) {
652 start = mapping->writeback_index; /* Start from prev offset */
653 end = -1;
654 dout(" cyclic, start at %lu\n", start);
655 } else {
656 start = wbc->range_start >> PAGE_CACHE_SHIFT;
657 end = wbc->range_end >> PAGE_CACHE_SHIFT;
658 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
659 range_whole = 1;
660 should_loop = 0;
661 dout(" not cyclic, %lu to %lu\n", start, end);
662 }
663 index = start;
664
665retry:
666 /* find oldest snap context with dirty data */
667 ceph_put_snap_context(snapc);
668 snapc = get_oldest_context(inode, &snap_size);
669 if (!snapc) {
670 /* hmm, why does writepages get called when there
671 is no dirty data? */
672 dout(" no snap context with dirty data?\n");
673 goto out;
674 }
675 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
676 snapc, snapc->seq, snapc->num_snaps);
677 if (last_snapc && snapc != last_snapc) {
678 /* if we switched to a newer snapc, restart our scan at the
679 * start of the original file range. */
680 dout(" snapc differs from last pass, restarting at %lu\n",
681 index);
682 index = start;
683 }
684 last_snapc = snapc;
685
686 while (!done && index <= end) {
687 unsigned i;
688 int first;
689 pgoff_t next;
690 int pvec_pages, locked_pages;
691 struct page *page;
692 int want;
693 u64 offset, len;
694 struct ceph_osd_request_head *reqhead;
695 struct ceph_osd_op *op;
696 long writeback_stat;
697
698 next = 0;
699 locked_pages = 0;
700 max_pages = max_pages_ever;
701
702get_more_pages:
703 first = -1;
704 want = min(end - index,
705 min((pgoff_t)PAGEVEC_SIZE,
706 max_pages - (pgoff_t)locked_pages) - 1)
707 + 1;
708 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
709 PAGECACHE_TAG_DIRTY,
710 want);
711 dout("pagevec_lookup_tag got %d\n", pvec_pages);
712 if (!pvec_pages && !locked_pages)
713 break;
714 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
715 page = pvec.pages[i];
716 dout("? %p idx %lu\n", page, page->index);
717 if (locked_pages == 0)
718 lock_page(page); /* first page */
719 else if (!trylock_page(page))
720 break;
721
722 /* only dirty pages, or our accounting breaks */
723 if (unlikely(!PageDirty(page)) ||
724 unlikely(page->mapping != mapping)) {
725 dout("!dirty or !mapping %p\n", page);
726 unlock_page(page);
727 break;
728 }
729 if (!wbc->range_cyclic && page->index > end) {
730 dout("end of range %p\n", page);
731 done = 1;
732 unlock_page(page);
733 break;
734 }
735 if (next && (page->index != next)) {
736 dout("not consecutive %p\n", page);
737 unlock_page(page);
738 break;
739 }
740 if (wbc->sync_mode != WB_SYNC_NONE) {
741 dout("waiting on writeback %p\n", page);
742 wait_on_page_writeback(page);
743 }
744 if ((snap_size && page_offset(page) > snap_size) ||
745 (!snap_size &&
746 page_offset(page) > i_size_read(inode))) {
747 dout("%p page eof %llu\n", page, snap_size ?
748 snap_size : i_size_read(inode));
749 done = 1;
750 unlock_page(page);
751 break;
752 }
753 if (PageWriteback(page)) {
754 dout("%p under writeback\n", page);
755 unlock_page(page);
756 break;
757 }
758
759 /* only if matching snap context */
760 pgsnapc = (void *)page->private;
761 if (pgsnapc->seq > snapc->seq) {
762 dout("page snapc %p %lld > oldest %p %lld\n",
763 pgsnapc, pgsnapc->seq, snapc, snapc->seq);
764 unlock_page(page);
765 if (!locked_pages)
766 continue; /* keep looking for snap */
767 break;
768 }
769
770 if (!clear_page_dirty_for_io(page)) {
771 dout("%p !clear_page_dirty_for_io\n", page);
772 unlock_page(page);
773 break;
774 }
775
776 /* ok */
777 if (locked_pages == 0) {
778 /* prepare async write request */
779 offset = page->index << PAGE_CACHE_SHIFT;
780 len = wsize;
781 req = ceph_osdc_new_request(&client->osdc,
782 &ci->i_layout,
783 ceph_vino(inode),
784 offset, &len,
785 CEPH_OSD_OP_WRITE,
786 CEPH_OSD_FLAG_WRITE |
787 CEPH_OSD_FLAG_ONDISK,
788 snapc, do_sync,
789 ci->i_truncate_seq,
790 ci->i_truncate_size,
791 &inode->i_mtime, true, 1);
792 max_pages = req->r_num_pages;
793
794 alloc_page_vec(client, req);
795 req->r_callback = writepages_finish;
796 req->r_inode = inode;
797 }
798
799 /* note position of first page in pvec */
800 if (first < 0)
801 first = i;
802 dout("%p will write page %p idx %lu\n",
803 inode, page, page->index);
804
805 writeback_stat = atomic_long_inc_return(&client->writeback_count);
806 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
807 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
808 }
809
810 set_page_writeback(page);
811 req->r_pages[locked_pages] = page;
812 locked_pages++;
813 next = page->index + 1;
814 }
815
816 /* did we get anything? */
817 if (!locked_pages)
818 goto release_pvec_pages;
819 if (i) {
820 int j;
821 BUG_ON(!locked_pages || first < 0);
822
823 if (pvec_pages && i == pvec_pages &&
824 locked_pages < max_pages) {
825 dout("reached end pvec, trying for more\n");
826 pagevec_reinit(&pvec);
827 goto get_more_pages;
828 }
829
830 /* shift unused pages over in the pvec... we
831 * will need to release them below. */
832 for (j = i; j < pvec_pages; j++) {
833 dout(" pvec leftover page %p\n",
834 pvec.pages[j]);
835 pvec.pages[j-i+first] = pvec.pages[j];
836 }
837 pvec.nr -= i-first;
838 }
839
840 /* submit the write */
841 offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
842 len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
843 (u64)locked_pages << PAGE_CACHE_SHIFT);
844 dout("writepages got %d pages at %llu~%llu\n",
845 locked_pages, offset, len);
846
847 /* revise final length, page count */
848 req->r_num_pages = locked_pages;
849 reqhead = req->r_request->front.iov_base;
850 op = (void *)(reqhead + 1);
851 op->extent.length = cpu_to_le64(len);
852 op->payload_len = cpu_to_le32(len);
853 req->r_request->hdr.data_len = cpu_to_le32(len);
854
855 ceph_osdc_start_request(&client->osdc, req, true);
856 req = NULL;
857
858 /* continue? */
859 index = next;
860 wbc->nr_to_write -= locked_pages;
861 if (wbc->nr_to_write <= 0)
862 done = 1;
863
864release_pvec_pages:
865 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
866 pvec.nr ? pvec.pages[0] : NULL);
867 pagevec_release(&pvec);
868
869 if (locked_pages && !done)
870 goto retry;
871 }
872
873 if (should_loop && !done) {
874 /* more to do; loop back to beginning of file */
875 dout("writepages looping back to beginning of file\n");
876 should_loop = 0;
877 index = 0;
878 goto retry;
879 }
880
881 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
882 mapping->writeback_index = index;
883
884out:
885 if (req)
886 ceph_osdc_put_request(req);
887 if (rc > 0)
888 rc = 0; /* vfs expects us to return 0 */
889 ceph_put_snap_context(snapc);
890 dout("writepages done, rc = %d\n", rc);
891out_final:
892 return rc;
893}
894
895
896
897/*
898 * See if a given @snapc is either writeable, or already written.
899 */
900static int context_is_writeable_or_written(struct inode *inode,
901 struct ceph_snap_context *snapc)
902{
903 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
904 int ret = !oldest || snapc->seq <= oldest->seq;
905
906 ceph_put_snap_context(oldest);
907 return ret;
908}
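/*
 * Hedged worked example: suppose i_cap_snaps holds capsnaps with
 * snapc->seq 5 and 9 and the head context has seq 12.
 * get_oldest_context() returns the seq-5 context, so a page dirtied
 * under seq 5 is writeable now, while contexts 9 and 12 make this
 * helper return false until the seq-5 pages have been flushed. If
 * nothing is dirty at all, oldest is NULL and everything counts as
 * already written.
 */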
909
910/*
911 * We are only allowed to write into/dirty the page if the page is
912 * clean, or already dirty within the same snap context.
913 *
914 * called with page locked.
915 * return success with page locked,
916 * or any failure (incl -EAGAIN) with page unlocked.
917 */
918static int ceph_update_writeable_page(struct file *file,
919 loff_t pos, unsigned len,
920 struct page *page)
921{
922 struct inode *inode = file->f_dentry->d_inode;
923 struct ceph_inode_info *ci = ceph_inode(inode);
924 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
925 loff_t page_off = pos & PAGE_CACHE_MASK;
926 int pos_in_page = pos & ~PAGE_CACHE_MASK;
927 int end_in_page = pos_in_page + len;
928 loff_t i_size;
929 int r;
930 struct ceph_snap_context *snapc, *oldest;
931
932retry_locked:
 933 /* writepages currently holds the page lock; wait in case that changes later */
934 wait_on_page_writeback(page);
935
936 /* check snap context */
937 BUG_ON(!ci->i_snap_realm);
938 down_read(&mdsc->snap_rwsem);
939 BUG_ON(!ci->i_snap_realm->cached_context);
940 snapc = (void *)page->private;
941 if (snapc && snapc != ci->i_head_snapc) {
942 /*
943 * this page is already dirty in another (older) snap
944 * context! is it writeable now?
945 */
946 oldest = get_oldest_context(inode, NULL);
947 up_read(&mdsc->snap_rwsem);
948
949 if (snapc->seq > oldest->seq) {
950 ceph_put_snap_context(oldest);
951 dout(" page %p snapc %p not current or oldest\n",
952 page, snapc);
953 /*
954 * queue for writeback, and wait for snapc to
955 * be writeable or written
956 */
957 snapc = ceph_get_snap_context(snapc);
958 unlock_page(page);
959 ceph_queue_writeback(inode);
960 r = wait_event_interruptible(ci->i_cap_wq,
961 context_is_writeable_or_written(inode, snapc));
962 ceph_put_snap_context(snapc);
963 if (r == -ERESTARTSYS)
964 return r;
965 return -EAGAIN;
966 }
967 ceph_put_snap_context(oldest);
968
969 /* yay, writeable, do it now (without dropping page lock) */
970 dout(" page %p snapc %p not current, but oldest\n",
971 page, snapc);
972 if (!clear_page_dirty_for_io(page))
973 goto retry_locked;
974 r = writepage_nounlock(page, NULL);
975 if (r < 0)
976 goto fail_nosnap;
977 goto retry_locked;
978 }
979
980 if (PageUptodate(page)) {
981 dout(" page %p already uptodate\n", page);
982 return 0;
983 }
984
985 /* full page? */
986 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
987 return 0;
988
989 /* past end of file? */
990 i_size = inode->i_size; /* caller holds i_mutex */
991
992 if (i_size + len > inode->i_sb->s_maxbytes) {
993 /* file is too big */
994 r = -EINVAL;
995 goto fail;
996 }
997
998 if (page_off >= i_size ||
999 (pos_in_page == 0 && (pos+len) >= i_size &&
1000 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1001 dout(" zeroing %p 0 - %d and %d - %d\n",
1002 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1003 zero_user_segments(page,
1004 0, pos_in_page,
1005 end_in_page, PAGE_CACHE_SIZE);
1006 return 0;
1007 }
1008
1009 /* we need to read it. */
1010 up_read(&mdsc->snap_rwsem);
1011 r = readpage_nounlock(file, page);
1012 if (r < 0)
1013 goto fail_nosnap;
1014 goto retry_locked;
1015
1016fail:
1017 up_read(&mdsc->snap_rwsem);
1018fail_nosnap:
1019 unlock_page(page);
1020 return r;
1021}
1022
1023/*
1024 * We are only allowed to write into/dirty the page if the page is
1025 * clean, or already dirty within the same snap context.
1026 */
1027static int ceph_write_begin(struct file *file, struct address_space *mapping,
1028 loff_t pos, unsigned len, unsigned flags,
1029 struct page **pagep, void **fsdata)
1030{
1031 struct inode *inode = file->f_dentry->d_inode;
1032 struct page *page;
1033 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1034 int r;
1035
1036 do {
1037 /* get a page */
1038 page = grab_cache_page_write_begin(mapping, index, 0);
1039 if (!page)
1040 return -ENOMEM;
1041 *pagep = page;
1042
1043 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1044 inode, page, (int)pos, (int)len);
1045
1046 r = ceph_update_writeable_page(file, pos, len, page);
1047 } while (r == -EAGAIN);
1048
1049 return r;
1050}
1051
1052/*
1053 * we don't do anything in here that simple_write_end doesn't do
1054 * except adjust dirty page accounting and drop read lock on
1055 * mdsc->snap_rwsem.
1056 */
1057static int ceph_write_end(struct file *file, struct address_space *mapping,
1058 loff_t pos, unsigned len, unsigned copied,
1059 struct page *page, void *fsdata)
1060{
1061 struct inode *inode = file->f_dentry->d_inode;
1062 struct ceph_client *client = ceph_inode_to_client(inode);
1063 struct ceph_mds_client *mdsc = &client->mdsc;
1064 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1065 int check_cap = 0;
1066
1067 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1068 inode, page, (int)pos, (int)copied, (int)len);
1069
1070 /* zero the stale part of the page if we did a short copy */
1071 if (copied < len)
1072 zero_user_segment(page, from+copied, len);
1073
1074 /* did file size increase? */
 1075 /* (no need for i_size_read(); the caller holds i_mutex) */
1076 if (pos+copied > inode->i_size)
1077 check_cap = ceph_inode_set_size(inode, pos+copied);
1078
1079 if (!PageUptodate(page))
1080 SetPageUptodate(page);
1081
1082 set_page_dirty(page);
1083
1084 unlock_page(page);
1085 up_read(&mdsc->snap_rwsem);
1086 page_cache_release(page);
1087
1088 if (check_cap)
1089 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1090
1091 return copied;
1092}
1093
1094/*
1095 * we set .direct_IO to indicate direct io is supported, but since we
1096 * intercept O_DIRECT reads and writes early, this function should
1097 * never get called.
1098 */
1099static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1100 const struct iovec *iov,
1101 loff_t pos, unsigned long nr_segs)
1102{
1103 WARN_ON(1);
1104 return -EINVAL;
1105}
1106
1107const struct address_space_operations ceph_aops = {
1108 .readpage = ceph_readpage,
1109 .readpages = ceph_readpages,
1110 .writepage = ceph_writepage,
1111 .writepages = ceph_writepages_start,
1112 .write_begin = ceph_write_begin,
1113 .write_end = ceph_write_end,
1114 .set_page_dirty = ceph_set_page_dirty,
1115 .invalidatepage = ceph_invalidatepage,
1116 .releasepage = ceph_releasepage,
1117 .direct_IO = ceph_direct_io,
1118};
1119
1120
1121/*
1122 * vm ops
1123 */
1124
1125/*
1126 * Reuse write_begin here for simplicity.
1127 */
1128static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1129{
1130 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1131 struct page *page = vmf->page;
1132 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1133 loff_t off = page->index << PAGE_CACHE_SHIFT;
1134 loff_t size, len;
1135 int ret;
1136
1137 size = i_size_read(inode);
1138 if (off + PAGE_CACHE_SIZE <= size)
1139 len = PAGE_CACHE_SIZE;
1140 else
1141 len = size & ~PAGE_CACHE_MASK;
1142
1143 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
1144 off, len, page, page->index);
1145
1146 lock_page(page);
1147
1148 ret = VM_FAULT_NOPAGE;
1149 if ((off > size) ||
1150 (page->mapping != inode->i_mapping))
1151 goto out;
1152
1153 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1154 if (ret == 0) {
1155 /* success. we'll keep the page locked. */
1156 set_page_dirty(page);
1157 up_read(&mdsc->snap_rwsem);
1158 ret = VM_FAULT_LOCKED;
1159 } else {
1160 if (ret == -ENOMEM)
1161 ret = VM_FAULT_OOM;
1162 else
1163 ret = VM_FAULT_SIGBUS;
1164 }
1165out:
1166 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
1167 if (ret != VM_FAULT_LOCKED)
1168 unlock_page(page);
1169 return ret;
1170}
1171
1172static struct vm_operations_struct ceph_vmops = {
1173 .fault = filemap_fault,
1174 .page_mkwrite = ceph_page_mkwrite,
1175};
1176
1177int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1178{
1179 struct address_space *mapping = file->f_mapping;
1180
1181 if (!mapping->a_ops->readpage)
1182 return -ENOEXEC;
1183 file_accessed(file);
1184 vma->vm_ops = &ceph_vmops;
1185 vma->vm_flags |= VM_CAN_NONLINEAR;
1186 return 0;
1187}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
1
2#include <linux/errno.h>
3
4/*
5 * base64 encode/decode.
6 */
7
 8static const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9
10static int encode_bits(int c)
11{
12 return pem_key[c];
13}
14
15static int decode_bits(char c)
16{
17 if (c >= 'A' && c <= 'Z')
18 return c - 'A';
19 if (c >= 'a' && c <= 'z')
20 return c - 'a' + 26;
21 if (c >= '0' && c <= '9')
22 return c - '0' + 52;
23 if (c == '+')
24 return 62;
25 if (c == '/')
26 return 63;
27 if (c == '=')
28 return 0; /* just non-negative, please */
29 return -EINVAL;
30}
31
32int ceph_armor(char *dst, const char *src, const char *end)
33{
34 int olen = 0;
35 int line = 0;
36
37 while (src < end) {
38 unsigned char a, b, c;
39
40 a = *src++;
41 *dst++ = encode_bits(a >> 2);
42 if (src < end) {
43 b = *src++;
44 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
45 if (src < end) {
46 c = *src++;
47 *dst++ = encode_bits(((b & 15) << 2) |
48 (c >> 6));
49 *dst++ = encode_bits(c & 63);
50 } else {
51 *dst++ = encode_bits((b & 15) << 2);
52 *dst++ = '=';
53 }
54 } else {
55 *dst++ = encode_bits(((a & 3) << 4));
56 *dst++ = '=';
57 *dst++ = '=';
58 }
59 olen += 4;
60 line += 4;
61 if (line == 64) {
62 line = 0;
63 *(dst++) = '\n';
64 olen++;
65 }
66 }
67 return olen;
68}
69
70int ceph_unarmor(char *dst, const char *src, const char *end)
71{
72 int olen = 0;
73
74 while (src < end) {
75 int a, b, c, d;
76
77 if (src < end && src[0] == '\n')
78 src++;
79 if (src + 4 > end)
80 return -EINVAL;
81 a = decode_bits(src[0]);
82 b = decode_bits(src[1]);
83 c = decode_bits(src[2]);
84 d = decode_bits(src[3]);
85 if (a < 0 || b < 0 || c < 0 || d < 0)
86 return -EINVAL;
87
88 *dst++ = (a << 2) | (b >> 4);
89 if (src[2] == '=')
90 return olen + 1;
91 *dst++ = ((b & 15) << 4) | (c >> 2);
92 if (src[3] == '=')
93 return olen + 2;
94 *dst++ = ((c & 3) << 6) | d;
95 olen += 3;
96 src += 4;
97 }
98 return olen;
99}
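[Editorial note] A hedged buffer-sizing sketch for ceph_armor() callers (the helper below is illustrative, not from the tree): every 3 input bytes become 4 output characters, and the encoder emits one '\n' per 64 output characters, so a destination buffer can be sized as:

#include <linux/kernel.h>	/* DIV_ROUND_UP */

static int ceph_armor_dst_len(int src_len)
{
	int out = 4 * DIV_ROUND_UP(src_len, 3);	/* base64 expansion */

	return out + out / 64;			/* room for the newlines */
}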
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..818afe72e6c7
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,259 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h>
7
8#include "types.h"
9#include "auth_none.h"
10#include "auth_x.h"
11#include "decode.h"
12#include "super.h"
13
14#include "messenger.h"
15
16/*
17 * get protocol handler
18 */
19static u32 supported_protocols[] = {
20 CEPH_AUTH_NONE,
21 CEPH_AUTH_CEPHX
22};
23
24int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
25{
26 switch (protocol) {
27 case CEPH_AUTH_NONE:
28 return ceph_auth_none_init(ac);
29 case CEPH_AUTH_CEPHX:
30 return ceph_x_init(ac);
31 default:
32 return -ENOENT;
33 }
34}
35
36/*
37 * setup, teardown.
38 */
39struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
40{
41 struct ceph_auth_client *ac;
42 int ret;
43
44 dout("auth_init name '%s' secret '%s'\n", name, secret);
45
46 ret = -ENOMEM;
47 ac = kzalloc(sizeof(*ac), GFP_NOFS);
48 if (!ac)
49 goto out;
50
51 ac->negotiating = true;
52 if (name)
53 ac->name = name;
54 else
55 ac->name = CEPH_AUTH_NAME_DEFAULT;
56 dout("auth_init name %s secret %s\n", ac->name, secret);
57 ac->secret = secret;
58 return ac;
59
60out:
61 return ERR_PTR(ret);
62}
63
64void ceph_auth_destroy(struct ceph_auth_client *ac)
65{
66 dout("auth_destroy %p\n", ac);
67 if (ac->ops)
68 ac->ops->destroy(ac);
69 kfree(ac);
70}
71
72/*
73 * Reset occurs when reconnecting to the monitor.
74 */
75void ceph_auth_reset(struct ceph_auth_client *ac)
76{
77 dout("auth_reset %p\n", ac);
78 if (ac->ops && !ac->negotiating)
79 ac->ops->reset(ac);
80 ac->negotiating = true;
81}
82
83int ceph_entity_name_encode(const char *name, void **p, void *end)
84{
85 int len = strlen(name);
86
87 if (*p + 2*sizeof(u32) + len > end)
88 return -ERANGE;
89 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
90 ceph_encode_32(p, len);
91 ceph_encode_copy(p, name, len);
92 return 0;
93}
94
95/*
96 * Initiate protocol negotiation with monitor. Include entity name
97 * and list supported protocols.
98 */
99int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
100{
101 struct ceph_mon_request_header *monhdr = buf;
102 void *p = monhdr + 1, *end = buf + len, *lenp;
103 int i, num;
104 int ret;
105
106 dout("auth_build_hello\n");
107 monhdr->have_version = 0;
108 monhdr->session_mon = cpu_to_le16(-1);
109 monhdr->session_mon_tid = 0;
110
111 ceph_encode_32(&p, 0); /* no protocol, yet */
112
113 lenp = p;
114 p += sizeof(u32);
115
116 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
117 ceph_encode_8(&p, 1);
118 num = ARRAY_SIZE(supported_protocols);
119 ceph_encode_32(&p, num);
120 ceph_decode_need(&p, end, num * sizeof(u32), bad);
121 for (i = 0; i < num; i++)
122 ceph_encode_32(&p, supported_protocols[i]);
123
124 ret = ceph_entity_name_encode(ac->name, &p, end);
125 if (ret < 0)
126 return ret;
127 ceph_decode_need(&p, end, sizeof(u64), bad);
128 ceph_encode_64(&p, ac->global_id);
129
130 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
131 return p - buf;
132
133bad:
134 return -ERANGE;
135}
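/*
 * Editorial sketch of the hello message ceph_auth_build_hello()
 * produces (layout read off the code above; field names informal):
 *
 *	struct ceph_mon_request_header  monhdr
 *	__le32  protocol      = 0        (none chosen yet)
 *	__le32  payload_len              (patched in through 'lenp')
 *	u8      struct_v      = 1
 *	__le32  num_protocols
 *	__le32  protocols[num_protocols]
 *	__le32  entity_type   = CEPH_ENTITY_TYPE_CLIENT
 *	__le32  name_len, followed by name_len name bytes
 *	__le64  global_id
 */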
136
137int ceph_build_auth_request(struct ceph_auth_client *ac,
138 void *msg_buf, size_t msg_len)
139{
140 struct ceph_mon_request_header *monhdr = msg_buf;
141 void *p = monhdr + 1;
142 void *end = msg_buf + msg_len;
143 int ret;
144
145 monhdr->have_version = 0;
146 monhdr->session_mon = cpu_to_le16(-1);
147 monhdr->session_mon_tid = 0;
148
149 ceph_encode_32(&p, ac->protocol);
150
151 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
152 if (ret < 0) {
153 pr_err("error %d building request\n", ret);
154 return ret;
155 }
156 dout(" built request %d bytes\n", ret);
157 ceph_encode_32(&p, ret);
158 return p + ret - msg_buf;
159}
160
161/*
162 * Handle auth message from monitor.
163 */
164int ceph_handle_auth_reply(struct ceph_auth_client *ac,
165 void *buf, size_t len,
166 void *reply_buf, size_t reply_len)
167{
168 void *p = buf;
169 void *end = buf + len;
170 int protocol;
171 s32 result;
172 u64 global_id;
173 void *payload, *payload_end;
174 int payload_len;
175 char *result_msg;
176 int result_msg_len;
177 int ret = -EINVAL;
178
179 dout("handle_auth_reply %p %p\n", p, end);
180 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
181 protocol = ceph_decode_32(&p);
182 result = ceph_decode_32(&p);
183 global_id = ceph_decode_64(&p);
184 payload_len = ceph_decode_32(&p);
185 payload = p;
186 p += payload_len;
187 ceph_decode_need(&p, end, sizeof(u32), bad);
188 result_msg_len = ceph_decode_32(&p);
189 result_msg = p;
190 p += result_msg_len;
191 if (p != end)
192 goto bad;
193
194 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
195 result_msg, global_id, payload_len);
196
197 payload_end = payload + payload_len;
198
199 if (global_id && ac->global_id != global_id) {
200 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
201 ac->global_id = global_id;
202 }
203
204 if (ac->negotiating) {
205 /* server does not support our protocols? */
206 if (!protocol && result < 0) {
207 ret = result;
208 goto out;
209 }
210 /* set up (new) protocol handler? */
211 if (ac->protocol && ac->protocol != protocol) {
212 ac->ops->destroy(ac);
213 ac->protocol = 0;
214 ac->ops = NULL;
215 }
216 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) {
219 pr_err("error %d on auth protocol %d init\n",
220 ret, protocol);
221 goto out;
222 }
223 }
224
225 ac->negotiating = false;
226 }
227
228 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) {
232 pr_err("authentication error %d\n", ret);
233 return ret;
234 }
235 return 0;
236
237bad:
238 pr_err("failed to decode auth msg\n");
239out:
240 return ret;
241}
242
243int ceph_build_auth(struct ceph_auth_client *ac,
244 void *msg_buf, size_t msg_len)
245{
246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops);
249 if (!ac->ops->is_authenticated(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0;
252}
253
254int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
255{
256 if (!ac->ops)
257 return 0;
258 return ac->ops->is_authenticated(ac);
259}
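
Reading aid, not part of the patch: the functions above are meant to be driven
in a simple request/reply loop by the monitor client (which lands elsewhere in
this series). A minimal sketch under that assumption; send_to_mon() and
recv_from_mon() are hypothetical stand-ins for the messenger:

	/* hypothetical driver for the handshake implemented above */
	static int auth_handshake(struct ceph_auth_client *ac,
				  void *req, size_t req_len,
				  void *reply, size_t reply_len)
	{
		int len, rlen;

		/* kick off with the hello (ac->protocol is still 0 here) */
		len = ceph_build_auth(ac, req, req_len);
		if (len < 0)
			return len;

		while (len > 0) {
			send_to_mon(req, len);			/* hypothetical */
			rlen = recv_from_mon(reply, reply_len);	/* hypothetical */
			/* a positive return means the next request is in req */
			len = ceph_handle_auth_reply(ac, reply, rlen,
						     req, req_len);
			if (len < 0)
				return len;
		}
		return ceph_auth_is_authenticated(ac) ? 0 : -EAGAIN;
	}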
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..ca4f57cfb267
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 /*
19 * true if we are authenticated and can connect to
20 * services.
21 */
22 int (*is_authenticated)(struct ceph_auth_client *ac);
23
24 /*
25 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request.
28 */
29 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
30 int (*handle_reply)(struct ceph_auth_client *ac, int result,
31 void *buf, void *end);
32
33 /*
34 * Create authorizer for connecting to a service, and verify
35 * the response to authenticate the service.
36 */
37 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
38 struct ceph_authorizer **a,
39 void **buf, size_t *len,
40 void **reply_buf, size_t *reply_len);
41 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
42 struct ceph_authorizer *a, size_t len);
43 void (*destroy_authorizer)(struct ceph_auth_client *ac,
44 struct ceph_authorizer *a);
45 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
46 int peer_type);
47
48 /* reset when we (re)connect to a monitor */
49 void (*reset)(struct ceph_auth_client *ac);
50
51 void (*destroy)(struct ceph_auth_client *ac);
52};
53
54struct ceph_auth_client {
55 u32 protocol; /* CEPH_AUTH_* */
56 void *private; /* for use by protocol implementation */
57 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
58
59 bool negotiating; /* true if negotiating protocol */
60 const char *name; /* entity name */
61 u64 global_id; /* our unique id in system */
62 const char *secret; /* our secret key */
63 unsigned want_keys; /* which services we want */
64};
65
66extern struct ceph_auth_client *ceph_auth_init(const char *name,
67 const char *secret);
68extern void ceph_auth_destroy(struct ceph_auth_client *ac);
69
70extern void ceph_auth_reset(struct ceph_auth_client *ac);
71
72extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
73 void *buf, size_t len);
74extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
75 void *buf, size_t len,
76 void *reply_buf, size_t reply_len);
77extern int ceph_entity_name_encode(const char *name, void **p, void *end);
78
79extern int ceph_build_auth(struct ceph_auth_client *ac,
80 void *msg_buf, size_t msg_len);
81
82extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
83
84#endif
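
Reading aid, not part of the patch: a sketch of how a service connection might
consume the authorizer half of this interface. In the full series the
messenger is the real caller when connecting to an MDS or OSD; this function
is illustrative only:

	static int connect_authorize(struct ceph_auth_client *ac, int peer_type)
	{
		struct ceph_authorizer *a;
		void *buf, *reply;
		size_t len, reply_len;
		int ret;

		ret = ac->ops->create_authorizer(ac, peer_type, &a,
						 &buf, &len, &reply, &reply_len);
		if (ret)
			return ret;

		/* ... send buf/len with the connect handshake; the service's
		 * response lands in reply/reply_len ... */

		if (ac->ops->verify_authorizer_reply)
			ret = ac->ops->verify_authorizer_reply(ac, a, reply_len);
		ac->ops->destroy_authorizer(ac, a);
		return ret;	/* 0 means the service checked out */
	}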
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..8cd9e3af07f7
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,122 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34/*
35 * the generic auth code decodes the global_id, and we carry no actual
36 * authentication state, so nothing happens here.
37 */
38static int handle_reply(struct ceph_auth_client *ac, int result,
39 void *buf, void *end)
40{
41 struct ceph_auth_none_info *xi = ac->private;
42
43 xi->starting = false;
44 return result;
45}
46
47/*
48 * build an 'authorizer' with our entity_name and global_id. we can
49 * reuse a single static copy since it is identical for all services
50 * we connect to.
51 */
52static int ceph_auth_none_create_authorizer(
53 struct ceph_auth_client *ac, int peer_type,
54 struct ceph_authorizer **a,
55 void **buf, size_t *len,
56 void **reply_buf, size_t *reply_len)
57{
58 struct ceph_auth_none_info *ai = ac->private;
59 struct ceph_none_authorizer *au = &ai->au;
60 void *p, *end;
61 int ret;
62
63 if (!ai->built_authorizer) {
64 p = au->buf;
65 end = p + sizeof(au->buf);
66 ceph_encode_8(&p, 1);
67 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
68 if (ret < 0)
69 goto bad;
70 ceph_decode_need(&p, end, sizeof(u64), bad2);
71 ceph_encode_64(&p, ac->global_id);
72 au->buf_len = p - (void *)au->buf;
73 ai->built_authorizer = true;
74 dout("built authorizer len %d\n", au->buf_len);
75 }
76
77 *a = (struct ceph_authorizer *)au;
78 *buf = au->buf;
79 *len = au->buf_len;
80 *reply_buf = au->reply_buf;
81 *reply_len = sizeof(au->reply_buf);
82 return 0;
83
84bad2:
85 ret = -ERANGE;
86bad:
87 return ret;
88}
89
90static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
91 struct ceph_authorizer *a)
92{
93 /* nothing to do */
94}
95
96static const struct ceph_auth_client_ops ceph_auth_none_ops = {
97 .reset = reset,
98 .destroy = destroy,
99 .is_authenticated = is_authenticated,
100 .handle_reply = handle_reply,
101 .create_authorizer = ceph_auth_none_create_authorizer,
102 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
103};
104
105int ceph_auth_none_init(struct ceph_auth_client *ac)
106{
107 struct ceph_auth_none_info *xi;
108
109 dout("ceph_auth_none_init %p\n", ac);
110 xi = kzalloc(sizeof(*xi), GFP_NOFS);
111 if (!xi)
112 return -ENOMEM;
113
114 xi->starting = true;
115 xi->built_authorizer = false;
116
117 ac->protocol = CEPH_AUTH_NONE;
118 ac->private = xi;
119 ac->ops = &ceph_auth_none_ops;
120 return 0;
121}
122
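
For reference (not in the patch), the buffer built by
ceph_auth_none_create_authorizer() has a fixed wire layout, following
ceph_entity_name_encode() plus the trailing global id:

	u8   struct_v (1)
	u32  CEPH_ENTITY_TYPE_CLIENT
	u32  name_len
	char name[name_len]
	u64  global_id

so the 128-byte buf in struct ceph_none_authorizer comfortably covers typical
entity names, which is why a single prebuilt copy can be reused for every
service connection.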
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..8164df1a08be
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,30 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include <linux/slab.h>
5
6#include "auth.h"
7
8/*
9 * null security mode.
10 *
11 * we use a single static authorizer that simply encodes our entity name
12 * and global id.
13 */
14
15struct ceph_none_authorizer {
16 char buf[128];
17 int buf_len;
18 char reply_buf[0];
19};
20
21struct ceph_auth_none_info {
22 bool starting;
23 bool built_authorizer;
24 struct ceph_none_authorizer au; /* we only need one; it's static */
25};
26
27extern int ceph_auth_none_init(struct ceph_auth_client *ac);
28
29#endif
30
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..fee5a08da881
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,668 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15#define TEMP_TICKET_BUF_LEN 256
16
17static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
18
19static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
20{
21 struct ceph_x_info *xi = ac->private;
22 int need;
23
24 ceph_x_validate_tickets(ac, &need);
25 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
26 ac->want_keys, need, xi->have_keys);
27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28}
29
30static int ceph_x_encrypt_buflen(int ilen)
31{
32 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
33 sizeof(u32);
34}
35
36static int ceph_x_encrypt(struct ceph_crypto_key *secret,
37 void *ibuf, int ilen, void *obuf, size_t olen)
38{
39 struct ceph_x_encrypt_header head = {
40 .struct_v = 1,
41 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
42 };
43 size_t len = olen - sizeof(u32);
44 int ret;
45
46 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
47 &head, sizeof(head), ibuf, ilen);
48 if (ret)
49 return ret;
50 ceph_encode_32(&obuf, len);
51 return len + sizeof(u32);
52}
53
54static int ceph_x_decrypt(struct ceph_crypto_key *secret,
55 void **p, void *end, void *obuf, size_t olen)
56{
57 struct ceph_x_encrypt_header head;
58 size_t head_len = sizeof(head);
59 int len, ret;
60
61 len = ceph_decode_32(p);
62 if (*p + len > end)
63 return -EINVAL;
64
65 dout("ceph_x_decrypt len %d\n", len);
66 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
67 *p, len);
68 if (ret)
69 return ret;
70 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
71 return -EPERM;
72 *p += len;
73 return olen;
74}
75
76/*
77 * get existing (or insert new) ticket handler
78 */
79struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
80 int service)
81{
82 struct ceph_x_ticket_handler *th;
83 struct ceph_x_info *xi = ac->private;
84 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
85
86 while (*p) {
87 parent = *p;
88 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
89 if (service < th->service)
90 p = &(*p)->rb_left;
91 else if (service > th->service)
92 p = &(*p)->rb_right;
93 else
94 return th;
95 }
96
97 /* add it */
98 th = kzalloc(sizeof(*th), GFP_NOFS);
99 if (!th)
100 return ERR_PTR(-ENOMEM);
101 th->service = service;
102 rb_link_node(&th->node, parent, p);
103 rb_insert_color(&th->node, &xi->ticket_handlers);
104 return th;
105}
106
107static void remove_ticket_handler(struct ceph_auth_client *ac,
108 struct ceph_x_ticket_handler *th)
109{
110 struct ceph_x_info *xi = ac->private;
111
112 dout("remove_ticket_handler %p %d\n", th, th->service);
113 rb_erase(&th->node, &xi->ticket_handlers);
114 ceph_crypto_key_destroy(&th->session_key);
115 if (th->ticket_blob)
116 ceph_buffer_put(th->ticket_blob);
117 kfree(th);
118}
119
120static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
121 struct ceph_crypto_key *secret,
122 void *buf, void *end)
123{
124 struct ceph_x_info *xi = ac->private;
125 int num;
126 void *p = buf;
127 int ret;
128 char *dbuf;
129 char *ticket_buf;
130 u8 struct_v;
131
132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
133 if (!dbuf)
134 return -ENOMEM;
135
136 ret = -ENOMEM;
137 ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
138 if (!ticket_buf)
139 goto out_dbuf;
140
141 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
142 struct_v = ceph_decode_8(&p);
143 if (struct_v != 1)
144 goto bad;
145 num = ceph_decode_32(&p);
146 dout("%d tickets\n", num);
147 while (num--) {
148 int type;
149 u8 struct_v;
150 struct ceph_x_ticket_handler *th;
151 void *dp, *dend;
152 int dlen;
153 char is_enc;
154 struct timespec validity;
155 struct ceph_crypto_key old_key;
156 void *tp, *tpend;
157 struct ceph_timespec new_validity;
158 struct ceph_crypto_key new_session_key;
159 struct ceph_buffer *new_ticket_blob;
160 unsigned long new_expires, new_renew_after;
161 u64 new_secret_id;
162
163 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
164
165 type = ceph_decode_32(&p);
166 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
167
168 struct_v = ceph_decode_8(&p);
169 if (struct_v != 1)
170 goto bad;
171
172 th = get_ticket_handler(ac, type);
173 if (IS_ERR(th)) {
174 ret = PTR_ERR(th);
175 goto out;
176 }
177
178 /* blob for me */
179 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
180 TEMP_TICKET_BUF_LEN);
181 if (dlen <= 0) {
182 ret = dlen;
183 goto out;
184 }
185 dout(" decrypted %d bytes\n", dlen);
186 dend = dbuf + dlen;
187 dp = dbuf;
188
189 struct_v = ceph_decode_8(&dp);
190 if (struct_v != 1)
191 goto bad;
192
193 memcpy(&old_key, &th->session_key, sizeof(old_key));
194 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
195 if (ret)
196 goto out;
197
198 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
199 ceph_decode_timespec(&validity, &new_validity);
200 new_expires = get_seconds() + validity.tv_sec;
201 new_renew_after = new_expires - (validity.tv_sec / 4);
202 dout(" expires=%lu renew_after=%lu\n", new_expires,
203 new_renew_after);
204
205 /* ticket blob for service */
206 ceph_decode_8_safe(&p, end, is_enc, bad);
207 tp = ticket_buf;
208 if (is_enc) {
209 /* encrypted */
210 dout(" encrypted ticket\n");
211 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
212 TEMP_TICKET_BUF_LEN);
213 if (dlen < 0) {
214 ret = dlen;
215 goto out;
216 }
217 dlen = ceph_decode_32(&tp);
218 } else {
219 /* unencrypted */
220 ceph_decode_32_safe(&p, end, dlen, bad);
221 ceph_decode_need(&p, end, dlen, bad);
222 ceph_decode_copy(&p, ticket_buf, dlen);
223 }
224 tpend = tp + dlen;
225 dout(" ticket blob is %d bytes\n", dlen);
226 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
227 struct_v = ceph_decode_8(&tp);
228 new_secret_id = ceph_decode_64(&tp);
229 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
230 if (ret)
231 goto out;
232
233 /* all is well, update our ticket */
234 ceph_crypto_key_destroy(&th->session_key);
235 if (th->ticket_blob)
236 ceph_buffer_put(th->ticket_blob);
237 th->session_key = new_session_key;
238 th->ticket_blob = new_ticket_blob;
239 th->validity = new_validity;
240 th->secret_id = new_secret_id;
241 th->expires = new_expires;
242 th->renew_after = new_renew_after;
243 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
244 type, ceph_entity_type_name(type), th->secret_id,
245 (int)th->ticket_blob->vec.iov_len);
246 xi->have_keys |= th->service;
247 }
248
249 ret = 0;
250out:
251 kfree(ticket_buf);
252out_dbuf:
253 kfree(dbuf);
254 return ret;
255
256bad:
257 ret = -EINVAL;
258 goto out;
259}
260
261static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
262 struct ceph_x_ticket_handler *th,
263 struct ceph_x_authorizer *au)
264{
265 int maxlen;
266 struct ceph_x_authorize_a *msg_a;
267 struct ceph_x_authorize_b msg_b;
268 void *p, *end;
269 int ret;
270 int ticket_blob_len =
271 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
272
273 dout("build_authorizer for %s %p\n",
274 ceph_entity_type_name(th->service), au);
275
276 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
277 ceph_x_encrypt_buflen(ticket_blob_len);
278 dout(" need len %d\n", maxlen);
279 if (au->buf && au->buf->alloc_len < maxlen) {
280 ceph_buffer_put(au->buf);
281 au->buf = NULL;
282 }
283 if (!au->buf) {
284 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
285 if (!au->buf)
286 return -ENOMEM;
287 }
288 au->service = th->service;
289
290 msg_a = au->buf->vec.iov_base;
291 msg_a->struct_v = 1;
292 msg_a->global_id = cpu_to_le64(ac->global_id);
293 msg_a->service_id = cpu_to_le32(th->service);
294 msg_a->ticket_blob.struct_v = 1;
295 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
296 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
297 if (ticket_blob_len) {
298 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
299 th->ticket_blob->vec.iov_len);
300 }
301 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
302 le64_to_cpu(msg_a->ticket_blob.secret_id));
303
304 p = msg_a + 1;
305 p += ticket_blob_len;
306 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
307
308 get_random_bytes(&au->nonce, sizeof(au->nonce));
309 msg_b.struct_v = 1;
310 msg_b.nonce = cpu_to_le64(au->nonce);
311 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
312 p, end - p);
313 if (ret < 0)
314 goto out_buf;
315 p += ret;
316 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
317 dout(" built authorizer nonce %llx len %d\n", au->nonce,
318 (int)au->buf->vec.iov_len);
319 BUG_ON(au->buf->vec.iov_len > maxlen);
320 return 0;
321
322out_buf:
323 ceph_buffer_put(au->buf);
324 au->buf = NULL;
325 return ret;
326}
327
328static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
329 void **p, void *end)
330{
331 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
332 ceph_encode_8(p, 1);
333 ceph_encode_64(p, th->secret_id);
334 if (th->ticket_blob) {
335 const char *buf = th->ticket_blob->vec.iov_base;
336 u32 len = th->ticket_blob->vec.iov_len;
337
338 ceph_encode_32_safe(p, end, len, bad);
339 ceph_encode_copy_safe(p, end, buf, len, bad);
340 } else {
341 ceph_encode_32_safe(p, end, 0, bad);
342 }
343
344 return 0;
345bad:
346 return -ERANGE;
347}
348
349static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
350{
351 int want = ac->want_keys;
352 struct ceph_x_info *xi = ac->private;
353 int service;
354
355 *pneed = ac->want_keys & ~(xi->have_keys);
356
357 for (service = 1; service <= want; service <<= 1) {
358 struct ceph_x_ticket_handler *th;
359
360 if (!(ac->want_keys & service))
361 continue;
362
363 if (*pneed & service)
364 continue;
365
366 th = get_ticket_handler(ac, service);
367
368 if (!th) {
369 *pneed |= service;
370 continue;
371 }
372
373 if (get_seconds() >= th->renew_after)
374 *pneed |= service;
375 if (get_seconds() >= th->expires)
376 xi->have_keys &= ~service;
377 }
378}
379
380
381static int ceph_x_build_request(struct ceph_auth_client *ac,
382 void *buf, void *end)
383{
384 struct ceph_x_info *xi = ac->private;
385 int need;
386 struct ceph_x_request_header *head = buf;
387 int ret;
388 struct ceph_x_ticket_handler *th =
389 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
390
391 ceph_x_validate_tickets(ac, &need);
392
393 dout("build_request want %x have %x need %x\n",
394 ac->want_keys, xi->have_keys, need);
395
396 if (need & CEPH_ENTITY_TYPE_AUTH) {
397 struct ceph_x_authenticate *auth = (void *)(head + 1);
398 void *p = auth + 1;
399 struct ceph_x_challenge_blob tmp;
400 char tmp_enc[40];
401 u64 *u;
402
403 if (p > end)
404 return -ERANGE;
405
406 dout(" get_auth_session_key\n");
407 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
408
409 /* encrypt and hash */
410 get_random_bytes(&auth->client_challenge, sizeof(u64));
411 tmp.client_challenge = auth->client_challenge;
412 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
413 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
414 tmp_enc, sizeof(tmp_enc));
415 if (ret < 0)
416 return ret;
417
418 auth->struct_v = 1;
419 auth->key = 0;
420 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
421 auth->key ^= *u;
422 dout(" server_challenge %llx client_challenge %llx key %llx\n",
423 xi->server_challenge, le64_to_cpu(auth->client_challenge),
424 le64_to_cpu(auth->key));
425
426 /* now encode the old ticket, if one exists */
427 ret = ceph_x_encode_ticket(th, &p, end);
428 if (ret < 0)
429 return ret;
430
431 return p - buf;
432 }
433
434 if (need) {
435 void *p = head + 1;
436 struct ceph_x_service_ticket_request *req;
437
438 if (p > end)
439 return -ERANGE;
440 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
441
442 BUG_ON(!th);
443 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
444 if (ret)
445 return ret;
446 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
447 xi->auth_authorizer.buf->vec.iov_len);
448
449 req = p;
450 req->keys = cpu_to_le32(need);
451 p += sizeof(*req);
452 return p - buf;
453 }
454
455 return 0;
456}
457
458static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
459 void *buf, void *end)
460{
461 struct ceph_x_info *xi = ac->private;
462 struct ceph_x_reply_header *head = buf;
463 struct ceph_x_ticket_handler *th;
464 int len = end - buf;
465 int op;
466 int ret;
467
468 if (result)
469 return result; /* XXX hmm? */
470
471 if (xi->starting) {
472 /* it's a hello */
473 struct ceph_x_server_challenge *sc = buf;
474
475 if (len != sizeof(*sc))
476 return -EINVAL;
477 xi->server_challenge = le64_to_cpu(sc->server_challenge);
478 dout("handle_reply got server challenge %llx\n",
479 xi->server_challenge);
480 xi->starting = false;
481 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
482 return -EAGAIN;
483 }
484
485 op = le32_to_cpu(head->op);
486 result = le32_to_cpu(head->result);
487 dout("handle_reply op %d result %d\n", op, result);
488 switch (op) {
489 case CEPHX_GET_AUTH_SESSION_KEY:
490 /* verify auth key */
491 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
492 buf + sizeof(*head), end);
493 break;
494
495 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
496 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
497 BUG_ON(!th);
498 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
499 buf + sizeof(*head), end);
500 break;
501
502 default:
503 return -EINVAL;
504 }
505 if (ret)
506 return ret;
507 if (ac->want_keys == xi->have_keys)
508 return 0;
509 return -EAGAIN;
510}
511
512static int ceph_x_create_authorizer(
513 struct ceph_auth_client *ac, int peer_type,
514 struct ceph_authorizer **a,
515 void **buf, size_t *len,
516 void **reply_buf, size_t *reply_len)
517{
518 struct ceph_x_authorizer *au;
519 struct ceph_x_ticket_handler *th;
520 int ret;
521
522 th = get_ticket_handler(ac, peer_type);
523 if (IS_ERR(th))
524 return PTR_ERR(th);
525
526 au = kzalloc(sizeof(*au), GFP_NOFS);
527 if (!au)
528 return -ENOMEM;
529
530 ret = ceph_x_build_authorizer(ac, th, au);
531 if (ret) {
532 kfree(au);
533 return ret;
534 }
535
536 *a = (struct ceph_authorizer *)au;
537 *buf = au->buf->vec.iov_base;
538 *len = au->buf->vec.iov_len;
539 *reply_buf = au->reply_buf;
540 *reply_len = sizeof(au->reply_buf);
541 return 0;
542}
543
544static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
545 struct ceph_authorizer *a, size_t len)
546{
547 struct ceph_x_authorizer *au = (void *)a;
548 struct ceph_x_ticket_handler *th;
549 int ret = 0;
550 struct ceph_x_authorize_reply reply;
551 void *p = au->reply_buf;
552 void *end = p + sizeof(au->reply_buf);
553
554 th = get_ticket_handler(ac, au->service);
555 if (!th)
556 return -EIO; /* hrm! */
557 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
558 if (ret < 0)
559 return ret;
560 if (ret != sizeof(reply))
561 return -EPERM;
562
563 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
564 ret = -EPERM;
565 else
566 ret = 0;
567 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
568 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
569 return ret;
570}
571
572static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
573 struct ceph_authorizer *a)
574{
575 struct ceph_x_authorizer *au = (void *)a;
576
577 ceph_buffer_put(au->buf);
578 kfree(au);
579}
580
581
582static void ceph_x_reset(struct ceph_auth_client *ac)
583{
584 struct ceph_x_info *xi = ac->private;
585
586 dout("reset\n");
587 xi->starting = true;
588 xi->server_challenge = 0;
589}
590
591static void ceph_x_destroy(struct ceph_auth_client *ac)
592{
593 struct ceph_x_info *xi = ac->private;
594 struct rb_node *p;
595
596 dout("ceph_x_destroy %p\n", ac);
597 ceph_crypto_key_destroy(&xi->secret);
598
599 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
600 struct ceph_x_ticket_handler *th =
601 rb_entry(p, struct ceph_x_ticket_handler, node);
602 remove_ticket_handler(ac, th);
603 }
604
605 kfree(ac->private);
606 ac->private = NULL;
607}
608
609static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
610 int peer_type)
611{
612 struct ceph_x_ticket_handler *th;
613
614 th = get_ticket_handler(ac, peer_type);
615 if (th && !IS_ERR(th))
616 remove_ticket_handler(ac, th);
617}
618
619
620static const struct ceph_auth_client_ops ceph_x_ops = {
621 .is_authenticated = ceph_x_is_authenticated,
622 .build_request = ceph_x_build_request,
623 .handle_reply = ceph_x_handle_reply,
624 .create_authorizer = ceph_x_create_authorizer,
625 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
626 .destroy_authorizer = ceph_x_destroy_authorizer,
627 .invalidate_authorizer = ceph_x_invalidate_authorizer,
628 .reset = ceph_x_reset,
629 .destroy = ceph_x_destroy,
630};
631
632
633int ceph_x_init(struct ceph_auth_client *ac)
634{
635 struct ceph_x_info *xi;
636 int ret;
637
638 dout("ceph_x_init %p\n", ac);
639 ret = -ENOMEM;
640 xi = kzalloc(sizeof(*xi), GFP_NOFS);
641 if (!xi)
642 goto out;
643
644 ret = -EINVAL;
645 if (!ac->secret) {
646 pr_err("no secret set (for auth_x protocol)\n");
647 goto out_nomem;
648 }
649
650 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
651 if (ret)
652 goto out_nomem;
653
654 xi->starting = true;
655 xi->ticket_handlers = RB_ROOT;
656
657 ac->protocol = CEPH_AUTH_CEPHX;
658 ac->private = xi;
659 ac->ops = &ceph_x_ops;
660 return 0;
661
662out_nomem:
663 kfree(xi);
664out:
665 return ret;
666}
667
668
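
One easily missed step above: in ceph_x_build_request(), the 64-bit 'key'
proving knowledge of the shared secret is the XOR-fold of the encrypted
{server_challenge, client_challenge} blob. The same fold as a standalone
sketch (reading aid only, mirroring the loop in ceph_x_build_request()):

	/* fold an encrypted challenge blob into the 64-bit cephx proof value */
	static u64 cephx_fold_key(const void *enc, int enc_len)
	{
		const u64 *u;
		u64 key = 0;

		for (u = enc; u + 1 <= (const u64 *)(enc + enc_len); u++)
			key ^= *u;
		return key;
	}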
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
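
These structs determine the buffer bound computed in ceph_x_build_authorizer()
over in auth_x.c; restated here as a reading aid (not part of the patch):

	/* authorizer buffer bound: part a (plus the appended ticket blob)
	 * travels in the clear, part b is encrypted with the session key */
	maxlen = sizeof(struct ceph_x_authorize_a)	/* fixed part a */
	       + sizeof(struct ceph_x_authorize_b)	/* nonce, pre-encryption */
	       + ceph_x_encrypt_buflen(ticket_blob_len);

	/* where ceph_x_encrypt_buflen(n) ==
	 *	sizeof(struct ceph_x_encrypt_header) + n + 16 + sizeof(u32)
	 * (cipher header, payload, worst-case block padding, length prefix) */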
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..c67535d70aa6
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,81 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{
68 size_t len;
69
70 ceph_decode_need(p, end, sizeof(u32), bad);
71 len = ceph_decode_32(p);
72 dout("decode_buffer len %d\n", (int)len);
73 ceph_decode_need(p, end, len, bad);
74 *b = ceph_buffer_new(len, GFP_NOFS);
75 if (!*b)
76 return -ENOMEM;
77 ceph_decode_copy(p, (*b)->vec.iov_base, len);
78 return 0;
79bad:
80 return -EINVAL;
81}
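
Reading aid: a typical lifecycle of these refcounted buffers, as a fragment
(the ticket-blob and xattr paths in this series are the real users; 'data'
and 'data_len' below are hypothetical):

	struct ceph_buffer *b;

	b = ceph_buffer_new(data_len, GFP_NOFS);	/* kref starts at 1 */
	if (!b)
		return -ENOMEM;
	memcpy(b->vec.iov_base, data, data_len);	/* 'data' is hypothetical */

	ceph_buffer_get(b);	/* hand a second reference to another owner */
	ceph_buffer_put(b);	/* first owner done */
	ceph_buffer_put(b);	/* last ref: ceph_buffer_release() frees it */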
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..d9400534b279
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2960 @@
1#include "ceph_debug.h"
2
3#include <linux/fs.h>
4#include <linux/kernel.h>
5#include <linux/sched.h>
6#include <linux/slab.h>
7#include <linux/vmalloc.h>
8#include <linux/wait.h>
9#include <linux/writeback.h>
10
11#include "super.h"
12#include "decode.h"
13#include "messenger.h"
14
15/*
16 * Capability management
17 *
18 * The Ceph metadata servers control client access to inode metadata
19 * and file data by issuing capabilities, granting clients permission
20 * to read and/or write both inode fields and file data to OSDs
21 * (storage nodes). Each capability consists of a set of bits
22 * indicating which operations are allowed.
23 *
24 * If the client holds a *_SHARED cap, the client has a coherent value
25 * that can be safely read from the cached inode.
26 *
27 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
28 * client is allowed to change inode attributes (e.g., file size,
29 * mtime), note its dirty state in the ceph_cap, and asynchronously
30 * flush that metadata change to the MDS.
31 *
32 * In the event of a conflicting operation (perhaps by another
33 * client), the MDS will revoke the conflicting client capabilities.
34 *
35 * In order for a client to cache an inode, it must hold a capability
36 * from at least one MDS server. When inodes are released, release
37 * notifications are batched and periodically sent en masse to the MDS
38 * cluster to release server state.
39 */
40
41
42/*
43 * Generate readable cap strings for debugging output.
44 */
45#define MAX_CAP_STR 20
46static char cap_str[MAX_CAP_STR][40];
47static DEFINE_SPINLOCK(cap_str_lock);
48static int last_cap_str;
49
50static char *gcap_string(char *s, int c)
51{
52 if (c & CEPH_CAP_GSHARED)
53 *s++ = 's';
54 if (c & CEPH_CAP_GEXCL)
55 *s++ = 'x';
56 if (c & CEPH_CAP_GCACHE)
57 *s++ = 'c';
58 if (c & CEPH_CAP_GRD)
59 *s++ = 'r';
60 if (c & CEPH_CAP_GWR)
61 *s++ = 'w';
62 if (c & CEPH_CAP_GBUFFER)
63 *s++ = 'b';
64 if (c & CEPH_CAP_GLAZYIO)
65 *s++ = 'l';
66 return s;
67}
68
69const char *ceph_cap_string(int caps)
70{
71 int i;
72 char *s;
73 int c;
74
75 spin_lock(&cap_str_lock);
76 i = last_cap_str++;
77 if (last_cap_str == MAX_CAP_STR)
78 last_cap_str = 0;
79 spin_unlock(&cap_str_lock);
80
81 s = cap_str[i];
82
83 if (caps & CEPH_CAP_PIN)
84 *s++ = 'p';
85
86 c = (caps >> CEPH_CAP_SAUTH) & 3;
87 if (c) {
88 *s++ = 'A';
89 s = gcap_string(s, c);
90 }
91
92 c = (caps >> CEPH_CAP_SLINK) & 3;
93 if (c) {
94 *s++ = 'L';
95 s = gcap_string(s, c);
96 }
97
98 c = (caps >> CEPH_CAP_SXATTR) & 3;
99 if (c) {
100 *s++ = 'X';
101 s = gcap_string(s, c);
102 }
103
104 c = caps >> CEPH_CAP_SFILE;
105 if (c) {
106 *s++ = 'F';
107 s = gcap_string(s, c);
108 }
109
110 if (s == cap_str[i])
111 *s++ = '-';
112 *s = 0;
113 return cap_str[i];
114}
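
/*
 * Example (reading aid, not in the original patch): for
 * caps == CEPH_CAP_PIN | CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD the
 * result is "pFcr": 'p' for the pin, 'F' opens the FILE section, then
 * gcap_string() emits 'c' (cache) and 'r' (read) in its fixed order.
 */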
115
116/*
117 * Cap reservations
118 *
119 * Maintain a global pool of preallocated struct ceph_caps, referenced
120 * by struct ceph_cap_reservation. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{
137 INIT_LIST_HEAD(&caps_list);
138 spin_lock_init(&caps_list_lock);
139}
140
141void ceph_caps_finalize(void)
142{
143 struct ceph_cap *cap;
144
145 spin_lock(&caps_list_lock);
146 while (!list_empty(&caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
148 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap);
150 }
151 caps_total_count = 0;
152 caps_avail_count = 0;
153 caps_use_count = 0;
154 caps_reserve_count = 0;
155 caps_min_count = 0;
156 spin_unlock(&caps_list_lock);
157}
158
159void ceph_adjust_min_caps(int delta)
160{
161 spin_lock(&caps_list_lock);
162 caps_min_count += delta;
163 BUG_ON(caps_min_count < 0);
164 spin_unlock(&caps_list_lock);
165}
166
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
168{
169 int i;
170 struct ceph_cap *cap;
171 int have;
172 int alloc = 0;
173 LIST_HEAD(newcaps);
174 int ret = 0;
175
176 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177
178 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock);
180 if (caps_avail_count >= need)
181 have = need;
182 else
183 have = caps_avail_count;
184 caps_avail_count -= have;
185 caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
187 caps_avail_count);
188 spin_unlock(&caps_list_lock);
189
190 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
192 if (!cap) {
193 ret = -ENOMEM;
194 goto out_alloc_count;
195 }
196 list_add(&cap->caps_item, &newcaps);
197 alloc++;
198 }
199 BUG_ON(have + alloc != need);
200
201 spin_lock(&caps_list_lock);
202 caps_total_count += alloc;
203 caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list);
205
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
207 caps_avail_count);
208 spin_unlock(&caps_list_lock);
209
210 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count,
213 caps_avail_count);
214 return 0;
215
216out_alloc_count:
217 /* we didn't manage to reserve as much as we needed */
218 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
219 ctx, need, have);
220 return ret;
221}
222
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
224{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) {
227 spin_lock(&caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count;
231 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count,
234 caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
236 caps_avail_count);
237 spin_unlock(&caps_list_lock);
238 }
239 return 0;
240}
241
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
243{
244 struct ceph_cap *cap = NULL;
245
246 /* temporary, until we do something about cap import/export */
247 if (!ctx)
248 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
249
250 spin_lock(&caps_list_lock);
251 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
252 ctx, ctx->count, caps_total_count, caps_use_count,
253 caps_reserve_count, caps_avail_count);
254 BUG_ON(!ctx->count);
255 BUG_ON(ctx->count > caps_reserve_count);
256 BUG_ON(list_empty(&caps_list));
257
258 ctx->count--;
259 caps_reserve_count--;
260 caps_use_count++;
261
262 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
263 list_del(&cap->caps_item);
264
265 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
266 caps_avail_count);
267 spin_unlock(&caps_list_lock);
268 return cap;
269}
270
271void ceph_put_cap(struct ceph_cap *cap)
272{
273 spin_lock(&caps_list_lock);
274 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
275 cap, caps_total_count, caps_use_count,
276 caps_reserve_count, caps_avail_count);
277 caps_use_count--;
278 /*
279 * Keep some preallocated caps around (caps_min_count), to
280 * avoid lots of free/alloc churn.
281 */
282 if (caps_avail_count >= caps_reserve_count + caps_min_count) {
283 caps_total_count--;
284 kmem_cache_free(ceph_cap_cachep, cap);
285 } else {
286 caps_avail_count++;
287 list_add(&cap->caps_item, &caps_list);
288 }
289
290 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
291 caps_avail_count);
292 spin_unlock(&caps_list_lock);
293}
294
295void ceph_reservation_status(struct ceph_client *client,
296 int *total, int *avail, int *used, int *reserved,
297 int *min)
298{
299 if (total)
300 *total = caps_total_count;
301 if (avail)
302 *avail = caps_avail_count;
303 if (used)
304 *used = caps_use_count;
305 if (reserved)
306 *reserved = caps_reserve_count;
307 if (min)
308 *min = caps_min_count;
309}
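
/*
 * Reading aid (not in the original patch): the reservation lifecycle the
 * functions above implement, with the invariant that is BUG_ON-checked
 * throughout:
 *
 *	caps_total_count == caps_use_count + caps_reserve_count
 *					   + caps_avail_count
 *
 * Sketch of a caller, e.g. an MDS reply handler:
 *
 *	struct ceph_cap_reservation ctx = { 0 };
 *
 *	ceph_reserve_caps(&ctx, need);	// preallocate before decoding the
 *					// reply; this may allocate and fail
 *	cap = get_cap(&ctx);		// cannot fail: drawn from ctx
 *	...				// cap in use; ceph_put_cap() later
 *	ceph_unreserve_caps(&ctx);	// return what we didn't consume
 */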
310
311/*
312 * Find ceph_cap for given mds, if any.
313 *
314 * Called with i_lock held.
315 */
316static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
317{
318 struct ceph_cap *cap;
319 struct rb_node *n = ci->i_caps.rb_node;
320
321 while (n) {
322 cap = rb_entry(n, struct ceph_cap, ci_node);
323 if (mds < cap->mds)
324 n = n->rb_left;
325 else if (mds > cap->mds)
326 n = n->rb_right;
327 else
328 return cap;
329 }
330 return NULL;
331}
332
333/*
334 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
335 * -1.
336 */
337static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
338{
339 struct ceph_cap *cap;
340 int mds = -1;
341 struct rb_node *p;
342
343 /* prefer mds with WR|WRBUFFER|EXCL caps */
344 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
345 cap = rb_entry(p, struct ceph_cap, ci_node);
346 mds = cap->mds;
347 if (mseq)
348 *mseq = cap->mseq;
349 if (cap->issued & (CEPH_CAP_FILE_WR |
350 CEPH_CAP_FILE_BUFFER |
351 CEPH_CAP_FILE_EXCL))
352 break;
353 }
354 return mds;
355}
356
357int ceph_get_cap_mds(struct inode *inode)
358{
359 int mds;
360 spin_lock(&inode->i_lock);
361 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
362 spin_unlock(&inode->i_lock);
363 return mds;
364}
365
366/*
367 * Called under i_lock.
368 */
369static void __insert_cap_node(struct ceph_inode_info *ci,
370 struct ceph_cap *new)
371{
372 struct rb_node **p = &ci->i_caps.rb_node;
373 struct rb_node *parent = NULL;
374 struct ceph_cap *cap = NULL;
375
376 while (*p) {
377 parent = *p;
378 cap = rb_entry(parent, struct ceph_cap, ci_node);
379 if (new->mds < cap->mds)
380 p = &(*p)->rb_left;
381 else if (new->mds > cap->mds)
382 p = &(*p)->rb_right;
383 else
384 BUG();
385 }
386
387 rb_link_node(&new->ci_node, parent, p);
388 rb_insert_color(&new->ci_node, &ci->i_caps);
389}
390
391/*
392 * (re)set cap hold timeouts, which control the delayed release
393 * of unused caps back to the MDS. Should be called on cap use.
394 */
395static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
396 struct ceph_inode_info *ci)
397{
398 struct ceph_mount_args *ma = mdsc->client->mount_args;
399
400 ci->i_hold_caps_min = round_jiffies(jiffies +
401 ma->caps_wanted_delay_min * HZ);
402 ci->i_hold_caps_max = round_jiffies(jiffies +
403 ma->caps_wanted_delay_max * HZ);
404 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
405 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
406}
407
408/*
409 * (Re)queue cap at the end of the delayed cap release list.
410 *
411 * If I_FLUSH is set, leave the inode at the front of the list.
412 *
413 * Caller holds i_lock
414 * -> we take mdsc->cap_delay_lock
415 */
416static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
417 struct ceph_inode_info *ci)
418{
419 __cap_set_timeouts(mdsc, ci);
420 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
421 ci->i_ceph_flags, ci->i_hold_caps_max);
422 if (!mdsc->stopping) {
423 spin_lock(&mdsc->cap_delay_lock);
424 if (!list_empty(&ci->i_cap_delay_list)) {
425 if (ci->i_ceph_flags & CEPH_I_FLUSH)
426 goto no_change;
427 list_del_init(&ci->i_cap_delay_list);
428 }
429 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
430no_change:
431 spin_unlock(&mdsc->cap_delay_lock);
432 }
433}
434
435/*
436 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
437 * indicating we should send a cap message to flush dirty metadata
438 * asap, and move to the front of the delayed cap list.
439 */
440static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
441 struct ceph_inode_info *ci)
442{
443 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
444 spin_lock(&mdsc->cap_delay_lock);
445 ci->i_ceph_flags |= CEPH_I_FLUSH;
446 if (!list_empty(&ci->i_cap_delay_list))
447 list_del_init(&ci->i_cap_delay_list);
448 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
449 spin_unlock(&mdsc->cap_delay_lock);
450}
451
452/*
453 * Cancel delayed work on cap.
454 *
455 * Caller must hold i_lock.
456 */
457static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
458 struct ceph_inode_info *ci)
459{
460 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
461 if (list_empty(&ci->i_cap_delay_list))
462 return;
463 spin_lock(&mdsc->cap_delay_lock);
464 list_del_init(&ci->i_cap_delay_list);
465 spin_unlock(&mdsc->cap_delay_lock);
466}
467
468/*
469 * Common issue checks for add_cap, handle_cap_grant.
470 */
471static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
472 unsigned issued)
473{
474 unsigned had = __ceph_caps_issued(ci, NULL);
475
476 /*
477 * Each time we receive FILE_CACHE anew, we increment
478 * i_rdcache_gen.
479 */
480 if ((issued & CEPH_CAP_FILE_CACHE) &&
481 (had & CEPH_CAP_FILE_CACHE) == 0)
482 ci->i_rdcache_gen++;
483
484 /*
485 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
486 * don't know what happened to this directory while we didn't
487 * have the cap.
488 */
489 if ((issued & CEPH_CAP_FILE_SHARED) &&
490 (had & CEPH_CAP_FILE_SHARED) == 0) {
491 ci->i_shared_gen++;
492 if (S_ISDIR(ci->vfs_inode.i_mode)) {
493 dout(" marking %p NOT complete\n", &ci->vfs_inode);
494 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
495 }
496 }
497}
498
499/*
500 * Add a capability under the given MDS session.
501 *
502 * Caller should hold session snap_rwsem (read) and s_mutex.
503 *
504 * @fmode is the open file mode, if we are opening a file, otherwise
505 * it is < 0. (This is so we can atomically add the cap and add an
506 * open file reference to it.)
507 */
508int ceph_add_cap(struct inode *inode,
509 struct ceph_mds_session *session, u64 cap_id,
510 int fmode, unsigned issued, unsigned wanted,
511 unsigned seq, unsigned mseq, u64 realmino, int flags,
512 struct ceph_cap_reservation *caps_reservation)
513{
514 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
515 struct ceph_inode_info *ci = ceph_inode(inode);
516 struct ceph_cap *new_cap = NULL;
517 struct ceph_cap *cap;
518 int mds = session->s_mds;
519 int actual_wanted;
520
521 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
522 session->s_mds, cap_id, ceph_cap_string(issued), seq);
523
524 /*
525 * If we are opening the file, include file mode wanted bits
526 * in wanted.
527 */
528 if (fmode >= 0)
529 wanted |= ceph_caps_for_mode(fmode);
530
531retry:
532 spin_lock(&inode->i_lock);
533 cap = __get_cap_for_mds(ci, mds);
534 if (!cap) {
535 if (new_cap) {
536 cap = new_cap;
537 new_cap = NULL;
538 } else {
539 spin_unlock(&inode->i_lock);
540 new_cap = get_cap(caps_reservation);
541 if (new_cap == NULL)
542 return -ENOMEM;
543 goto retry;
544 }
545
546 cap->issued = 0;
547 cap->implemented = 0;
548 cap->mds = mds;
549 cap->mds_wanted = 0;
550
551 cap->ci = ci;
552 __insert_cap_node(ci, cap);
553
554 /* clear out old exporting info? (i.e. on cap import) */
555 if (ci->i_cap_exporting_mds == mds) {
556 ci->i_cap_exporting_issued = 0;
557 ci->i_cap_exporting_mseq = 0;
558 ci->i_cap_exporting_mds = -1;
559 }
560
561 /* add to session cap list */
562 cap->session = session;
563 spin_lock(&session->s_cap_lock);
564 list_add_tail(&cap->session_caps, &session->s_caps);
565 session->s_nr_caps++;
566 spin_unlock(&session->s_cap_lock);
567 }
568
569 if (!ci->i_snap_realm) {
570 /*
571 * add this inode to the appropriate snap realm
572 */
573 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
574 realmino);
575 if (realm) {
576 ceph_get_snap_realm(mdsc, realm);
577 spin_lock(&realm->inodes_with_caps_lock);
578 ci->i_snap_realm = realm;
579 list_add(&ci->i_snap_realm_item,
580 &realm->inodes_with_caps);
581 spin_unlock(&realm->inodes_with_caps_lock);
582 } else {
583 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
584 realmino);
585 }
586 }
587
588 __check_cap_issue(ci, cap, issued);
589
590 /*
591 * If we are issued caps we don't want, or the mds' wanted
592 * value appears to be off, queue a check so we'll release
593 * later and/or update the mds wanted value.
594 */
595 actual_wanted = __ceph_caps_wanted(ci);
596 if ((wanted & ~actual_wanted) ||
597 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
598 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
599 ceph_cap_string(issued), ceph_cap_string(wanted),
600 ceph_cap_string(actual_wanted));
601 __cap_delay_requeue(mdsc, ci);
602 }
603
604 if (flags & CEPH_CAP_FLAG_AUTH)
605 ci->i_auth_cap = cap;
606 else if (ci->i_auth_cap == cap)
607 ci->i_auth_cap = NULL;
608
609 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
610 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
611 ceph_cap_string(issued|cap->issued), seq, mds);
612 cap->cap_id = cap_id;
613 cap->issued = issued;
614 cap->implemented |= issued;
615 cap->mds_wanted |= wanted;
616 cap->seq = seq;
617 cap->issue_seq = seq;
618 cap->mseq = mseq;
619 cap->cap_gen = session->s_cap_gen;
620
621 if (fmode >= 0)
622 __ceph_get_fmode(ci, fmode);
623 spin_unlock(&inode->i_lock);
624 wake_up(&ci->i_cap_wq);
625 return 0;
626}
627
628/*
629 * Return true if cap has not timed out and belongs to the current
630 * generation of the MDS session (i.e. has not gone 'stale' due to
631 * us losing touch with the mds).
632 */
633static int __cap_is_valid(struct ceph_cap *cap)
634{
635 unsigned long ttl;
636 u32 gen;
637
638 spin_lock(&cap->session->s_cap_lock);
639 gen = cap->session->s_cap_gen;
640 ttl = cap->session->s_cap_ttl;
641 spin_unlock(&cap->session->s_cap_lock);
642
643 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
644 dout("__cap_is_valid %p cap %p issued %s "
645 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
646 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
647 return 0;
648 }
649
650 return 1;
651}
652
653/*
654 * Return set of valid cap bits issued to us. Note that caps time
655 * out, and may be invalidated in bulk if the client session times out
656 * and session->s_cap_gen is bumped.
657 */
658int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
659{
660 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
661 struct ceph_cap *cap;
662 struct rb_node *p;
663
664 if (implemented)
665 *implemented = 0;
666 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
667 cap = rb_entry(p, struct ceph_cap, ci_node);
668 if (!__cap_is_valid(cap))
669 continue;
670 dout("__ceph_caps_issued %p cap %p issued %s\n",
671 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
672 have |= cap->issued;
673 if (implemented)
674 *implemented |= cap->implemented;
675 }
676 return have;
677}
678
679/*
680 * Get cap bits issued by caps other than @ocap
681 */
682int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
683{
684 int have = ci->i_snap_caps;
685 struct ceph_cap *cap;
686 struct rb_node *p;
687
688 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
689 cap = rb_entry(p, struct ceph_cap, ci_node);
690 if (cap == ocap)
691 continue;
692 if (!__cap_is_valid(cap))
693 continue;
694 have |= cap->issued;
695 }
696 return have;
697}
698
699/*
700 * Move a cap to the end of the LRU (oldest caps at list head, newest
701 * at list tail).
702 */
703static void __touch_cap(struct ceph_cap *cap)
704{
705 struct ceph_mds_session *s = cap->session;
706
707 spin_lock(&s->s_cap_lock);
708 if (s->s_cap_iterator == NULL) {
709 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
710 s->s_mds);
711 list_move_tail(&cap->session_caps, &s->s_caps);
712 } else {
713 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
714 &cap->ci->vfs_inode, cap, s->s_mds);
715 }
716 spin_unlock(&s->s_cap_lock);
717}
718
719/*
720 * Check if we hold the given mask. If so, move the cap(s) to the
721 * tail (most recently used end) of their respective LRUs. (This is
722 * the preferred way for callers to check for caps they want.)
723 */
724int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
725{
726 struct ceph_cap *cap;
727 struct rb_node *p;
728 int have = ci->i_snap_caps;
729
730 if ((have & mask) == mask) {
731 dout("__ceph_caps_issued_mask %p snap issued %s"
732 " (mask %s)\n", &ci->vfs_inode,
733 ceph_cap_string(have),
734 ceph_cap_string(mask));
735 return 1;
736 }
737
738 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
739 cap = rb_entry(p, struct ceph_cap, ci_node);
740 if (!__cap_is_valid(cap))
741 continue;
742 if ((cap->issued & mask) == mask) {
743 dout("__ceph_caps_issued_mask %p cap %p issued %s"
744 " (mask %s)\n", &ci->vfs_inode, cap,
745 ceph_cap_string(cap->issued),
746 ceph_cap_string(mask));
747 if (touch)
748 __touch_cap(cap);
749 return 1;
750 }
751
752 /* does a combination of caps satisfy mask? */
753 have |= cap->issued;
754 if ((have & mask) == mask) {
755 dout("__ceph_caps_issued_mask %p combo issued %s"
756 " (mask %s)\n", &ci->vfs_inode,
757 ceph_cap_string(cap->issued),
758 ceph_cap_string(mask));
759 if (touch) {
760 struct rb_node *q;
761
762 /* touch this + preceding caps */
763 __touch_cap(cap);
764 for (q = rb_first(&ci->i_caps); q != p;
765 q = rb_next(q)) {
766 cap = rb_entry(q, struct ceph_cap,
767 ci_node);
768 if (!__cap_is_valid(cap))
769 continue;
770 __touch_cap(cap);
771 }
772 }
773 return 1;
774 }
775 }
776
777 return 0;
778}
779
780/*
781 * Return true if mask caps are currently being revoked by an MDS.
782 */
783int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
784{
785 struct inode *inode = &ci->vfs_inode;
786 struct ceph_cap *cap;
787 struct rb_node *p;
788 int ret = 0;
789
790 spin_lock(&inode->i_lock);
791 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
792 cap = rb_entry(p, struct ceph_cap, ci_node);
793 if (__cap_is_valid(cap) &&
794 (cap->implemented & ~cap->issued & mask)) {
795 ret = 1;
796 break;
797 }
798 }
799 spin_unlock(&inode->i_lock);
800 dout("ceph_caps_revoking %p %s = %d\n", inode,
801 ceph_cap_string(mask), ret);
802 return ret;
803}
804
805int __ceph_caps_used(struct ceph_inode_info *ci)
806{
807 int used = 0;
808 if (ci->i_pin_ref)
809 used |= CEPH_CAP_PIN;
810 if (ci->i_rd_ref)
811 used |= CEPH_CAP_FILE_RD;
812 if (ci->i_rdcache_ref || ci->i_rdcache_gen)
813 used |= CEPH_CAP_FILE_CACHE;
814 if (ci->i_wr_ref)
815 used |= CEPH_CAP_FILE_WR;
816 if (ci->i_wrbuffer_ref)
817 used |= CEPH_CAP_FILE_BUFFER;
818 return used;
819}
820
821/*
822 * wanted, by virtue of open file modes
823 */
824int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
825{
826 int want = 0;
827 int mode;
828 for (mode = 0; mode < 4; mode++)
829 if (ci->i_nr_by_mode[mode])
830 want |= ceph_caps_for_mode(mode);
831 return want;
832}
833
834/*
835 * Return caps we have registered with the MDS(s) as 'wanted'.
836 */
837int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
838{
839 struct ceph_cap *cap;
840 struct rb_node *p;
841 int mds_wanted = 0;
842
843 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
844 cap = rb_entry(p, struct ceph_cap, ci_node);
845 if (!__cap_is_valid(cap))
846 continue;
847 mds_wanted |= cap->mds_wanted;
848 }
849 return mds_wanted;
850}
851
852/*
853 * called under i_lock
854 */
855static int __ceph_is_any_caps(struct ceph_inode_info *ci)
856{
857 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
858}
859
860/*
861 * Remove a cap. Take steps to deal with a racing iterate_session_caps.
862 *
863 * caller should hold i_lock.
864 * caller will not hold session s_mutex if called from destroy_inode.
865 */
866void __ceph_remove_cap(struct ceph_cap *cap)
867{
868 struct ceph_mds_session *session = cap->session;
869 struct ceph_inode_info *ci = cap->ci;
870 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
871 int removed = 0;
872
873 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
874
875 /* remove from session list */
876 spin_lock(&session->s_cap_lock);
877 if (session->s_cap_iterator == cap) {
878 /* not yet, we are iterating over this very cap */
879 dout("__ceph_remove_cap delaying %p removal from session %p\n",
880 cap, cap->session);
881 } else {
882 list_del_init(&cap->session_caps);
883 session->s_nr_caps--;
884 cap->session = NULL;
885 removed = 1;
886 }
887 /* protect backpointer with s_cap_lock: see iterate_session_caps */
888 cap->ci = NULL;
889 spin_unlock(&session->s_cap_lock);
890
891 /* remove from inode list */
892 rb_erase(&cap->ci_node, &ci->i_caps);
893 if (ci->i_auth_cap == cap)
894 ci->i_auth_cap = NULL;
895
896 if (removed)
897 ceph_put_cap(cap);
898
899 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
900 struct ceph_snap_realm *realm = ci->i_snap_realm;
901 spin_lock(&realm->inodes_with_caps_lock);
902 list_del_init(&ci->i_snap_realm_item);
903 ci->i_snap_realm_counter++;
904 ci->i_snap_realm = NULL;
905 spin_unlock(&realm->inodes_with_caps_lock);
906 ceph_put_snap_realm(mdsc, realm);
907 }
908 if (!__ceph_is_any_real_caps(ci))
909 __cap_delay_cancel(mdsc, ci);
910}
911
912/*
913 * Build and send a cap message to the given MDS.
914 *
915 * Caller should be holding s_mutex.
916 */
917static int send_cap_msg(struct ceph_mds_session *session,
918 u64 ino, u64 cid, int op,
919 int caps, int wanted, int dirty,
920 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
921 u64 size, u64 max_size,
922 struct timespec *mtime, struct timespec *atime,
923 u64 time_warp_seq,
924 uid_t uid, gid_t gid, mode_t mode,
925 u64 xattr_version,
926 struct ceph_buffer *xattrs_buf,
927 u64 follows)
928{
929 struct ceph_mds_caps *fc;
930 struct ceph_msg *msg;
931
932 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
933 " seq %u/%u mseq %u follows %lld size %llu/%llu"
934 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
935 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
936 ceph_cap_string(dirty),
937 seq, issue_seq, mseq, follows, size, max_size,
938 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
939
940 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
941 if (IS_ERR(msg))
942 return PTR_ERR(msg);
943
944 msg->hdr.tid = cpu_to_le64(flush_tid);
945
946 fc = msg->front.iov_base;
947 memset(fc, 0, sizeof(*fc));
948
949 fc->cap_id = cpu_to_le64(cid);
950 fc->op = cpu_to_le32(op);
951 fc->seq = cpu_to_le32(seq);
952 fc->issue_seq = cpu_to_le32(issue_seq);
953 fc->migrate_seq = cpu_to_le32(mseq);
954 fc->caps = cpu_to_le32(caps);
955 fc->wanted = cpu_to_le32(wanted);
956 fc->dirty = cpu_to_le32(dirty);
957 fc->ino = cpu_to_le64(ino);
958 fc->snap_follows = cpu_to_le64(follows);
959
960 fc->size = cpu_to_le64(size);
961 fc->max_size = cpu_to_le64(max_size);
962 if (mtime)
963 ceph_encode_timespec(&fc->mtime, mtime);
964 if (atime)
965 ceph_encode_timespec(&fc->atime, atime);
966 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
967
968 fc->uid = cpu_to_le32(uid);
969 fc->gid = cpu_to_le32(gid);
970 fc->mode = cpu_to_le32(mode);
971
972 fc->xattr_version = cpu_to_le64(xattr_version);
973 if (xattrs_buf) {
974 msg->middle = ceph_buffer_get(xattrs_buf);
975 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
976 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
977 }
978
979 ceph_con_send(&session->s_con, msg);
980 return 0;
981}
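
/*
 * All multi-byte fields above are converted with cpu_to_le*() to match
 * the little-endian ceph wire format; xattr data, when present, rides
 * in the message "middle" section rather than in the front payload.
 */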
982
983/*
984 * Queue cap releases when an inode is dropped from our cache. Since
985 * the inode is about to be destroyed, there is no need for i_lock.
986 */
987void ceph_queue_caps_release(struct inode *inode)
988{
989 struct ceph_inode_info *ci = ceph_inode(inode);
990 struct rb_node *p;
991
992 p = rb_first(&ci->i_caps);
993 while (p) {
994 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
995 struct ceph_mds_session *session = cap->session;
996 struct ceph_msg *msg;
997 struct ceph_mds_cap_release *head;
998 struct ceph_mds_cap_item *item;
999
1000 spin_lock(&session->s_cap_lock);
1001 BUG_ON(!session->s_num_cap_releases);
1002 msg = list_first_entry(&session->s_cap_releases,
1003 struct ceph_msg, list_head);
1004
1005 dout(" adding %p release to mds%d msg %p (%d left)\n",
1006 inode, session->s_mds, msg, session->s_num_cap_releases);
1007
1008 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1009 head = msg->front.iov_base;
1010 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1011 item = msg->front.iov_base + msg->front.iov_len;
1012 item->ino = cpu_to_le64(ceph_ino(inode));
1013 item->cap_id = cpu_to_le64(cap->cap_id);
1014 item->migrate_seq = cpu_to_le32(cap->mseq);
1015 item->seq = cpu_to_le32(cap->issue_seq);
1016
1017 session->s_num_cap_releases--;
1018
1019 msg->front.iov_len += sizeof(*item);
1020 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1021 dout(" release msg %p full\n", msg);
1022 list_move_tail(&msg->list_head,
1023 &session->s_cap_releases_done);
1024 } else {
1025 dout(" release msg %p at %d/%d (%d)\n", msg,
1026 (int)le32_to_cpu(head->num),
1027 (int)CEPH_CAPS_PER_RELEASE,
1028 (int)msg->front.iov_len);
1029 }
1030 spin_unlock(&session->s_cap_lock);
1031 p = rb_next(p);
1032 __ceph_remove_cap(cap);
1033 }
1034}
1035
1036/*
1037 * Send a cap msg on the given inode. Update our caps state, then
1038 * drop i_lock and send the message.
1039 *
1040 * Make note of the max_size reported/requested from the mds, and of
1041 * revoked caps that have now been implemented.
1042 *
1043 * Make a half-hearted attempt to invalidate the page cache if we are
1044 * dropping RDCACHE. Note that this will leave behind locked pages
1045 * that we'll then need to deal with elsewhere.
1046 *
1047 * Return non-zero if the release was delayed or we experienced an
1048 * error, in which case the caller should requeue + retry later.
1049 *
1050 * called with i_lock, then drops it.
1051 * caller should hold snap_rwsem (read), s_mutex.
1052 */
1053static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1054 int op, int used, int want, int retain, int flushing,
1055 unsigned *pflush_tid)
1056 __releases(cap->ci->vfs_inode->i_lock)
1057{
1058 struct ceph_inode_info *ci = cap->ci;
1059 struct inode *inode = &ci->vfs_inode;
1060 u64 cap_id = cap->cap_id;
1061 int held, revoking, dropping, keep;
1062 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1063 u64 size, max_size;
1064 struct timespec mtime, atime;
1065 int wake = 0;
1066 mode_t mode;
1067 uid_t uid;
1068 gid_t gid;
1069 struct ceph_mds_session *session;
1070 u64 xattr_version = 0;
1071 int delayed = 0;
1072 u64 flush_tid = 0;
1073 int i;
1074 int ret;
1075
1076 held = cap->issued | cap->implemented;
1077 revoking = cap->implemented & ~cap->issued;
1078 retain &= ~revoking;
1079 dropping = cap->issued & ~retain;
1080
1081 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1082 inode, cap, cap->session,
1083 ceph_cap_string(held), ceph_cap_string(held & retain),
1084 ceph_cap_string(revoking));
1085 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1086
1087 session = cap->session;
1088
1089 /* don't release wanted unless we've waited a bit. */
1090 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1091 time_before(jiffies, ci->i_hold_caps_min)) {
1092 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1093 ceph_cap_string(cap->issued),
1094 ceph_cap_string(cap->issued & retain),
1095 ceph_cap_string(cap->mds_wanted),
1096 ceph_cap_string(want));
1097 want |= cap->mds_wanted;
1098 retain |= cap->issued;
1099 delayed = 1;
1100 }
1101 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1102
1103 cap->issued &= retain; /* drop bits we don't want */
1104 if (cap->implemented & ~cap->issued) {
1105 /*
1106 * Wake up any waiters on wanted -> needed transition.
1107 * This is due to the weird transition from buffered
1108 * to sync IO... we need to flush dirty pages _before_
1109 * allowing sync writes to avoid reordering.
1110 */
1111 wake = 1;
1112 }
1113 cap->implemented &= cap->issued | used;
1114 cap->mds_wanted = want;
1115
1116 if (flushing) {
1117 /*
1118 * assign a tid for flush operations so we can avoid
1119 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
1120 * clean type races. track latest tid for every bit
1121 * so we can handle flush AxFw, flush Fw, and have the
1122 * first ack clean Ax.
1123 */
1124 flush_tid = ++ci->i_cap_flush_last_tid;
1125 if (pflush_tid)
1126 *pflush_tid = flush_tid;
1127 dout(" cap_flush_tid %d\n", (int)flush_tid);
1128 for (i = 0; i < CEPH_CAP_BITS; i++)
1129 if (flushing & (1 << i))
1130 ci->i_cap_flush_tid[i] = flush_tid;
1131 }
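
	/*
	 * Worked example of the per-bit tids above: flushing Ax+Fw
	 * assigns, say, tid 1 to both bits; a later flush of Fw alone
	 * bumps that bit's i_cap_flush_tid entry to 2. The ack for
	 * tid 1 then cleans only Ax (still recorded at tid 1), and Fw
	 * stays flushing until the tid-2 ack (see handle_cap_flush_ack).
	 */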
1132
1133 keep = cap->implemented;
1134 seq = cap->seq;
1135 issue_seq = cap->issue_seq;
1136 mseq = cap->mseq;
1137 size = inode->i_size;
1138 ci->i_reported_size = size;
1139 max_size = ci->i_wanted_max_size;
1140 ci->i_requested_max_size = max_size;
1141 mtime = inode->i_mtime;
1142 atime = inode->i_atime;
1143 time_warp_seq = ci->i_time_warp_seq;
1144 follows = ci->i_snap_realm->cached_context->seq;
1145 uid = inode->i_uid;
1146 gid = inode->i_gid;
1147 mode = inode->i_mode;
1148
1149 if (dropping & CEPH_CAP_XATTR_EXCL) {
1150 __ceph_build_xattrs_blob(ci);
1151 xattr_version = ci->i_xattrs.version + 1;
1152 }
1153
1154 spin_unlock(&inode->i_lock);
1155
1156 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1157 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1158 size, max_size, &mtime, &atime, time_warp_seq,
1159 uid, gid, mode,
1160 xattr_version,
1161 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1162 follows);
1163 if (ret < 0) {
1164 dout("error sending cap msg, must requeue %p\n", inode);
1165 delayed = 1;
1166 }
1167
1168 if (wake)
1169 wake_up(&ci->i_cap_wq);
1170
1171 return delayed;
1172}
1173
1174/*
1175 * When a snapshot is taken, clients accumulate dirty metadata on
1176 * inodes with capabilities in ceph_cap_snaps to describe the file
1177 * state at the time the snapshot was taken. This must be flushed
1178 * asynchronously back to the MDS once sync writes complete and dirty
1179 * data is written out.
1180 *
1181 * Called under i_lock. Takes s_mutex as needed.
1182 */
1183void __ceph_flush_snaps(struct ceph_inode_info *ci,
1184 struct ceph_mds_session **psession)
1185{
1186 struct inode *inode = &ci->vfs_inode;
1187 int mds;
1188 struct ceph_cap_snap *capsnap;
1189 u32 mseq;
1190 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1191 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1192 session->s_mutex */
1193 u64 next_follows = 0; /* keep track of how far we've gotten through the
1194 i_cap_snaps list, and skip these entries next time
1195 around to avoid an infinite loop */
1196
1197 if (psession)
1198 session = *psession;
1199
1200 dout("__flush_snaps %p\n", inode);
1201retry:
1202 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1203 /* avoid an infinite loop after retry */
1204 if (capsnap->follows < next_follows)
1205 continue;
1206 /*
1207 * we need to wait for sync writes to complete and for dirty
1208 * pages to be written out.
1209 */
1210 if (capsnap->dirty_pages || capsnap->writing)
1211 continue;
1212
1213 /*
1214 * if cap writeback already occurred, we should have dropped
1215 * the capsnap in ceph_put_wrbuffer_cap_refs.
1216 */
1217 BUG_ON(capsnap->dirty == 0);
1218
1219 /* pick mds, take s_mutex */
1220 mds = __ceph_get_cap_mds(ci, &mseq);
1221 if (session && session->s_mds != mds) {
1222 dout("oops, wrong session %p mutex\n", session);
1223 mutex_unlock(&session->s_mutex);
1224 ceph_put_mds_session(session);
1225 session = NULL;
1226 }
1227 if (!session) {
1228 spin_unlock(&inode->i_lock);
1229 mutex_lock(&mdsc->mutex);
1230 session = __ceph_lookup_mds_session(mdsc, mds);
1231 mutex_unlock(&mdsc->mutex);
1232 if (session) {
1233 dout("inverting session/ino locks on %p\n",
1234 session);
1235 mutex_lock(&session->s_mutex);
1236 }
1237 /*
1238 * if session == NULL, we raced against a cap
1239 * deletion. retry, and we'll get a better
1240 * @mds value next time.
1241 */
1242 spin_lock(&inode->i_lock);
1243 goto retry;
1244 }
1245
1246 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1247 atomic_inc(&capsnap->nref);
1248 if (!list_empty(&capsnap->flushing_item))
1249 list_del_init(&capsnap->flushing_item);
1250 list_add_tail(&capsnap->flushing_item,
1251 &session->s_cap_snaps_flushing);
1252 spin_unlock(&inode->i_lock);
1253
1254 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
1255 inode, capsnap, next_follows, capsnap->size);
1256 send_cap_msg(session, ceph_vino(inode).ino, 0,
1257 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1258 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1259 capsnap->size, 0,
1260 &capsnap->mtime, &capsnap->atime,
1261 capsnap->time_warp_seq,
1262 capsnap->uid, capsnap->gid, capsnap->mode,
1263 0, NULL,
1264 capsnap->follows);
1265
1266 next_follows = capsnap->follows + 1;
1267 ceph_put_cap_snap(capsnap);
1268
1269 spin_lock(&inode->i_lock);
1270 goto retry;
1271 }
1272
1273 /* we flushed them all; remove this inode from the queue */
1274 spin_lock(&mdsc->snap_flush_lock);
1275 list_del_init(&ci->i_snap_flush_item);
1276 spin_unlock(&mdsc->snap_flush_lock);
1277
1278 if (psession)
1279 *psession = session;
1280 else if (session) {
1281 mutex_unlock(&session->s_mutex);
1282 ceph_put_mds_session(session);
1283 }
1284}
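
/*
 * Lock-ordering note: s_mutex must be taken before i_lock, which is why
 * the loop above drops i_lock before grabbing a session mutex and then
 * restarts the scan from the top; next_follows keeps the retry from
 * revisiting capsnaps that have already been flushed.
 */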
1285
1286static void ceph_flush_snaps(struct ceph_inode_info *ci)
1287{
1288 struct inode *inode = &ci->vfs_inode;
1289
1290 spin_lock(&inode->i_lock);
1291 __ceph_flush_snaps(ci, NULL);
1292 spin_unlock(&inode->i_lock);
1293}
1294
1295/*
1296 * Mark caps dirty. If inode is newly dirty, add to the global dirty
1297 * list.
1298 */
1299void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1300{
1301 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
1302 struct inode *inode = &ci->vfs_inode;
1303 int was = ci->i_dirty_caps;
1304 int dirty = 0;
1305
1306 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1307 ceph_cap_string(mask), ceph_cap_string(was),
1308 ceph_cap_string(was | mask));
1309 ci->i_dirty_caps |= mask;
1310 if (was == 0) {
1311 dout(" inode %p now dirty\n", &ci->vfs_inode);
1312 BUG_ON(!list_empty(&ci->i_dirty_item));
1313 spin_lock(&mdsc->cap_dirty_lock);
1314 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1315 spin_unlock(&mdsc->cap_dirty_lock);
1316 if (ci->i_flushing_caps == 0) {
1317 igrab(inode);
1318 dirty |= I_DIRTY_SYNC;
1319 }
1320 }
1321 BUG_ON(list_empty(&ci->i_dirty_item));
1322 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1323 (mask & CEPH_CAP_FILE_BUFFER))
1324 dirty |= I_DIRTY_DATASYNC;
1325 if (dirty)
1326 __mark_inode_dirty(inode, dirty);
1327 __cap_delay_requeue(mdsc, ci);
1328}
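
/*
 * The igrab() above pins the inode while it sits on mdsc->cap_dirty;
 * the matching iput() happens once a flush ack clears the last
 * dirty/flushing bits (see handle_cap_flush_ack below).
 */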
1329
1330/*
1331 * Add dirty inode to the flushing list. Assign a seq number so we
1332 * can wait for caps to flush without starving.
1333 *
1334 * Called under i_lock.
1335 */
1336static int __mark_caps_flushing(struct inode *inode,
1337 struct ceph_mds_session *session)
1338{
1339 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1340 struct ceph_inode_info *ci = ceph_inode(inode);
1341 int flushing;
1342
1343 BUG_ON(ci->i_dirty_caps == 0);
1344 BUG_ON(list_empty(&ci->i_dirty_item));
1345
1346 flushing = ci->i_dirty_caps;
1347 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1348 ceph_cap_string(flushing),
1349 ceph_cap_string(ci->i_flushing_caps),
1350 ceph_cap_string(ci->i_flushing_caps | flushing));
1351 ci->i_flushing_caps |= flushing;
1352 ci->i_dirty_caps = 0;
1353 dout(" inode %p now !dirty\n", inode);
1354
1355 spin_lock(&mdsc->cap_dirty_lock);
1356 list_del_init(&ci->i_dirty_item);
1357
1358 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1359 if (list_empty(&ci->i_flushing_item)) {
1360 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1361 mdsc->num_cap_flushing++;
1362 dout(" inode %p now flushing seq %lld\n", inode,
1363 ci->i_cap_flush_seq);
1364 } else {
1365 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1366 dout(" inode %p now flushing (more) seq %lld\n", inode,
1367 ci->i_cap_flush_seq);
1368 }
1369 spin_unlock(&mdsc->cap_dirty_lock);
1370
1371 return flushing;
1372}
1373
1374/*
1375 * try to invalidate mapping pages without blocking.
1376 */
1377static int mapping_is_empty(struct address_space *mapping)
1378{
1379 struct page *page = find_get_page(mapping, 0);
1380
1381 if (!page)
1382 return 1;
1383
1384 put_page(page);
1385 return 0;
1386}
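
/*
 * Note that only page index 0 is sampled; a mapping whose only pages
 * live at higher offsets would be misreported as empty. It is a cheap
 * heuristic, backed up by the i_rdcache_gen check below.
 */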
1387
1388static int try_nonblocking_invalidate(struct inode *inode)
1389{
1390 struct ceph_inode_info *ci = ceph_inode(inode);
1391 u32 invalidating_gen = ci->i_rdcache_gen;
1392
1393 spin_unlock(&inode->i_lock);
1394 invalidate_mapping_pages(&inode->i_data, 0, -1);
1395 spin_lock(&inode->i_lock);
1396
1397 if (mapping_is_empty(&inode->i_data) &&
1398 invalidating_gen == ci->i_rdcache_gen) {
1399 /* success. */
1400 dout("try_nonblocking_invalidate %p success\n", inode);
1401 ci->i_rdcache_gen = 0;
1402 ci->i_rdcache_revoking = 0;
1403 return 0;
1404 }
1405 dout("try_nonblocking_invalidate %p failed\n", inode);
1406 return -1;
1407}
1408
1409/*
1410 * Swiss army knife function to examine currently used and wanted
1411 * versus held caps. Release, flush, ack revoked caps to mds as
1412 * appropriate.
1413 *
1414 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1415 * cap release further.
1416 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1417 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1418 * further delay.
1419 */
1420void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1421 struct ceph_mds_session *session)
1422 __releases(session->s_mutex)
1423{
1424 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1425 struct ceph_mds_client *mdsc = &client->mdsc;
1426 struct inode *inode = &ci->vfs_inode;
1427 struct ceph_cap *cap;
1428 int file_wanted, used;
1429 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1430 int issued, implemented, want, retain, revoking, flushing = 0;
1431 int mds = -1; /* keep track of how far we've gone through i_caps list
1432 to avoid an infinite loop on retry */
1433 struct rb_node *p;
1434 int tried_invalidate = 0;
1435 int delayed = 0, sent = 0, force_requeue = 0, num;
1436 int queue_invalidate = 0;
1437 int is_delayed = flags & CHECK_CAPS_NODELAY;
1438
1439 /* if we are unmounting, flush any unused caps immediately. */
1440 if (mdsc->stopping)
1441 is_delayed = 1;
1442
1443 spin_lock(&inode->i_lock);
1444
1445 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1446 flags |= CHECK_CAPS_FLUSH;
1447
1448 /* flush snaps first time around only */
1449 if (!list_empty(&ci->i_cap_snaps))
1450 __ceph_flush_snaps(ci, &session);
1451 goto retry_locked;
1452retry:
1453 spin_lock(&inode->i_lock);
1454retry_locked:
1455 file_wanted = __ceph_caps_file_wanted(ci);
1456 used = __ceph_caps_used(ci);
1457 want = file_wanted | used;
1458 issued = __ceph_caps_issued(ci, &implemented);
1459 revoking = implemented & ~issued;
1460
1461 retain = want | CEPH_CAP_PIN;
1462 if (!mdsc->stopping && inode->i_nlink > 0) {
1463 if (want) {
1464 retain |= CEPH_CAP_ANY; /* be greedy */
1465 } else {
1466 retain |= CEPH_CAP_ANY_SHARED;
1467 /*
1468 * keep RD only if we didn't have the file open RW,
1469 * because then the mds would revoke it anyway to
1470 * journal max_size=0.
1471 */
1472 if (ci->i_max_size == 0)
1473 retain |= CEPH_CAP_ANY_RD;
1474 }
1475 }
1476
1477 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1478 " issued %s revoking %s retain %s %s%s%s\n", inode,
1479 ceph_cap_string(file_wanted),
1480 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1481 ceph_cap_string(ci->i_flushing_caps),
1482 ceph_cap_string(issued), ceph_cap_string(revoking),
1483 ceph_cap_string(retain),
1484 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1485 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1486 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1487
1488 /*
1489 * If we no longer need to hold on to our old caps, and we may
1490 * have cached pages that we don't want, then try to invalidate.
1491 * If we fail, it's because pages are locked... try again later.
1492 */
1493 if ((!is_delayed || mdsc->stopping) &&
1494 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1495 ci->i_rdcache_gen && /* may have cached pages */
1496 (file_wanted == 0 || /* no open files */
1497 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
1498 !tried_invalidate) {
1499 dout("check_caps trying to invalidate on %p\n", inode);
1500 if (try_nonblocking_invalidate(inode) < 0) {
1501 if (revoking & CEPH_CAP_FILE_CACHE) {
1502 dout("check_caps queuing invalidate\n");
1503 queue_invalidate = 1;
1504 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1505 } else {
1506 dout("check_caps failed to invalidate pages\n");
1507 /* we failed to invalidate pages. check these
1508 caps again later. */
1509 force_requeue = 1;
1510 __cap_set_timeouts(mdsc, ci);
1511 }
1512 }
1513 tried_invalidate = 1;
1514 goto retry_locked;
1515 }
1516
1517 num = 0;
1518 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1519 cap = rb_entry(p, struct ceph_cap, ci_node);
1520 num++;
1521
1522 /* avoid looping forever */
1523 if (mds >= cap->mds ||
1524 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1525 continue;
1526
1527 /* NOTE: no side-effects allowed, until we take s_mutex */
1528
1529 revoking = cap->implemented & ~cap->issued;
1530 if (revoking)
1531 dout(" mds%d revoking %s\n", cap->mds,
1532 ceph_cap_string(revoking));
1533
1534 if (cap == ci->i_auth_cap &&
1535 (cap->issued & CEPH_CAP_FILE_WR)) {
1536 /* request larger max_size from MDS? */
1537 if (ci->i_wanted_max_size > ci->i_max_size &&
1538 ci->i_wanted_max_size > ci->i_requested_max_size) {
1539 dout("requesting new max_size\n");
1540 goto ack;
1541 }
1542
1543 /* approaching file_max? */
1544 if ((inode->i_size << 1) >= ci->i_max_size &&
1545 (ci->i_reported_size << 1) < ci->i_max_size) {
1546 dout("i_size approaching max_size\n");
1547 goto ack;
1548 }
1549 }
1550 /* flush anything dirty? */
1551 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1552 ci->i_dirty_caps) {
1553 dout("flushing dirty caps\n");
1554 goto ack;
1555 }
1556
1557 /* completed revocation? going down and there are no caps? */
1558 if (revoking && (revoking & used) == 0) {
1559 dout("completed revocation of %s\n",
1560 ceph_cap_string(cap->implemented & ~cap->issued));
1561 goto ack;
1562 }
1563
1564 /* want more caps from mds? */
1565 if (want & ~(cap->mds_wanted | cap->issued))
1566 goto ack;
1567
1568 /* things we might delay */
1569 if ((cap->issued & ~retain) == 0 &&
1570 cap->mds_wanted == want)
1571 continue; /* nope, all good */
1572
1573 if (is_delayed)
1574 goto ack;
1575
1576 /* delay? */
1577 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1578 time_before(jiffies, ci->i_hold_caps_max)) {
1579 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1580 ceph_cap_string(cap->issued),
1581 ceph_cap_string(cap->issued & retain),
1582 ceph_cap_string(cap->mds_wanted),
1583 ceph_cap_string(want));
1584 delayed++;
1585 continue;
1586 }
1587
1588ack:
1589 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1590 dout(" skipping %p I_NOFLUSH set\n", inode);
1591 continue;
1592 }
1593
1594 if (session && session != cap->session) {
1595 dout("oops, wrong session %p mutex\n", session);
1596 mutex_unlock(&session->s_mutex);
1597 session = NULL;
1598 }
1599 if (!session) {
1600 session = cap->session;
1601 if (mutex_trylock(&session->s_mutex) == 0) {
1602 dout("inverting session/ino locks on %p\n",
1603 session);
1604 spin_unlock(&inode->i_lock);
1605 if (took_snap_rwsem) {
1606 up_read(&mdsc->snap_rwsem);
1607 took_snap_rwsem = 0;
1608 }
1609 mutex_lock(&session->s_mutex);
1610 goto retry;
1611 }
1612 }
1613 /* take snap_rwsem after session mutex */
1614 if (!took_snap_rwsem) {
1615 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1616 dout("inverting snap/in locks on %p\n",
1617 inode);
1618 spin_unlock(&inode->i_lock);
1619 down_read(&mdsc->snap_rwsem);
1620 took_snap_rwsem = 1;
1621 goto retry;
1622 }
1623 took_snap_rwsem = 1;
1624 }
1625
1626 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1627 flushing = __mark_caps_flushing(inode, session);
1628
1629 mds = cap->mds; /* remember mds, so we don't repeat */
1630 sent++;
1631
1632 /* __send_cap drops i_lock */
1633 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1634 retain, flushing, NULL);
1635 goto retry; /* retake i_lock and restart our cap scan. */
1636 }
1637
1638 /*
1639 * Reschedule delayed caps release if we delayed anything,
1640 * otherwise cancel.
1641 */
1642 if (delayed && is_delayed)
1643 force_requeue = 1; /* __send_cap delayed release; requeue */
1644 if (!delayed && !is_delayed)
1645 __cap_delay_cancel(mdsc, ci);
1646 else if (!is_delayed || force_requeue)
1647 __cap_delay_requeue(mdsc, ci);
1648
1649 spin_unlock(&inode->i_lock);
1650
1651 if (queue_invalidate)
1652 ceph_queue_invalidate(inode);
1653
1654 if (session)
1655 mutex_unlock(&session->s_mutex);
1656 if (took_snap_rwsem)
1657 up_read(&mdsc->snap_rwsem);
1658}
1659
1660/*
1661 * Try to flush dirty caps back to the auth mds.
1662 */
1663static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1664 unsigned *flush_tid)
1665{
1666 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1667 struct ceph_inode_info *ci = ceph_inode(inode);
1668 int unlock_session = session ? 0 : 1;
1669 int flushing = 0;
1670
1671retry:
1672 spin_lock(&inode->i_lock);
1673 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1674 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1675 goto out;
1676 }
1677 if (ci->i_dirty_caps && ci->i_auth_cap) {
1678 struct ceph_cap *cap = ci->i_auth_cap;
1679 int used = __ceph_caps_used(ci);
1680 int want = __ceph_caps_wanted(ci);
1681 int delayed;
1682
1683 if (!session) {
1684 spin_unlock(&inode->i_lock);
1685 session = cap->session;
1686 mutex_lock(&session->s_mutex);
1687 goto retry;
1688 }
1689 BUG_ON(session != cap->session);
1690 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1691 goto out;
1692
1693 flushing = __mark_caps_flushing(inode, session);
1694
1695 /* __send_cap drops i_lock */
1696 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1697 cap->issued | cap->implemented, flushing,
1698 flush_tid);
1699 if (!delayed)
1700 goto out_unlocked;
1701
1702 spin_lock(&inode->i_lock);
1703 __cap_delay_requeue(mdsc, ci);
1704 }
1705out:
1706 spin_unlock(&inode->i_lock);
1707out_unlocked:
1708 if (session && unlock_session)
1709 mutex_unlock(&session->s_mutex);
1710 return flushing;
1711}
1712
1713/*
1714 * Return true if we've flushed caps through the given flush_tid.
1715 */
1716static int caps_are_flushed(struct inode *inode, unsigned tid)
1717{
1718 struct ceph_inode_info *ci = ceph_inode(inode);
1719 int dirty, i, ret = 1;
1720
1721 spin_lock(&inode->i_lock);
1722 dirty = __ceph_caps_dirty(ci);
1723 for (i = 0; i < CEPH_CAP_BITS; i++)
1724 if ((ci->i_flushing_caps & (1 << i)) &&
1725 ci->i_cap_flush_tid[i] <= tid) {
1726 /* still flushing this bit */
1727 ret = 0;
1728 break;
1729 }
1730 spin_unlock(&inode->i_lock);
1731 return ret;
1732}
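
/*
 * A bit counts as "still flushing" only while its recorded tid is at or
 * below the tid we are waiting on; a bit re-flushed with a newer tid no
 * longer blocks this particular waiter.
 */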
1733
1734/*
1735 * Wait on any unsafe replies for the given inode. First wait on the
1736 * newest request, and make that the upper bound. Then, if there are
1737 * more requests, keep waiting on the oldest as long as it is still older
1738 * than the original request.
1739 */
1740static void sync_write_wait(struct inode *inode)
1741{
1742 struct ceph_inode_info *ci = ceph_inode(inode);
1743 struct list_head *head = &ci->i_unsafe_writes;
1744 struct ceph_osd_request *req;
1745 u64 last_tid;
1746
1747 spin_lock(&ci->i_unsafe_lock);
1748 if (list_empty(head))
1749 goto out;
1750
1751 /* set upper bound as _last_ entry in chain */
1752 req = list_entry(head->prev, struct ceph_osd_request,
1753 r_unsafe_item);
1754 last_tid = req->r_tid;
1755
1756 do {
1757 ceph_osdc_get_request(req);
1758 spin_unlock(&ci->i_unsafe_lock);
1759 dout("sync_write_wait on tid %llu (until %llu)\n",
1760 req->r_tid, last_tid);
1761 wait_for_completion(&req->r_safe_completion);
1762 spin_lock(&ci->i_unsafe_lock);
1763 ceph_osdc_put_request(req);
1764
1765 /*
1766 * from here on look at first entry in chain, since we
1767 * only want to wait for anything older than last_tid
1768 */
1769 if (list_empty(head))
1770 break;
1771 req = list_entry(head->next, struct ceph_osd_request,
1772 r_unsafe_item);
1773 } while (req->r_tid < last_tid);
1774out:
1775 spin_unlock(&ci->i_unsafe_lock);
1776}
1777
1778int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
1779{
1780 struct inode *inode = dentry->d_inode;
1781 struct ceph_inode_info *ci = ceph_inode(inode);
1782 unsigned flush_tid;
1783 int ret;
1784 int dirty;
1785
1786 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1787 sync_write_wait(inode);
1788
1789 ret = filemap_write_and_wait(inode->i_mapping);
1790 if (ret < 0)
1791 return ret;
1792
1793 dirty = try_flush_caps(inode, NULL, &flush_tid);
1794 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1795
1796 /*
1797 * only wait on non-file metadata writeback (the mds
1798 * can recover size and mtime, so we don't need to
1799 * wait for that)
1800 */
1801 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1802 dout("fsync waiting for flush_tid %u\n", flush_tid);
1803 ret = wait_event_interruptible(ci->i_cap_wq,
1804 caps_are_flushed(inode, flush_tid));
1805 }
1806
1807 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1808 return ret;
1809}
1810
1811/*
1812 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1813 * queue inode for flush but don't do so immediately, because we can
1814 * get by with fewer MDS messages if we wait for data writeback to
1815 * complete first.
1816 */
1817int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1818{
1819 struct ceph_inode_info *ci = ceph_inode(inode);
1820 unsigned flush_tid;
1821 int err = 0;
1822 int dirty;
1823 int wait = wbc->sync_mode == WB_SYNC_ALL;
1824
1825 dout("write_inode %p wait=%d\n", inode, wait);
1826 if (wait) {
1827 dirty = try_flush_caps(inode, NULL, &flush_tid);
1828 if (dirty)
1829 err = wait_event_interruptible(ci->i_cap_wq,
1830 caps_are_flushed(inode, flush_tid));
1831 } else {
1832 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1833
1834 spin_lock(&inode->i_lock);
1835 if (__ceph_caps_dirty(ci))
1836 __cap_delay_requeue_front(mdsc, ci);
1837 spin_unlock(&inode->i_lock);
1838 }
1839 return err;
1840}
1841
1842/*
1843 * After a recovering MDS goes active, we need to resend any caps
1844 * we were flushing.
1845 *
1846 * Caller holds session->s_mutex.
1847 */
1848static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1849 struct ceph_mds_session *session)
1850{
1851 struct ceph_cap_snap *capsnap;
1852
1853 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1854 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1855 flushing_item) {
1856 struct ceph_inode_info *ci = capsnap->ci;
1857 struct inode *inode = &ci->vfs_inode;
1858 struct ceph_cap *cap;
1859
1860 spin_lock(&inode->i_lock);
1861 cap = ci->i_auth_cap;
1862 if (cap && cap->session == session) {
1863 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1864 cap, capsnap);
1865 __ceph_flush_snaps(ci, &session);
1866 } else {
1867 pr_err("%p auth cap %p not mds%d ???\n", inode,
1868 cap, session->s_mds);
1869 }
1870 spin_unlock(&inode->i_lock);
1871 }
1872}
1873
1874void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1875 struct ceph_mds_session *session)
1876{
1877 struct ceph_inode_info *ci;
1878
1879 kick_flushing_capsnaps(mdsc, session);
1880
1881 dout("kick_flushing_caps mds%d\n", session->s_mds);
1882 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1883 struct inode *inode = &ci->vfs_inode;
1884 struct ceph_cap *cap;
1885 int delayed = 0;
1886
1887 spin_lock(&inode->i_lock);
1888 cap = ci->i_auth_cap;
1889 if (cap && cap->session == session) {
1890 dout("kick_flushing_caps %p cap %p %s\n", inode,
1891 cap, ceph_cap_string(ci->i_flushing_caps));
1892 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1893 __ceph_caps_used(ci),
1894 __ceph_caps_wanted(ci),
1895 cap->issued | cap->implemented,
1896 ci->i_flushing_caps, NULL);
1897 if (delayed) {
1898 spin_lock(&inode->i_lock);
1899 __cap_delay_requeue(mdsc, ci);
1900 spin_unlock(&inode->i_lock);
1901 }
1902 } else {
1903 pr_err("%p auth cap %p not mds%d ???\n", inode,
1904 cap, session->s_mds);
1905 spin_unlock(&inode->i_lock);
1906 }
1907 }
1908}
1909
1910
1911/*
1912 * Take references to capabilities we hold, so that we don't release
1913 * them to the MDS prematurely.
1914 *
1915 * Protected by i_lock.
1916 */
1917static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1918{
1919 if (got & CEPH_CAP_PIN)
1920 ci->i_pin_ref++;
1921 if (got & CEPH_CAP_FILE_RD)
1922 ci->i_rd_ref++;
1923 if (got & CEPH_CAP_FILE_CACHE)
1924 ci->i_rdcache_ref++;
1925 if (got & CEPH_CAP_FILE_WR)
1926 ci->i_wr_ref++;
1927 if (got & CEPH_CAP_FILE_BUFFER) {
1928 if (ci->i_wrbuffer_ref == 0)
1929 igrab(&ci->vfs_inode);
1930 ci->i_wrbuffer_ref++;
1931 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1932 &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1933 }
1934}
1935
1936/*
1937 * Try to grab cap references. Specify those refs we @want, and the
1938 * minimal set we @need. Also include the larger offset we are writing
1939 * to (when applicable), and check against max_size here as well.
1940 * Note that caller is responsible for ensuring max_size increases are
1941 * requested from the MDS.
1942 */
1943static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
1944 int *got, loff_t endoff, int *check_max, int *err)
1945{
1946 struct inode *inode = &ci->vfs_inode;
1947 int ret = 0;
1948 int have, implemented;
1949 int file_wanted;
1950
1951 dout("get_cap_refs %p need %s want %s\n", inode,
1952 ceph_cap_string(need), ceph_cap_string(want));
1953 spin_lock(&inode->i_lock);
1954
1955 /* make sure file is actually open */
1956 file_wanted = __ceph_caps_file_wanted(ci);
1957 if ((file_wanted & need) == 0) {
1958 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
1959 ceph_cap_string(need), ceph_cap_string(file_wanted));
1960 *err = -EBADF;
1961 ret = 1;
1962 goto out;
1963 }
1964
1965 if (need & CEPH_CAP_FILE_WR) {
1966 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
1967 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
1968 inode, endoff, ci->i_max_size);
1969 if (endoff > ci->i_wanted_max_size) {
1970 *check_max = 1;
1971 ret = 1;
1972 }
1973 goto out;
1974 }
1975 /*
1976 * If a sync write is in progress, we must wait, so that we
1977 * can get a final snapshot value for size+mtime.
1978 */
1979 if (__ceph_have_pending_cap_snap(ci)) {
1980 dout("get_cap_refs %p cap_snap_pending\n", inode);
1981 goto out;
1982 }
1983 }
1984 have = __ceph_caps_issued(ci, &implemented);
1985
1986 /*
1987 * disallow writes while a truncate is pending
1988 */
1989 if (ci->i_truncate_pending)
1990 have &= ~CEPH_CAP_FILE_WR;
1991
1992 if ((have & need) == need) {
1993 /*
1994 * Look at (implemented & ~have & not) so that we keep waiting
1995 * on transition from wanted -> needed caps. This is needed
1996 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
1997 * going before a prior buffered writeback happens.
1998 */
1999 int not = want & ~(have & need);
2000 int revoking = implemented & ~have;
2001 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
2002 inode, ceph_cap_string(have), ceph_cap_string(not),
2003 ceph_cap_string(revoking));
2004 if ((revoking & not) == 0) {
2005 *got = need | (have & want);
2006 __take_cap_refs(ci, *got);
2007 ret = 1;
2008 }
2009 } else {
2010 dout("get_cap_refs %p have %s needed %s\n", inode,
2011 ceph_cap_string(have), ceph_cap_string(need));
2012 }
2013out:
2014 spin_unlock(&inode->i_lock);
2015 dout("get_cap_refs %p ret %d got %s\n", inode,
2016 ret, ceph_cap_string(*got));
2017 return ret;
2018}
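
/*
 * Typical need/want split (illustrative): a buffered write needs
 * CEPH_CAP_FILE_WR but merely wants CEPH_CAP_FILE_BUFFER, so
 * *got = need | (have & want) lets the write degrade to sync I/O
 * when the BUFFER cap isn't issued.
 */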
2019
2020/*
2021 * Check the offset we are writing up to against our current
2022 * max_size. If necessary, tell the MDS we want to write to
2023 * a larger offset.
2024 */
2025static void check_max_size(struct inode *inode, loff_t endoff)
2026{
2027 struct ceph_inode_info *ci = ceph_inode(inode);
2028 int check = 0;
2029
2030 /* do we need to explicitly request a larger max_size? */
2031 spin_lock(&inode->i_lock);
2032 if ((endoff >= ci->i_max_size ||
2033 endoff > (inode->i_size << 1)) &&
2034 endoff > ci->i_wanted_max_size) {
2035 dout("write %p at large endoff %llu, req max_size\n",
2036 inode, endoff);
2037 ci->i_wanted_max_size = endoff;
2038 check = 1;
2039 }
2040 spin_unlock(&inode->i_lock);
2041 if (check)
2042 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2043}
2044
2045/*
2046 * Wait for caps, and take cap references. If we can't get a WR cap
2047 * due to a small max_size, make sure we check_max_size (and possibly
2048 * ask the mds) so we don't get hung up indefinitely.
2049 */
2050int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2051 loff_t endoff)
2052{
2053 int check_max, ret, err;
2054
2055retry:
2056 if (endoff > 0)
2057 check_max_size(&ci->vfs_inode, endoff);
2058 check_max = 0;
2059 err = 0;
2060 ret = wait_event_interruptible(ci->i_cap_wq,
2061 try_get_cap_refs(ci, need, want,
2062 got, endoff,
2063 &check_max, &err));
2064 if (err)
2065 ret = err;
2066 if (check_max)
2067 goto retry;
2068 return ret;
2069}
2070
2071/*
2072 * Take cap refs. Caller must already know we hold at least one ref
2073 * on the caps in question or we don't know this is safe.
2074 */
2075void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2076{
2077 spin_lock(&ci->vfs_inode.i_lock);
2078 __take_cap_refs(ci, caps);
2079 spin_unlock(&ci->vfs_inode.i_lock);
2080}
2081
2082/*
2083 * Release cap refs.
2084 *
2085 * If we released the last ref on any given cap, call ceph_check_caps
2086 * to release (or schedule a release).
2087 *
2088 * If we are releasing a WR cap (from a sync write), finalize any affected
2089 * cap_snap, and wake up any waiters.
2090 */
2091void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2092{
2093 struct inode *inode = &ci->vfs_inode;
2094 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2095 struct ceph_cap_snap *capsnap;
2096
2097 spin_lock(&inode->i_lock);
2098 if (had & CEPH_CAP_PIN)
2099 --ci->i_pin_ref;
2100 if (had & CEPH_CAP_FILE_RD)
2101 if (--ci->i_rd_ref == 0)
2102 last++;
2103 if (had & CEPH_CAP_FILE_CACHE)
2104 if (--ci->i_rdcache_ref == 0)
2105 last++;
2106 if (had & CEPH_CAP_FILE_BUFFER) {
2107 if (--ci->i_wrbuffer_ref == 0) {
2108 last++;
2109 put++;
2110 }
2111 dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
2112 inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
2113 }
2114 if (had & CEPH_CAP_FILE_WR)
2115 if (--ci->i_wr_ref == 0) {
2116 last++;
2117 if (!list_empty(&ci->i_cap_snaps)) {
2118 capsnap = list_first_entry(&ci->i_cap_snaps,
2119 struct ceph_cap_snap,
2120 ci_item);
2121 if (capsnap->writing) {
2122 capsnap->writing = 0;
2123 flushsnaps =
2124 __ceph_finish_cap_snap(ci,
2125 capsnap);
2126 wake = 1;
2127 }
2128 }
2129 }
2130 spin_unlock(&inode->i_lock);
2131
2132 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
2133 last ? " last" : "", put ? " put" : "");
2134
2135 if (last && !flushsnaps)
2136 ceph_check_caps(ci, 0, NULL);
2137 else if (flushsnaps)
2138 ceph_flush_snaps(ci);
2139 if (wake)
2140 wake_up(&ci->i_cap_wq);
2141 if (put)
2142 iput(inode);
2143}
2144
2145/*
2146 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2147 * context. Adjust per-snap dirty page accounting as appropriate.
2148 * Once all dirty data for a cap_snap is flushed, flush snapped file
2149 * metadata back to the MDS. If we dropped the last ref, call
2150 * ceph_check_caps.
2151 */
2152void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2153 struct ceph_snap_context *snapc)
2154{
2155 struct inode *inode = &ci->vfs_inode;
2156 int last = 0;
2157 int complete_capsnap = 0;
2158 int drop_capsnap = 0;
2159 int found = 0;
2160 struct ceph_cap_snap *capsnap = NULL;
2161
2162 spin_lock(&inode->i_lock);
2163 ci->i_wrbuffer_ref -= nr;
2164 last = !ci->i_wrbuffer_ref;
2165
2166 if (ci->i_head_snapc == snapc) {
2167 ci->i_wrbuffer_ref_head -= nr;
2168 if (!ci->i_wrbuffer_ref_head) {
2169 ceph_put_snap_context(ci->i_head_snapc);
2170 ci->i_head_snapc = NULL;
2171 }
2172 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2173 inode,
2174 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2175 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2176 last ? " LAST" : "");
2177 } else {
2178 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2179 if (capsnap->context == snapc) {
2180 found = 1;
2181 break;
2182 }
2183 }
2184 BUG_ON(!found);
2185 capsnap->dirty_pages -= nr;
2186 if (capsnap->dirty_pages == 0) {
2187 complete_capsnap = 1;
2188 if (capsnap->dirty == 0)
2189 /* cap writeback completed before we created
2190 * the cap_snap; no FLUSHSNAP is needed */
2191 drop_capsnap = 1;
2192 }
2193 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2194 " snap %lld %d/%d -> %d/%d %s%s%s\n",
2195 inode, capsnap, capsnap->context->seq,
2196 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2197 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2198 last ? " (wrbuffer last)" : "",
2199 complete_capsnap ? " (complete capsnap)" : "",
2200 drop_capsnap ? " (drop capsnap)" : "");
2201 if (drop_capsnap) {
2202 ceph_put_snap_context(capsnap->context);
2203 list_del(&capsnap->ci_item);
2204 list_del(&capsnap->flushing_item);
2205 ceph_put_cap_snap(capsnap);
2206 }
2207 }
2208
2209 spin_unlock(&inode->i_lock);
2210
2211 if (last) {
2212 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2213 iput(inode);
2214 } else if (complete_capsnap) {
2215 ceph_flush_snaps(ci);
2216 wake_up(&ci->i_cap_wq);
2217 }
2218 if (drop_capsnap)
2219 iput(inode);
2220}
2221
2222/*
2223 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2224 * actually be a revocation if it specifies a smaller cap set.)
2225 *
2226 * caller holds s_mutex and i_lock, we drop both.
2227 *
2228 * check_caps disposition (the function returns void):
2229 * 0 - ok
2230 * 1 - check_caps on auth cap only (writeback)
2231 * 2 - check_caps (ack revoke)
2232 */
2233static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2234 struct ceph_mds_session *session,
2235 struct ceph_cap *cap,
2236 struct ceph_buffer *xattr_buf)
2237 __releases(inode->i_lock)
2238 __releases(session->s_mutex)
2239{
2240 struct ceph_inode_info *ci = ceph_inode(inode);
2241 int mds = session->s_mds;
2242 int seq = le32_to_cpu(grant->seq);
2243 int newcaps = le32_to_cpu(grant->caps);
2244 int issued, implemented, used, wanted, dirty;
2245 u64 size = le64_to_cpu(grant->size);
2246 u64 max_size = le64_to_cpu(grant->max_size);
2247 struct timespec mtime, atime, ctime;
2248 int check_caps = 0;
2249 int wake = 0;
2250 int writeback = 0;
2251 int revoked_rdcache = 0;
2252 int queue_invalidate = 0;
2253
2254 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2255 inode, cap, mds, seq, ceph_cap_string(newcaps));
2256 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2257 inode->i_size);
2258
2259 /*
2260 * If CACHE is being revoked, and we have no dirty buffers,
2261 * try to invalidate (once). (If there are dirty buffers, we
2262 * will invalidate _after_ writeback.)
2263 */
2264 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2265 !ci->i_wrbuffer_ref) {
2266 if (try_nonblocking_invalidate(inode) == 0) {
2267 revoked_rdcache = 1;
2268 } else {
2269 /* there were locked pages.. invalidate later
2270 in a separate thread. */
2271 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2272 queue_invalidate = 1;
2273 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2274 }
2275 }
2276 }
2277
2278 /* side effects now are allowed */
2279
2280 issued = __ceph_caps_issued(ci, &implemented);
2281 issued |= implemented | __ceph_caps_dirty(ci);
2282
2283 cap->cap_gen = session->s_cap_gen;
2284
2285 __check_cap_issue(ci, cap, newcaps);
2286
2287 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2288 inode->i_mode = le32_to_cpu(grant->mode);
2289 inode->i_uid = le32_to_cpu(grant->uid);
2290 inode->i_gid = le32_to_cpu(grant->gid);
2291 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2292 inode->i_uid, inode->i_gid);
2293 }
2294
2295 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2296 inode->i_nlink = le32_to_cpu(grant->nlink);
2297
2298 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2299 int len = le32_to_cpu(grant->xattr_len);
2300 u64 version = le64_to_cpu(grant->xattr_version);
2301
2302 if (version > ci->i_xattrs.version) {
2303 dout(" got new xattrs v%llu on %p len %d\n",
2304 version, inode, len);
2305 if (ci->i_xattrs.blob)
2306 ceph_buffer_put(ci->i_xattrs.blob);
2307 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2308 ci->i_xattrs.version = version;
2309 }
2310 }
2311
2312 /* size/ctime/mtime/atime? */
2313 ceph_fill_file_size(inode, issued,
2314 le32_to_cpu(grant->truncate_seq),
2315 le64_to_cpu(grant->truncate_size), size);
2316 ceph_decode_timespec(&mtime, &grant->mtime);
2317 ceph_decode_timespec(&atime, &grant->atime);
2318 ceph_decode_timespec(&ctime, &grant->ctime);
2319 ceph_fill_file_time(inode, issued,
2320 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2321 &atime);
2322
2323 /* max size increase? */
2324 if (max_size != ci->i_max_size) {
2325 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2326 ci->i_max_size = max_size;
2327 if (max_size >= ci->i_wanted_max_size) {
2328 ci->i_wanted_max_size = 0; /* reset */
2329 ci->i_requested_max_size = 0;
2330 }
2331 wake = 1;
2332 }
2333
2334 /* check cap bits */
2335 wanted = __ceph_caps_wanted(ci);
2336 used = __ceph_caps_used(ci);
2337 dirty = __ceph_caps_dirty(ci);
2338 dout(" my wanted = %s, used = %s, dirty %s\n",
2339 ceph_cap_string(wanted),
2340 ceph_cap_string(used),
2341 ceph_cap_string(dirty));
2342 if (wanted != le32_to_cpu(grant->wanted)) {
2343 dout("mds wanted %s -> %s\n",
2344 ceph_cap_string(le32_to_cpu(grant->wanted)),
2345 ceph_cap_string(wanted));
2346 grant->wanted = cpu_to_le32(wanted);
2347 }
2348
2349 cap->seq = seq;
2350
2351 /* file layout may have changed */
2352 ci->i_layout = grant->layout;
2353
2354 /* revocation, grant, or no-op? */
2355 if (cap->issued & ~newcaps) {
2356 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
2357 ceph_cap_string(newcaps));
2358 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
2359 writeback = 1; /* will delay ack */
2360 else if (dirty & ~newcaps)
2361 check_caps = 1; /* initiate writeback in check_caps */
2362 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
2363 revoked_rdcache)
2364 check_caps = 2; /* send revoke ack in check_caps */
2365 cap->issued = newcaps;
2366 cap->implemented |= newcaps;
2367 } else if (cap->issued == newcaps) {
2368 dout("caps unchanged: %s -> %s\n",
2369 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2370 } else {
2371 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2372 ceph_cap_string(newcaps));
2373 cap->issued = newcaps;
2374 cap->implemented |= newcaps; /* add bits only, to
2375 * avoid stepping on a
2376 * pending revocation */
2377 wake = 1;
2378 }
2379 BUG_ON(cap->issued & ~cap->implemented);
2380
2381 spin_unlock(&inode->i_lock);
2382 if (writeback)
2383 /*
2384 * queue inode for writeback: we can't actually call
2385 * filemap_write_and_wait, etc. from message handler
2386 * context.
2387 */
2388 ceph_queue_writeback(inode);
2389 if (queue_invalidate)
2390 ceph_queue_invalidate(inode);
2391 if (wake)
2392 wake_up(&ci->i_cap_wq);
2393
2394 if (check_caps == 1)
2395 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2396 session);
2397 else if (check_caps == 2)
2398 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
2399 else
2400 mutex_unlock(&session->s_mutex);
2401}
2402
2403/*
2404 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2405 * MDS has been safely committed.
2406 */
2407static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2408 struct ceph_mds_caps *m,
2409 struct ceph_mds_session *session,
2410 struct ceph_cap *cap)
2411 __releases(inode->i_lock)
2412{
2413 struct ceph_inode_info *ci = ceph_inode(inode);
2414 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
2415 unsigned seq = le32_to_cpu(m->seq);
2416 int dirty = le32_to_cpu(m->dirty);
2417 int cleaned = 0;
2418 int drop = 0;
2419 int i;
2420
2421 for (i = 0; i < CEPH_CAP_BITS; i++)
2422 if ((dirty & (1 << i)) &&
2423 flush_tid == ci->i_cap_flush_tid[i])
2424 cleaned |= 1 << i;
2425
2426 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2427 " flushing %s -> %s\n",
2428 inode, session->s_mds, seq, ceph_cap_string(dirty),
2429 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2430 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2431
2432 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2433 goto out;
2434
2435 ci->i_flushing_caps &= ~cleaned;
2436
2437 spin_lock(&mdsc->cap_dirty_lock);
2438 if (ci->i_flushing_caps == 0) {
2439 list_del_init(&ci->i_flushing_item);
2440 if (!list_empty(&session->s_cap_flushing))
2441 dout(" mds%d still flushing cap on %p\n",
2442 session->s_mds,
2443 &list_entry(session->s_cap_flushing.next,
2444 struct ceph_inode_info,
2445 i_flushing_item)->vfs_inode);
2446 mdsc->num_cap_flushing--;
2447 wake_up(&mdsc->cap_flushing_wq);
2448 dout(" inode %p now !flushing\n", inode);
2449
2450 if (ci->i_dirty_caps == 0) {
2451 dout(" inode %p now clean\n", inode);
2452 BUG_ON(!list_empty(&ci->i_dirty_item));
2453 drop = 1;
2454 } else {
2455 BUG_ON(list_empty(&ci->i_dirty_item));
2456 }
2457 }
2458 spin_unlock(&mdsc->cap_dirty_lock);
2459 wake_up(&ci->i_cap_wq);
2460
2461out:
2462 spin_unlock(&inode->i_lock);
2463 if (drop)
2464 iput(inode);
2465}
2466
2467/*
2468 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2469 * throw away our cap_snap.
2470 *
2471 * Caller holds s_mutex.
2472 */
2473static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2474 struct ceph_mds_caps *m,
2475 struct ceph_mds_session *session)
2476{
2477 struct ceph_inode_info *ci = ceph_inode(inode);
2478 u64 follows = le64_to_cpu(m->snap_follows);
2479 struct ceph_cap_snap *capsnap;
2480 int drop = 0;
2481
2482 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2483 inode, ci, session->s_mds, follows);
2484
2485 spin_lock(&inode->i_lock);
2486 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2487 if (capsnap->follows == follows) {
2488 if (capsnap->flush_tid != flush_tid) {
2489 dout(" cap_snap %p follows %lld tid %lld !="
2490 " %lld\n", capsnap, follows,
2491 flush_tid, capsnap->flush_tid);
2492 break;
2493 }
2494 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2495 dout(" removing %p cap_snap %p follows %lld\n",
2496 inode, capsnap, follows);
2497 ceph_put_snap_context(capsnap->context);
2498 list_del(&capsnap->ci_item);
2499 list_del(&capsnap->flushing_item);
2500 ceph_put_cap_snap(capsnap);
2501 drop = 1;
2502 break;
2503 } else {
2504 dout(" skipping cap_snap %p follows %lld\n",
2505 capsnap, capsnap->follows);
2506 }
2507 }
2508 spin_unlock(&inode->i_lock);
2509 if (drop)
2510 iput(inode);
2511}
2512
2513/*
2514 * Handle TRUNC from MDS, indicating file truncation.
2515 *
2516 * caller holds s_mutex.
2517 */
2518static void handle_cap_trunc(struct inode *inode,
2519 struct ceph_mds_caps *trunc,
2520 struct ceph_mds_session *session)
2521 __releases(inode->i_lock)
2522{
2523 struct ceph_inode_info *ci = ceph_inode(inode);
2524 int mds = session->s_mds;
2525 int seq = le32_to_cpu(trunc->seq);
2526 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2527 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2528 u64 size = le64_to_cpu(trunc->size);
2529 int implemented = 0;
2530 int dirty = __ceph_caps_dirty(ci);
2531 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2532 int queue_trunc = 0;
2533
2534 issued |= implemented | dirty;
2535
2536 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2537 inode, mds, seq, truncate_size, truncate_seq);
2538 queue_trunc = ceph_fill_file_size(inode, issued,
2539 truncate_seq, truncate_size, size);
2540 spin_unlock(&inode->i_lock);
2541
2542 if (queue_trunc)
2543 ceph_queue_vmtruncate(inode);
2544}
2545
2546/*
2547 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2548 * different one. If we are the most recent migration we've seen (as
2549 * indicated by mseq), make note of the migrating cap bits for the
2550 * duration (until we see the corresponding IMPORT).
2551 *
2552 * caller holds s_mutex
2553 */
2554static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2555 struct ceph_mds_session *session)
2556{
2557 struct ceph_inode_info *ci = ceph_inode(inode);
2558 int mds = session->s_mds;
2559 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2560 struct ceph_cap *cap = NULL, *t;
2561 struct rb_node *p;
2562 int remember = 1;
2563
2564 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2565 inode, ci, mds, mseq);
2566
2567 spin_lock(&inode->i_lock);
2568
2569 /* make sure we haven't seen a higher mseq */
2570 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2571 t = rb_entry(p, struct ceph_cap, ci_node);
2572 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2573 dout(" higher mseq on cap from mds%d\n",
2574 t->session->s_mds);
2575 remember = 0;
2576 }
2577 if (t->session->s_mds == mds)
2578 cap = t;
2579 }
2580
2581 if (cap) {
2582 if (remember) {
2583 /* make note */
2584 ci->i_cap_exporting_mds = mds;
2585 ci->i_cap_exporting_mseq = mseq;
2586 ci->i_cap_exporting_issued = cap->issued;
2587 }
2588 __ceph_remove_cap(cap);
2589 }
2590 /* else, we already released it */
2591
2592 spin_unlock(&inode->i_lock);
2593}
2594
2595/*
2596 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2597 * clean them up.
2598 *
2599 * caller holds s_mutex.
2600 */
2601static void handle_cap_import(struct ceph_mds_client *mdsc,
2602 struct inode *inode, struct ceph_mds_caps *im,
2603 struct ceph_mds_session *session,
2604 void *snaptrace, int snaptrace_len)
2605{
2606 struct ceph_inode_info *ci = ceph_inode(inode);
2607 int mds = session->s_mds;
2608 unsigned issued = le32_to_cpu(im->caps);
2609 unsigned wanted = le32_to_cpu(im->wanted);
2610 unsigned seq = le32_to_cpu(im->seq);
2611 unsigned mseq = le32_to_cpu(im->migrate_seq);
2612 u64 realmino = le64_to_cpu(im->realm);
2613 u64 cap_id = le64_to_cpu(im->cap_id);
2614
2615 if (ci->i_cap_exporting_mds >= 0 &&
2616 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2617 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2618 " - cleared exporting from mds%d\n",
2619 inode, ci, mds, mseq,
2620 ci->i_cap_exporting_mds);
2621 ci->i_cap_exporting_issued = 0;
2622 ci->i_cap_exporting_mseq = 0;
2623 ci->i_cap_exporting_mds = -1;
2624 } else {
2625 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2626 inode, ci, mds, mseq);
2627 }
2628
2629 down_write(&mdsc->snap_rwsem);
2630 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2631 false);
2632 downgrade_write(&mdsc->snap_rwsem);
2633 ceph_add_cap(inode, session, cap_id, -1,
2634 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2635 NULL /* no caps context */);
2636 try_flush_caps(inode, session, NULL);
2637 up_read(&mdsc->snap_rwsem);
2638}
2639
2640/*
2641 * Handle a caps message from the MDS.
2642 *
2643 * Identify the appropriate session, inode, and call the right handler
2644 * based on the cap op.
2645 */
2646void ceph_handle_caps(struct ceph_mds_session *session,
2647 struct ceph_msg *msg)
2648{
2649 struct ceph_mds_client *mdsc = session->s_mdsc;
2650 struct super_block *sb = mdsc->client->sb;
2651 struct inode *inode;
2652 struct ceph_cap *cap;
2653 struct ceph_mds_caps *h;
2654 int mds = session->s_mds;
2655 int op;
2656 u32 seq;
2657 struct ceph_vino vino;
2658 u64 cap_id;
2659 u64 size, max_size;
2660 u64 tid;
2661 void *snaptrace;
2662
2663 dout("handle_caps from mds%d\n", mds);
2664
2665 /* decode */
2666 tid = le64_to_cpu(msg->hdr.tid);
2667 if (msg->front.iov_len < sizeof(*h))
2668 goto bad;
2669 h = msg->front.iov_base;
2670 snaptrace = h + 1;
2671 op = le32_to_cpu(h->op);
2672 vino.ino = le64_to_cpu(h->ino);
2673 vino.snap = CEPH_NOSNAP;
2674 cap_id = le64_to_cpu(h->cap_id);
2675 seq = le32_to_cpu(h->seq);
2676 size = le64_to_cpu(h->size);
2677 max_size = le64_to_cpu(h->max_size);
2678
2679 mutex_lock(&session->s_mutex);
2680 session->s_seq++;
2681 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2682 (unsigned)seq);
2683
2684 /* lookup ino */
2685 inode = ceph_find_inode(sb, vino);
2686 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2687 vino.snap, inode);
2688 if (!inode) {
2689 dout(" i don't have ino %llx\n", vino.ino);
2690 goto done;
2691 }
2692
2693 /* these will work even if we don't have a cap yet */
2694 switch (op) {
2695 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2696 handle_cap_flushsnap_ack(inode, tid, h, session);
2697 goto done;
2698
2699 case CEPH_CAP_OP_EXPORT:
2700 handle_cap_export(inode, h, session);
2701 goto done;
2702
2703 case CEPH_CAP_OP_IMPORT:
2704 handle_cap_import(mdsc, inode, h, session,
2705 snaptrace, le32_to_cpu(h->snap_trace_len));
2706 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2707 session);
2708 goto done_unlocked;
2709 }
2710
2711 /* the rest require a cap */
2712 spin_lock(&inode->i_lock);
2713 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2714 if (!cap) {
2715 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
2716 inode, ceph_ino(inode), ceph_snap(inode), mds);
2717 spin_unlock(&inode->i_lock);
2718 goto done;
2719 }
2720
2721 /* note that each of these drops i_lock for us */
2722 switch (op) {
2723 case CEPH_CAP_OP_REVOKE:
2724 case CEPH_CAP_OP_GRANT:
2725 handle_cap_grant(inode, h, session, cap, msg->middle);
2726 goto done_unlocked;
2727
2728 case CEPH_CAP_OP_FLUSH_ACK:
2729 handle_cap_flush_ack(inode, tid, h, session, cap);
2730 break;
2731
2732 case CEPH_CAP_OP_TRUNC:
2733 handle_cap_trunc(inode, h, session);
2734 break;
2735
2736 default:
2737 spin_unlock(&inode->i_lock);
2738 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2739 ceph_cap_op_name(op));
2740 }
2741
2742done:
2743 mutex_unlock(&session->s_mutex);
2744done_unlocked:
2745 if (inode)
2746 iput(inode);
2747 return;
2748
2749bad:
2750 pr_err("ceph_handle_caps: corrupt message\n");
2751 ceph_msg_dump(msg);
2752 return;
2753}
2754
2755/*
 2756 * Delayed work handler to process the end of the delayed cap release LRU list.
2757 */
2758void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2759{
2760 struct ceph_inode_info *ci;
2761 int flags = CHECK_CAPS_NODELAY;
2762
2763 dout("check_delayed_caps\n");
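	/*
	 * note that cap_delay_lock is dropped before each
	 * ceph_check_caps() call below, since that call may block and
	 * take other locks.
	 */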
2764 while (1) {
2765 spin_lock(&mdsc->cap_delay_lock);
2766 if (list_empty(&mdsc->cap_delay_list))
2767 break;
2768 ci = list_first_entry(&mdsc->cap_delay_list,
2769 struct ceph_inode_info,
2770 i_cap_delay_list);
2771 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2772 time_before(jiffies, ci->i_hold_caps_max))
2773 break;
2774 list_del_init(&ci->i_cap_delay_list);
2775 spin_unlock(&mdsc->cap_delay_lock);
2776 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2777 ceph_check_caps(ci, flags, NULL);
2778 }
2779 spin_unlock(&mdsc->cap_delay_lock);
2780}
2781
2782/*
2783 * Flush all dirty caps to the mds
2784 */
2785void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2786{
2787 struct ceph_inode_info *ci, *nci = NULL;
2788 struct inode *inode, *ninode = NULL;
2789 struct list_head *p, *n;
2790
2791 dout("flush_dirty_caps\n");
2792 spin_lock(&mdsc->cap_dirty_lock);
2793 list_for_each_safe(p, n, &mdsc->cap_dirty) {
2794 if (nci) {
2795 ci = nci;
2796 inode = ninode;
2797 ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
2798 dout("flush_dirty_caps inode %p (was next inode)\n",
2799 inode);
2800 } else {
2801 ci = list_entry(p, struct ceph_inode_info,
2802 i_dirty_item);
2803 inode = igrab(&ci->vfs_inode);
2804 BUG_ON(!inode);
2805 dout("flush_dirty_caps inode %p\n", inode);
2806 }
2807 if (n != &mdsc->cap_dirty) {
2808 nci = list_entry(n, struct ceph_inode_info,
2809 i_dirty_item);
2810 ninode = igrab(&nci->vfs_inode);
2811 BUG_ON(!ninode);
2812 nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2813 dout("flush_dirty_caps next inode %p, noflush\n",
2814 ninode);
2815 } else {
2816 nci = NULL;
2817 ninode = NULL;
2818 }
2819 spin_unlock(&mdsc->cap_dirty_lock);
2820 if (inode) {
2821 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
2822 NULL);
2823 iput(inode);
2824 }
2825 spin_lock(&mdsc->cap_dirty_lock);
2826 }
2827 spin_unlock(&mdsc->cap_dirty_lock);
2828}
2829
2830/*
2831 * Drop open file reference. If we were the last open file,
2832 * we may need to release capabilities to the MDS (or schedule
2833 * their delayed release).
2834 */
2835void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2836{
2837 struct inode *inode = &ci->vfs_inode;
2838 int last = 0;
2839
2840 spin_lock(&inode->i_lock);
2841 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2842 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2843 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2844 if (--ci->i_nr_by_mode[fmode] == 0)
2845 last++;
2846 spin_unlock(&inode->i_lock);
2847
2848 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2849 ceph_check_caps(ci, 0, NULL);
2850}
2851
2852/*
2853 * Helpers for embedding cap and dentry lease releases into mds
2854 * requests.
2855 *
2856 * @force is used by dentry_release (below) to force inclusion of a
2857 * record for the directory inode, even when there aren't any caps to
2858 * drop.
2859 */
2860int ceph_encode_inode_release(void **p, struct inode *inode,
2861 int mds, int drop, int unless, int force)
2862{
2863 struct ceph_inode_info *ci = ceph_inode(inode);
2864 struct ceph_cap *cap;
2865 struct ceph_mds_request_release *rel = *p;
2866 int ret = 0;
2867 int used = 0;
2868
2869 spin_lock(&inode->i_lock);
2870 used = __ceph_caps_used(ci);
2871
2872 dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
2873 mds, ceph_cap_string(used), ceph_cap_string(drop),
2874 ceph_cap_string(unless));
2875
2876 /* only drop unused caps */
2877 drop &= ~used;
2878
2879 cap = __get_cap_for_mds(ci, mds);
2880 if (cap && __cap_is_valid(cap)) {
2881 if (force ||
2882 ((cap->issued & drop) &&
2883 (cap->issued & unless) == 0)) {
2884 if ((cap->issued & drop) &&
2885 (cap->issued & unless) == 0) {
2886 dout("encode_inode_release %p cap %p %s -> "
2887 "%s\n", inode, cap,
2888 ceph_cap_string(cap->issued),
2889 ceph_cap_string(cap->issued & ~drop));
2890 cap->issued &= ~drop;
2891 cap->implemented &= ~drop;
2892 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2893 int wanted = __ceph_caps_wanted(ci);
2894 dout(" wanted %s -> %s (act %s)\n",
2895 ceph_cap_string(cap->mds_wanted),
2896 ceph_cap_string(cap->mds_wanted &
2897 ~wanted),
2898 ceph_cap_string(wanted));
2899 cap->mds_wanted &= wanted;
2900 }
2901 } else {
2902 dout("encode_inode_release %p cap %p %s"
2903 " (force)\n", inode, cap,
2904 ceph_cap_string(cap->issued));
2905 }
2906
2907 rel->ino = cpu_to_le64(ceph_ino(inode));
2908 rel->cap_id = cpu_to_le64(cap->cap_id);
2909 rel->seq = cpu_to_le32(cap->seq);
 2910 rel->issue_seq = cpu_to_le32(cap->issue_seq);
2911 rel->mseq = cpu_to_le32(cap->mseq);
2912 rel->caps = cpu_to_le32(cap->issued);
2913 rel->wanted = cpu_to_le32(cap->mds_wanted);
2914 rel->dname_len = 0;
2915 rel->dname_seq = 0;
2916 *p += sizeof(*rel);
2917 ret = 1;
2918 } else {
2919 dout("encode_inode_release %p cap %p %s\n",
2920 inode, cap, ceph_cap_string(cap->issued));
2921 }
2922 }
2923 spin_unlock(&inode->i_lock);
2924 return ret;
2925}
2926
2927int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2928 int mds, int drop, int unless)
2929{
2930 struct inode *dir = dentry->d_parent->d_inode;
2931 struct ceph_mds_request_release *rel = *p;
2932 struct ceph_dentry_info *di = ceph_dentry(dentry);
2933 int force = 0;
2934 int ret;
2935
2936 /*
 2937 * force a record for the directory caps if we have a dentry lease.
2938 * this is racy (can't take i_lock and d_lock together), but it
2939 * doesn't have to be perfect; the mds will revoke anything we don't
2940 * release.
2941 */
2942 spin_lock(&dentry->d_lock);
2943 if (di->lease_session && di->lease_session->s_mds == mds)
2944 force = 1;
2945 spin_unlock(&dentry->d_lock);
2946
2947 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
2948
2949 spin_lock(&dentry->d_lock);
2950 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
2951 dout("encode_dentry_release %p mds%d seq %d\n",
2952 dentry, mds, (int)di->lease_seq);
2953 rel->dname_len = cpu_to_le32(dentry->d_name.len);
2954 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2955 *p += dentry->d_name.len;
2956 rel->dname_seq = cpu_to_le32(di->lease_seq);
2957 }
2958 spin_unlock(&dentry->d_lock);
2959 return ret;
2960}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
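/*
 * With the pr_fmt prefix above, dout("foo %d\n", 1) from line 2563 of
 * caps.c then prints roughly (illustrative output):
 *
 *   ceph:        caps.c:2563 : foo 1
 */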
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
1/*
2 * Ceph 'frag' type
3 */
4#include "types.h"
5
6int ceph_frag_compare(__u32 a, __u32 b)
7{
8 unsigned va = ceph_frag_value(a);
9 unsigned vb = ceph_frag_value(b);
10 if (va < vb)
11 return -1;
12 if (va > vb)
13 return 1;
14 va = ceph_frag_bits(a);
15 vb = ceph_frag_bits(b);
16 if (va < vb)
17 return -1;
18 if (va > vb)
19 return 1;
20 return 0;
21}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef _FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
 14 * We use the _most_ significant bits of the 24-bit value. This makes
 15 * the values sort logically.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
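/*
 * Worked example: with bits=2 the mask is 0xc00000, so the frag
 * ceph_frag_make(2, 0x800000) covers the values 0x800000..0xbfffff:
 *
 *   ceph_frag_contains_value(ceph_frag_make(2, 0x800000), 0x9abcde) == 1
 *   ceph_frag_contains_value(ceph_frag_make(2, 0x800000), 0x7fffff) == 0
 */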
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
 67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
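/*
 * For example, a layout with a 64k stripe unit, a stripe count of 1,
 * and 4MB (64 * 65536) objects passes every check above; a 4k stripe
 * unit would fail the 64k-increment test.
 */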
28
29
30int ceph_flags_to_mode(int flags)
31{
32#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN;
35#endif
36#ifdef O_LAZY
37 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY;
39#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42
43 flags &= O_ACCMODE;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49}
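/*
 * For example, O_RDWR maps to CEPH_FILE_MODE_RDWR, and even
 * O_RDONLY|O_APPEND maps to CEPH_FILE_MODE_WR, since the O_APPEND
 * check above forces O_WRONLY before the access mode is examined.
 */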
50
51int ceph_caps_for_mode(int mode)
52{
53 switch (mode) {
54 case CEPH_FILE_MODE_PIN:
55 return CEPH_CAP_PIN;
56 case CEPH_FILE_MODE_RD:
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR:
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 }
73 return 0;
74}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..0c2241ef3653
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
 33 * subprotocol versions. when specific message types or high-level
 34 * protocols change, bump the affected components. we rev
35 * internal cluster protocols separately from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_SUPPORTED 0
57#define CEPH_FEATURE_REQUIRED 0
58
59
60/*
61 * ceph_file_layout - describe data layout for a file/inode
62 */
63struct ceph_file_layout {
64 /* file -> object mapping */
65 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
66 of page size. */
67 __le32 fl_stripe_count; /* over this many objects */
68 __le32 fl_object_size; /* until objects are this big, then move to
69 new objects */
70 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
71
72 /* pg -> disk layout */
73 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
74
75 /* object -> pg layout */
76 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
77 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
78} __attribute__ ((packed));
79
80#define CEPH_MIN_STRIPE_UNIT 65536
81
82int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
83
84
85/* crypto algorithms */
86#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1
88
89/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2
93
94
95/*********************************************
96 * message layer
97 */
98
99/*
100 * message types
101 */
102
103/* misc */
104#define CEPH_MSG_SHUTDOWN 1
105#define CEPH_MSG_PING 2
106
107/* client <-> monitor */
108#define CEPH_MSG_MON_MAP 4
109#define CEPH_MSG_MON_GET_MAP 5
110#define CEPH_MSG_STATFS 13
111#define CEPH_MSG_STATFS_REPLY 14
112#define CEPH_MSG_MON_SUBSCRIBE 15
113#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
114#define CEPH_MSG_AUTH 17
115#define CEPH_MSG_AUTH_REPLY 18
116
117/* client <-> mds */
118#define CEPH_MSG_MDS_MAP 21
119
120#define CEPH_MSG_CLIENT_SESSION 22
121#define CEPH_MSG_CLIENT_RECONNECT 23
122
123#define CEPH_MSG_CLIENT_REQUEST 24
124#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
125#define CEPH_MSG_CLIENT_REPLY 26
126#define CEPH_MSG_CLIENT_CAPS 0x310
127#define CEPH_MSG_CLIENT_LEASE 0x311
128#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130
131/* osd */
132#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43
135
136struct ceph_mon_request_header {
137 __le64 have_version;
138 __le16 session_mon;
139 __le64 session_mon_tid;
140} __attribute__ ((packed));
141
142struct ceph_mon_statfs {
143 struct ceph_mon_request_header monhdr;
144 struct ceph_fsid fsid;
145} __attribute__ ((packed));
146
147struct ceph_statfs {
148 __le64 kb, kb_used, kb_avail;
149 __le64 num_objects;
150} __attribute__ ((packed));
151
152struct ceph_mon_statfs_reply {
153 struct ceph_fsid fsid;
154 __le64 version;
155 struct ceph_statfs st;
156} __attribute__ ((packed));
157
158struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid;
161 __le32 start;
162} __attribute__ ((packed));
163
164struct ceph_mds_getmap {
165 struct ceph_mon_request_header monhdr;
166 struct ceph_fsid fsid;
167} __attribute__ ((packed));
168
169struct ceph_client_mount {
170 struct ceph_mon_request_header monhdr;
171} __attribute__ ((packed));
172
173struct ceph_mon_subscribe_item {
174 __le64 have_version; __le64 have;
175 __u8 onetime;
176} __attribute__ ((packed));
177
178struct ceph_mon_subscribe_ack {
179 __le32 duration; /* seconds */
180 struct ceph_fsid fsid;
181} __attribute__ ((packed));
182
183/*
184 * mds states
185 * > 0 -> in
186 * <= 0 -> out
187 */
188#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
189#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
190 empty log. */
191#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
192#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
193#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
194#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
195#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
196
197#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
198#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
199 operations (import, rename, etc.) */
200#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
201#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
202#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
203#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
204#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
205
206extern const char *ceph_mds_state_name(int s);
207
208
209/*
210 * metadata lock types.
 211 * - these are bitmasks; we can compose them
 212 * - they also define the lock ordering used by the MDS
213 * - a few of these are internal to the mds
214 */
215#define CEPH_LOCK_DN 1
216#define CEPH_LOCK_ISNAP 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */
218#define CEPH_LOCK_IFILE 8 /* mds internal */
219#define CEPH_LOCK_IAUTH 32
220#define CEPH_LOCK_ILINK 64
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */
222#define CEPH_LOCK_INEST 256 /* mds internal */
223#define CEPH_LOCK_IXATTR 512
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
225
226/* client_session ops */
227enum {
228 CEPH_SESSION_REQUEST_OPEN,
229 CEPH_SESSION_OPEN,
230 CEPH_SESSION_REQUEST_CLOSE,
231 CEPH_SESSION_CLOSE,
232 CEPH_SESSION_REQUEST_RENEWCAPS,
233 CEPH_SESSION_RENEWCAPS,
234 CEPH_SESSION_STALE,
235 CEPH_SESSION_RECALL_STATE,
236};
237
238extern const char *ceph_session_op_name(int op);
239
240struct ceph_mds_session_head {
241 __le32 op;
242 __le64 seq;
243 struct ceph_timespec stamp;
244 __le32 max_caps, max_leases;
245} __attribute__ ((packed));
246
247/* client_request */
248/*
249 * metadata ops.
250 * & 0x001000 -> write op
251 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
 252 * & 0x100000 -> use weird ino/path trace
253 */
254#define CEPH_MDS_OP_WRITE 0x001000
255enum {
256 CEPH_MDS_OP_LOOKUP = 0x00100,
257 CEPH_MDS_OP_GETATTR = 0x00101,
258 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
259 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
260
261 CEPH_MDS_OP_SETXATTR = 0x01105,
262 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108,
265
266 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202,
268 CEPH_MDS_OP_UNLINK = 0x01203,
269 CEPH_MDS_OP_RENAME = 0x01204,
270 CEPH_MDS_OP_MKDIR = 0x01220,
271 CEPH_MDS_OP_RMDIR = 0x01221,
272 CEPH_MDS_OP_SYMLINK = 0x01222,
273
274 CEPH_MDS_OP_CREATE = 0x01301,
275 CEPH_MDS_OP_OPEN = 0x00302,
276 CEPH_MDS_OP_READDIR = 0x00305,
277
278 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
279 CEPH_MDS_OP_MKSNAP = 0x01400,
280 CEPH_MDS_OP_RMSNAP = 0x01401,
281 CEPH_MDS_OP_LSSNAP = 0x00402,
282};
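/*
 * For example, CEPH_MDS_OP_MKDIR (0x01220) has the 0x001000 bit set
 * and is therefore a write op, while CEPH_MDS_OP_LOOKUP (0x00100)
 * does not:
 *
 *   (CEPH_MDS_OP_MKDIR & CEPH_MDS_OP_WRITE) != 0
 *   (CEPH_MDS_OP_LOOKUP & CEPH_MDS_OP_WRITE) == 0
 */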
283
284extern const char *ceph_mds_op_name(int op);
285
286
287#define CEPH_SETATTR_MODE 1
288#define CEPH_SETATTR_UID 2
289#define CEPH_SETATTR_GID 4
290#define CEPH_SETATTR_MTIME 8
291#define CEPH_SETATTR_ATIME 16
292#define CEPH_SETATTR_SIZE 32
293#define CEPH_SETATTR_CTIME 64
294
295union ceph_mds_request_args {
296 struct {
297 __le32 mask; /* CEPH_CAP_* */
298 } __attribute__ ((packed)) getattr;
299 struct {
300 __le32 mode;
301 __le32 uid;
302 __le32 gid;
303 struct ceph_timespec mtime;
304 struct ceph_timespec atime;
305 __le64 size, old_size; /* old_size needed by truncate */
306 __le32 mask; /* CEPH_SETATTR_* */
307 } __attribute__ ((packed)) setattr;
308 struct {
309 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */
311 } __attribute__ ((packed)) readdir;
312 struct {
313 __le32 mode;
314 __le32 rdev;
315 } __attribute__ ((packed)) mknod;
316 struct {
317 __le32 mode;
318 } __attribute__ ((packed)) mkdir;
319 struct {
320 __le32 flags;
321 __le32 mode;
322 __le32 stripe_unit; /* layout for newly created file */
323 __le32 stripe_count; /* ... */
324 __le32 object_size;
325 __le32 file_replication;
326 __le32 preferred;
327 } __attribute__ ((packed)) open;
328 struct {
329 __le32 flags;
330 } __attribute__ ((packed)) setxattr;
331 struct {
332 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout;
334} __attribute__ ((packed));
335
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
337#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
338
339struct ceph_mds_request_head {
340 __le64 oldest_client_tid;
341 __le32 mdsmap_epoch; /* on client */
342 __le32 flags; /* CEPH_MDS_FLAG_* */
343 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
 344 __le16 num_releases; /* # of included cap/lease release records */
345 __le32 op; /* mds op code */
346 __le32 caller_uid, caller_gid;
347 __le64 ino; /* use this ino for openc, mkdir, mknod,
348 etc. (if replaying) */
349 union ceph_mds_request_args args;
350} __attribute__ ((packed));
351
352/* cap/lease release record */
353struct ceph_mds_request_release {
354 __le64 ino, cap_id; /* ino and unique cap id */
355 __le32 caps, wanted; /* new issued, wanted */
356 __le32 seq, issue_seq, mseq;
357 __le32 dname_seq; /* if releasing a dentry lease, a */
358 __le32 dname_len; /* string follows. */
359} __attribute__ ((packed));
360
361/* client reply */
362struct ceph_mds_reply_head {
363 __le32 op;
364 __le32 result;
365 __le32 mdsmap_epoch;
366 __u8 safe; /* true if committed to disk */
367 __u8 is_dentry, is_target; /* true if dentry, target inode records
368 are included with reply */
369} __attribute__ ((packed));
370
371/* one for each node split */
372struct ceph_frag_tree_split {
373 __le32 frag; /* this frag splits... */
374 __le32 by; /* ...by this many bits */
375} __attribute__ ((packed));
376
377struct ceph_frag_tree_head {
378 __le32 nsplits; /* num ceph_frag_tree_split records */
379 struct ceph_frag_tree_split splits[];
380} __attribute__ ((packed));
381
382/* capability issue, for bundling with mds reply */
383struct ceph_mds_reply_cap {
384 __le32 caps, wanted; /* caps issued, wanted */
385 __le64 cap_id;
386 __le32 seq, mseq;
387 __le64 realm; /* snap realm */
388 __u8 flags; /* CEPH_CAP_FLAG_* */
389} __attribute__ ((packed));
390
391#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
392
393/* inode record, for bundling with mds reply */
394struct ceph_mds_reply_inode {
395 __le64 ino;
396 __le64 snapid;
397 __le32 rdev;
398 __le64 version; /* inode version */
399 __le64 xattr_version; /* version for xattr blob */
400 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
401 struct ceph_file_layout layout;
402 struct ceph_timespec ctime, mtime, atime;
403 __le32 time_warp_seq;
404 __le64 size, max_size, truncate_size;
405 __le32 truncate_seq;
406 __le32 mode, uid, gid;
407 __le32 nlink;
408 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
409 struct ceph_timespec rctime;
410 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
411} __attribute__ ((packed));
412/* followed by frag array, then symlink string, then xattr blob */
413
414/* reply_lease follows dname, and reply_inode */
415struct ceph_mds_reply_lease {
416 __le16 mask; /* lease type(s) */
417 __le32 duration_ms; /* lease duration */
418 __le32 seq;
419} __attribute__ ((packed));
420
421struct ceph_mds_reply_dirfrag {
422 __le32 frag; /* fragment */
423 __le32 auth; /* auth mds, if this is a delegation point */
424 __le32 ndist; /* number of mds' this is replicated on */
425 __le32 dist[];
426} __attribute__ ((packed));
427
428/* file access modes */
429#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1
431#define CEPH_FILE_MODE_WR 2
432#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
433#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
 434#define CEPH_FILE_MODE_NUM 8 /* because these are bit fields, mostly */
435
436int ceph_flags_to_mode(int flags);
437
438
439/* capability bits */
440#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
441
442/* generic cap bits */
 443#define CEPH_CAP_GSHARED 1 /* client can read */
444#define CEPH_CAP_GEXCL 2 /* client can read and update */
445#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
446#define CEPH_CAP_GRD 8 /* (file) client can read */
447#define CEPH_CAP_GWR 16 /* (file) client can write */
448#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
449#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
450#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
451
452/* per-lock shift */
453#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
457
458#define CEPH_CAP_BITS 16
459
460/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
462#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
463#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
464#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
465#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
466#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
467#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
468#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
469#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
470#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
471#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
472#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
476
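/*
 * Worked example of the composition above: CEPH_CAP_FILE_WR is
 * CEPH_CAP_GWR << CEPH_CAP_SFILE = 16 << 8 = 0x1000, and
 * CEPH_CAP_AUTH_SHARED is 1 << 2 = 0x4, so a cap word of 0x1004
 * means "may write file data and read the uid/gid/mode".
 */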
477/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
479#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
480#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
481#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
482#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
483#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
484#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
485#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
486#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
487#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
488#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
489#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
490#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
491 CEPH_CAP_AUTH_SHARED | \
492 CEPH_CAP_LINK_SHARED | \
493 CEPH_CAP_FILE_SHARED | \
494 CEPH_CAP_XATTR_SHARED)
495
496#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
497 CEPH_CAP_LINK_SHARED | \
498 CEPH_CAP_XATTR_SHARED | \
499 CEPH_CAP_FILE_SHARED)
500#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
501 CEPH_CAP_FILE_CACHE)
502
503#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
504 CEPH_CAP_LINK_EXCL | \
505 CEPH_CAP_XATTR_EXCL | \
506 CEPH_CAP_FILE_EXCL)
507#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
508 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
512
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR)
515
516int ceph_caps_for_mode(int mode);
517
518enum {
519 CEPH_CAP_OP_GRANT, /* mds->client grant */
520 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
521 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
522 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
523 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
524 CEPH_CAP_OP_UPDATE, /* client->mds update */
525 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
526 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
527 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
528 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
529 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
530 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
531 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
532};
533
534extern const char *ceph_cap_op_name(int op);
535
536/*
537 * caps message, used for capability callbacks, acks, requests, etc.
538 */
539struct ceph_mds_caps {
540 __le32 op; /* CEPH_CAP_OP_* */
541 __le64 ino, realm;
542 __le64 cap_id;
543 __le32 seq, issue_seq;
544 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
545 __le32 migrate_seq;
546 __le64 snap_follows;
547 __le32 snap_trace_len;
548
549 /* authlock */
550 __le32 uid, gid, mode;
551
552 /* linklock */
553 __le32 nlink;
554
555 /* xattrlock */
556 __le32 xattr_len;
557 __le64 xattr_version;
558
559 /* filelock */
560 __le64 size, max_size, truncate_size;
561 __le32 truncate_seq;
562 struct ceph_timespec mtime, atime, ctime;
563 struct ceph_file_layout layout;
564 __le32 time_warp_seq;
565} __attribute__ ((packed));
566
567/* cap release msg head */
568struct ceph_mds_cap_release {
569 __le32 num; /* number of cap_items that follow */
570} __attribute__ ((packed));
571
572struct ceph_mds_cap_item {
573 __le64 ino;
574 __le64 cap_id;
575 __le32 migrate_seq, seq;
576} __attribute__ ((packed));
577
578#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
579#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
580#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
581#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
582
583extern const char *ceph_lease_op_name(int o);
584
585/* lease msg header */
586struct ceph_mds_lease {
587 __u8 action; /* CEPH_MDS_LEASE_* */
588 __le16 mask; /* which lease */
589 __le64 ino;
590 __le64 first, last; /* snap range */
591 __le32 seq;
592 __le32 duration_ms; /* duration of renewal */
593} __attribute__ ((packed));
594/* followed by a __le32+string for dname */
595
596/* client reconnect */
597struct ceph_mds_cap_reconnect {
598 __le64 cap_id;
599 __le32 wanted;
600 __le32 issued;
601 __le64 size;
602 struct ceph_timespec mtime, atime;
603 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed));
606/* followed by encoded string */
607
608struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */
610 __le64 seq; /* snap seq for this snap realm */
611 __le64 parent; /* parent realm */
612} __attribute__ ((packed));
613
614/*
615 * snaps
616 */
617enum {
618 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
619 CEPH_SNAP_OP_CREATE,
620 CEPH_SNAP_OP_DESTROY,
621 CEPH_SNAP_OP_SPLIT,
622};
623
624extern const char *ceph_snap_op_name(int o);
625
626/* snap msg header */
627struct ceph_mds_snap_head {
628 __le32 op; /* CEPH_SNAP_OP_* */
629 __le64 split; /* ino to split off, if any */
630 __le32 num_split_inos; /* # inos belonging to new child realm */
 631 __le32 num_split_realms; /* # child realms under new child realm */
632 __le32 trace_len; /* size of snap trace blob */
633} __attribute__ ((packed));
634/* followed by split ino list, then split realms, then the trace blob */
635
636/*
637 * encode info about a snaprealm, as viewed by a client
638 */
639struct ceph_mds_snap_realm {
640 __le64 ino; /* ino */
641 __le64 created; /* snap: when created */
642 __le64 parent; /* ino: parent realm */
643 __le64 parent_since; /* snap: same parent since */
644 __le64 seq; /* snap: version */
645 __le32 num_snaps;
646 __le32 num_prior_parent_snaps;
647} __attribute__ ((packed));
648/* followed by my snap list, then prior parent snap list */
649
650#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include "types.h"
3
4/*
 5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
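/*
 * Hypothetical caller, for illustration only: a dentry name would be
 * hashed with whichever algorithm the server chose for the directory,
 * e.g.
 *
 *	unsigned h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, len);
 */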
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68
69const char *ceph_mds_state_name(int s)
70{
71 switch (s) {
72 /* down and out */
73 case CEPH_MDS_STATE_DNE: return "down:dne";
74 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
75 /* up and out */
76 case CEPH_MDS_STATE_BOOT: return "up:boot";
77 case CEPH_MDS_STATE_STANDBY: return "up:standby";
78 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
79 case CEPH_MDS_STATE_CREATING: return "up:creating";
80 case CEPH_MDS_STATE_STARTING: return "up:starting";
81 /* up and in */
82 case CEPH_MDS_STATE_REPLAY: return "up:replay";
83 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
84 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
85 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
86 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
87 case CEPH_MDS_STATE_ACTIVE: return "up:active";
88 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
89 }
90 return "???";
91}
92
93const char *ceph_session_op_name(int op)
94{
95 switch (op) {
96 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
97 case CEPH_SESSION_OPEN: return "open";
98 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
99 case CEPH_SESSION_CLOSE: return "close";
100 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
101 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
102 case CEPH_SESSION_STALE: return "stale";
103 case CEPH_SESSION_RECALL_STATE: return "recall_state";
104 }
105 return "???";
106}
107
108const char *ceph_mds_op_name(int op)
109{
110 switch (op) {
111 case CEPH_MDS_OP_LOOKUP: return "lookup";
112 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
113 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
114 case CEPH_MDS_OP_GETATTR: return "getattr";
115 case CEPH_MDS_OP_SETXATTR: return "setxattr";
116 case CEPH_MDS_OP_SETATTR: return "setattr";
117 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
118 case CEPH_MDS_OP_READDIR: return "readdir";
119 case CEPH_MDS_OP_MKNOD: return "mknod";
120 case CEPH_MDS_OP_LINK: return "link";
121 case CEPH_MDS_OP_UNLINK: return "unlink";
122 case CEPH_MDS_OP_RENAME: return "rename";
123 case CEPH_MDS_OP_MKDIR: return "mkdir";
124 case CEPH_MDS_OP_RMDIR: return "rmdir";
125 case CEPH_MDS_OP_SYMLINK: return "symlink";
126 case CEPH_MDS_OP_CREATE: return "create";
127 case CEPH_MDS_OP_OPEN: return "open";
128 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
129 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
132 }
133 return "???";
134}
135
136const char *ceph_cap_op_name(int op)
137{
138 switch (op) {
139 case CEPH_CAP_OP_GRANT: return "grant";
140 case CEPH_CAP_OP_REVOKE: return "revoke";
141 case CEPH_CAP_OP_TRUNC: return "trunc";
142 case CEPH_CAP_OP_EXPORT: return "export";
143 case CEPH_CAP_OP_IMPORT: return "import";
144 case CEPH_CAP_OP_UPDATE: return "update";
145 case CEPH_CAP_OP_DROP: return "drop";
146 case CEPH_CAP_OP_FLUSH: return "flush";
147 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
148 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
149 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
150 case CEPH_CAP_OP_RELEASE: return "release";
151 case CEPH_CAP_OP_RENEW: return "renew";
152 }
153 return "???";
154}
155
156const char *ceph_lease_op_name(int o)
157{
158 switch (o) {
159 case CEPH_MDS_LEASE_REVOKE: return "revoke";
160 case CEPH_MDS_LEASE_RELEASE: return "release";
161 case CEPH_MDS_LEASE_RENEW: return "renew";
162 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
163 }
164 return "???";
165}
166
167const char *ceph_snap_op_name(int o)
168{
169 switch (o) {
170 case CEPH_SNAP_OP_UPDATE: return "update";
171 case CEPH_SNAP_OP_CREATE: return "create";
172 case CEPH_SNAP_OP_DESTROY: return "destroy";
173 case CEPH_SNAP_OP_SPLIT: return "split";
174 }
175 return "???";
176}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
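		/*
		 * tree buckets keep item weights at the odd indices of
		 * node_weights[]; even indices are interior nodes.
		 */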
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
 29 * mapped to devices. A rule consists of a sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
 112 * cached random permutation: used for uniform buckets and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers to identify the parent bucket a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets.
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
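
To exercise the dispatch above from user space, the file compiles standalone in most environments, since the user-space linux/types.h already supplies __u32; the build line below is an assumption, not part of the patch. The sketch hashes the same (x, bucket id, r) triple the mapper feeds it:

/* Sketch: cc -o demo demo.c hash.c (linux/types.h supplies __u32 in user space). */
#include <stdio.h>
#include <linux/types.h>
#include "hash.h"

int main(void)
{
	__u32 h = crush_hash32_3(CRUSH_HASH_RJENKINS1, 12345, (__u32)-2, 0);

	printf("hash = %u, slot = %u of 4\n", h, h % 4);
	return 0;
}
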
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..9ba54efb6543
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
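
The swap loop in bucket_perm_choose is a lazily evaluated Fisher-Yates shuffle keyed by the hash: position p is swapped with a hash-chosen entry from [p, size), and perm_n records how far the shuffle has progressed. For comparison, an eager version of the same shuffle might look like this (a sketch; hashfn stands in for crush_hash32_3 with the bucket's hash type and id bound):

#include <stdint.h>

/* Eager counterpart of the incremental shuffle above.  hashfn is any
 * deterministic 32-bit hash of (x, position); the kernel code uses
 * crush_hash32_3(bucket->hash, x, bucket->id, p). */
static void perm_shuffle(uint32_t *perm, unsigned size, uint32_t x,
			 uint32_t (*hashfn)(uint32_t, uint32_t))
{
	unsigned p, i;

	for (p = 0; p < size; p++)
		perm[p] = p;
	for (p = 0; p + 1 < size; p++) {
		i = hashfn(x, p) % (size - p);
		if (i) {
			uint32_t t = perm[p + i];

			perm[p + i] = perm[p];
			perm[p] = t;
		}
	}
}
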
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx\n",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
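
These helpers assume an in-order node numbering in which leaves occupy the odd indices and a node's height equals its count of trailing zero bits; the worked layout below (our reading of the scheme, for num_nodes = 8) shows how the descent lands on items[n >> 1]:

/* Worked layout for num_nodes = 8 (illustrative):
 *
 *              4            root = num_nodes >> 1, height(4) == 2
 *            /   \
 *           2     6         height 1; left(4) == 2, right(4) == 6
 *          / \   / \
 *         1   3 5   7       leaves: terminal(n) is true for odd n,
 *                           and leaf n maps to items[n >> 1], i.e. items[0..3]
 */
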
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" of the cluster
263 * (failed, fully offloaded)
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
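
Because weights are 16-bit fixed point, the hash test above keeps a device with probability weight/0x10000 for any given input x: a weight of 0x8000 passes roughly half of all inputs, 0x10000 always passes (first branch), and 0 always rejects. This is what lets a partially offloaded device shed a proportional fraction of its placements.
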
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
309
310 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */
312 ftotal = 0;
313 skip_rep = 0;
314 do {
315 retry_descent = 0;
316 in = bucket; /* initial bucket */
317
318 /* choose through intervening buckets */
319 flocal = 0;
320 do {
321 collide = 0;
322 retry_bucket = 0;
323 r = rep;
324 if (in->alg == CRUSH_BUCKET_UNIFORM) {
325 /* be careful */
326 if (firstn || numrep >= in->size)
327 /* r' = r + f_total */
328 r += ftotal;
329 else if (in->size % numrep == 0)
330 /* r'=r+(n+1)*f_local */
331 r += (numrep+1) *
332 (flocal+ftotal);
333 else
334 /* r' = r + n*f_local */
335 r += numrep * (flocal+ftotal);
336 } else {
337 if (firstn)
338 /* r' = r + f_total */
339 r += ftotal;
340 else
341 /* r' = r + n*f_local */
342 r += numrep * (flocal+ftotal);
343 }
344
345 /* bucket choose */
346 if (in->size == 0) {
347 reject = 1;
348 goto reject;
349 }
350 if (flocal >= (in->size>>1) &&
351 flocal > orig_tries)
352 item = bucket_perm_choose(in, x, r);
353 else
354 item = crush_bucket_choose(in, x, r);
355 BUG_ON(item >= map->max_devices);
356
357 /* desired type? */
358 if (item < 0)
359 itemtype = map->buckets[-1-item]->type;
360 else
361 itemtype = 0;
362 dprintk(" item %d type %d\n", item, itemtype);
363
364 /* keep going? */
365 if (itemtype != type) {
366 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item];
369 continue;
370 }
371
372 /* collision? */
373 for (i = 0; i < outpos; i++) {
374 if (out[i] == item) {
375 collide = 1;
376 break;
377 }
378 }
379
380 if (recurse_to_leaf &&
381 item < 0 &&
382 crush_choose(map, map->buckets[-1-item],
383 weight,
384 x, outpos+1, 0,
385 out2, outpos,
386 firstn, 0, NULL) <= outpos) {
387 reject = 1;
388 } else {
389 /* out? */
390 if (itemtype == 0)
391 reject = is_out(map, weight,
392 item, x);
393 else
394 reject = 0;
395 }
396
397reject:
398 if (reject || collide) {
399 ftotal++;
400 flocal++;
401
402 if (collide && flocal < 3)
403 /* retry locally a few times */
404 retry_bucket = 1;
405 else if (flocal < in->size + orig_tries)
406 /* exhaustive bucket search */
407 retry_bucket = 1;
408 else if (ftotal < 20)
409 /* then retry descent */
410 retry_descent = 1;
411 else
412 /* else give up */
413 skip_rep = 1;
414 dprintk(" reject %d collide %d "
415 "ftotal %d flocal %d\n",
416 reject, collide, ftotal,
417 flocal);
418 }
419 } while (retry_bucket);
420 } while (retry_descent);
421
422 if (skip_rep) {
423 dprintk("skip rep\n");
424 continue;
425 }
426
427 dprintk("choose got %d\n", item);
428 out[outpos] = item;
429 outpos++;
430 }
431
432 dprintk("choose returns %d\n", outpos);
433 return outpos;
434}
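
To make the r' arithmetic concrete: in firstn mode, replica slot rep = 1 after two total failures (ftotal = 2) retries with r' = 1 + 2 = 3, so later slots simply skip past rejected items; in indep mode with numrep = 3, flocal = 1 and ftotal = 2, the same slot retries with r' = 1 + 3 * (1 + 2) = 10, spacing the retry values of different slots so they do not collide on the same replacement.
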
435
436
437/**
438 * crush_do_rule - calculate a mapping with the given input and rule
439 * @map: the crush_map
440 * @ruleno: the rule id
441 * @x: hash input
442 * @result: pointer to result vector
443 * @result_max: maximum result size
444 * @force: force initial replica choice; -1 for none
445 */
446int crush_do_rule(struct crush_map *map,
447 int ruleno, int x, int *result, int result_max,
448 int force, __u32 *weight)
449{
450 int result_len;
451 int force_context[CRUSH_MAX_DEPTH];
452 int force_pos = -1;
453 int a[CRUSH_MAX_SET];
454 int b[CRUSH_MAX_SET];
455 int c[CRUSH_MAX_SET];
456 int recurse_to_leaf;
457 int *w;
458 int wsize = 0;
459 int *o;
460 int osize;
461 int *tmp;
462 struct crush_rule *rule;
463 int step;
464 int i, j;
465 int numrep;
466 int firstn;
467 int rc = -1;
468
469 BUG_ON(ruleno >= map->max_rules);
470
471 rule = map->rules[ruleno];
472 result_len = 0;
473 w = a;
474 o = b;
475
476 /*
477 * determine hierarchical context of force, if any. note
478 * that this may or may not correspond to the specific types
479 * referenced by the crush rule.
480 */
481 if (force >= 0) {
482 if (force >= map->max_devices ||
483 map->device_parents[force] == 0) {
484 /*dprintk("CRUSH: forcefed device dne\n");*/
485 rc = -1; /* force fed device dne */
486 goto out;
487 }
488 if (!is_out(map, weight, force, x)) {
489 while (1) {
490 force_context[++force_pos] = force;
491 if (force >= 0)
492 force = map->device_parents[force];
493 else
494 force = map->bucket_parents[-1-force];
495 if (force == 0)
496 break;
497 }
498 }
499 }
500
501 for (step = 0; step < rule->len; step++) {
502 firstn = 0;
503 switch (rule->steps[step].op) {
504 case CRUSH_RULE_TAKE:
505 w[0] = rule->steps[step].arg1;
506 if (force_pos >= 0) {
507 BUG_ON(force_context[force_pos] != w[0]);
508 force_pos--;
509 }
510 wsize = 1;
511 break;
512
513 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
514 case CRUSH_RULE_CHOOSE_FIRSTN:
515 firstn = 1; /* fall through */
516 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
517 case CRUSH_RULE_CHOOSE_INDEP:
518 BUG_ON(wsize == 0);
519
520 recurse_to_leaf =
521 rule->steps[step].op ==
522 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
523 rule->steps[step].op ==
524 CRUSH_RULE_CHOOSE_LEAF_INDEP;
525
526 /* reset output */
527 osize = 0;
528
529 for (i = 0; i < wsize; i++) {
530 /*
531 * see CRUSH_N, CRUSH_N_MINUS macros.
532 * basically, numrep <= 0 means relative to
533 * the provided result_max
534 */
535 numrep = rule->steps[step].arg1;
536 if (numrep <= 0) {
537 numrep += result_max;
538 if (numrep <= 0)
539 continue;
540 }
541 j = 0;
542 if (osize == 0 && force_pos >= 0) {
543 /* skip any intermediate types */
544 while (force_pos &&
545 force_context[force_pos] < 0 &&
546 rule->steps[step].arg2 !=
547 map->buckets[-1 -
548 force_context[force_pos]]->type)
549 force_pos--;
550 o[osize] = force_context[force_pos];
551 if (recurse_to_leaf)
552 c[osize] = force_context[0];
553 j++;
554 force_pos--;
555 }
556 osize += crush_choose(map,
557 map->buckets[-1-w[i]],
558 weight,
559 x, numrep,
560 rule->steps[step].arg2,
561 o+osize, j,
562 firstn,
563 recurse_to_leaf, c+osize);
564 }
565
566 if (recurse_to_leaf)
567 /* copy final _leaf_ values to output set */
568 memcpy(o, c, osize*sizeof(*o));
569
570 /* swap o and w arrays */
571 tmp = o;
572 o = w;
573 w = tmp;
574 wsize = osize;
575 break;
576
577
578 case CRUSH_RULE_EMIT:
579 for (i = 0; i < wsize && result_len < result_max; i++) {
580 result[result_len] = w[i];
581 result_len++;
582 }
583 wsize = 0;
584 break;
585
586 default:
587 BUG_ON(1);
588 }
589 }
590 rc = result_len;
591
592out:
593 return rc;
594}
595
596
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
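
Taken together, the two entry points give the usual call sequence: look up a rule for a (ruleset, type, size) triple, then run it. A user-space sketch (the map construction and the weights vector, 16-bit fixed point with 0x10000 meaning fully in, are assumed to exist elsewhere):

#include <stdio.h>
#include "mapper.h"

/* Sketch only: 'map' is a fully populated crush_map and 'weights' has
 * one 16-bit fixed-point entry per device. */
static void map_object(struct crush_map *map, __u32 *weights, int x)
{
	int result[8];
	int ruleno, n, i;

	ruleno = crush_find_rule(map, 0 /* ruleset */, 1 /* type */, 2 /* size */);
	if (ruleno < 0)
		return;

	n = crush_do_rule(map, ruleno, x, result, 2, -1 /* no forcefeed */, weights);
	for (i = 0; i < n; i++)
		printf("replica %d -> osd%d\n", i, result[i]);
}
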
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..f704b3b62424
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,409 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
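
The blen = inlen * 3 / 4 sizing above is the standard base64 ratio: four armor characters decode to at most three bytes, so, for example, a 56-character armored key needs at most 42 bytes of buffer; ceph_unarmor then reports the exact decoded length (or a negative error).
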
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78const u8 *aes_iv = "cephsageyudagreg";
79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
81 const void *src, size_t src_len)
82{
83 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
85 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
86 int ret;
87 void *iv;
88 int ivsize;
89 size_t zero_padding = (0x10 - (src_len & 0x0f));
90 char pad[16];
91
92 if (IS_ERR(tfm))
93 return PTR_ERR(tfm);
94
95 memset(pad, zero_padding, zero_padding);
96
97 *dst_len = src_len + zero_padding;
98
99 crypto_blkcipher_setkey((void *)tfm, key, key_len);
100 sg_init_table(sg_in, 2);
101 sg_set_buf(&sg_in[0], src, src_len);
102 sg_set_buf(&sg_in[1], pad, zero_padding);
103 sg_init_table(sg_out, 1);
104 sg_set_buf(sg_out, dst, *dst_len);
105 iv = crypto_blkcipher_crt(tfm)->iv;
106 ivsize = crypto_blkcipher_ivsize(tfm);
107
108 memcpy(iv, aes_iv, ivsize);
109 /*
110 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
111 key, key_len, 1);
112 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
113 src, src_len, 1);
114 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
115 pad, zero_padding, 1);
116 */
117 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
118 src_len + zero_padding);
119 crypto_free_blkcipher(tfm);
120 if (ret < 0)
121 pr_err("ceph_aes_encrypt failed %d\n", ret);
122 /*
123 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
124 dst, *dst_len, 1);
125 */
126 return ret;
127}
128
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
130 const void *src1, size_t src1_len,
131 const void *src2, size_t src2_len)
132{
133 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
135 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
136 int ret;
137 void *iv;
138 int ivsize;
139 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
140 char pad[16];
141
142 if (IS_ERR(tfm))
143 return PTR_ERR(tfm);
144
145 memset(pad, zero_padding, zero_padding);
146
147 *dst_len = src1_len + src2_len + zero_padding;
148
149 crypto_blkcipher_setkey((void *)tfm, key, key_len);
150 sg_init_table(sg_in, 3);
151 sg_set_buf(&sg_in[0], src1, src1_len);
152 sg_set_buf(&sg_in[1], src2, src2_len);
153 sg_set_buf(&sg_in[2], pad, zero_padding);
154 sg_init_table(sg_out, 1);
155 sg_set_buf(sg_out, dst, *dst_len);
156 iv = crypto_blkcipher_crt(tfm)->iv;
157 ivsize = crypto_blkcipher_ivsize(tfm);
158
159 memcpy(iv, aes_iv, ivsize);
160 /*
161 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
162 key, key_len, 1);
163 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
164 src1, src1_len, 1);
165 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
166 src2, src2_len, 1);
167 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
168 pad, zero_padding, 1);
169 */
170 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
171 src1_len + src2_len + zero_padding);
172 crypto_free_blkcipher(tfm);
173 if (ret < 0)
174 pr_err("ceph_aes_encrypt2 failed %d\n", ret);
175 /*
176 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
177 dst, *dst_len, 1);
178 */
179 return ret;
180}
181
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
183 const void *src, size_t src_len)
184{
185 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
187 struct blkcipher_desc desc = { .tfm = tfm };
188 char pad[16];
189 void *iv;
190 int ivsize;
191 int ret;
192 int last_byte;
193
194 if (IS_ERR(tfm))
195 return PTR_ERR(tfm);
196
197 crypto_blkcipher_setkey((void *)tfm, key, key_len);
198 sg_init_table(sg_in, 1);
199 sg_init_table(sg_out, 2);
200 sg_set_buf(sg_in, src, src_len);
201 sg_set_buf(&sg_out[0], dst, *dst_len);
202 sg_set_buf(&sg_out[1], pad, sizeof(pad));
203
204 iv = crypto_blkcipher_crt(tfm)->iv;
205 ivsize = crypto_blkcipher_ivsize(tfm);
206
207 memcpy(iv, aes_iv, ivsize);
208
209 /*
210 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
211 key, key_len, 1);
212 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
213 src, src_len, 1);
214 */
215
216 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
217 crypto_free_blkcipher(tfm);
218 if (ret < 0) {
219 pr_err("ceph_aes_decrypt failed %d\n", ret);
220 return ret;
221 }
222
223 if (src_len <= *dst_len)
224 last_byte = ((char *)dst)[src_len - 1];
225 else
226 last_byte = pad[src_len - *dst_len - 1];
227 if (last_byte <= 16 && src_len >= last_byte) {
228 *dst_len = src_len - last_byte;
229 } else {
230 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
231 last_byte, (int)src_len);
232 return -EPERM; /* bad padding */
233 }
234 /*
235 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
236 dst, *dst_len, 1);
237 */
238 return 0;
239}
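
The tail check mirrors the PKCS#7-style padding added by the encrypt side, where the pad byte value equals the pad length and 1..16 bytes are always appended: encrypting 13 bytes adds three 0x03 bytes to fill the AES block, so on decrypt last_byte = 3 and *dst_len becomes 16 - 3 = 13, while a 16-byte plaintext gains a full block of 0x10 bytes that is stripped the same way.
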
240
241int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len)
245{
246 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
248 struct blkcipher_desc desc = { .tfm = tfm };
249 char pad[16];
250 void *iv;
251 int ivsize;
252 int ret;
253 int last_byte;
254
255 if (IS_ERR(tfm))
256 return PTR_ERR(tfm);
257
258 sg_init_table(sg_in, 1);
259 sg_set_buf(sg_in, src, src_len);
260 sg_init_table(sg_out, 3);
261 sg_set_buf(&sg_out[0], dst1, *dst1_len);
262 sg_set_buf(&sg_out[1], dst2, *dst2_len);
263 sg_set_buf(&sg_out[2], pad, sizeof(pad));
264
265 crypto_blkcipher_setkey((void *)tfm, key, key_len);
266 iv = crypto_blkcipher_crt(tfm)->iv;
267 ivsize = crypto_blkcipher_ivsize(tfm);
268
269 memcpy(iv, aes_iv, ivsize);
270
271 /*
272 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
273 key, key_len, 1);
274 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
275 src, src_len, 1);
276 */
277
278 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
279 crypto_free_blkcipher(tfm);
280 if (ret < 0) {
281 pr_err("ceph_aes_decrypt2 failed %d\n", ret);
282 return ret;
283 }
284
285 if (src_len <= *dst1_len)
286 last_byte = ((char *)dst1)[src_len - 1];
287 else if (src_len <= *dst1_len + *dst2_len)
288 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
289 else
290 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
291 if (last_byte <= 16 && src_len >= last_byte) {
292 src_len -= last_byte;
293 } else {
294 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
295 last_byte, (int)src_len);
296 return -EPERM; /* bad padding */
297 }
298
299 if (src_len < *dst1_len) {
300 *dst1_len = src_len;
301 *dst2_len = 0;
302 } else {
303 *dst2_len = src_len - *dst1_len;
304 }
305 /*
306 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
307 dst1, *dst1_len, 1);
308 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
309 dst2, *dst2_len, 1);
310 */
311
312 return 0;
313}
314
315
316int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
317 const void *src, size_t src_len)
318{
319 switch (secret->type) {
320 case CEPH_CRYPTO_NONE:
321 if (*dst_len < src_len)
322 return -ERANGE;
323 memcpy(dst, src, src_len);
324 *dst_len = src_len;
325 return 0;
326
327 case CEPH_CRYPTO_AES:
328 return ceph_aes_decrypt(secret->key, secret->len, dst,
329 dst_len, src, src_len);
330
331 default:
332 return -EINVAL;
333 }
334}
335
336int ceph_decrypt2(struct ceph_crypto_key *secret,
337 void *dst1, size_t *dst1_len,
338 void *dst2, size_t *dst2_len,
339 const void *src, size_t src_len)
340{
341 size_t t;
342
343 switch (secret->type) {
344 case CEPH_CRYPTO_NONE:
345 if (*dst1_len + *dst2_len < src_len)
346 return -ERANGE;
347 t = min(*dst1_len, src_len);
348 memcpy(dst1, src, t);
349 *dst1_len = t;
350 src += t;
351 src_len -= t;
352 if (src_len) {
353 t = min(*dst2_len, src_len);
354 memcpy(dst2, src, t);
355 *dst2_len = t;
356 }
357 return 0;
358
359 case CEPH_CRYPTO_AES:
360 return ceph_aes_decrypt2(secret->key, secret->len,
361 dst1, dst1_len, dst2, dst2_len,
362 src, src_len);
363
364 default:
365 return -EINVAL;
366 }
367}
368
369int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
370 const void *src, size_t src_len)
371{
372 switch (secret->type) {
373 case CEPH_CRYPTO_NONE:
374 if (*dst_len < src_len)
375 return -ERANGE;
376 memcpy(dst, src, src_len);
377 *dst_len = src_len;
378 return 0;
379
380 case CEPH_CRYPTO_AES:
381 return ceph_aes_encrypt(secret->key, secret->len, dst,
382 dst_len, src, src_len);
383
384 default:
385 return -EINVAL;
386 }
387}
388
389int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
390 const void *src1, size_t src1_len,
391 const void *src2, size_t src2_len)
392{
393 switch (secret->type) {
394 case CEPH_CRYPTO_NONE:
395 if (*dst_len < src1_len + src2_len)
396 return -ERANGE;
397 memcpy(dst, src1, src1_len);
398 memcpy(dst + src1_len, src2, src2_len);
399 *dst_len = src1_len + src2_len;
400 return 0;
401
402 case CEPH_CRYPTO_AES:
403 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
404 src1, src1_len, src2, src2_len);
405
406 default:
407 return -EINVAL;
408 }
409}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..f7048da92acc
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,484 @@
1#include "ceph_debug.h"
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53
54static int mdsmap_show(struct seq_file *s, void *p)
55{
56 int i;
57 struct ceph_client *client = s->private;
58
59 if (client->mdsc.mdsmap == NULL)
60 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state;
71
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state));
74 }
75 return 0;
76}
77
78static int osdmap_show(struct seq_file *s, void *p)
79{
80 int i;
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node);
131 seq_printf(s, "%lld statfs\n", req->tid);
132 }
133
134 mutex_unlock(&monc->mutex);
135 return 0;
136}
137
138static int mdsc_show(struct seq_file *s, void *p)
139{
140 struct ceph_client *client = s->private;
141 struct ceph_mds_client *mdsc = &client->mdsc;
142 struct ceph_mds_request *req;
143 struct rb_node *rp;
144 int pathlen;
145 u64 pathbase;
146 char *path;
147
148 mutex_lock(&mdsc->mutex);
149 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
150 req = rb_entry(rp, struct ceph_mds_request, r_node);
151
152 if (req->r_request)
153 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
154 else
155 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
156
157 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
158
159 if (req->r_got_unsafe)
160 seq_printf(s, "\t(unsafe)");
161 else
162 seq_printf(s, "\t");
163
164 if (req->r_inode) {
165 seq_printf(s, " #%llx", ceph_ino(req->r_inode));
166 } else if (req->r_dentry) {
167 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
168 &pathbase, 0);
169 spin_lock(&req->r_dentry->d_lock);
170 seq_printf(s, " #%llx/%.*s (%s)",
171 ceph_ino(req->r_dentry->d_parent->d_inode),
172 req->r_dentry->d_name.len,
173 req->r_dentry->d_name.name,
174 path ? path : "");
175 spin_unlock(&req->r_dentry->d_lock);
176 kfree(path);
177 } else if (req->r_path1) {
178 seq_printf(s, " #%llx/%s", req->r_ino1.ino,
179 req->r_path1);
180 }
181
182 if (req->r_old_dentry) {
183 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
184 &pathbase, 0);
185 spin_lock(&req->r_old_dentry->d_lock);
186 seq_printf(s, " #%llx/%.*s (%s)",
187 ceph_ino(req->r_old_dentry->d_parent->d_inode),
188 req->r_old_dentry->d_name.len,
189 req->r_old_dentry->d_name.name,
190 path ? path : "");
191 spin_unlock(&req->r_old_dentry->d_lock);
192 kfree(path);
193 } else if (req->r_path2) {
194 if (req->r_ino2.ino)
195 seq_printf(s, " #%llx/%s", req->r_ino2.ino,
196 req->r_path2);
197 else
198 seq_printf(s, " %s", req->r_path2);
199 }
200
201 seq_printf(s, "\n");
202 }
203 mutex_unlock(&mdsc->mutex);
204
205 return 0;
206}
207
208static int osdc_show(struct seq_file *s, void *pp)
209{
210 struct ceph_client *client = s->private;
211 struct ceph_osd_client *osdc = &client->osdc;
212 struct rb_node *p;
213
214 mutex_lock(&osdc->request_mutex);
215 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
216 struct ceph_osd_request *req;
217 struct ceph_osd_request_head *head;
218 struct ceph_osd_op *op;
219 int num_ops;
220 int opcode, olen;
221 int i;
222
223 req = rb_entry(p, struct ceph_osd_request, r_node);
224
225 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
226 req->r_osd ? req->r_osd->o_osd : -1,
227 le32_to_cpu(req->r_pgid.pool),
228 le16_to_cpu(req->r_pgid.ps));
229
230 head = req->r_request->front.iov_base;
231 op = (void *)(head + 1);
232
233 num_ops = le16_to_cpu(head->num_ops);
234 olen = le32_to_cpu(head->object_len);
235 seq_printf(s, "%.*s", olen,
236 (const char *)(head->ops + num_ops));
237
238 if (req->r_reassert_version.epoch)
239 seq_printf(s, "\t%u'%llu",
240 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
241 le64_to_cpu(req->r_reassert_version.version));
242 else
243 seq_printf(s, "\t");
244
245 for (i = 0; i < num_ops; i++) {
246 opcode = le16_to_cpu(op->op);
247 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
248 op++;
249 }
250
251 seq_printf(s, "\n");
252 }
253 mutex_unlock(&osdc->request_mutex);
254 return 0;
255}
256
257static int caps_show(struct seq_file *s, void *p)
258{
259 struct ceph_client *client = s->private;
260 int total, avail, used, reserved, min;
261
262 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
263 seq_printf(s, "total\t\t%d\n"
264 "avail\t\t%d\n"
265 "used\t\t%d\n"
266 "reserved\t%d\n"
267 "min\t\t%d\n",
268 total, avail, used, reserved, min);
269 return 0;
270}
271
272static int dentry_lru_show(struct seq_file *s, void *ptr)
273{
274 struct ceph_client *client = s->private;
275 struct ceph_mds_client *mdsc = &client->mdsc;
276 struct ceph_dentry_info *di;
277
278 spin_lock(&mdsc->dentry_lru_lock);
279 list_for_each_entry(di, &mdsc->dentry_lru, lru) {
280 struct dentry *dentry = di->dentry;
281 seq_printf(s, "%p %p\t%.*s\n",
282 di, dentry, dentry->d_name.len, dentry->d_name.name);
283 }
284 spin_unlock(&mdsc->dentry_lru_lock);
285
286 return 0;
287}
288
289#define DEFINE_SHOW_FUNC(name) \
290static int name##_open(struct inode *inode, struct file *file) \
291{ \
292 struct seq_file *sf; \
293 int ret; \
294 \
295 ret = single_open(file, name, NULL); \
296 sf = file->private_data; \
297 sf->private = inode->i_private; \
298 return ret; \
299} \
300 \
301static const struct file_operations name##_fops = { \
302 .open = name##_open, \
303 .read = seq_read, \
304 .llseek = seq_lseek, \
305 .release = single_release, \
306};
307
308DEFINE_SHOW_FUNC(monmap_show)
309DEFINE_SHOW_FUNC(mdsmap_show)
310DEFINE_SHOW_FUNC(osdmap_show)
311DEFINE_SHOW_FUNC(monc_show)
312DEFINE_SHOW_FUNC(mdsc_show)
313DEFINE_SHOW_FUNC(osdc_show)
314DEFINE_SHOW_FUNC(dentry_lru_show)
315DEFINE_SHOW_FUNC(caps_show)
316
317static int congestion_kb_set(void *data, u64 val)
318{
319 struct ceph_client *client = (struct ceph_client *)data;
320
321 if (client)
322 client->mount_args->congestion_kb = (int)val;
323
324 return 0;
325}
326
327static int congestion_kb_get(void *data, u64 *val)
328{
329 struct ceph_client *client = (struct ceph_client *)data;
330
331 if (client)
332 *val = (u64)client->mount_args->congestion_kb;
333
334 return 0;
335}
336
337
338DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
339 congestion_kb_set, "%llu\n");
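
Assuming debugfs is mounted in its usual /sys/kernel/debug location, the resulting attribute can be inspected and tuned at runtime: reading /sys/kernel/debug/ceph/<fsid>.client<id>/writeback_congestion_kb shows the current threshold, and writing a value such as 16384 (16 MB) to the same file adjusts it on a live mount.
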
340
341int __init ceph_debugfs_init(void)
342{
343 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
344 if (!ceph_debugfs_dir)
345 return -ENOMEM;
346 return 0;
347}
348
349void ceph_debugfs_cleanup(void)
350{
351 debugfs_remove(ceph_debugfs_dir);
352}
353
354int ceph_debugfs_client_init(struct ceph_client *client)
355{
356 int ret = -ENOMEM;
357 char name[80];
358
359 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
360 PR_FSID(&client->fsid), client->monc.auth->global_id);
361
362 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
363 if (!client->debugfs_dir)
364 goto out;
365
366 client->monc.debugfs_file = debugfs_create_file("monc",
367 0600,
368 client->debugfs_dir,
369 client,
370 &monc_show_fops);
371 if (!client->monc.debugfs_file)
372 goto out;
373
374 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
375 0600,
376 client->debugfs_dir,
377 client,
378 &mdsc_show_fops);
379 if (!client->mdsc.debugfs_file)
380 goto out;
381
382 client->osdc.debugfs_file = debugfs_create_file("osdc",
383 0600,
384 client->debugfs_dir,
385 client,
386 &osdc_show_fops);
387 if (!client->osdc.debugfs_file)
388 goto out;
389
390 client->debugfs_monmap = debugfs_create_file("monmap",
391 0600,
392 client->debugfs_dir,
393 client,
394 &monmap_show_fops);
395 if (!client->debugfs_monmap)
396 goto out;
397
398 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
399 0600,
400 client->debugfs_dir,
401 client,
402 &mdsmap_show_fops);
403 if (!client->debugfs_mdsmap)
404 goto out;
405
406 client->debugfs_osdmap = debugfs_create_file("osdmap",
407 0600,
408 client->debugfs_dir,
409 client,
410 &osdmap_show_fops);
411 if (!client->debugfs_osdmap)
412 goto out;
413
414 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
415 0600,
416 client->debugfs_dir,
417 client,
418 &dentry_lru_show_fops);
419 if (!client->debugfs_dentry_lru)
420 goto out;
421
422 client->debugfs_caps = debugfs_create_file("caps",
423 0400,
424 client->debugfs_dir,
425 client,
426 &caps_show_fops);
427 if (!client->debugfs_caps)
428 goto out;
429
430 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
431 0600,
432 client->debugfs_dir,
433 client,
434 &congestion_kb_fops);
435 if (!client->debugfs_congestion_kb)
436 goto out;
437
438 snprintf(name, sizeof(name), "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
439 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
440 name);
441
442 return 0;
443
444out:
445 ceph_debugfs_client_cleanup(client);
446 return ret;
447}
448
449void ceph_debugfs_client_cleanup(struct ceph_client *client)
450{
451 debugfs_remove(client->debugfs_bdi);
452 debugfs_remove(client->debugfs_caps);
453 debugfs_remove(client->debugfs_dentry_lru);
454 debugfs_remove(client->debugfs_osdmap);
455 debugfs_remove(client->debugfs_mdsmap);
456 debugfs_remove(client->debugfs_monmap);
457 debugfs_remove(client->osdc.debugfs_file);
458 debugfs_remove(client->mdsc.debugfs_file);
459 debugfs_remove(client->monc.debugfs_file);
460 debugfs_remove(client->debugfs_congestion_kb);
461 debugfs_remove(client->debugfs_dir);
462}
463
464#else /* CONFIG_DEBUG_FS */
465
466int __init ceph_debugfs_init(void)
467{
468 return 0;
469}
470
471void ceph_debugfs_cleanup(void)
472{
473}
474
475int ceph_debugfs_client_init(struct ceph_client *client)
476{
477 return 0;
478}
479
480void ceph_debugfs_client_cleanup(struct ceph_client *client)
481{
482}
483
484#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
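
The _safe variants bundle the bounds check with the read and jump to a caller-supplied label on a short buffer; ceph_crypto_key_decode in crypto.c follows exactly this shape. A minimal sketch of the pattern (the function and its field layout are hypothetical):

/* Sketch: decode a u32 count followed by that many payload bytes. */
static int demo_decode(void **p, void *end, void *buf, u32 buflen)
{
	u32 n;

	ceph_decode_32_safe(p, end, n, bad);
	if (n > buflen)
		goto bad;
	ceph_decode_copy_safe(p, end, buf, n, bad);
	return n;

bad:
	return -ERANGE;
}
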
80
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family);
103}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
107 WARN_ON(a->in_addr.ss_family == 512); /* 512 == AF_INET byte-swapped: family was already in host order */
108}
109
110/*
111 * encoders
112 */
113static inline void ceph_encode_64(void **p, u64 v)
114{
115 put_unaligned_le64(v, (__le64 *)*p);
116 *p += sizeof(u64);
117}
118static inline void ceph_encode_32(void **p, u32 v)
119{
120 put_unaligned_le32(v, (__le32 *)*p);
121 *p += sizeof(u32);
122}
123static inline void ceph_encode_16(void **p, u16 v)
124{
125 put_unaligned_le16(v, (__le16 *)*p);
126 *p += sizeof(u16);
127}
128static inline void ceph_encode_8(void **p, u8 v)
129{
130 *(u8 *)*p = v;
131 (*p)++;
132}
133static inline void ceph_encode_copy(void **p, const void *s, int len)
134{
135 memcpy(*p, s, len);
136 *p += len;
137}
138
139/*
140 * filepath, string encoders
141 */
142static inline void ceph_encode_filepath(void **p, void *end,
143 u64 ino, const char *path)
144{
145 u32 len = path ? strlen(path) : 0;
146 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
147 ceph_encode_8(p, 1);
148 ceph_encode_64(p, ino);
149 ceph_encode_32(p, len);
150 if (len)
151 memcpy(*p, path, len);
152 *p += len;
153}
154
155static inline void ceph_encode_string(void **p, void *end,
156 const char *s, u32 len)
157{
158 BUG_ON(*p + sizeof(len) + len > end);
159 ceph_encode_32(p, len);
160 if (len)
161 memcpy(*p, s, len);
162 *p += len;
163}
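
Unlike the _safe decode macros, these encoders BUG on overflow rather than branching, so the caller is expected to have sized the buffer up front. A sketch of that contract (hypothetical helper):

/* Sketch: the buffer must hold a 4-byte length prefix plus the payload. */
static void demo_encode(void *buf, size_t buflen, const char *s)
{
	void *p = buf;
	void *end = buf + buflen;	/* caller sized this as 4 + strlen(s) or more */

	ceph_encode_string(&p, end, s, strlen(s));
}
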
164
165#define ceph_encode_need(p, end, n, bad) \
166 do { \
167 if (unlikely(*(p) + (n) > (end))) \
168 goto bad; \
169 } while (0)
170
171#define ceph_encode_64_safe(p, end, v, bad) \
172 do { \
173 ceph_encode_need(p, end, sizeof(u64), bad); \
174 ceph_encode_64(p, v); \
175 } while (0)
176#define ceph_encode_32_safe(p, end, v, bad) \
177 do { \
178 ceph_encode_need(p, end, sizeof(u32), bad); \
179 ceph_encode_32(p, v); \
180 } while (0)
181#define ceph_encode_16_safe(p, end, v, bad) \
182 do { \
183 ceph_encode_need(p, end, sizeof(u16), bad); \
184 ceph_encode_16(p, v); \
185 } while (0)
186
187#define ceph_encode_copy_safe(p, end, pv, n, bad) \
188 do { \
189 ceph_encode_need(p, end, n, bad); \
190 ceph_encode_copy(p, pv, n); \
191 } while (0)
192
193
194#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..650d2db5ed26
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1233 @@
1#include "ceph_debug.h"
2
3#include <linux/spinlock.h>
4#include <linux/fs_struct.h>
5#include <linux/namei.h>
6#include <linux/slab.h>
7#include <linux/sched.h>
8
9#include "super.h"
10
11/*
12 * Directory operations: readdir, lookup, create, link, unlink,
13 * rename, etc.
14 */
15
16/*
17 * Ceph MDS operations are specified in terms of a base ino and
18 * relative path. Thus, the client can specify an operation on a
19 * specific inode (e.g., a getattr due to fstat(2)), or as a path
20 * relative to, say, the root directory.
21 *
22 * Normally, we limit ourselves to strict inode ops (no path component)
23 * or dentry operations (a single path component relative to an ino). The
24 * exception to this is open_root_dentry(), which will open the mount
25 * point by name.
26 */
27
28const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops;
31
32/*
33 * Initialize ceph dentry state.
34 */
35int ceph_init_dentry(struct dentry *dentry)
36{
37 struct ceph_dentry_info *di;
38
39 if (dentry->d_fsdata)
40 return 0;
41
42 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
43 dentry->d_op = &ceph_dentry_ops;
44 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
45 dentry->d_op = &ceph_snapdir_dentry_ops;
46 else
47 dentry->d_op = &ceph_snap_dentry_ops;
48
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
50 if (!di)
51 return -ENOMEM; /* oh well */
52
53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */
55 goto out_unlock;
56 di->dentry = dentry;
57 di->lease_session = NULL;
58 dentry->d_fsdata = di;
59 dentry->d_time = jiffies;
60 ceph_dentry_lru_add(dentry);
61out_unlock:
62 spin_unlock(&dentry->d_lock);
63 return 0;
64}
65
66
67
68/*
69 * for readdir, we encode the directory frag and offset within that
70 * frag into f_pos.
71 */
72static unsigned fpos_frag(loff_t p)
73{
74 return p >> 32;
75}
76static unsigned fpos_off(loff_t p)
77{
78 return p & 0xffffffff;
79}
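
These two helpers invert the packing done by ceph_make_fpos (defined elsewhere in this series): the upper 32 bits of f_pos carry the fragment, the lower 32 bits the offset within it. A sketch of the round trip, with the packing half repeated here only for illustration:

/* Illustrative twin of ceph_make_fpos: pack frag into the high half. */
static inline loff_t make_fpos(unsigned frag, unsigned off)
{
	return ((loff_t)frag << 32) | (loff_t)off;
}

/* For any 32-bit frag f and offset o:
 *   fpos_frag(make_fpos(f, o)) == f  and  fpos_off(make_fpos(f, o)) == o
 */
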
80
81/*
82 * When possible, we try to satisfy a readdir by peeking at the
83 * dcache. We make this work by carefully ordering dentries on
84 * d_u.d_child when we initially get results back from the MDS, and
85 * falling back to a "normal" sync readdir if any dentries in the dir
86 * are dropped.
87 *
88 * I_COMPLETE indicates we have all dentries in the dir. It is
89 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
90 * the MDS if/when the directory is modified).
91 */
92static int __dcache_readdir(struct file *filp,
93 void *dirent, filldir_t filldir)
94{
95 struct inode *inode = filp->f_dentry->d_inode;
96 struct ceph_file_info *fi = filp->private_data;
97 struct dentry *parent = filp->f_dentry;
98 struct inode *dir = parent->d_inode;
99 struct list_head *p;
100 struct dentry *dentry, *last;
101 struct ceph_dentry_info *di;
102 int err = 0;
103
104 /* claim ref on last dentry we returned */
105 last = fi->dentry;
106 fi->dentry = NULL;
107
108 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
109 last);
110
111 spin_lock(&dcache_lock);
112
113 /* start at beginning? */
114 if (filp->f_pos == 2 || (last &&
115 filp->f_pos < ceph_dentry(last)->offset)) {
116 if (list_empty(&parent->d_subdirs))
117 goto out_unlock;
118 p = parent->d_subdirs.prev;
119 dout(" initial p %p/%p\n", p->prev, p->next);
120 } else {
121 p = last->d_u.d_child.prev;
122 }
123
124more:
125 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry);
127 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
129 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) {
131 fi->at_end = 1;
132 goto out_unlock;
133 }
134 if (!d_unhashed(dentry) && dentry->d_inode &&
135 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
136 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
137 filp->f_pos <= di->offset)
138 break;
139 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
140 dentry->d_name.len, dentry->d_name.name, di->offset,
141 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
142 !dentry->d_inode ? " null" : "");
143 p = p->prev;
144 dentry = list_entry(p, struct dentry, d_u.d_child);
145 di = ceph_dentry(dentry);
146 }
147
148 atomic_inc(&dentry->d_count);
149 spin_unlock(&dcache_lock);
150 spin_unlock(&inode->i_lock);
151
152 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
153 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
154 filp->f_pos = di->offset;
155 err = filldir(dirent, dentry->d_name.name,
156 dentry->d_name.len, di->offset,
157 dentry->d_inode->i_ino,
158 dentry->d_inode->i_mode >> 12);
159
160 if (last) {
161 if (err < 0) {
162 /* remember our position */
163 fi->dentry = last;
164 fi->next_offset = di->offset;
165 } else {
166 dput(last);
167 }
168 last = NULL;
169 }
170
171 spin_lock(&inode->i_lock);
172 spin_lock(&dcache_lock);
173
174 last = dentry;
175
176 if (err < 0)
177 goto out_unlock;
178
179 p = p->prev;
180 filp->f_pos++;
181
182 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
183 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
184 goto more;
185 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
186 err = -EAGAIN;
187
188out_unlock:
189 spin_unlock(&dcache_lock);
190
191 if (last) {
192 spin_unlock(&inode->i_lock);
193 dput(last);
194 spin_lock(&inode->i_lock);
195 }
196
197 return err;
198}
199
200/*
201 * make note of the last dentry we read, so we can
202 * continue at the same lexicographical point,
203 * regardless of what dir changes take place on the
204 * server.
205 */
206static int note_last_dentry(struct ceph_file_info *fi, const char *name,
207 int len)
208{
209 kfree(fi->last_name);
210 fi->last_name = kmalloc(len+1, GFP_NOFS);
211 if (!fi->last_name)
212 return -ENOMEM;
213 memcpy(fi->last_name, name, len);
214 fi->last_name[len] = 0;
215 dout("note_last_dentry '%s'\n", fi->last_name);
216 return 0;
217}
218
219static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
220{
221 struct ceph_file_info *fi = filp->private_data;
222 struct inode *inode = filp->f_dentry->d_inode;
223 struct ceph_inode_info *ci = ceph_inode(inode);
224 struct ceph_client *client = ceph_inode_to_client(inode);
225 struct ceph_mds_client *mdsc = &client->mdsc;
226 unsigned frag = fpos_frag(filp->f_pos);
227 int off = fpos_off(filp->f_pos);
228 int err;
229 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir;
232
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end)
235 return 0;
236
237 /* always start with . and .. */
238 if (filp->f_pos == 0) {
239 /* note dir version at start of readdir so we can tell
240 * if any dentries get dropped */
241 fi->dir_release_count = ci->i_release_count;
242
243 dout("readdir off 0 -> '.'\n");
244 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
245 inode->i_ino, inode->i_mode >> 12) < 0)
246 return 0;
247 filp->f_pos = 1;
248 off = 1;
249 }
250 if (filp->f_pos == 1) {
251 dout("readdir off 1 -> '..'\n");
252 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
253 filp->f_dentry->d_parent->d_inode->i_ino,
254 inode->i_mode >> 12) < 0)
255 return 0;
256 filp->f_pos = 2;
257 off = 2;
258 }
259
260 /* can we use the dcache? */
261 spin_lock(&inode->i_lock);
262 if ((filp->f_pos == 2 || fi->dentry) &&
263 !ceph_test_opt(client, NOASYNCREADDIR) &&
264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 err = __dcache_readdir(filp, dirent, filldir);
267 if (err != -EAGAIN) {
268 spin_unlock(&inode->i_lock);
269 return err;
270 }
271 }
272 spin_unlock(&inode->i_lock);
273 if (fi->dentry) {
274 err = note_last_dentry(fi, fi->dentry->d_name.name,
275 fi->dentry->d_name.len);
276 if (err)
277 return err;
278 dput(fi->dentry);
279 fi->dentry = NULL;
280 }
281
282 /* proceed with a normal readdir */
283
284more:
285 /* do we have the correct frag content buffered? */
286 if (fi->frag != frag || fi->last_readdir == NULL) {
287 struct ceph_mds_request *req;
288 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
289 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
290
291 /* discard old result, if any */
292 if (fi->last_readdir) {
293 ceph_mdsc_put_request(fi->last_readdir);
294 fi->last_readdir = NULL;
295 }
296
297 /* requery frag tree, as the frag topology may have changed */
298 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
299
300 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
301 ceph_vinop(inode), frag, fi->last_name);
302 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
303 if (IS_ERR(req))
304 return PTR_ERR(req);
305 req->r_inode = igrab(inode);
306 req->r_dentry = dget(filp->f_dentry);
307 /* hints to request -> mds selection code */
308 req->r_direct_mode = USE_AUTH_MDS;
309 req->r_direct_hash = ceph_frag_value(frag);
310 req->r_direct_is_hash = true;
311 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
312 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
315 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) {
318 ceph_mdsc_put_request(req);
319 return err;
320 }
321 dout("readdir got and parsed readdir result=%d"
322 " on frag %x, end=%d, complete=%d\n", err, frag,
323 (int)req->r_reply_info.dir_end,
324 (int)req->r_reply_info.dir_complete);
325
326 if (!req->r_did_prepopulate) {
327			dout("readdir !did_prepopulate\n");
328 fi->dir_release_count--; /* preclude I_COMPLETE */
329 }
330
331 /* note next offset and last dentry name */
332 fi->offset = fi->next_offset;
333 fi->last_readdir = req;
334
335 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name);
337 fi->last_name = NULL;
338 fi->next_offset = 0;
339 } else {
340 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi,
342 rinfo->dir_dname[rinfo->dir_nr-1],
343 rinfo->dir_dname_len[rinfo->dir_nr-1]);
344 if (err)
345 return err;
346 fi->next_offset += rinfo->dir_nr;
347 }
348 }
349
350 rinfo = &fi->last_readdir->r_reply_info;
351 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
352 rinfo->dir_nr, off, fi->offset);
353 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
354 u64 pos = ceph_make_fpos(frag, off);
355 struct ceph_mds_reply_inode *in =
356 rinfo->dir_in[off - fi->offset].in;
357 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
358 off, off - fi->offset, rinfo->dir_nr, pos,
359 rinfo->dir_dname_len[off - fi->offset],
360 rinfo->dir_dname[off - fi->offset], in);
361 BUG_ON(!in);
362 ftype = le32_to_cpu(in->mode) >> 12;
363 if (filldir(dirent,
364 rinfo->dir_dname[off - fi->offset],
365 rinfo->dir_dname_len[off - fi->offset],
366 pos,
367 le64_to_cpu(in->ino),
368 ftype) < 0) {
369 dout("filldir stopping us...\n");
370 return 0;
371 }
372 off++;
373 filp->f_pos = pos + 1;
374 }
375
376 if (fi->last_name) {
377 ceph_mdsc_put_request(fi->last_readdir);
378 fi->last_readdir = NULL;
379 goto more;
380 }
381
382 /* more frags? */
383 if (!ceph_frag_is_rightmost(frag)) {
384 frag = ceph_frag_next(frag);
385 off = 0;
386 filp->f_pos = ceph_make_fpos(frag, off);
387 dout("readdir next frag is %x\n", frag);
388 goto more;
389 }
390 fi->at_end = 1;
391
392 /*
393 * if dir_release_count still matches the dir, no dentries
394 * were released during the whole readdir, and we should have
395 * the complete dir contents in our cache.
396 */
397 spin_lock(&inode->i_lock);
398 if (ci->i_release_count == fi->dir_release_count) {
399 dout(" marking %p complete\n", inode);
400 ci->i_ceph_flags |= CEPH_I_COMPLETE;
401 ci->i_max_offset = filp->f_pos;
402 }
403 spin_unlock(&inode->i_lock);
404
405 dout("readdir %p filp %p done.\n", inode, filp);
406 return 0;
407}
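/*
 * Sketch of the fpos encoding used above (conceptual; the real
 * helpers live in the ceph headers): a directory position packs the
 * fragment into the high 32 bits and the offset within that fragment
 * into the low 32 bits, roughly
 *
 *	fpos = ((loff_t)frag << 32) | (loff_t)off;
 *	frag = fpos >> 32;
 *	off  = fpos & 0xffffffff;
 *
 * which is why ceph_dir_llseek() below can detect a fragment change
 * by comparing fpos_frag() of the old and new positions.
 */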
408
409static void reset_readdir(struct ceph_file_info *fi)
410{
411 if (fi->last_readdir) {
412 ceph_mdsc_put_request(fi->last_readdir);
413 fi->last_readdir = NULL;
414 }
415	kfree(fi->last_name);
	fi->last_name = NULL;
416 fi->next_offset = 2; /* compensate for . and .. */
417 if (fi->dentry) {
418 dput(fi->dentry);
419 fi->dentry = NULL;
420 }
421 fi->at_end = 0;
422}
423
424static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
425{
426 struct ceph_file_info *fi = file->private_data;
427 struct inode *inode = file->f_mapping->host;
428	loff_t old_offset = file->f_pos;
429 loff_t retval;
430
431 mutex_lock(&inode->i_mutex);
432 switch (origin) {
433 case SEEK_END:
434 offset += inode->i_size + 2; /* FIXME */
435 break;
436 case SEEK_CUR:
437 offset += file->f_pos;
438 }
439 retval = -EINVAL;
440 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
441 if (offset != file->f_pos) {
442 file->f_pos = offset;
443 file->f_version = 0;
444 fi->at_end = 0;
445 }
446 retval = offset;
447
448 /*
449 * discard buffered readdir content on seekdir(0), or
450 * seek to new frag, or seek prior to current chunk.
451 */
452 if (offset == 0 ||
453 fpos_frag(offset) != fpos_frag(old_offset) ||
454 fpos_off(offset) < fi->offset) {
455 dout("dir_llseek dropping %p content\n", file);
456 reset_readdir(fi);
457 }
458
459		/* preclude marking the dir complete if we seeked forward */
460 if (offset > old_offset)
461 fi->dir_release_count--;
462 }
463 mutex_unlock(&inode->i_mutex);
464 return retval;
465}
466
467/*
468 * Process result of a lookup/open request.
469 *
470 * Mainly, make sure we return the final req->r_dentry (if it already
471 * existed) in place of the original VFS-provided dentry when they
472 * differ.
473 *
474 * Gracefully handle the case where the MDS replies with -ENOENT and
475 * no trace (which it may do, at its discretion, e.g., if it doesn't
476 * care to issue a lease on the negative dentry).
477 */
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err)
480{
481 struct ceph_client *client = ceph_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode;
483
484 /* .snap dir? */
485 if (err == -ENOENT &&
486 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
487 strcmp(dentry->d_name.name,
488 client->mount_args->snapdir_name) == 0) {
489 struct inode *inode = ceph_get_snapdir(parent);
490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
492 BUG_ON(!d_unhashed(dentry));
493 d_add(dentry, inode);
494 err = 0;
495 }
496
497 if (err == -ENOENT) {
498 /* no trace? */
499 err = 0;
500 if (!req->r_reply_info.head->is_dentry) {
501 dout("ENOENT and no trace, dentry %p inode %p\n",
502 dentry, dentry->d_inode);
503 if (dentry->d_inode) {
504 d_drop(dentry);
505 err = -ENOENT;
506 } else {
507 d_add(dentry, NULL);
508 }
509 }
510 }
511 if (err)
512 dentry = ERR_PTR(err);
513 else if (dentry != req->r_dentry)
514 dentry = dget(req->r_dentry); /* we got spliced */
515 else
516 dentry = NULL;
517 return dentry;
518}
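/*
 * Hypothetical caller sketch (illustration only): callers treat a
 * non-NULL, non-error return as a replacement for their dentry:
 *
 *	struct dentry *d = ceph_finish_lookup(req, dentry, err);
 *	if (IS_ERR(d))
 *		return d;
 *	if (d)
 *		dentry = d;	(request was spliced to an existing alias)
 *
 * See ceph_lookup() below for the real usage.
 */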
519
520static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
521{
522 return ceph_ino(inode) == CEPH_INO_ROOT &&
523 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
524}
525
526/*
527 * Look up a single dir entry. If there is a lookup intent, inform
528 * the MDS so that it gets our 'caps wanted' value in a single op.
529 */
530static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
531 struct nameidata *nd)
532{
533 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
534 struct ceph_mds_client *mdsc = &client->mdsc;
535 struct ceph_mds_request *req;
536 int op;
537 int err;
538
539 dout("lookup %p dentry %p '%.*s'\n",
540 dir, dentry, dentry->d_name.len, dentry->d_name.name);
541
542 if (dentry->d_name.len > NAME_MAX)
543 return ERR_PTR(-ENAMETOOLONG);
544
545 err = ceph_init_dentry(dentry);
546 if (err < 0)
547 return ERR_PTR(err);
548
549 /* open (but not create!) intent? */
550 if (nd &&
551 (nd->flags & LOOKUP_OPEN) &&
552 (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
553 !(nd->intent.open.flags & O_CREAT)) {
554 int mode = nd->intent.open.create_mode & ~current->fs->umask;
555 return ceph_lookup_open(dir, dentry, nd, mode, 1);
556 }
557
558 /* can we conclude ENOENT locally? */
559 if (dentry->d_inode == NULL) {
560 struct ceph_inode_info *ci = ceph_inode(dir);
561 struct ceph_dentry_info *di = ceph_dentry(dentry);
562
563 spin_lock(&dir->i_lock);
564 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
565 if (strncmp(dentry->d_name.name,
566 client->mount_args->snapdir_name,
567 dentry->d_name.len) &&
568 !is_root_ceph_dentry(dir, dentry) &&
569 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
570 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
571 di->offset = ci->i_max_offset++;
572 spin_unlock(&dir->i_lock);
573 dout(" dir %p complete, -ENOENT\n", dir);
574 d_add(dentry, NULL);
575 di->lease_shared_gen = ci->i_shared_gen;
576 return NULL;
577 }
578 spin_unlock(&dir->i_lock);
579 }
580
581 op = ceph_snap(dir) == CEPH_SNAPDIR ?
582 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
583 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
584 if (IS_ERR(req))
585		return ERR_CAST(req);
586 req->r_dentry = dget(dentry);
587 req->r_num_caps = 2;
588 /* we only need inode linkage */
589 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
590 req->r_locked_dir = dir;
591 err = ceph_mdsc_do_request(mdsc, NULL, req);
592 dentry = ceph_finish_lookup(req, dentry, err);
593 ceph_mdsc_put_request(req); /* will dput(dentry) */
594 dout("lookup result=%p\n", dentry);
595 return dentry;
596}
597
598/*
599 * If we do a create but get no trace back from the MDS, follow up with
600 * a lookup (the VFS expects us to link up the provided dentry).
601 */
602int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
603{
604 struct dentry *result = ceph_lookup(dir, dentry, NULL);
605
606 if (result && !IS_ERR(result)) {
607 /*
608 * We created the item, then did a lookup, and found
609 * it was already linked to another inode we already
610 * had in our cache (and thus got spliced). Link our
611 * dentry to that inode, but don't hash it, just in
612 * case the VFS wants to dereference it.
613 */
614 BUG_ON(!result->d_inode);
615 d_instantiate(dentry, result->d_inode);
616 return 0;
617 }
618 return PTR_ERR(result);
619}
620
621static int ceph_mknod(struct inode *dir, struct dentry *dentry,
622 int mode, dev_t rdev)
623{
624 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
625 struct ceph_mds_client *mdsc = &client->mdsc;
626 struct ceph_mds_request *req;
627 int err;
628
629 if (ceph_snap(dir) != CEPH_NOSNAP)
630 return -EROFS;
631
632 dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
633 dir, dentry, mode, rdev);
634 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
635 if (IS_ERR(req)) {
636 d_drop(dentry);
637 return PTR_ERR(req);
638 }
639 req->r_dentry = dget(dentry);
640 req->r_num_caps = 2;
641 req->r_locked_dir = dir;
642 req->r_args.mknod.mode = cpu_to_le32(mode);
643 req->r_args.mknod.rdev = cpu_to_le32(rdev);
644 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
645 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
646 err = ceph_mdsc_do_request(mdsc, dir, req);
647 if (!err && !req->r_reply_info.head->is_dentry)
648 err = ceph_handle_notrace_create(dir, dentry);
649 ceph_mdsc_put_request(req);
650 if (err)
651 d_drop(dentry);
652 return err;
653}
654
655static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
656 struct nameidata *nd)
657{
658 dout("create in dir %p dentry %p name '%.*s'\n",
659 dir, dentry, dentry->d_name.len, dentry->d_name.name);
660
661 if (ceph_snap(dir) != CEPH_NOSNAP)
662 return -EROFS;
663
664 if (nd) {
665 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
666 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
667 /* hrm, what should i do here if we get aliased? */
668 if (IS_ERR(dentry))
669 return PTR_ERR(dentry);
670 return 0;
671 }
672
673 /* fall back to mknod */
674 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
675}
676
677static int ceph_symlink(struct inode *dir, struct dentry *dentry,
678 const char *dest)
679{
680 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
681 struct ceph_mds_client *mdsc = &client->mdsc;
682 struct ceph_mds_request *req;
683 int err;
684
685 if (ceph_snap(dir) != CEPH_NOSNAP)
686 return -EROFS;
687
688 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
689 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
690 if (IS_ERR(req)) {
691 d_drop(dentry);
692 return PTR_ERR(req);
693 }
694 req->r_dentry = dget(dentry);
695 req->r_num_caps = 2;
696 req->r_path2 = kstrdup(dest, GFP_NOFS);
697 req->r_locked_dir = dir;
698 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
699 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
700 err = ceph_mdsc_do_request(mdsc, dir, req);
701 if (!err && !req->r_reply_info.head->is_dentry)
702 err = ceph_handle_notrace_create(dir, dentry);
703 ceph_mdsc_put_request(req);
704 if (err)
705 d_drop(dentry);
706 return err;
707}
708
709static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
710{
711 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
712 struct ceph_mds_client *mdsc = &client->mdsc;
713 struct ceph_mds_request *req;
714 int err = -EROFS;
715 int op;
716
717 if (ceph_snap(dir) == CEPH_SNAPDIR) {
718 /* mkdir .snap/foo is a MKSNAP */
719 op = CEPH_MDS_OP_MKSNAP;
720 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
721 dentry->d_name.len, dentry->d_name.name, dentry);
722 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
723 dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
724 op = CEPH_MDS_OP_MKDIR;
725 } else {
726 goto out;
727 }
728 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
729 if (IS_ERR(req)) {
730 err = PTR_ERR(req);
731 goto out;
732 }
733
734 req->r_dentry = dget(dentry);
735 req->r_num_caps = 2;
736 req->r_locked_dir = dir;
737 req->r_args.mkdir.mode = cpu_to_le32(mode);
738 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
739 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
740 err = ceph_mdsc_do_request(mdsc, dir, req);
741 if (!err && !req->r_reply_info.head->is_dentry)
742 err = ceph_handle_notrace_create(dir, dentry);
743 ceph_mdsc_put_request(req);
744out:
745 if (err < 0)
746 d_drop(dentry);
747 return err;
748}
749
750static int ceph_link(struct dentry *old_dentry, struct inode *dir,
751 struct dentry *dentry)
752{
753 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
754 struct ceph_mds_client *mdsc = &client->mdsc;
755 struct ceph_mds_request *req;
756 int err;
757
758 if (ceph_snap(dir) != CEPH_NOSNAP)
759 return -EROFS;
760
761 dout("link in dir %p old_dentry %p dentry %p\n", dir,
762 old_dentry, dentry);
763 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
764 if (IS_ERR(req)) {
765 d_drop(dentry);
766 return PTR_ERR(req);
767 }
768 req->r_dentry = dget(dentry);
769 req->r_num_caps = 2;
770 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
771 req->r_locked_dir = dir;
772 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
773 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
774 err = ceph_mdsc_do_request(mdsc, dir, req);
775 if (err)
776 d_drop(dentry);
777 else if (!req->r_reply_info.head->is_dentry)
778 d_instantiate(dentry, igrab(old_dentry->d_inode));
779 ceph_mdsc_put_request(req);
780 return err;
781}
782
783/*
784 * For a soon-to-be unlinked file, drop the LINK caps.  If it
785 * looks like the link count will hit 0, drop any other caps (other
786 * than PIN) we don't specifically want (due to the file still being
787 * open).
788 */
789static int drop_caps_for_unlink(struct inode *inode)
790{
791 struct ceph_inode_info *ci = ceph_inode(inode);
792 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
793
794 spin_lock(&inode->i_lock);
795 if (inode->i_nlink == 1) {
796 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
797 ci->i_ceph_flags |= CEPH_I_NODELAY;
798 }
799 spin_unlock(&inode->i_lock);
800 return drop;
801}
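/*
 * Worked example (illustrative): if the file is still open for read,
 * __ceph_caps_wanted() might return FILE_RD|FILE_CACHE; with
 * i_nlink == 1 the mask above becomes LINK_SHARED|LINK_EXCL plus
 * every cap outside FILE_RD|FILE_CACHE|PIN, so only the caps the
 * open file actually needs are retained.
 */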
802
803/*
804 * rmdir and unlink differ only in the metadata op code
805 */
806static int ceph_unlink(struct inode *dir, struct dentry *dentry)
807{
808 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
809 struct ceph_mds_client *mdsc = &client->mdsc;
810 struct inode *inode = dentry->d_inode;
811 struct ceph_mds_request *req;
812 int err = -EROFS;
813 int op;
814
815 if (ceph_snap(dir) == CEPH_SNAPDIR) {
816 /* rmdir .snap/foo is RMSNAP */
817 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
818 dentry->d_name.name, dentry);
819 op = CEPH_MDS_OP_RMSNAP;
820 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
821 dout("unlink/rmdir dir %p dn %p inode %p\n",
822 dir, dentry, inode);
823 op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
824 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
825 } else
826 goto out;
827 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
828 if (IS_ERR(req)) {
829 err = PTR_ERR(req);
830 goto out;
831 }
832 req->r_dentry = dget(dentry);
833 req->r_num_caps = 2;
834 req->r_locked_dir = dir;
835 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
836 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
837 req->r_inode_drop = drop_caps_for_unlink(inode);
838 err = ceph_mdsc_do_request(mdsc, dir, req);
839 if (!err && !req->r_reply_info.head->is_dentry)
840 d_delete(dentry);
841 ceph_mdsc_put_request(req);
842out:
843 return err;
844}
845
846static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
847 struct inode *new_dir, struct dentry *new_dentry)
848{
849 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
850 struct ceph_mds_client *mdsc = &client->mdsc;
851 struct ceph_mds_request *req;
852 int err;
853
854 if (ceph_snap(old_dir) != ceph_snap(new_dir))
855 return -EXDEV;
856 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
857 ceph_snap(new_dir) != CEPH_NOSNAP)
858 return -EROFS;
859 dout("rename dir %p dentry %p to dir %p dentry %p\n",
860 old_dir, old_dentry, new_dir, new_dentry);
861 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
862 if (IS_ERR(req))
863 return PTR_ERR(req);
864 req->r_dentry = dget(new_dentry);
865 req->r_num_caps = 2;
866 req->r_old_dentry = dget(old_dentry);
867 req->r_locked_dir = new_dir;
868 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
869 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
870 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
871 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
872	/* release LINK_SHARED on source inode (mds will lock it) */
873 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
874 if (new_dentry->d_inode)
875 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
876 err = ceph_mdsc_do_request(mdsc, old_dir, req);
877 if (!err && !req->r_reply_info.head->is_dentry) {
878 /*
879 * Normally d_move() is done by fill_trace (called by
880 * do_request, above). If there is no trace, we need
881 * to do it here.
882 */
883
884 /* d_move screws up d_subdirs order */
885 ceph_i_clear(new_dir, CEPH_I_COMPLETE);
886
887 d_move(old_dentry, new_dentry);
888
889 /* ensure target dentry is invalidated, despite
890 rehashing bug in vfs_rename_dir */
891 new_dentry->d_time = jiffies;
892 ceph_dentry(new_dentry)->lease_shared_gen = 0;
893 }
894 ceph_mdsc_put_request(req);
895 return err;
896}
897
898
899/*
900 * Check if dentry lease is valid. If not, delete the lease. Try to
901 * renew if the lease is more than half up.
902 */
903static int dentry_lease_is_valid(struct dentry *dentry)
904{
905 struct ceph_dentry_info *di;
906 struct ceph_mds_session *s;
907 int valid = 0;
908 u32 gen;
909 unsigned long ttl;
910 struct ceph_mds_session *session = NULL;
911 struct inode *dir = NULL;
912 u32 seq = 0;
913
914 spin_lock(&dentry->d_lock);
915 di = ceph_dentry(dentry);
916 if (di && di->lease_session) {
917 s = di->lease_session;
918 spin_lock(&s->s_cap_lock);
919 gen = s->s_cap_gen;
920 ttl = s->s_cap_ttl;
921 spin_unlock(&s->s_cap_lock);
922
923 if (di->lease_gen == gen &&
924 time_before(jiffies, dentry->d_time) &&
925 time_before(jiffies, ttl)) {
926 valid = 1;
927 if (di->lease_renew_after &&
928 time_after(jiffies, di->lease_renew_after)) {
929 /* we should renew */
930 dir = dentry->d_parent->d_inode;
931 session = ceph_get_mds_session(s);
932 seq = di->lease_seq;
933 di->lease_renew_after = 0;
934 di->lease_renew_from = jiffies;
935 }
936 }
937 }
938 spin_unlock(&dentry->d_lock);
939
940 if (session) {
941 ceph_mdsc_lease_send_msg(session, dir, dentry,
942 CEPH_MDS_LEASE_RENEW, seq);
943 ceph_put_mds_session(session);
944 }
945 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
946 return valid;
947}
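/*
 * Timing example (illustrative numbers; the real durations come from
 * the MDS): for a 30s lease granted at time T, d_time is ~T+30s and
 * lease_renew_after ~T+15s.  A lookup between T+15s and T+30s still
 * validates locally but also fires an async CEPH_MDS_LEASE_RENEW,
 * per the "more than half up" rule above.
 */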
948
949/*
950 * Check if directory-wide content lease/cap is valid.
951 */
952static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
953{
954 struct ceph_inode_info *ci = ceph_inode(dir);
955 struct ceph_dentry_info *di = ceph_dentry(dentry);
956 int valid = 0;
957
958 spin_lock(&dir->i_lock);
959 if (ci->i_shared_gen == di->lease_shared_gen)
960 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
961 spin_unlock(&dir->i_lock);
962 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
963 dir, (unsigned)ci->i_shared_gen, dentry,
964 (unsigned)di->lease_shared_gen, valid);
965 return valid;
966}
967
968/*
969 * Check if cached dentry can be trusted.
970 */
971static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
972{
973 struct inode *dir = dentry->d_parent->d_inode;
974
975 dout("d_revalidate %p '%.*s' inode %p\n", dentry,
976 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
977
978 /* always trust cached snapped dentries, snapdir dentry */
979 if (ceph_snap(dir) != CEPH_NOSNAP) {
980 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
981 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
982 goto out_touch;
983 }
984 if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
985 goto out_touch;
986
987 if (dentry_lease_is_valid(dentry) ||
988 dir_lease_is_valid(dir, dentry))
989 goto out_touch;
990
991 dout("d_revalidate %p invalid\n", dentry);
992 d_drop(dentry);
993 return 0;
994out_touch:
995 ceph_dentry_lru_touch(dentry);
996 return 1;
997}
998
999/*
1000 * When a dentry is released, clear the dir I_COMPLETE if it was part
1001 * of the current dir gen.
1002 */
1003static void ceph_dentry_release(struct dentry *dentry)
1004{
1005 struct ceph_dentry_info *di = ceph_dentry(dentry);
1006 struct inode *parent_inode = dentry->d_parent->d_inode;
1007
1008 if (parent_inode) {
1009 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1010
1011 spin_lock(&parent_inode->i_lock);
1012 if (ci->i_shared_gen == di->lease_shared_gen) {
1013 dout(" clearing %p complete (d_release)\n",
1014 parent_inode);
1015 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1016 ci->i_release_count++;
1017 }
1018 spin_unlock(&parent_inode->i_lock);
1019 }
1020 if (di) {
1021 ceph_dentry_lru_del(dentry);
1022 if (di->lease_session)
1023 ceph_put_mds_session(di->lease_session);
1024 kmem_cache_free(ceph_dentry_cachep, di);
1025 dentry->d_fsdata = NULL;
1026 }
1027}
1028
1029static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1030 struct nameidata *nd)
1031{
1032 /*
1033 * Eventually, we'll want to revalidate snapped metadata
1034 * too... probably...
1035 */
1036 return 1;
1037}
1038
1039
1040
1041/*
1042 * read() on a dir. This weird interface hack only works if mounted
1043 * with '-o dirstat'.
1044 */
1045static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1046 loff_t *ppos)
1047{
1048 struct ceph_file_info *cf = file->private_data;
1049 struct inode *inode = file->f_dentry->d_inode;
1050 struct ceph_inode_info *ci = ceph_inode(inode);
1051 int left;
1052
1053 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
1054 return -EISDIR;
1055
1056 if (!cf->dir_info) {
1057 cf->dir_info = kmalloc(1024, GFP_NOFS);
1058 if (!cf->dir_info)
1059 return -ENOMEM;
1060 cf->dir_info_len =
1061 sprintf(cf->dir_info,
1062 "entries: %20lld\n"
1063 " files: %20lld\n"
1064 " subdirs: %20lld\n"
1065 "rentries: %20lld\n"
1066 " rfiles: %20lld\n"
1067 " rsubdirs: %20lld\n"
1068 "rbytes: %20lld\n"
1069 "rctime: %10ld.%09ld\n",
1070 ci->i_files + ci->i_subdirs,
1071 ci->i_files,
1072 ci->i_subdirs,
1073 ci->i_rfiles + ci->i_rsubdirs,
1074 ci->i_rfiles,
1075 ci->i_rsubdirs,
1076 ci->i_rbytes,
1077 (long)ci->i_rctime.tv_sec,
1078 (long)ci->i_rctime.tv_nsec);
1079 }
1080
1081 if (*ppos >= cf->dir_info_len)
1082 return 0;
1083 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1084 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1085 if (left == size)
1086 return -EFAULT;
1087 *ppos += (size - left);
1088 return size - left;
1089}
1090
1091/*
1092 * an fsync() on a dir will wait for any uncommitted directory
1093 * operations to commit.
1094 */
1095static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
1096 int datasync)
1097{
1098 struct inode *inode = dentry->d_inode;
1099 struct ceph_inode_info *ci = ceph_inode(inode);
1100 struct list_head *head = &ci->i_unsafe_dirops;
1101 struct ceph_mds_request *req;
1102 u64 last_tid;
1103 int ret = 0;
1104
1105 dout("dir_fsync %p\n", inode);
1106 spin_lock(&ci->i_unsafe_lock);
1107 if (list_empty(head))
1108 goto out;
1109
1110 req = list_entry(head->prev,
1111 struct ceph_mds_request, r_unsafe_dir_item);
1112 last_tid = req->r_tid;
1113
1114 do {
1115 ceph_mdsc_get_request(req);
1116 spin_unlock(&ci->i_unsafe_lock);
1117 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1118 inode, req->r_tid, last_tid);
1119 if (req->r_timeout) {
1120 ret = wait_for_completion_timeout(
1121 &req->r_safe_completion, req->r_timeout);
1122 if (ret > 0)
1123 ret = 0;
1124 else if (ret == 0)
1125 ret = -EIO; /* timed out */
1126 } else {
1127 wait_for_completion(&req->r_safe_completion);
1128 }
1129 spin_lock(&ci->i_unsafe_lock);
1130 ceph_mdsc_put_request(req);
1131
1132 if (ret || list_empty(head))
1133 break;
1134 req = list_entry(head->next,
1135 struct ceph_mds_request, r_unsafe_dir_item);
1136 } while (req->r_tid < last_tid);
1137out:
1138 spin_unlock(&ci->i_unsafe_lock);
1139 return ret;
1140}
1141
1142/*
1143 * We maintain a private dentry LRU.
1144 *
1145 * FIXME: this needs to be changed to a per-mds lru to be useful.
1146 */
1147void ceph_dentry_lru_add(struct dentry *dn)
1148{
1149 struct ceph_dentry_info *di = ceph_dentry(dn);
1150 struct ceph_mds_client *mdsc;
1151
1152 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1153 dn->d_name.len, dn->d_name.name);
1154 if (di) {
1155 mdsc = &ceph_client(dn->d_sb)->mdsc;
1156 spin_lock(&mdsc->dentry_lru_lock);
1157 list_add_tail(&di->lru, &mdsc->dentry_lru);
1158 mdsc->num_dentry++;
1159 spin_unlock(&mdsc->dentry_lru_lock);
1160 }
1161}
1162
1163void ceph_dentry_lru_touch(struct dentry *dn)
1164{
1165 struct ceph_dentry_info *di = ceph_dentry(dn);
1166 struct ceph_mds_client *mdsc;
1167
1168 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
1169 dn->d_name.len, dn->d_name.name);
1170 if (di) {
1171 mdsc = &ceph_client(dn->d_sb)->mdsc;
1172 spin_lock(&mdsc->dentry_lru_lock);
1173 list_move_tail(&di->lru, &mdsc->dentry_lru);
1174 spin_unlock(&mdsc->dentry_lru_lock);
1175 }
1176}
1177
1178void ceph_dentry_lru_del(struct dentry *dn)
1179{
1180 struct ceph_dentry_info *di = ceph_dentry(dn);
1181 struct ceph_mds_client *mdsc;
1182
1183 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1184 dn->d_name.len, dn->d_name.name);
1185 if (di) {
1186 mdsc = &ceph_client(dn->d_sb)->mdsc;
1187 spin_lock(&mdsc->dentry_lru_lock);
1188 list_del_init(&di->lru);
1189 mdsc->num_dentry--;
1190 spin_unlock(&mdsc->dentry_lru_lock);
1191 }
1192}
1193
1194const struct file_operations ceph_dir_fops = {
1195 .read = ceph_read_dir,
1196 .readdir = ceph_readdir,
1197 .llseek = ceph_dir_llseek,
1198 .open = ceph_open,
1199 .release = ceph_release,
1200 .unlocked_ioctl = ceph_ioctl,
1201 .fsync = ceph_dir_fsync,
1202};
1203
1204const struct inode_operations ceph_dir_iops = {
1205 .lookup = ceph_lookup,
1206 .permission = ceph_permission,
1207 .getattr = ceph_getattr,
1208 .setattr = ceph_setattr,
1209 .setxattr = ceph_setxattr,
1210 .getxattr = ceph_getxattr,
1211 .listxattr = ceph_listxattr,
1212 .removexattr = ceph_removexattr,
1213 .mknod = ceph_mknod,
1214 .symlink = ceph_symlink,
1215 .mkdir = ceph_mkdir,
1216 .link = ceph_link,
1217 .unlink = ceph_unlink,
1218 .rmdir = ceph_unlink,
1219 .rename = ceph_rename,
1220 .create = ceph_create,
1221};
1222
1223struct dentry_operations ceph_dentry_ops = {
1224 .d_revalidate = ceph_d_revalidate,
1225 .d_release = ceph_dentry_release,
1226};
1227
1228struct dentry_operations ceph_snapdir_dentry_ops = {
1229 .d_revalidate = ceph_snapdir_d_revalidate,
1230};
1231
1232struct dentry_operations ceph_snap_dentry_ops = {
1233};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..9d67572fb328
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,224 @@
1#include "ceph_debug.h"
2
3#include <linux/exportfs.h>
4#include <linux/slab.h>
5#include <asm/unaligned.h>
6
7#include "super.h"
8
9/*
10 * NFS export support
11 *
12 * NFS re-export of a ceph mount is, at present, only semireliable.
13 * The basic issue is that the Ceph architecture doesn't lend itself
14 * well to generating filehandles that will remain valid forever.
15 *
16 * So, we do our best. If you're lucky, your inode will be in the
17 * client's cache. If it's not, and you have a connectable fh, then
18 * the MDS server may be able to find it for you. Otherwise, you get
19 * ESTALE.
20 *
21 * There are ways to make this more reliable, but in the non-connectable
22 * fh case, it won't ever work perfectly, and in the connectable case,
23 * some changes are needed on the MDS side to work better.
24 */
25
26/*
27 * Basic fh
28 */
29struct ceph_nfs_fh {
30 u64 ino;
31} __attribute__ ((packed));
32
33/*
34 * Larger 'connectable' fh that includes parent ino and name hash.
35 * Use this whenever possible, as it works more reliably.
36 */
37struct ceph_nfs_confh {
38 u64 ino, parent_ino;
39 u32 parent_name_hash;
40} __attribute__ ((packed));
41
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable)
44{
45 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode;
49 int type;
50
51 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL;
54
55 if (*max_len >= sizeof(*cfh)) {
56 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh);
61 type = 2;
62	} else if (*max_len >= sizeof(*fh)) {
63 if (connectable)
64 return -ENOSPC;
65 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh);
68 type = 1;
69 } else {
70 return -ENOSPC;
71 }
72 return type;
73}
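/*
 * Sizing example, as the checks above are written: a buffer of at
 * least sizeof(struct ceph_nfs_confh) (20 bytes, packed) yields a
 * connectable type-2 handle; otherwise a buffer that fits
 * struct ceph_nfs_fh (8 bytes) yields a plain type-1 handle, and
 * anything smaller fails with -ENOSPC.
 */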
74
75/*
76 * convert regular fh to dentry
77 *
78 * FIXME: we should try harder by querying the mds for the ino.
79 */
80static struct dentry *__fh_to_dentry(struct super_block *sb,
81 struct ceph_nfs_fh *fh)
82{
83 struct inode *inode;
84 struct dentry *dentry;
85 struct ceph_vino vino;
86 int err;
87
88 dout("__fh_to_dentry %llx\n", fh->ino);
89 vino.ino = fh->ino;
90 vino.snap = CEPH_NOSNAP;
91 inode = ceph_find_inode(sb, vino);
92 if (!inode)
93 return ERR_PTR(-ESTALE);
94
95	dentry = d_obtain_alias(inode);
96	if (IS_ERR(dentry)) {
97		pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98		       fh->ino, inode);
99		/* d_obtain_alias() dropped the inode ref on failure */
100		return dentry;
101	}
102	err = ceph_init_dentry(dentry);
103
104	if (err < 0) {
105		dput(dentry);	/* also drops the inode reference */
106		return ERR_PTR(err);
107	}
108 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
109 return dentry;
110}
111
112/*
113 * convert connectable fh to dentry
114 */
115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh)
117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
119 struct inode *inode;
120 struct dentry *dentry;
121 struct ceph_vino vino;
122 int err;
123
124 dout("__cfh_to_dentry %llx (%llx/%x)\n",
125 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
126
127 vino.ino = cfh->ino;
128 vino.snap = CEPH_NOSNAP;
129 inode = ceph_find_inode(sb, vino);
130 if (!inode) {
131 struct ceph_mds_request *req;
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS);
135 if (IS_ERR(req))
136			return ERR_CAST(req);
137
138 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino;
140 req->r_ino2.snap = CEPH_NOSNAP;
141		req->r_path2 = kmalloc(16, GFP_NOFS);
		if (!req->r_path2) {
			ceph_mdsc_put_request(req);
			return ERR_PTR(-ENOMEM);
		}
142		snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
143 req->r_num_caps = 1;
144 err = ceph_mdsc_do_request(mdsc, NULL, req);
145 ceph_mdsc_put_request(req);
146 inode = ceph_find_inode(sb, vino);
147 if (!inode)
148 return ERR_PTR(err ? err : -ESTALE);
149 }
150
151	dentry = d_obtain_alias(inode);
152	if (IS_ERR(dentry)) {
153		pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154		       cfh->ino, inode);
155		/* d_obtain_alias() dropped the inode ref on failure */
156		return dentry;
157	}
158	err = ceph_init_dentry(dentry);
159	if (err < 0) {
160		dput(dentry);	/* also drops the inode reference */
161		return ERR_PTR(err);
162	}
163 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
164 return dentry;
165}
166
167static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
168 int fh_len, int fh_type)
169{
170 if (fh_type == 1)
171 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
172 else
173 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
174}
175
176/*
177 * get parent, if possible.
178 *
179 * FIXME: we could do better by querying the mds to discover the
180 * parent.
181 */
182static struct dentry *ceph_fh_to_parent(struct super_block *sb,
183 struct fid *fid,
184 int fh_len, int fh_type)
185{
186 struct ceph_nfs_confh *cfh = (void *)fid->raw;
187 struct ceph_vino vino;
188 struct inode *inode;
189 struct dentry *dentry;
190 int err;
191
192 if (fh_type == 1)
193 return ERR_PTR(-ESTALE);
194
195 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
196 cfh->parent_name_hash);
197
198	vino.ino = cfh->parent_ino;
199 vino.snap = CEPH_NOSNAP;
200 inode = ceph_find_inode(sb, vino);
201 if (!inode)
202 return ERR_PTR(-ESTALE);
203
204	dentry = d_obtain_alias(inode);
205	if (IS_ERR(dentry)) {
206		pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207		       cfh->parent_ino, inode);
208		/* d_obtain_alias() dropped the inode ref on failure */
209		return dentry;
210	}
211	err = ceph_init_dentry(dentry);
212	if (err < 0) {
213		dput(dentry);	/* also drops the inode reference */
214		return ERR_PTR(err);
215	}
216	dout("fh_to_parent %llx %p dentry %p\n", cfh->parent_ino, inode, dentry);
217 return dentry;
218}
219
220const struct export_operations ceph_export_ops = {
221 .encode_fh = ceph_encode_fh,
222 .fh_to_dentry = ceph_fh_to_dentry,
223 .fh_to_parent = ceph_fh_to_parent,
224};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..ed6f19721d6e
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,939 @@
1#include "ceph_debug.h"
2
3#include <linux/sched.h>
4#include <linux/slab.h>
5#include <linux/file.h>
6#include <linux/namei.h>
7#include <linux/writeback.h>
8
9#include "super.h"
10#include "mds_client.h"
11
12/*
13 * Ceph file operations
14 *
15 * Implement basic open/close functionality, and implement
16 * read/write.
17 *
18 * We implement three modes of file I/O:
19 * - buffered uses the generic_file_aio_{read,write} helpers
20 *
21 * - synchronous is used when there is multi-client read/write
22 * sharing, avoids the page cache, and synchronously waits for an
23 * ack from the OSD.
24 *
25 * - direct io takes the variant of the sync path that references
26 * user pages directly.
27 *
28 * fsync() flushes and waits on dirty pages, but just queues metadata
29 * for writeback: since the MDS can recover size and mtime there is no
30 * need to wait for MDS acknowledgement.
31 */
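/*
 * Decision sketch (illustrative; see ceph_aio_read()/ceph_aio_write()
 * below): the synchronous path is taken when the needed FILE_CACHE or
 * FILE_BUFFER cap is missing, when the file is O_DIRECT, or when the
 * mount is MS_SYNCHRONOUS; otherwise I/O goes through the generic
 * buffered helpers.
 */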
32
33
34/*
35 * Prepare an open request. Preallocate ceph_cap to avoid an
36 * inopportune ENOMEM later.
37 */
38static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{
41 struct ceph_client *client = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc;
43 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
46
47 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
48 want_auth = USE_AUTH_MDS;
49
50 req = ceph_mdsc_create_request(mdsc, op, want_auth);
51 if (IS_ERR(req))
52 goto out;
53 req->r_fmode = ceph_flags_to_mode(flags);
54 req->r_args.open.flags = cpu_to_le32(flags);
55 req->r_args.open.mode = cpu_to_le32(create_mode);
56 req->r_args.open.preferred = cpu_to_le32(-1);
57out:
58 return req;
59}
60
61/*
62 * initialize private struct file data.
63 * if we fail, clean up by dropping fmode reference on the ceph_inode
64 */
65static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
66{
67 struct ceph_file_info *cf;
68 int ret = 0;
69
70 switch (inode->i_mode & S_IFMT) {
71 case S_IFREG:
72 case S_IFDIR:
73 dout("init_file %p %p 0%o (regular)\n", inode, file,
74 inode->i_mode);
75 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
76 if (cf == NULL) {
77 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
78 return -ENOMEM;
79 }
80 cf->fmode = fmode;
81 cf->next_offset = 2;
82 file->private_data = cf;
83 BUG_ON(inode->i_fop->release != ceph_release);
84 break;
85
86 case S_IFLNK:
87 dout("init_file %p %p 0%o (symlink)\n", inode, file,
88 inode->i_mode);
89 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
90 break;
91
92 default:
93 dout("init_file %p %p 0%o (special)\n", inode, file,
94 inode->i_mode);
95 /*
96 * we need to drop the open ref now, since we don't
97 * have .release set to ceph_release.
98 */
99 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
100 BUG_ON(inode->i_fop->release == ceph_release);
101
102 /* call the proper open fop */
103 ret = inode->i_fop->open(inode, file);
104 }
105 return ret;
106}
107
108/*
109 * If the filp already has private_data, that means the file was
110 * already opened by intent during lookup, and we do nothing.
111 *
112 * If we already have the requisite capabilities, we can satisfy
113 * the open request locally (no need to request new caps from the
114 * MDS). We do, however, need to inform the MDS (asynchronously)
115 * if our wanted caps set expands.
116 */
117int ceph_open(struct inode *inode, struct file *file)
118{
119 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc;
122 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
125 int err;
126 int flags, fmode, wanted;
127
128 if (cf) {
129 dout("open file %p is already opened\n", file);
130 return 0;
131 }
132
133 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
134 flags = file->f_flags & ~(O_CREAT|O_EXCL);
135 if (S_ISDIR(inode->i_mode))
136 flags = O_DIRECTORY; /* mds likes to know */
137
138 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
139 ceph_vinop(inode), file, flags, file->f_flags);
140 fmode = ceph_flags_to_mode(flags);
141 wanted = ceph_caps_for_mode(fmode);
142
143 /* snapped files are read-only */
144 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
145 return -EROFS;
146
147 /* trivially open snapdir */
148 if (ceph_snap(inode) == CEPH_SNAPDIR) {
149 spin_lock(&inode->i_lock);
150 __ceph_get_fmode(ci, fmode);
151 spin_unlock(&inode->i_lock);
152 return ceph_init_file(inode, file, fmode);
153 }
154
155 /*
156 * No need to block if we have any caps. Update wanted set
157 * asynchronously.
158 */
159 spin_lock(&inode->i_lock);
160 if (__ceph_is_any_real_caps(ci)) {
161 int mds_wanted = __ceph_caps_mds_wanted(ci);
162 int issued = __ceph_caps_issued(ci, NULL);
163
164 dout("open %p fmode %d want %s issued %s using existing\n",
165 inode, fmode, ceph_cap_string(wanted),
166 ceph_cap_string(issued));
167 __ceph_get_fmode(ci, fmode);
168 spin_unlock(&inode->i_lock);
169
170 /* adjust wanted? */
171 if ((issued & wanted) != wanted &&
172 (mds_wanted & wanted) != wanted &&
173 ceph_snap(inode) != CEPH_SNAPDIR)
174 ceph_check_caps(ci, 0, NULL);
175
176 return ceph_init_file(inode, file, fmode);
177 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
178 (ci->i_snap_caps & wanted) == wanted) {
179 __ceph_get_fmode(ci, fmode);
180 spin_unlock(&inode->i_lock);
181 return ceph_init_file(inode, file, fmode);
182 }
183 spin_unlock(&inode->i_lock);
184
185 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
186 req = prepare_open_request(inode->i_sb, flags, 0);
187 if (IS_ERR(req)) {
188 err = PTR_ERR(req);
189 goto out;
190 }
191 req->r_inode = igrab(inode);
192 req->r_num_caps = 1;
193 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
194 if (!err)
195 err = ceph_init_file(inode, file, req->r_fmode);
196 ceph_mdsc_put_request(req);
197 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
198out:
199 return err;
200}
201
202
203/*
204 * Do a lookup + open with a single request.
205 *
206 * If this succeeds, but some subsequent check in the vfs
207 * may_open() fails, the struct *file gets cleaned up (i.e.
208 * ceph_release gets called). So fear not!
209 */
210/*
211 * flags
212 * path_lookup_open -> LOOKUP_OPEN
213 * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
214 */
215struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode,
217 int locked_dir)
218{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc;
221 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req;
224 int err;
225 int flags = nd->intent.open.flags - 1; /* silly vfs! */
226
227 dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
228 dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
229
230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req))
233		return ERR_CAST(req);
234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2;
236 if (flags & O_CREAT) {
237 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
238 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
239 }
240 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
241 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
242 dentry = ceph_finish_lookup(req, dentry, err);
243 if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
244 err = ceph_handle_notrace_create(dir, dentry);
245 if (!err)
246 err = ceph_init_file(req->r_dentry->d_inode, file,
247 req->r_fmode);
248 ceph_mdsc_put_request(req);
249 dout("ceph_lookup_open result=%p\n", dentry);
250 return dentry;
251}
252
253int ceph_release(struct inode *inode, struct file *file)
254{
255 struct ceph_inode_info *ci = ceph_inode(inode);
256 struct ceph_file_info *cf = file->private_data;
257
258 dout("release inode %p file %p\n", inode, file);
259 ceph_put_fmode(ci, cf->fmode);
260 if (cf->last_readdir)
261 ceph_mdsc_put_request(cf->last_readdir);
262 kfree(cf->last_name);
263 kfree(cf->dir_info);
264 dput(cf->dentry);
265 kmem_cache_free(ceph_file_cachep, cf);
266
267 /* wake up anyone waiting for caps on this inode */
268 wake_up(&ci->i_cap_wq);
269 return 0;
270}
271
272/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
318 * allocate a vector of new pages
319 */
320static struct page **alloc_page_vector(int num_pages)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
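/*
 * Worked example (illustrative, 4k pages): for off = 4196 and
 * len = 5000, po starts at 100; the first pass copies 3996 bytes into
 * page 0 at offset 100, po wraps to 0 and i advances, and the second
 * pass copies the remaining 1004 bytes to the start of page 1.
 */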
365
366/*
367 * copy data from a page vector into a user buffer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
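/*
 * Worked example (illustrative, 4k pages): off = 100, len = 8292
 * zeroes the head of page 0 from 100 to 4096 (3996 bytes), all of
 * page 1, and the first 200 bytes of page 2.
 */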
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.)
432 *
433 * If we get a short result from the OSD, check against i_size; we need to
434 * only return a short read to the caller if we hit EOF.
435 */
436static int striped_read(struct inode *inode,
437 u64 off, u64 len,
438 struct page **pages, int num_pages,
439 int *checkeof)
440{
441 struct ceph_client *client = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
445 int left, pages_left;
446 int read;
447 struct page **page_pos;
448 int ret;
449 bool hit_stripe, was_short;
450
451 /*
452 * we may need to do multiple reads. not atomic, unfortunately.
453 */
454 pos = off;
455 left = len;
456 page_pos = pages;
457 pages_left = num_pages;
458 read = 0;
459
460more:
461 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq,
465 ci->i_truncate_size,
466 page_pos, pages_left);
467 hit_stripe = this_len < left;
468 was_short = ret >= 0 && ret < this_len;
469 if (ret == -ENOENT)
470 ret = 0;
471 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
472 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
473
474 if (ret > 0) {
475 int didpages =
476 ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
477
478 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read,
481 pos - off - read, pages);
482 }
483 pos += ret;
484 read = pos - off;
485 left -= ret;
486 page_pos += didpages;
487 pages_left -= didpages;
488
489 /* hit stripe? */
490 if (left && hit_stripe)
491 goto more;
492 }
493
494 if (was_short) {
495 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) {
497 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read,
499 pages);
500 read = len;
501 goto out;
502 }
503
504 /* check i_size */
505 *checkeof = 1;
506 }
507
508out:
509 if (ret >= 0)
510 ret = read;
511 dout("striped_read returns %d\n", ret);
512 return ret;
513}
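/*
 * Illustrative flow: an 8192-byte read that crosses an object
 * boundary at +4096 comes back with this_len trimmed to 4096
 * (hit_stripe), so the loop above issues a second read for the
 * remainder; if a read returns short but the original extent was
 * fully inside i_size, the tail of the page vector is zeroed (a
 * hole) and the full length is reported.
 */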
514
515/*
516 * Completely synchronous read and write methods. Direct from __user
517 * buffer to osd, or directly to user pages (if O_DIRECT).
518 *
519 * If the read spans object boundary, just do multiple reads.
520 */
521static ssize_t ceph_sync_read(struct file *file, char __user *data,
522 unsigned len, loff_t *poff, int *checkeof)
523{
524 struct inode *inode = file->f_dentry->d_inode;
525 struct page **pages;
526 u64 off = *poff;
527 int num_pages = calc_pages_for(off, len);
528 int ret;
529
530 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532
533 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len);
535
536 /*
537 * flush any page cache pages in this range. this
538 * will make concurrent normal and O_DIRECT io slow,
539 * but it will at least behave sensibly when they are
540 * in sequence.
541 */
542 } else {
543 pages = alloc_page_vector(num_pages);
544 }
545 if (IS_ERR(pages))
546 return PTR_ERR(pages);
547
548 ret = filemap_write_and_wait(inode->i_mapping);
549 if (ret < 0)
550 goto done;
551
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0)
557 *poff = off + ret;
558
559done:
560 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages);
562 else
563 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret);
565 return ret;
566}
567
568/*
569 * Write commit callback, called if we requested both an ACK and
570 * ONDISK commit reply from the OSD.
571 */
572static void sync_write_commit(struct ceph_osd_request *req,
573 struct ceph_msg *msg)
574{
575 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
576
577 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
578 spin_lock(&ci->i_unsafe_lock);
579 list_del_init(&req->r_unsafe_item);
580 spin_unlock(&ci->i_unsafe_lock);
581 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
582}
583
584/*
585 * Synchronous write, straight from __user pointer or user pages (if
586 * O_DIRECT).
587 *
588 * If write spans object boundary, just do multiple writes. (For a
589 * correct atomic write, we should e.g. take write locks on all
590 * objects, rollback on failure, etc.)
591 */
592static ssize_t ceph_sync_write(struct file *file, const char __user *data,
593 size_t left, loff_t *offset)
594{
595 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req;
599 struct page **pages;
600 int num_pages;
601	unsigned long long pos;
602 u64 len;
603 int written = 0;
604 int flags;
605 int do_sync = 0;
606 int check_caps = 0;
607 int ret;
608 struct timespec mtime = CURRENT_TIME;
609
610 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
611 return -EROFS;
612
613 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
614 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
615
616 if (file->f_flags & O_APPEND)
617 pos = i_size_read(inode);
618 else
619 pos = *offset;
620
621 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
622 if (ret < 0)
623 return ret;
624
625 ret = invalidate_inode_pages2_range(inode->i_mapping,
626 pos >> PAGE_CACHE_SHIFT,
627 (pos + left) >> PAGE_CACHE_SHIFT);
628 if (ret < 0)
629 dout("invalidate_inode_pages2_range returned %d\n", ret);
630
631 flags = CEPH_OSD_FLAG_ORDERSNAP |
632 CEPH_OSD_FLAG_ONDISK |
633 CEPH_OSD_FLAG_WRITE;
634 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
635 flags |= CEPH_OSD_FLAG_ACK;
636 else
637 do_sync = 1;
638
639 /*
640 * we may need to do multiple writes here if we span an object
641 * boundary. this isn't atomic, unfortunately. :(
642 */
643more:
644 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context,
649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2);
652 if (IS_ERR(req))
653 return PTR_ERR(req);
654
655 num_pages = calc_pages_for(pos, len);
656
657 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages);
661 goto out;
662 }
663
664 /*
665 * throw out any page cache pages in this range. this
666 * may block.
667 */
668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1));
670 } else {
671 pages = alloc_page_vector(num_pages);
672 if (IS_ERR(pages)) {
673 ret = PTR_ERR(pages);
674 goto out;
675 }
676 ret = copy_user_to_page_vector(pages, data, pos, len);
677 if (ret < 0) {
678 ceph_release_page_vector(pages, num_pages);
679 goto out;
680 }
681
682 if ((file->f_flags & O_SYNC) == 0) {
683 /* get a second commit callback */
684 req->r_safe_callback = sync_write_commit;
685 req->r_own_pages = 1;
686 }
687 }
688 req->r_pages = pages;
689 req->r_num_pages = num_pages;
690 req->r_inode = inode;
691
692 ret = ceph_osdc_start_request(&client->osdc, req, false);
693 if (!ret) {
694 if (req->r_safe_callback) {
695 /*
696 * Add to inode unsafe list only after we
697 * start_request so that a tid has been assigned.
698 */
699 spin_lock(&ci->i_unsafe_lock);
700			list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
701 spin_unlock(&ci->i_unsafe_lock);
702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
703 }
704 ret = ceph_osdc_wait_request(&client->osdc, req);
705 }
706
707 if (file->f_flags & O_DIRECT)
708 put_page_vector(pages, num_pages);
709 else if (file->f_flags & O_SYNC)
710 ceph_release_page_vector(pages, num_pages);
711
712out:
713 ceph_osdc_put_request(req);
714 if (ret == 0) {
715 pos += len;
716 written += len;
717 left -= len;
718 if (left)
719 goto more;
720
721 ret = written;
722 *offset = pos;
723 if (pos > i_size_read(inode))
724 check_caps = ceph_inode_set_size(inode, pos);
725 if (check_caps)
726 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
727 NULL);
728 }
729 return ret;
730}
731
732/*
733 * Wrap generic_file_aio_read with checks for cap bits on the inode.
734 * Atomically grab references, so that those bits are not released
735 * back to the MDS mid-read.
736 *
737 * Hmm, the sync read case isn't actually async... should it be?
738 */
739static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
740 unsigned long nr_segs, loff_t pos)
741{
742 struct file *filp = iocb->ki_filp;
743 loff_t *ppos = &iocb->ki_pos;
744 size_t len = iov->iov_len;
745 struct inode *inode = filp->f_dentry->d_inode;
746 struct ceph_inode_info *ci = ceph_inode(inode);
747 void *base = iov->iov_base;
748 ssize_t ret;
749 int got = 0;
750 int checkeof = 0, read = 0;
751
752 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
753 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
754again:
755 __ceph_do_pending_vmtruncate(inode);
756 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
757 &got, -1);
758 if (ret < 0)
759 goto out;
760 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
761 inode, ceph_vinop(inode), pos, (unsigned)len,
762 ceph_cap_string(got));
763
764 if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
765 (iocb->ki_filp->f_flags & O_DIRECT) ||
766 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
767 /* hmm, this isn't really async... */
768 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
769 else
770 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
771
772out:
773 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
774 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
775 ceph_put_cap_refs(ci, got);
776
777 if (checkeof && ret >= 0) {
778 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
779
780 /* hit EOF or hole? */
781 if (statret == 0 && *ppos < inode->i_size) {
782 dout("aio_read sync_read hit hole, reading more\n");
783 read += ret;
784 base += ret;
785 len -= ret;
786 checkeof = 0;
787 goto again;
788 }
789 }
790 if (ret >= 0)
791 ret += read;
792
793 return ret;
794}
795
796/*
797 * Take cap references to avoid releasing caps to MDS mid-write.
798 *
799 * If we are synchronous, and write with an old snap context, the OSD
800 * may return EOLDSNAPC. In that case, retry the write... _after_
801 * dropping our cap refs and allowing the pending snap to logically
802 * complete _before_ this write occurs.
803 *
804 * If we are near ENOSPC, write synchronously.
805 */
806static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
807 unsigned long nr_segs, loff_t pos)
808{
809 struct file *file = iocb->ki_filp;
810 struct inode *inode = file->f_dentry->d_inode;
811 struct ceph_inode_info *ci = ceph_inode(inode);
812 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
813 loff_t endoff = pos + iov->iov_len;
814 int got = 0;
815 int ret, err;
816
817 if (ceph_snap(inode) != CEPH_NOSNAP)
818 return -EROFS;
819
820retry_snap:
821 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
822 return -ENOSPC;
823 __ceph_do_pending_vmtruncate(inode);
824 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
825 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
826 inode->i_size);
827 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
828 &got, endoff);
829 if (ret < 0)
830 goto out;
831
832 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
833 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
834 ceph_cap_string(got));
835
836 if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
837 (iocb->ki_filp->f_flags & O_DIRECT) ||
838 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
839 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
840 &iocb->ki_pos);
841 } else {
842 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
843
844 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
847 err = vfs_fsync_range(file, file->f_path.dentry,
848 pos, pos + ret - 1, 1);
849 if (err < 0)
850 ret = err;
851 }
852 }
853 if (ret >= 0) {
854 spin_lock(&inode->i_lock);
855 __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
856 spin_unlock(&inode->i_lock);
857 }
858
859out:
860 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
861 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
862 ceph_cap_string(got));
863 ceph_put_cap_refs(ci, got);
864
865 if (ret == -EOLDSNAPC) {
866 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
867 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
868 goto retry_snap;
869 }
870
871 return ret;
872}
873
874/*
875 * llseek. be sure to verify file size on SEEK_END.
876 */
877static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
878{
879 struct inode *inode = file->f_mapping->host;
880 int ret;
881
882 mutex_lock(&inode->i_mutex);
883 __ceph_do_pending_vmtruncate(inode);
884 switch (origin) {
885 case SEEK_END:
886 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
887 if (ret < 0) {
888 offset = ret;
889 goto out;
890 }
891 offset += inode->i_size;
892 break;
893 case SEEK_CUR:
894 /*
895 * Here we special-case the lseek(fd, 0, SEEK_CUR)
896 * position-querying operation. Avoid rewriting the "same"
897 * f_pos value back to the file because a concurrent read(),
898 * write() or lseek() might have altered it
899 */
900 if (offset == 0) {
901 offset = file->f_pos;
902 goto out;
903 }
904 offset += file->f_pos;
905 break;
906 }
907
908 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
909 offset = -EINVAL;
910 goto out;
911 }
912
913 /* Special lock needed here? */
914 if (offset != file->f_pos) {
915 file->f_pos = offset;
916 file->f_version = 0;
917 }
918
919out:
920 mutex_unlock(&inode->i_mutex);
921 return offset;
922}
923
924const struct file_operations ceph_file_fops = {
925 .open = ceph_open,
926 .release = ceph_release,
927 .llseek = ceph_llseek,
928 .read = do_sync_read,
929 .write = do_sync_write,
930 .aio_read = ceph_aio_read,
931 .aio_write = ceph_aio_write,
932 .mmap = ceph_mmap,
933 .fsync = ceph_fsync,
934 .splice_read = generic_file_splice_read,
935 .splice_write = generic_file_splice_write,
936 .unlocked_ioctl = ceph_ioctl,
937 .compat_ioctl = ceph_ioctl,
938};
939
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..85b4d2ffdeba
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1782 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
10#include <linux/namei.h>
11#include <linux/writeback.h>
12#include <linux/vmalloc.h>
13#include <linux/pagevec.h>
14
15#include "super.h"
16#include "decode.h"
17
18/*
19 * Ceph inode operations
20 *
21 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
22 * setattr, etc.), xattr helpers, and helpers for assimilating
23 * metadata returned by the MDS into our cache.
24 *
25 * Also define helpers for doing asynchronous writeback, invalidation,
26 * and truncation for the benefit of those who can't afford to block
27 * (typically because they are in the message handler path).
28 */
29
30static const struct inode_operations ceph_symlink_iops;
31
32static void ceph_invalidate_work(struct work_struct *work);
33static void ceph_writeback_work(struct work_struct *work);
34static void ceph_vmtruncate_work(struct work_struct *work);
35
36/*
37 * find or create an inode, given the ceph ino number
38 */
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{
41 struct inode *inode;
42 ino_t t = ceph_vino_to_ino(vino);
43
44 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
45 if (inode == NULL)
46 return ERR_PTR(-ENOMEM);
47 if (inode->i_state & I_NEW) {
48 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
49 inode, ceph_vinop(inode), (u64)inode->i_ino);
50 unlock_new_inode(inode);
51 }
52
53 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
54 vino.snap, inode);
55 return inode;
56}
57
58/*
59 * get/construct snapdir inode for a given directory
60 */
61struct inode *ceph_get_snapdir(struct inode *parent)
62{
63 struct ceph_vino vino = {
64 .ino = ceph_ino(parent),
65 .snap = CEPH_SNAPDIR,
66 };
67 struct inode *inode = ceph_get_inode(parent->i_sb, vino);
68 struct ceph_inode_info *ci = ceph_inode(inode);
69
70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode));
73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid;
76 inode->i_op = &ceph_dir_iops;
77 inode->i_fop = &ceph_dir_fops;
78 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
79 ci->i_rbytes = 0;
80 return inode;
81}
82
83const struct inode_operations ceph_file_iops = {
84 .permission = ceph_permission,
85 .setattr = ceph_setattr,
86 .getattr = ceph_getattr,
87 .setxattr = ceph_setxattr,
88 .getxattr = ceph_getxattr,
89 .listxattr = ceph_listxattr,
90 .removexattr = ceph_removexattr,
91};
92
93
94/*
95 * We use a 'frag tree' to keep track of the MDS's directory fragments
96 * for a given inode (usually there is just a single fragment). We
97 * need to know when a child frag is delegated to a new MDS, or when
98 * it is flagged as replicated, so we can direct our requests
99 * accordingly.
100 */
101
102/*
103 * find/create a frag in the tree
104 */
105static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
106 u32 f)
107{
108 struct rb_node **p;
109 struct rb_node *parent = NULL;
110 struct ceph_inode_frag *frag;
111 int c;
112
113 p = &ci->i_fragtree.rb_node;
114 while (*p) {
115 parent = *p;
116 frag = rb_entry(parent, struct ceph_inode_frag, node);
117 c = ceph_frag_compare(f, frag->frag);
118 if (c < 0)
119 p = &(*p)->rb_left;
120 else if (c > 0)
121 p = &(*p)->rb_right;
122 else
123 return frag;
124 }
125
126 frag = kmalloc(sizeof(*frag), GFP_NOFS);
127 if (!frag) {
128 pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
129 "frag %x\n", &ci->vfs_inode,
130 ceph_vinop(&ci->vfs_inode), f);
131 return ERR_PTR(-ENOMEM);
132 }
133 frag->frag = f;
134 frag->split_by = 0;
135 frag->mds = -1;
136 frag->ndist = 0;
137
138 rb_link_node(&frag->node, parent, p);
139 rb_insert_color(&frag->node, &ci->i_fragtree);
140
141 dout("get_or_create_frag added %llx.%llx frag %x\n",
142 ceph_vinop(&ci->vfs_inode), f);
143 return frag;
144}
145
146/*
147 * find a specific frag @f
148 */
149struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
150{
151 struct rb_node *n = ci->i_fragtree.rb_node;
152
153 while (n) {
154 struct ceph_inode_frag *frag =
155 rb_entry(n, struct ceph_inode_frag, node);
156 int c = ceph_frag_compare(f, frag->frag);
157 if (c < 0)
158 n = n->rb_left;
159 else if (c > 0)
160 n = n->rb_right;
161 else
162 return frag;
163 }
164 return NULL;
165}
166
167/*
168 * Choose frag containing the given value @v. If @pfrag is
169 * specified, copy the frag delegation info to the caller if
170 * it is present.
171 */
172u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
173 struct ceph_inode_frag *pfrag,
174 int *found)
175{
176 u32 t = ceph_frag_make(0, 0);
177 struct ceph_inode_frag *frag;
178 unsigned nway, i;
179 u32 n;
180
181 if (found)
182 *found = 0;
183
184 mutex_lock(&ci->i_fragtree_mutex);
185 while (1) {
186 WARN_ON(!ceph_frag_contains_value(t, v));
187 frag = __ceph_find_frag(ci, t);
188 if (!frag)
189 break; /* t is a leaf */
190 if (frag->split_by == 0) {
191 if (pfrag)
192 memcpy(pfrag, frag, sizeof(*pfrag));
193 if (found)
194 *found = 1;
195 break;
196 }
197
198 /* choose child */
199 nway = 1 << frag->split_by;
200 dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
201 frag->split_by, nway);
202 for (i = 0; i < nway; i++) {
203 n = ceph_frag_make_child(t, frag->split_by, i);
204 if (ceph_frag_contains_value(n, v)) {
205 t = n;
206 break;
207 }
208 }
209 BUG_ON(i == nway);
210 }
211 dout("choose_frag(%x) = %x\n", v, t);
212
213 mutex_unlock(&ci->i_fragtree_mutex);
214 return t;
215}
216
217/*
218 * Process dirfrag (delegation) info from the mds. Include leaf
219 * fragment in tree ONLY if ndist > 0. Otherwise, only
220 * branches/splits are included in i_fragtree.
221 */
222static int ceph_fill_dirfrag(struct inode *inode,
223 struct ceph_mds_reply_dirfrag *dirinfo)
224{
225 struct ceph_inode_info *ci = ceph_inode(inode);
226 struct ceph_inode_frag *frag;
227 u32 id = le32_to_cpu(dirinfo->frag);
228 int mds = le32_to_cpu(dirinfo->auth);
229 int ndist = le32_to_cpu(dirinfo->ndist);
230 int i;
231 int err = 0;
232
233 mutex_lock(&ci->i_fragtree_mutex);
234 if (ndist == 0) {
235 /* no delegation info needed. */
236 frag = __ceph_find_frag(ci, id);
237 if (!frag)
238 goto out;
239 if (frag->split_by == 0) {
240 /* tree leaf, remove */
241 dout("fill_dirfrag removed %llx.%llx frag %x"
242 " (no ref)\n", ceph_vinop(inode), id);
243 rb_erase(&frag->node, &ci->i_fragtree);
244 kfree(frag);
245 } else {
246 /* tree branch, keep and clear */
247 dout("fill_dirfrag cleared %llx.%llx frag %x"
248 " referral\n", ceph_vinop(inode), id);
249 frag->mds = -1;
250 frag->ndist = 0;
251 }
252 goto out;
253 }
254
255
256 /* find/add this frag to store mds delegation info */
257 frag = __get_or_create_frag(ci, id);
258 if (IS_ERR(frag)) {
259 /* this is not the end of the world; we can continue
260 with bad/inaccurate delegation info */
261 pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
262 ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
263 err = -ENOMEM;
264 goto out;
265 }
266
267 frag->mds = mds;
268 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
269 for (i = 0; i < frag->ndist; i++)
270 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
271 dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
272 ceph_vinop(inode), frag->frag, frag->ndist);
273
274out:
275 mutex_unlock(&ci->i_fragtree_mutex);
276 return err;
277}
278
279
280/*
281 * initialize a newly allocated inode.
282 */
283struct inode *ceph_alloc_inode(struct super_block *sb)
284{
285 struct ceph_inode_info *ci;
286 int i;
287
288 ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
289 if (!ci)
290 return NULL;
291
292 dout("alloc_inode %p\n", &ci->vfs_inode);
293
294 ci->i_version = 0;
295 ci->i_time_warp_seq = 0;
296 ci->i_ceph_flags = 0;
297 ci->i_release_count = 0;
298 ci->i_symlink = NULL;
299
300 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex);
302
303 ci->i_xattrs.blob = NULL;
304 ci->i_xattrs.prealloc_blob = NULL;
305 ci->i_xattrs.dirty = false;
306 ci->i_xattrs.index = RB_ROOT;
307 ci->i_xattrs.count = 0;
308 ci->i_xattrs.names_size = 0;
309 ci->i_xattrs.vals_size = 0;
310 ci->i_xattrs.version = 0;
311 ci->i_xattrs.index_version = 0;
312
313 ci->i_caps = RB_ROOT;
314 ci->i_auth_cap = NULL;
315 ci->i_dirty_caps = 0;
316 ci->i_flushing_caps = 0;
317 INIT_LIST_HEAD(&ci->i_dirty_item);
318 INIT_LIST_HEAD(&ci->i_flushing_item);
319 ci->i_cap_flush_seq = 0;
320 ci->i_cap_flush_last_tid = 0;
321 memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
322 init_waitqueue_head(&ci->i_cap_wq);
323 ci->i_hold_caps_min = 0;
324 ci->i_hold_caps_max = 0;
325 INIT_LIST_HEAD(&ci->i_cap_delay_list);
326 ci->i_cap_exporting_mds = 0;
327 ci->i_cap_exporting_mseq = 0;
328 ci->i_cap_exporting_issued = 0;
329 INIT_LIST_HEAD(&ci->i_cap_snaps);
330 ci->i_head_snapc = NULL;
331 ci->i_snap_caps = 0;
332
333 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
334 ci->i_nr_by_mode[i] = 0;
335
336 ci->i_truncate_seq = 0;
337 ci->i_truncate_size = 0;
338 ci->i_truncate_pending = 0;
339
340 ci->i_max_size = 0;
341 ci->i_reported_size = 0;
342 ci->i_wanted_max_size = 0;
343 ci->i_requested_max_size = 0;
344
345 ci->i_pin_ref = 0;
346 ci->i_rd_ref = 0;
347 ci->i_rdcache_ref = 0;
348 ci->i_wr_ref = 0;
349 ci->i_wrbuffer_ref = 0;
350 ci->i_wrbuffer_ref_head = 0;
351 ci->i_shared_gen = 0;
352 ci->i_rdcache_gen = 0;
353 ci->i_rdcache_revoking = 0;
354
355 INIT_LIST_HEAD(&ci->i_unsafe_writes);
356 INIT_LIST_HEAD(&ci->i_unsafe_dirops);
357 spin_lock_init(&ci->i_unsafe_lock);
358
359 ci->i_snap_realm = NULL;
360 INIT_LIST_HEAD(&ci->i_snap_realm_item);
361 INIT_LIST_HEAD(&ci->i_snap_flush_item);
362
363 INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
364 INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
365
366 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
367
368 return &ci->vfs_inode;
369}
370
371void ceph_destroy_inode(struct inode *inode)
372{
373 struct ceph_inode_info *ci = ceph_inode(inode);
374 struct ceph_inode_frag *frag;
375 struct rb_node *n;
376
377 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
378
379 ceph_queue_caps_release(inode);
380
381 /*
382 * we may still have a snap_realm reference if there are stray
383 * caps in i_cap_exporting_issued or i_snap_caps.
384 */
385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389
390 dout(" dropping residual ref to snap realm %p\n", realm);
391 spin_lock(&realm->inodes_with_caps_lock);
392 list_del_init(&ci->i_snap_realm_item);
393 spin_unlock(&realm->inodes_with_caps_lock);
394 ceph_put_snap_realm(mdsc, realm);
395 }
396
397 kfree(ci->i_symlink);
398 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
399 frag = rb_entry(n, struct ceph_inode_frag, node);
400 rb_erase(n, &ci->i_fragtree);
401 kfree(frag);
402 }
403
404 __ceph_destroy_xattrs(ci);
405 if (ci->i_xattrs.blob)
406 ceph_buffer_put(ci->i_xattrs.blob);
407 if (ci->i_xattrs.prealloc_blob)
408 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
409
410 kmem_cache_free(ceph_inode_cachep, ci);
411}
412
413
414/*
415 * Helpers to fill in size, ctime, mtime, and atime. We have to be
416 * careful because either the client or MDS may have more up to date
417 * info, depending on which capabilities are held, and whether
418 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
419 * and size are monotonically increasing, except when utimes() or
420 * truncate() increments the corresponding _seq values.)
421 */
422int ceph_fill_file_size(struct inode *inode, int issued,
423 u32 truncate_seq, u64 truncate_size, u64 size)
424{
425 struct ceph_inode_info *ci = ceph_inode(inode);
426 int queue_trunc = 0;
427
428 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
429 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
430 dout("size %lld -> %llu\n", inode->i_size, size);
431 inode->i_size = size;
432 inode->i_blocks = (size + (1<<9) - 1) >> 9;
433 ci->i_reported_size = size;
434 if (truncate_seq != ci->i_truncate_seq) {
435 dout("truncate_seq %u -> %u\n",
436 ci->i_truncate_seq, truncate_seq);
437 ci->i_truncate_seq = truncate_seq;
438 /*
439 * If we hold relevant caps, or in the case where we're
440 * not the only client referencing this file and we
441 * don't hold those caps, then we need to check whether
442 * the file is either opened or mmapped
443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) ||
447 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++;
450 queue_trunc = 1;
451 }
452 }
453 }
454 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
455 ci->i_truncate_size != truncate_size) {
456 dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
457 truncate_size);
458 ci->i_truncate_size = truncate_size;
459 }
460 return queue_trunc;
461}
462
463void ceph_fill_file_time(struct inode *inode, int issued,
464 u64 time_warp_seq, struct timespec *ctime,
465 struct timespec *mtime, struct timespec *atime)
466{
467 struct ceph_inode_info *ci = ceph_inode(inode);
468 int warn = 0;
469
470 if (issued & (CEPH_CAP_FILE_EXCL|
471 CEPH_CAP_FILE_WR|
472 CEPH_CAP_FILE_BUFFER)) {
473 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
474 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
475 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
476 ctime->tv_sec, ctime->tv_nsec);
477 inode->i_ctime = *ctime;
478 }
479 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
480 /* the MDS did a utimes() */
481 dout("mtime %ld.%09ld -> %ld.%09ld "
482 "tw %d -> %d\n",
483 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
484 mtime->tv_sec, mtime->tv_nsec,
485 ci->i_time_warp_seq, (int)time_warp_seq);
486
487 inode->i_mtime = *mtime;
488 inode->i_atime = *atime;
489 ci->i_time_warp_seq = time_warp_seq;
490 } else if (time_warp_seq == ci->i_time_warp_seq) {
491 /* nobody did utimes(); take the max */
492 if (timespec_compare(mtime, &inode->i_mtime) > 0) {
493 dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
494 inode->i_mtime.tv_sec,
495 inode->i_mtime.tv_nsec,
496 mtime->tv_sec, mtime->tv_nsec);
497 inode->i_mtime = *mtime;
498 }
499 if (timespec_compare(atime, &inode->i_atime) > 0) {
500 dout("atime %ld.%09ld -> %ld.%09ld inc\n",
501 inode->i_atime.tv_sec,
502 inode->i_atime.tv_nsec,
503 atime->tv_sec, atime->tv_nsec);
504 inode->i_atime = *atime;
505 }
506 } else if (issued & CEPH_CAP_FILE_EXCL) {
507 /* we did a utimes(); ignore mds values */
508 } else {
509 warn = 1;
510 }
511 } else {
512 /* we have no write caps; whatever the MDS says is true */
513 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
514 inode->i_ctime = *ctime;
515 inode->i_mtime = *mtime;
516 inode->i_atime = *atime;
517 ci->i_time_warp_seq = time_warp_seq;
518 } else {
519 warn = 1;
520 }
521 }
522 if (warn) /* time_warp_seq shouldn't go backwards */
523 dout("%p mds time_warp_seq %llu < %u\n",
524 inode, time_warp_seq, ci->i_time_warp_seq);
525}
526
527/*
528 * Populate an inode based on info from mds. May be called on new or
529 * existing inodes.
530 */
531static int fill_inode(struct inode *inode,
532 struct ceph_mds_reply_info_in *iinfo,
533 struct ceph_mds_reply_dirfrag *dirinfo,
534 struct ceph_mds_session *session,
535 unsigned long ttl_from, int cap_fmode,
536 struct ceph_cap_reservation *caps_reservation)
537{
538 struct ceph_mds_reply_inode *info = iinfo->in;
539 struct ceph_inode_info *ci = ceph_inode(inode);
540 int i;
541 int issued, implemented;
542 struct timespec mtime, atime, ctime;
543 u32 nsplits;
544 struct ceph_buffer *xattr_blob = NULL;
545 int err = 0;
546 int queue_trunc = 0;
547
548 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
549 inode, ceph_vinop(inode), le64_to_cpu(info->version),
550 ci->i_version);
551
552 /*
553 * prealloc xattr data, if it looks like we'll need it. only
554 * if len > 4 (meaning there are actually xattrs; the first 4
555 * bytes are the xattr count).
556 */
557 if (iinfo->xattr_len > 4) {
558 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
559 if (!xattr_blob)
560 pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
561 iinfo->xattr_len);
562 }
563
564 spin_lock(&inode->i_lock);
565
566 /*
567 * provided version will be odd if inode value is projected,
568 * even if stable. skip the update if we have newer info
569 * (e.g., due to inode info racing from multiple MDSs), or if
570 * we are getting projected (unstable) inode info.
571 */
572 if (le64_to_cpu(info->version) > 0 &&
573 (ci->i_version & ~1) > le64_to_cpu(info->version))
574 goto no_change;
575
576 issued = __ceph_caps_issued(ci, &implemented);
577 issued |= implemented | __ceph_caps_dirty(ci);
578
579 /* update inode */
580 ci->i_version = le64_to_cpu(info->version);
581 inode->i_version++;
582 inode->i_rdev = le32_to_cpu(info->rdev);
583
584 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
585 inode->i_mode = le32_to_cpu(info->mode);
586 inode->i_uid = le32_to_cpu(info->uid);
587 inode->i_gid = le32_to_cpu(info->gid);
588 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
589 inode->i_uid, inode->i_gid);
590 }
591
592 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
593 inode->i_nlink = le32_to_cpu(info->nlink);
594
595 /* be careful with mtime, atime, size */
596 ceph_decode_timespec(&atime, &info->atime);
597 ceph_decode_timespec(&mtime, &info->mtime);
598 ceph_decode_timespec(&ctime, &info->ctime);
599 queue_trunc = ceph_fill_file_size(inode, issued,
600 le32_to_cpu(info->truncate_seq),
601 le64_to_cpu(info->truncate_size),
602 le64_to_cpu(info->size));
603 ceph_fill_file_time(inode, issued,
604 le32_to_cpu(info->time_warp_seq),
605 &ctime, &mtime, &atime);
606
607 ci->i_max_size = le64_to_cpu(info->max_size);
608 ci->i_layout = info->layout;
609 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
610
611 /* xattrs */
612 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
613 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
614 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
615 if (ci->i_xattrs.blob)
616 ceph_buffer_put(ci->i_xattrs.blob);
617 ci->i_xattrs.blob = xattr_blob;
618 if (xattr_blob)
619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 }
623
624 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info;
627
628 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO:
630 case S_IFBLK:
631 case S_IFCHR:
632 case S_IFSOCK:
633 init_special_inode(inode, inode->i_mode, inode->i_rdev);
634 inode->i_op = &ceph_file_iops;
635 break;
636 case S_IFREG:
637 inode->i_op = &ceph_file_iops;
638 inode->i_fop = &ceph_file_fops;
639 break;
640 case S_IFLNK:
641 inode->i_op = &ceph_symlink_iops;
642 if (!ci->i_symlink) {
643 int symlen = iinfo->symlink_len;
644 char *sym;
645
646 BUG_ON(symlen != inode->i_size);
647 spin_unlock(&inode->i_lock);
648
649 err = -ENOMEM;
650 sym = kmalloc(symlen+1, GFP_NOFS);
651 if (!sym)
652 goto out;
653 memcpy(sym, iinfo->symlink, symlen);
654 sym[symlen] = 0;
655
656 spin_lock(&inode->i_lock);
657 if (!ci->i_symlink)
658 ci->i_symlink = sym;
659 else
660 kfree(sym); /* lost a race */
661 }
662 break;
663 case S_IFDIR:
664 inode->i_op = &ceph_dir_iops;
665 inode->i_fop = &ceph_dir_fops;
666
667 ci->i_files = le64_to_cpu(info->files);
668 ci->i_subdirs = le64_to_cpu(info->subdirs);
669 ci->i_rbytes = le64_to_cpu(info->rbytes);
670 ci->i_rfiles = le64_to_cpu(info->rfiles);
671 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
672 ceph_decode_timespec(&ci->i_rctime, &info->rctime);
673
674 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
678 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2;
681 }
682
683 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes;
686 break;
687 default:
688 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
689 ceph_vinop(inode), inode->i_mode);
690 }
691
692no_change:
693 spin_unlock(&inode->i_lock);
694
695 /* queue truncate if we saw i_size decrease */
696 if (queue_trunc)
697 ceph_queue_vmtruncate(inode);
698
699 /* populate frag tree */
700 /* FIXME: move me up, if/when version reflects fragtree changes */
701 nsplits = le32_to_cpu(info->fragtree.nsplits);
702 mutex_lock(&ci->i_fragtree_mutex);
703 for (i = 0; i < nsplits; i++) {
704 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
705 struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
706
707 if (IS_ERR(frag))
708 continue;
709 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
710 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
711 }
712 mutex_unlock(&ci->i_fragtree_mutex);
713
714 /* were we issued a capability? */
715 if (info->cap.caps) {
716 if (ceph_snap(inode) == CEPH_NOSNAP) {
717 ceph_add_cap(inode, session,
718 le64_to_cpu(info->cap.cap_id),
719 cap_fmode,
720 le32_to_cpu(info->cap.caps),
721 le32_to_cpu(info->cap.wanted),
722 le32_to_cpu(info->cap.seq),
723 le32_to_cpu(info->cap.mseq),
724 le64_to_cpu(info->cap.realm),
725 info->cap.flags,
726 caps_reservation);
727 } else {
728 spin_lock(&inode->i_lock);
729 dout(" %p got snap_caps %s\n", inode,
730 ceph_cap_string(le32_to_cpu(info->cap.caps)));
731 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
732 if (cap_fmode >= 0)
733 __ceph_get_fmode(ci, cap_fmode);
734 spin_unlock(&inode->i_lock);
735 }
736 } else if (cap_fmode >= 0) {
737 pr_warning("mds issued no caps on %llx.%llx\n",
738 ceph_vinop(inode));
739 __ceph_get_fmode(ci, cap_fmode);
740 }
741
742 /* update delegation info? */
743 if (dirinfo)
744 ceph_fill_dirfrag(inode, dirinfo);
745
746 err = 0;
747
748out:
749 if (xattr_blob)
750 ceph_buffer_put(xattr_blob);
751 return err;
752}
753
754/*
755 * caller should hold session s_mutex.
756 */
757static void update_dentry_lease(struct dentry *dentry,
758 struct ceph_mds_reply_lease *lease,
759 struct ceph_mds_session *session,
760 unsigned long from_time)
761{
762 struct ceph_dentry_info *di = ceph_dentry(dentry);
763 long unsigned duration = le32_to_cpu(lease->duration_ms);
764 long unsigned ttl = from_time + (duration * HZ) / 1000;
765 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
766 struct inode *dir;
767
768 /* only track leases on regular dentries */
769 if (dentry->d_op != &ceph_dentry_ops)
770 return;
771
772 spin_lock(&dentry->d_lock);
773 dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
774 dentry, le16_to_cpu(lease->mask), duration, ttl);
775
776 /* make lease_rdcache_gen match directory */
777 dir = dentry->d_parent->d_inode;
778 di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
779
780 if (lease->mask == 0)
781 goto out_unlock;
782
783 if (di->lease_gen == session->s_cap_gen &&
784 time_before(ttl, dentry->d_time))
785 goto out_unlock; /* we already have a newer lease. */
786
787 if (di->lease_session && di->lease_session != session)
788 goto out_unlock;
789
790 ceph_dentry_lru_touch(dentry);
791
792 if (!di->lease_session)
793 di->lease_session = ceph_get_mds_session(session);
794 di->lease_gen = session->s_cap_gen;
795 di->lease_seq = le32_to_cpu(lease->seq);
796 di->lease_renew_after = half_ttl;
797 di->lease_renew_from = 0;
798 dentry->d_time = ttl;
799out_unlock:
800 spin_unlock(&dentry->d_lock);
801 return;
802}
803
804/*
805 * splice a dentry to an inode.
806 * caller must hold directory i_mutex for this to be safe.
807 *
808 * we will only rehash the resulting dentry if @prehash is
809 * true; @prehash will be set to false (for the benefit of
810 * the caller) if we fail.
811 */
812static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
813 bool *prehash)
814{
815 struct dentry *realdn;
816
817 /* dn must be unhashed */
818 if (!d_unhashed(dn))
819 d_drop(dn);
820 realdn = d_materialise_unique(dn, in);
821 if (IS_ERR(realdn)) {
822 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
823 dn, in, ceph_vinop(in));
824 if (prehash)
825 *prehash = false; /* don't rehash on error */
826 dn = realdn; /* note realdn contains the error */
827 goto out;
828 } else if (realdn) {
829 dout("dn %p (%d) spliced with %p (%d) "
830 "inode %p ino %llx.%llx\n",
831 dn, atomic_read(&dn->d_count),
832 realdn, atomic_read(&realdn->d_count),
833 realdn->d_inode, ceph_vinop(realdn->d_inode));
834 dput(dn);
835 dn = realdn;
836 } else {
837 BUG_ON(!ceph_dentry(dn));
838
839 dout("dn %p attached to %p ino %llx.%llx\n",
840 dn, dn->d_inode, ceph_vinop(dn->d_inode));
841 }
842 if ((!prehash || *prehash) && d_unhashed(dn))
843 d_rehash(dn);
844out:
845 return dn;
846}
847
848/*
849 * Set dentry's directory position based on the current dir's max, and
850 * order it in d_subdirs, so that dcache_readdir behaves.
851 */
852static void ceph_set_dentry_offset(struct dentry *dn)
853{
854 struct dentry *dir = dn->d_parent;
855 struct inode *inode = dn->d_parent->d_inode;
856 struct ceph_dentry_info *di;
857
858 BUG_ON(!inode);
859
860 di = ceph_dentry(dn);
861
862 spin_lock(&inode->i_lock);
863 di->offset = ceph_inode(inode)->i_max_offset++;
864 spin_unlock(&inode->i_lock);
865
866 spin_lock(&dcache_lock);
867 spin_lock(&dn->d_lock);
868 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
869 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
870 dn->d_u.d_child.prev, dn->d_u.d_child.next);
871 spin_unlock(&dn->d_lock);
872 spin_unlock(&dcache_lock);
873}
874
875/*
876 * Incorporate results into the local cache. This is either just
877 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
878 * after a lookup).
879 *
880 * A reply may contain
881 * a directory inode along with a dentry,
882 * and/or a target inode.
883 *
884 * Called with snap_rwsem (read).
885 */
886int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
887 struct ceph_mds_session *session)
888{
889 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
890 struct inode *in = NULL;
891 struct ceph_mds_reply_inode *ininfo;
892 struct ceph_vino vino;
893 struct ceph_client *client = ceph_sb_to_client(sb);
894 int i = 0;
895 int err = 0;
896
897 dout("fill_trace %p is_dentry %d is_target %d\n", req,
898 rinfo->head->is_dentry, rinfo->head->is_target);
899
900#if 0
901 /*
902 * Debugging hook:
903 *
904 * If we resend completed ops to a recovering mds, we get no
905 * trace. Since that is very rare, pretend this is the case
906 * to ensure the 'no trace' handlers in the callers behave.
907 *
908 * Fill in inodes unconditionally to avoid breaking cap
909 * invariants.
910 */
911 if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
912 pr_info("fill_trace faking empty trace on %lld %s\n",
913 req->r_tid, ceph_mds_op_name(rinfo->head->op));
914 if (rinfo->head->is_dentry) {
915 rinfo->head->is_dentry = 0;
916 err = fill_inode(req->r_locked_dir,
917 &rinfo->diri, rinfo->dirfrag,
918 session, req->r_request_started, -1);
919 }
920 if (rinfo->head->is_target) {
921 rinfo->head->is_target = 0;
922 ininfo = rinfo->targeti.in;
923 vino.ino = le64_to_cpu(ininfo->ino);
924 vino.snap = le64_to_cpu(ininfo->snapid);
925 in = ceph_get_inode(sb, vino);
926 err = fill_inode(in, &rinfo->targeti, NULL,
927 session, req->r_request_started,
928 req->r_fmode);
929 iput(in);
930 }
931 }
932#endif
933
934 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
935 dout("fill_trace reply is empty!\n");
936 if (rinfo->head->result == 0 && req->r_locked_dir) {
937 struct ceph_inode_info *ci =
938 ceph_inode(req->r_locked_dir);
939 dout(" clearing %p complete (empty trace)\n",
940 req->r_locked_dir);
941 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
942 ci->i_release_count++;
943 }
944 return 0;
945 }
946
947 if (rinfo->head->is_dentry) {
948 struct inode *dir = req->r_locked_dir;
949
950 err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
951 session, req->r_request_started, -1,
952 &req->r_caps_reservation);
953 if (err < 0)
954 return err;
955 }
956
957 /*
958 * ignore null lease/binding on snapdir ENOENT, or else we
959 * will have trouble splicing in the virtual snapdir later
960 */
961 if (rinfo->head->is_dentry && !req->r_aborted &&
962 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
963 client->mount_args->snapdir_name,
964 req->r_dentry->d_name.len))) {
965 /*
966 * lookup link rename : null -> possibly existing inode
967 * mknod symlink mkdir : null -> new inode
968 * unlink : linked -> null
969 */
970 struct inode *dir = req->r_locked_dir;
971 struct dentry *dn = req->r_dentry;
972 bool have_dir_cap, have_lease;
973
974 BUG_ON(!dn);
975 BUG_ON(!dir);
976 BUG_ON(dn->d_parent->d_inode != dir);
977 BUG_ON(ceph_ino(dir) !=
978 le64_to_cpu(rinfo->diri.in->ino));
979 BUG_ON(ceph_snap(dir) !=
980 le64_to_cpu(rinfo->diri.in->snapid));
981
982 /* do we have a lease on the whole dir? */
983 have_dir_cap =
984 (le32_to_cpu(rinfo->diri.in->cap.caps) &
985 CEPH_CAP_FILE_SHARED);
986
987 /* do we have a dn lease? */
988 have_lease = have_dir_cap ||
989 (le16_to_cpu(rinfo->dlease->mask) &
990 CEPH_LOCK_DN);
991
992 if (!have_lease)
993 dout("fill_trace no dentry lease or dir cap\n");
994
995 /* rename? */
996 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
997 dout(" src %p '%.*s' dst %p '%.*s'\n",
998 req->r_old_dentry,
999 req->r_old_dentry->d_name.len,
1000 req->r_old_dentry->d_name.name,
1001 dn, dn->d_name.len, dn->d_name.name);
1002 dout("fill_trace doing d_move %p -> %p\n",
1003 req->r_old_dentry, dn);
1004
1005 /* d_move screws up d_subdirs order */
1006 ceph_i_clear(dir, CEPH_I_COMPLETE);
1007
1008 d_move(req->r_old_dentry, dn);
1009 dout(" src %p '%.*s' dst %p '%.*s'\n",
1010 req->r_old_dentry,
1011 req->r_old_dentry->d_name.len,
1012 req->r_old_dentry->d_name.name,
1013 dn, dn->d_name.len, dn->d_name.name);
1014 /* ensure target dentry is invalidated, despite
1015 rehashing bug in vfs_rename_dir */
1016 dn->d_time = jiffies;
1017 ceph_dentry(dn)->lease_shared_gen = 0;
1018 /* take overwritten dentry's readdir offset */
1019 ceph_dentry(req->r_old_dentry)->offset =
1020 ceph_dentry(dn)->offset;
1021 dn = req->r_old_dentry; /* use old_dentry */
1022 in = dn->d_inode;
1023 }
1024
1025 /* null dentry? */
1026 if (!rinfo->head->is_target) {
1027 dout("fill_trace null dentry\n");
1028 if (dn->d_inode) {
1029 dout("d_delete %p\n", dn);
1030 d_delete(dn);
1031 } else {
1032 dout("d_instantiate %p NULL\n", dn);
1033 d_instantiate(dn, NULL);
1034 if (have_lease && d_unhashed(dn))
1035 d_rehash(dn);
1036 update_dentry_lease(dn, rinfo->dlease,
1037 session,
1038 req->r_request_started);
1039 }
1040 goto done;
1041 }
1042
1043 /* attach proper inode */
1044 ininfo = rinfo->targeti.in;
1045 vino.ino = le64_to_cpu(ininfo->ino);
1046 vino.snap = le64_to_cpu(ininfo->snapid);
1047 if (!dn->d_inode) {
1048 in = ceph_get_inode(sb, vino);
1049 if (IS_ERR(in)) {
1050 pr_err("fill_trace bad get_inode "
1051 "%llx.%llx\n", vino.ino, vino.snap);
1052 err = PTR_ERR(in);
1053 d_delete(dn);
1054 goto done;
1055 }
1056 dn = splice_dentry(dn, in, &have_lease);
1057 if (IS_ERR(dn)) {
1058 err = PTR_ERR(dn);
1059 goto done;
1060 }
1061 req->r_dentry = dn; /* may have spliced */
1062 ceph_set_dentry_offset(dn);
1063 igrab(in);
1064 } else if (ceph_ino(in) == vino.ino &&
1065 ceph_snap(in) == vino.snap) {
1066 igrab(in);
1067 } else {
1068 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1069 dn, in, ceph_ino(in), ceph_snap(in),
1070 vino.ino, vino.snap);
1071 have_lease = false;
1072 in = NULL;
1073 }
1074
1075 if (have_lease)
1076 update_dentry_lease(dn, rinfo->dlease, session,
1077 req->r_request_started);
1078 dout(" final dn %p\n", dn);
1079 i++;
1080 } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1081 req->r_op == CEPH_MDS_OP_MKSNAP) {
1082 struct dentry *dn = req->r_dentry;
1083
1084 /* fill out a snapdir LOOKUPSNAP dentry */
1085 BUG_ON(!dn);
1086 BUG_ON(!req->r_locked_dir);
1087 BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
1088 ininfo = rinfo->targeti.in;
1089 vino.ino = le64_to_cpu(ininfo->ino);
1090 vino.snap = le64_to_cpu(ininfo->snapid);
1091 in = ceph_get_inode(sb, vino);
1092 if (IS_ERR(in)) {
1093 pr_err("fill_inode get_inode badness %llx.%llx\n",
1094 vino.ino, vino.snap);
1095 err = PTR_ERR(in);
1096 d_delete(dn);
1097 goto done;
1098 }
1099 dout(" linking snapped dir %p to dn %p\n", in, dn);
1100 dn = splice_dentry(dn, in, NULL);
1101 if (IS_ERR(dn)) {
1102 err = PTR_ERR(dn);
1103 goto done;
1104 }
1105 ceph_set_dentry_offset(dn);
1106 req->r_dentry = dn; /* may have spliced */
1107 igrab(in);
1108 rinfo->head->is_dentry = 1; /* fool notrace handlers */
1109 }
1110
1111 if (rinfo->head->is_target) {
1112 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1113 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1114
1115 if (in == NULL || ceph_ino(in) != vino.ino ||
1116 ceph_snap(in) != vino.snap) {
1117 in = ceph_get_inode(sb, vino);
1118 if (IS_ERR(in)) {
1119 err = PTR_ERR(in);
1120 goto done;
1121 }
1122 }
1123 req->r_target_inode = in;
1124
1125 err = fill_inode(in,
1126 &rinfo->targeti, NULL,
1127 session, req->r_request_started,
1128 (le32_to_cpu(rinfo->head->result) == 0) ?
1129 req->r_fmode : -1,
1130 &req->r_caps_reservation);
1131 if (err < 0) {
1132 pr_err("fill_inode badness %p %llx.%llx\n",
1133 in, ceph_vinop(in));
1134 goto done;
1135 }
1136 }
1137
1138done:
1139 dout("fill_trace done err=%d\n", err);
1140 return err;
1141}
1142
1143/*
1144 * Prepopulate our cache with readdir results, leases, etc.
1145 */
1146int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1147 struct ceph_mds_session *session)
1148{
1149 struct dentry *parent = req->r_dentry;
1150 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1151 struct qstr dname;
1152 struct dentry *dn;
1153 struct inode *in;
1154 int err = 0, i;
1155 struct inode *snapdir = NULL;
1156 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1157 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1158 struct ceph_dentry_info *di;
1159
1160 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1161 snapdir = ceph_get_snapdir(parent->d_inode);
1162 parent = d_find_alias(snapdir);
1163 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1164 rinfo->dir_nr, parent);
1165 } else {
1166 dout("readdir_prepopulate %d items under dn %p\n",
1167 rinfo->dir_nr, parent);
1168 if (rinfo->dir_dir)
1169 ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1170 }
1171
1172 for (i = 0; i < rinfo->dir_nr; i++) {
1173 struct ceph_vino vino;
1174
1175 dname.name = rinfo->dir_dname[i];
1176 dname.len = rinfo->dir_dname_len[i];
1177 dname.hash = full_name_hash(dname.name, dname.len);
1178
1179 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1180 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1181
1182retry_lookup:
1183 dn = d_lookup(parent, &dname);
1184 dout("d_lookup on parent=%p name=%.*s got %p\n",
1185 parent, dname.len, dname.name, dn);
1186
1187 if (!dn) {
1188 dn = d_alloc(parent, &dname);
1189 dout("d_alloc %p '%.*s' = %p\n", parent,
1190 dname.len, dname.name, dn);
1191 if (dn == NULL) {
1192 dout("d_alloc badness\n");
1193 err = -ENOMEM;
1194 goto out;
1195 }
1196 err = ceph_init_dentry(dn);
1197 if (err < 0)
1198 goto out;
1199 } else if (dn->d_inode &&
1200 (ceph_ino(dn->d_inode) != vino.ino ||
1201 ceph_snap(dn->d_inode) != vino.snap)) {
1202 dout(" dn %p points to wrong inode %p\n",
1203 dn, dn->d_inode);
1204 d_delete(dn);
1205 dput(dn);
1206 goto retry_lookup;
1207 } else {
1208 /* reorder parent's d_subdirs */
1209 spin_lock(&dcache_lock);
1210 spin_lock(&dn->d_lock);
1211 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1212 spin_unlock(&dn->d_lock);
1213 spin_unlock(&dcache_lock);
1214 }
1215
1216 di = dn->d_fsdata;
1217 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1218
1219 /* inode */
1220 if (dn->d_inode) {
1221 in = dn->d_inode;
1222 } else {
1223 in = ceph_get_inode(parent->d_sb, vino);
1224 if (in == NULL) {
1225 dout("new_inode badness\n");
1226 d_delete(dn);
1227 dput(dn);
1228 err = -ENOMEM;
1229 goto out;
1230 }
1231 dn = splice_dentry(dn, in, NULL);
1232 }
1233
1234 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1235 req->r_request_started, -1,
1236 &req->r_caps_reservation) < 0) {
1237 pr_err("fill_inode badness on %p\n", in);
1238 dput(dn);
1239 continue;
1240 }
1241 update_dentry_lease(dn, rinfo->dir_dlease[i],
1242 req->r_session, req->r_request_started);
1243 dput(dn);
1244 }
1245 req->r_did_prepopulate = true;
1246
1247out:
1248 if (snapdir) {
1249 iput(snapdir);
1250 dput(parent);
1251 }
1252 dout("readdir_prepopulate done\n");
1253 return err;
1254}
1255
1256int ceph_inode_set_size(struct inode *inode, loff_t size)
1257{
1258 struct ceph_inode_info *ci = ceph_inode(inode);
1259 int ret = 0;
1260
1261 spin_lock(&inode->i_lock);
1262 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1263 inode->i_size = size;
1264 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1265
1266 /* tell the MDS if we are approaching max_size */
1267 if ((size << 1) >= ci->i_max_size &&
1268 (ci->i_reported_size << 1) < ci->i_max_size)
1269 ret = 1;
1270
1271 spin_unlock(&inode->i_lock);
1272 return ret;
1273}
1274
1275/*
1276 * Write back inode data in a worker thread. (This can't be done
1277 * in the message handler context.)
1278 */
1279void ceph_queue_writeback(struct inode *inode)
1280{
1281 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1282 &ceph_inode(inode)->i_wb_work)) {
1283 dout("ceph_queue_writeback %p\n", inode);
1284 igrab(inode);
1285 } else {
1286 dout("ceph_queue_writeback %p failed\n", inode);
1287 }
1288}
1289
1290static void ceph_writeback_work(struct work_struct *work)
1291{
1292 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1293 i_wb_work);
1294 struct inode *inode = &ci->vfs_inode;
1295
1296 dout("writeback %p\n", inode);
1297 filemap_fdatawrite(&inode->i_data);
1298 iput(inode);
1299}
1300
1301/*
1302 * queue an async invalidation
1303 */
1304void ceph_queue_invalidate(struct inode *inode)
1305{
1306 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1307 &ceph_inode(inode)->i_pg_inv_work)) {
1308 dout("ceph_queue_invalidate %p\n", inode);
1309 igrab(inode);
1310 } else {
1311 dout("ceph_queue_invalidate %p failed\n", inode);
1312 }
1313}
1314
1315/*
1316 * invalidate any pages that are not dirty or under writeback. this
1317 * includes pages that are clean and mapped.
1318 */
1319static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1320{
1321 struct pagevec pvec;
1322 pgoff_t next = 0;
1323 int i;
1324
1325 pagevec_init(&pvec, 0);
1326 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1327 for (i = 0; i < pagevec_count(&pvec); i++) {
1328 struct page *page = pvec.pages[i];
1329 pgoff_t index;
1330 int skip_page =
1331 (PageDirty(page) || PageWriteback(page));
1332
1333 if (!skip_page)
1334 skip_page = !trylock_page(page);
1335
1336 /*
1337 * We really shouldn't be looking at the ->index of an
1338 * unlocked page. But we're not allowed to lock these
1339 * pages. So we rely upon nobody altering the ->index
1340 * of this (pinned-by-us) page.
1341 */
1342 index = page->index;
1343 if (index > next)
1344 next = index;
1345 next++;
1346
1347 if (skip_page)
1348 continue;
1349
1350 generic_error_remove_page(mapping, page);
1351 unlock_page(page);
1352 }
1353 pagevec_release(&pvec);
1354 cond_resched();
1355 }
1356}
1357
1358/*
1359 * Invalidate inode pages in a worker thread. (This can't be done
1360 * in the message handler context.)
1361 */
1362static void ceph_invalidate_work(struct work_struct *work)
1363{
1364 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1365 i_pg_inv_work);
1366 struct inode *inode = &ci->vfs_inode;
1367 u32 orig_gen;
1368 int check = 0;
1369
1370 spin_lock(&inode->i_lock);
1371 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1372 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1373 if (ci->i_rdcache_gen == 0 ||
1374 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1375 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1376 /* nevermind! */
1377 ci->i_rdcache_revoking = 0;
1378 spin_unlock(&inode->i_lock);
1379 goto out;
1380 }
1381 orig_gen = ci->i_rdcache_gen;
1382 spin_unlock(&inode->i_lock);
1383
1384 ceph_invalidate_nondirty_pages(inode->i_mapping);
1385
1386 spin_lock(&inode->i_lock);
1387 if (orig_gen == ci->i_rdcache_gen) {
1388 dout("invalidate_pages %p gen %d successful\n", inode,
1389 ci->i_rdcache_gen);
1390 ci->i_rdcache_gen = 0;
1391 ci->i_rdcache_revoking = 0;
1392 check = 1;
1393 } else {
1394 dout("invalidate_pages %p gen %d raced, gen now %d\n",
1395 inode, orig_gen, ci->i_rdcache_gen);
1396 }
1397 spin_unlock(&inode->i_lock);
1398
1399 if (check)
1400 ceph_check_caps(ci, 0, NULL);
1401out:
1402 iput(inode);
1403}
1404
1405
1406/*
1407 * called by trunc_wq; take i_mutex ourselves
1408 *
1409 * We truncate in a separate thread as well.
1410 */
1411static void ceph_vmtruncate_work(struct work_struct *work)
1412{
1413 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1414 i_vmtruncate_work);
1415 struct inode *inode = &ci->vfs_inode;
1416
1417 dout("vmtruncate_work %p\n", inode);
1418 mutex_lock(&inode->i_mutex);
1419 __ceph_do_pending_vmtruncate(inode);
1420 mutex_unlock(&inode->i_mutex);
1421 iput(inode);
1422}
1423
1424/*
1425 * Queue an async vmtruncate. If we fail to queue work, we will handle
1426 * the truncation the next time we call __ceph_do_pending_vmtruncate.
1427 */
1428void ceph_queue_vmtruncate(struct inode *inode)
1429{
1430 struct ceph_inode_info *ci = ceph_inode(inode);
1431
1432 if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
1433 &ci->i_vmtruncate_work)) {
1434 dout("ceph_queue_vmtruncate %p\n", inode);
1435 igrab(inode);
1436 } else {
1437 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1438 inode, ci->i_truncate_pending);
1439 }
1440}
1441
1442/*
1443 * called with i_mutex held.
1444 *
1445 * Make sure any pending truncation is applied before doing anything
1446 * that may depend on it.
1447 */
1448void __ceph_do_pending_vmtruncate(struct inode *inode)
1449{
1450 struct ceph_inode_info *ci = ceph_inode(inode);
1451 u64 to;
1452 int wrbuffer_refs, wake = 0;
1453
1454retry:
1455 spin_lock(&inode->i_lock);
1456 if (ci->i_truncate_pending == 0) {
1457 dout("__do_pending_vmtruncate %p none pending\n", inode);
1458 spin_unlock(&inode->i_lock);
1459 return;
1460 }
1461
1462 /*
1463 * make sure any dirty snapped pages are flushed before we
1464 * possibly truncate them.. so write AND block!
1465 */
1466 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1467 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1468 inode);
1469 spin_unlock(&inode->i_lock);
1470 filemap_write_and_wait_range(&inode->i_data, 0,
1471 inode->i_sb->s_maxbytes);
1472 goto retry;
1473 }
1474
1475 to = ci->i_truncate_size;
1476 wrbuffer_refs = ci->i_wrbuffer_ref;
1477 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1478 ci->i_truncate_pending, to);
1479 spin_unlock(&inode->i_lock);
1480
1481 truncate_inode_pages(inode->i_mapping, to);
1482
1483 spin_lock(&inode->i_lock);
1484 ci->i_truncate_pending--;
1485 if (ci->i_truncate_pending == 0)
1486 wake = 1;
1487 spin_unlock(&inode->i_lock);
1488
1489 if (wrbuffer_refs == 0)
1490 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1491 if (wake)
1492 wake_up(&ci->i_cap_wq);
1493}
1494
1495
1496/*
1497 * symlinks
1498 */
1499static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1500{
1501 struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1502 nd_set_link(nd, ci->i_symlink);
1503 return NULL;
1504}
1505
1506static const struct inode_operations ceph_symlink_iops = {
1507 .readlink = generic_readlink,
1508 .follow_link = ceph_sym_follow_link,
1509};
1510
1511/*
1512 * setattr
1513 */
1514int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1515{
1516 struct inode *inode = dentry->d_inode;
1517 struct ceph_inode_info *ci = ceph_inode(inode);
1518 struct inode *parent_inode = dentry->d_parent->d_inode;
1519 const unsigned int ia_valid = attr->ia_valid;
1520 struct ceph_mds_request *req;
1521 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
1522 int issued;
1523 int release = 0, dirtied = 0;
1524 int mask = 0;
1525 int err = 0;
1526
1527 if (ceph_snap(inode) != CEPH_NOSNAP)
1528 return -EROFS;
1529
1530 __ceph_do_pending_vmtruncate(inode);
1531
1532 err = inode_change_ok(inode, attr);
1533 if (err != 0)
1534 return err;
1535
1536 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
1537 USE_AUTH_MDS);
1538 if (IS_ERR(req))
1539 return PTR_ERR(req);
1540
1541 spin_lock(&inode->i_lock);
1542 issued = __ceph_caps_issued(ci, NULL);
1543 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
1544
1545 if (ia_valid & ATTR_UID) {
1546 dout("setattr %p uid %d -> %d\n", inode,
1547 inode->i_uid, attr->ia_uid);
1548 if (issued & CEPH_CAP_AUTH_EXCL) {
1549 inode->i_uid = attr->ia_uid;
1550 dirtied |= CEPH_CAP_AUTH_EXCL;
1551 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1552 attr->ia_uid != inode->i_uid) {
1553 req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
1554 mask |= CEPH_SETATTR_UID;
1555 release |= CEPH_CAP_AUTH_SHARED;
1556 }
1557 }
1558 if (ia_valid & ATTR_GID) {
1559 dout("setattr %p gid %d -> %d\n", inode,
1560 inode->i_gid, attr->ia_gid);
1561 if (issued & CEPH_CAP_AUTH_EXCL) {
1562 inode->i_gid = attr->ia_gid;
1563 dirtied |= CEPH_CAP_AUTH_EXCL;
1564 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1565 attr->ia_gid != inode->i_gid) {
1566 req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
1567 mask |= CEPH_SETATTR_GID;
1568 release |= CEPH_CAP_AUTH_SHARED;
1569 }
1570 }
1571 if (ia_valid & ATTR_MODE) {
1572 dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
1573 attr->ia_mode);
1574 if (issued & CEPH_CAP_AUTH_EXCL) {
1575 inode->i_mode = attr->ia_mode;
1576 dirtied |= CEPH_CAP_AUTH_EXCL;
1577 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1578 attr->ia_mode != inode->i_mode) {
1579 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1580 mask |= CEPH_SETATTR_MODE;
1581 release |= CEPH_CAP_AUTH_SHARED;
1582 }
1583 }
1584
1585 if (ia_valid & ATTR_ATIME) {
1586 dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
1587 inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
1588 attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
1589 if (issued & CEPH_CAP_FILE_EXCL) {
1590 ci->i_time_warp_seq++;
1591 inode->i_atime = attr->ia_atime;
1592 dirtied |= CEPH_CAP_FILE_EXCL;
1593 } else if ((issued & CEPH_CAP_FILE_WR) &&
1594 timespec_compare(&inode->i_atime,
1595 &attr->ia_atime) < 0) {
1596 inode->i_atime = attr->ia_atime;
1597 dirtied |= CEPH_CAP_FILE_WR;
1598 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1599 !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
1600 ceph_encode_timespec(&req->r_args.setattr.atime,
1601 &attr->ia_atime);
1602 mask |= CEPH_SETATTR_ATIME;
1603 release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
1604 CEPH_CAP_FILE_WR;
1605 }
1606 }
1607 if (ia_valid & ATTR_MTIME) {
1608 dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
1609 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
1610 attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
1611 if (issued & CEPH_CAP_FILE_EXCL) {
1612 ci->i_time_warp_seq++;
1613 inode->i_mtime = attr->ia_mtime;
1614 dirtied |= CEPH_CAP_FILE_EXCL;
1615 } else if ((issued & CEPH_CAP_FILE_WR) &&
1616 timespec_compare(&inode->i_mtime,
1617 &attr->ia_mtime) < 0) {
1618 inode->i_mtime = attr->ia_mtime;
1619 dirtied |= CEPH_CAP_FILE_WR;
1620 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1621 !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
1622 ceph_encode_timespec(&req->r_args.setattr.mtime,
1623 &attr->ia_mtime);
1624 mask |= CEPH_SETATTR_MTIME;
1625 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1626 CEPH_CAP_FILE_WR;
1627 }
1628 }
1629 if (ia_valid & ATTR_SIZE) {
1630 dout("setattr %p size %lld -> %lld\n", inode,
1631 inode->i_size, attr->ia_size);
1632 if (attr->ia_size > inode->i_sb->s_maxbytes) {
1633 err = -EINVAL;
1634 goto out;
1635 }
1636 if ((issued & CEPH_CAP_FILE_EXCL) &&
1637 attr->ia_size > inode->i_size) {
1638 inode->i_size = attr->ia_size;
1639 inode->i_blocks =
1640 (attr->ia_size + (1 << 9) - 1) >> 9;
1641 inode->i_ctime = attr->ia_ctime;
1642 ci->i_reported_size = attr->ia_size;
1643 dirtied |= CEPH_CAP_FILE_EXCL;
1644 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1645 attr->ia_size != inode->i_size) {
1646 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
1647 req->r_args.setattr.old_size =
1648 cpu_to_le64(inode->i_size);
1649 mask |= CEPH_SETATTR_SIZE;
1650 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1651 CEPH_CAP_FILE_WR;
1652 }
1653 }
1654
1655 /* these do nothing */
1656 if (ia_valid & ATTR_CTIME) {
1657 bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
1658 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
1659 dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
1660 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
1661 attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
1662 only ? "ctime only" : "ignored");
1663 inode->i_ctime = attr->ia_ctime;
1664 if (only) {
1665 /*
1666 * if kernel wants to dirty ctime but nothing else,
1667 * we need to choose a cap to dirty under, or do
1668 * an almost-no-op setattr
1669 */
1670 if (issued & CEPH_CAP_AUTH_EXCL)
1671 dirtied |= CEPH_CAP_AUTH_EXCL;
1672 else if (issued & CEPH_CAP_FILE_EXCL)
1673 dirtied |= CEPH_CAP_FILE_EXCL;
1674 else if (issued & CEPH_CAP_XATTR_EXCL)
1675 dirtied |= CEPH_CAP_XATTR_EXCL;
1676 else
1677 mask |= CEPH_SETATTR_CTIME;
1678 }
1679 }
1680 if (ia_valid & ATTR_FILE)
1681 dout("setattr %p ATTR_FILE ... hrm!\n", inode);
1682
1683 if (dirtied) {
1684 __ceph_mark_dirty_caps(ci, dirtied);
1685 inode->i_ctime = CURRENT_TIME;
1686 }
1687
1688 release &= issued;
1689 spin_unlock(&inode->i_lock);
1690
1691 if (mask) {
1692 req->r_inode = igrab(inode);
1693 req->r_inode_drop = release;
1694 req->r_args.setattr.mask = cpu_to_le32(mask);
1695 req->r_num_caps = 1;
1696 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1697 }
1698 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1699 ceph_cap_string(dirtied), mask);
1700
1701 ceph_mdsc_put_request(req);
1702 __ceph_do_pending_vmtruncate(inode);
1703 return err;
1704out:
1705 spin_unlock(&inode->i_lock);
1706 ceph_mdsc_put_request(req);
1707 return err;
1708}
1709
1710/*
1711 * Verify that we have a lease on the given mask. If not,
1712 * do a getattr against an mds.
1713 */
1714int ceph_do_getattr(struct inode *inode, int mask)
1715{
1716 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
1717 struct ceph_mds_client *mdsc = &client->mdsc;
1718 struct ceph_mds_request *req;
1719 int err;
1720
1721 if (ceph_snap(inode) == CEPH_SNAPDIR) {
1722 dout("do_getattr inode %p SNAPDIR\n", inode);
1723 return 0;
1724 }
1725
1726 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
1727 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1728 return 0;
1729
1730 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1731 if (IS_ERR(req))
1732 return PTR_ERR(req);
1733 req->r_inode = igrab(inode);
1734 req->r_num_caps = 1;
1735 req->r_args.getattr.mask = cpu_to_le32(mask);
1736 err = ceph_mdsc_do_request(mdsc, NULL, req);
1737 ceph_mdsc_put_request(req);
1738 dout("do_getattr result=%d\n", err);
1739 return err;
1740}
1741
1742
1743/*
1744 * Check inode permissions. We verify we have a valid value for
1745 * the AUTH cap, then call the generic handler.
1746 */
1747int ceph_permission(struct inode *inode, int mask)
1748{
1749 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1750
1751 if (!err)
1752 err = generic_permission(inode, mask, NULL);
1753 return err;
1754}
1755
1756/*
 1757 * Get all attributes. Hopefully someday we'll have a statlite()
1758 * and can limit the fields we require to be accurate.
1759 */
1760int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1761 struct kstat *stat)
1762{
1763 struct inode *inode = dentry->d_inode;
1764 struct ceph_inode_info *ci = ceph_inode(inode);
1765 int err;
1766
1767 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1768 if (!err) {
1769 generic_fillattr(inode, stat);
1770 stat->ino = inode->i_ino;
1771 if (ceph_snap(inode) != CEPH_NOSNAP)
1772 stat->dev = ceph_snap(inode);
1773 else
1774 stat->dev = 0;
1775 if (S_ISDIR(inode->i_mode)) {
1776 stat->size = ci->i_rbytes;
1777 stat->blocks = 0;
1778 stat->blksize = 65536;
1779 }
1780 }
1781 return err;
1782}
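[Editor's note, not part of the patch: because ceph_getattr() above reports
ci->i_rbytes as a directory's size, a plain stat(2) from userspace sees the
recursive byte count of everything beneath that directory. A minimal sketch,
assuming a Ceph mount at the hypothetical path /mnt/ceph:

	#include <stdio.h>
	#include <sys/stat.h>

	int main(void)
	{
		struct stat st;

		if (stat("/mnt/ceph/somedir", &st) == 0)	/* hypothetical path */
			printf("recursive bytes: %lld\n", (long long)st.st_size);
		return 0;
	}
]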
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..8a5bcae62846
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
1#include <linux/in.h>
2
3#include "ioctl.h"
4#include "super.h"
5#include "ceph_debug.h"
6
7
8/*
9 * ioctls
10 */
11
12/*
13 * get and set the file layout
14 */
15static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
16{
17 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
18 struct ceph_ioctl_layout l;
19 int err;
20
21 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
22 if (!err) {
23 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
24 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
25 l.object_size = ceph_file_layout_object_size(ci->i_layout);
26 l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
27 l.preferred_osd =
28 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
29 if (copy_to_user(arg, &l, sizeof(l)))
30 return -EFAULT;
31 }
32
33 return err;
34}
35
36static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{
38 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l;
43 int err, i;
44
45 /* copy and validate */
46 if (copy_from_user(&l, arg, sizeof(l)))
47 return -EFAULT;
48
49 if ((l.object_size & ~PAGE_MASK) ||
50 (l.stripe_unit & ~PAGE_MASK) ||
51 !l.stripe_unit ||
52 (l.object_size &&
53 (unsigned)l.object_size % (unsigned)l.stripe_unit))
54 return -EINVAL;
55
56 /* make sure it's a valid data pool */
57 if (l.data_pool > 0) {
58 mutex_lock(&mdsc->mutex);
59 err = -EINVAL;
60 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
61 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
62 err = 0;
63 break;
64 }
65 mutex_unlock(&mdsc->mutex);
66 if (err)
67 return err;
68 }
69
70 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
71 USE_AUTH_MDS);
72 if (IS_ERR(req))
73 return PTR_ERR(req);
74 req->r_inode = igrab(inode);
75 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
76
77 req->r_args.setlayout.layout.fl_stripe_unit =
78 cpu_to_le32(l.stripe_unit);
79 req->r_args.setlayout.layout.fl_stripe_count =
80 cpu_to_le32(l.stripe_count);
81 req->r_args.setlayout.layout.fl_object_size =
82 cpu_to_le32(l.object_size);
83 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
84 req->r_args.setlayout.layout.fl_pg_preferred =
85 cpu_to_le32(l.preferred_osd);
86
87 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
88 ceph_mdsc_put_request(req);
89 return err;
90}
91
92/*
93 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset.
95 */
96static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
97{
98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
102 u64 len = 1, olen;
103 u64 tmp;
104 struct ceph_object_layout ol;
105 struct ceph_pg pgid;
106
107 /* copy and validate */
108 if (copy_from_user(&dl, arg, sizeof(dl)))
109 return -EFAULT;
110
111 down_read(&osdc->map_sem);
112 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
113 &dl.object_no, &dl.object_offset, &olen);
114 dl.file_offset -= dl.object_offset;
115 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
116 dl.block_size = ceph_file_layout_su(ci->i_layout);
117
118 /* block_offset = object_offset % block_size */
119 tmp = dl.object_offset;
120 dl.block_offset = do_div(tmp, dl.block_size);
121
122 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
123 ceph_ino(inode), dl.object_no);
124 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
125 osdc->osdmap);
126
127 pgid = ol.ol_pgid;
128 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
129 if (dl.osd >= 0) {
130 struct ceph_entity_addr *a =
131 ceph_osd_addr(osdc->osdmap, dl.osd);
132 if (a)
133 memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
134 } else {
135 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
136 }
137 up_read(&osdc->map_sem);
138
139 /* send result back to user */
140 if (copy_to_user(arg, &dl, sizeof(dl)))
141 return -EFAULT;
142
143 return 0;
144}
145
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
149 switch (cmd) {
150 case CEPH_IOC_GET_LAYOUT:
151 return ceph_ioctl_get_layout(file, (void __user *)arg);
152
153 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg);
155
156 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
158 }
159 return -ENOTTY;
160}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
1#ifndef FS_CEPH_IOCTL_H
2#define FS_CEPH_IOCTL_H
3
4#include <linux/ioctl.h>
5#include <linux/types.h>
6
7#define CEPH_IOCTL_MAGIC 0x97
8
9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout {
11 __u64 stripe_unit, stripe_count, object_size;
12 __u64 data_pool;
13 __s64 preferred_osd;
14};
15
16#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout)
20
21/*
22 * Extract identity, address of the OSD and object storing a given
23 * file offset.
24 */
25struct ceph_ioctl_dataloc {
26 __u64 file_offset; /* in+out: file offset */
27 __u64 object_offset; /* out: offset in object */
28 __u64 object_no; /* out: object # */
29 __u64 object_size; /* out: object size */
30 char object_name[64]; /* out: object name */
31 __u64 block_offset; /* out: offset in block */
32 __u64 block_size; /* out: block length */
33 __s64 osd; /* out: osd # */
34 struct sockaddr_storage osd_addr; /* out: osd address */
35};
36
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc)
39
40#endif
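[Editor's note: a minimal userspace sketch of driving the ioctls defined
above, assuming a file on a Ceph mount at a hypothetical path; error
handling is elided:

	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* CEPH_IOC_* and the structs above */

	int main(void)
	{
		struct ceph_ioctl_layout l;
		struct ceph_ioctl_dataloc dl = { .file_offset = 0 };
		int fd = open("/mnt/ceph/somefile", O_RDONLY);	/* hypothetical */

		if (ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) == 0)
			printf("stripe_unit %llu object_size %llu\n",
			       (unsigned long long)l.stripe_unit,
			       (unsigned long long)l.object_size);
		if (ioctl(fd, CEPH_IOC_GET_DATALOC, &dl) == 0)
			printf("offset 0 is in %s on osd%lld\n",
			       dl.object_name, (long long)dl.osd);
		return 0;
	}
]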
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..24561a557e01
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3047 @@
1#include "ceph_debug.h"
2
3#include <linux/wait.h>
4#include <linux/slab.h>
5#include <linux/sched.h>
6
7#include "mds_client.h"
8#include "mon_client.h"
9#include "super.h"
10#include "messenger.h"
11#include "decode.h"
12#include "auth.h"
13#include "pagelist.h"
14
15/*
16 * A cluster of MDS (metadata server) daemons is responsible for
17 * managing the file system namespace (the directory hierarchy and
18 * inodes) and for coordinating shared access to storage. Metadata is
 19 * partitioned hierarchically across a number of servers, and that
20 * partition varies over time as the cluster adjusts the distribution
21 * in order to balance load.
22 *
 23 * The MDS client is primarily responsible for managing synchronous
 24 * metadata requests for operations like open, unlink, and so forth.
 25 * If there is an MDS failure, we find out about it when we (possibly
26 * request and) receive a new MDS map, and can resubmit affected
27 * requests.
28 *
29 * For the most part, though, we take advantage of a lossless
30 * communications channel to the MDS, and do not need to worry about
31 * timing out or resubmitting requests.
32 *
33 * We maintain a stateful "session" with each MDS we interact with.
 34 * Within each session, we send periodic heartbeat messages to ensure
 35 * any capabilities or leases we have been issued remain valid. If
36 * the session times out and goes stale, our leases and capabilities
37 * are no longer valid.
38 */
39
40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head);
42
 43static const struct ceph_connection_operations mds_con_ops;
44
45
46/*
47 * mds reply parsing
48 */
49
50/*
51 * parse individual inode info
52 */
53static int parse_reply_info_in(void **p, void *end,
54 struct ceph_mds_reply_info_in *info)
55{
56 int err = -EIO;
57
58 info->in = *p;
59 *p += sizeof(struct ceph_mds_reply_inode) +
60 sizeof(*info->in->fragtree.splits) *
61 le32_to_cpu(info->in->fragtree.nsplits);
62
63 ceph_decode_32_safe(p, end, info->symlink_len, bad);
64 ceph_decode_need(p, end, info->symlink_len, bad);
65 info->symlink = *p;
66 *p += info->symlink_len;
67
68 ceph_decode_32_safe(p, end, info->xattr_len, bad);
69 ceph_decode_need(p, end, info->xattr_len, bad);
70 info->xattr_data = *p;
71 *p += info->xattr_len;
72 return 0;
73bad:
74 return err;
75}
76
77/*
78 * parse a normal reply, which may contain a (dir+)dentry and/or a
79 * target inode.
80 */
81static int parse_reply_info_trace(void **p, void *end,
82 struct ceph_mds_reply_info_parsed *info)
83{
84 int err;
85
86 if (info->head->is_dentry) {
87 err = parse_reply_info_in(p, end, &info->diri);
88 if (err < 0)
89 goto out_bad;
90
91 if (unlikely(*p + sizeof(*info->dirfrag) > end))
92 goto bad;
93 info->dirfrag = *p;
94 *p += sizeof(*info->dirfrag) +
95 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
96 if (unlikely(*p > end))
97 goto bad;
98
99 ceph_decode_32_safe(p, end, info->dname_len, bad);
100 ceph_decode_need(p, end, info->dname_len, bad);
101 info->dname = *p;
102 *p += info->dname_len;
103 info->dlease = *p;
104 *p += sizeof(*info->dlease);
105 }
106
107 if (info->head->is_target) {
108 err = parse_reply_info_in(p, end, &info->targeti);
109 if (err < 0)
110 goto out_bad;
111 }
112
113 if (unlikely(*p != end))
114 goto bad;
115 return 0;
116
117bad:
118 err = -EIO;
119out_bad:
120 pr_err("problem parsing mds trace %d\n", err);
121 return err;
122}
123
124/*
125 * parse readdir results
126 */
127static int parse_reply_info_dir(void **p, void *end,
128 struct ceph_mds_reply_info_parsed *info)
129{
130 u32 num, i = 0;
131 int err;
132
133 info->dir_dir = *p;
134 if (*p + sizeof(*info->dir_dir) > end)
135 goto bad;
136 *p += sizeof(*info->dir_dir) +
137 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
138 if (*p > end)
139 goto bad;
140
141 ceph_decode_need(p, end, sizeof(num) + 2, bad);
142 num = ceph_decode_32(p);
143 info->dir_end = ceph_decode_8(p);
144 info->dir_complete = ceph_decode_8(p);
145 if (num == 0)
146 goto done;
147
148 /* alloc large array */
149 info->dir_nr = num;
150 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
151 sizeof(*info->dir_dname) +
152 sizeof(*info->dir_dname_len) +
153 sizeof(*info->dir_dlease),
154 GFP_NOFS);
155 if (info->dir_in == NULL) {
156 err = -ENOMEM;
157 goto out_bad;
158 }
159 info->dir_dname = (void *)(info->dir_in + num);
160 info->dir_dname_len = (void *)(info->dir_dname + num);
161 info->dir_dlease = (void *)(info->dir_dname_len + num);
162
163 while (num) {
164 /* dentry */
165 ceph_decode_need(p, end, sizeof(u32)*2, bad);
166 info->dir_dname_len[i] = ceph_decode_32(p);
167 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
168 info->dir_dname[i] = *p;
169 *p += info->dir_dname_len[i];
170 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
171 info->dir_dname[i]);
172 info->dir_dlease[i] = *p;
173 *p += sizeof(struct ceph_mds_reply_lease);
174
175 /* inode */
176 err = parse_reply_info_in(p, end, &info->dir_in[i]);
177 if (err < 0)
178 goto out_bad;
179 i++;
180 num--;
181 }
182
183done:
184 if (*p != end)
185 goto bad;
186 return 0;
187
188bad:
189 err = -EIO;
190out_bad:
191 pr_err("problem parsing dir contents %d\n", err);
192 return err;
193}
194
195/*
196 * parse entire mds reply
197 */
198static int parse_reply_info(struct ceph_msg *msg,
199 struct ceph_mds_reply_info_parsed *info)
200{
201 void *p, *end;
202 u32 len;
203 int err;
204
205 info->head = msg->front.iov_base;
206 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
207 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
208
209 /* trace */
210 ceph_decode_32_safe(&p, end, len, bad);
211 if (len > 0) {
212 err = parse_reply_info_trace(&p, p+len, info);
213 if (err < 0)
214 goto out_bad;
215 }
216
217 /* dir content */
218 ceph_decode_32_safe(&p, end, len, bad);
219 if (len > 0) {
220 err = parse_reply_info_dir(&p, p+len, info);
221 if (err < 0)
222 goto out_bad;
223 }
224
225 /* snap blob */
226 ceph_decode_32_safe(&p, end, len, bad);
227 info->snapblob_len = len;
228 info->snapblob = p;
229 p += len;
230
231 if (p != end)
232 goto bad;
233 return 0;
234
235bad:
236 err = -EIO;
237out_bad:
238 pr_err("mds parse_reply err %d\n", err);
239 return err;
240}
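/*
 * (Editor's note, not in the patch.)  The reply front parsed above is
 * laid out as: struct ceph_mds_reply_head, then three length-prefixed
 * sections in order: the trace (dentry + target inode), the readdir
 * contents, and the snap realm blob.
 */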
241
242static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
243{
244 kfree(info->dir_in);
245}
246
247
248/*
249 * sessions
250 */
251static const char *session_state_name(int s)
252{
253 switch (s) {
254 case CEPH_MDS_SESSION_NEW: return "new";
255 case CEPH_MDS_SESSION_OPENING: return "opening";
256 case CEPH_MDS_SESSION_OPEN: return "open";
257 case CEPH_MDS_SESSION_HUNG: return "hung";
258 case CEPH_MDS_SESSION_CLOSING: return "closing";
259 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
260 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
261 default: return "???";
262 }
263}
264
265static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
266{
267 if (atomic_inc_not_zero(&s->s_ref)) {
268 dout("mdsc get_session %p %d -> %d\n", s,
269 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
270 return s;
271 } else {
272 dout("mdsc get_session %p 0 -- FAIL", s);
273 return NULL;
274 }
275}
276
277void ceph_put_mds_session(struct ceph_mds_session *s)
278{
279 dout("mdsc put_session %p %d -> %d\n", s,
280 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
281 if (atomic_dec_and_test(&s->s_ref)) {
282 if (s->s_authorizer)
283 s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
284 s->s_mdsc->client->monc.auth, s->s_authorizer);
285 kfree(s);
286 }
287}
288
289/*
290 * called under mdsc->mutex
291 */
292struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
293 int mds)
294{
295 struct ceph_mds_session *session;
296
297 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
298 return NULL;
299 session = mdsc->sessions[mds];
300 dout("lookup_mds_session %p %d\n", session,
301 atomic_read(&session->s_ref));
302 get_session(session);
303 return session;
304}
305
306static bool __have_session(struct ceph_mds_client *mdsc, int mds)
307{
308 if (mds >= mdsc->max_sessions)
309 return false;
310 return mdsc->sessions[mds];
311}
312
313static int __verify_registered_session(struct ceph_mds_client *mdsc,
314 struct ceph_mds_session *s)
315{
316 if (s->s_mds >= mdsc->max_sessions ||
317 mdsc->sessions[s->s_mds] != s)
318 return -ENOENT;
319 return 0;
320}
321
322/*
323 * create+register a new session for given mds.
324 * called under mdsc->mutex.
325 */
326static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
327 int mds)
328{
329 struct ceph_mds_session *s;
330
331 s = kzalloc(sizeof(*s), GFP_NOFS);
332 if (!s)
333 return ERR_PTR(-ENOMEM);
334 s->s_mdsc = mdsc;
335 s->s_mds = mds;
336 s->s_state = CEPH_MDS_SESSION_NEW;
337 s->s_ttl = 0;
338 s->s_seq = 0;
339 mutex_init(&s->s_mutex);
340
341 ceph_con_init(mdsc->client->msgr, &s->s_con);
342 s->s_con.private = s;
343 s->s_con.ops = &mds_con_ops;
344 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
345 s->s_con.peer_name.num = cpu_to_le64(mds);
346
347 spin_lock_init(&s->s_cap_lock);
348 s->s_cap_gen = 0;
349 s->s_cap_ttl = 0;
350 s->s_renew_requested = 0;
351 s->s_renew_seq = 0;
352 INIT_LIST_HEAD(&s->s_caps);
353 s->s_nr_caps = 0;
354 s->s_trim_caps = 0;
355 atomic_set(&s->s_ref, 1);
356 INIT_LIST_HEAD(&s->s_waiting);
357 INIT_LIST_HEAD(&s->s_unsafe);
358 s->s_num_cap_releases = 0;
359 s->s_cap_iterator = NULL;
360 INIT_LIST_HEAD(&s->s_cap_releases);
361 INIT_LIST_HEAD(&s->s_cap_releases_done);
362 INIT_LIST_HEAD(&s->s_cap_flushing);
363 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
364
365 dout("register_session mds%d\n", mds);
366 if (mds >= mdsc->max_sessions) {
367 int newmax = 1 << get_count_order(mds+1);
368 struct ceph_mds_session **sa;
369
370 dout("register_session realloc to %d\n", newmax);
371 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
372 if (sa == NULL)
373 goto fail_realloc;
374 if (mdsc->sessions) {
375 memcpy(sa, mdsc->sessions,
376 mdsc->max_sessions * sizeof(void *));
377 kfree(mdsc->sessions);
378 }
379 mdsc->sessions = sa;
380 mdsc->max_sessions = newmax;
381 }
382 mdsc->sessions[mds] = s;
383 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
384
385 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
386
387 return s;
388
389fail_realloc:
390 kfree(s);
391 return ERR_PTR(-ENOMEM);
392}
393
394/*
395 * called under mdsc->mutex
396 */
397static void __unregister_session(struct ceph_mds_client *mdsc,
398 struct ceph_mds_session *s)
399{
400 dout("__unregister_session mds%d %p\n", s->s_mds, s);
401 BUG_ON(mdsc->sessions[s->s_mds] != s);
402 mdsc->sessions[s->s_mds] = NULL;
403 ceph_con_close(&s->s_con);
404 ceph_put_mds_session(s);
405}
406
407/*
408 * drop session refs in request.
409 *
410 * should be last request ref, or hold mdsc->mutex
411 */
412static void put_request_session(struct ceph_mds_request *req)
413{
414 if (req->r_session) {
415 ceph_put_mds_session(req->r_session);
416 req->r_session = NULL;
417 }
418}
419
420void ceph_mdsc_release_request(struct kref *kref)
421{
422 struct ceph_mds_request *req = container_of(kref,
423 struct ceph_mds_request,
424 r_kref);
425 if (req->r_request)
426 ceph_msg_put(req->r_request);
427 if (req->r_reply) {
428 ceph_msg_put(req->r_reply);
429 destroy_reply_info(&req->r_reply_info);
430 }
431 if (req->r_inode) {
432 ceph_put_cap_refs(ceph_inode(req->r_inode),
433 CEPH_CAP_PIN);
434 iput(req->r_inode);
435 }
436 if (req->r_locked_dir)
437 ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
438 CEPH_CAP_PIN);
439 if (req->r_target_inode)
440 iput(req->r_target_inode);
441 if (req->r_dentry)
442 dput(req->r_dentry);
443 if (req->r_old_dentry) {
444 ceph_put_cap_refs(
445 ceph_inode(req->r_old_dentry->d_parent->d_inode),
446 CEPH_CAP_PIN);
447 dput(req->r_old_dentry);
448 }
449 kfree(req->r_path1);
450 kfree(req->r_path2);
451 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation);
453 kfree(req);
454}
455
456/*
 457 * lookup request, bump ref if found.
458 *
459 * called under mdsc->mutex.
460 */
461static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
462 u64 tid)
463{
464 struct ceph_mds_request *req;
465 struct rb_node *n = mdsc->request_tree.rb_node;
466
467 while (n) {
468 req = rb_entry(n, struct ceph_mds_request, r_node);
469 if (tid < req->r_tid)
470 n = n->rb_left;
471 else if (tid > req->r_tid)
472 n = n->rb_right;
473 else {
474 ceph_mdsc_get_request(req);
475 return req;
476 }
477 }
478 return NULL;
479}
480
481static void __insert_request(struct ceph_mds_client *mdsc,
482 struct ceph_mds_request *new)
483{
484 struct rb_node **p = &mdsc->request_tree.rb_node;
485 struct rb_node *parent = NULL;
486 struct ceph_mds_request *req = NULL;
487
488 while (*p) {
489 parent = *p;
490 req = rb_entry(parent, struct ceph_mds_request, r_node);
491 if (new->r_tid < req->r_tid)
492 p = &(*p)->rb_left;
493 else if (new->r_tid > req->r_tid)
494 p = &(*p)->rb_right;
495 else
496 BUG();
497 }
498
499 rb_link_node(&new->r_node, parent, p);
500 rb_insert_color(&new->r_node, &mdsc->request_tree);
501}
502
503/*
 504 * Register an in-flight request, and assign a tid. Link to the
 505 * directory we are modifying (if any).
506 *
507 * Called under mdsc->mutex.
508 */
509static void __register_request(struct ceph_mds_client *mdsc,
510 struct ceph_mds_request *req,
511 struct inode *dir)
512{
513 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req);
519
520 if (dir) {
521 struct ceph_inode_info *ci = ceph_inode(dir);
522
523 spin_lock(&ci->i_unsafe_lock);
524 req->r_unsafe_dir = dir;
525 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
526 spin_unlock(&ci->i_unsafe_lock);
527 }
528}
529
530static void __unregister_request(struct ceph_mds_client *mdsc,
531 struct ceph_mds_request *req)
532{
533 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
534 rb_erase(&req->r_node, &mdsc->request_tree);
535 RB_CLEAR_NODE(&req->r_node);
536
537 if (req->r_unsafe_dir) {
538 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
539
540 spin_lock(&ci->i_unsafe_lock);
541 list_del_init(&req->r_unsafe_dir_item);
542 spin_unlock(&ci->i_unsafe_lock);
543 }
544
545 ceph_mdsc_put_request(req);
546}
547
548/*
549 * Choose mds to send request to next. If there is a hint set in the
550 * request (e.g., due to a prior forward hint from the mds), use that.
551 * Otherwise, consult frag tree and/or caps to identify the
552 * appropriate mds. If all else fails, choose randomly.
553 *
554 * Called under mdsc->mutex.
555 */
556static int __choose_mds(struct ceph_mds_client *mdsc,
557 struct ceph_mds_request *req)
558{
559 struct inode *inode;
560 struct ceph_inode_info *ci;
561 struct ceph_cap *cap;
562 int mode = req->r_direct_mode;
563 int mds = -1;
564 u32 hash = req->r_direct_hash;
565 bool is_hash = req->r_direct_is_hash;
566
567 /*
568 * is there a specific mds we should try? ignore hint if we have
569 * no session and the mds is not up (active or recovering).
570 */
571 if (req->r_resend_mds >= 0 &&
572 (__have_session(mdsc, req->r_resend_mds) ||
573 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
574 dout("choose_mds using resend_mds mds%d\n",
575 req->r_resend_mds);
576 return req->r_resend_mds;
577 }
578
579 if (mode == USE_RANDOM_MDS)
580 goto random;
581
582 inode = NULL;
583 if (req->r_inode) {
584 inode = req->r_inode;
585 } else if (req->r_dentry) {
586 if (req->r_dentry->d_inode) {
587 inode = req->r_dentry->d_inode;
588 } else {
589 inode = req->r_dentry->d_parent->d_inode;
590 hash = req->r_dentry->d_name.hash;
591 is_hash = true;
592 }
593 }
594 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
595 (int)hash, mode);
596 if (!inode)
597 goto random;
598 ci = ceph_inode(inode);
599
600 if (is_hash && S_ISDIR(inode->i_mode)) {
601 struct ceph_inode_frag frag;
602 int found;
603
604 ceph_choose_frag(ci, hash, &frag, &found);
605 if (found) {
606 if (mode == USE_ANY_MDS && frag.ndist > 0) {
607 u8 r;
608
609 /* choose a random replica */
610 get_random_bytes(&r, 1);
611 r %= frag.ndist;
612 mds = frag.dist[r];
613 dout("choose_mds %p %llx.%llx "
614 "frag %u mds%d (%d/%d)\n",
615 inode, ceph_vinop(inode),
616 frag.frag, frag.mds,
617 (int)r, frag.ndist);
618 return mds;
619 }
620
621 /* since this file/dir wasn't known to be
 622 * replicated, we want to look for the
623 * authoritative mds. */
624 mode = USE_AUTH_MDS;
625 if (frag.mds >= 0) {
626 /* choose auth mds */
627 mds = frag.mds;
628 dout("choose_mds %p %llx.%llx "
629 "frag %u mds%d (auth)\n",
630 inode, ceph_vinop(inode), frag.frag, mds);
631 return mds;
632 }
633 }
634 }
635
636 spin_lock(&inode->i_lock);
637 cap = NULL;
638 if (mode == USE_AUTH_MDS)
639 cap = ci->i_auth_cap;
640 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
641 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
642 if (!cap) {
643 spin_unlock(&inode->i_lock);
644 goto random;
645 }
646 mds = cap->session->s_mds;
647 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
648 inode, ceph_vinop(inode), mds,
649 cap == ci->i_auth_cap ? "auth " : "", cap);
650 spin_unlock(&inode->i_lock);
651 return mds;
652
653random:
654 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
655 dout("choose_mds chose random mds%d\n", mds);
656 return mds;
657}
658
659
660/*
661 * session messages
662 */
663static struct ceph_msg *create_session_msg(u32 op, u64 seq)
664{
665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h;
667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
669 if (IS_ERR(msg)) {
670 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg));
672 }
673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op);
675 h->seq = cpu_to_le64(seq);
676 return msg;
677}
678
679/*
680 * send session open request.
681 *
682 * called under mdsc->mutex
683 */
684static int __open_session(struct ceph_mds_client *mdsc,
685 struct ceph_mds_session *session)
686{
687 struct ceph_msg *msg;
688 int mstate;
689 int mds = session->s_mds;
690 int err = 0;
691
692 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
694 dout("open_session to mds%d (%s)\n", mds,
695 ceph_mds_state_name(mstate));
696 session->s_state = CEPH_MDS_SESSION_OPENING;
697 session->s_renew_requested = jiffies;
698
699 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) {
702 err = PTR_ERR(msg);
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg);
706
707out:
 708 return err;
709}
710
711/*
712 * session caps
713 */
714
715/*
716 * Free preallocated cap messages assigned to this session
717 */
718static void cleanup_cap_releases(struct ceph_mds_session *session)
719{
720 struct ceph_msg *msg;
721
722 spin_lock(&session->s_cap_lock);
723 while (!list_empty(&session->s_cap_releases)) {
724 msg = list_first_entry(&session->s_cap_releases,
725 struct ceph_msg, list_head);
726 list_del_init(&msg->list_head);
727 ceph_msg_put(msg);
728 }
729 while (!list_empty(&session->s_cap_releases_done)) {
730 msg = list_first_entry(&session->s_cap_releases_done,
731 struct ceph_msg, list_head);
732 list_del_init(&msg->list_head);
733 ceph_msg_put(msg);
734 }
735 spin_unlock(&session->s_cap_lock);
736}
737
738/*
739 * Helper to safely iterate over all caps associated with a session, with
740 * special care taken to handle a racing __ceph_remove_cap().
741 *
742 * Caller must hold session s_mutex.
743 */
744static int iterate_session_caps(struct ceph_mds_session *session,
745 int (*cb)(struct inode *, struct ceph_cap *,
746 void *), void *arg)
747{
748 struct list_head *p;
749 struct ceph_cap *cap;
750 struct inode *inode, *last_inode = NULL;
751 struct ceph_cap *old_cap = NULL;
752 int ret;
753
754 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
755 spin_lock(&session->s_cap_lock);
756 p = session->s_caps.next;
757 while (p != &session->s_caps) {
758 cap = list_entry(p, struct ceph_cap, session_caps);
759 inode = igrab(&cap->ci->vfs_inode);
760 if (!inode) {
761 p = p->next;
762 continue;
763 }
764 session->s_cap_iterator = cap;
765 spin_unlock(&session->s_cap_lock);
766
767 if (last_inode) {
768 iput(last_inode);
769 last_inode = NULL;
770 }
771 if (old_cap) {
772 ceph_put_cap(old_cap);
773 old_cap = NULL;
774 }
775
776 ret = cb(inode, cap, arg);
777 last_inode = inode;
778
779 spin_lock(&session->s_cap_lock);
780 p = p->next;
781 if (cap->ci == NULL) {
782 dout("iterate_session_caps finishing cap %p removal\n",
783 cap);
784 BUG_ON(cap->session != session);
785 list_del_init(&cap->session_caps);
786 session->s_nr_caps--;
787 cap->session = NULL;
788 old_cap = cap; /* put_cap it w/o locks held */
789 }
790 if (ret < 0)
791 goto out;
792 }
793 ret = 0;
794out:
795 session->s_cap_iterator = NULL;
796 spin_unlock(&session->s_cap_lock);
797
798 if (last_inode)
799 iput(last_inode);
800 if (old_cap)
801 ceph_put_cap(old_cap);
802
803 return ret;
804}
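/*
 * (Editor's note, not in the patch.)  The deferred iput()/ceph_put_cap()
 * above appear to be deliberate: both may block, so they are issued only
 * after s_cap_lock is dropped, while s_cap_iterator signals a racing
 * __ceph_remove_cap() to leave the list linkage for the next pass of
 * this loop to clean up (the cap->ci == NULL case).
 */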
805
806static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
807 void *arg)
808{
809 struct ceph_inode_info *ci = ceph_inode(inode);
810 dout("removing cap %p, ci is %p, inode is %p\n",
811 cap, ci, &ci->vfs_inode);
812 ceph_remove_cap(cap);
813 return 0;
814}
815
816/*
817 * caller must hold session s_mutex
818 */
819static void remove_session_caps(struct ceph_mds_session *session)
820{
821 dout("remove_session_caps on %p\n", session);
822 iterate_session_caps(session, remove_session_caps_cb, NULL);
823 BUG_ON(session->s_nr_caps > 0);
824 cleanup_cap_releases(session);
825}
826
827/*
828 * wake up any threads waiting on this session's caps. if the cap is
829 * old (didn't get renewed on the client reconnect), remove it now.
830 *
831 * caller must hold s_mutex.
832 */
833static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
834 void *arg)
835{
836 struct ceph_inode_info *ci = ceph_inode(inode);
837
838 wake_up(&ci->i_cap_wq);
839 if (arg) {
840 spin_lock(&inode->i_lock);
841 ci->i_wanted_max_size = 0;
842 ci->i_requested_max_size = 0;
843 spin_unlock(&inode->i_lock);
844 }
845 return 0;
846}
847
848static void wake_up_session_caps(struct ceph_mds_session *session,
849 int reconnect)
850{
851 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
852 iterate_session_caps(session, wake_up_session_cb,
853 (void *)(unsigned long)reconnect);
854}
855
856/*
857 * Send periodic message to MDS renewing all currently held caps. The
858 * ack will reset the expiration for all caps from this session.
859 *
860 * caller holds s_mutex
861 */
862static int send_renew_caps(struct ceph_mds_client *mdsc,
863 struct ceph_mds_session *session)
864{
865 struct ceph_msg *msg;
866 int state;
867
868 if (time_after_eq(jiffies, session->s_cap_ttl) &&
869 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
870 pr_info("mds%d caps stale\n", session->s_mds);
871 session->s_renew_requested = jiffies;
872
873 /* do not try to renew caps until a recovering mds has reconnected
874 * with its clients. */
875 state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
876 if (state < CEPH_MDS_STATE_RECONNECT) {
877 dout("send_renew_caps ignoring mds%d (%s)\n",
878 session->s_mds, ceph_mds_state_name(state));
879 return 0;
880 }
881
882 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
883 ceph_mds_state_name(state));
884 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
885 ++session->s_renew_seq);
886 if (IS_ERR(msg))
887 return PTR_ERR(msg);
888 ceph_con_send(&session->s_con, msg);
889 return 0;
890}
891
892/*
893 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
894 *
895 * Called under session->s_mutex
896 */
897static void renewed_caps(struct ceph_mds_client *mdsc,
898 struct ceph_mds_session *session, int is_renew)
899{
900 int was_stale;
901 int wake = 0;
902
903 spin_lock(&session->s_cap_lock);
904 was_stale = is_renew && (session->s_cap_ttl == 0 ||
905 time_after_eq(jiffies, session->s_cap_ttl));
906
907 session->s_cap_ttl = session->s_renew_requested +
908 mdsc->mdsmap->m_session_timeout*HZ;
909
910 if (was_stale) {
911 if (time_before(jiffies, session->s_cap_ttl)) {
912 pr_info("mds%d caps renewed\n", session->s_mds);
913 wake = 1;
914 } else {
915 pr_info("mds%d caps still stale\n", session->s_mds);
916 }
917 }
918 dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
919 session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
 920 time_before(jiffies, session->s_cap_ttl) ? "fresh" : "stale");
921 spin_unlock(&session->s_cap_lock);
922
923 if (wake)
924 wake_up_session_caps(session, 0);
925}
926
927/*
928 * send a session close request
929 */
930static int request_close_session(struct ceph_mds_client *mdsc,
931 struct ceph_mds_session *session)
932{
933 struct ceph_msg *msg;
934 int err = 0;
935
936 dout("request_close_session mds%d state %s seq %lld\n",
937 session->s_mds, session_state_name(session->s_state),
938 session->s_seq);
939 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
940 if (IS_ERR(msg))
941 err = PTR_ERR(msg);
942 else
943 ceph_con_send(&session->s_con, msg);
944 return err;
945}
946
947/*
948 * Called with s_mutex held.
949 */
950static int __close_session(struct ceph_mds_client *mdsc,
951 struct ceph_mds_session *session)
952{
953 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
954 return 0;
955 session->s_state = CEPH_MDS_SESSION_CLOSING;
956 return request_close_session(mdsc, session);
957}
958
959/*
960 * Trim old(er) caps.
961 *
962 * Because we can't cache an inode without one or more caps, we do
963 * this indirectly: if a cap is unused, we prune its aliases, at which
 964 * point the inode will hopefully get dropped too.
965 *
966 * Yes, this is a bit sloppy. Our only real goal here is to respond to
967 * memory pressure from the MDS, though, so it needn't be perfect.
968 */
969static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
970{
971 struct ceph_mds_session *session = arg;
972 struct ceph_inode_info *ci = ceph_inode(inode);
973 int used, oissued, mine;
974
975 if (session->s_trim_caps <= 0)
976 return -1;
977
978 spin_lock(&inode->i_lock);
979 mine = cap->issued | cap->implemented;
980 used = __ceph_caps_used(ci);
981 oissued = __ceph_caps_issued_other(ci, cap);
982
983 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
984 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
985 ceph_cap_string(used));
986 if (ci->i_dirty_caps)
987 goto out; /* dirty caps */
988 if ((used & ~oissued) & mine)
989 goto out; /* we need these caps */
990
991 session->s_trim_caps--;
992 if (oissued) {
993 /* we aren't the only cap.. just remove us */
994 __ceph_remove_cap(cap);
995 } else {
996 /* try to drop referring dentries */
997 spin_unlock(&inode->i_lock);
998 d_prune_aliases(inode);
999 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
1000 inode, cap, atomic_read(&inode->i_count));
1001 return 0;
1002 }
1003
1004out:
1005 spin_unlock(&inode->i_lock);
1006 return 0;
1007}
1008
1009/*
1010 * Trim session cap count down to some max number.
1011 */
1012static int trim_caps(struct ceph_mds_client *mdsc,
1013 struct ceph_mds_session *session,
1014 int max_caps)
1015{
1016 int trim_caps = session->s_nr_caps - max_caps;
1017
1018 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1019 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1020 if (trim_caps > 0) {
1021 session->s_trim_caps = trim_caps;
1022 iterate_session_caps(session, trim_caps_cb, session);
1023 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1024 session->s_mds, session->s_nr_caps, max_caps,
1025 trim_caps - session->s_trim_caps);
1026 session->s_trim_caps = 0;
1027 }
1028 return 0;
1029}
1030
1031/*
1032 * Allocate cap_release messages. If there is a partially full message
 1033 * in the queue, try to allocate enough to cover its remainder, so that
1034 * we can send it immediately.
1035 *
1036 * Called under s_mutex.
1037 */
1038static int add_cap_releases(struct ceph_mds_client *mdsc,
1039 struct ceph_mds_session *session,
1040 int extra)
1041{
1042 struct ceph_msg *msg;
1043 struct ceph_mds_cap_release *head;
1044 int err = -ENOMEM;
1045
1046 if (extra < 0)
1047 extra = mdsc->client->mount_args->cap_release_safety;
1048
1049 spin_lock(&session->s_cap_lock);
1050
1051 if (!list_empty(&session->s_cap_releases)) {
1052 msg = list_first_entry(&session->s_cap_releases,
1053 struct ceph_msg,
1054 list_head);
1055 head = msg->front.iov_base;
1056 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1057 }
1058
1059 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1060 spin_unlock(&session->s_cap_lock);
1061 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1062 0, 0, NULL);
1063 if (!msg)
1064 goto out_unlocked;
1065 dout("add_cap_releases %p msg %p now %d\n", session, msg,
1066 (int)msg->front.iov_len);
1067 head = msg->front.iov_base;
1068 head->num = cpu_to_le32(0);
1069 msg->front.iov_len = sizeof(*head);
1070 spin_lock(&session->s_cap_lock);
1071 list_add(&msg->list_head, &session->s_cap_releases);
1072 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1073 }
1074
1075 if (!list_empty(&session->s_cap_releases)) {
1076 msg = list_first_entry(&session->s_cap_releases,
1077 struct ceph_msg,
1078 list_head);
1079 head = msg->front.iov_base;
1080 if (head->num) {
1081 dout(" queueing non-full %p (%d)\n", msg,
1082 le32_to_cpu(head->num));
1083 list_move_tail(&msg->list_head,
1084 &session->s_cap_releases_done);
1085 session->s_num_cap_releases -=
1086 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1087 }
1088 }
1089 err = 0;
1090 spin_unlock(&session->s_cap_lock);
1091out_unlocked:
1092 return err;
1093}
1094
1095/*
 1096 * check whether all dirty inode data has been flushed to disk.
1097 *
1098 * returns true if we've flushed through want_flush_seq
1099 */
1100static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1101{
1102 int mds, ret = 1;
1103
1104 dout("check_cap_flush want %lld\n", want_flush_seq);
1105 mutex_lock(&mdsc->mutex);
1106 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
1107 struct ceph_mds_session *session = mdsc->sessions[mds];
1108
1109 if (!session)
1110 continue;
1111 get_session(session);
1112 mutex_unlock(&mdsc->mutex);
1113
1114 mutex_lock(&session->s_mutex);
1115 if (!list_empty(&session->s_cap_flushing)) {
1116 struct ceph_inode_info *ci =
1117 list_entry(session->s_cap_flushing.next,
1118 struct ceph_inode_info,
1119 i_flushing_item);
1120 struct inode *inode = &ci->vfs_inode;
1121
1122 spin_lock(&inode->i_lock);
1123 if (ci->i_cap_flush_seq <= want_flush_seq) {
1124 dout("check_cap_flush still flushing %p "
1125 "seq %lld <= %lld to mds%d\n", inode,
1126 ci->i_cap_flush_seq, want_flush_seq,
1127 session->s_mds);
1128 ret = 0;
1129 }
1130 spin_unlock(&inode->i_lock);
1131 }
1132 mutex_unlock(&session->s_mutex);
1133 ceph_put_mds_session(session);
1134
1135 if (!ret)
1136 return ret;
1137 mutex_lock(&mdsc->mutex);
1138 }
1139
1140 mutex_unlock(&mdsc->mutex);
1141 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1142 return ret;
1143}
1144
1145/*
1146 * called under s_mutex
1147 */
1148static void send_cap_releases(struct ceph_mds_client *mdsc,
1149 struct ceph_mds_session *session)
1150{
1151 struct ceph_msg *msg;
1152
1153 dout("send_cap_releases mds%d\n", session->s_mds);
1154 while (1) {
1155 spin_lock(&session->s_cap_lock);
1156 if (list_empty(&session->s_cap_releases_done))
1157 break;
1158 msg = list_first_entry(&session->s_cap_releases_done,
1159 struct ceph_msg, list_head);
1160 list_del_init(&msg->list_head);
1161 spin_unlock(&session->s_cap_lock);
1162 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1163 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1164 ceph_con_send(&session->s_con, msg);
1165 }
1166 spin_unlock(&session->s_cap_lock);
1167}
1168
1169/*
1170 * requests
1171 */
1172
1173/*
1174 * Create an mds request.
1175 */
1176struct ceph_mds_request *
1177ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1178{
1179 struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1180
1181 if (!req)
1182 return ERR_PTR(-ENOMEM);
1183
1184 req->r_started = jiffies;
1185 req->r_resend_mds = -1;
1186 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1187 req->r_fmode = -1;
1188 kref_init(&req->r_kref);
1189 INIT_LIST_HEAD(&req->r_wait);
1190 init_completion(&req->r_completion);
1191 init_completion(&req->r_safe_completion);
1192 INIT_LIST_HEAD(&req->r_unsafe_item);
1193
1194 req->r_op = op;
1195 req->r_direct_mode = mode;
1196 return req;
1197}
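/*
 * (Editor's note.)  The typical caller pattern, as in ceph_do_getattr()
 * in fs/ceph/inode.c earlier in this diff:
 *
 *	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR,
 *				       USE_ANY_MDS);
 *	if (IS_ERR(req))
 *		return PTR_ERR(req);
 *	req->r_inode = igrab(inode);
 *	req->r_num_caps = 1;
 *	err = ceph_mdsc_do_request(mdsc, NULL, req);
 *	ceph_mdsc_put_request(req);
 */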
1198
1199/*
 1200 * return the oldest (lowest tid) request in the tree, or NULL if none.
1201 *
1202 * called under mdsc->mutex.
1203 */
1204static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1205{
1206 if (RB_EMPTY_ROOT(&mdsc->request_tree))
1207 return NULL;
1208 return rb_entry(rb_first(&mdsc->request_tree),
1209 struct ceph_mds_request, r_node);
1210}
1211
1212static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1213{
1214 struct ceph_mds_request *req = __get_oldest_req(mdsc);
1215
1216 if (req)
1217 return req->r_tid;
1218 return 0;
1219}
1220
1221/*
1222 * Build a dentry's path. Allocate on heap; caller must kfree. Based
1223 * on build_path_from_dentry in fs/cifs/dir.c.
1224 *
1225 * If @stop_on_nosnap, generate path relative to the first non-snapped
1226 * inode.
1227 *
1228 * Encode hidden .snap dirs as a double /, i.e.
1229 * foo/.snap/bar -> foo//bar
1230 */
1231char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1232 int stop_on_nosnap)
1233{
1234 struct dentry *temp;
1235 char *path;
1236 int len, pos;
1237
1238 if (dentry == NULL)
1239 return ERR_PTR(-EINVAL);
1240
1241retry:
1242 len = 0;
1243 for (temp = dentry; !IS_ROOT(temp);) {
1244 struct inode *inode = temp->d_inode;
1245 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1246 len++; /* slash only */
1247 else if (stop_on_nosnap && inode &&
1248 ceph_snap(inode) == CEPH_NOSNAP)
1249 break;
1250 else
1251 len += 1 + temp->d_name.len;
1252 temp = temp->d_parent;
1253 if (temp == NULL) {
1254 pr_err("build_path_dentry corrupt dentry %p\n", dentry);
1255 return ERR_PTR(-EINVAL);
1256 }
1257 }
1258 if (len)
1259 len--; /* no leading '/' */
1260
1261 path = kmalloc(len+1, GFP_NOFS);
1262 if (path == NULL)
1263 return ERR_PTR(-ENOMEM);
1264 pos = len;
1265 path[pos] = 0; /* trailing null */
1266 for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1267 struct inode *inode = temp->d_inode;
1268
1269 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1270 dout("build_path_dentry path+%d: %p SNAPDIR\n",
1271 pos, temp);
1272 } else if (stop_on_nosnap && inode &&
1273 ceph_snap(inode) == CEPH_NOSNAP) {
1274 break;
1275 } else {
1276 pos -= temp->d_name.len;
1277 if (pos < 0)
1278 break;
1279 strncpy(path + pos, temp->d_name.name,
1280 temp->d_name.len);
1281 dout("build_path_dentry path+%d: %p '%.*s'\n",
1282 pos, temp, temp->d_name.len, path + pos);
1283 }
1284 if (pos)
1285 path[--pos] = '/';
1286 temp = temp->d_parent;
1287 if (temp == NULL) {
1288 pr_err("build_path_dentry corrupt dentry\n");
1289 kfree(path);
1290 return ERR_PTR(-EINVAL);
1291 }
1292 }
1293 if (pos != 0) {
1294 pr_err("build_path_dentry did not end path lookup where "
1295 "expected, namelen is %d, pos is %d\n", len, pos);
1296 /* presumably this is only possible if racing with a
1297 rename of one of the parent directories (we can not
1298 lock the dentries above us to prevent this, but
1299 retrying should be harmless) */
1300 kfree(path);
1301 goto retry;
1302 }
1303
1304 *base = ceph_ino(temp->d_inode);
1305 *plen = len;
1306 dout("build_path_dentry on %p %d built %llx '%.*s'\n",
1307 dentry, atomic_read(&dentry->d_count), *base, len, path);
1308 return path;
1309}
1310
1311static int build_dentry_path(struct dentry *dentry,
1312 const char **ppath, int *ppathlen, u64 *pino,
1313 int *pfreepath)
1314{
1315 char *path;
1316
1317 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
1318 *pino = ceph_ino(dentry->d_parent->d_inode);
1319 *ppath = dentry->d_name.name;
1320 *ppathlen = dentry->d_name.len;
1321 return 0;
1322 }
1323 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1324 if (IS_ERR(path))
1325 return PTR_ERR(path);
1326 *ppath = path;
1327 *pfreepath = 1;
1328 return 0;
1329}
1330
1331static int build_inode_path(struct inode *inode,
1332 const char **ppath, int *ppathlen, u64 *pino,
1333 int *pfreepath)
1334{
1335 struct dentry *dentry;
1336 char *path;
1337
1338 if (ceph_snap(inode) == CEPH_NOSNAP) {
1339 *pino = ceph_ino(inode);
1340 *ppathlen = 0;
1341 return 0;
1342 }
1343 dentry = d_find_alias(inode);
1344 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1345 dput(dentry);
1346 if (IS_ERR(path))
1347 return PTR_ERR(path);
1348 *ppath = path;
1349 *pfreepath = 1;
1350 return 0;
1351}
1352
1353/*
1354 * request arguments may be specified via an inode *, a dentry *, or
1355 * an explicit ino+path.
1356 */
1357static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1358 const char *rpath, u64 rino,
1359 const char **ppath, int *pathlen,
1360 u64 *ino, int *freepath)
1361{
1362 int r = 0;
1363
1364 if (rinode) {
1365 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1366 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1367 ceph_snap(rinode));
1368 } else if (rdentry) {
1369 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1370 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1371 *ppath);
1372 } else if (rpath) {
1373 *ino = rino;
1374 *ppath = rpath;
1375 *pathlen = strlen(rpath);
1376 dout(" path %.*s\n", *pathlen, rpath);
1377 }
1378
1379 return r;
1380}
1381
1382/*
1383 * called under mdsc->mutex
1384 */
1385static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1386 struct ceph_mds_request *req,
1387 int mds)
1388{
1389 struct ceph_msg *msg;
1390 struct ceph_mds_request_head *head;
1391 const char *path1 = NULL;
1392 const char *path2 = NULL;
1393 u64 ino1 = 0, ino2 = 0;
1394 int pathlen1 = 0, pathlen2 = 0;
1395 int freepath1 = 0, freepath2 = 0;
1396 int len;
1397 u16 releases;
1398 void *p, *end;
1399 int ret;
1400
1401 ret = set_request_path_attr(req->r_inode, req->r_dentry,
1402 req->r_path1, req->r_ino1.ino,
1403 &path1, &pathlen1, &ino1, &freepath1);
1404 if (ret < 0) {
1405 msg = ERR_PTR(ret);
1406 goto out;
1407 }
1408
1409 ret = set_request_path_attr(NULL, req->r_old_dentry,
1410 req->r_path2, req->r_ino2.ino,
1411 &path2, &pathlen2, &ino2, &freepath2);
1412 if (ret < 0) {
1413 msg = ERR_PTR(ret);
1414 goto out_free1;
1415 }
1416
1417 len = sizeof(*head) +
1418 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
1419
1420 /* calculate (max) length for cap releases */
1421 len += sizeof(struct ceph_mds_request_release) *
1422 (!!req->r_inode_drop + !!req->r_dentry_drop +
1423 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1424 if (req->r_dentry_drop)
1425 len += req->r_dentry->d_name.len;
1426 if (req->r_old_dentry_drop)
1427 len += req->r_old_dentry->d_name.len;
1428
1429 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
1430 if (IS_ERR(msg))
1431 goto out_free2;
1432
1433 msg->hdr.tid = cpu_to_le64(req->r_tid);
1434
1435 head = msg->front.iov_base;
1436 p = msg->front.iov_base + sizeof(*head);
1437 end = msg->front.iov_base + msg->front.iov_len;
1438
1439 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1440 head->op = cpu_to_le32(req->r_op);
1441 head->caller_uid = cpu_to_le32(current_fsuid());
1442 head->caller_gid = cpu_to_le32(current_fsgid());
1443 head->args = req->r_args;
1444
1445 ceph_encode_filepath(&p, end, ino1, path1);
1446 ceph_encode_filepath(&p, end, ino2, path2);
1447
1448 /* cap releases */
1449 releases = 0;
1450 if (req->r_inode_drop)
1451 releases += ceph_encode_inode_release(&p,
1452 req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1453 mds, req->r_inode_drop, req->r_inode_unless, 0);
1454 if (req->r_dentry_drop)
1455 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1456 mds, req->r_dentry_drop, req->r_dentry_unless);
1457 if (req->r_old_dentry_drop)
1458 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1459 mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1460 if (req->r_old_inode_drop)
1461 releases += ceph_encode_inode_release(&p,
1462 req->r_old_dentry->d_inode,
1463 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1464 head->num_releases = cpu_to_le16(releases);
1465
1466 BUG_ON(p > end);
1467 msg->front.iov_len = p - msg->front.iov_base;
1468 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1469
1470 msg->pages = req->r_pages;
1471 msg->nr_pages = req->r_num_pages;
1472 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1473 msg->hdr.data_off = cpu_to_le16(0);
1474
1475out_free2:
1476 if (freepath2)
1477 kfree((char *)path2);
1478out_free1:
1479 if (freepath1)
1480 kfree((char *)path1);
1481out:
1482 return msg;
1483}
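/*
 * (Editor's note.)  The request front built above is, roughly: a
 * struct ceph_mds_request_head, two encoded filepaths (ino plus a
 * length-prefixed path string), and up to four cap release entries
 * (inode, dentry, old inode, old dentry), counted in
 * head->num_releases.
 */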
1484
1485/*
 1486 * called under mdsc->mutex on error, and with no mutex held
 1487 * on success.
1488 */
1489static void complete_request(struct ceph_mds_client *mdsc,
1490 struct ceph_mds_request *req)
1491{
1492 if (req->r_callback)
1493 req->r_callback(mdsc, req);
1494 else
1495 complete(&req->r_completion);
1496}
1497
1498/*
1499 * called under mdsc->mutex
1500 */
1501static int __prepare_send_request(struct ceph_mds_client *mdsc,
1502 struct ceph_mds_request *req,
1503 int mds)
1504{
1505 struct ceph_mds_request_head *rhead;
1506 struct ceph_msg *msg;
1507 int flags = 0;
1508
1509 req->r_mds = mds;
1510 req->r_attempts++;
1511 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1512 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1513
1514 if (req->r_request) {
1515 ceph_msg_put(req->r_request);
1516 req->r_request = NULL;
1517 }
1518 msg = create_request_message(mdsc, req, mds);
1519 if (IS_ERR(msg)) {
1520 req->r_reply = ERR_PTR(PTR_ERR(msg));
1521 complete_request(mdsc, req);
1522 return -PTR_ERR(msg);
1523 }
1524 req->r_request = msg;
1525
1526 rhead = msg->front.iov_base;
1527 rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1528 if (req->r_got_unsafe)
1529 flags |= CEPH_MDS_FLAG_REPLAY;
1530 if (req->r_locked_dir)
1531 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1532 rhead->flags = cpu_to_le32(flags);
1533 rhead->num_fwd = req->r_num_fwd;
1534 rhead->num_retry = req->r_attempts - 1;
1535
1536 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1537
1538 if (req->r_target_inode && req->r_got_unsafe)
1539 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1540 else
1541 rhead->ino = 0;
1542 return 0;
1543}
1544
1545/*
1546 * send request, or put it on the appropriate wait list.
1547 */
1548static int __do_request(struct ceph_mds_client *mdsc,
1549 struct ceph_mds_request *req)
1550{
1551 struct ceph_mds_session *session = NULL;
1552 int mds = -1;
1553 int err = -EAGAIN;
1554
1555 if (req->r_reply)
1556 goto out;
1557
1558 if (req->r_timeout &&
1559 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1560 dout("do_request timed out\n");
1561 err = -EIO;
1562 goto finish;
1563 }
1564
1565 mds = __choose_mds(mdsc, req);
1566 if (mds < 0 ||
1567 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1568 dout("do_request no mds or not active, waiting for map\n");
1569 list_add(&req->r_wait, &mdsc->waiting_for_map);
1570 goto out;
1571 }
1572
1573 /* get, open session */
1574 session = __ceph_lookup_mds_session(mdsc, mds);
1575 if (!session) {
1576 session = register_session(mdsc, mds);
1577 if (IS_ERR(session)) {
1578 err = PTR_ERR(session);
1579 goto finish;
1580 }
1581 }
1582 dout("do_request mds%d session %p state %s\n", mds, session,
1583 session_state_name(session->s_state));
1584 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1585 session->s_state != CEPH_MDS_SESSION_HUNG) {
1586 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1587 session->s_state == CEPH_MDS_SESSION_CLOSING)
1588 __open_session(mdsc, session);
1589 list_add(&req->r_wait, &session->s_waiting);
1590 goto out_session;
1591 }
1592
1593 /* send request */
1594 req->r_session = get_session(session);
1595 req->r_resend_mds = -1; /* forget any previous mds hint */
1596
1597 if (req->r_request_started == 0) /* note request start time */
1598 req->r_request_started = jiffies;
1599
1600 err = __prepare_send_request(mdsc, req, mds);
1601 if (!err) {
1602 ceph_msg_get(req->r_request);
1603 ceph_con_send(&session->s_con, req->r_request);
1604 }
1605
1606out_session:
1607 ceph_put_mds_session(session);
1608out:
1609 return err;
1610
1611finish:
1612 req->r_reply = ERR_PTR(err);
1613 complete_request(mdsc, req);
1614 goto out;
1615}
1616
1617/*
1618 * called under mdsc->mutex
1619 */
1620static void __wake_requests(struct ceph_mds_client *mdsc,
1621 struct list_head *head)
1622{
1623 struct ceph_mds_request *req, *nreq;
1624
1625 list_for_each_entry_safe(req, nreq, head, r_wait) {
1626 list_del_init(&req->r_wait);
1627 __do_request(mdsc, req);
1628 }
1629}
1630
1631/*
1632 * Wake up threads with requests pending for @mds, so that they can
1633 * resubmit their requests to a possibly different mds. If @all is set,
 1634 * wake up those whose requests have been forwarded to @mds, too.
1635 */
1636static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
1637{
1638 struct ceph_mds_request *req;
1639 struct rb_node *p;
1640
1641 dout("kick_requests mds%d\n", mds);
1642 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
1643 req = rb_entry(p, struct ceph_mds_request, r_node);
1644 if (req->r_got_unsafe)
1645 continue;
1646 if (req->r_session &&
1647 req->r_session->s_mds == mds) {
1648 dout(" kicking tid %llu\n", req->r_tid);
1649 put_request_session(req);
1650 __do_request(mdsc, req);
1651 }
1652 }
1653}
1654
1655void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
1656 struct ceph_mds_request *req)
1657{
1658 dout("submit_request on %p\n", req);
1659 mutex_lock(&mdsc->mutex);
1660 __register_request(mdsc, req, NULL);
1661 __do_request(mdsc, req);
1662 mutex_unlock(&mdsc->mutex);
1663}
1664
1665/*
 1666 * Synchronously perform an mds request, taking care of all of the
 1667 * session setup, forwarding, and retry details.
1668 */
1669int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1670 struct inode *dir,
1671 struct ceph_mds_request *req)
1672{
1673 int err;
1674
1675 dout("do_request on %p\n", req);
1676
1677 /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
1678 if (req->r_inode)
1679 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
1680 if (req->r_locked_dir)
1681 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1682 if (req->r_old_dentry)
1683 ceph_get_cap_refs(
1684 ceph_inode(req->r_old_dentry->d_parent->d_inode),
1685 CEPH_CAP_PIN);
1686
1687 /* issue */
1688 mutex_lock(&mdsc->mutex);
1689 __register_request(mdsc, req, dir);
1690 __do_request(mdsc, req);
1691
1692 /* wait */
1693 if (!req->r_reply) {
1694 mutex_unlock(&mdsc->mutex);
1695 if (req->r_timeout) {
1696 err = (long)wait_for_completion_interruptible_timeout(
1697 &req->r_completion, req->r_timeout);
1698 if (err == 0)
1699 req->r_reply = ERR_PTR(-EIO);
1700 else if (err < 0)
1701 req->r_reply = ERR_PTR(err);
1702 } else {
1703 err = wait_for_completion_interruptible(
1704 &req->r_completion);
1705 if (err)
1706 req->r_reply = ERR_PTR(err);
1707 }
1708 mutex_lock(&mdsc->mutex);
1709 }
1710
1711 if (IS_ERR(req->r_reply)) {
1712 err = PTR_ERR(req->r_reply);
1713 req->r_reply = NULL;
1714
1715 if (err == -ERESTARTSYS) {
1716 /* aborted */
1717 req->r_aborted = true;
1718
1719 if (req->r_locked_dir &&
1720 (req->r_op & CEPH_MDS_OP_WRITE)) {
1721 struct ceph_inode_info *ci =
1722 ceph_inode(req->r_locked_dir);
1723
1724 dout("aborted, clearing I_COMPLETE on %p\n",
1725 req->r_locked_dir);
1726 spin_lock(&req->r_locked_dir->i_lock);
1727 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1728 ci->i_release_count++;
1729 spin_unlock(&req->r_locked_dir->i_lock);
1730 }
1731 } else {
1732 /* clean up this request */
1733 __unregister_request(mdsc, req);
1734 if (!list_empty(&req->r_unsafe_item))
1735 list_del_init(&req->r_unsafe_item);
1736 complete(&req->r_safe_completion);
1737 }
1738 } else if (req->r_err) {
1739 err = req->r_err;
1740 } else {
1741 err = le32_to_cpu(req->r_reply_info.head->result);
1742 }
1743 mutex_unlock(&mdsc->mutex);
1744
1745 dout("do_request %p done, result %d\n", req, err);
1746 return err;
1747}
1748
1749/*
1750 * Handle mds reply.
1751 *
1752 * We take the session mutex and parse and process the reply immediately.
1753 * This preserves the logical ordering of replies, capabilities, etc., sent
1754 * by the MDS as they are applied to our local cache.
1755 */
1756static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1757{
1758 struct ceph_mds_client *mdsc = session->s_mdsc;
1759 struct ceph_mds_request *req;
1760 struct ceph_mds_reply_head *head = msg->front.iov_base;
1761 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
1762 u64 tid;
1763 int err, result;
1764 int mds = session->s_mds;
1765
1766 if (msg->front.iov_len < sizeof(*head)) {
1767 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
1768 ceph_msg_dump(msg);
1769 return;
1770 }
1771
1772 /* get request, session */
1773 tid = le64_to_cpu(msg->hdr.tid);
1774 mutex_lock(&mdsc->mutex);
1775 req = __lookup_request(mdsc, tid);
1776 if (!req) {
1777 dout("handle_reply on unknown tid %llu\n", tid);
1778 mutex_unlock(&mdsc->mutex);
1779 return;
1780 }
1781 dout("handle_reply %p\n", req);
1782
1783 /* correct session? */
1784 if (req->r_session != session) {
1785 pr_err("mdsc_handle_reply got %llu on session mds%d"
1786 " not mds%d\n", tid, session->s_mds,
1787 req->r_session ? req->r_session->s_mds : -1);
1788 mutex_unlock(&mdsc->mutex);
1789 goto out;
1790 }
1791
1792 /* dup? */
1793 if ((req->r_got_unsafe && !head->safe) ||
1794 (req->r_got_safe && head->safe)) {
1795 pr_warning("got a dup %s reply on %llu from mds%d\n",
1796 head->safe ? "safe" : "unsafe", tid, mds);
1797 mutex_unlock(&mdsc->mutex);
1798 goto out;
1799 }
1800
1801 result = le32_to_cpu(head->result);
1802
1803 /*
1804 * Tolerate 2 consecutive ESTALEs from the same mds.
1805 * FIXME: we should be looking at the cap migrate_seq.
1806 */
1807 if (result == -ESTALE) {
1808 req->r_direct_mode = USE_AUTH_MDS;
1809 req->r_num_stale++;
1810 if (req->r_num_stale <= 2) {
1811 __do_request(mdsc, req);
1812 mutex_unlock(&mdsc->mutex);
1813 goto out;
1814 }
1815 } else {
1816 req->r_num_stale = 0;
1817 }
1818
1819 if (head->safe) {
1820 req->r_got_safe = true;
1821 __unregister_request(mdsc, req);
1822 complete(&req->r_safe_completion);
1823
1824 if (req->r_got_unsafe) {
1825 /*
1826 * We already handled the unsafe response, now do the
1827 * cleanup. No need to examine the response; the MDS
1828 * doesn't include any result info in the safe
1829 * response. And even if it did, there is nothing
1830 * useful we could do with a revised return value.
1831 */
1832 dout("got safe reply %llu, mds%d\n", tid, mds);
1833 list_del_init(&req->r_unsafe_item);
1834
1835 /* last unsafe request during umount? */
1836 if (mdsc->stopping && !__get_oldest_req(mdsc))
1837 complete(&mdsc->safe_umount_waiters);
1838 mutex_unlock(&mdsc->mutex);
1839 goto out;
1840 }
1841 }
1842
1843 BUG_ON(req->r_reply);
1844
1845 if (!head->safe) {
1846 req->r_got_unsafe = true;
1847 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1848 }
1849
1850 dout("handle_reply tid %lld result %d\n", tid, result);
1851 rinfo = &req->r_reply_info;
1852 err = parse_reply_info(msg, rinfo);
1853 mutex_unlock(&mdsc->mutex);
1854
1855 mutex_lock(&session->s_mutex);
1856 if (err < 0) {
1857 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
1858 ceph_msg_dump(msg);
1859 goto out_err;
1860 }
1861
1862 /* snap trace */
1863 if (rinfo->snapblob_len) {
1864 down_write(&mdsc->snap_rwsem);
1865 ceph_update_snap_trace(mdsc, rinfo->snapblob,
1866 rinfo->snapblob + rinfo->snapblob_len,
1867 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
1868 downgrade_write(&mdsc->snap_rwsem);
1869 } else {
1870 down_read(&mdsc->snap_rwsem);
1871 }
1872
1873 /* insert trace into our cache */
1874 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1875 if (err == 0) {
1876 if (result == 0 && rinfo->dir_nr)
1877 ceph_readdir_prepopulate(req, req->r_session);
1878 ceph_unreserve_caps(&req->r_caps_reservation);
1879 }
1880
1881 up_read(&mdsc->snap_rwsem);
1882out_err:
1883 if (err) {
1884 req->r_err = err;
1885 } else {
1886 req->r_reply = msg;
1887 ceph_msg_get(msg);
1888 }
1889
1890 add_cap_releases(mdsc, req->r_session, -1);
1891 mutex_unlock(&session->s_mutex);
1892
1893 /* kick calling process */
1894 complete_request(mdsc, req);
1895out:
1896 ceph_mdsc_put_request(req);
1897 return;
1898}
1899
1900
1901
1902/*
1903 * handle mds notification that our request has been forwarded.
1904 */
1905static void handle_forward(struct ceph_mds_client *mdsc,
1906 struct ceph_mds_session *session,
1907 struct ceph_msg *msg)
1908{
1909 struct ceph_mds_request *req;
1910 u64 tid = le64_to_cpu(msg->hdr.tid);
1911 u32 next_mds;
1912 u32 fwd_seq;
1913 int err = -EINVAL;
1914 void *p = msg->front.iov_base;
1915 void *end = p + msg->front.iov_len;
1916
1917 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1918 next_mds = ceph_decode_32(&p);
1919 fwd_seq = ceph_decode_32(&p);
1920
1921 mutex_lock(&mdsc->mutex);
1922 req = __lookup_request(mdsc, tid);
1923 if (!req) {
1924 dout("forward %llu to mds%d - req dne\n", tid, next_mds);
1925 goto out; /* dup reply? */
1926 }
1927
1928 if (fwd_seq <= req->r_num_fwd) {
1929 dout("forward %llu to mds%d - old seq %d <= %d\n",
1930 tid, next_mds, req->r_num_fwd, fwd_seq);
1931 } else {
1932 /* resend. forward race not possible; mds would drop */
1933 dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
1934 req->r_num_fwd = fwd_seq;
1935 req->r_resend_mds = next_mds;
1936 put_request_session(req);
1937 __do_request(mdsc, req);
1938 }
1939 ceph_mdsc_put_request(req);
1940out:
1941 mutex_unlock(&mdsc->mutex);
1942 return;
1943
1944bad:
1945 pr_err("mdsc_handle_forward decode error err=%d\n", err);
1946}
1947
1948/*
1949 * handle an mds session control message
1950 */
1951static void handle_session(struct ceph_mds_session *session,
1952 struct ceph_msg *msg)
1953{
1954 struct ceph_mds_client *mdsc = session->s_mdsc;
1955 u32 op;
1956 u64 seq;
1957 int mds = session->s_mds;
1958 struct ceph_mds_session_head *h = msg->front.iov_base;
1959 int wake = 0;
1960
1961 /* decode */
1962 if (msg->front.iov_len != sizeof(*h))
1963 goto bad;
1964 op = le32_to_cpu(h->op);
1965 seq = le64_to_cpu(h->seq);
1966
1967 mutex_lock(&mdsc->mutex);
1968 if (op == CEPH_SESSION_CLOSE)
1969 __unregister_session(mdsc, session);
1970 /* FIXME: this ttl calculation is generous */
1971 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
1972 mutex_unlock(&mdsc->mutex);
1973
1974 mutex_lock(&session->s_mutex);
1975
1976 dout("handle_session mds%d %s %p state %s seq %llu\n",
1977 mds, ceph_session_op_name(op), session,
1978 session_state_name(session->s_state), seq);
1979
1980 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
1981 session->s_state = CEPH_MDS_SESSION_OPEN;
1982 pr_info("mds%d came back\n", session->s_mds);
1983 }
1984
1985 switch (op) {
1986 case CEPH_SESSION_OPEN:
1987 session->s_state = CEPH_MDS_SESSION_OPEN;
1988 renewed_caps(mdsc, session, 0);
1989 wake = 1;
1990 if (mdsc->stopping)
1991 __close_session(mdsc, session);
1992 break;
1993
1994 case CEPH_SESSION_RENEWCAPS:
1995 if (session->s_renew_seq == seq)
1996 renewed_caps(mdsc, session, 1);
1997 break;
1998
1999 case CEPH_SESSION_CLOSE:
2000 remove_session_caps(session);
2001 wake = 1; /* for good measure */
2002 complete(&mdsc->session_close_waiters);
2003 kick_requests(mdsc, mds, 0); /* cur only */
2004 break;
2005
2006 case CEPH_SESSION_STALE:
2007 pr_info("mds%d caps went stale, renewing\n",
2008 session->s_mds);
2009 spin_lock(&session->s_cap_lock);
2010 session->s_cap_gen++;
2011 session->s_cap_ttl = 0;
2012 spin_unlock(&session->s_cap_lock);
2013 send_renew_caps(mdsc, session);
2014 break;
2015
2016 case CEPH_SESSION_RECALL_STATE:
2017 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2018 break;
2019
2020 default:
2021 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2022 WARN_ON(1);
2023 }
2024
2025 mutex_unlock(&session->s_mutex);
2026 if (wake) {
2027 mutex_lock(&mdsc->mutex);
2028 __wake_requests(mdsc, &session->s_waiting);
2029 mutex_unlock(&mdsc->mutex);
2030 }
2031 return;
2032
2033bad:
2034 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2035 (int)msg->front.iov_len);
2036 ceph_msg_dump(msg);
2037 return;
2038}
2039
2040
2041/*
2042 * called under session->s_mutex.
2043 */
2044static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2045 struct ceph_mds_session *session)
2046{
2047 struct ceph_mds_request *req, *nreq;
2048 int err;
2049
2050 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2051
2052 mutex_lock(&mdsc->mutex);
2053 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2054 err = __prepare_send_request(mdsc, req, session->s_mds);
2055 if (!err) {
2056 ceph_msg_get(req->r_request);
2057 ceph_con_send(&session->s_con, req->r_request);
2058 }
2059 }
2060 mutex_unlock(&mdsc->mutex);
2061}
2062
2063/*
2064 * Encode information about a cap for a reconnect with the MDS.
2065 */
2066static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2067 void *arg)
2068{
2069 struct ceph_mds_cap_reconnect rec;
2070 struct ceph_inode_info *ci;
2071 struct ceph_pagelist *pagelist = arg;
2072 char *path;
2073 int pathlen, err;
2074 u64 pathbase;
2075 struct dentry *dentry;
2076
2077 ci = cap->ci;
2078
2079 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2080 inode, ceph_vinop(inode), cap, cap->cap_id,
2081 ceph_cap_string(cap->issued));
2082 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2083 if (err)
2084 return err;
2085
2086 dentry = d_find_alias(inode);
2087 if (dentry) {
2088 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2089 if (IS_ERR(path)) {
2090 err = PTR_ERR(path);
2091 BUG_ON(err);
2092 }
2093 } else {
2094 path = NULL;
2095 pathlen = 0;
2096 }
2097 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2098 if (err)
2099 goto out;
2100
2101 spin_lock(&inode->i_lock);
2102 cap->seq = 0; /* reset cap seq */
2103 cap->issue_seq = 0; /* and issue_seq */
2104 rec.cap_id = cpu_to_le64(cap->cap_id);
2105 rec.pathbase = cpu_to_le64(pathbase);
2106 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2107 rec.issued = cpu_to_le32(cap->issued);
2108 rec.size = cpu_to_le64(inode->i_size);
2109 ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
2110 ceph_encode_timespec(&rec.atime, &inode->i_atime);
2111 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2112 spin_unlock(&inode->i_lock);
2113
2114 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
2115
2116out:
2117 kfree(path);
2118 dput(dentry);
2119 return err;
2120}
2121
2122
2123/*
2124 * If an MDS fails and recovers, clients need to reconnect in order to
2125 * reestablish shared state. This includes all caps issued through
2126 * this session _and_ the snap_realm hierarchy. Because it's not
2127 * clear which snap realms the mds cares about, we send everything we
2128 * know about; that ensures we'll then get any new info the
2129 * recovering MDS might have.
2130 *
2131 * This is a relatively heavyweight operation, but it's rare.
2132 *
2133 * called with mdsc->mutex held.
2134 */
2135static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2136{
2137 struct ceph_mds_session *session = NULL;
2138 struct ceph_msg *reply;
2139 struct rb_node *p;
2140 int err = -ENOMEM;
2141 struct ceph_pagelist *pagelist;
2142
2143 pr_info("reconnect to recovering mds%d\n", mds);
2144
2145 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2146 if (!pagelist)
2147 goto fail_nopagelist;
2148 ceph_pagelist_init(pagelist);
2149
2150 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
2151 if (IS_ERR(reply)) {
2152 err = PTR_ERR(reply);
2153 goto fail_nomsg;
2154 }
2155
2156 /* find session */
2157 session = __ceph_lookup_mds_session(mdsc, mds);
2158 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2159
2160 if (session) {
2161 mutex_lock(&session->s_mutex);
2162
2163 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2164 session->s_seq = 0;
2165
2166 ceph_con_open(&session->s_con,
2167 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2168
2169 /* replay unsafe requests */
2170 replay_unsafe_requests(mdsc, session);
2171 } else {
2172 dout("no session for mds%d, will send short reconnect\n",
2173 mds);
2174 }
2175
2176 down_read(&mdsc->snap_rwsem);
2177
2178 if (!session)
2179 goto send;
2180 dout("session %p state %s\n", session,
2181 session_state_name(session->s_state));
2182
2183 /* traverse this session's caps */
2184 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2185 if (err)
2186 goto fail;
2187 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2188 if (err < 0)
2189 goto fail;
2190
2191 /*
2192 * snaprealms. we provide mds with the ino, seq (version), and
2193 * parent for all of our realms. If the mds has any newer info,
2194 * it will tell us.
2195 */
2196 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2197 struct ceph_snap_realm *realm =
2198 rb_entry(p, struct ceph_snap_realm, node);
2199 struct ceph_mds_snaprealm_reconnect sr_rec;
2200
2201 dout(" adding snap realm %llx seq %lld parent %llx\n",
2202 realm->ino, realm->seq, realm->parent_ino);
2203 sr_rec.ino = cpu_to_le64(realm->ino);
2204 sr_rec.seq = cpu_to_le64(realm->seq);
2205 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2206 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2207 if (err)
2208 goto fail;
2209 }
2210
2211send:
2212 reply->pagelist = pagelist;
2213 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2214 reply->nr_pages = calc_pages_for(0, pagelist->length);
2215 ceph_con_send(&session->s_con, reply);
2216
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 mutex_unlock(&session->s_mutex);
2219
2220 mutex_lock(&mdsc->mutex);
2221 __wake_requests(mdsc, &session->s_waiting);
2222 mutex_unlock(&mdsc->mutex);
2223
2224 ceph_put_mds_session(session);
2225
2226 up_read(&mdsc->snap_rwsem);
2227 mutex_lock(&mdsc->mutex);
2228 return;
2229
2230fail:
2231 ceph_msg_put(reply);
2232 up_read(&mdsc->snap_rwsem);
2233 mutex_unlock(&session->s_mutex);
2234 ceph_put_mds_session(session);
2235fail_nomsg:
2236 ceph_pagelist_release(pagelist);
2237 kfree(pagelist);
2238fail_nopagelist:
2239 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2240 mutex_lock(&mdsc->mutex);
2241 return;
2242}
2243
2244
2245/*
2246 * compare old and new mdsmaps, kicking requests
2247 * and closing out old connections as necessary
2248 *
2249 * called under mdsc->mutex.
2250 */
2251static void check_new_map(struct ceph_mds_client *mdsc,
2252 struct ceph_mdsmap *newmap,
2253 struct ceph_mdsmap *oldmap)
2254{
2255 int i;
2256 int oldstate, newstate;
2257 struct ceph_mds_session *s;
2258
2259 dout("check_new_map new %u old %u\n",
2260 newmap->m_epoch, oldmap->m_epoch);
2261
2262 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2263 if (mdsc->sessions[i] == NULL)
2264 continue;
2265 s = mdsc->sessions[i];
2266 oldstate = ceph_mdsmap_get_state(oldmap, i);
2267 newstate = ceph_mdsmap_get_state(newmap, i);
2268
2269 dout("check_new_map mds%d state %s -> %s (session %s)\n",
2270 i, ceph_mds_state_name(oldstate),
2271 ceph_mds_state_name(newstate),
2272 session_state_name(s->s_state));
2273
2274 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2275 ceph_mdsmap_get_addr(newmap, i),
2276 sizeof(struct ceph_entity_addr))) {
2277 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2278 /* the session never opened, just close it
2279 * out now */
2280 __wake_requests(mdsc, &s->s_waiting);
2281 __unregister_session(mdsc, s);
2282 } else {
2283 /* just close it */
2284 mutex_unlock(&mdsc->mutex);
2285 mutex_lock(&s->s_mutex);
2286 mutex_lock(&mdsc->mutex);
2287 ceph_con_close(&s->s_con);
2288 mutex_unlock(&s->s_mutex);
2289 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2290 }
2291
2292 /* kick any requests waiting on the recovering mds */
2293 kick_requests(mdsc, i, 1);
2294 } else if (oldstate == newstate) {
2295 continue; /* nothing new with this mds */
2296 }
2297
2298 /*
2299 * send reconnect?
2300 */
2301 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2302 newstate >= CEPH_MDS_STATE_RECONNECT)
2303 send_mds_reconnect(mdsc, i);
2304
2305 /*
2306 * kick requests on any mds that has gone active.
2307 *
2308 * kick requests on cur or forwarder: we may have sent
2309 * the request to mds1, mds1 told us it forwarded it
2310 * to mds2, but then we learn mds1 failed and can't be
2311 * sure it successfully forwarded our request before
2312 * it died.
2313 */
2314 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2315 newstate >= CEPH_MDS_STATE_ACTIVE) {
2316 pr_info("mds%d reconnect completed\n", s->s_mds);
2317 kick_requests(mdsc, i, 1);
2318 ceph_kick_flushing_caps(mdsc, s);
2319 wake_up_session_caps(s, 1);
2320 }
2321 }
2322}
2323
2324
2325
2326/*
2327 * leases
2328 */
2329
2330/*
2331 * caller must hold session s_mutex, dentry->d_lock
2332 */
2333void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2334{
2335 struct ceph_dentry_info *di = ceph_dentry(dentry);
2336
2337 ceph_put_mds_session(di->lease_session);
2338 di->lease_session = NULL;
2339}
2340
2341static void handle_lease(struct ceph_mds_client *mdsc,
2342 struct ceph_mds_session *session,
2343 struct ceph_msg *msg)
2344{
2345 struct super_block *sb = mdsc->client->sb;
2346 struct inode *inode;
2347 struct ceph_inode_info *ci;
2348 struct dentry *parent, *dentry;
2349 struct ceph_dentry_info *di;
2350 int mds = session->s_mds;
2351 struct ceph_mds_lease *h = msg->front.iov_base;
2352 struct ceph_vino vino;
2353 int mask;
2354 struct qstr dname;
2355 int release = 0;
2356
2357 dout("handle_lease from mds%d\n", mds);
2358
2359 /* decode */
2360 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2361 goto bad;
2362 vino.ino = le64_to_cpu(h->ino);
2363 vino.snap = CEPH_NOSNAP;
2364 mask = le16_to_cpu(h->mask);
2365 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2366 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2367 if (dname.len != get_unaligned_le32(h+1))
2368 goto bad;
2369
2370 mutex_lock(&session->s_mutex);
2371 session->s_seq++;
2372
2373 /* lookup inode */
2374 inode = ceph_find_inode(sb, vino);
2375 dout("handle_lease '%s', mask %d, ino %llx %p\n",
2376 ceph_lease_op_name(h->action), mask, vino.ino, inode);
2377 if (inode == NULL) {
2378 dout("handle_lease no inode %llx\n", vino.ino);
2379 goto release;
2380 }
2381 ci = ceph_inode(inode);
2382
2383 /* dentry */
2384 parent = d_find_alias(inode);
2385 if (!parent) {
2386 dout("no parent dentry on inode %p\n", inode);
2387 WARN_ON(1);
2388 goto release; /* hrm... */
2389 }
2390 dname.hash = full_name_hash(dname.name, dname.len);
2391 dentry = d_lookup(parent, &dname);
2392 dput(parent);
2393 if (!dentry)
2394 goto release;
2395
2396 spin_lock(&dentry->d_lock);
2397 di = ceph_dentry(dentry);
2398 switch (h->action) {
2399 case CEPH_MDS_LEASE_REVOKE:
2400 if (di && di->lease_session == session) {
2401 h->seq = cpu_to_le32(di->lease_seq);
2402 __ceph_mdsc_drop_dentry_lease(dentry);
2403 }
2404 release = 1;
2405 break;
2406
2407 case CEPH_MDS_LEASE_RENEW:
2408 if (di && di->lease_session == session &&
2409 di->lease_gen == session->s_cap_gen &&
2410 di->lease_renew_from &&
2411 di->lease_renew_after == 0) {
2412 unsigned long duration =
2413 le32_to_cpu(h->duration_ms) * HZ / 1000;
2414
2415 di->lease_seq = le32_to_cpu(h->seq);
2416 dentry->d_time = di->lease_renew_from + duration;
2417 di->lease_renew_after = di->lease_renew_from +
2418 (duration >> 1);
2419 di->lease_renew_from = 0;
2420 }
2421 break;
2422 }
2423 spin_unlock(&dentry->d_lock);
2424 dput(dentry);
2425
2426 if (!release)
2427 goto out;
2428
2429release:
2430 /* let's just reuse the same message */
2431 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2432 ceph_msg_get(msg);
2433 ceph_con_send(&session->s_con, msg);
2434
2435out:
2436 iput(inode);
2437 mutex_unlock(&session->s_mutex);
2438 return;
2439
2440bad:
2441 pr_err("corrupt lease message\n");
2442 ceph_msg_dump(msg);
2443}
2444
2445void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2446 struct inode *inode,
2447 struct dentry *dentry, char action,
2448 u32 seq)
2449{
2450 struct ceph_msg *msg;
2451 struct ceph_mds_lease *lease;
2452 int len = sizeof(*lease) + sizeof(u32);
2453 int dnamelen = 0;
2454
2455 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2456 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2457 dnamelen = dentry->d_name.len;
2458 len += dnamelen;
2459
2460 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
2461 if (IS_ERR(msg))
2462 return;
2463 lease = msg->front.iov_base;
2464 lease->action = action;
2465 lease->mask = cpu_to_le16(CEPH_LOCK_DN);
2466 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2467 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2468 lease->seq = cpu_to_le32(seq);
2469 put_unaligned_le32(dnamelen, lease + 1);
2470 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2471
2472 /*
2473 * if this is a preemptive lease RELEASE, no need to
2474 * flush request stream, since the actual request will
2475 * soon follow.
2476 */
2477 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2478
2479 ceph_con_send(&session->s_con, msg);
2480}
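/*
 * The message built above is laid out as
 * struct ceph_mds_lease (action, mask, ino, first/last snap, seq)
 * __le32 dname length
 * char[] dname bytes
 * which is the layout handle_lease() decodes on receipt.
 */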
2481
2482/*
2483 * Preemptively release a lease we expect to invalidate anyway.
2484 * Both @inode and @dentry are required (see the BUG_ONs below).
2485 */
2486void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2487 struct dentry *dentry, int mask)
2488{
2489 struct ceph_dentry_info *di;
2490 struct ceph_mds_session *session;
2491 u32 seq;
2492
2493 BUG_ON(inode == NULL);
2494 BUG_ON(dentry == NULL);
2495 BUG_ON(mask != CEPH_LOCK_DN);
2496
2497 /* is dentry lease valid? */
2498 spin_lock(&dentry->d_lock);
2499 di = ceph_dentry(dentry);
2500 if (!di || !di->lease_session ||
2501 di->lease_session->s_mds < 0 ||
2502 di->lease_gen != di->lease_session->s_cap_gen ||
2503 !time_before(jiffies, dentry->d_time)) {
2504 dout("lease_release inode %p dentry %p -- "
2505 "no lease on %d\n",
2506 inode, dentry, mask);
2507 spin_unlock(&dentry->d_lock);
2508 return;
2509 }
2510
2511 /* we do have a lease on this dentry; note mds and seq */
2512 session = ceph_get_mds_session(di->lease_session);
2513 seq = di->lease_seq;
2514 __ceph_mdsc_drop_dentry_lease(dentry);
2515 spin_unlock(&dentry->d_lock);
2516
2517 dout("lease_release inode %p dentry %p mask %d to mds%d\n",
2518 inode, dentry, mask, session->s_mds);
2519 ceph_mdsc_lease_send_msg(session, inode, dentry,
2520 CEPH_MDS_LEASE_RELEASE, seq);
2521 ceph_put_mds_session(session);
2522}
2523
2524/*
2525 * drop all leases (and dentry refs) in preparation for umount
2526 */
2527static void drop_leases(struct ceph_mds_client *mdsc)
2528{
2529 int i;
2530
2531 dout("drop_leases\n");
2532 mutex_lock(&mdsc->mutex);
2533 for (i = 0; i < mdsc->max_sessions; i++) {
2534 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2535 if (!s)
2536 continue;
2537 mutex_unlock(&mdsc->mutex);
2538 mutex_lock(&s->s_mutex);
2539 mutex_unlock(&s->s_mutex);
2540 ceph_put_mds_session(s);
2541 mutex_lock(&mdsc->mutex);
2542 }
2543 mutex_unlock(&mdsc->mutex);
2544}
2545
2546
2547
2548/*
2549 * delayed work -- periodically trim expired leases, renew caps with mds
2550 */
2551static void schedule_delayed(struct ceph_mds_client *mdsc)
2552{
2553 int delay = 5;
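 /* e.g. with HZ == 250 this is 1250 jiffies (~5 seconds);
 * round_jiffies_relative() aligns the expiry to a whole second so
 * periodic wakeups across the system can batch */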
2554 unsigned hz = round_jiffies_relative(HZ * delay);
2555 schedule_delayed_work(&mdsc->delayed_work, hz);
2556}
2557
2558static void delayed_work(struct work_struct *work)
2559{
2560 int i;
2561 struct ceph_mds_client *mdsc =
2562 container_of(work, struct ceph_mds_client, delayed_work.work);
2563 int renew_interval;
2564 int renew_caps;
2565
2566 dout("mdsc delayed_work\n");
2567 ceph_check_delayed_caps(mdsc);
2568
2569 mutex_lock(&mdsc->mutex);
2570 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2571 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2572 mdsc->last_renew_caps);
2573 if (renew_caps)
2574 mdsc->last_renew_caps = jiffies;
2575
2576 for (i = 0; i < mdsc->max_sessions; i++) {
2577 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2578 if (s == NULL)
2579 continue;
2580 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2581 dout("resending session close request for mds%d\n",
2582 s->s_mds);
2583 request_close_session(mdsc, s);
2584 ceph_put_mds_session(s);
2585 continue;
2586 }
2587 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2588 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2589 s->s_state = CEPH_MDS_SESSION_HUNG;
2590 pr_info("mds%d hung\n", s->s_mds);
2591 }
2592 }
2593 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2594 /* this mds is failed or recovering, just wait */
2595 ceph_put_mds_session(s);
2596 continue;
2597 }
2598 mutex_unlock(&mdsc->mutex);
2599
2600 mutex_lock(&s->s_mutex);
2601 if (renew_caps)
2602 send_renew_caps(mdsc, s);
2603 else
2604 ceph_con_keepalive(&s->s_con);
2605 add_cap_releases(mdsc, s, -1);
2606 send_cap_releases(mdsc, s);
2607 mutex_unlock(&s->s_mutex);
2608 ceph_put_mds_session(s);
2609
2610 mutex_lock(&mdsc->mutex);
2611 }
2612 mutex_unlock(&mdsc->mutex);
2613
2614 schedule_delayed(mdsc);
2615}
2616
2617
2618int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2619{
2620 mdsc->client = client;
2621 mutex_init(&mdsc->mutex);
2622 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
 if (mdsc->mdsmap == NULL)
 return -ENOMEM;
2623 init_completion(&mdsc->safe_umount_waiters);
2624 init_completion(&mdsc->session_close_waiters);
2625 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2626 mdsc->sessions = NULL;
2627 mdsc->max_sessions = 0;
2628 mdsc->stopping = 0;
2629 init_rwsem(&mdsc->snap_rwsem);
2630 mdsc->snap_realms = RB_ROOT;
2631 INIT_LIST_HEAD(&mdsc->snap_empty);
2632 spin_lock_init(&mdsc->snap_empty_lock);
2633 mdsc->last_tid = 0;
2634 mdsc->request_tree = RB_ROOT;
2635 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
2636 mdsc->last_renew_caps = jiffies;
2637 INIT_LIST_HEAD(&mdsc->cap_delay_list);
2638 spin_lock_init(&mdsc->cap_delay_lock);
2639 INIT_LIST_HEAD(&mdsc->snap_flush_list);
2640 spin_lock_init(&mdsc->snap_flush_lock);
2641 mdsc->cap_flush_seq = 0;
2642 INIT_LIST_HEAD(&mdsc->cap_dirty);
2643 mdsc->num_cap_flushing = 0;
2644 spin_lock_init(&mdsc->cap_dirty_lock);
2645 init_waitqueue_head(&mdsc->cap_flushing_wq);
2646 spin_lock_init(&mdsc->dentry_lru_lock);
2647 INIT_LIST_HEAD(&mdsc->dentry_lru);
2648 return 0;
2649}
2650
2651/*
2652 * Wait for safe replies on open mds requests. If we time out, drop
2653 * all requests from the tree to avoid dangling dentry refs.
2654 */
2655static void wait_requests(struct ceph_mds_client *mdsc)
2656{
2657 struct ceph_mds_request *req;
2658 struct ceph_client *client = mdsc->client;
2659
2660 mutex_lock(&mdsc->mutex);
2661 if (__get_oldest_req(mdsc)) {
2662 mutex_unlock(&mdsc->mutex);
2663
2664 dout("wait_requests waiting for requests\n");
2665 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2666 client->mount_args->mount_timeout * HZ);
2667
2668 /* tear down remaining requests */
2669 mutex_lock(&mdsc->mutex);
2670 while ((req = __get_oldest_req(mdsc))) {
2671 dout("wait_requests timed out on tid %llu\n",
2672 req->r_tid);
2673 __unregister_request(mdsc, req);
2674 }
2675 }
2676 mutex_unlock(&mdsc->mutex);
2677 dout("wait_requests done\n");
2678}
2679
2680/*
2681 * called before mount is ro, and before dentries are torn down.
2682 * (hmm, does this still race with new lookups?)
2683 */
2684void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2685{
2686 dout("pre_umount\n");
2687 mdsc->stopping = 1;
2688
2689 drop_leases(mdsc);
2690 ceph_flush_dirty_caps(mdsc);
2691 wait_requests(mdsc);
2692}
2693
2694/*
2695 * wait for all write mds requests to flush.
2696 */
2697static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2698{
2699 struct ceph_mds_request *req = NULL, *nextreq;
2700 struct rb_node *n;
2701
2702 mutex_lock(&mdsc->mutex);
2703 dout("wait_unsafe_requests want %lld\n", want_tid);
2704restart:
2705 req = __get_oldest_req(mdsc);
2706 while (req && req->r_tid <= want_tid) {
2707 /* find next request */
2708 n = rb_next(&req->r_node);
2709 if (n)
2710 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
2711 else
2712 nextreq = NULL;
2713 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2714 /* write op */
2715 ceph_mdsc_get_request(req);
2716 if (nextreq)
2717 ceph_mdsc_get_request(nextreq);
2718 mutex_unlock(&mdsc->mutex);
2719 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2720 req->r_tid, want_tid);
2721 wait_for_completion(&req->r_safe_completion);
2722 mutex_lock(&mdsc->mutex);
2723 ceph_mdsc_put_request(req);
2724 if (!nextreq)
2725 break; /* next dne before, so we're done! */
2726 if (RB_EMPTY_NODE(&nextreq->r_node)) {
2727 /* next request was removed from tree */
2728 ceph_mdsc_put_request(nextreq);
2729 goto restart;
2730 }
2731 ceph_mdsc_put_request(nextreq); /* won't go away */
2732 }
2733 req = nextreq;
2734 }
2735 mutex_unlock(&mdsc->mutex);
2736 dout("wait_unsafe_requests done\n");
2737}
2738
2739void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2740{
2741 u64 want_tid, want_flush;
2742
2743 dout("sync\n");
2744 mutex_lock(&mdsc->mutex);
2745 want_tid = mdsc->last_tid;
2746 want_flush = mdsc->cap_flush_seq;
2747 mutex_unlock(&mdsc->mutex);
2748 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
2749
2750 ceph_flush_dirty_caps(mdsc);
2751
2752 wait_unsafe_requests(mdsc, want_tid);
2753 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2754}
2755
2756
2757/*
2758 * called after sb is ro.
2759 */
2760void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2761{
2762 struct ceph_mds_session *session;
2763 int i;
2764 int n;
2765 struct ceph_client *client = mdsc->client;
2766 unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
2767
2768 dout("close_sessions\n");
2769
2770 mutex_lock(&mdsc->mutex);
2771
2772 /* close sessions */
2773 started = jiffies;
2774 while (time_before(jiffies, started + timeout)) {
2775 dout("closing sessions\n");
2776 n = 0;
2777 for (i = 0; i < mdsc->max_sessions; i++) {
2778 session = __ceph_lookup_mds_session(mdsc, i);
2779 if (!session)
2780 continue;
2781 mutex_unlock(&mdsc->mutex);
2782 mutex_lock(&session->s_mutex);
2783 __close_session(mdsc, session);
2784 mutex_unlock(&session->s_mutex);
2785 ceph_put_mds_session(session);
2786 mutex_lock(&mdsc->mutex);
2787 n++;
2788 }
2789 if (n == 0)
2790 break;
2791
2792 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2793 break;
2794
2795 dout("waiting for sessions to close\n");
2796 mutex_unlock(&mdsc->mutex);
2797 wait_for_completion_timeout(&mdsc->session_close_waiters,
2798 timeout);
2799 mutex_lock(&mdsc->mutex);
2800 }
2801
2802 /* tear down remaining sessions */
2803 for (i = 0; i < mdsc->max_sessions; i++) {
2804 if (mdsc->sessions[i]) {
2805 session = get_session(mdsc->sessions[i]);
2806 __unregister_session(mdsc, session);
2807 mutex_unlock(&mdsc->mutex);
2808 mutex_lock(&session->s_mutex);
2809 remove_session_caps(session);
2810 mutex_unlock(&session->s_mutex);
2811 ceph_put_mds_session(session);
2812 mutex_lock(&mdsc->mutex);
2813 }
2814 }
2815
2816 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2817
2818 mutex_unlock(&mdsc->mutex);
2819
2820 ceph_cleanup_empty_realms(mdsc);
2821
2822 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2823
2824 dout("stopped\n");
2825}
2826
2827void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2828{
2829 dout("stop\n");
2830 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2831 if (mdsc->mdsmap)
2832 ceph_mdsmap_destroy(mdsc->mdsmap);
2833 kfree(mdsc->sessions);
2834}
2835
2836
2837/*
2838 * handle mds map update.
2839 */
2840void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2841{
2842 u32 epoch;
2843 u32 maplen;
2844 void *p = msg->front.iov_base;
2845 void *end = p + msg->front.iov_len;
2846 struct ceph_mdsmap *newmap, *oldmap;
2847 struct ceph_fsid fsid;
2848 int err = -EINVAL;
2849
2850 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
2851 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2852 if (ceph_check_fsid(mdsc->client, &fsid) < 0)
2853 return;
2854 epoch = ceph_decode_32(&p);
2855 maplen = ceph_decode_32(&p);
2856 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
2857
2858 /* do we need it? */
2859 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
2860 mutex_lock(&mdsc->mutex);
2861 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
2862 dout("handle_map epoch %u <= our %u\n",
2863 epoch, mdsc->mdsmap->m_epoch);
2864 mutex_unlock(&mdsc->mutex);
2865 return;
2866 }
2867
2868 newmap = ceph_mdsmap_decode(&p, end);
2869 if (IS_ERR(newmap)) {
2870 err = PTR_ERR(newmap);
2871 goto bad_unlock;
2872 }
2873
2874 /* swap into place */
2875 if (mdsc->mdsmap) {
2876 oldmap = mdsc->mdsmap;
2877 mdsc->mdsmap = newmap;
2878 check_new_map(mdsc, newmap, oldmap);
2879 ceph_mdsmap_destroy(oldmap);
2880 } else {
2881 mdsc->mdsmap = newmap; /* first mds map */
2882 }
2883 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
2884
2885 __wake_requests(mdsc, &mdsc->waiting_for_map);
2886
2887 mutex_unlock(&mdsc->mutex);
2888 schedule_delayed(mdsc);
2889 return;
2890
2891bad_unlock:
2892 mutex_unlock(&mdsc->mutex);
2893bad:
2894 pr_err("error decoding mdsmap %d\n", err);
2895 return;
2896}
2897
2898static struct ceph_connection *con_get(struct ceph_connection *con)
2899{
2900 struct ceph_mds_session *s = con->private;
2901
2902 if (get_session(s)) {
2903 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
2904 return con;
2905 }
2906 dout("mdsc con_get %p FAIL\n", s);
2907 return NULL;
2908}
2909
2910static void con_put(struct ceph_connection *con)
2911{
2912 struct ceph_mds_session *s = con->private;
2913
2914 ceph_put_mds_session(s);
2915 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
2916}
2917
2918/*
2919 * if the client is unresponsive for long enough, the mds will kill
2920 * the session entirely.
2921 */
2922static void peer_reset(struct ceph_connection *con)
2923{
2924 struct ceph_mds_session *s = con->private;
2925
2926 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
2927 s->s_mds);
2928}
2929
2930static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2931{
2932 struct ceph_mds_session *s = con->private;
2933 struct ceph_mds_client *mdsc = s->s_mdsc;
2934 int type = le16_to_cpu(msg->hdr.type);
2935
2936 mutex_lock(&mdsc->mutex);
2937 if (__verify_registered_session(mdsc, s) < 0) {
2938 mutex_unlock(&mdsc->mutex);
2939 goto out;
2940 }
2941 mutex_unlock(&mdsc->mutex);
2942
2943 switch (type) {
2944 case CEPH_MSG_MDS_MAP:
2945 ceph_mdsc_handle_map(mdsc, msg);
2946 break;
2947 case CEPH_MSG_CLIENT_SESSION:
2948 handle_session(s, msg);
2949 break;
2950 case CEPH_MSG_CLIENT_REPLY:
2951 handle_reply(s, msg);
2952 break;
2953 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2954 handle_forward(mdsc, s, msg);
2955 break;
2956 case CEPH_MSG_CLIENT_CAPS:
2957 ceph_handle_caps(s, msg);
2958 break;
2959 case CEPH_MSG_CLIENT_SNAP:
2960 ceph_handle_snap(mdsc, s, msg);
2961 break;
2962 case CEPH_MSG_CLIENT_LEASE:
2963 handle_lease(mdsc, s, msg);
2964 break;
2965
2966 default:
2967 pr_err("received unknown message type %d %s\n", type,
2968 ceph_msg_type_name(type));
2969 }
2970out:
2971 ceph_msg_put(msg);
2972}
2973
2974/*
2975 * authentication
2976 */
2977static int get_authorizer(struct ceph_connection *con,
2978 void **buf, int *len, int *proto,
2979 void **reply_buf, int *reply_len, int force_new)
2980{
2981 struct ceph_mds_session *s = con->private;
2982 struct ceph_mds_client *mdsc = s->s_mdsc;
2983 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2984 int ret = 0;
2985
2986 if (force_new && s->s_authorizer) {
2987 ac->ops->destroy_authorizer(ac, s->s_authorizer);
2988 s->s_authorizer = NULL;
2989 }
2990 if (s->s_authorizer == NULL) {
2991 if (ac->ops->create_authorizer) {
2992 ret = ac->ops->create_authorizer(
2993 ac, CEPH_ENTITY_TYPE_MDS,
2994 &s->s_authorizer,
2995 &s->s_authorizer_buf,
2996 &s->s_authorizer_buf_len,
2997 &s->s_authorizer_reply_buf,
2998 &s->s_authorizer_reply_buf_len);
2999 if (ret)
3000 return ret;
3001 }
3002 }
3003
3004 *proto = ac->protocol;
3005 *buf = s->s_authorizer_buf;
3006 *len = s->s_authorizer_buf_len;
3007 *reply_buf = s->s_authorizer_reply_buf;
3008 *reply_len = s->s_authorizer_reply_buf_len;
3009 return 0;
3010}
3011
3012
3013static int verify_authorizer_reply(struct ceph_connection *con, int len)
3014{
3015 struct ceph_mds_session *s = con->private;
3016 struct ceph_mds_client *mdsc = s->s_mdsc;
3017 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3018
3019 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3020}
3021
3022static int invalidate_authorizer(struct ceph_connection *con)
3023{
3024 struct ceph_mds_session *s = con->private;
3025 struct ceph_mds_client *mdsc = s->s_mdsc;
3026 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3027
3028 if (ac->ops->invalidate_authorizer)
3029 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3030
3031 return ceph_monc_validate_auth(&mdsc->client->monc);
3032}
3033
3034static const struct ceph_connection_operations mds_con_ops = {
3035 .get = con_get,
3036 .put = con_put,
3037 .dispatch = dispatch,
3038 .get_authorizer = get_authorizer,
3039 .verify_authorizer_reply = verify_authorizer_reply,
3040 .invalidate_authorizer = invalidate_authorizer,
3041 .peer_reset = peer_reset,
3042};
3043
3044
3045
3046
3047/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..961cc6f65878
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/rbtree.h>
9#include <linux/spinlock.h>
10
11#include "types.h"
12#include "messenger.h"
13#include "mdsmap.h"
14
15/*
16 * Some lock dependencies:
17 *
18 * session->s_mutex
19 * mdsc->mutex
20 *
21 * mdsc->snap_rwsem
22 *
23 * inode->i_lock
24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock
26 *
27 */
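/*
 * i.e. when both are needed, session->s_mutex is taken before
 * mdsc->mutex, never the reverse:
 *
 * mutex_lock(&session->s_mutex);
 * mutex_lock(&mdsc->mutex);
 * ...
 * mutex_unlock(&mdsc->mutex);
 * mutex_unlock(&session->s_mutex);
 *
 * which is why mds_client.c drops mdsc->mutex before taking a
 * session's s_mutex (see check_new_map() and delayed_work()).
 */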
28
29struct ceph_client;
30struct ceph_cap;
31
32/*
33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload.
35 */
36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in;
38 u32 symlink_len;
39 char *symlink;
40 u32 xattr_len;
41 char *xattr_data;
42};
43
44/*
45 * parsed info about an mds reply, including information about the
46 * target inode and/or its parent directory and dentry, and directory
47 * contents (for readdir results).
48 */
49struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head;
51
52 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname;
55 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease;
57
58 struct ceph_mds_reply_dirfrag *dir_dir;
59 int dir_nr;
60 char **dir_dname;
61 u32 *dir_dname_len;
62 struct ceph_mds_reply_lease **dir_dlease;
63 struct ceph_mds_reply_info_in *dir_in;
64 u8 dir_complete, dir_end;
65
66 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */
68 void *snapblob;
69 int snapblob_len;
70};
71
72
73/*
74 * cap releases are batched and sent to the MDS en masse.
75 */
76#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
77 sizeof(struct ceph_mds_cap_release)) / \
78 sizeof(struct ceph_mds_cap_item))
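/*
 * Worked example (sizes are illustrative, not normative): with a
 * 4096-byte PAGE_CACHE_SIZE, a 4-byte ceph_mds_cap_release header and
 * 24-byte ceph_mds_cap_item entries, this comes to
 * (4096 - 4) / 24 = 170 cap releases per message page.
 */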
79
80
81/*
82 * state associated with each MDS<->client session
83 */
84enum {
85 CEPH_MDS_SESSION_NEW = 1,
86 CEPH_MDS_SESSION_OPENING = 2,
87 CEPH_MDS_SESSION_OPEN = 3,
88 CEPH_MDS_SESSION_HUNG = 4,
89 CEPH_MDS_SESSION_CLOSING = 5,
90 CEPH_MDS_SESSION_RESTARTING = 6,
91 CEPH_MDS_SESSION_RECONNECTING = 7,
92};
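/*
 * Transitions, as driven by mds_client.c: a session sits in OPENING
 * until the OPEN reply arrives; OPEN flips to HUNG when s_ttl expires
 * (delayed_work) and back again on the next message; an mds address
 * change moves OPEN/HUNG to RESTARTING (check_new_map), after which
 * send_mds_reconnect() goes RECONNECTING -> OPEN; CLOSING is held
 * while a session close request is in flight.
 */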
93
94struct ceph_mds_session {
95 struct ceph_mds_client *s_mdsc;
96 int s_mds;
97 int s_state;
98 unsigned long s_ttl; /* time until mds kills us */
99 u64 s_seq; /* incoming msg seq # */
100 struct mutex s_mutex; /* serialize session messages */
101
102 struct ceph_connection s_con;
103
104 struct ceph_authorizer *s_authorizer;
105 void *s_authorizer_buf, *s_authorizer_reply_buf;
106 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
107
108 /* protected by s_cap_lock */
109 spinlock_t s_cap_lock;
110 u32 s_cap_gen; /* inc each time we get mds stale msg */
111 unsigned long s_cap_ttl; /* when session caps expire */
112 struct list_head s_caps; /* all caps issued by this session */
113 int s_nr_caps, s_trim_caps;
114 int s_num_cap_releases;
115 struct list_head s_cap_releases; /* waiting cap_release messages */
116 struct list_head s_cap_releases_done; /* ready to send */
117 struct ceph_cap *s_cap_iterator;
118
119 /* protected by mutex */
120 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
121 struct list_head s_cap_snaps_flushing;
122 unsigned long s_renew_requested; /* last time we sent a renew req */
123 u64 s_renew_seq;
124
125 atomic_t s_ref;
126 struct list_head s_waiting; /* waiting requests */
127 struct list_head s_unsafe; /* unsafe requests */
128};
129
130/*
131 * modes of choosing which MDS to send a request to
132 */
133enum {
134 USE_ANY_MDS,
135 USE_RANDOM_MDS,
136 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
137};
138
139struct ceph_mds_request;
140struct ceph_mds_client;
141
142/*
143 * request completion callback
144 */
145typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
146 struct ceph_mds_request *req);
147
148/*
149 * an in-flight mds request
150 */
151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */
153 struct rb_node r_node;
154
155 int r_op; /* mds op code */
156 int r_mds;
157
158 /* operation on what? */
159 struct inode *r_inode; /* arg1 */
160 struct dentry *r_dentry; /* arg1 */
161 struct dentry *r_old_dentry; /* arg2: rename from or link from */
162 char *r_path1, *r_path2;
163 struct ceph_vino r_ino1, r_ino2;
164
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */
167
168 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */
170
171 /* for choosing which mds to send this request to */
172 int r_direct_mode;
173 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
174 bool r_direct_is_hash; /* true if r_direct_hash is valid */
175
176 /* data payload is used for xattr ops */
177 struct page **r_pages;
178 int r_num_pages;
179 int r_data_len;
180
181 /* what caps shall we drop? */
182 int r_inode_drop, r_inode_unless;
183 int r_dentry_drop, r_dentry_unless;
184 int r_old_dentry_drop, r_old_dentry_unless;
185 struct inode *r_old_inode;
186 int r_old_inode_drop, r_old_inode_unless;
187
188 struct ceph_msg *r_request; /* original request */
189 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err;
192 bool r_aborted;
193
194 unsigned long r_timeout; /* optional. jiffies */
195 unsigned long r_started; /* start time to measure timeout against */
196 unsigned long r_request_started; /* start time for mds request only,
197 used to measure lease durations */
198
199 /* link unsafe requests to parent directory, for fsync */
200 struct inode *r_unsafe_dir;
201 struct list_head r_unsafe_dir_item;
202
203 struct ceph_mds_session *r_session;
204
205 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208 int r_resend_mds; /* mds to resend to next, if any */
209
210 struct kref r_kref;
211 struct list_head r_wait;
212 struct completion r_completion;
213 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe;
217
218 bool r_did_prepopulate;
219 u32 r_readdir_offset;
220
221 struct ceph_cap_reservation r_caps_reservation;
222 int r_num_caps;
223};
224
225/*
226 * mds client state
227 */
228struct ceph_mds_client {
229 struct ceph_client *client;
230 struct mutex mutex; /* all nested structures */
231
232 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters;
234 struct list_head waiting_for_map;
235
236 struct ceph_mds_session **sessions; /* NULL if no session for mds i */
237 int max_sessions; /* len of sessions array */
238 int stopping; /* true if shutting down */
239
240 /*
241 * snap_rwsem will cover cap linkage into snaprealms, and
242 * realm snap contexts. (later, we can do per-realm snap
243 * context locks.) The empty list contains realms with no
244 * references (implying they contain no inodes with caps) that
245 * should be destroyed.
246 */
247 struct rw_semaphore snap_rwsem;
248 struct rb_root snap_realms;
249 struct list_head snap_empty;
250 spinlock_t snap_empty_lock; /* protect snap_empty */
251
252 u64 last_tid; /* most recent mds request */
253 struct rb_root request_tree; /* pending mds requests */
254 struct delayed_work delayed_work; /* delayed work */
255 unsigned long last_renew_caps; /* last time we renewed our caps */
256 struct list_head cap_delay_list; /* caps with delayed release */
257 spinlock_t cap_delay_lock; /* protects cap_delay_list */
258 struct list_head snap_flush_list; /* cap_snaps ready to flush */
259 spinlock_t snap_flush_lock;
260
261 u64 cap_flush_seq;
262 struct list_head cap_dirty; /* inodes with dirty caps */
263 int num_cap_flushing; /* # caps we are flushing */
264 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq;
266
267#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file;
269#endif
270
271 spinlock_t dentry_lru_lock;
272 struct list_head dentry_lru;
273 int num_dentry;
274};
275
276extern const char *ceph_mds_op_name(int op);
277
278extern struct ceph_mds_session *
279__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
280
281static inline struct ceph_mds_session *
282ceph_get_mds_session(struct ceph_mds_session *s)
283{
284 atomic_inc(&s->s_ref);
285 return s;
286}
287
288extern void ceph_put_mds_session(struct ceph_mds_session *s);
289
290extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
291 struct ceph_msg *msg, int mds);
292
293extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
294 struct ceph_client *client);
295extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
296extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
297
298extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
299
300extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode,
302 struct dentry *dn, int mask);
303
304extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
307 struct ceph_mds_request *req);
308extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
309 struct inode *dir,
310 struct ceph_mds_request *req);
311static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
312{
313 kref_get(&req->r_kref);
314}
315extern void ceph_mdsc_release_request(struct kref *kref);
316static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
317{
318 kref_put(&req->r_kref, ceph_mdsc_release_request);
319}
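/*
 * Sketch of the refcount pairing: code that keeps a request pointer
 * across a lock drop takes a reference first, e.g.
 *
 * ceph_mdsc_get_request(req);
 * mutex_unlock(&mdsc->mutex);
 * ...
 * mutex_lock(&mdsc->mutex);
 * ceph_mdsc_put_request(req);
 *
 * as wait_unsafe_requests() in mds_client.c does while waiting on
 * r_safe_completion.
 */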
320
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
324 int stop_on_nosnap);
325
326extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
327extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
328 struct inode *inode,
329 struct dentry *dentry, char action,
330 u32 seq);
331
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg);
334
335#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
17 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
23 u8 r; /* unsigned, so r % n below can't go negative */
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
32 /* pick */
33 get_random_bytes(&r, 1);
34 n = r % n;
35 i = 0;
36 for (i = 0; n > 0; i++, n--)
37 while (m->m_info[i].state <= 0)
38 i++;
39
40 return i;
41}
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 const void *start = *p;
53 int i, j, n;
54 int err = -EINVAL;
55 u16 version;
56
57 m = kzalloc(sizeof(*m), GFP_NOFS);
58 if (m == NULL)
59 return ERR_PTR(-ENOMEM);
60
61 ceph_decode_16_safe(p, end, version, bad);
62
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p);
65 m->m_client_epoch = ceph_decode_32(p);
66 m->m_last_failure = ceph_decode_32(p);
67 m->m_root = ceph_decode_32(p);
68 m->m_session_timeout = ceph_decode_32(p);
69 m->m_session_autoclose = ceph_decode_32(p);
70 m->m_max_file_size = ceph_decode_64(p);
71 m->m_max_mds = ceph_decode_32(p);
72
73 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
74 if (m->m_info == NULL)
75 goto badmem;
76
77 /* pick out active nodes from mds_info (state > 0) */
78 n = ceph_decode_32(p);
79 for (i = 0; i < n; i++) {
80 u64 global_id;
81 u32 namelen;
82 s32 mds, inc, state;
83 u64 state_seq;
84 u8 infoversion;
85 struct ceph_entity_addr addr;
86 u32 num_export_targets;
87 void *pexport_targets = NULL;
88
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p);
91 infoversion = ceph_decode_8(p);
92 *p += sizeof(u64);
93 namelen = ceph_decode_32(p); /* skip mds name */
94 *p += namelen;
95
96 ceph_decode_need(p, end,
97 4*sizeof(u32) + sizeof(u64) +
98 sizeof(addr) + sizeof(struct ceph_timespec),
99 bad);
100 mds = ceph_decode_32(p);
101 inc = ceph_decode_32(p);
102 state = ceph_decode_32(p);
103 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec);
107 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen;
110 if (infoversion >= 2) {
111 ceph_decode_32_safe(p, end, num_export_targets, bad);
 ceph_decode_need(p, end, num_export_targets * sizeof(u32), bad);
112 pexport_targets = *p;
113 *p += num_export_targets * sizeof(u32);
114 } else {
115 num_export_targets = 0;
116 }
117
118 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
119 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
120 ceph_mds_state_name(state));
121 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
122 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr;
125 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) {
127 m->m_info[mds].export_targets =
128 kcalloc(num_export_targets, sizeof(u32),
129 GFP_NOFS);
130 for (j = 0; j < num_export_targets; j++)
131 m->m_info[mds].export_targets[j] =
132 ceph_decode_32(&pexport_targets);
133 } else {
134 m->m_info[mds].export_targets = NULL;
135 }
136 }
137 }
138
139 /* pg_pools */
140 ceph_decode_32_safe(p, end, n, bad);
141 m->m_num_data_pg_pools = n;
142 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
143 if (!m->m_data_pg_pools)
144 goto badmem;
145 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
146 for (i = 0; i < n; i++)
147 m->m_data_pg_pools[i] = ceph_decode_32(p);
148 m->m_cas_pg_pool = ceph_decode_32(p);
149
150 /* ok, we don't care about the rest. */
151 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
152 return m;
153
154badmem:
155 err = -ENOMEM;
156bad:
157 pr_err("corrupt mdsmap\n");
158 print_hex_dump(KERN_DEBUG, "mdsmap: ",
159 DUMP_PREFIX_OFFSET, 16, 1,
160 start, end - start, true);
161 ceph_mdsmap_destroy(m);
162 return ERR_PTR(err);
163}
164
165void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
166{
167 int i;
168
169 for (i = 0; i < m->m_max_mds; i++)
170 kfree(m->m_info[i].export_targets);
171 kfree(m->m_info);
172 kfree(m->m_data_pg_pools);
173 kfree(m);
174}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 u32 *export_targets;
17};
18
19struct ceph_mdsmap {
20 u32 m_epoch, m_client_epoch, m_last_failure;
21 u32 m_root;
22 u32 m_session_timeout; /* seconds */
23 u32 m_session_autoclose; /* seconds */
24 u64 m_max_file_size;
25 u32 m_max_mds; /* size of m_info array */
26 struct ceph_mds_info *m_info;
27
28 /* which object pools file data can be stored in */
29 int m_num_data_pg_pools;
30 u32 *m_data_pg_pools;
31 u32 m_cas_pg_pool;
32};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
42static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
43{
44 BUG_ON(w < 0);
45 if (w >= m->m_max_mds)
46 return CEPH_MDS_STATE_DNE;
47 return m->m_info[w].state;
48}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..cd4fadb6491a
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2284 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
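/*
 * Sketch of the backoff mentioned above (the constants here are
 * assumptions, not normative; ceph_fault(), declared below, implements
 * the real policy): each successive fault roughly doubles the
 * reconnect delay, up to a cap.
 */
static inline void example_backoff(unsigned long *delay)
{
 if (*delay == 0)
 *delay = HZ / 2; /* first retry after ~0.5s */
 else if (*delay < 5 * 60 * HZ) /* cap at ~5 minutes */
 *delay *= 2;
}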
27
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
37
38static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con);
41
42const char *ceph_name_type_str(int t)
43{
44 switch (t) {
45 case CEPH_ENTITY_TYPE_MON: return "mon";
46 case CEPH_ENTITY_TYPE_MDS: return "mds";
47 case CEPH_ENTITY_TYPE_OSD: return "osd";
48 case CEPH_ENTITY_TYPE_CLIENT: return "client";
49 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
50 default: return "???";
51 }
52}
53
54/*
55 * nicely render a sockaddr as a string.
56 */
57#define MAX_ADDR_STR 20
58static char addr_str[MAX_ADDR_STR][64]; /* room for full IPv6 + port */
59static DEFINE_SPINLOCK(addr_str_lock);
60static int last_addr_str;
61
62const char *pr_addr(const struct sockaddr_storage *ss)
63{
64 int i;
65 char *s;
66 struct sockaddr_in *in4 = (void *)ss;
67 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
68 struct sockaddr_in6 *in6 = (void *)ss;
69
70 spin_lock(&addr_str_lock);
71 i = last_addr_str++;
72 if (last_addr_str == MAX_ADDR_STR)
73 last_addr_str = 0;
74 spin_unlock(&addr_str_lock);
75 s = addr_str[i];
76
77 switch (ss->ss_family) {
78 case AF_INET:
79 sprintf(s, "%u.%u.%u.%u:%u",
80 (unsigned int)quad[0],
81 (unsigned int)quad[1],
82 (unsigned int)quad[2],
83 (unsigned int)quad[3],
84 (unsigned int)ntohs(in4->sin_port));
85 break;
86
87 case AF_INET6:
88 /* %pI6 handles the byte order for us */
89 sprintf(s, "%pI6:%u", &in6->sin6_addr,
90 (unsigned int)ntohs(in6->sin6_port));
98 break;
99
100 default:
101 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
102 }
103
104 return s;
105}
106
107static void encode_my_addr(struct ceph_messenger *msgr)
108{
109 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
110 ceph_encode_addr(&msgr->my_enc_addr);
111}
112
113/*
114 * work queue for all reading and writing to/from the socket.
115 */
116struct workqueue_struct *ceph_msgr_wq;
117
118int __init ceph_msgr_init(void)
119{
120 ceph_msgr_wq = create_workqueue("ceph-msgr");
121 if (!ceph_msgr_wq) {
122 /* create_workqueue() returns NULL on failure, not an ERR_PTR */
123 pr_err("msgr_init failed to create workqueue\n");
124 return -ENOMEM;
125 }
127 return 0;
128}
129
130void ceph_msgr_exit(void)
131{
132 destroy_workqueue(ceph_msgr_wq);
133}
134
135/*
136 * socket callback functions
137 */
138
139/* data available on socket, or listen socket received a connect */
140static void ceph_data_ready(struct sock *sk, int count_unused)
141{
142 struct ceph_connection *con =
143 (struct ceph_connection *)sk->sk_user_data;
144 if (sk->sk_state != TCP_CLOSE_WAIT) {
145 dout("ceph_data_ready on %p state = %lu, queueing work\n",
146 con, con->state);
147 queue_con(con);
148 }
149}
150
151/* socket has buffer space for writing */
152static void ceph_write_space(struct sock *sk)
153{
154 struct ceph_connection *con =
155 (struct ceph_connection *)sk->sk_user_data;
156
157 /* only queue to workqueue if there is data we want to write. */
158 if (test_bit(WRITE_PENDING, &con->state)) {
159 dout("ceph_write_space %p queueing write work\n", con);
160 queue_con(con);
161 } else {
162 dout("ceph_write_space %p nothing to write\n", con);
163 }
164
165 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
166 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
167}
168
169/* socket's state has changed */
170static void ceph_state_change(struct sock *sk)
171{
172 struct ceph_connection *con =
173 (struct ceph_connection *)sk->sk_user_data;
174
175 dout("ceph_state_change %p state = %lu sk_state = %u\n",
176 con, con->state, sk->sk_state);
177
178 if (test_bit(CLOSED, &con->state))
179 return;
180
181 switch (sk->sk_state) {
182 case TCP_CLOSE:
183 dout("ceph_state_change TCP_CLOSE\n");
184 case TCP_CLOSE_WAIT:
185 dout("ceph_state_change TCP_CLOSE_WAIT\n");
186 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
187 if (test_bit(CONNECTING, &con->state))
188 con->error_msg = "connection failed";
189 else
190 con->error_msg = "socket closed";
191 queue_con(con);
192 }
193 break;
194 case TCP_ESTABLISHED:
195 dout("ceph_state_change TCP_ESTABLISHED\n");
196 queue_con(con);
197 break;
198 }
199}
200
201/*
202 * set up socket callbacks
203 */
204static void set_sock_callbacks(struct socket *sock,
205 struct ceph_connection *con)
206{
207 struct sock *sk = sock->sk;
208 sk->sk_user_data = (void *)con;
209 sk->sk_data_ready = ceph_data_ready;
210 sk->sk_write_space = ceph_write_space;
211 sk->sk_state_change = ceph_state_change;
212}
213
214
215/*
216 * socket helpers
217 */
218
219/*
220 * initiate connection to a remote socket.
221 */
222static struct socket *ceph_tcp_connect(struct ceph_connection *con)
223{
224 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
225 struct socket *sock;
226 int ret;
227
228 BUG_ON(con->sock);
229 ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
230 if (ret)
231 return ERR_PTR(ret);
232 con->sock = sock;
233 sock->sk->sk_allocation = GFP_NOFS;
234
235#ifdef CONFIG_LOCKDEP
236 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
237#endif
238
239 set_sock_callbacks(sock, con);
240
241 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
242
243 ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
244 if (ret == -EINPROGRESS) {
245 dout("connect %s EINPROGRESS sk_state = %u\n",
246 pr_addr(&con->peer_addr.in_addr),
247 sock->sk->sk_state);
248 ret = 0;
249 }
250 if (ret < 0) {
251 pr_err("connect %s error %d\n",
252 pr_addr(&con->peer_addr.in_addr), ret);
253 sock_release(sock);
254 con->sock = NULL;
255 con->error_msg = "connect error";
256 }
257
258 if (ret < 0)
259 return ERR_PTR(ret);
260 return sock;
261}
262
263static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
264{
265 struct kvec iov = {buf, len};
266 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
267
268 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
269}
270
271/*
272 * write something. @more is true if caller will be sending more data
273 * shortly.
274 */
275static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
276 size_t kvlen, size_t len, int more)
277{
278 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
279
280 if (more)
281 msg.msg_flags |= MSG_MORE;
282 else
283 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
284
285 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
286}
287
288
289/*
290 * Shutdown/close the socket for the given connection.
291 */
292static int con_close_socket(struct ceph_connection *con)
293{
294 int rc;
295
296 dout("con_close_socket on %p sock %p\n", con, con->sock);
297 if (!con->sock)
298 return 0;
299 set_bit(SOCK_CLOSED, &con->state);
300 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
301 sock_release(con->sock);
302 con->sock = NULL;
303 clear_bit(SOCK_CLOSED, &con->state);
304 return rc;
305}
306
307/*
308 * Reset a connection. Discard all incoming and outgoing messages
309 * and clear *_seq state.
310 */
311static void ceph_msg_remove(struct ceph_msg *msg)
312{
313 list_del_init(&msg->list_head);
314 ceph_msg_put(msg);
315}
316static void ceph_msg_remove_list(struct list_head *head)
317{
318 while (!list_empty(head)) {
319 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
320 list_head);
321 ceph_msg_remove(msg);
322 }
323}
324
325static void reset_connection(struct ceph_connection *con)
326{
327 /* discard the out_queue and out_sent lists; reset *_seq state */
329 ceph_msg_remove_list(&con->out_queue);
330 ceph_msg_remove_list(&con->out_sent);
331
332 if (con->in_msg) {
333 ceph_msg_put(con->in_msg);
334 con->in_msg = NULL;
335 }
336
337 con->connect_seq = 0;
338 con->out_seq = 0;
339 if (con->out_msg) {
340 ceph_msg_put(con->out_msg);
341 con->out_msg = NULL;
342 }
343 con->in_seq = 0;
344 con->in_seq_acked = 0;
345}
346
347/*
348 * mark a peer down. drop any open connections.
349 */
350void ceph_con_close(struct ceph_connection *con)
351{
352 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
353 set_bit(CLOSED, &con->state); /* in case there's queued work */
354 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
355 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
356 clear_bit(KEEPALIVE_PENDING, &con->state);
357 clear_bit(WRITE_PENDING, &con->state);
358 mutex_lock(&con->mutex);
359 reset_connection(con);
360 cancel_delayed_work(&con->work);
361 mutex_unlock(&con->mutex);
362 queue_con(con);
363}
364
365/*
366 * Reopen a closed connection, with a new peer address.
367 */
368void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
369{
370 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
371 set_bit(OPENING, &con->state);
372 clear_bit(CLOSED, &con->state);
373 memcpy(&con->peer_addr, addr, sizeof(*addr));
374 con->delay = 0; /* reset backoff memory */
375 queue_con(con);
376}
377
378/*
379 * return true if this connection ever successfully opened
380 */
381bool ceph_con_opened(struct ceph_connection *con)
382{
383 return con->connect_seq > 0;
384}
385
386/*
387 * generic get/put
388 */
389struct ceph_connection *ceph_con_get(struct ceph_connection *con)
390{
391 dout("con_get %p nref = %d -> %d\n", con,
392 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
393 if (atomic_inc_not_zero(&con->nref))
394 return con;
395 return NULL;
396}
397
398void ceph_con_put(struct ceph_connection *con)
399{
400 dout("con_put %p nref = %d -> %d\n", con,
401 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
402 BUG_ON(atomic_read(&con->nref) == 0);
403 if (atomic_dec_and_test(&con->nref)) {
404 BUG_ON(con->sock);
405 kfree(con);
406 }
407}
408
409/*
410 * initialize a new connection.
411 */
412void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
413{
414 dout("con_init %p\n", con);
415 memset(con, 0, sizeof(*con));
416 atomic_set(&con->nref, 1);
417 con->msgr = msgr;
418 mutex_init(&con->mutex);
419 INIT_LIST_HEAD(&con->out_queue);
420 INIT_LIST_HEAD(&con->out_sent);
421 INIT_DELAYED_WORK(&con->work, con_work);
422}
423
424
425/*
426 * We maintain a global counter to order connection attempts. Get
427 * a unique seq greater than @gt.
428 */
429static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
430{
431 u32 ret;
432
433 spin_lock(&msgr->global_seq_lock);
434 if (msgr->global_seq < gt)
435 msgr->global_seq = gt;
436 ret = ++msgr->global_seq;
437 spin_unlock(&msgr->global_seq_lock);
438 return ret;
439}
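/*
 * Editorial sketch (not in the original patch): the contract of
 * get_global_seq() is that each return value is unique, increasing,
 * and strictly greater than @gt.  For example:
 */
#if 0
	u32 a = get_global_seq(msgr, 0);	/* e.g. 1 */
	u32 b = get_global_seq(msgr, 10);	/* at least 11 */
	u32 c = get_global_seq(msgr, 0);	/* greater than b */
#endif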
440
441
442/*
443 * Prepare footer for currently outgoing message, and finish things
444 * off. Assumes out_kvec* are already valid.. we just add on to the end.
445 */
446static void prepare_write_message_footer(struct ceph_connection *con, int v)
447{
448 struct ceph_msg *m = con->out_msg;
449
450 dout("prepare_write_message_footer %p\n", con);
451 con->out_kvec_is_msg = true;
452 con->out_kvec[v].iov_base = &m->footer;
453 con->out_kvec[v].iov_len = sizeof(m->footer);
454 con->out_kvec_bytes += sizeof(m->footer);
455 con->out_kvec_left++;
456 con->out_more = m->more_to_follow;
457 con->out_msg_done = true;
458}
459
460/*
461 * Prepare headers for the next outgoing message.
462 */
463static void prepare_write_message(struct ceph_connection *con)
464{
465 struct ceph_msg *m;
466 int v = 0;
467
468 con->out_kvec_bytes = 0;
469 con->out_kvec_is_msg = true;
470 con->out_msg_done = false;
471
472 /* Sneak an ack in there first? If we can get it into the same
473 * TCP packet that's a good thing. */
474 if (con->in_seq > con->in_seq_acked) {
475 con->in_seq_acked = con->in_seq;
476 con->out_kvec[v].iov_base = &tag_ack;
477 con->out_kvec[v++].iov_len = 1;
478 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
479 con->out_kvec[v].iov_base = &con->out_temp_ack;
480 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
481 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
482 }
483
484 m = list_first_entry(&con->out_queue,
485 struct ceph_msg, list_head);
486 con->out_msg = m;
487 if (test_bit(LOSSYTX, &con->state)) {
488 list_del_init(&m->list_head);
489 } else {
490 /* put message on sent list */
491 ceph_msg_get(m);
492 list_move_tail(&m->list_head, &con->out_sent);
493 }
494
495 /*
496 * only assign outgoing seq # if we haven't sent this message
497 * yet. if it is requeued, resend with its original seq.
498 */
499 if (m->needs_out_seq) {
500 m->hdr.seq = cpu_to_le64(++con->out_seq);
501 m->needs_out_seq = false;
502 }
503
504 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
505 m, con->out_seq, le16_to_cpu(m->hdr.type),
506 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
507 le32_to_cpu(m->hdr.data_len),
508 m->nr_pages);
509 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
510
511 /* tag + hdr + front + middle */
512 con->out_kvec[v].iov_base = &tag_msg;
513 con->out_kvec[v++].iov_len = 1;
514 con->out_kvec[v].iov_base = &m->hdr;
515 con->out_kvec[v++].iov_len = sizeof(m->hdr);
516 con->out_kvec[v++] = m->front;
517 if (m->middle)
518 con->out_kvec[v++] = m->middle->vec;
519 con->out_kvec_left = v;
520 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
521 (m->middle ? m->middle->vec.iov_len : 0);
522 con->out_kvec_cur = con->out_kvec;
523
524 /* fill in crc (except data pages), footer */
525 con->out_msg->hdr.crc =
526 cpu_to_le32(crc32c(0, (void *)&m->hdr,
527 sizeof(m->hdr) - sizeof(m->hdr.crc)));
528 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
529 con->out_msg->footer.front_crc =
530 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
531 if (m->middle)
532 con->out_msg->footer.middle_crc =
533 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
534 m->middle->vec.iov_len));
535 else
536 con->out_msg->footer.middle_crc = 0;
537 con->out_msg->footer.data_crc = 0;
538 dout("prepare_write_message front_crc %u data_crc %u\n",
539 le32_to_cpu(con->out_msg->footer.front_crc),
540 le32_to_cpu(con->out_msg->footer.middle_crc));
541
542 /* is there a data payload? */
543 if (le32_to_cpu(m->hdr.data_len) > 0) {
544 /* initialize page iterator */
545 con->out_msg_pos.page = 0;
546 con->out_msg_pos.page_pos =
547 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
548 con->out_msg_pos.data_pos = 0;
549 con->out_msg_pos.did_page_crc = 0;
550 con->out_more = 1; /* data + footer will follow */
551 } else {
552 /* no, queue up footer too and be done */
553 prepare_write_message_footer(con, v);
554 }
555
556 set_bit(WRITE_PENDING, &con->state);
557}
558
559/*
560 * Prepare an ack.
561 */
562static void prepare_write_ack(struct ceph_connection *con)
563{
564 dout("prepare_write_ack %p %llu -> %llu\n", con,
565 con->in_seq_acked, con->in_seq);
566 con->in_seq_acked = con->in_seq;
567
568 con->out_kvec[0].iov_base = &tag_ack;
569 con->out_kvec[0].iov_len = 1;
570 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
571 con->out_kvec[1].iov_base = &con->out_temp_ack;
572 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
573 con->out_kvec_left = 2;
574 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
575 con->out_kvec_cur = con->out_kvec;
576 con->out_more = 1; /* more will follow.. eventually.. */
577 set_bit(WRITE_PENDING, &con->state);
578}
579
580/*
581 * Prepare to write keepalive byte.
582 */
583static void prepare_write_keepalive(struct ceph_connection *con)
584{
585 dout("prepare_write_keepalive %p\n", con);
586 con->out_kvec[0].iov_base = &tag_keepalive;
587 con->out_kvec[0].iov_len = 1;
588 con->out_kvec_left = 1;
589 con->out_kvec_bytes = 1;
590 con->out_kvec_cur = con->out_kvec;
591 set_bit(WRITE_PENDING, &con->state);
592}
593
594/*
595 * Connection negotiation.
596 */
597
598static void prepare_connect_authorizer(struct ceph_connection *con)
599{
600 void *auth_buf = NULL; /* stays NULL if there is no get_authorizer op */
601 int auth_len = 0;
602 int auth_protocol = 0;
603
604 mutex_unlock(&con->mutex);
605 if (con->ops->get_authorizer)
606 con->ops->get_authorizer(con, &auth_buf, &auth_len,
607 &auth_protocol, &con->auth_reply_buf,
608 &con->auth_reply_buf_len,
609 con->auth_retry);
610 mutex_lock(&con->mutex);
611
612 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
613 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
614
615 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
616 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
617 con->out_kvec_left++;
618 con->out_kvec_bytes += auth_len;
619}
620
621/*
622 * We connected to a peer and are saying hello.
623 */
624static void prepare_write_banner(struct ceph_messenger *msgr,
625 struct ceph_connection *con)
626{
627 int len = strlen(CEPH_BANNER);
628
629 con->out_kvec[0].iov_base = CEPH_BANNER;
630 con->out_kvec[0].iov_len = len;
631 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
632 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
633 con->out_kvec_left = 2;
634 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
635 con->out_kvec_cur = con->out_kvec;
636 con->out_more = 0;
637 set_bit(WRITE_PENDING, &con->state);
638}
639
640static void prepare_write_connect(struct ceph_messenger *msgr,
641 struct ceph_connection *con,
642 int after_banner)
643{
644 unsigned global_seq = get_global_seq(con->msgr, 0);
645 int proto;
646
647 switch (con->peer_name.type) {
648 case CEPH_ENTITY_TYPE_MON:
649 proto = CEPH_MONC_PROTOCOL;
650 break;
651 case CEPH_ENTITY_TYPE_OSD:
652 proto = CEPH_OSDC_PROTOCOL;
653 break;
654 case CEPH_ENTITY_TYPE_MDS:
655 proto = CEPH_MDSC_PROTOCOL;
656 break;
657 default:
658 BUG();
659 }
660
661 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
662 con->connect_seq, global_seq, proto);
663
664 con->out_connect.features = CEPH_FEATURE_SUPPORTED;
665 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
666 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
667 con->out_connect.global_seq = cpu_to_le32(global_seq);
668 con->out_connect.protocol_version = cpu_to_le32(proto);
669 con->out_connect.flags = 0;
670
671 if (!after_banner) {
672 con->out_kvec_left = 0;
673 con->out_kvec_bytes = 0;
674 }
675 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
676 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
677 con->out_kvec_left++;
678 con->out_kvec_bytes += sizeof(con->out_connect);
679 con->out_kvec_cur = con->out_kvec;
680 con->out_more = 0;
681 set_bit(WRITE_PENDING, &con->state);
682
683 prepare_connect_authorizer(con);
684}
685
686
687/*
688 * write as much of pending kvecs to the socket as we can.
689 * 1 -> done
690 * 0 -> socket full, but more to do
691 * <0 -> error
692 */
693static int write_partial_kvec(struct ceph_connection *con)
694{
695 int ret;
696
697 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
698 while (con->out_kvec_bytes > 0) {
699 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
700 con->out_kvec_left, con->out_kvec_bytes,
701 con->out_more);
702 if (ret <= 0)
703 goto out;
704 con->out_kvec_bytes -= ret;
705 if (con->out_kvec_bytes == 0)
706 break; /* done */
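		/* account for a partial send: step out_kvec_cur past any
		 * fully-sent kvecs, then trim the one we stopped inside */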
707 while (ret > 0) {
708 if (ret >= con->out_kvec_cur->iov_len) {
709 ret -= con->out_kvec_cur->iov_len;
710 con->out_kvec_cur++;
711 con->out_kvec_left--;
712 } else {
713 con->out_kvec_cur->iov_len -= ret;
714 con->out_kvec_cur->iov_base += ret;
715 ret = 0;
716 break;
717 }
718 }
719 }
720 con->out_kvec_left = 0;
721 con->out_kvec_is_msg = false;
722 ret = 1;
723out:
724 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
725 con->out_kvec_bytes, con->out_kvec_left, ret);
726 return ret; /* 1 = done, 0 = socket full, <0 = error */
727}
728
729/*
730 * Write as much message data payload as we can. If we finish, queue
731 * up the footer.
732 * 1 -> done, footer is now queued in out_kvec[].
733 * 0 -> socket full, but more to do
734 * <0 -> error
735 */
736static int write_partial_msg_pages(struct ceph_connection *con)
737{
738 struct ceph_msg *msg = con->out_msg;
739 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
740 size_t len;
741 int crc = con->msgr->nocrc;
742 int ret;
743
744 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
745 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
746 con->out_msg_pos.page_pos);
747
748 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
749 struct page *page = NULL;
750 void *kaddr = NULL;
751
752 /*
753 * if we are calculating the data crc (the default), we need
754 * to map the page. if our pages[] has been revoked, use the
755 * zero page.
756 */
757 if (msg->pages) {
758 page = msg->pages[con->out_msg_pos.page];
759 if (crc)
760 kaddr = kmap(page);
761 } else if (msg->pagelist) {
762 page = list_first_entry(&msg->pagelist->head,
763 struct page, lru);
764 if (crc)
765 kaddr = kmap(page);
766 } else {
767 page = con->msgr->zero_page;
768 if (crc)
769 kaddr = page_address(con->msgr->zero_page);
770 }
771 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
772 (int)(data_len - con->out_msg_pos.data_pos));
773 if (crc && !con->out_msg_pos.did_page_crc) {
774 void *base = kaddr + con->out_msg_pos.page_pos;
775 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
776
777 BUG_ON(kaddr == NULL);
778 con->out_msg->footer.data_crc =
779 cpu_to_le32(crc32c(tmpcrc, base, len));
780 con->out_msg_pos.did_page_crc = 1;
781 }
782
783 ret = kernel_sendpage(con->sock, page,
784 con->out_msg_pos.page_pos, len,
785 MSG_DONTWAIT | MSG_NOSIGNAL |
786 MSG_MORE);
787
788 if (crc && (msg->pages || msg->pagelist))
789 kunmap(page);
790
791 if (ret <= 0)
792 goto out;
793
794 con->out_msg_pos.data_pos += ret;
795 con->out_msg_pos.page_pos += ret;
796 if (ret == len) {
797 con->out_msg_pos.page_pos = 0;
798 con->out_msg_pos.page++;
799 con->out_msg_pos.did_page_crc = 0;
800 if (msg->pagelist)
801 list_move_tail(&page->lru,
802 &msg->pagelist->head);
803 }
804 }
805
806 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
807
808 /* prepare and queue up footer, too */
809 if (!crc)
810 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
811 con->out_kvec_bytes = 0;
812 con->out_kvec_left = 0;
813 con->out_kvec_cur = con->out_kvec;
814 prepare_write_message_footer(con, 0);
815 ret = 1;
816out:
817 return ret;
818}
819
820/*
821 * write some zeros
822 */
823static int write_partial_skip(struct ceph_connection *con)
824{
825 int ret;
826
827 while (con->out_skip > 0) {
828 struct kvec iov = {
829 .iov_base = page_address(con->msgr->zero_page),
830 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
831 };
832
833 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
834 if (ret <= 0)
835 goto out;
836 con->out_skip -= ret;
837 }
838 ret = 1;
839out:
840 return ret;
841}
842
843/*
844 * Prepare to read connection handshake, or an ack.
845 */
846static void prepare_read_banner(struct ceph_connection *con)
847{
848 dout("prepare_read_banner %p\n", con);
849 con->in_base_pos = 0;
850}
851
852static void prepare_read_connect(struct ceph_connection *con)
853{
854 dout("prepare_read_connect %p\n", con);
855 con->in_base_pos = 0;
856}
857
858static void prepare_read_ack(struct ceph_connection *con)
859{
860 dout("prepare_read_ack %p\n", con);
861 con->in_base_pos = 0;
862}
863
864static void prepare_read_tag(struct ceph_connection *con)
865{
866 dout("prepare_read_tag %p\n", con);
867 con->in_base_pos = 0;
868 con->in_tag = CEPH_MSGR_TAG_READY;
869}
870
871/*
872 * Prepare to read a message.
873 */
874static int prepare_read_message(struct ceph_connection *con)
875{
876 dout("prepare_read_message %p\n", con);
877 BUG_ON(con->in_msg != NULL);
878 con->in_base_pos = 0;
879 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
880 return 0;
881}
882
883
884static int read_partial(struct ceph_connection *con,
885 int *to, int size, void *object)
886{
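	/*
	 * Editorial note: @to is a running cursor shared across successive
	 * read_partial() calls; each call advances it by @size, while
	 * con->in_base_pos tracks how many bytes of that stream of fields
	 * have actually arrived so far.
	 */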
887 *to += size;
888 while (con->in_base_pos < *to) {
889 int left = *to - con->in_base_pos;
890 int have = size - left;
891 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
892 if (ret <= 0)
893 return ret;
894 con->in_base_pos += ret;
895 }
896 return 1;
897}
898
899
900/*
901 * Read all or part of the connect-side handshake on a new connection
902 */
903static int read_partial_banner(struct ceph_connection *con)
904{
905 int ret, to = 0;
906
907 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
908
909 /* peer's banner */
910 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
911 if (ret <= 0)
912 goto out;
913 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
914 &con->actual_peer_addr);
915 if (ret <= 0)
916 goto out;
917 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
918 &con->peer_addr_for_me);
919 if (ret <= 0)
920 goto out;
921out:
922 return ret;
923}
924
925static int read_partial_connect(struct ceph_connection *con)
926{
927 int ret, to = 0;
928
929 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
930
931 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
932 if (ret <= 0)
933 goto out;
934 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
935 con->auth_reply_buf);
936 if (ret <= 0)
937 goto out;
938
939 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
940 con, (int)con->in_reply.tag,
941 le32_to_cpu(con->in_reply.connect_seq),
942 le32_to_cpu(con->in_reply.global_seq));
943out:
944 return ret;
945
946}
947
948/*
949 * Verify the hello banner looks okay.
950 */
951static int verify_hello(struct ceph_connection *con)
952{
953 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
954 pr_err("connect to %s got bad banner\n",
955 pr_addr(&con->peer_addr.in_addr));
956 con->error_msg = "protocol error, bad banner";
957 return -1;
958 }
959 return 0;
960}
961
962static bool addr_is_blank(struct sockaddr_storage *ss)
963{
964 switch (ss->ss_family) {
965 case AF_INET:
966 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
967 case AF_INET6:
968 return
969 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
970 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
971 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
972 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
973 }
974 return false;
975}
976
977static int addr_port(struct sockaddr_storage *ss)
978{
979 switch (ss->ss_family) {
980 case AF_INET:
981 return ntohs(((struct sockaddr_in *)ss)->sin_port);
982 case AF_INET6:
983 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
984 }
985 return 0;
986}
987
988static void addr_set_port(struct sockaddr_storage *ss, int p)
989{
990 switch (ss->ss_family) {
991 case AF_INET:
992 ((struct sockaddr_in *)ss)->sin_port = htons(p);
993 break; /* don't fall through into the IPv6 store */
994 case AF_INET6:
995 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
996 }
997}
997
998/*
999 * Parse an ip[:port] list into an addr array. Use the default
1000 * monitor port if a port isn't specified.
1001 */
1002int ceph_parse_ips(const char *c, const char *end,
1003 struct ceph_entity_addr *addr,
1004 int max_count, int *count)
1005{
1006 int i;
1007 const char *p = c;
1008
1009 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
1010 for (i = 0; i < max_count; i++) {
1011 const char *ipend;
1012 struct sockaddr_storage *ss = &addr[i].in_addr;
1013 struct sockaddr_in *in4 = (void *)ss;
1014 struct sockaddr_in6 *in6 = (void *)ss;
1015 int port;
1016
1017 memset(ss, 0, sizeof(*ss));
1018 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1019 ',', &ipend)) {
1020 ss->ss_family = AF_INET;
1021 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1022 ',', &ipend)) {
1023 ss->ss_family = AF_INET6;
1024 } else {
1025 goto bad;
1026 }
1027 p = ipend;
1028
1029 /* port? */
1030 if (p < end && *p == ':') {
1031 port = 0;
1032 p++;
1033 while (p < end && *p >= '0' && *p <= '9') {
1034 port = (port * 10) + (*p - '0');
1035 p++;
1036 }
1037 if (port > 65535 || port == 0)
1038 goto bad;
1039 } else {
1040 port = CEPH_MON_PORT;
1041 }
1042
1043 addr_set_port(ss, port);
1044
1045 dout("parse_ips got %s\n", pr_addr(ss));
1046
1047 if (p == end)
1048 break;
1049 if (*p != ',')
1050 goto bad;
1051 p++;
1052 }
1053
1054 if (p != end)
1055 goto bad;
1056
1057 if (count)
1058 *count = i + 1;
1059 return 0;
1060
1061bad:
1062 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1063 return -EINVAL;
1064}
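/*
 * Editorial sketch (not part of this patch): parsing a mount-style
 * monitor list with ceph_parse_ips().  The address string is made up;
 * entries without an explicit port default to CEPH_MON_PORT.
 */
#if 0
static int example_parse_mons(struct ceph_entity_addr *mon_addr /* [3] */)
{
	const char *s = "1.2.3.4:6789,10.0.0.1";
	int num_mon;
	int err;

	err = ceph_parse_ips(s, s + strlen(s), mon_addr, 3, &num_mon);
	if (err < 0)
		return err;		/* -EINVAL on malformed input */
	/* here num_mon == 2 and mon_addr[1] carries CEPH_MON_PORT */
	return 0;
}
#endif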
1065
1066static int process_banner(struct ceph_connection *con)
1067{
1068 dout("process_banner on %p\n", con);
1069
1070 if (verify_hello(con) < 0)
1071 return -1;
1072
1073 ceph_decode_addr(&con->actual_peer_addr);
1074 ceph_decode_addr(&con->peer_addr_for_me);
1075
1076 /*
1077 * Make sure the other end is who we wanted. note that the other
1078 * end may not yet know their ip address, so if it's 0.0.0.0, give
1079 * them the benefit of the doubt.
1080 */
1081 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1082 sizeof(con->peer_addr)) != 0 &&
1083 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1084 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1085 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
1086 pr_addr(&con->peer_addr.in_addr),
1087 le64_to_cpu(con->peer_addr.nonce),
1088 pr_addr(&con->actual_peer_addr.in_addr),
1089 le64_to_cpu(con->actual_peer_addr.nonce));
1090 con->error_msg = "wrong peer at address";
1091 return -1;
1092 }
1093
1094 /*
1095 * did we learn our address?
1096 */
1097 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1098 int port = addr_port(&con->msgr->inst.addr.in_addr);
1099
1100 memcpy(&con->msgr->inst.addr.in_addr,
1101 &con->peer_addr_for_me.in_addr,
1102 sizeof(con->peer_addr_for_me.in_addr));
1103 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1104 encode_my_addr(con->msgr);
1105 dout("process_banner learned my addr is %s\n",
1106 pr_addr(&con->msgr->inst.addr.in_addr));
1107 }
1108
1109 set_bit(NEGOTIATING, &con->state);
1110 prepare_read_connect(con);
1111 return 0;
1112}
1113
1114static void fail_protocol(struct ceph_connection *con)
1115{
1116 reset_connection(con);
1117 set_bit(CLOSED, &con->state); /* in case there's queued work */
1118
1119 mutex_unlock(&con->mutex);
1120 if (con->ops->bad_proto)
1121 con->ops->bad_proto(con);
1122 mutex_lock(&con->mutex);
1123}
1124
1125static int process_connect(struct ceph_connection *con)
1126{
1127 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1128 u64 req_feat = CEPH_FEATURE_REQUIRED;
1129 u64 server_feat = le64_to_cpu(con->in_reply.features);
1130
1131 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1132
1133 switch (con->in_reply.tag) {
1134 case CEPH_MSGR_TAG_FEATURES:
1135 pr_err("%s%lld %s feature set mismatch,"
1136 " my %llx < server's %llx, missing %llx\n",
1137 ENTITY_NAME(con->peer_name),
1138 pr_addr(&con->peer_addr.in_addr),
1139 sup_feat, server_feat, server_feat & ~sup_feat);
1140 con->error_msg = "missing required protocol features";
1141 fail_protocol(con);
1142 return -1;
1143
1144 case CEPH_MSGR_TAG_BADPROTOVER:
1145 pr_err("%s%lld %s protocol version mismatch,"
1146 " my %d != server's %d\n",
1147 ENTITY_NAME(con->peer_name),
1148 pr_addr(&con->peer_addr.in_addr),
1149 le32_to_cpu(con->out_connect.protocol_version),
1150 le32_to_cpu(con->in_reply.protocol_version));
1151 con->error_msg = "protocol version mismatch";
1152 fail_protocol(con);
1153 return -1;
1154
1155 case CEPH_MSGR_TAG_BADAUTHORIZER:
1156 con->auth_retry++;
1157 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1158 con->auth_retry);
1159 if (con->auth_retry == 2) {
1160 con->error_msg = "connect authorization failure";
1161 reset_connection(con);
1162 set_bit(CLOSED, &con->state);
1163 return -1;
1164 }
1165 con->auth_retry = 1;
1166 prepare_write_connect(con->msgr, con, 0);
1167 prepare_read_connect(con);
1168 break;
1169
1170 case CEPH_MSGR_TAG_RESETSESSION:
1171 /*
1172 * If we connected with a large connect_seq but the peer
1173 * has no record of a session with us (no connection, or
1174 * connect_seq == 0), they will send RESETSESSION to indicate
1175 * that they must have reset their session, and may have
1176 * dropped messages.
1177 */
1178 dout("process_connect got RESET peer seq %u\n",
1179 le32_to_cpu(con->in_connect.connect_seq));
1180 pr_err("%s%lld %s connection reset\n",
1181 ENTITY_NAME(con->peer_name),
1182 pr_addr(&con->peer_addr.in_addr));
1183 reset_connection(con);
1184 prepare_write_connect(con->msgr, con, 0);
1185 prepare_read_connect(con);
1186
1187 /* Tell ceph about it. */
1188 mutex_unlock(&con->mutex);
1189 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1190 if (con->ops->peer_reset)
1191 con->ops->peer_reset(con);
1192 mutex_lock(&con->mutex);
1193 break;
1194
1195 case CEPH_MSGR_TAG_RETRY_SESSION:
1196 /*
1197 * If we sent a smaller connect_seq than the peer has, try
1198 * again with a larger value.
1199 */
1200 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1201 le32_to_cpu(con->out_connect.connect_seq),
1202 le32_to_cpu(con->in_connect.connect_seq));
1203 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1204 prepare_write_connect(con->msgr, con, 0);
1205 prepare_read_connect(con);
1206 break;
1207
1208 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1209 /*
1210 * If we sent a smaller global_seq than the peer has, try
1211 * again with a larger value.
1212 */
1213 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1214 con->peer_global_seq,
1215 le32_to_cpu(con->in_connect.global_seq));
1216 get_global_seq(con->msgr,
1217 le32_to_cpu(con->in_connect.global_seq));
1218 prepare_write_connect(con->msgr, con, 0);
1219 prepare_read_connect(con);
1220 break;
1221
1222 case CEPH_MSGR_TAG_READY:
1223 if (req_feat & ~server_feat) {
1224 pr_err("%s%lld %s protocol feature mismatch,"
1225 " my required %llx > server's %llx, need %llx\n",
1226 ENTITY_NAME(con->peer_name),
1227 pr_addr(&con->peer_addr.in_addr),
1228 req_feat, server_feat, req_feat & ~server_feat);
1229 con->error_msg = "missing required protocol features";
1230 fail_protocol(con);
1231 return -1;
1232 }
1233 clear_bit(CONNECTING, &con->state);
1234 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1235 con->connect_seq++;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq),
1239 con->connect_seq);
1240 WARN_ON(con->connect_seq !=
1241 le32_to_cpu(con->in_reply.connect_seq));
1242
1243 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1244 set_bit(LOSSYTX, &con->state);
1245
1246 prepare_read_tag(con);
1247 break;
1248
1249 case CEPH_MSGR_TAG_WAIT:
1250 /*
1251 * If there is a connection race (we are opening
1252 * connections to each other), one of us may just have
1253 * to WAIT. This shouldn't happen if we are the
1254 * client.
1255 */
1256 pr_err("process_connect peer connecting WAIT\n");
1257 /* fall through: the client treats WAIT as a protocol error */
1258 default:
1259 pr_err("connect protocol error, will retry\n");
1260 con->error_msg = "protocol error, garbage tag during connect";
1261 return -1;
1262 }
1263 return 0;
1264}
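/*
 * Editorial summary of the reply tags handled above: FEATURES and
 * BADPROTOVER are fatal mismatches; BADAUTHORIZER gets one retry with
 * fresh authorizer data; RESETSESSION wipes local state and
 * reconnects; RETRY_SESSION and RETRY_GLOBAL re-send the connect with
 * the peer's larger sequence numbers; READY completes the handshake.
 */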
1265
1266
1267/*
1268 * read (part of) an ack
1269 */
1270static int read_partial_ack(struct ceph_connection *con)
1271{
1272 int to = 0;
1273
1274 return read_partial(con, &to, sizeof(con->in_temp_ack),
1275 &con->in_temp_ack);
1276}
1277
1278
1279/*
1280 * We can finally discard anything that's been acked.
1281 */
1282static void process_ack(struct ceph_connection *con)
1283{
1284 struct ceph_msg *m;
1285 u64 ack = le64_to_cpu(con->in_temp_ack);
1286 u64 seq;
1287
1288 while (!list_empty(&con->out_sent)) {
1289 m = list_first_entry(&con->out_sent, struct ceph_msg,
1290 list_head);
1291 seq = le64_to_cpu(m->hdr.seq);
1292 if (seq > ack)
1293 break;
1294 dout("got ack for seq %llu type %d at %p\n", seq,
1295 le16_to_cpu(m->hdr.type), m);
1296 ceph_msg_remove(m);
1297 }
1298 prepare_read_tag(con);
1299}
1300
1301
1302
1303
1304static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section, unsigned int sec_len,
1306 u32 *crc)
1307{
1308 int left;
1309 int ret;
1310
1311 BUG_ON(!section);
1312
1313 while (section->iov_len < sec_len) {
1314 BUG_ON(section->iov_base == NULL);
1315 left = sec_len - section->iov_len;
1316 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1317 section->iov_len, left);
1318 if (ret <= 0)
1319 return ret;
1320 section->iov_len += ret;
1321 if (section->iov_len == sec_len)
1322 *crc = crc32c(0, section->iov_base,
1323 section->iov_len);
1324 }
1325
1326 return 1;
1327}
1328
1329static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1330 struct ceph_msg_header *hdr,
1331 int *skip);
1332/*
1333 * read (part of) a message.
1334 */
1335static int read_partial_message(struct ceph_connection *con)
1336{
1337 struct ceph_msg *m = con->in_msg;
1338 void *p;
1339 int ret;
1340 int to, left;
1341 unsigned front_len, middle_len, data_len, data_off;
1342 int datacrc = con->msgr->nocrc;
1343 int skip;
1344 u64 seq;
1345
1346 dout("read_partial_message con %p msg %p\n", con, m);
1347
1348 /* header */
1349 while (con->in_base_pos < sizeof(con->in_hdr)) {
1350 left = sizeof(con->in_hdr) - con->in_base_pos;
1351 ret = ceph_tcp_recvmsg(con->sock,
1352 (char *)&con->in_hdr + con->in_base_pos,
1353 left);
1354 if (ret <= 0)
1355 return ret;
1356 con->in_base_pos += ret;
1357 if (con->in_base_pos == sizeof(con->in_hdr)) {
1358 u32 crc = crc32c(0, (void *)&con->in_hdr,
1359 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1360 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1361 pr_err("read_partial_message bad hdr"
1362 " crc %u != expected %u\n",
1363 crc, le32_to_cpu(con->in_hdr.crc));
1364 return -EBADMSG;
1365 }
1366 }
1367 }
1368 front_len = le32_to_cpu(con->in_hdr.front_len);
1369 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1370 return -EIO;
1371 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1372 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1373 return -EIO;
1374 data_len = le32_to_cpu(con->in_hdr.data_len);
1375 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1376 return -EIO;
1377 data_off = le16_to_cpu(con->in_hdr.data_off);
1378
1379 /* verify seq# */
1380 seq = le64_to_cpu(con->in_hdr.seq);
1381 if ((s64)seq - (s64)con->in_seq < 1) {
1382 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1383 ENTITY_NAME(con->peer_name),
1384 pr_addr(&con->peer_addr.in_addr),
1385 seq, con->in_seq + 1);
1386 con->in_base_pos = -front_len - middle_len - data_len -
1387 sizeof(m->footer);
1388 con->in_tag = CEPH_MSGR_TAG_READY;
1389 con->in_seq++;
1390 return 0;
1391 } else if ((s64)seq - (s64)con->in_seq > 1) {
1392 pr_err("read_partial_message bad seq %lld expected %lld\n",
1393 seq, con->in_seq + 1);
1394 con->error_msg = "bad message sequence # for incoming message";
1395 return -EBADMSG;
1396 }
1397
1398 /* allocate message? */
1399 if (!con->in_msg) {
1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1401 con->in_hdr.front_len, con->in_hdr.data_len);
1402 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1403 if (skip) {
1404 /* skip this message */
1405 dout("alloc_msg returned NULL, skipping message\n");
1406 con->in_base_pos = -front_len - middle_len - data_len -
1407 sizeof(m->footer);
1408 con->in_tag = CEPH_MSGR_TAG_READY;
1409 con->in_seq++;
1410 return 0;
1411 }
1412 if (IS_ERR(con->in_msg)) {
1413 ret = PTR_ERR(con->in_msg);
1414 con->in_msg = NULL;
1415 con->error_msg =
1416 "error allocating memory for incoming message";
1417 return ret;
1418 }
1419 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */
1421 if (m->middle)
1422 m->middle->vec.iov_len = 0;
1423
1424 con->in_msg_pos.page = 0;
1425 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1426 con->in_msg_pos.data_pos = 0;
1427 }
1428
1429 /* front */
1430 ret = read_partial_message_section(con, &m->front, front_len,
1431 &con->in_front_crc);
1432 if (ret <= 0)
1433 return ret;
1434
1435 /* middle */
1436 if (m->middle) {
1437 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1438 &con->in_middle_crc);
1439 if (ret <= 0)
1440 return ret;
1441 }
1442
1443 /* (page) data */
1444 while (con->in_msg_pos.data_pos < data_len) {
1445 left = min((int)(data_len - con->in_msg_pos.data_pos),
1446 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1447 BUG_ON(m->pages == NULL);
1448 p = kmap(m->pages[con->in_msg_pos.page]);
1449 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1450 left);
1451 if (ret > 0 && datacrc)
1452 con->in_data_crc =
1453 crc32c(con->in_data_crc,
1454 p + con->in_msg_pos.page_pos, ret);
1455 kunmap(m->pages[con->in_msg_pos.page]);
1456 if (ret <= 0)
1457 return ret;
1458 con->in_msg_pos.data_pos += ret;
1459 con->in_msg_pos.page_pos += ret;
1460 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1461 con->in_msg_pos.page_pos = 0;
1462 con->in_msg_pos.page++;
1463 }
1464 }
1465
1466 /* footer */
1467 to = sizeof(m->hdr) + sizeof(m->footer);
1468 while (con->in_base_pos < to) {
1469 left = to - con->in_base_pos;
1470 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1471 (con->in_base_pos - sizeof(m->hdr)),
1472 left);
1473 if (ret <= 0)
1474 return ret;
1475 con->in_base_pos += ret;
1476 }
1477 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1478 m, front_len, m->footer.front_crc, middle_len,
1479 m->footer.middle_crc, data_len, m->footer.data_crc);
1480
1481 /* crc ok? */
1482 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1483 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1484 m, con->in_front_crc, m->footer.front_crc);
1485 return -EBADMSG;
1486 }
1487 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1488 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1489 m, con->in_middle_crc, m->footer.middle_crc);
1490 return -EBADMSG;
1491 }
1492 if (datacrc &&
1493 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1494 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1495 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1496 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1497 return -EBADMSG;
1498 }
1499
1500 return 1; /* done! */
1501}
1502
1503/*
1504 * Process message. This happens in the worker thread. The callback should
1505 * be careful not to do anything that waits on other incoming messages or it
1506 * may deadlock.
1507 */
1508static void process_message(struct ceph_connection *con)
1509{
1510 struct ceph_msg *msg;
1511
1512 msg = con->in_msg;
1513 con->in_msg = NULL;
1514
1515 /* if first message, set peer_name */
1516 if (con->peer_name.type == 0)
1517 con->peer_name = msg->hdr.src.name;
1518
1519 con->in_seq++;
1520 mutex_unlock(&con->mutex);
1521
1522 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1523 msg, le64_to_cpu(msg->hdr.seq),
1524 ENTITY_NAME(msg->hdr.src.name),
1525 le16_to_cpu(msg->hdr.type),
1526 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1527 le32_to_cpu(msg->hdr.front_len),
1528 le32_to_cpu(msg->hdr.data_len),
1529 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1530 con->ops->dispatch(con, msg);
1531
1532 mutex_lock(&con->mutex);
1533 prepare_read_tag(con);
1534}
1535
1536
1537/*
1538 * Write something to the socket. Called in a worker thread when the
1539 * socket appears to be writeable and we have something ready to send.
1540 */
1541static int try_write(struct ceph_connection *con)
1542{
1543 struct ceph_messenger *msgr = con->msgr;
1544 int ret = 1;
1545
1546 dout("try_write start %p state %lu nref %d\n", con, con->state,
1547 atomic_read(&con->nref));
1548
1549 mutex_lock(&con->mutex);
1550more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552
1553 /* open the socket first? */
1554 if (con->sock == NULL) {
1555 /*
1556 * if we were STANDBY and are reconnecting _this_
1557 * connection, bump connect_seq now. Always bump
1558 * global_seq.
1559 */
1560 if (test_and_clear_bit(STANDBY, &con->state))
1561 con->connect_seq++;
1562
1563 prepare_write_banner(msgr, con);
1564 prepare_write_connect(msgr, con, 1);
1565 prepare_read_banner(con);
1566 set_bit(CONNECTING, &con->state);
1567 clear_bit(NEGOTIATING, &con->state);
1568
1569 BUG_ON(con->in_msg);
1570 con->in_tag = CEPH_MSGR_TAG_READY;
1571 dout("try_write initiating connect on %p new state %lu\n",
1572 con, con->state);
1573 con->sock = ceph_tcp_connect(con);
1574 if (IS_ERR(con->sock)) {
1575 con->sock = NULL;
1576 con->error_msg = "connect error";
1577 ret = -1;
1578 goto out;
1579 }
1580 }
1581
1582more_kvec:
1583 /* kvec data queued? */
1584 if (con->out_skip) {
1585 ret = write_partial_skip(con);
1586 if (ret <= 0)
1587 goto done;
1588 if (ret < 0) {
1589 dout("try_write write_partial_skip err %d\n", ret);
1590 goto done;
1591 }
1592 }
1593 if (con->out_kvec_left) {
1594 ret = write_partial_kvec(con);
1595 if (ret <= 0)
1596 goto done;
1597 }
1598
1599 /* msg pages? */
1600 if (con->out_msg) {
1601 if (con->out_msg_done) {
1602 ceph_msg_put(con->out_msg);
1603 con->out_msg = NULL; /* we're done with this one */
1604 goto do_next;
1605 }
1606
1607 ret = write_partial_msg_pages(con);
1608 if (ret == 1)
1609 goto more_kvec; /* we need to send the footer, too! */
1610 if (ret == 0)
1611 goto done;
1612 if (ret < 0) {
1613 dout("try_write write_partial_msg_pages err %d\n",
1614 ret);
1615 goto done;
1616 }
1617 }
1618
1619do_next:
1620 if (!test_bit(CONNECTING, &con->state)) {
1621 /* is anything else pending? */
1622 if (!list_empty(&con->out_queue)) {
1623 prepare_write_message(con);
1624 goto more;
1625 }
1626 if (con->in_seq > con->in_seq_acked) {
1627 prepare_write_ack(con);
1628 goto more;
1629 }
1630 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1631 prepare_write_keepalive(con);
1632 goto more;
1633 }
1634 }
1635
1636 /* Nothing to do! */
1637 clear_bit(WRITE_PENDING, &con->state);
1638 dout("try_write nothing else to write.\n");
1639done:
1640 ret = 0;
1641out:
1642 mutex_unlock(&con->mutex);
1643 dout("try_write done on %p\n", con);
1644 return ret;
1645}
1646
1647
1648
1649/*
1650 * Read what we can from the socket.
1651 */
1652static int try_read(struct ceph_connection *con)
1653{
1654 struct ceph_messenger *msgr;
1655 int ret = -1;
1656
1657 if (!con->sock)
1658 return 0;
1659
1660 if (test_bit(STANDBY, &con->state))
1661 return 0;
1662
1663 dout("try_read start on %p\n", con);
1664 msgr = con->msgr;
1665
1666 mutex_lock(&con->mutex);
1667
1668more:
1669 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1670 con->in_base_pos);
1671 if (test_bit(CONNECTING, &con->state)) {
1672 if (!test_bit(NEGOTIATING, &con->state)) {
1673 dout("try_read connecting\n");
1674 ret = read_partial_banner(con);
1675 if (ret <= 0)
1676 goto done;
1677 if (process_banner(con) < 0) {
1678 ret = -1;
1679 goto out;
1680 }
1681 }
1682 ret = read_partial_connect(con);
1683 if (ret <= 0)
1684 goto done;
1685 if (process_connect(con) < 0) {
1686 ret = -1;
1687 goto out;
1688 }
1689 goto more;
1690 }
1691
1692 if (con->in_base_pos < 0) {
1693 /*
1694 * skipping + discarding content.
1695 *
1696 * FIXME: there must be a better way to do this!
1697 */
1698 static char buf[1024];
1699 int skip = min(1024, -con->in_base_pos);
1700 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1701 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1702 if (ret <= 0)
1703 goto done;
1704 con->in_base_pos += ret;
1705 if (con->in_base_pos)
1706 goto more;
1707 }
1708 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1709 /*
1710 * what's next?
1711 */
1712 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1713 if (ret <= 0)
1714 goto done;
1715 dout("try_read got tag %d\n", (int)con->in_tag);
1716 switch (con->in_tag) {
1717 case CEPH_MSGR_TAG_MSG:
1718 prepare_read_message(con);
1719 break;
1720 case CEPH_MSGR_TAG_ACK:
1721 prepare_read_ack(con);
1722 break;
1723 case CEPH_MSGR_TAG_CLOSE:
1724 set_bit(CLOSED, &con->state); /* fixme */
1725 goto done;
1726 default:
1727 goto bad_tag;
1728 }
1729 }
1730 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1731 ret = read_partial_message(con);
1732 if (ret <= 0) {
1733 switch (ret) {
1734 case -EBADMSG:
1735 con->error_msg = "bad crc";
1736 ret = -EIO;
1737 goto out;
1738 case -EIO:
1739 con->error_msg = "io error";
1740 goto out;
1741 default:
1742 goto done;
1743 }
1744 }
1745 if (con->in_tag == CEPH_MSGR_TAG_READY)
1746 goto more;
1747 process_message(con);
1748 goto more;
1749 }
1750 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1751 ret = read_partial_ack(con);
1752 if (ret <= 0)
1753 goto done;
1754 process_ack(con);
1755 goto more;
1756 }
1757
1758done:
1759 ret = 0;
1760out:
1761 mutex_unlock(&con->mutex);
1762 dout("try_read done on %p\n", con);
1763 return ret;
1764
1765bad_tag:
1766 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1767 con->error_msg = "protocol error, garbage tag";
1768 ret = -1;
1769 goto out;
1770}
1771
1772
1773/*
1774 * Atomically queue work on a connection. Bump @con reference to
1775 * avoid races with connection teardown.
1776 *
1777 * There is some trickery going on with QUEUED and BUSY because we
1778 * only want a _single_ thread operating on each connection at any
1779 * point in time, but we want to use all available CPUs.
1780 *
1781 * The worker thread only proceeds if it can atomically set BUSY. It
1782 * clears QUEUED and does its thing. When it thinks it's done, it
1783 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
1784 * (tries again to set BUSY).
1785 *
1786 * To queue work, we first set QUEUED. If BUSY is already set, or if
1787 * queue_work() reports the work already queued, we give up (the work
1788 * is, or soon will be, handled by another thread) but leave QUEUED
1789 * set so that the worker thread will loop again if necessary.
1790 */
1791static void queue_con(struct ceph_connection *con)
1792{
1793 if (test_bit(DEAD, &con->state)) {
1794 dout("queue_con %p ignoring: DEAD\n",
1795 con);
1796 return;
1797 }
1798
1799 if (!con->ops->get(con)) {
1800 dout("queue_con %p ref count 0\n", con);
1801 return;
1802 }
1803
1804 set_bit(QUEUED, &con->state);
1805 if (test_bit(BUSY, &con->state)) {
1806 dout("queue_con %p - already BUSY\n", con);
1807 con->ops->put(con);
1808 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1809 dout("queue_con %p - already queued\n", con);
1810 con->ops->put(con);
1811 } else {
1812 dout("queue_con %p\n", con);
1813 }
1814}
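/*
 * Editorial sketch (not in the original patch): the QUEUED/BUSY
 * handshake reduced to its skeleton, mirroring con_work() below.
 * do_work() is hypothetical.
 */
#if 0
static void worker_skeleton(struct ceph_connection *con)
{
more:
	if (test_and_set_bit(BUSY, &con->state))
		return;			/* another thread owns the connection */
	clear_bit(QUEUED, &con->state);
	do_work(con);			/* hypothetical payload */
	clear_bit(BUSY, &con->state);
	if (test_bit(QUEUED, &con->state))
		goto more;		/* re-queued while we were busy */
}
#endif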
1815
1816/*
1817 * Do some work on a connection. Drop a connection ref when we're done.
1818 */
1819static void con_work(struct work_struct *work)
1820{
1821 struct ceph_connection *con = container_of(work, struct ceph_connection,
1822 work.work);
1823 int backoff = 0;
1824
1825more:
1826 if (test_and_set_bit(BUSY, &con->state) != 0) {
1827 dout("con_work %p BUSY already set\n", con);
1828 goto out;
1829 }
1830 dout("con_work %p start, clearing QUEUED\n", con);
1831 clear_bit(QUEUED, &con->state);
1832
1833 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1834 dout("con_work CLOSED\n");
1835 con_close_socket(con);
1836 goto done;
1837 }
1838 if (test_and_clear_bit(OPENING, &con->state)) {
1839 /* reopen w/ new peer */
1840 dout("con_work OPENING\n");
1841 con_close_socket(con);
1842 }
1843
1844 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1845 try_read(con) < 0 ||
1846 try_write(con) < 0) {
1847 backoff = 1;
1848 ceph_fault(con); /* error/fault path */
1849 }
1850
1851done:
1852 clear_bit(BUSY, &con->state);
1853 dout("con->state=%lu\n", con->state);
1854 if (test_bit(QUEUED, &con->state)) {
1855 if (!backoff || test_bit(OPENING, &con->state)) {
1856 dout("con_work %p QUEUED reset, looping\n", con);
1857 goto more;
1858 }
1859 dout("con_work %p QUEUED reset, but just faulted\n", con);
1860 clear_bit(QUEUED, &con->state);
1861 }
1862 dout("con_work %p done\n", con);
1863
1864out:
1865 con->ops->put(con);
1866}
1867
1868
1869/*
1870 * Generic error/fault handler. A retry mechanism is used with
1871 * exponential backoff
1872 */
1873static void ceph_fault(struct ceph_connection *con)
1874{
1875 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1876 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1877 dout("fault %p state %lu to peer %s\n",
1878 con, con->state, pr_addr(&con->peer_addr.in_addr));
1879
1880 if (test_bit(LOSSYTX, &con->state)) {
1881 dout("fault on LOSSYTX channel\n");
1882 goto out;
1883 }
1884
1885 mutex_lock(&con->mutex);
1886 if (test_bit(CLOSED, &con->state))
1887 goto out_unlock;
1888
1889 con_close_socket(con);
1890
1891 if (con->in_msg) {
1892 ceph_msg_put(con->in_msg);
1893 con->in_msg = NULL;
1894 }
1895
1896 /* Requeue anything that hasn't been acked */
1897 list_splice_init(&con->out_sent, &con->out_queue);
1898
1899 /* If there are no messages in the queue, place the connection
1900 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1901 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1902 dout("fault setting STANDBY\n");
1903 set_bit(STANDBY, &con->state);
1904 } else {
1905 /* retry after a delay. */
1906 if (con->delay == 0)
1907 con->delay = BASE_DELAY_INTERVAL;
1908 else if (con->delay < MAX_DELAY_INTERVAL)
1909 con->delay *= 2;
1910 dout("fault queueing %p delay %lu\n", con, con->delay);
1911 con->ops->get(con);
1912 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1913 round_jiffies_relative(con->delay)) == 0)
1914 con->ops->put(con);
1915 }
1916
1917out_unlock:
1918 mutex_unlock(&con->mutex);
1919out:
1920 /*
1921 * in case we faulted due to authentication, invalidate our
1922 * current tickets so that we can get new ones.
1923 */
1924 if (con->auth_retry && con->ops->invalidate_authorizer) {
1925 dout("calling invalidate_authorizer()\n");
1926 con->ops->invalidate_authorizer(con);
1927 }
1928
1929 if (con->ops->fault)
1930 con->ops->fault(con);
1931}
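/*
 * Editorial sketch (not in the original patch): the retry delay
 * produced by the doubling above, factored into a helper.  Successive
 * faults yield BASE_DELAY_INTERVAL, then 2x, 4x, ... until
 * MAX_DELAY_INTERVAL stops the growth.
 */
#if 0
static unsigned long next_fault_delay(unsigned long delay)
{
	if (delay == 0)
		return BASE_DELAY_INTERVAL;
	if (delay < MAX_DELAY_INTERVAL)
		return delay * 2;
	return delay;			/* effectively capped */
}
#endif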
1932
1933
1934
1935/*
1936 * create a new messenger instance
1937 */
1938struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1939{
1940 struct ceph_messenger *msgr;
1941
1942 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1943 if (msgr == NULL)
1944 return ERR_PTR(-ENOMEM);
1945
1946 spin_lock_init(&msgr->global_seq_lock);
1947
1948 /* the zero page is needed if a request is "canceled" while the message
1949 * is being written over the socket */
1950 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1951 if (!msgr->zero_page) {
1952 kfree(msgr);
1953 return ERR_PTR(-ENOMEM);
1954 }
1955 kmap(msgr->zero_page);
1956
1957 if (myaddr)
1958 msgr->inst.addr = *myaddr;
1959
1960 /* select a random nonce */
1961 msgr->inst.addr.type = 0;
1962 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1963 encode_my_addr(msgr);
1964
1965 dout("messenger_create %p\n", msgr);
1966 return msgr;
1967}
1968
1969void ceph_messenger_destroy(struct ceph_messenger *msgr)
1970{
1971 dout("destroy %p\n", msgr);
1972 kunmap(msgr->zero_page);
1973 __free_page(msgr->zero_page);
1974 kfree(msgr);
1975 dout("destroyed messenger %p\n", msgr);
1976}
1977
1978/*
1979 * Queue up an outgoing message on the given connection.
1980 */
1981void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1982{
1983 if (test_bit(CLOSED, &con->state)) {
1984 dout("con_send %p closed, dropping %p\n", con, msg);
1985 ceph_msg_put(msg);
1986 return;
1987 }
1988
1989 /* set src+dst */
1990 msg->hdr.src.name = con->msgr->inst.name;
1991 msg->hdr.src.addr = con->msgr->my_enc_addr;
1992 msg->hdr.orig_src = msg->hdr.src;
1993
1994 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1995
1996 msg->needs_out_seq = true;
1997
1998 /* queue */
1999 mutex_lock(&con->mutex);
2000 BUG_ON(!list_empty(&msg->list_head));
2001 list_add_tail(&msg->list_head, &con->out_queue);
2002 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
2003 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
2004 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
2005 le32_to_cpu(msg->hdr.front_len),
2006 le32_to_cpu(msg->hdr.middle_len),
2007 le32_to_cpu(msg->hdr.data_len));
2008 mutex_unlock(&con->mutex);
2009
2010 /* if there wasn't anything waiting to send before, queue
2011 * new work */
2012 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2013 queue_con(con);
2014}
2015
2016/*
2017 * Revoke a message that was previously queued for send
2018 */
2019void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2020{
2021 mutex_lock(&con->mutex);
2022 if (!list_empty(&msg->list_head)) {
2023 dout("con_revoke %p msg %p\n", con, msg);
2024 list_del_init(&msg->list_head);
2025 ceph_msg_put(msg);
2026 msg->hdr.seq = 0;
2027 if (con->out_msg == msg) {
2028 ceph_msg_put(con->out_msg);
2029 con->out_msg = NULL;
2030 }
2031 if (con->out_kvec_is_msg) {
2032 con->out_skip = con->out_kvec_bytes;
2033 con->out_kvec_is_msg = false;
2034 }
2035 } else {
2036 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
2037 }
2038 mutex_unlock(&con->mutex);
2039}
2040
2041/*
2042 * Revoke a message that we may be reading data into
2043 */
2044void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2045{
2046 mutex_lock(&con->mutex);
2047 if (con->in_msg && con->in_msg == msg) {
2048 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2049 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2050 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2051
2052 /* skip rest of message */
2053 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2054 con->in_base_pos = con->in_base_pos -
2055 sizeof(struct ceph_msg_header) -
2056 front_len -
2057 middle_len -
2058 data_len -
2059 sizeof(struct ceph_msg_footer);
2060 ceph_msg_put(con->in_msg);
2061 con->in_msg = NULL;
2062 con->in_tag = CEPH_MSGR_TAG_READY;
2063 con->in_seq++;
2064 } else {
2065 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2066 con, con->in_msg, msg);
2067 }
2068 mutex_unlock(&con->mutex);
2069}
2070
2071/*
2072 * Queue a keepalive byte to ensure the tcp connection is alive.
2073 */
2074void ceph_con_keepalive(struct ceph_connection *con)
2075{
2076 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2077 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2078 queue_con(con);
2079}
2080
2081
2082/*
2083 * construct a new message with given type, size
2084 * the new msg has a ref count of 1.
2085 */
2086struct ceph_msg *ceph_msg_new(int type, int front_len,
2087 int page_len, int page_off, struct page **pages)
2088{
2089 struct ceph_msg *m;
2090
2091 m = kmalloc(sizeof(*m), GFP_NOFS);
2092 if (m == NULL)
2093 goto out;
2094 kref_init(&m->kref);
2095 INIT_LIST_HEAD(&m->list_head);
2096
2097 m->hdr.tid = 0;
2098 m->hdr.type = cpu_to_le16(type);
2099 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2100 m->hdr.version = 0;
2101 m->hdr.front_len = cpu_to_le32(front_len);
2102 m->hdr.middle_len = 0;
2103 m->hdr.data_len = cpu_to_le32(page_len);
2104 m->hdr.data_off = cpu_to_le16(page_off);
2105 m->hdr.reserved = 0;
2106 m->footer.front_crc = 0;
2107 m->footer.middle_crc = 0;
2108 m->footer.data_crc = 0;
2109 m->footer.flags = 0;
2110 m->front_max = front_len;
2111 m->front_is_vmalloc = false;
2112 m->more_to_follow = false;
2113 m->pool = NULL;
2114
2115 /* front */
2116 if (front_len) {
2117 if (front_len > PAGE_CACHE_SIZE) {
2118 m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
2119 PAGE_KERNEL);
2120 m->front_is_vmalloc = true;
2121 } else {
2122 m->front.iov_base = kmalloc(front_len, GFP_NOFS);
2123 }
2124 if (m->front.iov_base == NULL) {
2125 pr_err("msg_new can't allocate %d bytes\n",
2126 front_len);
2127 goto out2;
2128 }
2129 } else {
2130 m->front.iov_base = NULL;
2131 }
2132 m->front.iov_len = front_len;
2133
2134 /* middle */
2135 m->middle = NULL;
2136
2137 /* data */
2138 m->nr_pages = calc_pages_for(page_off, page_len);
2139 m->pages = pages;
2140 m->pagelist = NULL;
2141
2142 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
2143 m->nr_pages);
2144 return m;
2145
2146out2:
2147 ceph_msg_put(m);
2148out:
2149 pr_err("msg_new can't create type %d len %d\n", type, front_len);
2150 return ERR_PTR(-ENOMEM);
2151}
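
/*
 * Illustrative usage sketch (not from this commit): allocate a message
 * with a small front section and queue it on an established connection.
 * ceph_con_send() consumes the caller's reference -- note the explicit
 * ceph_msg_get() ("keep our ref") in mon_client.c when a caller wants
 * to keep the message around after sending.
 */
static void __maybe_unused example_send(struct ceph_connection *con, int type)
{
	struct ceph_msg *msg = ceph_msg_new(type, 16, 0, 0, NULL);

	if (IS_ERR(msg))
		return;
	memset(msg->front.iov_base, 0, msg->front.iov_len);
	ceph_con_send(con, msg);	/* the connection now owns our ref */
}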
2152
2153/*
2154 * Allocate "middle" portion of a message, if it is needed and wasn't
2155 * allocated by alloc_msg. This allows us to read a small fixed-size
2156 * per-type header in the front and then gracefully fail (i.e.,
2157 * propagate the error to the caller based on info in the front) when
2158 * the middle is too large.
2159 */
2160static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2161{
2162 int type = le16_to_cpu(msg->hdr.type);
2163 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2164
2165 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2166 ceph_msg_type_name(type), middle_len);
2167 BUG_ON(!middle_len);
2168 BUG_ON(msg->middle);
2169
2170 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2171 if (!msg->middle)
2172 return -ENOMEM;
2173 return 0;
2174}
2175
2176/*
2177 * Generic message allocator, for incoming messages.
2178 */
2179static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2180 struct ceph_msg_header *hdr,
2181 int *skip)
2182{
2183 int type = le16_to_cpu(hdr->type);
2184 int front_len = le32_to_cpu(hdr->front_len);
2185 int middle_len = le32_to_cpu(hdr->middle_len);
2186 struct ceph_msg *msg = NULL;
2187 int ret;
2188
2189 if (con->ops->alloc_msg) {
2190 mutex_unlock(&con->mutex);
2191 msg = con->ops->alloc_msg(con, hdr, skip);
2192 mutex_lock(&con->mutex);
2193 if (IS_ERR(msg))
2194 return msg;
2195
2196 if (*skip)
2197 return NULL;
2198 }
2199 if (!msg) {
2200 *skip = 0;
2201 msg = ceph_msg_new(type, front_len, 0, 0, NULL);
2202 if (!msg) {
2203 pr_err("unable to allocate msg type %d len %d\n",
2204 type, front_len);
2205 return ERR_PTR(-ENOMEM);
2206 }
2207 }
2208 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2209
2210 if (middle_len) {
2211 ret = ceph_alloc_middle(con, msg);
2212
2213 if (ret < 0) {
2214 ceph_msg_put(msg);
2215			return ERR_PTR(ret);	/* msg was just put; don't return it */
2216 }
2217 }
2218
2219 return msg;
2220}
2221
2222
2223/*
2224 * Free a generically kmalloc'd message.
2225 */
2226void ceph_msg_kfree(struct ceph_msg *m)
2227{
2228 dout("msg_kfree %p\n", m);
2229 if (m->front_is_vmalloc)
2230 vfree(m->front.iov_base);
2231 else
2232 kfree(m->front.iov_base);
2233 kfree(m);
2234}
2235
2236/*
2237 * Drop a msg ref. Destroy as needed.
2238 */
2239void ceph_msg_last_put(struct kref *kref)
2240{
2241 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2242
2243 dout("ceph_msg_put last one on %p\n", m);
2244 WARN_ON(!list_empty(&m->list_head));
2245
2246 /* drop middle, data, if any */
2247 if (m->middle) {
2248 ceph_buffer_put(m->middle);
2249 m->middle = NULL;
2250 }
2251 m->nr_pages = 0;
2252 m->pages = NULL;
2253
2254 if (m->pagelist) {
2255 ceph_pagelist_release(m->pagelist);
2256 kfree(m->pagelist);
2257 m->pagelist = NULL;
2258 }
2259
2260 if (m->pool)
2261 ceph_msgpool_put(m->pool, m);
2262 else
2263 ceph_msg_kfree(m);
2264}
2265
2266void ceph_msg_dump(struct ceph_msg *msg)
2267{
2268 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2269 msg->front_max, msg->nr_pages);
2270 print_hex_dump(KERN_DEBUG, "header: ",
2271 DUMP_PREFIX_OFFSET, 16, 1,
2272 &msg->hdr, sizeof(msg->hdr), true);
2273 print_hex_dump(KERN_DEBUG, " front: ",
2274 DUMP_PREFIX_OFFSET, 16, 1,
2275 msg->front.iov_base, msg->front.iov_len, true);
2276 if (msg->middle)
2277 print_hex_dump(KERN_DEBUG, "middle: ",
2278 DUMP_PREFIX_OFFSET, 16, 1,
2279 msg->middle->vec.iov_base,
2280 msg->middle->vec.iov_len, true);
2281 print_hex_dump(KERN_DEBUG, "footer: ",
2282 DUMP_PREFIX_OFFSET, 16, 1,
2283 &msg->footer, sizeof(msg->footer), true);
2284}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..a5caf91cc971
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,256 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
 43	/* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
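/* usage example: dout("to %s%lld\n", ENTITY_NAME(con->peer_name)); */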
56
57struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */
59 struct ceph_entity_addr my_enc_addr;
60 struct page *zero_page; /* used in certain error cases */
61
62 bool nocrc;
63
64 /*
 65	 * the global_seq counts connections we (attempt to) initiate
66 * in order to disambiguate certain connect race conditions.
67 */
68 u32 global_seq;
69 spinlock_t global_seq_lock;
70};
71
72/*
73 * a single message. it contains a header (src, dest, message type, etc.),
74 * footer (crc values, mainly), a "front" message body, and possibly a
75 * data payload (stored in some number of pages).
76 */
77struct ceph_msg {
78 struct ceph_msg_header hdr; /* header */
79 struct ceph_msg_footer footer; /* footer */
80 struct kvec front; /* unaligned blobs of message */
81 struct ceph_buffer *middle;
82 struct page **pages; /* data payload. NOT OWNER. */
83 unsigned nr_pages; /* size of page array */
84 struct ceph_pagelist *pagelist; /* instead of pages */
85 struct list_head list_head;
86 struct kref kref;
87 bool front_is_vmalloc;
88 bool more_to_follow;
89 bool needs_out_seq;
90 int front_max;
91
92 struct ceph_msgpool *pool;
93};
94
95struct ceph_msg_pos {
96 int page, page_pos; /* which page; offset in page */
97 int data_pos; /* offset in data payload */
98 int did_page_crc; /* true if we've calculated crc for current page */
99};
100
101/* ceph connection fault delay defaults, for exponential backoff */
102#define BASE_DELAY_INTERVAL (HZ/2)
103#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
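
/*
 * A sketch of the backoff step these bounds imply (the actual logic
 * lives in the messenger's fault path): double the delay after each
 * failure, clamped to the maximum.
 */
static inline unsigned long ceph_backoff_delay(unsigned long delay)
{
	delay = delay ? delay * 2 : BASE_DELAY_INTERVAL;
	return min(delay, (unsigned long)MAX_DELAY_INTERVAL);
}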
104
105/*
106 * ceph_connection state bit flags
107 *
108 * QUEUED and BUSY are used together to ensure that only a single
109 * thread is currently opening, reading or writing data to the socket.
110 */
111#define LOSSYTX 0 /* we can close channel or drop messages on errors */
112#define CONNECTING 1
113#define NEGOTIATING 2
114#define KEEPALIVE_PENDING 3
115#define WRITE_PENDING 4 /* we have data ready to send */
116#define QUEUED 5 /* there is work queued on this connection */
117#define BUSY 6 /* work is being done */
118#define STANDBY 8 /* no outgoing messages, socket closed. we keep
119 * the ceph_connection around to maintain shared
120 * state with the peer. */
121#define CLOSED 10 /* we've closed the connection */
122#define SOCK_CLOSED 11 /* socket state changed to closed */
123#define OPENING 13 /* open connection w/ (possibly new) peer */
124#define DEAD 14 /* dead, about to kfree */
125
126/*
127 * A single connection with another host.
128 *
129 * We maintain a queue of outgoing messages, and some session state to
130 * ensure that we can preserve the lossless, ordered delivery of
131 * messages in the case of a TCP disconnect.
132 */
133struct ceph_connection {
134 void *private;
135 atomic_t nref;
136
137 const struct ceph_connection_operations *ops;
138
139 struct ceph_messenger *msgr;
140 struct socket *sock;
141 unsigned long state; /* connection state (see flags above) */
142 const char *error_msg; /* error message, if any */
143
144 struct ceph_entity_addr peer_addr; /* peer address */
145 struct ceph_entity_name peer_name; /* peer name */
146 struct ceph_entity_addr peer_addr_for_me;
147	u32 connect_seq;      /* identify the most recent connection
148				 attempt for this session (client-side counter) */
149 u32 peer_global_seq; /* peer's global seq for this connection */
150
151 int auth_retry; /* true if we need a newer authorizer */
152 void *auth_reply_buf; /* where to put the authorizer reply */
153 int auth_reply_buf_len;
154
155 struct mutex mutex;
156
157 /* out queue */
158 struct list_head out_queue;
159 struct list_head out_sent; /* sending or sent but unacked */
160 u64 out_seq; /* last message queued for send */
161 u64 out_seq_sent; /* last message sent */
162 bool out_keepalive_pending;
163
164 u64 in_seq, in_seq_acked; /* last message received, acked */
165
166 /* connection negotiation temps */
167 char in_banner[CEPH_BANNER_MAX_LEN];
168 union {
169 struct { /* outgoing connection */
170 struct ceph_msg_connect out_connect;
171 struct ceph_msg_connect_reply in_reply;
172 };
173 struct { /* incoming */
174 struct ceph_msg_connect in_connect;
175 struct ceph_msg_connect_reply out_reply;
176 };
177 };
178 struct ceph_entity_addr actual_peer_addr;
179
180 /* message out temps */
181 struct ceph_msg *out_msg; /* sending message (== tail of
182 out_sent) */
183 bool out_msg_done;
184 struct ceph_msg_pos out_msg_pos;
185
186 struct kvec out_kvec[8], /* sending header/footer data */
187 *out_kvec_cur;
188 int out_kvec_left; /* kvec's left in out_kvec */
189 int out_skip; /* skip this many bytes */
190 int out_kvec_bytes; /* total bytes left */
191 bool out_kvec_is_msg; /* kvec refers to out_msg */
192 int out_more; /* there is more data after the kvecs */
193 __le64 out_temp_ack; /* for writing an ack */
194
195 /* message in temps */
196 struct ceph_msg_header in_hdr;
197 struct ceph_msg *in_msg;
198 struct ceph_msg_pos in_msg_pos;
199 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
200
201 char in_tag; /* protocol control byte */
202 int in_base_pos; /* bytes read */
203 __le64 in_temp_ack; /* for reading an ack */
204
205 struct delayed_work work; /* send|recv work */
206 unsigned long delay; /* current delay interval */
207};
208
209
210extern const char *pr_addr(const struct sockaddr_storage *ss);
211extern int ceph_parse_ips(const char *c, const char *end,
212 struct ceph_entity_addr *addr,
213 int max_count, int *count);
214
215
216extern int ceph_msgr_init(void);
217extern void ceph_msgr_exit(void);
218
219extern struct ceph_messenger *ceph_messenger_create(
220 struct ceph_entity_addr *myaddr);
221extern void ceph_messenger_destroy(struct ceph_messenger *);
222
223extern void ceph_con_init(struct ceph_messenger *msgr,
224 struct ceph_connection *con);
225extern void ceph_con_open(struct ceph_connection *con,
226 struct ceph_entity_addr *addr);
227extern bool ceph_con_opened(struct ceph_connection *con);
228extern void ceph_con_close(struct ceph_connection *con);
229extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
231extern void ceph_con_revoke_message(struct ceph_connection *con,
232 struct ceph_msg *msg);
233extern void ceph_con_keepalive(struct ceph_connection *con);
234extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
235extern void ceph_con_put(struct ceph_connection *con);
236
237extern struct ceph_msg *ceph_msg_new(int type, int front_len,
238 int page_len, int page_off,
239 struct page **pages);
240extern void ceph_msg_kfree(struct ceph_msg *m);
241
242
243static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
244{
245 kref_get(&msg->kref);
246 return msg;
247}
248extern void ceph_msg_last_put(struct kref *kref);
249static inline void ceph_msg_put(struct ceph_msg *msg)
250{
251 kref_put(&msg->kref, ceph_msg_last_put);
252}
253
254extern void ceph_msg_dump(struct ceph_msg *msg);
255
256#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..8fdc011ca956
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,835 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
 18 * A small cluster of Ceph "monitors" is responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
 31static const struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
60
61 if (num_mon >= CEPH_MAX_MON)
62 goto bad;
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth);
109}
110
111/*
112 * Close monitor session, if any.
113 */
114static void __close_session(struct ceph_mon_client *monc)
115{
116 if (monc->con) {
117 dout("__close_session closing mon%d\n", monc->cur_mon);
118 ceph_con_revoke(monc->con, monc->m_auth);
119 ceph_con_close(monc->con);
120 monc->cur_mon = -1;
121 monc->pending_auth = 0;
122 ceph_auth_reset(monc->auth);
123 }
124}
125
126/*
127 * Open a session with a (new) monitor.
128 */
129static int __open_session(struct ceph_mon_client *monc)
130{
131	u8 r;	/* unsigned, so the modulo below can't go negative */
132 int ret;
133
134 if (monc->cur_mon < 0) {
135 get_random_bytes(&r, 1);
136 monc->cur_mon = r % monc->monmap->num_mon;
137 dout("open_session num=%d r=%d -> mon%d\n",
138 monc->monmap->num_mon, r, monc->cur_mon);
139 monc->sub_sent = 0;
140 monc->sub_renew_after = jiffies; /* i.e., expired */
141 monc->want_next_osdmap = !!monc->want_next_osdmap;
142
143 dout("open_session mon%d opening\n", monc->cur_mon);
144 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
145 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
146 ceph_con_open(monc->con,
147 &monc->monmap->mon_inst[monc->cur_mon].addr);
148
149		/* initiate authentication handshake */
150 ret = ceph_auth_build_hello(monc->auth,
151 monc->m_auth->front.iov_base,
152 monc->m_auth->front_max);
153 __send_prepared_auth_request(monc, ret);
154 } else {
155 dout("open_session mon%d already open\n", monc->cur_mon);
156 }
157 return 0;
158}
159
160static bool __sub_expired(struct ceph_mon_client *monc)
161{
162 return time_after_eq(jiffies, monc->sub_renew_after);
163}
164
165/*
166 * Reschedule delayed work timer.
167 */
168static void __schedule_delayed(struct ceph_mon_client *monc)
169{
170 unsigned delay;
171
172 if (monc->cur_mon < 0 || __sub_expired(monc))
173 delay = 10 * HZ;
174 else
175 delay = 20 * HZ;
176 dout("__schedule_delayed after %u\n", delay);
177 schedule_delayed_work(&monc->delayed_work, delay);
178}
179
180/*
181 * Send subscribe request for mdsmap and/or osdmap.
182 */
183static void __send_subscribe(struct ceph_mon_client *monc)
184{
185 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
186 (unsigned)monc->sub_sent, __sub_expired(monc),
187 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg;
191 struct ceph_mon_subscribe_item *i;
192 void *p, *end;
193
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (!msg)
196 return;
197
198 p = msg->front.iov_base;
199 end = p + msg->front.iov_len;
200
201 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap);
203 if (monc->want_next_osdmap) {
204 dout("__send_subscribe to 'osdmap' %u\n",
205 (unsigned)monc->have_osdmap);
206 ceph_encode_32(&p, 3);
207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap);
210 i->onetime = 1;
211 p += sizeof(*i);
212 monc->want_next_osdmap = 2; /* requested */
213 } else {
214 ceph_encode_32(&p, 2);
215 }
216 ceph_encode_string(&p, end, "mdsmap", 6);
217 i = p;
218 i->have = cpu_to_le64(monc->have_mdsmap);
219 i->onetime = 0;
220 p += sizeof(*i);
221 ceph_encode_string(&p, end, "monmap", 6);
222 i = p;
223 i->have = 0;
224 i->onetime = 0;
225 p += sizeof(*i);
226
227 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg);
230
231 monc->sub_sent = jiffies | 1; /* never 0 */
232 }
233}
234
235static void handle_subscribe_ack(struct ceph_mon_client *monc,
236 struct ceph_msg *msg)
237{
238 unsigned seconds;
239 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
240
241 if (msg->front.iov_len < sizeof(*h))
242 goto bad;
243 seconds = le32_to_cpu(h->duration);
244
245 mutex_lock(&monc->mutex);
246 if (monc->hunting) {
247 pr_info("mon%d %s session established\n",
248 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
249 monc->hunting = false;
250 }
251 dout("handle_subscribe_ack after %d seconds\n", seconds);
252 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
253 monc->sub_sent = 0;
254 mutex_unlock(&monc->mutex);
255 return;
256bad:
257 pr_err("got corrupt subscribe-ack msg\n");
258 ceph_msg_dump(msg);
259}
260
261/*
262 * Keep track of which maps we have
263 */
264int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
265{
266 mutex_lock(&monc->mutex);
267 monc->have_mdsmap = got;
268 mutex_unlock(&monc->mutex);
269 return 0;
270}
271
272int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
273{
274 mutex_lock(&monc->mutex);
275 monc->have_osdmap = got;
276 monc->want_next_osdmap = 0;
277 mutex_unlock(&monc->mutex);
278 return 0;
279}
280
281/*
282 * Register interest in the next osdmap
283 */
284void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
285{
286 dout("request_next_osdmap have %u\n", monc->have_osdmap);
287 mutex_lock(&monc->mutex);
288 if (!monc->want_next_osdmap)
289 monc->want_next_osdmap = 1;
290 if (monc->want_next_osdmap < 2)
291 __send_subscribe(monc);
292 mutex_unlock(&monc->mutex);
293}
294
295/*
296 *
297 */
298int ceph_monc_open_session(struct ceph_mon_client *monc)
299{
300 if (!monc->con) {
301 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
302 if (!monc->con)
303 return -ENOMEM;
304 ceph_con_init(monc->client->msgr, monc->con);
305 monc->con->private = monc;
306 monc->con->ops = &mon_con_ops;
307 }
308
309 mutex_lock(&monc->mutex);
310 __open_session(monc);
311 __schedule_delayed(monc);
312 mutex_unlock(&monc->mutex);
313 return 0;
314}
315
316/*
317 * The monitor responds with a mount ack to indicate mount success.  The
318 * included client ticket allows the client to talk to MDSs and OSDs.
319 */
320static void ceph_monc_handle_map(struct ceph_mon_client *monc,
321 struct ceph_msg *msg)
322{
323 struct ceph_client *client = monc->client;
324 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
325 void *p, *end;
326
327 mutex_lock(&monc->mutex);
328
329 dout("handle_monmap\n");
330 p = msg->front.iov_base;
331 end = p + msg->front.iov_len;
332
333 monmap = ceph_monmap_decode(p, end);
334 if (IS_ERR(monmap)) {
335 pr_err("problem decoding monmap, %d\n",
336 (int)PTR_ERR(monmap));
337 goto out;
338 }
339
340 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
341 kfree(monmap);
342 goto out;
343 }
344
345 client->monc.monmap = monmap;
346 kfree(old);
347
348out:
349 mutex_unlock(&monc->mutex);
350 wake_up(&client->auth_wq);
351}
352
353/*
354 * statfs
355 */
356static struct ceph_mon_statfs_request *__lookup_statfs(
357 struct ceph_mon_client *monc, u64 tid)
358{
359 struct ceph_mon_statfs_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node;
361
362 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node);
364 if (tid < req->tid)
365 n = n->rb_left;
366 else if (tid > req->tid)
367 n = n->rb_right;
368 else
369 return req;
370 }
371 return NULL;
372}
373
374static void __insert_statfs(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new)
376{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node;
378 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL;
380
381 while (*p) {
382 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node);
384 if (new->tid < req->tid)
385 p = &(*p)->rb_left;
386 else if (new->tid > req->tid)
387 p = &(*p)->rb_right;
388 else
389 BUG();
390 }
391
392 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree);
394}
395
396static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg)
398{
399 struct ceph_mon_statfs_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid;
402
403 if (msg->front.iov_len != sizeof(*reply))
404 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407
408 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid);
410 if (req) {
411 *req->buf = reply->st;
412 req->result = 0;
413 }
414 mutex_unlock(&monc->mutex);
415 if (req)
416 complete(&req->completion);
417 return;
418
419bad:
420 pr_err("corrupt statfs reply, no tid\n");
421 ceph_msg_dump(msg);
422}
423
424/*
425 * (re)send a statfs request
426 */
427static int send_statfs(struct ceph_mon_client *monc,
428 struct ceph_mon_statfs_request *req)
429{
430 struct ceph_msg *msg;
431 struct ceph_mon_statfs *h;
432
433 dout("send_statfs tid %llu\n", req->tid);
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
435 if (IS_ERR(msg))
436 return PTR_ERR(msg);
437 req->request = msg;
438 msg->hdr.tid = cpu_to_le64(req->tid);
439 h = msg->front.iov_base;
440 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463
464 /* register request */
465 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid;
467 req.last_attempt = jiffies;
468 req.delay = BASE_DELAY_INTERVAL;
469 __insert_statfs(monc, &req);
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex);
472
473 /* send request and wait */
474 err = send_statfs(monc, &req);
475 if (!err)
476 err = wait_for_completion_interruptible(&req.completion);
477
478 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree);
480 monc->num_statfs_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex);
483
484 if (!err)
485 err = req.result;
486 return err;
487}
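
/*
 * Sketch of a caller (the real hook is the filesystem ->statfs handler
 * in super.c): the monitor round trip above is fully synchronous, so a
 * caller may simply block on it.
 */
static int __maybe_unused example_statfs(struct ceph_mon_client *monc)
{
	struct ceph_statfs st;
	int err = ceph_monc_do_statfs(monc, &st);

	if (err < 0)
		return err;	/* send failure, or interrupted wait */
	/* st now holds the cluster-wide usage totals from the reply */
	return 0;
}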
488
489/*
490 * Resend pending statfs requests.
491 */
492static void __resend_statfs(struct ceph_mon_client *monc)
493{
494 struct ceph_mon_statfs_request *req;
495 struct rb_node *p;
496
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node);
499 send_statfs(monc, req);
500 }
501}
502
503/*
504 * Delayed work. If we haven't mounted yet, retry. Otherwise,
505 * renew/retry subscription as needed (in case it is timing out, or we
506 * got an ENOMEM). And keep the monitor connection alive.
507 */
508static void delayed_work(struct work_struct *work)
509{
510 struct ceph_mon_client *monc =
511 container_of(work, struct ceph_mon_client, delayed_work.work);
512
513 dout("monc delayed_work\n");
514 mutex_lock(&monc->mutex);
515 if (monc->hunting) {
516 __close_session(monc);
517 __open_session(monc); /* continue hunting */
518 } else {
519 ceph_con_keepalive(monc->con);
520
521 __validate_auth(monc);
522
523 if (monc->auth->ops->is_authenticated(monc->auth))
524 __send_subscribe(monc);
525 }
526 __schedule_delayed(monc);
527 mutex_unlock(&monc->mutex);
528}
529
530/*
531 * On startup, we build a temporary monmap populated with the IPs
532 * provided by mount(2).
533 */
534static int build_initial_monmap(struct ceph_mon_client *monc)
535{
536 struct ceph_mount_args *args = monc->client->mount_args;
537 struct ceph_entity_addr *mon_addr = args->mon_addr;
538 int num_mon = args->num_mon;
539 int i;
540
541 /* build initial monmap */
542 monc->monmap = kzalloc(sizeof(*monc->monmap) +
543 num_mon*sizeof(monc->monmap->mon_inst[0]),
544 GFP_KERNEL);
545 if (!monc->monmap)
546 return -ENOMEM;
547 for (i = 0; i < num_mon; i++) {
548 monc->monmap->mon_inst[i].addr = mon_addr[i];
549 monc->monmap->mon_inst[i].addr.nonce = 0;
550 monc->monmap->mon_inst[i].name.type =
551 CEPH_ENTITY_TYPE_MON;
552 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
553 }
554 monc->monmap->num_mon = num_mon;
555 monc->have_fsid = false;
556
557 /* release addr memory */
558 kfree(args->mon_addr);
559 args->mon_addr = NULL;
560 args->num_mon = 0;
561 return 0;
562}
563
564int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
565{
566 int err = 0;
567
568 dout("init\n");
569 memset(monc, 0, sizeof(*monc));
570 monc->client = cl;
571 monc->monmap = NULL;
572 mutex_init(&monc->mutex);
573
574 err = build_initial_monmap(monc);
575 if (err)
576 goto out;
577
578 monc->con = NULL;
579
580 /* authentication */
581 monc->auth = ceph_auth_init(cl->mount_args->name,
582 cl->mount_args->secret);
583	if (IS_ERR(monc->auth))
584		{ err = PTR_ERR(monc->auth); goto out_monmap; }
585 monc->auth->want_keys =
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588
589 /* msg pools */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
591 sizeof(struct ceph_mon_subscribe_ack), 1, false);
592 if (err < 0)
593 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
595 sizeof(struct ceph_mon_statfs_reply), 0, false);
596 if (err < 0)
597 goto out_pool1;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
599 if (err < 0)
600 goto out_pool2;
601
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
603 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) {
605 err = PTR_ERR(monc->m_auth);
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609
610 monc->cur_mon = -1;
611 monc->hunting = true;
612 monc->sub_renew_after = jiffies;
613 monc->sub_sent = 0;
614
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0;
618 monc->last_tid = 0;
619
620 monc->have_mdsmap = 0;
621 monc->have_osdmap = 0;
622 monc->want_next_osdmap = 1;
623 return 0;
624
625out_pool3:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
627out_pool2:
628	ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
629out_pool1:
630	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
631out_monmap:
632 kfree(monc->monmap);
633out:
634 return err;
635}
636
637void ceph_monc_stop(struct ceph_mon_client *monc)
638{
639 dout("stop\n");
640 cancel_delayed_work_sync(&monc->delayed_work);
641
642 mutex_lock(&monc->mutex);
643 __close_session(monc);
644 if (monc->con) {
645 monc->con->private = NULL;
646 monc->con->ops->put(monc->con);
647 monc->con = NULL;
648 }
649 mutex_unlock(&monc->mutex);
650
651 ceph_auth_destroy(monc->auth);
652
653 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
657
658 kfree(monc->monmap);
659}
660
661static void handle_auth_reply(struct ceph_mon_client *monc,
662 struct ceph_msg *msg)
663{
664 int ret;
665
666 mutex_lock(&monc->mutex);
667 monc->pending_auth = 0;
668 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
669 msg->front.iov_len,
670 monc->m_auth->front.iov_base,
671 monc->m_auth->front_max);
672 if (ret < 0) {
673 monc->client->auth_err = ret;
674 wake_up(&monc->client->auth_wq);
675 } else if (ret > 0) {
676 __send_prepared_auth_request(monc, ret);
677 } else if (monc->auth->ops->is_authenticated(monc->auth)) {
678 dout("authenticated, starting session\n");
679
680 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
681 monc->client->msgr->inst.name.num = monc->auth->global_id;
682
683 __send_subscribe(monc);
684 __resend_statfs(monc);
685 }
686 mutex_unlock(&monc->mutex);
687}
688
689static int __validate_auth(struct ceph_mon_client *monc)
690{
691 int ret;
692
693 if (monc->pending_auth)
694 return 0;
695
696 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
697 monc->m_auth->front_max);
698 if (ret <= 0)
699 return ret; /* either an error, or no need to authenticate */
700 __send_prepared_auth_request(monc, ret);
701 return 0;
702}
703
704int ceph_monc_validate_auth(struct ceph_mon_client *monc)
705{
706 int ret;
707
708 mutex_lock(&monc->mutex);
709 ret = __validate_auth(monc);
710 mutex_unlock(&monc->mutex);
711 return ret;
712}
713
714/*
715 * handle incoming message
716 */
717static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
718{
719 struct ceph_mon_client *monc = con->private;
720 int type = le16_to_cpu(msg->hdr.type);
721
722 if (!monc)
723 return;
724
725 switch (type) {
726 case CEPH_MSG_AUTH_REPLY:
727 handle_auth_reply(monc, msg);
728 break;
729
730 case CEPH_MSG_MON_SUBSCRIBE_ACK:
731 handle_subscribe_ack(monc, msg);
732 break;
733
734 case CEPH_MSG_STATFS_REPLY:
735 handle_statfs_reply(monc, msg);
736 break;
737
738 case CEPH_MSG_MON_MAP:
739 ceph_monc_handle_map(monc, msg);
740 break;
741
742 case CEPH_MSG_MDS_MAP:
743 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
744 break;
745
746 case CEPH_MSG_OSD_MAP:
747 ceph_osdc_handle_map(&monc->client->osdc, msg);
748 break;
749
750 default:
751 pr_err("received unknown message type %d %s\n", type,
752 ceph_msg_type_name(type));
753 }
754 ceph_msg_put(msg);
755}
756
757/*
758 * Allocate memory for incoming message
759 */
760static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
761 struct ceph_msg_header *hdr,
762 int *skip)
763{
764 struct ceph_mon_client *monc = con->private;
765 int type = le16_to_cpu(hdr->type);
766 int front_len = le32_to_cpu(hdr->front_len);
767 struct ceph_msg *m = NULL;
768
769 *skip = 0;
770
771 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
774 break;
775 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
777 break;
778 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
780 break;
781 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL);
785 break;
786 }
787
788 if (!m) {
789 pr_info("alloc_msg unknown type %d\n", type);
790 *skip = 1;
791 }
792 return m;
793}
794
795/*
796 * If the monitor connection resets, pick a new monitor and resubmit
797 * any pending requests.
798 */
799static void mon_fault(struct ceph_connection *con)
800{
801 struct ceph_mon_client *monc = con->private;
802
803 if (!monc)
804 return;
805
806 dout("mon_fault\n");
807 mutex_lock(&monc->mutex);
808 if (!con->private)
809 goto out;
810
811 if (monc->con && !monc->hunting)
812 pr_info("mon%d %s session lost, "
813 "hunting for new mon\n", monc->cur_mon,
814 pr_addr(&monc->con->peer_addr.in_addr));
815
816 __close_session(monc);
817 if (!monc->hunting) {
818 /* start hunting */
819 monc->hunting = true;
820 __open_session(monc);
821 } else {
822 /* already hunting, let's wait a bit */
823 __schedule_delayed(monc);
824 }
825out:
826 mutex_unlock(&monc->mutex);
827}
828
829static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get,
831 .put = ceph_con_put,
832 .dispatch = dispatch,
833 .fault = mon_fault,
834 .alloc_msg = mon_alloc_msg,
835};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..b958ad5afa06
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/rbtree.h>
6
7#include "messenger.h"
8#include "msgpool.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_statfs_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * statfs() is done a bit differently because we need to get data back
44 * to the caller
45 */
46struct ceph_mon_statfs_request {
47 u64 tid;
48 struct rb_node node;
49 int result;
50 struct ceph_statfs *buf;
51 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */
54};
55
56struct ceph_mon_client {
57 struct ceph_client *client;
58 struct ceph_monmap *monmap;
59
60 struct mutex mutex;
61 struct delayed_work delayed_work;
62
63 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth;
65 int pending_auth;
66
67 bool hunting;
 68	int cur_mon;                    /* last monitor we contacted */
69 unsigned long sub_sent, sub_renew_after;
70 struct ceph_connection *con;
71 bool have_fsid;
72
73 /* msg pools */
74 struct ceph_msgpool msgpool_subscribe_ack;
75 struct ceph_msgpool msgpool_statfs_reply;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid;
82
83 /* mds/osd map */
84 int want_next_osdmap; /* 1 = want, 2 = want+asked */
85 u32 have_osdmap, have_mdsmap;
86
87#ifdef CONFIG_DEBUG_FS
88 struct dentry *debugfs_file;
89#endif
90};
91
92extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
93extern int ceph_monmap_contains(struct ceph_monmap *m,
94 struct ceph_entity_addr *addr);
95
96extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
97extern void ceph_monc_stop(struct ceph_mon_client *monc);
98
99/*
100 * The model here is to indicate that we need a new map of at least
101 * epoch @want, and also call in when we receive a map. We will
102 * periodically rerequest the map from the monitor cluster until we
103 * get what we want.
104 */
105extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
106extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
107
108extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
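
/*
 * Illustrative cycle for the model described above:
 *
 *	ceph_monc_request_next_osdmap(monc);     subscribe to a newer map
 *	... a CEPH_MSG_OSD_MAP message arrives; epoch E is decoded ...
 *	ceph_monc_got_osdmap(monc, E);           record what we now have
 */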
109
110extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
111 struct ceph_statfs *buf);
112
113extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116
117
118
119#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..ca3b44a89f2d
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10/*
11 * We use msg pools to preallocate memory for messages we expect to
12 * receive over the wire, to avoid getting ourselves into OOM
 13 * conditions at unexpected times.  We use a few different
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
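
/*
 * Typical request/response reservation (a sketch; compare
 * ceph_monc_do_statfs() in mon_client.c):
 *
 *	ceph_msgpool_resv(&pool, 1);     reserve the reply buffer up front
 *	... send the request; the reply is allocated via ceph_msgpool_get() ...
 *	ceph_msgpool_resv(&pool, -1);    drop the reservation when done
 */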
31
32
33/*
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{
38 struct ceph_msg *msg;
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61}
62
63int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking)
65{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs);
72 pool->num = 0;
73 pool->min = min;
74 pool->blocking = blocking;
75 init_waitqueue_head(&pool->wait);
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81}
82
83void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{
85 dout("msgpool_destroy %p\n", pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90}
91
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
93{
94 int ret;
95
96 spin_lock(&pool->lock);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1);
113
114 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 }
129
130 while (1) {
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163}
164
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{
167 spin_lock(&pool->lock);
168 if (pool->num < pool->min) {
169 /* reset msg front_len; user may have changed it */
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172
173 kref_set(&msg->kref, 1); /* retake a single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include "messenger.h"
5
6/*
7 * we use memory pools for preallocating messages we may receive, to
8 * avoid unexpected OOM conditions.
9 */
10struct ceph_msgpool {
11 spinlock_t lock;
12 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17};
18
19extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
26
27#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..8aaab414f3f8
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
 20 * tcp connection banner.  include a protocol version, and adjust it
21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
 30 * Comparator returns a negative, zero, or positive value (strcmp-style).
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
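
/*
 * Rollover example: with a = 1 and b = 0xffffffff the unsigned values
 * straddle the wrap point, but (__s32)a - (__s32)b = 1 - (-1) = 2 > 0,
 * so a is correctly ordered after b.
 */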
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20
55
56#define CEPH_ENTITY_TYPE_ANY 0xFF
57
58extern const char *ceph_entity_type_name(int type);
59
60/*
61 * entity_addr -- network address
62 */
63struct ceph_entity_addr {
64 __le32 type;
65 __le32 nonce; /* unique id for process (e.g. pid) */
66 struct sockaddr_storage in_addr;
67} __attribute__ ((packed));
68
69struct ceph_entity_inst {
70 struct ceph_entity_name name;
71 struct ceph_entity_addr addr;
72} __attribute__ ((packed));
73
74
75/* used by message exchange protocol */
76#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
77#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
78#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
79 incoming connection */
80#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
81 with higher cseq */
82#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
83 with higher gseq */
84#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
85#define CEPH_MSGR_TAG_MSG 7 /* message */
86#define CEPH_MSGR_TAG_ACK 8 /* message ack */
87#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
88#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
89#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
90#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
91
92
93/*
94 * connection negotiation
95 */
96struct ceph_msg_connect {
97 __le64 features; /* supported feature bits */
98 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
99 __le32 global_seq; /* count connections initiated by this host */
100 __le32 connect_seq; /* count connections initiated in this session */
101 __le32 protocol_version;
102 __le32 authorizer_protocol;
103 __le32 authorizer_len;
104 __u8 flags; /* CEPH_MSG_CONNECT_* */
105} __attribute__ ((packed));
106
107struct ceph_msg_connect_reply {
108 __u8 tag;
109 __le64 features; /* feature bits for this session */
110 __le32 global_seq;
111 __le32 connect_seq;
112 __le32 protocol_version;
113 __le32 authorizer_len;
114 __u8 flags;
115} __attribute__ ((packed));
116
117#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
118
119
120/*
121 * message header
122 */
123struct ceph_msg_header {
124 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */
126 __le16 type; /* message type */
127 __le16 priority; /* priority. higher value == higher priority */
128 __le16 version; /* version of message encoding */
129
130 __le32 front_len; /* bytes in main payload */
131 __le32 middle_len;/* bytes in middle payload */
132 __le32 data_len; /* bytes of data payload */
133 __le16 data_off; /* sender: include full offset;
134 receiver: mask against ~PAGE_MASK */
135
136 struct ceph_entity_inst src, orig_src;
137 __le32 reserved;
138 __le32 crc; /* header crc32c */
139} __attribute__ ((packed));
140
141#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196
144#define CEPH_MSG_PRIO_HIGHEST 255
145
146/*
147 * follows data payload
148 */
149struct ceph_msg_footer {
150 __le32 front_crc, middle_crc, data_crc;
151 __u8 flags;
152} __attribute__ ((packed));
153
154#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
155#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
156
157
158#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..3514f71ff85f
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1564 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
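
/*
 * Worked example (assuming the default 4 MB object size): ino 0x1000
 * with off = 5 MB lands in block 1 at in-object offset 1 MB, so the
 * oid becomes "1000.00000001" per the "%llx.%08llx" format above.
 */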
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
168 if (IS_ERR(msg)) {
169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg));
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
182 if (IS_ERR(msg)) {
183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg));
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
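
/*
 * Illustrative usage sketch (an annotation, not part of the original
 * file; ceph_osdc_readpages() later in this file is the in-tree example
 * this is modeled on, and osdc/layout/vino/off/len/pages are assumed to
 * be in scope):
 *
 *	struct ceph_osd_request *req;
 *	int rc;
 *
 *	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
 *				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 *				    NULL, 0, truncate_seq, truncate_size,
 *				    NULL, false, 1);
 *	if (IS_ERR(req))
 *		return PTR_ERR(req);
 *	req->r_pages = pages;
 *	req->r_num_pages = calc_pages_for(off, len);
 *	rc = ceph_osdc_start_request(osdc, req, false);
 *	if (!rc)
 *		rc = ceph_osdc_wait_request(osdc, req);
 *	ceph_osdc_put_request(req);
 */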
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req, *best = NULL;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 /* remember candidate; a smaller fit may sit in the left subtree */
293 best = req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return best;
302}
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
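
/*
 * Annotation (not in the original file): atomic_inc_not_zero() makes a
 * lookup racing with the final put_osd() fail cleanly instead of
 * resurrecting an osd whose refcount already reached zero; callers must
 * handle the NULL return, as get_osd_con() at the bottom of this file
 * does.
 */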
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref))
365 kfree(osd);
366}
367
368/*
369 * remove an osd from our map
370 */
371static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
372{
373 dout("__remove_osd %p\n", osd);
374 BUG_ON(!list_empty(&osd->o_requests));
375 rb_erase(&osd->o_node, &osdc->osds);
376 list_del_init(&osd->o_osd_lru);
377 ceph_con_close(&osd->o_con);
378 put_osd(osd);
379}
380
381static void __move_osd_to_lru(struct ceph_osd_client *osdc,
382 struct ceph_osd *osd)
383{
384 dout("__move_osd_to_lru %p\n", osd);
385 BUG_ON(!list_empty(&osd->o_osd_lru));
386 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
387 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
388}
389
390static void __remove_osd_from_lru(struct ceph_osd *osd)
391{
392 dout("__remove_osd_from_lru %p\n", osd);
393 if (!list_empty(&osd->o_osd_lru))
394 list_del_init(&osd->o_osd_lru);
395}
396
397static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
398{
399 struct ceph_osd *osd, *nosd;
400
401 dout("remove_old_osds %p\n", osdc);
402 mutex_lock(&osdc->request_mutex);
403 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
404 if (!remove_all && time_before(jiffies, osd->lru_ttl))
405 break;
406 __remove_osd(osdc, osd);
407 }
408 mutex_unlock(&osdc->request_mutex);
409}
410
411/*
412 * reset osd connect
413 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{
416 struct ceph_osd_request *req;
417 int ret = 0;
418
419 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
420 if (list_empty(&osd->o_requests)) {
421 __remove_osd(osdc, osd);
422 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
423 &osd->o_con.peer_addr,
424 sizeof(osd->o_con.peer_addr)) == 0 &&
425 !ceph_con_opened(&osd->o_con)) {
426 dout(" osd addr hasn't changed and connection never opened,"
427 " letting msgr retry\n");
428 /* touch each r_stamp for handle_timeout()'s benefit */
429 list_for_each_entry(req, &osd->o_requests, r_osd_item)
430 req->r_stamp = jiffies;
431 ret = -EAGAIN;
432 } else {
433 ceph_con_close(&osd->o_con);
434 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
435 osd->o_incarnation++;
436 }
437 return ret;
438}
439
440static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
441{
442 struct rb_node **p = &osdc->osds.rb_node;
443 struct rb_node *parent = NULL;
444 struct ceph_osd *osd = NULL;
445
446 while (*p) {
447 parent = *p;
448 osd = rb_entry(parent, struct ceph_osd, o_node);
449 if (new->o_osd < osd->o_osd)
450 p = &(*p)->rb_left;
451 else if (new->o_osd > osd->o_osd)
452 p = &(*p)->rb_right;
453 else
454 BUG();
455 }
456
457 rb_link_node(&new->o_node, parent, p);
458 rb_insert_color(&new->o_node, &osdc->osds);
459}
460
461static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
462{
463 struct ceph_osd *osd;
464 struct rb_node *n = osdc->osds.rb_node;
465
466 while (n) {
467 osd = rb_entry(n, struct ceph_osd, o_node);
468 if (o < osd->o_osd)
469 n = n->rb_left;
470 else if (o > osd->o_osd)
471 n = n->rb_right;
472 else
473 return osd;
474 }
475 return NULL;
476}
477
478static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
479{
480 schedule_delayed_work(&osdc->timeout_work,
481 osdc->client->mount_args->osd_keepalive_timeout * HZ);
482}
483
484static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
485{
486 cancel_delayed_work(&osdc->timeout_work);
487}
488
489/*
490 * Register request, assign tid. If this is the first request, set up
491 * the timeout event.
492 */
493static void register_request(struct ceph_osd_client *osdc,
494 struct ceph_osd_request *req)
495{
496 mutex_lock(&osdc->request_mutex);
497 req->r_tid = ++osdc->last_tid;
498 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
499 INIT_LIST_HEAD(&req->r_req_lru_item);
500
501 dout("register_request %p tid %lld\n", req, req->r_tid);
502 __insert_request(osdc, req);
503 ceph_osdc_get_request(req);
504 osdc->num_requests++;
505
506 if (osdc->num_requests == 1) {
507 dout(" first request, scheduling timeout\n");
508 __schedule_osd_timeout(osdc);
509 }
510 mutex_unlock(&osdc->request_mutex);
511}
512
513/*
514 * called under osdc->request_mutex
515 */
516static void __unregister_request(struct ceph_osd_client *osdc,
517 struct ceph_osd_request *req)
518{
519 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
520 rb_erase(&req->r_node, &osdc->requests);
521 osdc->num_requests--;
522
523 if (req->r_osd) {
524 /* make sure the original request isn't in flight. */
525 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
526
527 list_del_init(&req->r_osd_item);
528 if (list_empty(&req->r_osd->o_requests))
529 __move_osd_to_lru(osdc, req->r_osd);
530 req->r_osd = NULL;
531 }
532
533 ceph_osdc_put_request(req);
534
535 list_del_init(&req->r_req_lru_item);
536 if (osdc->num_requests == 0) {
537 dout(" no requests, canceling timeout\n");
538 __cancel_osd_timeout(osdc);
539 }
540}
541
542/*
543 * Cancel a previously queued request message
544 */
545static void __cancel_request(struct ceph_osd_request *req)
546{
547 if (req->r_sent) {
548 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
549 req->r_sent = 0;
550 }
551 list_del_init(&req->r_req_lru_item);
552}
553
554/*
555 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
556 * (as needed), and set the request r_osd appropriately. If there is
557 * no up osd, set r_osd to NULL.
558 *
559 * Return 0 if unchanged, 1 if changed, or negative on error.
560 *
561 * Caller should hold map_sem for read and request_mutex.
562 */
563static int __map_osds(struct ceph_osd_client *osdc,
564 struct ceph_osd_request *req)
565{
566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
567 struct ceph_pg pgid;
568 int acting[CEPH_PG_MAX_SIZE];
569 int o = -1, num = 0;
570 int err;
571
572 dout("map_osds %p tid %lld\n", req, req->r_tid);
573 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
574 &req->r_file_layout, osdc->osdmap);
575 if (err)
576 return err;
577 pgid = reqhead->layout.ol_pgid;
578 req->r_pgid = pgid;
579
580 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
581 if (err > 0) {
582 o = acting[0];
583 num = err;
584 }
585
586 if ((req->r_osd && req->r_osd->o_osd == o &&
587 req->r_sent >= req->r_osd->o_incarnation &&
588 req->r_num_pg_osds == num &&
589 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
590 (req->r_osd == NULL && o == -1))
591 return 0; /* no change */
592
593 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
594 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
595 req->r_osd ? req->r_osd->o_osd : -1);
596
597 /* record full pg acting set */
598 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
599 req->r_num_pg_osds = num;
600
601 if (req->r_osd) {
602 __cancel_request(req);
603 list_del_init(&req->r_osd_item);
604 req->r_osd = NULL;
605 }
606
607 req->r_osd = __lookup_osd(osdc, o);
608 if (!req->r_osd && o >= 0) {
609 err = -ENOMEM;
610 req->r_osd = create_osd(osdc);
611 if (!req->r_osd)
612 goto out;
613
614 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
615 req->r_osd->o_osd = o;
616 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
617 __insert_osd(osdc, req->r_osd);
618
619 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
620 }
621
622 if (req->r_osd) {
623 __remove_osd_from_lru(req->r_osd);
624 list_add(&req->r_osd_item, &req->r_osd->o_requests);
625 }
626 err = 1; /* osd or pg changed */
627
628out:
629 return err;
630}
631
632/*
633 * caller should hold map_sem (for read) and request_mutex
634 */
635static int __send_request(struct ceph_osd_client *osdc,
636 struct ceph_osd_request *req)
637{
638 struct ceph_osd_request_head *reqhead;
639 int err;
640
641 err = __map_osds(osdc, req);
642 if (err < 0)
643 return err;
644 if (req->r_osd == NULL) {
645 dout("send_request %p no up osds in pg\n", req);
646 ceph_monc_request_next_osdmap(&osdc->client->monc);
647 return 0;
648 }
649
650 dout("send_request %p tid %llu to osd%d flags %d\n",
651 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
652
653 reqhead = req->r_request->front.iov_base;
654 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
655 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
656 reqhead->reassert_version = req->r_reassert_version;
657
658 req->r_stamp = jiffies;
659 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
660
661 ceph_msg_get(req->r_request); /* send consumes a ref */
662 ceph_con_send(&req->r_osd->o_con, req->r_request);
663 req->r_sent = req->r_osd->o_incarnation;
664 return 0;
665}
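
/*
 * Annotation (not in the original file): r_sent records the osd's
 * incarnation at send time; __map_osds() above compares it against
 * o_incarnation, so a request last sent to an earlier incarnation of
 * the same osd is not mistaken for "no change" after the connection
 * has been closed and re-opened by __reset_osd().
 */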
666
667/*
668 * Timeout callback, called every N seconds when one or more osd
669 * requests have been active for more than N seconds. When this
670 * happens, we ping all OSDs with timed-out requests to ensure any
671 * communications channel reset is detected. As we go, we push each
672 * request's timeout another N seconds into the future, and we
673 * reschedule the timeout event another N seconds out (unless there
674 * are no open requests).
675 */
676static void handle_timeout(struct work_struct *work)
677{
678 struct ceph_osd_client *osdc =
679 container_of(work, struct ceph_osd_client, timeout_work.work);
680 struct ceph_osd_request *req, *last_req = NULL;
681 struct ceph_osd *osd;
682 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
683 unsigned long keepalive =
684 osdc->client->mount_args->osd_keepalive_timeout * HZ;
685 unsigned long last_stamp = 0;
686 struct rb_node *p;
687 struct list_head slow_osds;
688
689 dout("timeout\n");
690 down_read(&osdc->map_sem);
691
692 ceph_monc_request_next_osdmap(&osdc->client->monc);
693
694 mutex_lock(&osdc->request_mutex);
695 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
696 req = rb_entry(p, struct ceph_osd_request, r_node);
697
698 if (req->r_resend) {
699 int err;
700
701 dout("osdc resending prev failed %lld\n", req->r_tid);
702 err = __send_request(osdc, req);
703 if (err)
704 dout("osdc failed again on %lld\n", req->r_tid);
705 else
706 req->r_resend = false;
707 continue;
708 }
709 }
710
711 /*
712 * reset osds that appear to be _really_ unresponsive. this
713 * is a failsafe measure; we really shouldn't be getting to
714 * this point if the system is working properly. the monitors
715 * should mark the osd as failed and we should find out about
716 * it from an updated osd map.
717 */
718 while (!list_empty(&osdc->req_lru)) {
719 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
720 r_req_lru_item);
721
722 if (time_before(jiffies, req->r_stamp + timeout))
723 break;
724
725 BUG_ON(req == last_req && req->r_stamp == last_stamp);
726 last_req = req;
727 last_stamp = req->r_stamp;
728
729 osd = req->r_osd;
730 BUG_ON(!osd);
731 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
732 req->r_tid, osd->o_osd);
733 __kick_requests(osdc, osd);
734 }
735
736 /*
737 * ping osds that are a bit slow. this ensures that if there
738 * is a break in the TCP connection we will notice, and reopen
739 * a connection with that osd (from the fault callback).
740 */
741 INIT_LIST_HEAD(&slow_osds);
742 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
743 if (time_before(jiffies, req->r_stamp + keepalive))
744 break;
745
746 osd = req->r_osd;
747 BUG_ON(!osd);
748 dout(" tid %llu is slow, will send keepalive on osd%d\n",
749 req->r_tid, osd->o_osd);
750 list_move_tail(&osd->o_keepalive_item, &slow_osds);
751 }
752 while (!list_empty(&slow_osds)) {
753 osd = list_entry(slow_osds.next, struct ceph_osd,
754 o_keepalive_item);
755 list_del_init(&osd->o_keepalive_item);
756 ceph_con_keepalive(&osd->o_con);
757 }
758
759 __schedule_osd_timeout(osdc);
760 mutex_unlock(&osdc->request_mutex);
761
762 up_read(&osdc->map_sem);
763}
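
/*
 * Annotation (not in the original file): a worked example of the two
 * thresholds above, with assumed mount options osd_timeout=60 and
 * osd_keepalive_timeout=5.  A request idle for more than 5s only earns
 * its osd a ceph_con_keepalive(); one idle for more than 60s gets its
 * osd reset via __kick_requests().  Both scans walk req_lru in r_stamp
 * order, so each stops at the first still-fresh request.
 */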
764
765static void handle_osds_timeout(struct work_struct *work)
766{
767 struct ceph_osd_client *osdc =
768 container_of(work, struct ceph_osd_client,
769 osds_timeout_work.work);
770 unsigned long delay =
771 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
772
773 dout("osds timeout\n");
774 down_read(&osdc->map_sem);
775 remove_old_osds(osdc, 0);
776 up_read(&osdc->map_sem);
777
778 schedule_delayed_work(&osdc->osds_timeout_work,
779 round_jiffies_relative(delay));
780}
781
782/*
783 * handle osd op reply. either call the callback if it is specified,
784 * or do the completion to wake up the waiting thread.
785 */
786static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
787 struct ceph_connection *con)
788{
789 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
790 struct ceph_osd_request *req;
791 u64 tid;
792 int numops, object_len, flags;
793 s32 result;
794
795 tid = le64_to_cpu(msg->hdr.tid);
796 if (msg->front.iov_len < sizeof(*rhead))
797 goto bad;
798 numops = le32_to_cpu(rhead->num_ops);
799 object_len = le32_to_cpu(rhead->object_len);
800 result = le32_to_cpu(rhead->result);
801 if (msg->front.iov_len != sizeof(*rhead) + object_len +
802 numops * sizeof(struct ceph_osd_op))
803 goto bad;
804 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
805
806 /* lookup */
807 mutex_lock(&osdc->request_mutex);
808 req = __lookup_request(osdc, tid);
809 if (req == NULL) {
810 dout("handle_reply tid %llu dne\n", tid);
811 mutex_unlock(&osdc->request_mutex);
812 return;
813 }
814 ceph_osdc_get_request(req);
815 flags = le32_to_cpu(rhead->flags);
816
817 /*
818 * if this connection filled our message, drop our reference now, to
819 * avoid a (safe but slower) revoke later.
820 */
821 if (req->r_con_filling_msg == con && req->r_reply == msg) {
822 dout(" dropping con_filling_msg ref %p\n", con);
823 req->r_con_filling_msg = NULL;
824 ceph_con_put(con);
825 }
826
827 if (!req->r_got_reply) {
828 unsigned bytes;
829
830 req->r_result = le32_to_cpu(rhead->result);
831 bytes = le32_to_cpu(msg->hdr.data_len);
832 dout("handle_reply result %d bytes %d\n", req->r_result,
833 bytes);
834 if (req->r_result == 0)
835 req->r_result = bytes;
836
837 /* in case this is a write and we need to replay, */
838 req->r_reassert_version = rhead->reassert_version;
839
840 req->r_got_reply = 1;
841 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
842 dout("handle_reply tid %llu dup ack\n", tid);
843 mutex_unlock(&osdc->request_mutex);
844 goto done;
845 }
846
847 dout("handle_reply tid %llu flags %d\n", tid, flags);
848
849 /* either this is a read, or we got the safe response */
850 if (result < 0 ||
851 (flags & CEPH_OSD_FLAG_ONDISK) ||
852 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
853 __unregister_request(osdc, req);
854
855 mutex_unlock(&osdc->request_mutex);
856
857 if (req->r_callback)
858 req->r_callback(req, msg);
859 else
860 complete(&req->r_completion);
861
862 if (flags & CEPH_OSD_FLAG_ONDISK) {
863 if (req->r_safe_callback)
864 req->r_safe_callback(req, msg);
865 complete(&req->r_safe_completion); /* fsync waiter */
866 }
867
868done:
869 ceph_osdc_put_request(req);
870 return;
871
872bad:
873 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
874 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
875 (int)sizeof(*rhead));
876 ceph_msg_dump(msg);
877}
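
/*
 * Annotation (not in the original file): writes complete in two phases.
 * The first reply (the ack) wakes r_completion and fires r_callback;
 * only a reply carrying CEPH_OSD_FLAG_ONDISK wakes r_safe_completion,
 * which is what ceph_osdc_sync() waits on.  A duplicate ack (r_got_reply
 * already set and no ONDISK flag) is dropped above via 'goto done'.
 */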
878
879
880static int __kick_requests(struct ceph_osd_client *osdc,
881 struct ceph_osd *kickosd)
882{
883 struct ceph_osd_request *req;
884 struct rb_node *p, *n;
885 int needmap = 0;
886 int err;
887
888 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
889 if (kickosd) {
890 err = __reset_osd(osdc, kickosd);
891 if (err == -EAGAIN)
892 return 1;
893 } else {
894 for (p = rb_first(&osdc->osds); p; p = n) {
895 struct ceph_osd *osd =
896 rb_entry(p, struct ceph_osd, o_node);
897
898 n = rb_next(p);
899 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
900 memcmp(&osd->o_con.peer_addr,
901 ceph_osd_addr(osdc->osdmap,
902 osd->o_osd),
903 sizeof(struct ceph_entity_addr)) != 0)
904 __reset_osd(osdc, osd);
905 }
906 }
907
908 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
909 req = rb_entry(p, struct ceph_osd_request, r_node);
910
911 if (req->r_resend) {
912 dout(" r_resend set on tid %llu\n", req->r_tid);
913 __cancel_request(req);
914 goto kick;
915 }
916 if (req->r_osd && kickosd == req->r_osd) {
917 __cancel_request(req);
918 goto kick;
919 }
920
921 err = __map_osds(osdc, req);
922 if (err == 0)
923 continue; /* no change */
924 if (err < 0) {
925 /*
926 * FIXME: really, we should set the request
927 * error and fail if this isn't a 'nofail'
928 * request, but that's a fair bit more
929 * complicated to do. So retry!
930 */
931 dout(" setting r_resend on %llu\n", req->r_tid);
932 req->r_resend = true;
933 continue;
934 }
935 if (req->r_osd == NULL) {
936 dout("tid %llu maps to no valid osd\n", req->r_tid);
937 needmap++; /* request a newer map */
938 continue;
939 }
940
941kick:
942 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
943 req->r_osd ? req->r_osd->o_osd : -1);
944 req->r_flags |= CEPH_OSD_FLAG_RETRY;
945 err = __send_request(osdc, req);
946 if (err) {
947 dout(" setting r_resend on %llu\n", req->r_tid);
948 req->r_resend = true;
949 }
950 }
951
952 return needmap;
953}
954
955/*
956 * Resubmit osd requests whose osd or osd address has changed. Request
957 * a new osd map if osds are down, or we are otherwise unable to determine
958 * how to direct a request.
959 *
960 * Close connections to down osds.
961 *
962 * If @who is specified, resubmit requests for that specific osd.
963 *
964 * Caller should hold map_sem for read and request_mutex.
965 */
966static void kick_requests(struct ceph_osd_client *osdc,
967 struct ceph_osd *kickosd)
968{
969 int needmap;
970
971 mutex_lock(&osdc->request_mutex);
972 needmap = __kick_requests(osdc, kickosd);
973 mutex_unlock(&osdc->request_mutex);
974
975 if (needmap) {
976 dout("%d requests for down osds, need new map\n", needmap);
977 ceph_monc_request_next_osdmap(&osdc->client->monc);
978 }
979}
980
981/*
982 * Process updated osd map.
983 *
984 * The message contains any number of incremental and full maps, normally
985 * indicating some sort of topology change in the cluster. Kick requests
986 * off to different OSDs as needed.
987 */
988void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
989{
990 void *p, *end, *next;
991 u32 nr_maps, maplen;
992 u32 epoch;
993 struct ceph_osdmap *newmap = NULL, *oldmap;
994 int err;
995 struct ceph_fsid fsid;
996
997 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
998 p = msg->front.iov_base;
999 end = p + msg->front.iov_len;
1000
1001 /* verify fsid */
1002 ceph_decode_need(&p, end, sizeof(fsid), bad);
1003 ceph_decode_copy(&p, &fsid, sizeof(fsid));
1004 if (ceph_check_fsid(osdc->client, &fsid) < 0)
1005 return;
1006
1007 down_write(&osdc->map_sem);
1008
1009 /* incremental maps */
1010 ceph_decode_32_safe(&p, end, nr_maps, bad);
1011 dout(" %d inc maps\n", nr_maps);
1012 while (nr_maps > 0) {
1013 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1014 epoch = ceph_decode_32(&p);
1015 maplen = ceph_decode_32(&p);
1016 ceph_decode_need(&p, end, maplen, bad);
1017 next = p + maplen;
1018 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1019 dout("applying incremental map %u len %d\n",
1020 epoch, maplen);
1021 newmap = osdmap_apply_incremental(&p, next,
1022 osdc->osdmap,
1023 osdc->client->msgr);
1024 if (IS_ERR(newmap)) {
1025 err = PTR_ERR(newmap);
1026 goto bad;
1027 }
1028 BUG_ON(!newmap);
1029 if (newmap != osdc->osdmap) {
1030 ceph_osdmap_destroy(osdc->osdmap);
1031 osdc->osdmap = newmap;
1032 }
1033 } else {
1034 dout("ignoring incremental map %u len %d\n",
1035 epoch, maplen);
1036 }
1037 p = next;
1038 nr_maps--;
1039 }
1040 if (newmap)
1041 goto done;
1042
1043 /* full maps */
1044 ceph_decode_32_safe(&p, end, nr_maps, bad);
1045 dout(" %d full maps\n", nr_maps);
1046 while (nr_maps) {
1047 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1048 epoch = ceph_decode_32(&p);
1049 maplen = ceph_decode_32(&p);
1050 ceph_decode_need(&p, end, maplen, bad);
1051 if (nr_maps > 1) {
1052 dout("skipping non-latest full map %u len %d\n",
1053 epoch, maplen);
1054 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1055 dout("skipping full map %u len %d, "
1056 "older than our %u\n", epoch, maplen,
1057 osdc->osdmap->epoch);
1058 } else {
1059 dout("taking full map %u len %d\n", epoch, maplen);
1060 newmap = osdmap_decode(&p, p+maplen);
1061 if (IS_ERR(newmap)) {
1062 err = PTR_ERR(newmap);
1063 goto bad;
1064 }
1065 BUG_ON(!newmap);
1066 oldmap = osdc->osdmap;
1067 osdc->osdmap = newmap;
1068 if (oldmap)
1069 ceph_osdmap_destroy(oldmap);
1070 }
1071 p += maplen;
1072 nr_maps--;
1073 }
1074
1075done:
1076 downgrade_write(&osdc->map_sem);
1077 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1078 if (newmap)
1079 kick_requests(osdc, NULL);
1080 up_read(&osdc->map_sem);
1081 return;
1082
1083bad:
1084 pr_err("osdc handle_map corrupt msg\n");
1085 ceph_msg_dump(msg);
1086 up_write(&osdc->map_sem);
1087 return;
1088}
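
/*
 * Annotation (not in the original file): incremental maps are applied
 * in preference to full maps; note the 'if (newmap) goto done' between
 * the two loops, so a full map is decoded only when no incremental
 * could be applied.  Either way, kick_requests() then remaps every
 * in-flight request against the new epoch.
 */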
1089
1090
1091/*
1092 * A read request prepares specific pages that data is to be read into.
1093 * When a message is being read off the wire, we call __prepare_pages to
1094 * find those pages.
1095 * Returns 0 on success, -1 on failure.
1096 */
1097static int __prepare_pages(struct ceph_connection *con,
1098 struct ceph_msg_header *hdr,
1099 struct ceph_osd_request *req,
1100 u64 tid,
1101 struct ceph_msg *m)
1102{
1103 struct ceph_osd *osd = con->private;
1104 struct ceph_osd_client *osdc;
1105 int ret = -1;
1106 int data_len = le32_to_cpu(hdr->data_len);
1107 unsigned data_off = le16_to_cpu(hdr->data_off);
1108
1109 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1110
1111 if (!osd)
1112 return -1;
1113
1114 osdc = osd->o_osdc;
1115
1116 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1117 tid, req->r_num_pages, want);
1118 if (unlikely(req->r_num_pages < want))
1119 goto out;
1120 m->pages = req->r_pages;
1121 m->nr_pages = req->r_num_pages;
1122 ret = 0; /* success */
1123out:
1124 BUG_ON(ret < 0 || m->nr_pages < want);
1125
1126 return ret;
1127}
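
/*
 * Annotation (not in the original file): worked arithmetic for 'want'
 * above.  calc_pages_for(off, len) counts the pages an extent touches,
 * i.e. ((off + len + PAGE_SIZE - 1) >> PAGE_SHIFT) - (off >> PAGE_SHIFT)
 * up to page-cache naming.  With 4 KiB pages, data_off = 0x1800 and
 * data_len = 0x3000: the in-page offset is 0x1800 & ~PAGE_MASK = 0x800,
 * so the payload spans 0x800..0x3800 and want = 4 pages, one more than
 * the 3 pages of payload alone.
 */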
1128
1129/*
1130 * Register request, send initial attempt.
1131 */
1132int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1133 struct ceph_osd_request *req,
1134 bool nofail)
1135{
1136 int rc = 0;
1137
1138 req->r_request->pages = req->r_pages;
1139 req->r_request->nr_pages = req->r_num_pages;
1140
1141 register_request(osdc, req);
1142
1143 down_read(&osdc->map_sem);
1144 mutex_lock(&osdc->request_mutex);
1145 /*
1146 * a racing kick_requests() may have sent the message for us
1147 * while we dropped request_mutex above, so only send now if
1148 * the request hasn't been touched yet.
1149 */
1150 if (req->r_sent == 0) {
1151 rc = __send_request(osdc, req);
1152 if (rc) {
1153 if (nofail) {
1154 dout("osdc_start_request failed send, "
1155 "marking %lld\n", req->r_tid);
1156 req->r_resend = true;
1157 rc = 0;
1158 } else {
1159 __unregister_request(osdc, req);
1160 }
1161 }
1162 }
1163 mutex_unlock(&osdc->request_mutex);
1164 up_read(&osdc->map_sem);
1165 return rc;
1166}
1167
1168/*
1169 * wait for a request to complete
1170 */
1171int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1172 struct ceph_osd_request *req)
1173{
1174 int rc;
1175
1176 rc = wait_for_completion_interruptible(&req->r_completion);
1177 if (rc < 0) {
1178 mutex_lock(&osdc->request_mutex);
1179 __cancel_request(req);
1180 __unregister_request(osdc, req);
1181 mutex_unlock(&osdc->request_mutex);
1182 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1183 return rc;
1184 }
1185
1186 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1187 return req->r_result;
1188}
1189
1190/*
1191 * sync - wait for all in-flight requests to flush. avoid starvation.
1192 */
1193void ceph_osdc_sync(struct ceph_osd_client *osdc)
1194{
1195 struct ceph_osd_request *req;
1196 u64 last_tid, next_tid = 0;
1197
1198 mutex_lock(&osdc->request_mutex);
1199 last_tid = osdc->last_tid;
1200 while (1) {
1201 req = __lookup_request_ge(osdc, next_tid);
1202 if (!req)
1203 break;
1204 if (req->r_tid > last_tid)
1205 break;
1206
1207 next_tid = req->r_tid + 1;
1208 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1209 continue;
1210
1211 ceph_osdc_get_request(req);
1212 mutex_unlock(&osdc->request_mutex);
1213 dout("sync waiting on tid %llu (last is %llu)\n",
1214 req->r_tid, last_tid);
1215 wait_for_completion(&req->r_safe_completion);
1216 mutex_lock(&osdc->request_mutex);
1217 ceph_osdc_put_request(req);
1218 }
1219 mutex_unlock(&osdc->request_mutex);
1220 dout("sync done (thru tid %llu)\n", last_tid);
1221}
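
/*
 * Annotation (not in the original file): the next_tid/last_tid walk
 * above avoids starvation: only writes that already existed when the
 * sync began (tid <= last_tid) are waited on, and request_mutex is
 * dropped around each wait so new requests can still be registered
 * while the flush is in progress.
 */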
1222
1223/*
1224 * init, shutdown
1225 */
1226int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1227{
1228 int err;
1229
1230 dout("init\n");
1231 osdc->client = client;
1232 osdc->osdmap = NULL;
1233 init_rwsem(&osdc->map_sem);
1234 init_completion(&osdc->map_waiters);
1235 osdc->last_requested_map = 0;
1236 mutex_init(&osdc->request_mutex);
1237 osdc->last_tid = 0;
1238 osdc->osds = RB_ROOT;
1239 INIT_LIST_HEAD(&osdc->osd_lru);
1240 osdc->requests = RB_ROOT;
1241 INIT_LIST_HEAD(&osdc->req_lru);
1242 osdc->num_requests = 0;
1243 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1244 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1245
1246 schedule_delayed_work(&osdc->osds_timeout_work,
1247 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1248
1249 err = -ENOMEM;
1250 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1251 sizeof(struct ceph_osd_request));
1252 if (!osdc->req_mempool)
1253 goto out;
1254
1255 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
1256 if (err < 0)
1257 goto out_mempool;
1258 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1259 OSD_OPREPLY_FRONT_LEN, 10, true);
1260 if (err < 0)
1261 goto out_msgpool;
1262 return 0;
1263
1264out_msgpool:
1265 ceph_msgpool_destroy(&osdc->msgpool_op);
1266out_mempool:
1267 mempool_destroy(osdc->req_mempool);
1268out:
1269 return err;
1270}
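
/*
 * Annotation (not in the original file): the error path above is the
 * usual kernel unwind ladder; each label releases only what was set up
 * before the failure, in reverse order, so a failed msgpool_op_reply
 * init tears down msgpool_op and then the request mempool, and a failed
 * mempool_create just returns.
 */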
1271
1272void ceph_osdc_stop(struct ceph_osd_client *osdc)
1273{
1274 cancel_delayed_work_sync(&osdc->timeout_work);
1275 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1276 if (osdc->osdmap) {
1277 ceph_osdmap_destroy(osdc->osdmap);
1278 osdc->osdmap = NULL;
1279 }
1280 remove_old_osds(osdc, 1);
1281 mempool_destroy(osdc->req_mempool);
1282 ceph_msgpool_destroy(&osdc->msgpool_op);
1283 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1284}
1285
1286/*
1287 * Read some contiguous pages. If we cross a stripe boundary, shorten
1288 * *plen. Return number of bytes read, or error.
1289 */
1290int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1291 struct ceph_vino vino, struct ceph_file_layout *layout,
1292 u64 off, u64 *plen,
1293 u32 truncate_seq, u64 truncate_size,
1294 struct page **pages, int num_pages)
1295{
1296 struct ceph_osd_request *req;
1297 int rc = 0;
1298
1299 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1300 vino.snap, off, *plen);
1301 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1302 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1303 NULL, 0, truncate_seq, truncate_size, NULL,
1304 false, 1);
1305 if (IS_ERR(req))
1306 return PTR_ERR(req);
1307
1308 /* it may be a short read due to an object boundary */
1309 req->r_pages = pages;
1310 num_pages = calc_pages_for(off, *plen);
1311 req->r_num_pages = num_pages;
1312
1313 dout("readpages final extent is %llu~%llu (%d pages)\n",
1314 off, *plen, req->r_num_pages);
1315
1316 rc = ceph_osdc_start_request(osdc, req, false);
1317 if (!rc)
1318 rc = ceph_osdc_wait_request(osdc, req);
1319
1320 ceph_osdc_put_request(req);
1321 dout("readpages result %d\n", rc);
1322 return rc;
1323}
1324
1325/*
1326 * do a synchronous write on N pages
1327 */
1328int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1329 struct ceph_file_layout *layout,
1330 struct ceph_snap_context *snapc,
1331 u64 off, u64 len,
1332 u32 truncate_seq, u64 truncate_size,
1333 struct timespec *mtime,
1334 struct page **pages, int num_pages,
1335 int flags, int do_sync, bool nofail)
1336{
1337 struct ceph_osd_request *req;
1338 int rc = 0;
1339
1340 BUG_ON(vino.snap != CEPH_NOSNAP);
1341 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1342 CEPH_OSD_OP_WRITE,
1343 flags | CEPH_OSD_FLAG_ONDISK |
1344 CEPH_OSD_FLAG_WRITE,
1345 snapc, do_sync,
1346 truncate_seq, truncate_size, mtime,
1347 nofail, 1);
1348 if (IS_ERR(req))
1349 return PTR_ERR(req);
1350
1351 /* it may be a short write due to an object boundary */
1352 req->r_pages = pages;
1353 req->r_num_pages = calc_pages_for(off, len);
1354 dout("writepages %llu~%llu (%d pages)\n", off, len,
1355 req->r_num_pages);
1356
1357 rc = ceph_osdc_start_request(osdc, req, nofail);
1358 if (!rc)
1359 rc = ceph_osdc_wait_request(osdc, req);
1360
1361 ceph_osdc_put_request(req);
1362 if (rc == 0)
1363 rc = len;
1364 dout("writepages result %d\n", rc);
1365 return rc;
1366}
1367
1368/*
1369 * handle incoming message
1370 */
1371static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1372{
1373 struct ceph_osd *osd = con->private;
1374 struct ceph_osd_client *osdc;
1375 int type = le16_to_cpu(msg->hdr.type);
1376
1377 if (!osd)
1378 return;
1379 osdc = osd->o_osdc;
1380
1381 switch (type) {
1382 case CEPH_MSG_OSD_MAP:
1383 ceph_osdc_handle_map(osdc, msg);
1384 break;
1385 case CEPH_MSG_OSD_OPREPLY:
1386 handle_reply(osdc, msg, con);
1387 break;
1388
1389 default:
1390 pr_err("received unknown message type %d %s\n", type,
1391 ceph_msg_type_name(type));
1392 }
1393 ceph_msg_put(msg);
1394}
1395
1396/*
1397 * lookup and return message for incoming reply
1398 */
1399static struct ceph_msg *get_reply(struct ceph_connection *con,
1400 struct ceph_msg_header *hdr,
1401 int *skip)
1402{
1403 struct ceph_osd *osd = con->private;
1404 struct ceph_osd_client *osdc = osd->o_osdc;
1405 struct ceph_msg *m;
1406 struct ceph_osd_request *req;
1407 int front = le32_to_cpu(hdr->front_len);
1408 int data_len = le32_to_cpu(hdr->data_len);
1409 u64 tid;
1410 int err;
1411
1412 tid = le64_to_cpu(hdr->tid);
1413 mutex_lock(&osdc->request_mutex);
1414 req = __lookup_request(osdc, tid);
1415 if (!req) {
1416 *skip = 1;
1417 m = NULL;
1418 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1419 osd->o_osd);
1420 goto out;
1421 }
1422
1423 if (req->r_con_filling_msg) {
1424 dout("get_reply revoking msg %p from old con %p\n",
1425 req->r_reply, req->r_con_filling_msg);
1426 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1427 ceph_con_put(req->r_con_filling_msg);
1428 }
1429
1430 if (front > req->r_reply->front.iov_len) {
1431 pr_warning("get_reply front %d > preallocated %d\n",
1432 front, (int)req->r_reply->front.iov_len);
1433 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
1434 if (IS_ERR(m))
1435 goto out;
1436 ceph_msg_put(req->r_reply);
1437 req->r_reply = m;
1438 }
1439 m = ceph_msg_get(req->r_reply);
1440
1441 if (data_len > 0) {
1442 err = __prepare_pages(con, hdr, req, tid, m);
1443 if (err < 0) {
1444 *skip = 1;
1445 ceph_msg_put(m);
1446 m = ERR_PTR(err);
1447 }
1448 }
1449 *skip = 0;
1450 req->r_con_filling_msg = ceph_con_get(con);
1451 dout("get_reply tid %lld %p\n", tid, m);
1452
1453out:
1454 mutex_unlock(&osdc->request_mutex);
1455 return m;
1456
1457}
1458
1459static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1460 struct ceph_msg_header *hdr,
1461 int *skip)
1462{
1463 struct ceph_osd *osd = con->private;
1464 int type = le16_to_cpu(hdr->type);
1465 int front = le32_to_cpu(hdr->front_len);
1466
1467 switch (type) {
1468 case CEPH_MSG_OSD_MAP:
1469 return ceph_msg_new(type, front, 0, 0, NULL);
1470 case CEPH_MSG_OSD_OPREPLY:
1471 return get_reply(con, hdr, skip);
1472 default:
1473 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1474 osd->o_osd);
1475 *skip = 1;
1476 return NULL;
1477 }
1478}
1479
1480/*
1481 * Wrappers to refcount containing ceph_osd struct
1482 */
1483static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1484{
1485 struct ceph_osd *osd = con->private;
1486 if (get_osd(osd))
1487 return con;
1488 return NULL;
1489}
1490
1491static void put_osd_con(struct ceph_connection *con)
1492{
1493 struct ceph_osd *osd = con->private;
1494 put_osd(osd);
1495}
1496
1497/*
1498 * authentication
1499 */
1500static int get_authorizer(struct ceph_connection *con,
1501 void **buf, int *len, int *proto,
1502 void **reply_buf, int *reply_len, int force_new)
1503{
1504 struct ceph_osd *o = con->private;
1505 struct ceph_osd_client *osdc = o->o_osdc;
1506 struct ceph_auth_client *ac = osdc->client->monc.auth;
1507 int ret = 0;
1508
1509 if (force_new && o->o_authorizer) {
1510 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1511 o->o_authorizer = NULL;
1512 }
1513 if (o->o_authorizer == NULL) {
1514 ret = ac->ops->create_authorizer(
1515 ac, CEPH_ENTITY_TYPE_OSD,
1516 &o->o_authorizer,
1517 &o->o_authorizer_buf,
1518 &o->o_authorizer_buf_len,
1519 &o->o_authorizer_reply_buf,
1520 &o->o_authorizer_reply_buf_len);
1521 if (ret)
1522 return ret;
1523 }
1524
1525 *proto = ac->protocol;
1526 *buf = o->o_authorizer_buf;
1527 *len = o->o_authorizer_buf_len;
1528 *reply_buf = o->o_authorizer_reply_buf;
1529 *reply_len = o->o_authorizer_reply_buf_len;
1530 return 0;
1531}
1532
1533
1534static int verify_authorizer_reply(struct ceph_connection *con, int len)
1535{
1536 struct ceph_osd *o = con->private;
1537 struct ceph_osd_client *osdc = o->o_osdc;
1538 struct ceph_auth_client *ac = osdc->client->monc.auth;
1539
1540 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1541}
1542
1543static int invalidate_authorizer(struct ceph_connection *con)
1544{
1545 struct ceph_osd *o = con->private;
1546 struct ceph_osd_client *osdc = o->o_osdc;
1547 struct ceph_auth_client *ac = osdc->client->monc.auth;
1548
1549 if (ac->ops->invalidate_authorizer)
1550 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1551
1552 return ceph_monc_validate_auth(&osdc->client->monc);
1553}
1554
1555static const struct ceph_connection_operations osd_con_ops = {
1556 .get = get_osd_con,
1557 .put = put_osd_con,
1558 .dispatch = dispatch,
1559 .get_authorizer = get_authorizer,
1560 .verify_authorizer_reply = verify_authorizer_reply,
1561 .invalidate_authorizer = invalidate_authorizer,
1562 .alloc_msg = alloc_msg,
1563 .fault = osd_reset,
1564};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..ce776989ef6a
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,167 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51 int r_pg_osds[CEPH_PG_MAX_SIZE];
52 int r_num_pg_osds;
53
54 struct ceph_connection *r_con_filling_msg;
55
56 struct ceph_msg *r_request, *r_reply;
57 int r_result;
58 int r_flags; /* any additional flags for the osd */
59 u32 r_sent; /* >0 if r_request is sending/sent */
60 int r_got_reply;
61
62 struct ceph_osd_client *r_osdc;
63 struct kref r_kref;
64 bool r_mempool;
65 struct completion r_completion, r_safe_completion;
66 ceph_osdc_callback_t r_callback, r_safe_callback;
67 struct ceph_eversion r_reassert_version;
68 struct list_head r_unsafe_item;
69
70 struct inode *r_inode; /* for use by callbacks */
71
72 char r_oid[40]; /* object name */
73 int r_oid_len;
74 unsigned long r_stamp; /* send OR check time */
75 bool r_resend; /* msg send failed, needs retry */
76
77 struct ceph_file_layout r_file_layout;
78 struct ceph_snap_context *r_snapc; /* snap context for writes */
79 unsigned r_num_pages; /* size of page array (follows) */
80 struct page **r_pages; /* pages for data payload */
81 int r_pages_from_pool;
82 int r_own_pages; /* if true, i own page list */
83};
84
85struct ceph_osd_client {
86 struct ceph_client *client;
87
88 struct ceph_osdmap *osdmap; /* current map */
89 struct rw_semaphore map_sem;
90 struct completion map_waiters;
91 u64 last_requested_map;
92
93 struct mutex request_mutex;
94 struct rb_root osds; /* osds */
95 struct list_head osd_lru; /* idle osds */
96 u64 timeout_tid; /* tid of timeout triggering rq */
97 u64 last_tid; /* tid of last request */
98 struct rb_root requests; /* pending requests */
99 struct list_head req_lru; /* pending requests lru */
100 int num_requests;
101 struct delayed_work timeout_work;
102 struct delayed_work osds_timeout_work;
103#ifdef CONFIG_DEBUG_FS
104 struct dentry *debugfs_file;
105#endif
106
107 mempool_t *req_mempool;
108
109 struct ceph_msgpool msgpool_op;
110 struct ceph_msgpool msgpool_op_reply;
111};
112
113extern int ceph_osdc_init(struct ceph_osd_client *osdc,
114 struct ceph_client *client);
115extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
116
117extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
118 struct ceph_msg *msg);
119extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
120 struct ceph_msg *msg);
121
122extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 offset, u64 *len, int op, int flags,
126 struct ceph_snap_context *snapc,
127 int do_sync, u32 truncate_seq,
128 u64 truncate_size,
129 struct timespec *mtime,
130 bool use_mempool, int num_reply);
131
132static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
133{
134 kref_get(&req->r_kref);
135}
136extern void ceph_osdc_release_request(struct kref *kref);
137static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
138{
139 kref_put(&req->r_kref, ceph_osdc_release_request);
140}
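
/*
 * Annotation (not in the original header): references are kref-based
 * and must be paired; for example, ceph_osdc_sync() in osd_client.c does
 *
 *	ceph_osdc_get_request(req);
 *	mutex_unlock(&osdc->request_mutex);
 *	wait_for_completion(&req->r_safe_completion);
 *	mutex_lock(&osdc->request_mutex);
 *	ceph_osdc_put_request(req);
 *
 * where the final put may free req through ceph_osdc_release_request().
 */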
141
142extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
143 struct ceph_osd_request *req,
144 bool nofail);
145extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
146 struct ceph_osd_request *req);
147extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
148
149extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
150 struct ceph_vino vino,
151 struct ceph_file_layout *layout,
152 u64 off, u64 *plen,
153 u32 truncate_seq, u64 truncate_size,
154 struct page **pages, int nr_pages);
155
156extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
157 struct ceph_vino vino,
158 struct ceph_file_layout *layout,
159 struct ceph_snap_context *sc,
160 u64 off, u64 len,
161 u32 truncate_seq, u64 truncate_size,
162 struct timespec *mtime,
163 struct page **pages, int nr_pages,
164 int flags, int do_sync, bool nofail);
165
166#endif
167
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..cfdd8f4388b7
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1081 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
26 if (state & CEPH_OSD_UP) {
27 int n = strlen(str); /* append: str may not be its own snprintf source */
28 snprintf(str + n, len - n, "%s%s", (flag ? ", " : ""), "up");
29 flag = 1;
30 }
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
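
/*
 * Illustrative usage (an annotation, not part of the original file;
 * 'sbuf' and the surrounding loop index are assumed):
 *
 *	char sbuf[64];
 *
 *	dout("osd%d %s\n", i,
 *	     ceph_osdmap_state_str(sbuf, sizeof(sbuf), map->osd_state[i]));
 */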
37
38/* maps */
39
40static int calc_bits_of(unsigned t)
41{
42 int b = 0;
43 while (t) {
44 t = t >> 1;
45 b++;
46 }
47 return b;
48}
49
50/*
51 * the foo_mask is the smallest value 2^n-1 that is >= foo-1.
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
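
/*
 * Annotation (not in the original file): a worked example.  For
 * pg_num = 6, calc_bits_of(5) = 3 and pg_num_mask = (1 << 3) - 1 = 7;
 * for pg_num = 8, calc_bits_of(7) = 3 and pg_num_mask = 7 = pg_num - 1.
 * The masks therefore track foo-1, which is what the stable-mod
 * placement math wants.
 */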
63
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
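
/*
 * Annotation (not in the original file): every variable-length read
 * above is bounds-checked against 'end' by ceph_decode_need() or a
 * ceph_decode_*_safe() helper before the unchecked ceph_decode_*()
 * accessors run, so truncated or corrupt input lands on the 'bad'
 * label instead of reading past the buffer.
 */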
316
317/*
318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
319 * to a set of osds)
320 */
321static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
322{
323 u64 a = *(u64 *)&l;
324 u64 b = *(u64 *)&r;
325
326 if (a < b)
327 return -1;
328 if (a > b)
329 return 1;
330 return 0;
331}
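
/*
 * Annotation (not in the original file): pgid_cmp() compares the raw
 * 64-bit encoding of struct ceph_pg, which works because the packed
 * struct occupies exactly 8 bytes; the resulting order is arbitrary
 * but total, which is all the rbtree below needs.
 */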
332
333static int __insert_pg_mapping(struct ceph_pg_mapping *new,
334 struct rb_root *root)
335{
336 struct rb_node **p = &root->rb_node;
337 struct rb_node *parent = NULL;
338 struct ceph_pg_mapping *pg = NULL;
339 int c;
340
341 while (*p) {
342 parent = *p;
343 pg = rb_entry(parent, struct ceph_pg_mapping, node);
344 c = pgid_cmp(new->pgid, pg->pgid);
345 if (c < 0)
346 p = &(*p)->rb_left;
347 else if (c > 0)
348 p = &(*p)->rb_right;
349 else
350 return -EEXIST;
351 }
352
353 rb_link_node(&new->node, parent, p);
354 rb_insert_color(&new->node, root);
355 return 0;
356}
357
358static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
359 struct ceph_pg pgid)
360{
361 struct rb_node *n = root->rb_node;
362 struct ceph_pg_mapping *pg;
363 int c;
364
365 while (n) {
366 pg = rb_entry(n, struct ceph_pg_mapping, node);
367 c = pgid_cmp(pgid, pg->pgid);
368 if (c < 0)
369 n = n->rb_left;
370 else if (c > 0)
371 n = n->rb_right;
372 else
373 return pg;
374 }
375 return NULL;
376}
377
378/*
379 * rbtree of pg pool info
380 */
381static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
382{
383 struct rb_node **p = &root->rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_pg_pool_info *pi = NULL;
386
387 while (*p) {
388 parent = *p;
389 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
390 if (new->id < pi->id)
391 p = &(*p)->rb_left;
392 else if (new->id > pi->id)
393 p = &(*p)->rb_right;
394 else
395 return -EEXIST;
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, root);
400 return 0;
401}
402
403static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
404{
405 struct ceph_pg_pool_info *pi;
406 struct rb_node *n = root->rb_node;
407
408 while (n) {
409 pi = rb_entry(n, struct ceph_pg_pool_info, node);
410 if (id < pi->id)
411 n = n->rb_left;
412 else if (id > pi->id)
413 n = n->rb_right;
414 else
415 return pi;
416 }
417 return NULL;
418}
419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
427void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
428{
429 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
430 calc_pg_masks(pi);
431 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
433}
434
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
436{
437 struct ceph_pg_pool_info *pi;
438 u32 num, len, pool;
439
440 ceph_decode_32_safe(p, end, num, bad);
441 dout(" %d pool names\n", num);
442 while (num--) {
443 ceph_decode_32_safe(p, end, pool, bad);
444 ceph_decode_32_safe(p, end, len, bad);
445 dout(" pool %d len %d\n", pool, len);
446 pi = __lookup_pg_pool(&map->pg_pools, pool);
447 if (pi) {
448 kfree(pi->name);
449 pi->name = kmalloc(len + 1, GFP_NOFS);
450 if (pi->name) {
451 memcpy(pi->name, *p, len);
452 pi->name[len] = '\0';
453 dout(" name is %s\n", pi->name);
454 }
455 }
456 *p += len;
457 }
458 return 0;
459
460bad:
461 return -EINVAL;
462}
463
464/*
465 * osd map
466 */
467void ceph_osdmap_destroy(struct ceph_osdmap *map)
468{
469 dout("osdmap_destroy %p\n", map);
470 if (map->crush)
471 crush_destroy(map->crush);
472 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
473 struct ceph_pg_mapping *pg =
474 rb_entry(rb_first(&map->pg_temp),
475 struct ceph_pg_mapping, node);
476 rb_erase(&pg->node, &map->pg_temp);
477 kfree(pg);
478 }
479 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
480 struct ceph_pg_pool_info *pi =
481 rb_entry(rb_first(&map->pg_pools),
482 struct ceph_pg_pool_info, node);
483 __remove_pg_pool(&map->pg_pools, pi);
484 }
485 kfree(map->osd_state);
486 kfree(map->osd_weight);
487 kfree(map->osd_addr);
488 kfree(map);
489}
490
491/*
492 * adjust max osd value. reallocate arrays.
493 */
494static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
495{
496 u8 *state;
497 struct ceph_entity_addr *addr;
498 u32 *weight;
499
500 state = kcalloc(max, sizeof(*state), GFP_NOFS);
501 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
502 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
503 if (state == NULL || addr == NULL || weight == NULL) {
504 kfree(state);
505 kfree(addr);
506 kfree(weight);
507 return -ENOMEM;
508 }
509
510 /* copy old? */
511 if (map->osd_state) {
512 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
513 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
514 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
515 kfree(map->osd_state);
516 kfree(map->osd_addr);
517 kfree(map->osd_weight);
518 }
519
520 map->osd_state = state;
521 map->osd_weight = weight;
522 map->osd_addr = addr;
523 map->max_osd = max;
524 return 0;
525}
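/*
 * Editor's sketch (not part of this file): osdmap_set_max_osd() above uses
 * an allocate-everything-then-commit pattern -- acquire all of the new
 * arrays first, free them all if any allocation fails, and only then copy
 * and swap, so the map is never left half-resized.  A two-array user-space
 * analogue:
 */
#include <stdlib.h>
#include <string.h>

struct vec { int *a; double *b; int n; };

/* grow both arrays to n entries, or change nothing and return -1 */
static int vec_set_size(struct vec *v, int n)
{
	int *a = calloc(n, sizeof(*a));
	double *b = calloc(n, sizeof(*b));

	if (!a || !b) {		/* all-or-nothing: drop both on any failure */
		free(a);
		free(b);
		return -1;
	}
	if (v->a) {		/* copy old contents, then free old arrays */
		memcpy(a, v->a, v->n * sizeof(*a));
		memcpy(b, v->b, v->n * sizeof(*b));
		free(v->a);
		free(v->b);
	}
	v->a = a;
	v->b = b;
	v->n = n;
	return 0;
}

int main(void)
{
	struct vec v = { NULL, NULL, 0 };
	return vec_set_size(&v, 8) == 0 ? 0 : 1;
}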
526
527/*
528 * decode a full map.
529 */
530struct ceph_osdmap *osdmap_decode(void **p, void *end)
531{
532 struct ceph_osdmap *map;
533 u16 version;
534 u32 len, max, i;
535 u8 ev;
536 int err = -EINVAL;
537 void *start = *p;
538 struct ceph_pg_pool_info *pi;
539
540 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
541
542 map = kzalloc(sizeof(*map), GFP_NOFS);
543 if (map == NULL)
544 return ERR_PTR(-ENOMEM);
545 map->pg_temp = RB_ROOT;
546
547 ceph_decode_16_safe(p, end, version, bad);
548 if (version > CEPH_OSDMAP_VERSION) {
549 pr_warning("got unknown v %d > %d of osdmap\n", version,
550 CEPH_OSDMAP_VERSION);
551 goto bad;
552 }
553
554 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
555 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
556 map->epoch = ceph_decode_32(p);
557 ceph_decode_copy(p, &map->created, sizeof(map->created));
558 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
559
560 ceph_decode_32_safe(p, end, max, bad);
561 while (max--) {
562 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
563 pi = kzalloc(sizeof(*pi), GFP_NOFS);
564 if (!pi)
565 goto bad;
566 pi->id = ceph_decode_32(p);
567 ev = ceph_decode_8(p); /* encoding version */
568 if (ev > CEPH_PG_POOL_VERSION) {
569 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
570 ev, CEPH_PG_POOL_VERSION);
571 goto bad;
572 }
573 __decode_pool(p, pi);
574 __insert_pg_pool(&map->pg_pools, pi);
575 }
576
577 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
578 goto bad;
579
580 ceph_decode_32_safe(p, end, map->pool_max, bad);
581
582 ceph_decode_32_safe(p, end, map->flags, bad);
583
584 max = ceph_decode_32(p);
585
586 /* (re)alloc osd arrays */
587 err = osdmap_set_max_osd(map, max);
588 if (err < 0)
589 goto bad;
590 dout("osdmap_decode max_osd = %d\n", map->max_osd);
591
592 /* osds */
593 err = -EINVAL;
594 ceph_decode_need(p, end, 3*sizeof(u32) +
595 map->max_osd*(1 + sizeof(*map->osd_weight) +
596 sizeof(*map->osd_addr)), bad);
597 *p += 4; /* skip length field (should match max) */
598 ceph_decode_copy(p, map->osd_state, map->max_osd);
599
600 *p += 4; /* skip length field (should match max) */
601 for (i = 0; i < map->max_osd; i++)
602 map->osd_weight[i] = ceph_decode_32(p);
603
604 *p += 4; /* skip length field (should match max) */
605 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
606 for (i = 0; i < map->max_osd; i++)
607 ceph_decode_addr(&map->osd_addr[i]);
608
609 /* pg_temp */
610 ceph_decode_32_safe(p, end, len, bad);
611 for (i = 0; i < len; i++) {
612 int n, j;
613 struct ceph_pg pgid;
614 struct ceph_pg_mapping *pg;
615
616 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
617 ceph_decode_copy(p, &pgid, sizeof(pgid));
618 n = ceph_decode_32(p);
619 ceph_decode_need(p, end, n * sizeof(u32), bad);
620 err = -ENOMEM;
621 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
622 if (!pg)
623 goto bad;
624 pg->pgid = pgid;
625 pg->len = n;
626 for (j = 0; j < n; j++)
627 pg->osds[j] = ceph_decode_32(p);
628
629 err = __insert_pg_mapping(pg, &map->pg_temp);
630 if (err)
631 goto bad;
632 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, n);
633 }
634
635 /* crush */
636 ceph_decode_32_safe(p, end, len, bad);
637 dout("osdmap_decode crush len %d from off 0x%x\n", len,
638 (int)(*p - start));
639 ceph_decode_need(p, end, len, bad);
640 map->crush = crush_decode(*p, end);
641 *p += len;
642 if (IS_ERR(map->crush)) {
643 err = PTR_ERR(map->crush);
644 map->crush = NULL;
645 goto bad;
646 }
647
648 /* ignore the rest of the map */
649 *p = end;
650
651 dout("osdmap_decode done %p %p\n", *p, end);
652 return map;
653
654bad:
655 dout("osdmap_decode fail\n");
656 ceph_osdmap_destroy(map);
657 return ERR_PTR(err);
658}
659
660/*
661 * decode and apply an incremental map update.
662 */
663struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
664 struct ceph_osdmap *map,
665 struct ceph_messenger *msgr)
666{
667 struct crush_map *newcrush = NULL;
668 struct ceph_fsid fsid;
669 u32 epoch = 0;
670 struct ceph_timespec modified;
671 u32 len, pool;
672 __s32 new_pool_max, new_flags, max;
673 void *start = *p;
674 int err = -EINVAL;
675 u16 version;
676 struct rb_node *rbp;
677
678 ceph_decode_16_safe(p, end, version, bad);
679 if (version > CEPH_OSDMAP_INC_VERSION) {
680 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
681 CEPH_OSDMAP_INC_VERSION);
682 goto bad;
683 }
684
685 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
686 bad);
687 ceph_decode_copy(p, &fsid, sizeof(fsid));
688 epoch = ceph_decode_32(p);
689 BUG_ON(epoch != map->epoch+1);
690 ceph_decode_copy(p, &modified, sizeof(modified));
691 new_pool_max = ceph_decode_32(p);
692 new_flags = ceph_decode_32(p);
693
694 /* full map? */
695 ceph_decode_32_safe(p, end, len, bad);
696 if (len > 0) {
697 dout("apply_incremental full map len %d, %p to %p\n",
698 len, *p, end);
699 return osdmap_decode(p, min(*p+len, end));
700 }
701
702 /* new crush? */
703 ceph_decode_32_safe(p, end, len, bad);
704 if (len > 0) {
705 dout("apply_incremental new crush map len %d, %p to %p\n",
706 len, *p, end);
707 newcrush = crush_decode(*p, min(*p+len, end));
708 if (IS_ERR(newcrush))
709 return ERR_CAST(newcrush);
710 }
711
712 /* new flags? */
713 if (new_flags >= 0)
714 map->flags = new_flags;
715 if (new_pool_max >= 0)
716 map->pool_max = new_pool_max;
717
718 ceph_decode_need(p, end, 5*sizeof(u32), bad);
719
720 /* new max? */
721 max = ceph_decode_32(p);
722 if (max >= 0) {
723 err = osdmap_set_max_osd(map, max);
724 if (err < 0)
725 goto bad;
726 }
727
728 map->epoch++;
729 map->modified = modified;
730 if (newcrush) {
731 if (map->crush)
732 crush_destroy(map->crush);
733 map->crush = newcrush;
734 newcrush = NULL;
735 }
736
737 /* new_pool */
738 ceph_decode_32_safe(p, end, len, bad);
739 while (len--) {
740 __u8 ev;
741 struct ceph_pg_pool_info *pi;
742
743 ceph_decode_32_safe(p, end, pool, bad);
744 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
745 ev = ceph_decode_8(p); /* encoding version */
746 if (ev > CEPH_PG_POOL_VERSION) {
747 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
748 ev, CEPH_PG_POOL_VERSION);
749 goto bad;
750 }
751 pi = __lookup_pg_pool(&map->pg_pools, pool);
752 if (!pi) {
753 pi = kzalloc(sizeof(*pi), GFP_NOFS);
754 if (!pi) {
755 err = -ENOMEM;
756 goto bad;
757 }
758 pi->id = pool;
759 __insert_pg_pool(&map->pg_pools, pi);
760 }
761 __decode_pool(p, pi);
762 }
763 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
764 goto bad;
765
766 /* old_pool */
767 ceph_decode_32_safe(p, end, len, bad);
768 while (len--) {
769 struct ceph_pg_pool_info *pi;
770
771 ceph_decode_32_safe(p, end, pool, bad);
772 pi = __lookup_pg_pool(&map->pg_pools, pool);
773 if (pi)
774 __remove_pg_pool(&map->pg_pools, pi);
775 }
776
777 /* new_up */
778 err = -EINVAL;
779 ceph_decode_32_safe(p, end, len, bad);
780 while (len--) {
781 u32 osd;
782 struct ceph_entity_addr addr;
783 ceph_decode_32_safe(p, end, osd, bad);
784 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
785 ceph_decode_addr(&addr);
786 pr_info("osd%d up\n", osd);
787 BUG_ON(osd >= map->max_osd);
788 map->osd_state[osd] |= CEPH_OSD_UP;
789 map->osd_addr[osd] = addr;
790 }
791
792 /* new_down */
793 ceph_decode_32_safe(p, end, len, bad);
794 while (len--) {
795 u32 osd;
796 ceph_decode_32_safe(p, end, osd, bad);
797 (*p)++; /* clean flag */
798 pr_info("osd%d down\n", osd);
799 if (osd < map->max_osd)
800 map->osd_state[osd] &= ~CEPH_OSD_UP;
801 }
802
803 /* new_weight */
804 ceph_decode_32_safe(p, end, len, bad);
805 while (len--) {
806 u32 osd, off;
807 ceph_decode_need(p, end, sizeof(u32)*2, bad);
808 osd = ceph_decode_32(p);
809 off = ceph_decode_32(p);
810 pr_info("osd%d weight 0x%x %s\n", osd, off,
811 off == CEPH_OSD_IN ? "(in)" :
812 (off == CEPH_OSD_OUT ? "(out)" : ""));
813 if (osd < map->max_osd)
814 map->osd_weight[osd] = off;
815 }
816
817 /* new_pg_temp */
818 rbp = rb_first(&map->pg_temp);
819 ceph_decode_32_safe(p, end, len, bad);
820 while (len--) {
821 struct ceph_pg_mapping *pg;
822 int j;
823 struct ceph_pg pgid;
824 u32 pglen;
825 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
826 ceph_decode_copy(p, &pgid, sizeof(pgid));
827 pglen = ceph_decode_32(p);
828
829 /* remove any? */
830 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
831 node)->pgid, pgid) <= 0) {
832 struct rb_node *cur = rbp;
833 rbp = rb_next(rbp);
834 dout(" removed pg_temp %llx\n",
835 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
836 node)->pgid);
837 rb_erase(cur, &map->pg_temp);
838 }
839
840 if (pglen) {
841 /* insert */
842 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
843 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
844 if (!pg) {
845 err = -ENOMEM;
846 goto bad;
847 }
848 pg->pgid = pgid;
849 pg->len = pglen;
850 for (j = 0; j < pglen; j++)
851 pg->osds[j] = ceph_decode_32(p);
852 err = __insert_pg_mapping(pg, &map->pg_temp);
853 if (err)
854 goto bad;
855 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
856 pglen);
857 }
858 }
859 while (rbp) {
860 struct rb_node *cur = rbp;
861 rbp = rb_next(rbp);
862 dout(" removed pg_temp %llx\n",
863 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
864 node)->pgid);
865 rb_erase(cur, &map->pg_temp);
866 }
867
868 /* ignore the rest */
869 *p = end;
870 return map;
871
872bad:
873 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
874 epoch, (int)(*p - start), *p, start, end);
875 print_hex_dump(KERN_DEBUG, "osdmap: ",
876 DUMP_PREFIX_OFFSET, 16, 1,
877 start, end - start, true);
878 if (newcrush)
879 crush_destroy(newcrush);
880 return ERR_PTR(err);
881}
882
883
884
885
886/*
887 * calculate file layout from given offset, length.
888 * fill in correct oid, logical length, and object extent
889 * offset, length.
890 *
891 * for now, we write only a single su, until we can
892 * pass a stride back to the caller.
893 */
894void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
895 u64 off, u64 *plen,
896 u64 *ono,
897 u64 *oxoff, u64 *oxlen)
898{
899 u32 osize = le32_to_cpu(layout->fl_object_size);
900 u32 su = le32_to_cpu(layout->fl_stripe_unit);
901 u32 sc = le32_to_cpu(layout->fl_stripe_count);
902 u32 bl, stripeno, stripepos, objsetno;
903 u32 su_per_object;
904 u64 t, su_offset;
905
906 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
907 osize, su);
908 su_per_object = osize / su;
909 dout("osize %u / su %u = su_per_object %u\n", osize, su,
910 su_per_object);
911
912 BUG_ON((su & ~PAGE_MASK) != 0);
913 /* bl = *off / su; */
914 t = off;
915 do_div(t, su);
916 bl = t;
917 dout("off %llu / su %u = bl %u\n", off, su, bl);
918
919 stripeno = bl / sc;
920 stripepos = bl % sc;
921 objsetno = stripeno / su_per_object;
922
923 *ono = objsetno * sc + stripepos;
924 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
925
926 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
927 t = off;
928 su_offset = do_div(t, su);
929 *oxoff = su_offset + (stripeno % su_per_object) * su;
930
931 /*
932 * Calculate the length of the extent being written to the selected
933 * object. This is the minimum of the full length requested (plen) or
934 * the remainder of the current stripe being written to.
935 */
936 *oxlen = min_t(u64, *plen, su - su_offset);
937 *plen = *oxlen;
938
939 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
940}
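/*
 * Editor's worked example (not part of this file): the mapping above redone
 * in user space with small, hypothetical layout values -- stripe_unit 64K,
 * stripe_count 3, object_size 256K, so su_per_object = 4.  Plain division
 * stands in for the kernel's do_div():
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t su = 65536, sc = 3, osize = 262144;
	uint64_t su_per_object = osize / su;		/* 4 */
	uint64_t off = 300000, plen = 1000000;

	uint64_t bl = off / su;				/* 4th stripe unit */
	uint64_t stripeno = bl / sc;			/* 1 */
	uint64_t stripepos = bl % sc;			/* 1 */
	uint64_t objsetno = stripeno / su_per_object;	/* 0 */
	uint64_t ono = objsetno * sc + stripepos;	/* object number 1 */

	uint64_t su_offset = off % su;			/* 37856 */
	uint64_t oxoff = su_offset + (stripeno % su_per_object) * su;
	uint64_t oxlen = plen < su - su_offset ? plen : su - su_offset;

	/* prints: ono 1 extent 103392~27680 */
	printf("ono %llu extent %llu~%llu\n", (unsigned long long)ono,
	       (unsigned long long)oxoff, (unsigned long long)oxlen);
	return 0;
}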
941
942/*
943 * calculate an object layout (i.e. pgid) from an oid,
944 * file_layout, and osdmap
945 */
946int ceph_calc_object_layout(struct ceph_object_layout *ol,
947 const char *oid,
948 struct ceph_file_layout *fl,
949 struct ceph_osdmap *osdmap)
950{
951 unsigned num, num_mask;
952 struct ceph_pg pgid;
953 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
954 int poolid = le32_to_cpu(fl->fl_pg_pool);
955 struct ceph_pg_pool_info *pool;
956 unsigned ps;
957
958 BUG_ON(!osdmap);
959
960 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
961 if (!pool)
962 return -EIO;
963 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
964 if (preferred >= 0) {
965 ps += preferred;
966 num = le32_to_cpu(pool->v.lpg_num);
967 num_mask = pool->lpg_num_mask;
968 } else {
969 num = le32_to_cpu(pool->v.pg_num);
970 num_mask = pool->pg_num_mask;
971 }
972
973 pgid.ps = cpu_to_le16(ps);
974 pgid.preferred = cpu_to_le16(preferred);
975 pgid.pool = fl->fl_pg_pool;
976 if (preferred >= 0)
977 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
978 (int)preferred);
979 else
980 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
981
982 ol->ol_pgid = pgid;
983 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
984 return 0;
985}
986
987/*
988 * Calculate raw osd vector for the given pgid. Return pointer to osd
989 * array, or NULL on failure.
990 */
991static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
992 int *osds, int *num)
993{
994 struct ceph_pg_mapping *pg;
995 struct ceph_pg_pool_info *pool;
996 int ruleno;
997 unsigned poolid, ps, pps;
998 int preferred;
999
1000 /* pg_temp? */
1001 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1002 if (pg) {
1003 *num = pg->len;
1004 return pg->osds;
1005 }
1006
1007 /* crush */
1008 poolid = le32_to_cpu(pgid.pool);
1009 ps = le16_to_cpu(pgid.ps);
1010 preferred = (s16)le16_to_cpu(pgid.preferred);
1011
1012 /* don't forcefeed bad device ids to crush */
1013 if (preferred >= osdmap->max_osd ||
1014 preferred >= osdmap->crush->max_devices)
1015 preferred = -1;
1016
1017 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1018 if (!pool)
1019 return NULL;
1020 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1021 pool->v.type, pool->v.size);
1022 if (ruleno < 0) {
1023 pr_err("no crush rule pool %d type %d size %d\n",
1024 poolid, pool->v.type, pool->v.size);
1025 return NULL;
1026 }
1027
1028 if (preferred >= 0)
1029 pps = ceph_stable_mod(ps,
1030 le32_to_cpu(pool->v.lpgp_num),
1031 pool->lpgp_num_mask);
1032 else
1033 pps = ceph_stable_mod(ps,
1034 le32_to_cpu(pool->v.pgp_num),
1035 pool->pgp_num_mask);
1036 pps += poolid;
1037 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1038 min_t(int, pool->v.size, *num),
1039 preferred, osdmap->osd_weight);
1040 return osds;
1041}
1042
1043/*
1044 * Return acting set for given pgid.
1045 */
1046int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1047 int *acting)
1048{
1049 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1050 int i, o, num = CEPH_PG_MAX_SIZE;
1051
1052 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1053 if (!osds)
1054 return -1;
1055
1056 /* primary is first up osd */
1057 o = 0;
1058 for (i = 0; i < num; i++)
1059 if (ceph_osd_is_up(osdmap, osds[i]))
1060 acting[o++] = osds[i];
1061 return o;
1062}
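/*
 * Editor's sketch (not part of this file): the acting set above is simply
 * the up members of the raw CRUSH result, in order, with the primary being
 * the first of those.  A user-space illustration with a made-up up-state
 * table:
 */
#include <stdio.h>

int main(void)
{
	int raw[] = { 4, 2, 7 };			/* hypothetical CRUSH output */
	int up[8] = { 0, 0, 1, 0, 0, 0, 0, 1 };	/* only osd2 and osd7 are up */
	int acting[3], o = 0, i;

	for (i = 0; i < 3; i++)
		if (up[raw[i]])
			acting[o++] = raw[i];

	/* prints: acting 2 7 primary 2 */
	printf("acting");
	for (i = 0; i < o; i++)
		printf(" %d", acting[i]);
	printf(" primary %d\n", o ? acting[0] : -1);
	return 0;
}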
1063
1064/*
1065 * Return primary osd for given pgid, or -1 if none.
1066 */
1067int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1068{
1069 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1070 int i, num = CEPH_PG_MAX_SIZE;
1071
1072 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1073 if (!osds)
1074 return -1;
1075
1076 /* primary is first up osd */
1077 for (i = 0; i < num; i++)
1078 if (ceph_osd_is_up(osdmap, osds[i]))
1079 return osds[i];
1080 return -1;
1081}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..970b547e510d
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,128 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
27};
28
29struct ceph_pg_mapping {
30 struct rb_node node;
31 struct ceph_pg pgid;
32 int len;
33 int osds[];
34};
35
36struct ceph_osdmap {
37 struct ceph_fsid fsid;
38 u32 epoch;
39 u32 mkfs_epoch;
40 struct ceph_timespec created, modified;
41
42 u32 flags; /* CEPH_OSDMAP_* */
43
44 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
45 u8 *osd_state; /* CEPH_OSD_* */
46 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
47 struct ceph_entity_addr *osd_addr;
48
49 struct rb_root pg_temp;
50 struct rb_root pg_pools;
51 u32 pool_max;
52
53 /* the CRUSH map specifies the mapping of placement groups to
54 * the list of osds that store+replicate them. */
55 struct crush_map *crush;
56};
57
58/*
59 * file layout helpers
60 */
61#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
62#define ceph_file_layout_stripe_count(l) \
63 ((__s32)le32_to_cpu((l).fl_stripe_count))
64#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
65#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
66#define ceph_file_layout_object_su(l) \
67 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
68#define ceph_file_layout_pg_preferred(l) \
69 ((__s32)le32_to_cpu((l).fl_pg_preferred))
70#define ceph_file_layout_pg_pool(l) \
71 ((__s32)le32_to_cpu((l).fl_pg_pool))
72
73static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
74{
75 return le32_to_cpu(l->fl_stripe_unit) *
76 le32_to_cpu(l->fl_stripe_count);
77}
78
79/* "period" == bytes before i start on a new set of objects */
80static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
81{
82 return le32_to_cpu(l->fl_object_size) *
83 le32_to_cpu(l->fl_stripe_count);
84}
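/*
 * Editor's worked example (hypothetical numbers, not part of this file):
 * with stripe_unit 64K, stripe_count 4 and object_size 4M, stripe_width =
 * 64K * 4 = 256K and period = 4M * 4 = 16M; after 16M of file data the
 * striping starts over on a fresh set of 4 objects.
 */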
85
86
87static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
88{
89 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
90}
91
92static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
93{
94 return map && (map->flags & flag);
95}
96
97extern char *ceph_osdmap_state_str(char *str, int len, int state);
98
99static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
100 int osd)
101{
102 if (osd >= map->max_osd)
103 return NULL;
104 return &map->osd_addr[osd];
105}
106
107extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
108extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
109 struct ceph_osdmap *map,
110 struct ceph_messenger *msgr);
111extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
112
113/* calculate mapping of a file extent to an object */
114extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
115 u64 off, u64 *plen,
116 u64 *bno, u64 *oxoff, u64 *oxlen);
117
118/* calculate mapping of object to a placement group */
119extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
120 const char *oid,
121 struct ceph_file_layout *fl,
122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
124 int *acting);
125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
126 struct ceph_pg pgid);
127
128#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..5f8dbf7c745a
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,55 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8int ceph_pagelist_release(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail);
12 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page,
14 lru);
15 list_del(&page->lru);
16 __free_page(page);
17 }
18 return 0;
19}
20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{
23 struct page *page = alloc_page(GFP_NOFS);
24 if (!page)
25 return -ENOMEM;
26 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail);
30 pl->mapped_tail = kmap(page);
31 return 0;
32}
33
34int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
35{
36 while (pl->room < len) {
37 size_t bit = pl->room;
38 int ret;
39
40 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
41 buf, bit);
42 pl->length += bit;
43 pl->room -= bit;
44 buf += bit;
45 len -= bit;
46 ret = ceph_pagelist_addpage(pl);
47 if (ret)
48 return ret;
49 }
50
51 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
52 pl->length += len;
53 pl->room -= len;
54 return 0;
55}
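/*
 * Editor's sketch (not part of this file): intended use of the pagelist API
 * above -- init, append little-endian encoded data via the helpers declared
 * in pagelist.h, hand pl.head to an outgoing message, then release.  A
 * kernel-context sketch, not buildable standalone:
 */
static int example_build_payload(void)
{
	struct ceph_pagelist pl;
	int err;

	ceph_pagelist_init(&pl);
	err = ceph_pagelist_encode_32(&pl, 42);		/* __le32 on the wire */
	if (!err)
		err = ceph_pagelist_encode_string(&pl, "hello", 5);
	/* ... attach the pages on pl.head to a ceph_msg here ... */
	ceph_pagelist_release(&pl);
	return err;
}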
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..fd56451a871f
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,377 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
18
19/*
20 * fs id
21 */
22struct ceph_fsid {
23 unsigned char fsid[16];
24};
25
26static inline int ceph_fsid_compare(const struct ceph_fsid *a,
27 const struct ceph_fsid *b)
28{
29 return memcmp(a, b, sizeof(*a));
30}
31
32/*
33 * ino, object, etc.
34 */
35typedef __le64 ceph_snapid_t;
36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
39
40struct ceph_timespec {
41 __le32 tv_sec;
42 __le32 tv_nsec;
43} __attribute__ ((packed));
44
45
46/*
47 * object layout - how objects are mapped into PGs
48 */
49#define CEPH_OBJECT_LAYOUT_HASH 1
50#define CEPH_OBJECT_LAYOUT_LINEAR 2
51#define CEPH_OBJECT_LAYOUT_HASHINO 3
52
53/*
54 * pg layout -- how PGs are mapped onto (sets of) OSDs
55 */
56#define CEPH_PG_LAYOUT_CRUSH 0
57#define CEPH_PG_LAYOUT_HASH 1
58#define CEPH_PG_LAYOUT_LINEAR 2
59#define CEPH_PG_LAYOUT_HYBRID 3
60
61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
62
63/*
64 * placement group.
65 * we encode this into one __le64.
66 */
67struct ceph_pg {
68 __le16 preferred; /* preferred primary osd */
69 __le16 ps; /* placement seed */
70 __le32 pool; /* object pool */
71} __attribute__ ((packed));
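/*
 * Editor's sketch (not part of this file): the packed struct above is
 * exactly 8 bytes (16 + 16 + 32 bits), which is why code elsewhere prints
 * a pgid as *(u64 *)&pgid.  A user-space size check (memcpy avoids the
 * aliasing cast):
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct pg { uint16_t preferred, ps; uint32_t pool; } __attribute__((packed));

int main(void)
{
	struct pg pgid = { .preferred = 0xffff, .ps = 1, .pool = 2 };
	uint64_t raw;

	_Static_assert(sizeof(struct pg) == 8, "pg must pack into one u64");
	memcpy(&raw, &pgid, sizeof(raw));	/* byte order is host-dependent */
	printf("pgid raw 0x%llx\n", (unsigned long long)raw);
	return 0;
}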
72
73/*
74 * pg_pool is a set of pgs storing a pool of objects
75 *
76 * pg_num -- base number of pseudorandomly placed pgs
77 *
78 * pgp_num -- effective number when calculating pg placement. this
79 * is used for pg_num increases. new pgs result in data being "split"
80 * into new pgs. for this to proceed smoothly, new pgs are initially
81 * colocated with their parents; that is, pgp_num doesn't increase
82 * until the new pgs have successfully split. only _then_ are the new
83 * pgs placed independently.
84 *
85 * lpg_num -- localized pg count (per device). replicas are randomly
86 * selected.
87 *
88 * lpgp_num -- as above.
89 */
90#define CEPH_PG_TYPE_REP 1
91#define CEPH_PG_TYPE_RAID4 2
92#define CEPH_PG_POOL_VERSION 2
93struct ceph_pg_pool {
94 __u8 type; /* CEPH_PG_TYPE_* */
95 __u8 size; /* number of osds in each pg */
96 __u8 crush_ruleset; /* crush placement rule */
97 __u8 object_hash; /* hash mapping object name to ps */
98 __le32 pg_num, pgp_num; /* number of pg's */
99 __le32 lpg_num, lpgp_num; /* number of localized pg's */
100 __le32 last_change; /* most recent epoch changed */
101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals;
105 __le64 uid;
106} __attribute__ ((packed));
107
108/*
109 * stable_mod func is used to control number of placement groups.
110 * similar to straight-up modulo, but produces a stable mapping as b
111 * increases over time. b is the number of bins, and bmask is the
112 * containing power of 2 minus 1.
113 *
114 * b <= bmask and bmask=(2**n)-1
115 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
116 */
117static inline int ceph_stable_mod(int x, int b, int bmask)
118{
119 if ((x & bmask) < b)
120 return x & bmask;
121 else
122 return x & (bmask >> 1);
123}
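/*
 * Editor's sketch (not part of this file): ceph_stable_mod() above in user
 * space.  With b=12 (bmask=15), x=13 exceeds b and folds into the lower
 * half: 13 & 7 = 5.  Growing b to 14 moves only x=13; every other mapping
 * is left stable:
 */
#include <stdio.h>

static int stable_mod(int x, int b, int bmask)
{
	return (x & bmask) < b ? x & bmask : x & (bmask >> 1);
}

int main(void)
{
	/* prints: b=12: 5 5   b=14: 5 13 */
	printf("b=12: %d %d   b=14: %d %d\n",
	       stable_mod(5, 12, 15), stable_mod(13, 12, 15),
	       stable_mod(5, 14, 15), stable_mod(13, 14, 15));
	return 0;
}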
124
125/*
126 * object layout - how a given object should be stored.
127 */
128struct ceph_object_layout {
129 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
130 __le32 ol_stripe_unit; /* for per-object parity, if any */
131} __attribute__ ((packed));
132
133/*
134 * compound epoch+version, used by storage layer to serialize mutations
135 */
136struct ceph_eversion {
137 __le32 epoch;
138 __le64 version;
139} __attribute__ ((packed));
140
141/*
142 * osd map bits
143 */
144
145/* status bits */
146#define CEPH_OSD_EXISTS 1
147#define CEPH_OSD_UP 2
148
149/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
150#define CEPH_OSD_IN 0x10000
151#define CEPH_OSD_OUT 0
152
153
154/*
155 * osd map flag bits
156 */
157#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
158#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
159#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
160#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
161#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
162
163/*
164 * osd ops
165 */
166#define CEPH_OSD_OP_MODE 0xf000
167#define CEPH_OSD_OP_MODE_RD 0x1000
168#define CEPH_OSD_OP_MODE_WR 0x2000
169#define CEPH_OSD_OP_MODE_RMW 0x3000
170#define CEPH_OSD_OP_MODE_SUB 0x4000
171
172#define CEPH_OSD_OP_TYPE 0x0f00
173#define CEPH_OSD_OP_TYPE_LOCK 0x0100
174#define CEPH_OSD_OP_TYPE_DATA 0x0200
175#define CEPH_OSD_OP_TYPE_ATTR 0x0300
176#define CEPH_OSD_OP_TYPE_EXEC 0x0400
177#define CEPH_OSD_OP_TYPE_PG 0x0500
178
179enum {
180 /** data **/
181 /* read */
182 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
183 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
184
185 /* fancy read */
186 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
187
188 /* write */
189 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
190 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
191 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
192 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
193 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
194
195 /* fancy write */
196 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
197 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
198 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
199 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
200
201 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
202 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
204
205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206
207 /** attrs **/
208 /* read */
209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
211
212 /* write */
213 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
214 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
215 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
216 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
217
218 /** subop **/
219 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
220 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
221 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
222 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
223 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
224
225 /** lock **/
226 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
227 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
228 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
229 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
230 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
231 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
232
233 /** exec **/
234 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
235
236 /** pg **/
237 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
238};
239
240static inline int ceph_osd_op_type_lock(int op)
241{
242 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
243}
244static inline int ceph_osd_op_type_data(int op)
245{
246 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
247}
248static inline int ceph_osd_op_type_attr(int op)
249{
250 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
251}
252static inline int ceph_osd_op_type_exec(int op)
253{
254 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
255}
256static inline int ceph_osd_op_type_pg(int op)
257{
258 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
259}
260
261static inline int ceph_osd_op_mode_subop(int op)
262{
263 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
264}
265static inline int ceph_osd_op_mode_read(int op)
266{
267 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
268}
269static inline int ceph_osd_op_mode_modify(int op)
270{
271 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
272}
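/*
 * Editor's sketch (not part of this file): each op word above packs a mode
 * nibble (0xf000), a type nibble (0x0f00) and a small op index, which is
 * what the inline predicates test.  Decoding CEPH_OSD_OP_WRITE
 * (0x2000 | 0x0200 | 1 = 0x2201) in user space:
 */
#include <stdio.h>

int main(void)
{
	int op = 0x2000 | 0x0200 | 1;	/* CEPH_OSD_OP_WRITE */

	/* prints: op 0x2201 mode 0x2000 (WR) type 0x200 (DATA) */
	printf("op 0x%x mode 0x%x (%s) type 0x%x (%s)\n", op,
	       op & 0xf000, (op & 0xf000) == 0x2000 ? "WR" : "?",
	       op & 0x0f00, (op & 0x0f00) == 0x0200 ? "DATA" : "?");
	return 0;
}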
273
274#define CEPH_OSD_TMAP_HDR 'h'
275#define CEPH_OSD_TMAP_SET 's'
276#define CEPH_OSD_TMAP_RM 'r'
277
278extern const char *ceph_osd_op_name(int op);
279
280
281/*
282 * osd op flags
283 *
284 * An op may be READ, WRITE, or READ|WRITE.
285 */
286enum {
287 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
288 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
289 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
290 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
291 CEPH_OSD_FLAG_READ = 16, /* op may read */
292 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
293 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
294 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
295 CEPH_OSD_FLAG_BALANCE_READS = 256,
296 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
297 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
298 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
299};
300
301enum {
302 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
303};
304
305#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc */
306#define EBLACKLISTED ESHUTDOWN /* blacklisted */
307
308/*
309 * an individual object operation. each may be accompanied by some data
310 * payload
311 */
312struct ceph_osd_op {
313 __le16 op; /* CEPH_OSD_OP_* */
314 __le32 flags; /* CEPH_OSD_FLAG_* */
315 union {
316 struct {
317 __le64 offset, length;
318 __le64 truncate_size;
319 __le32 truncate_seq;
320 } __attribute__ ((packed)) extent;
321 struct {
322 __le32 name_len;
323 __le32 value_len;
324 } __attribute__ ((packed)) xattr;
325 struct {
326 __u8 class_len;
327 __u8 method_len;
328 __u8 argc;
329 __le32 indata_len;
330 } __attribute__ ((packed)) cls;
331 struct {
332 __le64 cookie, count;
333 } __attribute__ ((packed)) pgls;
334 };
335 __le32 payload_len;
336} __attribute__ ((packed));
337
338/*
339 * osd request message header. each request may include multiple
340 * ceph_osd_op object operations.
341 */
342struct ceph_osd_request_head {
343 __le32 client_inc; /* client incarnation */
344 struct ceph_object_layout layout; /* pgid */
345 __le32 osdmap_epoch; /* client's osdmap epoch */
346
347 __le32 flags;
348
349 struct ceph_timespec mtime; /* for mutations only */
350 struct ceph_eversion reassert_version; /* if we are replaying op */
351
352 __le32 object_len; /* length of object name */
353
354 __le64 snapid; /* snapid to read */
355 __le64 snap_seq; /* writer's snap context */
356 __le32 num_snaps;
357
358 __le16 num_ops;
359 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
360} __attribute__ ((packed));
361
362struct ceph_osd_reply_head {
363 __le32 client_inc; /* client incarnation */
364 __le32 flags;
365 struct ceph_object_layout layout;
366 __le32 osdmap_epoch;
367 struct ceph_eversion reassert_version; /* for replaying uncommitted */
368
369 __le32 result; /* result code */
370
371 __le32 object_len; /* length of object name */
372 __le32 num_ops;
373 struct ceph_osd_op ops[0]; /* ops[], object */
374} __attribute__ ((packed));
375
376
377#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..d5114db70453
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,911 @@
1#include "ceph_debug.h"
2
3#include <linux/sort.h>
4#include <linux/slab.h>
5
6#include "super.h"
7#include "decode.h"
8
9/*
10 * Snapshots in ceph are driven in large part by cooperation from the
11 * client. In contrast to local file systems or file servers that
12 * implement snapshots at a single point in the system, ceph's
13 * distributed access to storage requires clients to help decide
14 * whether a write logically occurs before or after a recently created
15 * snapshot.
16 *
17 * This provides a perfect instantaneous client-wide snapshot. Between
18 * clients, however, snapshots may appear to be applied at slightly
19 * different points in time, depending on delays in delivering the
20 * snapshot notification.
21 *
22 * Snapshots are _not_ file system-wide. Instead, each snapshot
23 * applies to the subdirectory nested beneath some directory. This
24 * effectively divides the hierarchy into multiple "realms," where all
25 * of the files contained by each realm share the same set of
26 * snapshots. An individual realm's snap set contains snapshots
27 * explicitly created on that realm, as well as any snaps in its
28 * parent's snap set _after_ the point at which the parent became its
29 * parent (due to, say, a rename). Similarly, snaps from prior parents
30 * are included for the time intervals during which they were the parent.
31 *
32 * The client is spared most of this detail, fortunately... it need only
33 * maintain a hierarchy of realms reflecting the current parent/child
34 * realm relationship, and for each realm has an explicit list of snaps
35 * inherited from prior parents.
36 *
37 * A snap_realm struct is maintained for realms containing every inode
38 * with an open cap in the system. (The needed snap realm information is
39 * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
40 * version number is used to ensure that as realm parameters change (new
41 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
42 *
43 * The realm hierarchy drives the generation of a 'snap context' for each
44 * realm, which simply lists the resulting set of snaps for the realm. This
45 * is attached to any writes sent to OSDs.
46 */
47/*
48 * Unfortunately error handling is a bit mixed here. If we get a snap
49 * update, but don't have enough memory to update our realm hierarchy,
50 * it's not clear what we can do about it (besides complaining to the
51 * console).
52 */
53
54
55/*
56 * increase ref count for the realm
57 *
58 * caller must hold snap_rwsem for write.
59 */
60void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
61 struct ceph_snap_realm *realm)
62{
63 dout("get_realm %p %d -> %d\n", realm,
64 atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
65 /*
66 * since we _only_ increment realm refs or empty the empty
67 * list with snap_rwsem held, adjusting the empty list here is
68 * safe. we do need to protect against concurrent empty list
69 * additions, however.
70 */
71 if (atomic_read(&realm->nref) == 0) {
72 spin_lock(&mdsc->snap_empty_lock);
73 list_del_init(&realm->empty_item);
74 spin_unlock(&mdsc->snap_empty_lock);
75 }
76
77 atomic_inc(&realm->nref);
78}
79
80static void __insert_snap_realm(struct rb_root *root,
81 struct ceph_snap_realm *new)
82{
83 struct rb_node **p = &root->rb_node;
84 struct rb_node *parent = NULL;
85 struct ceph_snap_realm *r = NULL;
86
87 while (*p) {
88 parent = *p;
89 r = rb_entry(parent, struct ceph_snap_realm, node);
90 if (new->ino < r->ino)
91 p = &(*p)->rb_left;
92 else if (new->ino > r->ino)
93 p = &(*p)->rb_right;
94 else
95 BUG();
96 }
97
98 rb_link_node(&new->node, parent, p);
99 rb_insert_color(&new->node, root);
100}
101
102/*
103 * create and get the realm rooted at @ino and bump its ref count.
104 *
105 * caller must hold snap_rwsem for write.
106 */
107static struct ceph_snap_realm *ceph_create_snap_realm(
108 struct ceph_mds_client *mdsc,
109 u64 ino)
110{
111 struct ceph_snap_realm *realm;
112
113 realm = kzalloc(sizeof(*realm), GFP_NOFS);
114 if (!realm)
115 return ERR_PTR(-ENOMEM);
116
117 atomic_set(&realm->nref, 0); /* tree does not take a ref */
118 realm->ino = ino;
119 INIT_LIST_HEAD(&realm->children);
120 INIT_LIST_HEAD(&realm->child_item);
121 INIT_LIST_HEAD(&realm->empty_item);
122 INIT_LIST_HEAD(&realm->inodes_with_caps);
123 spin_lock_init(&realm->inodes_with_caps_lock);
124 __insert_snap_realm(&mdsc->snap_realms, realm);
125 dout("create_snap_realm %llx %p\n", realm->ino, realm);
126 return realm;
127}
128
129/*
130 * lookup the realm rooted at @ino.
131 *
132 * caller must hold snap_rwsem for write.
133 */
134struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
135 u64 ino)
136{
137 struct rb_node *n = mdsc->snap_realms.rb_node;
138 struct ceph_snap_realm *r;
139
140 while (n) {
141 r = rb_entry(n, struct ceph_snap_realm, node);
142 if (ino < r->ino)
143 n = n->rb_left;
144 else if (ino > r->ino)
145 n = n->rb_right;
146 else {
147 dout("lookup_snap_realm %llx %p\n", r->ino, r);
148 return r;
149 }
150 }
151 return NULL;
152}
153
154static void __put_snap_realm(struct ceph_mds_client *mdsc,
155 struct ceph_snap_realm *realm);
156
157/*
158 * called with snap_rwsem (write)
159 */
160static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
161 struct ceph_snap_realm *realm)
162{
163 dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
164
165 rb_erase(&realm->node, &mdsc->snap_realms);
166
167 if (realm->parent) {
168 list_del_init(&realm->child_item);
169 __put_snap_realm(mdsc, realm->parent);
170 }
171
172 kfree(realm->prior_parent_snaps);
173 kfree(realm->snaps);
174 ceph_put_snap_context(realm->cached_context);
175 kfree(realm);
176}
177
178/*
179 * caller holds snap_rwsem (write)
180 */
181static void __put_snap_realm(struct ceph_mds_client *mdsc,
182 struct ceph_snap_realm *realm)
183{
184 dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
185 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
186 if (atomic_dec_and_test(&realm->nref))
187 __destroy_snap_realm(mdsc, realm);
188}
189
190/*
191 * caller needn't hold any locks
192 */
193void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
194 struct ceph_snap_realm *realm)
195{
196 dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
197 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
198 if (!atomic_dec_and_test(&realm->nref))
199 return;
200
201 if (down_write_trylock(&mdsc->snap_rwsem)) {
202 __destroy_snap_realm(mdsc, realm);
203 up_write(&mdsc->snap_rwsem);
204 } else {
205 spin_lock(&mdsc->snap_empty_lock);
206 list_add(&mdsc->snap_empty, &realm->empty_item);
207 spin_unlock(&mdsc->snap_empty_lock);
208 }
209}
210
211/*
212 * Clean up any realms whose ref counts have dropped to zero. Note
213 * that this does not include realms that were created but not yet
214 * used.
215 *
216 * Called under snap_rwsem (write)
217 */
218static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
219{
220 struct ceph_snap_realm *realm;
221
222 spin_lock(&mdsc->snap_empty_lock);
223 while (!list_empty(&mdsc->snap_empty)) {
224 realm = list_first_entry(&mdsc->snap_empty,
225 struct ceph_snap_realm, empty_item);
226 list_del(&realm->empty_item);
227 spin_unlock(&mdsc->snap_empty_lock);
228 __destroy_snap_realm(mdsc, realm);
229 spin_lock(&mdsc->snap_empty_lock);
230 }
231 spin_unlock(&mdsc->snap_empty_lock);
232}
233
234void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
235{
236 down_write(&mdsc->snap_rwsem);
237 __cleanup_empty_realms(mdsc);
238 up_write(&mdsc->snap_rwsem);
239}
240
241/*
242 * adjust the parent realm of a given @realm. adjust child list, and parent
243 * pointers, and ref counts appropriately.
244 *
245 * return 1 if parent was changed, 0 if unchanged, <0 on error.
246 *
247 * caller must hold snap_rwsem for write.
248 */
249static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
250 struct ceph_snap_realm *realm,
251 u64 parentino)
252{
253 struct ceph_snap_realm *parent;
254
255 if (realm->parent_ino == parentino)
256 return 0;
257
258 parent = ceph_lookup_snap_realm(mdsc, parentino);
259 if (!parent) {
260 parent = ceph_create_snap_realm(mdsc, parentino);
261 if (IS_ERR(parent))
262 return PTR_ERR(parent);
263 }
264 dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
265 realm->ino, realm, realm->parent_ino, realm->parent,
266 parentino, parent);
267 if (realm->parent) {
268 list_del_init(&realm->child_item);
269 ceph_put_snap_realm(mdsc, realm->parent);
270 }
271 realm->parent_ino = parentino;
272 realm->parent = parent;
273 ceph_get_snap_realm(mdsc, parent);
274 list_add(&realm->child_item, &parent->children);
275 return 1;
276}
277
278
279static int cmpu64_rev(const void *a, const void *b)
280{
281 if (*(u64 *)a < *(u64 *)b)
282 return 1;
283 if (*(u64 *)a > *(u64 *)b)
284 return -1;
285 return 0;
286}
287
288/*
289 * build the snap context for a given realm.
290 */
291static int build_snap_context(struct ceph_snap_realm *realm)
292{
293 struct ceph_snap_realm *parent = realm->parent;
294 struct ceph_snap_context *snapc;
295 int err = 0;
296 int i;
297 int num = realm->num_prior_parent_snaps + realm->num_snaps;
298
299 /*
300 * build parent context, if it hasn't been built.
301 * conservatively estimate that all parent snaps might be
302 * included by us.
303 */
304 if (parent) {
305 if (!parent->cached_context) {
306 err = build_snap_context(parent);
307 if (err)
308 goto fail;
309 }
310 num += parent->cached_context->num_snaps;
311 }
312
313 /* do i actually need to update? not if my context seq
314 matches realm seq, and my parents' does too. (this works
315 because rebuild_snap_realms() works _downward_ in the
316 hierarchy after each update.) */
317 if (realm->cached_context &&
318 realm->cached_context->seq == realm->seq &&
319 (!parent ||
320 realm->cached_context->seq >= parent->cached_context->seq)) {
321 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
322 " (unchanged)\n",
323 realm->ino, realm, realm->cached_context,
324 realm->cached_context->seq,
325 realm->cached_context->num_snaps);
326 return 0;
327 }
328
329 /* alloc new snap context */
330 err = -ENOMEM;
331 if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
332 goto fail;
333 snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
334 if (!snapc)
335 goto fail;
336 atomic_set(&snapc->nref, 1);
337
338 /* build (reverse sorted) snap vector */
339 num = 0;
340 snapc->seq = realm->seq;
341 if (parent) {
342 /* include any of parent's snaps occurring _after_ my
343 parent became my parent */
344 for (i = 0; i < parent->cached_context->num_snaps; i++)
345 if (parent->cached_context->snaps[i] >=
346 realm->parent_since)
347 snapc->snaps[num++] =
348 parent->cached_context->snaps[i];
349 if (parent->cached_context->seq > snapc->seq)
350 snapc->seq = parent->cached_context->seq;
351 }
352 memcpy(snapc->snaps + num, realm->snaps,
353 sizeof(u64)*realm->num_snaps);
354 num += realm->num_snaps;
355 memcpy(snapc->snaps + num, realm->prior_parent_snaps,
356 sizeof(u64)*realm->num_prior_parent_snaps);
357 num += realm->num_prior_parent_snaps;
358
359 sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
360 snapc->num_snaps = num;
361 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
362 realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
363
364 if (realm->cached_context)
365 ceph_put_snap_context(realm->cached_context);
366 realm->cached_context = snapc;
367 return 0;
368
369fail:
370 /*
371 * if we fail, clear old (incorrect) cached_context... hopefully
372 * we'll have better luck building it later
373 */
374 if (realm->cached_context) {
375 ceph_put_snap_context(realm->cached_context);
376 realm->cached_context = NULL;
377 }
378 pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
379 realm, err);
380 return err;
381}
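/*
 * Editor's worked example (hypothetical snapids, not part of this file):
 * if the parent's snaps are [8,5,2] with parent_since=4, only 8 and 5 are
 * inherited; with own snaps [6] and prior-parent snaps [3], the reverse
 * sort yields the snap context [8,6,5,3].
 */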
382
383/*
384 * rebuild snap context for the given realm and all of its children.
385 */
386static void rebuild_snap_realms(struct ceph_snap_realm *realm)
387{
388 struct ceph_snap_realm *child;
389
390 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
391 build_snap_context(realm);
392
393 list_for_each_entry(child, &realm->children, child_item)
394 rebuild_snap_realms(child);
395}
396
397
398/*
399 * helper to allocate and decode an array of snapids. free prior
400 * instance, if any.
401 */
402static int dup_array(u64 **dst, __le64 *src, int num)
403{
404 int i;
405
406 kfree(*dst);
407 if (num) {
408 *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
409 if (!*dst)
410 return -ENOMEM;
411 for (i = 0; i < num; i++)
412 (*dst)[i] = get_unaligned_le64(src + i);
413 } else {
414 *dst = NULL;
415 }
416 return 0;
417}
418
419
420/*
421 * When a snapshot is applied, the size/mtime inode metadata is queued
422 * in a ceph_cap_snap (one for each snapshot) until writeback
423 * completes and the metadata can be flushed back to the MDS.
424 *
425 * However, if a (sync) write is currently in-progress when we apply
426 * the snapshot, we have to wait until the write succeeds or fails
427 * (and a final size/mtime is known). In this case the
428 * cap_snap->writing = 1, and is said to be "pending." When the write
429 * finishes, we __ceph_finish_cap_snap().
430 *
431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
432 * change).
433 */
434void ceph_queue_cap_snap(struct ceph_inode_info *ci)
435{
436 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap;
438 int used;
439
440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
441 if (!capsnap) {
442 pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
443 return;
444 }
445
446 spin_lock(&inode->i_lock);
447 used = __ceph_caps_used(ci);
448 if (__ceph_have_pending_cap_snap(ci)) {
449 /* there is no point in queuing multiple "pending" cap_snaps,
450 as no new writes are allowed to start when pending, so any
451 writes in progress now were started before the previous
452 cap_snap. lucky us. */
453 dout("queue_cap_snap %p already pending\n", inode);
454 kfree(capsnap);
455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc;
457
458 igrab(inode);
459
460 atomic_set(&capsnap->nref, 1);
461 capsnap->ci = ci;
462 INIT_LIST_HEAD(&capsnap->ci_item);
463 INIT_LIST_HEAD(&capsnap->flushing_item);
464
465 capsnap->follows = snapc->seq - 1;
466 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci);
468
469 capsnap->mode = inode->i_mode;
470 capsnap->uid = inode->i_uid;
471 capsnap->gid = inode->i_gid;
472
473 /* fixme? */
474 capsnap->xattr_blob = NULL;
475 capsnap->xattr_len = 0;
476
477 /* dirty page count moved from _head to this cap_snap;
478 all subsequent page dirties occur _after_ this
479 snapshot. */
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0;
482 capsnap->context = snapc;
483 ci->i_head_snapc = NULL;
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485
486 if (used & CEPH_CAP_FILE_WR) {
487 dout("queue_cap_snap %p cap_snap %p snapc %p"
488 " seq %llu used WR, now pending\n", inode,
489 capsnap, snapc, snapc->seq);
490 capsnap->writing = 1;
491 } else {
492 /* note mtime, size NOW. */
493 __ceph_finish_cap_snap(ci, capsnap);
494 }
495 } else {
496 dout("queue_cap_snap %p nothing dirty|writing\n", inode);
497 kfree(capsnap);
498 }
499
500 spin_unlock(&inode->i_lock);
501}
502
503/*
504 * Finalize the size, mtime for a cap_snap; that is, settle on final values
505 * to be used for the snapshot, to be flushed back to the mds.
506 *
507 * If capsnap can now be flushed, add to snap_flush list, and return 1.
508 *
509 * Caller must hold i_lock.
510 */
511int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap)
513{
514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
516
517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size;
519 capsnap->mtime = inode->i_mtime;
520 capsnap->atime = inode->i_atime;
521 capsnap->ctime = inode->i_ctime;
522 capsnap->time_warp_seq = ci->i_time_warp_seq;
523 if (capsnap->dirty_pages) {
524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
525 "still has %d dirty pages\n", inode, capsnap,
526 capsnap->context, capsnap->context->seq,
527 ceph_cap_string(capsnap->dirty), capsnap->size,
528 capsnap->dirty_pages);
529 return 0;
530 }
531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
532 inode, capsnap, capsnap->context,
533 capsnap->context->seq, ceph_cap_string(capsnap->dirty),
534 capsnap->size);
535
536 spin_lock(&mdsc->snap_flush_lock);
537 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
538 spin_unlock(&mdsc->snap_flush_lock);
539 return 1; /* caller may want to ceph_flush_snaps */
540}
541
542
543/*
544 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
545 * the snap realm parameters from a given realm and all of its ancestors,
546 * up to the root.
547 *
548 * Caller must hold snap_rwsem for write.
549 */
550int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
551 void *p, void *e, bool deletion)
552{
553 struct ceph_mds_snap_realm *ri; /* encoded */
554 __le64 *snaps; /* encoded */
555 __le64 *prior_parent_snaps; /* encoded */
556 struct ceph_snap_realm *realm;
557 int invalidate = 0;
558 int err = -ENOMEM;
559
560 dout("update_snap_trace deletion=%d\n", deletion);
561more:
562 ceph_decode_need(&p, e, sizeof(*ri), bad);
563 ri = p;
564 p += sizeof(*ri);
565 ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
566 le32_to_cpu(ri->num_prior_parent_snaps)), bad);
567 snaps = p;
568 p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
569 prior_parent_snaps = p;
570 p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
571
572 realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
573 if (!realm) {
574 realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
575 if (IS_ERR(realm)) {
576 err = PTR_ERR(realm);
577 goto fail;
578 }
579 }
580
581 if (le64_to_cpu(ri->seq) > realm->seq) {
582 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
583 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
584 /*
585 * if the realm seq has changed, queue a cap_snap for every
586 * inode with open caps. we do this _before_ we update
587 * the realm info so that we prepare for writeback under the
588 * _previous_ snap context.
589 *
590 * ...unless it's a snap deletion!
591 */
592 if (!deletion) {
593 struct ceph_inode_info *ci;
594 struct inode *lastinode = NULL;
595
596 spin_lock(&realm->inodes_with_caps_lock);
597 list_for_each_entry(ci, &realm->inodes_with_caps,
598 i_snap_realm_item) {
599 struct inode *inode = igrab(&ci->vfs_inode);
600 if (!inode)
601 continue;
602 spin_unlock(&realm->inodes_with_caps_lock);
603 if (lastinode)
604 iput(lastinode);
605 lastinode = inode;
606 ceph_queue_cap_snap(ci);
607 spin_lock(&realm->inodes_with_caps_lock);
608 }
609 spin_unlock(&realm->inodes_with_caps_lock);
610 if (lastinode)
611 iput(lastinode);
612 dout("update_snap_trace cap_snaps queued\n");
613 }
614
615 } else {
616 dout("update_snap_trace %llx %p seq %lld unchanged\n",
617 realm->ino, realm, realm->seq);
618 }
619
620 /* ensure the parent is correct */
621 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
622 if (err < 0)
623 goto fail;
624 invalidate += err;
625
626 if (le64_to_cpu(ri->seq) > realm->seq) {
627 /* update realm parameters, snap lists */
628 realm->seq = le64_to_cpu(ri->seq);
629 realm->created = le64_to_cpu(ri->created);
630 realm->parent_since = le64_to_cpu(ri->parent_since);
631
632 realm->num_snaps = le32_to_cpu(ri->num_snaps);
633 err = dup_array(&realm->snaps, snaps, realm->num_snaps);
634 if (err < 0)
635 goto fail;
636
637 realm->num_prior_parent_snaps =
638 le32_to_cpu(ri->num_prior_parent_snaps);
639 err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
640 realm->num_prior_parent_snaps);
641 if (err < 0)
642 goto fail;
643
644 invalidate = 1;
645 } else if (!realm->cached_context) {
646 invalidate = 1;
647 }
648
649 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
650 realm, invalidate, p, e);
651
652 if (p < e)
653 goto more;
654
655 /* invalidate when we reach the _end_ (root) of the trace */
656 if (invalidate)
657 rebuild_snap_realms(realm);
658
659 __cleanup_empty_realms(mdsc);
660 return 0;
661
662bad:
663 err = -EINVAL;
664fail:
665 pr_err("update_snap_trace error %d\n", err);
666 return err;
667}
668
669
670/*
671 * Send any cap_snaps that are queued for flush. Try to carry
672 * s_mutex across multiple snap flushes to avoid locking overhead.
673 *
674 * Caller holds no locks.
675 */
676static void flush_snaps(struct ceph_mds_client *mdsc)
677{
678 struct ceph_inode_info *ci;
679 struct inode *inode;
680 struct ceph_mds_session *session = NULL;
681
682 dout("flush_snaps\n");
683 spin_lock(&mdsc->snap_flush_lock);
684 while (!list_empty(&mdsc->snap_flush_list)) {
685 ci = list_first_entry(&mdsc->snap_flush_list,
686 struct ceph_inode_info, i_snap_flush_item);
687 inode = &ci->vfs_inode;
688 igrab(inode);
689 spin_unlock(&mdsc->snap_flush_lock);
690 spin_lock(&inode->i_lock);
691 __ceph_flush_snaps(ci, &session);
692 spin_unlock(&inode->i_lock);
693 iput(inode);
694 spin_lock(&mdsc->snap_flush_lock);
695 }
696 spin_unlock(&mdsc->snap_flush_lock);
697
698 if (session) {
699 mutex_unlock(&session->s_mutex);
700 ceph_put_mds_session(session);
701 }
702 dout("flush_snaps done\n");
703}
704
705
706/*
707 * Handle a snap notification from the MDS.
708 *
709 * This can take two basic forms: the simplest is just a snap creation
710 * or deletion notification on an existing realm. This should update the
711 * realm and its children.
712 *
713 * The more difficult case is realm creation, due to snap creation at a
714 * new point in the file hierarchy, or due to a rename that moves a file or
715 * directory into another realm.
716 */
717void ceph_handle_snap(struct ceph_mds_client *mdsc,
718 struct ceph_mds_session *session,
719 struct ceph_msg *msg)
720{
721 struct super_block *sb = mdsc->client->sb;
722 int mds = session->s_mds;
723 u64 split;
724 int op;
725 int trace_len;
726 struct ceph_snap_realm *realm = NULL;
727 void *p = msg->front.iov_base;
728 void *e = p + msg->front.iov_len;
729 struct ceph_mds_snap_head *h;
730 int num_split_inos, num_split_realms;
731 __le64 *split_inos = NULL, *split_realms = NULL;
732 int i;
733 int locked_rwsem = 0;
734
735 /* decode */
736 if (msg->front.iov_len < sizeof(*h))
737 goto bad;
738 h = p;
739 op = le32_to_cpu(h->op);
740 split = le64_to_cpu(h->split); /* non-zero if we are splitting an
741 * existing realm */
742 num_split_inos = le32_to_cpu(h->num_split_inos);
743 num_split_realms = le32_to_cpu(h->num_split_realms);
744 trace_len = le32_to_cpu(h->trace_len);
745 p += sizeof(*h);
746
747 dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
748 ceph_snap_op_name(op), split, trace_len);
749
750 mutex_lock(&session->s_mutex);
751 session->s_seq++;
752 mutex_unlock(&session->s_mutex);
753
754 down_write(&mdsc->snap_rwsem);
755 locked_rwsem = 1;
756
757 if (op == CEPH_SNAP_OP_SPLIT) {
758 struct ceph_mds_snap_realm *ri;
759
760 /*
761 * A "split" breaks part of an existing realm off into
762 * a new realm. The MDS provides a list of inodes
763 * (with caps) and child realms that belong to the new
764 * child.
765 */
766 split_inos = p;
767 p += sizeof(u64) * num_split_inos;
768 split_realms = p;
769 p += sizeof(u64) * num_split_realms;
770 ceph_decode_need(&p, e, sizeof(*ri), bad);
771 /* we will peek at realm info here, but will _not_
772 * advance p, as the realm update will occur below in
773 * ceph_update_snap_trace. */
774 ri = p;
775
776 realm = ceph_lookup_snap_realm(mdsc, split);
777 if (!realm) {
778 realm = ceph_create_snap_realm(mdsc, split);
779 if (IS_ERR(realm))
780 goto out;
781 }
782 ceph_get_snap_realm(mdsc, realm);
783
784 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
785 for (i = 0; i < num_split_inos; i++) {
786 struct ceph_vino vino = {
787 .ino = le64_to_cpu(split_inos[i]),
788 .snap = CEPH_NOSNAP,
789 };
790 struct inode *inode = ceph_find_inode(sb, vino);
791 struct ceph_inode_info *ci;
792
793 if (!inode)
794 continue;
795 ci = ceph_inode(inode);
796
797 spin_lock(&inode->i_lock);
798 if (!ci->i_snap_realm)
799 goto skip_inode;
800 /*
801 * If this inode belongs to a realm that was
802 * created after our new realm, we experienced
803		 * a race (due to another split notification
804 * arriving from a different MDS). So skip
805 * this inode.
806 */
807 if (ci->i_snap_realm->created >
808 le64_to_cpu(ri->created)) {
809 dout(" leaving %p in newer realm %llx %p\n",
810 inode, ci->i_snap_realm->ino,
811 ci->i_snap_realm);
812 goto skip_inode;
813 }
814 dout(" will move %p to split realm %llx %p\n",
815 inode, realm->ino, realm);
816 /*
817 * Remove the inode from the realm's inode
818 * list, but don't add it to the new realm
819 * yet. We don't want the cap_snap to be
820 * queued (again) by ceph_update_snap_trace()
821 * below. Queue it _now_, under the old context.
822 */
823 spin_lock(&realm->inodes_with_caps_lock);
824 list_del_init(&ci->i_snap_realm_item);
825 spin_unlock(&realm->inodes_with_caps_lock);
826 spin_unlock(&inode->i_lock);
827
828 ceph_queue_cap_snap(ci);
829
830 iput(inode);
831 continue;
832
833skip_inode:
834 spin_unlock(&inode->i_lock);
835 iput(inode);
836 }
837
838 /* we may have taken some of the old realm's children. */
839 for (i = 0; i < num_split_realms; i++) {
840 struct ceph_snap_realm *child =
841 ceph_lookup_snap_realm(mdsc,
842 le64_to_cpu(split_realms[i]));
843 if (!child)
844 continue;
845 adjust_snap_realm_parent(mdsc, child, realm->ino);
846 }
847 }
848
849 /*
850 * update using the provided snap trace. if we are deleting a
851 * snap, we can avoid queueing cap_snaps.
852 */
853 ceph_update_snap_trace(mdsc, p, e,
854 op == CEPH_SNAP_OP_DESTROY);
855
856 if (op == CEPH_SNAP_OP_SPLIT) {
857 /*
858 * ok, _now_ add the inodes into the new realm.
859 */
860 for (i = 0; i < num_split_inos; i++) {
861 struct ceph_vino vino = {
862 .ino = le64_to_cpu(split_inos[i]),
863 .snap = CEPH_NOSNAP,
864 };
865 struct inode *inode = ceph_find_inode(sb, vino);
866 struct ceph_inode_info *ci;
867
868 if (!inode)
869 continue;
870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock);
872 if (list_empty(&ci->i_snap_realm_item)) {
873 struct ceph_snap_realm *oldrealm =
874 ci->i_snap_realm;
875
876 dout(" moving %p to split realm %llx %p\n",
877 inode, realm->ino, realm);
878 spin_lock(&realm->inodes_with_caps_lock);
879 list_add(&ci->i_snap_realm_item,
880 &realm->inodes_with_caps);
881 ci->i_snap_realm = realm;
882 spin_unlock(&realm->inodes_with_caps_lock);
883 ceph_get_snap_realm(mdsc, realm);
884 ceph_put_snap_realm(mdsc, oldrealm);
885 }
886 spin_unlock(&inode->i_lock);
887 iput(inode);
888 }
889
890 /* we took a reference when we created the realm, above */
891 ceph_put_snap_realm(mdsc, realm);
892 }
893
894 __cleanup_empty_realms(mdsc);
895
896 up_write(&mdsc->snap_rwsem);
897
898 flush_snaps(mdsc);
899 return;
900
901bad:
902 pr_err("corrupt snap message from mds%d\n", mds);
903 ceph_msg_dump(msg);
904out:
905 if (locked_rwsem)
906 up_write(&mdsc->snap_rwsem);
907 return;
908}
909
910
911
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..110857ba9269
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1041 @@
1
2#include "ceph_debug.h"
3
4#include <linux/backing-dev.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/seq_file.h>
14#include <linux/slab.h>
15#include <linux/statfs.h>
16#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19
20#include "decode.h"
21#include "super.h"
22#include "mon_client.h"
23#include "auth.h"
24
25/*
26 * Ceph superblock operations
27 *
28 * Handle the basics of mounting, unmounting.
29 */
30
31
32/*
33 * find filename portion of a path (/foo/bar/baz -> baz)
34 */
35const char *ceph_file_part(const char *s, int len)
36{
37 const char *e = s + len;
38
39 while (e != s && *(e-1) != '/')
40 e--;
41 return e;
42}
43
44
45/*
46 * super ops
47 */
48static void ceph_put_super(struct super_block *s)
49{
50 struct ceph_client *client = ceph_sb_to_client(s);
51
52 dout("put_super\n");
53 ceph_mdsc_close_sessions(&client->mdsc);
54
55 /*
56 * ensure we release the bdi before put_anon_super releases
57 * the device name.
58 */
59 if (s->s_bdi == &client->backing_dev_info) {
60 bdi_unregister(&client->backing_dev_info);
61 s->s_bdi = NULL;
62 }
63
64 return;
65}
66
67static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
68{
69 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
70 struct ceph_monmap *monmap = client->monc.monmap;
71 struct ceph_statfs st;
72 u64 fsid;
73 int err;
74
75 dout("statfs\n");
76 err = ceph_monc_do_statfs(&client->monc, &st);
77 if (err < 0)
78 return err;
79
80 /* fill in kstatfs */
81 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
82
83 /*
84 * express utilization in terms of large blocks to avoid
85 * overflow on 32-bit machines.
86 */
87 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
88 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
89 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
90 (CEPH_BLOCK_SHIFT-10);
91 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
92
93 buf->f_files = le64_to_cpu(st.num_objects);
94 buf->f_ffree = -1;
95 buf->f_namelen = PATH_MAX;
96 buf->f_frsize = PAGE_CACHE_SIZE;
97
98 /* leave fsid little-endian, regardless of host endianness */
99 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
100 buf->f_fsid.val[0] = fsid & 0xffffffff;
101 buf->f_fsid.val[1] = fsid >> 32;
102
103 return 0;
104}
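/*
 * Worked example for the scaling above (illustrative, not part of the
 * original source): with CEPH_BLOCK_SHIFT = 20, f_bsize is 1 MB and
 * each KB count shifts right by (20 - 10) = 10 bits, so st.kb = 2097152
 * (a 2 GB pool) becomes f_blocks = 2097152 >> 10 = 2048 one-megabyte
 * blocks -- small enough for a 32-bit f_blocks even on large volumes.
 */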
105
106
107static int ceph_syncfs(struct super_block *sb, int wait)
108{
109 dout("sync_fs %d\n", wait);
110 ceph_osdc_sync(&ceph_client(sb)->osdc);
111 ceph_mdsc_sync(&ceph_client(sb)->mdsc);
112 dout("sync_fs %d done\n", wait);
113 return 0;
114}
115
116
117/**
118 * ceph_show_options - Show mount options in /proc/mounts
119 * @m: seq_file to write to
120 * @mnt: mount descriptor
121 */
122static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
123{
124 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
125 struct ceph_mount_args *args = client->mount_args;
126
127 if (args->flags & CEPH_OPT_FSID)
128		seq_printf(m, ",fsidmajor=%llu,fsidminor=%llu",
129 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
130 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
131 if (args->flags & CEPH_OPT_NOSHARE)
132 seq_puts(m, ",noshare");
133 if (args->flags & CEPH_OPT_DIRSTAT)
134 seq_puts(m, ",dirstat");
135 if ((args->flags & CEPH_OPT_RBYTES) == 0)
136 seq_puts(m, ",norbytes");
137 if (args->flags & CEPH_OPT_NOCRC)
138 seq_puts(m, ",nocrc");
139 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
140 seq_puts(m, ",noasyncreaddir");
141 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
142 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
143 if (args->name)
144 seq_printf(m, ",name=%s", args->name);
145 if (args->secret)
146 seq_puts(m, ",secret=<hidden>");
147 return 0;
148}
149
150/*
151 * caches
152 */
153struct kmem_cache *ceph_inode_cachep;
154struct kmem_cache *ceph_cap_cachep;
155struct kmem_cache *ceph_dentry_cachep;
156struct kmem_cache *ceph_file_cachep;
157
158static void ceph_inode_init_once(void *foo)
159{
160 struct ceph_inode_info *ci = foo;
161 inode_init_once(&ci->vfs_inode);
162}
163
164static int default_congestion_kb(void)
165{
166 int congestion_kb;
167
168 /*
169 * Copied from NFS
170 *
171 * congestion size, scale with available memory.
172 *
173 * 64MB: 8192k
174 * 128MB: 11585k
175 * 256MB: 16384k
176 * 512MB: 23170k
177 * 1GB: 32768k
178 * 2GB: 46340k
179 * 4GB: 65536k
180 * 8GB: 92681k
181 * 16GB: 131072k
182 *
183 * This allows larger machines to have larger/more transfers.
184 * Limit the default to 256M
185 */
186 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
187 if (congestion_kb > 256*1024)
188 congestion_kb = 256*1024;
189
190 return congestion_kb;
191}
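/*
 * Worked example for the formula above (illustrative): with 1 GB of RAM
 * and 4 KB pages, totalram_pages = 262144 and int_sqrt(262144) = 512,
 * so congestion_kb = (16 * 512) << (12 - 10) = 32768k, matching the
 * 1GB row of the table.
 */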
192
193static int __init init_caches(void)
194{
195 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
196 sizeof(struct ceph_inode_info),
197 __alignof__(struct ceph_inode_info),
198 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
199 ceph_inode_init_once);
200 if (ceph_inode_cachep == NULL)
201 return -ENOMEM;
202
203 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
204 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
205 if (ceph_cap_cachep == NULL)
206 goto bad_cap;
207
208 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
209 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
210 if (ceph_dentry_cachep == NULL)
211 goto bad_dentry;
212
213 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
214 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
215 if (ceph_file_cachep == NULL)
216 goto bad_file;
217
218 return 0;
219
220bad_file:
221 kmem_cache_destroy(ceph_dentry_cachep);
222bad_dentry:
223 kmem_cache_destroy(ceph_cap_cachep);
224bad_cap:
225 kmem_cache_destroy(ceph_inode_cachep);
226 return -ENOMEM;
227}
228
229static void destroy_caches(void)
230{
231 kmem_cache_destroy(ceph_inode_cachep);
232 kmem_cache_destroy(ceph_cap_cachep);
233 kmem_cache_destroy(ceph_dentry_cachep);
234 kmem_cache_destroy(ceph_file_cachep);
235}
236
237
238/*
239 * ceph_umount_begin - initiate forced umount. Tear down the
240 * mount, skipping steps that may hang while waiting for server(s).
241 */
242static void ceph_umount_begin(struct super_block *sb)
243{
244 struct ceph_client *client = ceph_sb_to_client(sb);
245
246 dout("ceph_umount_begin - starting forced umount\n");
247 if (!client)
248 return;
249 client->mount_state = CEPH_MOUNT_SHUTDOWN;
250 return;
251}
252
253static const struct super_operations ceph_super_ops = {
254 .alloc_inode = ceph_alloc_inode,
255 .destroy_inode = ceph_destroy_inode,
256 .write_inode = ceph_write_inode,
257 .sync_fs = ceph_syncfs,
258 .put_super = ceph_put_super,
259 .show_options = ceph_show_options,
260 .statfs = ceph_statfs,
261 .umount_begin = ceph_umount_begin,
262};
263
264
265const char *ceph_msg_type_name(int type)
266{
267 switch (type) {
268 case CEPH_MSG_SHUTDOWN: return "shutdown";
269 case CEPH_MSG_PING: return "ping";
270 case CEPH_MSG_AUTH: return "auth";
271 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
272 case CEPH_MSG_MON_MAP: return "mon_map";
273 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
274 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
275 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
276 case CEPH_MSG_STATFS: return "statfs";
277 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
278 case CEPH_MSG_MDS_MAP: return "mds_map";
279 case CEPH_MSG_CLIENT_SESSION: return "client_session";
280 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
281 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
282 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
283 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
284 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
285 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
286 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
287 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
288 case CEPH_MSG_OSD_MAP: return "osd_map";
289 case CEPH_MSG_OSD_OP: return "osd_op";
290 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
291 default: return "unknown";
292 }
293}
294
295
296/*
297 * mount options
298 */
299enum {
300 Opt_fsidmajor,
301 Opt_fsidminor,
302 Opt_monport,
303 Opt_wsize,
304 Opt_rsize,
305 Opt_osdtimeout,
306 Opt_osdkeepalivetimeout,
307 Opt_mount_timeout,
308 Opt_osd_idle_ttl,
309 Opt_caps_wanted_delay_min,
310 Opt_caps_wanted_delay_max,
311 Opt_readdir_max_entries,
312 Opt_congestion_kb,
313 Opt_last_int,
314 /* int args above */
315 Opt_snapdirname,
316 Opt_name,
317 Opt_secret,
318 Opt_last_string,
319 /* string args above */
320 Opt_ip,
321 Opt_noshare,
322 Opt_dirstat,
323 Opt_nodirstat,
324 Opt_rbytes,
325 Opt_norbytes,
326 Opt_nocrc,
327 Opt_noasyncreaddir,
328};
329
330static match_table_t arg_tokens = {
331 {Opt_fsidmajor, "fsidmajor=%ld"},
332 {Opt_fsidminor, "fsidminor=%ld"},
333 {Opt_monport, "monport=%d"},
334 {Opt_wsize, "wsize=%d"},
335 {Opt_rsize, "rsize=%d"},
336 {Opt_osdtimeout, "osdtimeout=%d"},
337 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
338 {Opt_mount_timeout, "mount_timeout=%d"},
339 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
340 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
341 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
342 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
343 {Opt_congestion_kb, "write_congestion_kb=%d"},
344 /* int args above */
345 {Opt_snapdirname, "snapdirname=%s"},
346 {Opt_name, "name=%s"},
347 {Opt_secret, "secret=%s"},
348 /* string args above */
349 {Opt_ip, "ip=%s"},
350 {Opt_noshare, "noshare"},
351 {Opt_dirstat, "dirstat"},
352 {Opt_nodirstat, "nodirstat"},
353 {Opt_rbytes, "rbytes"},
354 {Opt_norbytes, "norbytes"},
355 {Opt_nocrc, "nocrc"},
356 {Opt_noasyncreaddir, "noasyncreaddir"},
357 {-1, NULL}
358};
359
360
361static struct ceph_mount_args *parse_mount_args(int flags, char *options,
362 const char *dev_name,
363 const char **path)
364{
365 struct ceph_mount_args *args;
366 const char *c;
367 int err = -ENOMEM;
368 substring_t argstr[MAX_OPT_ARGS];
369
370 args = kzalloc(sizeof(*args), GFP_KERNEL);
371 if (!args)
372 return ERR_PTR(-ENOMEM);
373 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
374 GFP_KERNEL);
375 if (!args->mon_addr)
376 goto out;
377
378 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
379
380 /* start with defaults */
381 args->sb_flags = flags;
382 args->flags = CEPH_OPT_DEFAULT;
383 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
384 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
385 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
386 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
387 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
388 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
389 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
390 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
391 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
392 args->max_readdir = 1024;
393 args->congestion_kb = default_congestion_kb();
394
395 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
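	/* e.g. "192.168.0.1:6789,192.168.0.2:6789:/export/dir" names two
	 * monitors (6789 is the default monitor port) and the server-side
	 * path /export/dir -- illustrative addresses, not from the
	 * original source */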
396 err = -EINVAL;
397 if (!dev_name)
398 goto out;
399 *path = strstr(dev_name, ":/");
400 if (*path == NULL) {
401 pr_err("device name is missing path (no :/ in %s)\n",
402 dev_name);
403 goto out;
404 }
405
406 /* get mon ip(s) */
407 err = ceph_parse_ips(dev_name, *path, args->mon_addr,
408 CEPH_MAX_MON, &args->num_mon);
409 if (err < 0)
410 goto out;
411
412 /* path on server */
413 *path += 2;
414 dout("server path '%s'\n", *path);
415
416 /* parse mount options */
417 while ((c = strsep(&options, ",")) != NULL) {
418 int token, intval, ret;
419 if (!*c)
420 continue;
421 err = -EINVAL;
422 token = match_token((char *)c, arg_tokens, argstr);
423 if (token < 0) {
424 pr_err("bad mount option at '%s'\n", c);
425 goto out;
426 }
427 if (token < Opt_last_int) {
428 ret = match_int(&argstr[0], &intval);
429 if (ret < 0) {
430 pr_err("bad mount option arg (not int) "
431 "at '%s'\n", c);
432 continue;
433 }
434 dout("got int token %d val %d\n", token, intval);
435 } else if (token > Opt_last_int && token < Opt_last_string) {
436 dout("got string token %d val %s\n", token,
437 argstr[0].from);
438 } else {
439 dout("got token %d\n", token);
440 }
441 switch (token) {
442 case Opt_fsidmajor:
443 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
444 break;
445 case Opt_fsidminor:
446 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
447 break;
448 case Opt_ip:
449 err = ceph_parse_ips(argstr[0].from,
450 argstr[0].to,
451 &args->my_addr,
452 1, NULL);
453 if (err < 0)
454 goto out;
455 args->flags |= CEPH_OPT_MYIP;
456 break;
457
458 case Opt_snapdirname:
459 kfree(args->snapdir_name);
460 args->snapdir_name = kstrndup(argstr[0].from,
461 argstr[0].to-argstr[0].from,
462 GFP_KERNEL);
463 break;
464 case Opt_name:
465 args->name = kstrndup(argstr[0].from,
466 argstr[0].to-argstr[0].from,
467 GFP_KERNEL);
468 break;
469 case Opt_secret:
470 args->secret = kstrndup(argstr[0].from,
471 argstr[0].to-argstr[0].from,
472 GFP_KERNEL);
473 break;
474
475 /* misc */
476 case Opt_wsize:
477 args->wsize = intval;
478 break;
479 case Opt_rsize:
480 args->rsize = intval;
481 break;
482 case Opt_osdtimeout:
483 args->osd_timeout = intval;
484 break;
485 case Opt_osdkeepalivetimeout:
486 args->osd_keepalive_timeout = intval;
487 break;
488 case Opt_mount_timeout:
489 args->mount_timeout = intval;
490 break;
491 case Opt_caps_wanted_delay_min:
492 args->caps_wanted_delay_min = intval;
493 break;
494 case Opt_caps_wanted_delay_max:
495 args->caps_wanted_delay_max = intval;
496 break;
497 case Opt_readdir_max_entries:
498 args->max_readdir = intval;
499 break;
500 case Opt_congestion_kb:
501 args->congestion_kb = intval;
502 break;
503
504 case Opt_noshare:
505 args->flags |= CEPH_OPT_NOSHARE;
506 break;
507
508 case Opt_dirstat:
509 args->flags |= CEPH_OPT_DIRSTAT;
510 break;
511 case Opt_nodirstat:
512 args->flags &= ~CEPH_OPT_DIRSTAT;
513 break;
514 case Opt_rbytes:
515 args->flags |= CEPH_OPT_RBYTES;
516 break;
517 case Opt_norbytes:
518 args->flags &= ~CEPH_OPT_RBYTES;
519 break;
520 case Opt_nocrc:
521 args->flags |= CEPH_OPT_NOCRC;
522 break;
523 case Opt_noasyncreaddir:
524 args->flags |= CEPH_OPT_NOASYNCREADDIR;
525 break;
526
527 default:
528 BUG_ON(token);
529 }
530 }
531 return args;
532
533out:
534 kfree(args->mon_addr);
535 kfree(args);
536 return ERR_PTR(err);
537}
538
539static void destroy_mount_args(struct ceph_mount_args *args)
540{
541 dout("destroy_mount_args %p\n", args);
542 kfree(args->snapdir_name);
543 args->snapdir_name = NULL;
544 kfree(args->name);
545 args->name = NULL;
546 kfree(args->secret);
547 args->secret = NULL;
548 kfree(args);
549}
550
551/*
552 * create a fresh client instance
553 */
554static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
555{
556 struct ceph_client *client;
557 int err = -ENOMEM;
558
559 client = kzalloc(sizeof(*client), GFP_KERNEL);
560 if (client == NULL)
561 return ERR_PTR(-ENOMEM);
562
563 mutex_init(&client->mount_mutex);
564
565 init_waitqueue_head(&client->auth_wq);
566
567 client->sb = NULL;
568 client->mount_state = CEPH_MOUNT_MOUNTING;
569 client->mount_args = args;
570
571 client->msgr = NULL;
572
573 client->auth_err = 0;
574 atomic_long_set(&client->writeback_count, 0);
575
576 err = bdi_init(&client->backing_dev_info);
577 if (err < 0)
578 goto fail;
579
580 err = -ENOMEM;
581 client->wb_wq = create_workqueue("ceph-writeback");
582 if (client->wb_wq == NULL)
583 goto fail_bdi;
584 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
585 if (client->pg_inv_wq == NULL)
586 goto fail_wb_wq;
587 client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
588 if (client->trunc_wq == NULL)
589 goto fail_pg_inv_wq;
590
591 /* set up mempools */
592 err = -ENOMEM;
593 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
594 client->mount_args->wsize >> PAGE_CACHE_SHIFT);
595 if (!client->wb_pagevec_pool)
596 goto fail_trunc_wq;
597
598 /* caps */
599 client->min_caps = args->max_readdir;
600 ceph_adjust_min_caps(client->min_caps);
601
602 /* subsystems */
603 err = ceph_monc_init(&client->monc, client);
604 if (err < 0)
605 goto fail_mempool;
606 err = ceph_osdc_init(&client->osdc, client);
607 if (err < 0)
608 goto fail_monc;
609 err = ceph_mdsc_init(&client->mdsc, client);
610 if (err < 0)
611 goto fail_osdc;
612 return client;
613
614fail_osdc:
615 ceph_osdc_stop(&client->osdc);
616fail_monc:
617 ceph_monc_stop(&client->monc);
618fail_mempool:
619 mempool_destroy(client->wb_pagevec_pool);
620fail_trunc_wq:
621 destroy_workqueue(client->trunc_wq);
622fail_pg_inv_wq:
623 destroy_workqueue(client->pg_inv_wq);
624fail_wb_wq:
625 destroy_workqueue(client->wb_wq);
626fail_bdi:
627 bdi_destroy(&client->backing_dev_info);
628fail:
629 kfree(client);
630 return ERR_PTR(err);
631}
632
633static void ceph_destroy_client(struct ceph_client *client)
634{
635 dout("destroy_client %p\n", client);
636
637 /* unmount */
638 ceph_mdsc_stop(&client->mdsc);
639 ceph_monc_stop(&client->monc);
640 ceph_osdc_stop(&client->osdc);
641
642 ceph_adjust_min_caps(-client->min_caps);
643
644 ceph_debugfs_client_cleanup(client);
645 destroy_workqueue(client->wb_wq);
646 destroy_workqueue(client->pg_inv_wq);
647 destroy_workqueue(client->trunc_wq);
648
649 bdi_destroy(&client->backing_dev_info);
650
651 if (client->msgr)
652 ceph_messenger_destroy(client->msgr);
653 mempool_destroy(client->wb_pagevec_pool);
654
655 destroy_mount_args(client->mount_args);
656
657 kfree(client);
658 dout("destroy_client %p done\n", client);
659}
660
661/*
662 * Initially learn our fsid, or verify an fsid matches.
663 */
664int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
665{
666 if (client->have_fsid) {
667 if (ceph_fsid_compare(&client->fsid, fsid)) {
668			pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT "\n",
669 PR_FSID(&client->fsid), PR_FSID(fsid));
670 return -1;
671 }
672 } else {
673 pr_info("client%lld fsid " FSID_FORMAT "\n",
674 client->monc.auth->global_id, PR_FSID(fsid));
675 memcpy(&client->fsid, fsid, sizeof(*fsid));
676 ceph_debugfs_client_init(client);
677 client->have_fsid = true;
678 }
679 return 0;
680}
681
682/*
683 * true if we have the mon map (and have thus joined the cluster)
684 */
685static int have_mon_map(struct ceph_client *client)
686{
687 return client->monc.monmap && client->monc.monmap->epoch;
688}
689
690/*
691 * Bootstrap mount by opening the root directory. Note the mount
692 * @started time from caller, and time out if this takes too long.
693 */
694static struct dentry *open_root_dentry(struct ceph_client *client,
695 const char *path,
696 unsigned long started)
697{
698 struct ceph_mds_client *mdsc = &client->mdsc;
699 struct ceph_mds_request *req = NULL;
700 int err;
701 struct dentry *root;
702
703 /* open dir */
704 dout("open_root_inode opening '%s'\n", path);
705 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
706 if (IS_ERR(req))
707 return ERR_PTR(PTR_ERR(req));
708 req->r_path1 = kstrdup(path, GFP_NOFS);
709 req->r_ino1.ino = CEPH_INO_ROOT;
710 req->r_ino1.snap = CEPH_NOSNAP;
711 req->r_started = started;
712 req->r_timeout = client->mount_args->mount_timeout * HZ;
713 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
714 req->r_num_caps = 2;
715 err = ceph_mdsc_do_request(mdsc, NULL, req);
716 if (err == 0) {
717 dout("open_root_inode success\n");
718 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
719 client->sb->s_root == NULL)
720 root = d_alloc_root(req->r_target_inode);
721 else
722 root = d_obtain_alias(req->r_target_inode);
723 req->r_target_inode = NULL;
724 dout("open_root_inode success, root dentry is %p\n", root);
725 } else {
726 root = ERR_PTR(err);
727 }
728 ceph_mdsc_put_request(req);
729 return root;
730}
731
732/*
733 * mount: join the ceph cluster, and open root directory.
734 */
735static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
736 const char *path)
737{
738 struct ceph_entity_addr *myaddr = NULL;
739 int err;
740 unsigned long timeout = client->mount_args->mount_timeout * HZ;
741 unsigned long started = jiffies; /* note the start time */
742 struct dentry *root;
743
744 dout("mount start\n");
745 mutex_lock(&client->mount_mutex);
746
747 /* initialize the messenger */
748 if (client->msgr == NULL) {
749 if (ceph_test_opt(client, MYIP))
750 myaddr = &client->mount_args->my_addr;
751 client->msgr = ceph_messenger_create(myaddr);
752 if (IS_ERR(client->msgr)) {
753 err = PTR_ERR(client->msgr);
754 client->msgr = NULL;
755 goto out;
756 }
757 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
758 }
759
760 /* open session, and wait for mon, mds, and osd maps */
761 err = ceph_monc_open_session(&client->monc);
762 if (err < 0)
763 goto out;
764
765 while (!have_mon_map(client)) {
766 err = -EIO;
767 if (timeout && time_after_eq(jiffies, started + timeout))
768 goto out;
769
770 /* wait */
771 dout("mount waiting for mon_map\n");
772 err = wait_event_interruptible_timeout(client->auth_wq,
773 have_mon_map(client) || (client->auth_err < 0),
774 timeout);
775 if (err == -EINTR || err == -ERESTARTSYS)
776 goto out;
777 if (client->auth_err < 0) {
778 err = client->auth_err;
779 goto out;
780 }
781 }
782
783 dout("mount opening root\n");
784 root = open_root_dentry(client, "", started);
785 if (IS_ERR(root)) {
786 err = PTR_ERR(root);
787 goto out;
788 }
789 if (client->sb->s_root)
790 dput(root);
791 else
792 client->sb->s_root = root;
793
794 if (path[0] == 0) {
795 dget(root);
796 } else {
797 dout("mount opening base mountpoint\n");
798 root = open_root_dentry(client, path, started);
799 if (IS_ERR(root)) {
800 err = PTR_ERR(root);
801 dput(client->sb->s_root);
802 client->sb->s_root = NULL;
803 goto out;
804 }
805 }
806
807 mnt->mnt_root = root;
808 mnt->mnt_sb = client->sb;
809
810 client->mount_state = CEPH_MOUNT_MOUNTED;
811 dout("mount success\n");
812 err = 0;
813
814out:
815 mutex_unlock(&client->mount_mutex);
816 return err;
817}
818
819static int ceph_set_super(struct super_block *s, void *data)
820{
821 struct ceph_client *client = data;
822 int ret;
823
824 dout("set_super %p data %p\n", s, data);
825
826 s->s_flags = client->mount_args->sb_flags;
827 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
828
829 s->s_fs_info = client;
830 client->sb = s;
831
832 s->s_op = &ceph_super_ops;
833 s->s_export_op = &ceph_export_ops;
834
835 s->s_time_gran = 1000; /* 1000 ns == 1 us */
836
837 ret = set_anon_super(s, NULL); /* what is that second arg for? */
838 if (ret != 0)
839 goto fail;
840
841 return ret;
842
843fail:
844 s->s_fs_info = NULL;
845 client->sb = NULL;
846 return ret;
847}
848
849/*
850 * share superblock if same fs AND options
851 */
852static int ceph_compare_super(struct super_block *sb, void *data)
853{
854 struct ceph_client *new = data;
855 struct ceph_mount_args *args = new->mount_args;
856 struct ceph_client *other = ceph_sb_to_client(sb);
857 int i;
858
859 dout("ceph_compare_super %p\n", sb);
860 if (args->flags & CEPH_OPT_FSID) {
861 if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
862 dout("fsid doesn't match\n");
863 return 0;
864 }
865 } else {
866 /* do we share (a) monitor? */
867 for (i = 0; i < new->monc.monmap->num_mon; i++)
868 if (ceph_monmap_contains(other->monc.monmap,
869 &new->monc.monmap->mon_inst[i].addr))
870 break;
871 if (i == new->monc.monmap->num_mon) {
872 dout("mon ip not part of monmap\n");
873 return 0;
874 }
875 dout("mon ip matches existing sb %p\n", sb);
876 }
877 if (args->sb_flags != other->mount_args->sb_flags) {
878 dout("flags differ\n");
879 return 0;
880 }
881 return 1;
882}
883
884/*
885 * construct our own bdi so we can control readahead, etc.
886 */
887static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
888{
889 int err;
890
891 /* set ra_pages based on rsize mount option? */
892 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
893 client->backing_dev_info.ra_pages =
894 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
895			>> PAGE_CACHE_SHIFT;
896 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
897 if (!err)
898 sb->s_bdi = &client->backing_dev_info;
899 return err;
900}
901
902static int ceph_get_sb(struct file_system_type *fs_type,
903 int flags, const char *dev_name, void *data,
904 struct vfsmount *mnt)
905{
906 struct super_block *sb;
907 struct ceph_client *client;
908 int err;
909 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
910 const char *path = NULL;
911 struct ceph_mount_args *args;
912
913 dout("ceph_get_sb\n");
914 args = parse_mount_args(flags, data, dev_name, &path);
915 if (IS_ERR(args)) {
916 err = PTR_ERR(args);
917 goto out_final;
918 }
919
920 /* create client (which we may/may not use) */
921 client = ceph_create_client(args);
922 if (IS_ERR(client)) {
923 err = PTR_ERR(client);
924 goto out_final;
925 }
926
927 if (client->mount_args->flags & CEPH_OPT_NOSHARE)
928 compare_super = NULL;
929 sb = sget(fs_type, compare_super, ceph_set_super, client);
930 if (IS_ERR(sb)) {
931 err = PTR_ERR(sb);
932 goto out;
933 }
934
935 if (ceph_client(sb) != client) {
936 ceph_destroy_client(client);
937 client = ceph_client(sb);
938 dout("get_sb got existing client %p\n", client);
939 } else {
940 dout("get_sb using new client %p\n", client);
941 err = ceph_register_bdi(sb, client);
942 if (err < 0)
943 goto out_splat;
944 }
945
946 err = ceph_mount(client, mnt, path);
947 if (err < 0)
948 goto out_splat;
949 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
950 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
951 return 0;
952
953out_splat:
954 ceph_mdsc_close_sessions(&client->mdsc);
955 up_write(&sb->s_umount);
956 deactivate_super(sb);
957 goto out_final;
958
959out:
960 ceph_destroy_client(client);
961out_final:
962 dout("ceph_get_sb fail %d\n", err);
963 return err;
964}
965
966static void ceph_kill_sb(struct super_block *s)
967{
968 struct ceph_client *client = ceph_sb_to_client(s);
969 dout("kill_sb %p\n", s);
970 ceph_mdsc_pre_umount(&client->mdsc);
971 kill_anon_super(s); /* will call put_super after sb is r/o */
972 ceph_destroy_client(client);
973}
974
975static struct file_system_type ceph_fs_type = {
976 .owner = THIS_MODULE,
977 .name = "ceph",
978 .get_sb = ceph_get_sb,
979 .kill_sb = ceph_kill_sb,
980 .fs_flags = FS_RENAME_DOES_D_MOVE,
981};
982
983#define _STRINGIFY(x) #x
984#define STRINGIFY(x) _STRINGIFY(x)
985
986static int __init init_ceph(void)
987{
988 int ret = 0;
989
990 ret = ceph_debugfs_init();
991 if (ret < 0)
992 goto out;
993
994 ret = ceph_msgr_init();
995 if (ret < 0)
996 goto out_debugfs;
997
998 ret = init_caches();
999 if (ret)
1000 goto out_msgr;
1001
1002 ceph_caps_init();
1003
1004 ret = register_filesystem(&ceph_fs_type);
1005 if (ret)
1006 goto out_icache;
1007
1008 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
1009 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
1010 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1011 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1012 return 0;
1013
1014out_icache:
1015 destroy_caches();
1016out_msgr:
1017 ceph_msgr_exit();
1018out_debugfs:
1019 ceph_debugfs_cleanup();
1020out:
1021 return ret;
1022}
1023
1024static void __exit exit_ceph(void)
1025{
1026 dout("exit_ceph\n");
1027 unregister_filesystem(&ceph_fs_type);
1028 ceph_caps_finalize();
1029 destroy_caches();
1030 ceph_msgr_exit();
1031 ceph_debugfs_cleanup();
1032}
1033
1034module_init(init_ceph);
1035module_exit(exit_ceph);
1036
1037MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1038MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1039MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1040MODULE_DESCRIPTION("Ceph filesystem for Linux");
1041MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..13513b80d87f
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,902 @@
1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/writeback.h>
17
18#include "types.h"
19#include "messenger.h"
20#include "msgpool.h"
21#include "mon_client.h"
22#include "mds_client.h"
23#include "osd_client.h"
24#include "ceph_fs.h"
25
26/* f_type in struct statfs */
27#define CEPH_SUPER_MAGIC 0x00c36400
28
29/* large granularity for statfs utilization stats to facilitate
30 * large volume sizes on 32-bit machines. */
31#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
32#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
33
34/*
35 * mount options
36 */
37#define CEPH_OPT_FSID (1<<0)
38#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
39#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
40#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
41#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
42#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
43#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
44
45#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
46
47#define ceph_set_opt(client, opt) \
48 (client)->mount_args->flags |= CEPH_OPT_##opt;
49#define ceph_test_opt(client, opt) \
50 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
51
52
53struct ceph_mount_args {
54 int sb_flags;
55 int num_mon;
56 struct ceph_entity_addr *mon_addr;
57 int flags;
58 int mount_timeout;
59 int osd_idle_ttl;
60 int caps_wanted_delay_min, caps_wanted_delay_max;
61 struct ceph_fsid fsid;
62 struct ceph_entity_addr my_addr;
63 int wsize;
64 int rsize; /* max readahead */
65 int max_readdir; /* max readdir size */
66	int congestion_kb;      /* max writeback in flight (kb) */
67 int osd_timeout;
68 int osd_keepalive_timeout;
69 char *snapdir_name; /* default ".snap" */
70 char *name;
71 char *secret;
72 int cap_release_safety;
73};
74
75/*
76 * defaults
77 */
78#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
79#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
80#define CEPH_OSD_KEEPALIVE_DEFAULT 5
81#define CEPH_OSD_IDLE_TTL_DEFAULT 60
82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
83
84#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
85#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
86
87#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
88#define CEPH_AUTH_NAME_DEFAULT "guest"
89
90/*
91 * Delay telling the MDS we no longer want caps, in case we reopen
92 * the file. Delay a minimum amount of time, even if we send a cap
93 * message for some other reason.  Otherwise, take the opportunity to
94 * update the mds to avoid sending another message later.
95 */
96#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
97#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
98
99
100/* mount state */
101enum {
102 CEPH_MOUNT_MOUNTING,
103 CEPH_MOUNT_MOUNTED,
104 CEPH_MOUNT_UNMOUNTING,
105 CEPH_MOUNT_UNMOUNTED,
106 CEPH_MOUNT_SHUTDOWN,
107};
108
109/*
110 * subtract jiffies
111 */
112static inline unsigned long time_sub(unsigned long a, unsigned long b)
113{
114 BUG_ON(time_after(b, a));
115 return (long)a - (long)b;
116}
117
118/*
119 * per-filesystem client state
120 *
121 * possibly shared by multiple mount points, if they are
122 * mounting the same ceph filesystem/cluster.
123 */
124struct ceph_client {
125 struct ceph_fsid fsid;
126 bool have_fsid;
127
128 struct mutex mount_mutex; /* serialize mount attempts */
129 struct ceph_mount_args *mount_args;
130
131 struct super_block *sb;
132
133 unsigned long mount_state;
134 wait_queue_head_t auth_wq;
135
136 int auth_err;
137
138 int min_caps; /* min caps i added */
139
140 struct ceph_messenger *msgr; /* messenger instance */
141 struct ceph_mon_client monc;
142 struct ceph_mds_client mdsc;
143 struct ceph_osd_client osdc;
144
145 /* writeback */
146 mempool_t *wb_pagevec_pool;
147 struct workqueue_struct *wb_wq;
148 struct workqueue_struct *pg_inv_wq;
149 struct workqueue_struct *trunc_wq;
150 atomic_long_t writeback_count;
151
152 struct backing_dev_info backing_dev_info;
153
154#ifdef CONFIG_DEBUG_FS
155 struct dentry *debugfs_monmap;
156 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
157 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
158 struct dentry *debugfs_congestion_kb;
159 struct dentry *debugfs_bdi;
160#endif
161};
162
163static inline struct ceph_client *ceph_client(struct super_block *sb)
164{
165 return sb->s_fs_info;
166}
167
168
169/*
170 * File i/o capability. This tracks shared state with the metadata
171 * server that allows us to cache or writeback attributes or to read
172 * and write data. For any given inode, we should have one or more
173 * capabilities, one issued by each metadata server, and our
174 * cumulative access is the OR of all issued capabilities.
175 *
176 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
177 * session capability lists.
178 */
179struct ceph_cap {
180 struct ceph_inode_info *ci;
181 struct rb_node ci_node; /* per-ci cap tree */
182 struct ceph_mds_session *session;
183 struct list_head session_caps; /* per-session caplist */
184 int mds;
185 u64 cap_id; /* unique cap id (mds provided) */
186 int issued; /* latest, from the mds */
187 int implemented; /* implemented superset of issued (for revocation) */
188 int mds_wanted;
189 u32 seq, issue_seq, mseq;
190 u32 cap_gen; /* active/stale cycle */
191 unsigned long last_used;
192 struct list_head caps_item;
193};
194
195#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
196#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
197#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
198
199/*
200 * Snapped cap state that is pending flush to mds. When a snapshot occurs,
201 * we first complete any in-process sync writes and writeback any dirty
202 * data before flushing the snapped state (tracked here) back to the MDS.
203 */
204struct ceph_cap_snap {
205 atomic_t nref;
206 struct ceph_inode_info *ci;
207 struct list_head ci_item, flushing_item;
208
209 u64 follows, flush_tid;
210 int issued, dirty;
211 struct ceph_snap_context *context;
212
213 mode_t mode;
214 uid_t uid;
215 gid_t gid;
216
217 void *xattr_blob;
218 int xattr_len;
219 u64 xattr_version;
220
221 u64 size;
222 struct timespec mtime, atime, ctime;
223 u64 time_warp_seq;
224 int writing; /* a sync write is still in progress */
225 int dirty_pages; /* dirty pages awaiting writeback */
226};
227
228static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
229{
230 if (atomic_dec_and_test(&capsnap->nref))
231 kfree(capsnap);
232}
233
234/*
235 * The frag tree describes how a directory is fragmented, potentially across
236 * multiple metadata servers. It is also used to indicate points where
237 * metadata authority is delegated, and whether/where metadata is replicated.
238 *
239 * A _leaf_ frag will be present in the i_fragtree IFF there is
240 * delegation info. That is, if mds >= 0 || ndist > 0.
241 */
242#define CEPH_MAX_DIRFRAG_REP 4
243
244struct ceph_inode_frag {
245 struct rb_node node;
246
247 /* fragtree state */
248 u32 frag;
249 int split_by; /* i.e. 2^(split_by) children */
250
251 /* delegation and replication info */
252 int mds; /* -1 if same authority as parent */
253 int ndist; /* >0 if replicated */
254 int dist[CEPH_MAX_DIRFRAG_REP];
255};
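/*
 * Illustrative example (not in the original source): a frag with
 * split_by = 2 has 2^2 = 4 children, while a leaf with mds = 3 and
 * ndist = 0 simply delegates that slice of the directory to mds3
 * instead of the parent's authority.
 */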
256
257/*
258 * We cache inode xattrs as an encoded blob until they are first used,
259 * at which point we parse them into an rbtree.
260 */
261struct ceph_inode_xattr {
262 struct rb_node node;
263
264 const char *name;
265 int name_len;
266 const char *val;
267 int val_len;
268 int dirty;
269
270 int should_free_name;
271 int should_free_val;
272};
273
274struct ceph_inode_xattrs_info {
275 /*
276 * (still encoded) xattr blob. we avoid the overhead of parsing
277 * this until someone actually calls getxattr, etc.
278 *
279 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
280 * NULL means we don't know.
281 */
282 struct ceph_buffer *blob, *prealloc_blob;
283
284 struct rb_root index;
285 bool dirty;
286 int count;
287 int names_size;
288 int vals_size;
289 u64 version, index_version;
290};
291
292/*
293 * Ceph inode.
294 */
295#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
296#define CEPH_I_NODELAY 4 /* do not delay cap release */
297#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
298#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
299
300struct ceph_inode_info {
301 struct ceph_vino i_vino; /* ceph ino + snap */
302
303 u64 i_version;
304 u32 i_time_warp_seq;
305
306 unsigned i_ceph_flags;
307 unsigned long i_release_count;
308
309 struct ceph_file_layout i_layout;
310 char *i_symlink;
311
312 /* for dirs */
313 struct timespec i_rctime;
314 u64 i_rbytes, i_rfiles, i_rsubdirs;
315 u64 i_files, i_subdirs;
316 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
317
318 struct rb_root i_fragtree;
319 struct mutex i_fragtree_mutex;
320
321 struct ceph_inode_xattrs_info i_xattrs;
322
323 /* capabilities. protected _both_ by i_lock and cap->session's
324 * s_mutex. */
325 struct rb_root i_caps; /* cap list */
326 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
327 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
328 struct list_head i_dirty_item, i_flushing_item;
329 u64 i_cap_flush_seq;
330 /* we need to track cap writeback on a per-cap-bit basis, to allow
331 * overlapping, pipelined cap flushes to the mds. we can probably
332 * reduce the tid to 8 bits if we're concerned about inode size. */
333 u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
334 wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
335 unsigned long i_hold_caps_min; /* jiffies */
336 unsigned long i_hold_caps_max; /* jiffies */
337 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
338 int i_cap_exporting_mds; /* to handle cap migration between */
339 unsigned i_cap_exporting_mseq; /* mds's. */
340 unsigned i_cap_exporting_issued;
341 struct ceph_cap_reservation i_cap_migration_resv;
342 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
343 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
344 unsigned i_snap_caps; /* cap bits for snapped files */
345
346 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
347
348 u32 i_truncate_seq; /* last truncate to smaller size */
349 u64 i_truncate_size; /* and the size we last truncated down to */
350 int i_truncate_pending; /* still need to call vmtruncate */
351
352 u64 i_max_size; /* max file size authorized by mds */
353 u64 i_reported_size; /* (max_)size reported to or requested of mds */
354	u64 i_wanted_max_size;       /* offset we'd like to write to */
355 u64 i_requested_max_size; /* max_size we've requested */
356
357 /* held references to caps */
358 int i_pin_ref;
359 int i_rd_ref, i_rdcache_ref, i_wr_ref;
360 int i_wrbuffer_ref, i_wrbuffer_ref_head;
361 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
362 u32 i_rdcache_gen; /* we increment this each time we get
363 FILE_CACHE. If it's non-zero, we
364 _may_ have cached pages. */
365 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
366
367 struct list_head i_unsafe_writes; /* uncommitted sync writes */
368 struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
369 spinlock_t i_unsafe_lock;
370
371 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
372 int i_snap_realm_counter; /* snap realm (if caps) */
373 struct list_head i_snap_realm_item;
374 struct list_head i_snap_flush_item;
375
376 struct work_struct i_wb_work; /* writeback work */
377 struct work_struct i_pg_inv_work; /* page invalidation work */
378
379 struct work_struct i_vmtruncate_work;
380
381 struct inode vfs_inode; /* at end */
382};
383
384static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
385{
386 return container_of(inode, struct ceph_inode_info, vfs_inode);
387}
388
389static inline void ceph_i_clear(struct inode *inode, unsigned mask)
390{
391 struct ceph_inode_info *ci = ceph_inode(inode);
392
393 spin_lock(&inode->i_lock);
394 ci->i_ceph_flags &= ~mask;
395 spin_unlock(&inode->i_lock);
396}
397
398static inline void ceph_i_set(struct inode *inode, unsigned mask)
399{
400 struct ceph_inode_info *ci = ceph_inode(inode);
401
402 spin_lock(&inode->i_lock);
403 ci->i_ceph_flags |= mask;
404 spin_unlock(&inode->i_lock);
405}
406
407static inline bool ceph_i_test(struct inode *inode, unsigned mask)
408{
409 struct ceph_inode_info *ci = ceph_inode(inode);
410 bool r;
411
412 smp_mb();
413 r = (ci->i_ceph_flags & mask) == mask;
414 return r;
415}
416
417
418/* find a specific frag @f */
419extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
420 u32 f);
421
422/*
423 * choose fragment for value @v. copy frag content to pfrag, if leaf
424 * exists
425 */
426extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
427 struct ceph_inode_frag *pfrag,
428 int *found);
429
430/*
431 * Ceph dentry state
432 */
433struct ceph_dentry_info {
434 struct ceph_mds_session *lease_session;
435 u32 lease_gen, lease_shared_gen;
436 u32 lease_seq;
437 unsigned long lease_renew_after, lease_renew_from;
438 struct list_head lru;
439 struct dentry *dentry;
440 u64 time;
441 u64 offset;
442};
443
444static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
445{
446 return (struct ceph_dentry_info *)dentry->d_fsdata;
447}
448
449static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
450{
451 return ((loff_t)frag << 32) | (loff_t)off;
452}
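/*
 * Worked example (illustrative): ceph_make_fpos(0x2, 0x10) yields
 * ((loff_t)0x2 << 32) | 0x10 == 0x0000000200000010, packing the
 * readdir fragment and the offset within it into a single f_pos.
 */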
453
454/*
455 * ino_t is <64 bits on many architectures, blech.
456 *
457 * don't include snap in ino hash, at least for now.
458 */
459static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
460{
461 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
462#if BITS_PER_LONG == 32
463 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
464 if (!ino)
465 ino = 1;
466#endif
467 return ino;
468}
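/*
 * Worked example (illustrative, 32-bit host with 32-bit ino_t):
 * vino.ino = 0x123456789 truncates to 0x23456789 and is then xor-folded
 * with (0x123456789 >> 32) == 0x1, giving ino = 0x23456788; uniqueness
 * is traded away so the value fits in ino_t.
 */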
469
470static inline int ceph_set_ino_cb(struct inode *inode, void *data)
471{
472 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
473 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
474 return 0;
475}
476
477static inline struct ceph_vino ceph_vino(struct inode *inode)
478{
479 return ceph_inode(inode)->i_vino;
480}
481
482/* for printf-style formatting */
483#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
484
485static inline u64 ceph_ino(struct inode *inode)
486{
487 return ceph_inode(inode)->i_vino.ino;
488}
489static inline u64 ceph_snap(struct inode *inode)
490{
491 return ceph_inode(inode)->i_vino.snap;
492}
493
494static inline int ceph_ino_compare(struct inode *inode, void *data)
495{
496 struct ceph_vino *pvino = (struct ceph_vino *)data;
497 struct ceph_inode_info *ci = ceph_inode(inode);
498 return ci->i_vino.ino == pvino->ino &&
499 ci->i_vino.snap == pvino->snap;
500}
501
502static inline struct inode *ceph_find_inode(struct super_block *sb,
503 struct ceph_vino vino)
504{
505 ino_t t = ceph_vino_to_ino(vino);
506 return ilookup5(sb, t, ceph_ino_compare, &vino);
507}
508
509
510/*
511 * caps helpers
512 */
513static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
514{
515 return !RB_EMPTY_ROOT(&ci->i_caps);
516}
517
518extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
519extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
520extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
521 struct ceph_cap *cap);
522
523static inline int ceph_caps_issued(struct ceph_inode_info *ci)
524{
525 int issued;
526 spin_lock(&ci->vfs_inode.i_lock);
527 issued = __ceph_caps_issued(ci, NULL);
528 spin_unlock(&ci->vfs_inode.i_lock);
529 return issued;
530}
531
532static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
533 int touch)
534{
535 int r;
536 spin_lock(&ci->vfs_inode.i_lock);
537 r = __ceph_caps_issued_mask(ci, mask, touch);
538 spin_unlock(&ci->vfs_inode.i_lock);
539 return r;
540}
541
542static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
543{
544 return ci->i_dirty_caps | ci->i_flushing_caps;
545}
546extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
547
548extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
549extern int __ceph_caps_used(struct ceph_inode_info *ci);
550
551extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
552
553/*
554 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
555 */
556static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
557{
558 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
559 if (w & CEPH_CAP_FILE_BUFFER)
560 w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
561 return w;
562}
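/*
 * Example (illustrative): a file open for write with dirty buffered
 * pages contributes CEPH_CAP_FILE_WR via its open mode and
 * CEPH_CAP_FILE_BUFFER via cap refs, so the helper above also ORs in
 * CEPH_CAP_FILE_EXCL until the dirty data is written back.
 */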
563
564/* what the mds thinks we want */
565extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
566
567extern void ceph_caps_init(void);
568extern void ceph_caps_finalize(void);
569extern void ceph_adjust_min_caps(int delta);
570extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
571extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
572extern void ceph_reservation_status(struct ceph_client *client,
573 int *total, int *avail, int *used,
574 int *reserved, int *min);
575
576static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
577{
578 return (struct ceph_client *)inode->i_sb->s_fs_info;
579}
580
581static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
582{
583 return (struct ceph_client *)sb->s_fs_info;
584}
585
586
587/*
588 * we keep buffered readdir results attached to file->private_data
589 */
590struct ceph_file_info {
591 int fmode; /* initialized on open */
592
593 /* readdir: position within the dir */
594 u32 frag;
595 struct ceph_mds_request *last_readdir;
596 int at_end;
597
598 /* readdir: position within a frag */
599 unsigned offset; /* offset of last chunk, adjusted for . and .. */
600 u64 next_offset; /* offset of next chunk (last_name's + 1) */
601 char *last_name; /* last entry in previous chunk */
602 struct dentry *dentry; /* next dentry (for dcache readdir) */
603 unsigned long dir_release_count;
604
605 /* used for -o dirstat read() on directory thing */
606 char *dir_info;
607 int dir_info_len;
608};
609
610
611
612/*
613 * snapshots
614 */
615
616/*
617 * A "snap context" is the set of existing snapshots when we
618 * write data. It is used by the OSD to guide its COW behavior.
619 *
620 * The ceph_snap_context is refcounted, and attached to each dirty
621 * page, indicating which context the dirty data belonged to when it was
622 * dirtied.
623 */
624struct ceph_snap_context {
625 atomic_t nref;
626 u64 seq;
627 int num_snaps;
628 u64 snaps[];
629};
630
631static inline struct ceph_snap_context *
632ceph_get_snap_context(struct ceph_snap_context *sc)
633{
634 /*
635 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
636 atomic_read(&sc->nref)+1);
637 */
638 if (sc)
639 atomic_inc(&sc->nref);
640 return sc;
641}
642
643static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
644{
645 if (!sc)
646 return;
647 /*
648 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
649 atomic_read(&sc->nref)-1);
650 */
651 if (atomic_dec_and_test(&sc->nref)) {
652 /*printk(" deleting snap_context %p\n", sc);*/
653 kfree(sc);
654 }
655}
656
657/*
658 * A "snap realm" describes a subset of the file hierarchy sharing
659 * the same set of snapshots that apply to it. The realms themselves
660 * are organized into a hierarchy, such that children inherit (some of)
661 * the snapshots of their parents.
662 *
663 * All inodes within the realm that have capabilities are linked into a
664 * per-realm list.
665 */
666struct ceph_snap_realm {
667 u64 ino;
668 atomic_t nref;
669 struct rb_node node;
670
671 u64 created, seq;
672 u64 parent_ino;
673 u64 parent_since; /* snapid when our current parent became so */
674
675 u64 *prior_parent_snaps; /* snaps inherited from any parents we */
676 int num_prior_parent_snaps; /* had prior to parent_since */
677 u64 *snaps; /* snaps specific to this realm */
678 int num_snaps;
679
680 struct ceph_snap_realm *parent;
681 struct list_head children; /* list of child realms */
682 struct list_head child_item;
683
684 struct list_head empty_item; /* if i have ref==0 */
685
686 /* the current set of snaps for this realm */
687 struct ceph_snap_context *cached_context;
688
689 struct list_head inodes_with_caps;
690 spinlock_t inodes_with_caps_lock;
691};
692
693
694
695/*
696 * calculate the number of pages a given length and offset map onto,
697 * if we align the data.
698 */
699static inline int calc_pages_for(u64 off, u64 len)
700{
701 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
702 (off >> PAGE_CACHE_SHIFT);
703}
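/*
 * Worked example (illustrative, 4 KB pages): calc_pages_for(4095, 2)
 * = ((4095 + 2 + 4095) >> 12) - (4095 >> 12) = 2 - 0 = 2 pages, since
 * a two-byte span starting at offset 4095 crosses one page boundary.
 */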
704
705
706
707/* snap.c */
708struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
709 u64 ino);
710extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
711 struct ceph_snap_realm *realm);
712extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
713 struct ceph_snap_realm *realm);
714extern int ceph_update_snap_trace(struct ceph_mds_client *m,
715 void *p, void *e, bool deletion);
716extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
717 struct ceph_mds_session *session,
718 struct ceph_msg *msg);
719extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
720extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
721 struct ceph_cap_snap *capsnap);
722extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
723
724/*
725 * a cap_snap is "pending" if it is still awaiting an in-progress
726 * sync write (that may/may not still update size, mtime, etc.).
727 */
728static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
729{
730 return !list_empty(&ci->i_cap_snaps) &&
731 list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
732 ci_item)->writing;
733}
734
735
736/* super.c */
737extern struct kmem_cache *ceph_inode_cachep;
738extern struct kmem_cache *ceph_cap_cachep;
739extern struct kmem_cache *ceph_dentry_cachep;
740extern struct kmem_cache *ceph_file_cachep;
741
742extern const char *ceph_msg_type_name(int type);
743extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
744
745#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
746 "%02x%02x%02x%02x%02x%02x"
747#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
748 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
749 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
750 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
751
752/* inode.c */
753extern const struct inode_operations ceph_file_iops;
754
755extern struct inode *ceph_alloc_inode(struct super_block *sb);
756extern void ceph_destroy_inode(struct inode *inode);
757
758extern struct inode *ceph_get_inode(struct super_block *sb,
759 struct ceph_vino vino);
760extern struct inode *ceph_get_snapdir(struct inode *parent);
761extern int ceph_fill_file_size(struct inode *inode, int issued,
762 u32 truncate_seq, u64 truncate_size, u64 size);
763extern void ceph_fill_file_time(struct inode *inode, int issued,
764 u64 time_warp_seq, struct timespec *ctime,
765 struct timespec *mtime, struct timespec *atime);
766extern int ceph_fill_trace(struct super_block *sb,
767 struct ceph_mds_request *req,
768 struct ceph_mds_session *session);
769extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
770 struct ceph_mds_session *session);
771
772extern int ceph_inode_holds_cap(struct inode *inode, int mask);
773
774extern int ceph_inode_set_size(struct inode *inode, loff_t size);
775extern void __ceph_do_pending_vmtruncate(struct inode *inode);
776extern void ceph_queue_vmtruncate(struct inode *inode);
777
778extern void ceph_queue_invalidate(struct inode *inode);
779extern void ceph_queue_writeback(struct inode *inode);
780
781extern int ceph_do_getattr(struct inode *inode, int mask);
782extern int ceph_permission(struct inode *inode, int mask);
783extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
784extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
785 struct kstat *stat);
786
787/* xattr.c */
788extern int ceph_setxattr(struct dentry *, const char *, const void *,
789 size_t, int);
790extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
791extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
792extern int ceph_removexattr(struct dentry *, const char *);
793extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
794extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
795
796/* caps.c */
797extern const char *ceph_cap_string(int c);
798extern void ceph_handle_caps(struct ceph_mds_session *session,
799 struct ceph_msg *msg);
800extern int ceph_add_cap(struct inode *inode,
801 struct ceph_mds_session *session, u64 cap_id,
802 int fmode, unsigned issued, unsigned wanted,
803 unsigned cap, unsigned seq, u64 realmino, int flags,
804 struct ceph_cap_reservation *caps_reservation);
805extern void __ceph_remove_cap(struct ceph_cap *cap);
806static inline void ceph_remove_cap(struct ceph_cap *cap)
807{
808 struct inode *inode = &cap->ci->vfs_inode;
809 spin_lock(&inode->i_lock);
810 __ceph_remove_cap(cap);
811 spin_unlock(&inode->i_lock);
812}
813extern void ceph_put_cap(struct ceph_cap *cap);
814
815extern void ceph_queue_caps_release(struct inode *inode);
816extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
817extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
818extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
819 struct ceph_mds_session *session);
820extern int ceph_get_cap_mds(struct inode *inode);
821extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
822extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
823extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
824 struct ceph_snap_context *snapc);
825extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
826 struct ceph_mds_session **psession);
827extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
828 struct ceph_mds_session *session);
829extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
830extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
831
832extern int ceph_encode_inode_release(void **p, struct inode *inode,
833 int mds, int drop, int unless, int force);
834extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
835 int mds, int drop, int unless);
836
837extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
838 int *got, loff_t endoff);
839
840/* for counting open files by mode */
841static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
842{
843 ci->i_nr_by_mode[mode]++;
844}
845extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
846
847/* addr.c */
848extern const struct address_space_operations ceph_aops;
849extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
850
851/* file.c */
852extern const struct file_operations ceph_file_fops;
853extern const struct address_space_operations ceph_aops;
854extern int ceph_open(struct inode *inode, struct file *file);
855extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
856 struct nameidata *nd, int mode,
857 int locked_dir);
858extern int ceph_release(struct inode *inode, struct file *filp);
859extern void ceph_release_page_vector(struct page **pages, int num_pages);
860
861/* dir.c */
862extern const struct file_operations ceph_dir_fops;
863extern const struct inode_operations ceph_dir_iops;
864extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
865 ceph_snapdir_dentry_ops;
866
867extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
868extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
869 struct dentry *dentry, int err);
870
871extern void ceph_dentry_lru_add(struct dentry *dn);
872extern void ceph_dentry_lru_touch(struct dentry *dn);
873extern void ceph_dentry_lru_del(struct dentry *dn);
874
875/*
876 * our d_ops vary depending on whether the inode is live,
877 * snapshotted (read-only), or a virtual ".snap" directory.
878 */
879int ceph_init_dentry(struct dentry *dentry);
880
881
882/* ioctl.c */
883extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
884
885/* export.c */
886extern const struct export_operations ceph_export_ops;
887
888/* debugfs.c */
889extern int ceph_debugfs_init(void);
890extern void ceph_debugfs_cleanup(void);
891extern int ceph_debugfs_client_init(struct ceph_client *client);
892extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
893
894static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
895{
896 if (dentry && dentry->d_parent)
897 return dentry->d_parent->d_inode;
898
899 return NULL;
900}
901
902#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..2845422907fc
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,845 @@
1#include "ceph_debug.h"
2#include "super.h"
3#include "decode.h"
4
5#include <linux/xattr.h>
6#include <linux/slab.h>
7
8static bool ceph_is_valid_xattr(const char *name)
9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
14}
15
16/*
17 * These define virtual xattrs exposing the recursive directory
18 * statistics and layout metadata.
19 */
20struct ceph_vxattr_cb {
21 bool readonly;
22 char *name;
23 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
24 size_t size);
25};
26
27/* directories */
28
29static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
30 size_t size)
31{
32 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
33}
34
35static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
36 size_t size)
37{
38 return snprintf(val, size, "%lld", ci->i_files);
39}
40
41static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
42 size_t size)
43{
44 return snprintf(val, size, "%lld", ci->i_subdirs);
45}
46
47static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
48 size_t size)
49{
50 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
51}
52
53static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
54 size_t size)
55{
56 return snprintf(val, size, "%lld", ci->i_rfiles);
57}
58
59static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
60 size_t size)
61{
62 return snprintf(val, size, "%lld", ci->i_rsubdirs);
63}
64
65static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
66 size_t size)
67{
68 return snprintf(val, size, "%lld", ci->i_rbytes);
69}
70
71static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
72 size_t size)
73{
74 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
75 (long)ci->i_rctime.tv_nsec);
76}
77
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL }
88};
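
These entries are read through the ordinary xattr system calls. A hedged userspace sketch (the mount point /mnt/ceph/somedir is hypothetical) might look like:

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
        char buf[64];
        /* ask the kernel for the recursive byte count of a directory tree */
        ssize_t n = getxattr("/mnt/ceph/somedir", "user.ceph.dir.rbytes",
                             buf, sizeof(buf) - 1);

        if (n < 0) {
                perror("getxattr");
                return 1;
        }
        buf[n] = '\0';
        printf("rbytes = %s\n", buf);
        return 0;
}
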
89
90/* files */
91
92static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
93 size_t size)
94{
95 int ret;
96
97 ret = snprintf(val, size,
98 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
99 (unsigned long long)ceph_file_layout_su(ci->i_layout),
100 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
101 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
102 if (ceph_file_layout_pg_preferred(ci->i_layout))
103 ret += snprintf(val + ret, size - ret, "preferred_osd=%lld\n",
104 (unsigned long long)ceph_file_layout_pg_preferred(
105 ci->i_layout));
106 return ret;
107}
108
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout},
111 { true, NULL, NULL }
112};
113
114static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
115{
116 if (S_ISDIR(inode->i_mode))
117 return ceph_dir_vxattrs;
118 else if (S_ISREG(inode->i_mode))
119 return ceph_file_vxattrs;
120 return NULL;
121}
122
123static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
124 const char *name)
125{
126 do {
127 if (strcmp(vxattr->name, name) == 0)
128 return vxattr;
129 vxattr++;
130 } while (vxattr->name);
131 return NULL;
132}
133
134static int __set_xattr(struct ceph_inode_info *ci,
135 const char *name, int name_len,
136 const char *val, int val_len,
137 int dirty,
138 int should_free_name, int should_free_val,
139 struct ceph_inode_xattr **newxattr)
140{
141 struct rb_node **p;
142 struct rb_node *parent = NULL;
143 struct ceph_inode_xattr *xattr = NULL;
144 int c;
145 int new = 0;
146
147 p = &ci->i_xattrs.index.rb_node;
148 while (*p) {
149 parent = *p;
150 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
151 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
152 if (c < 0)
153 p = &(*p)->rb_left;
154 else if (c > 0)
155 p = &(*p)->rb_right;
156 else {
157 if (name_len == xattr->name_len)
158 break;
159 else if (name_len < xattr->name_len)
160 p = &(*p)->rb_left;
161 else
162 p = &(*p)->rb_right;
163 }
164 xattr = NULL;
165 }
166
167 if (!xattr) {
168 new = 1;
169 xattr = *newxattr;
170 xattr->name = name;
171 xattr->name_len = name_len;
172 xattr->should_free_name = should_free_name;
173
174 ci->i_xattrs.count++;
175 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
176 } else {
177 kfree(*newxattr);
178 *newxattr = NULL;
179 if (xattr->should_free_val)
180 kfree((void *)xattr->val);
181
182 if (should_free_name) {
183 kfree((void *)name);
184 name = xattr->name;
185 }
186 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len;
188 }
189 if (!xattr) {
190 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s\n",
191 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode),
192 name);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len;
197 if (val)
198 xattr->val = val;
199 else
200 xattr->val = "";
201
202 xattr->val_len = val_len;
203 xattr->dirty = dirty;
204 xattr->should_free_val = (val && should_free_val);
205
206 if (new) {
207 rb_link_node(&xattr->node, parent, p);
208 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
209 dout("__set_xattr_val p=%p\n", p);
210 }
211
212 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
213 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
214
215 return 0;
216}
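
The tree above keys entries by name, comparing the shared prefix with strncmp() and breaking ties by length, so a shorter name sorts before any longer name it is a prefix of. A standalone sketch of that ordering (demo_xattr_cmp is not in the source) behaves like this:

#include <stdio.h>
#include <string.h>

/* < 0, 0, > 0 -- the same ordering __set_xattr() uses for the rb-tree */
static int demo_xattr_cmp(const char *a, int a_len, const char *b, int b_len)
{
        int c = strncmp(a, b, a_len < b_len ? a_len : b_len);

        if (c)
                return c;
        return a_len - b_len;   /* shorter name sorts first on a prefix tie */
}

int main(void)
{
        printf("%d\n", demo_xattr_cmp("user.a", 6, "user.ab", 7) < 0);  /* 1 */
        printf("%d\n", demo_xattr_cmp("user.b", 6, "user.a", 6) > 0);   /* 1 */
        printf("%d\n", demo_xattr_cmp("user.a", 6, "user.a", 6) == 0);  /* 1 */
        return 0;
}
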
217
218static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
219 const char *name)
220{
221 struct rb_node **p;
222 struct rb_node *parent = NULL;
223 struct ceph_inode_xattr *xattr = NULL;
224 int c;
225
226 p = &ci->i_xattrs.index.rb_node;
227 while (*p) {
228 parent = *p;
229 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
230 c = strncmp(name, xattr->name, xattr->name_len);
231 if (c < 0)
232 p = &(*p)->rb_left;
233 else if (c > 0)
234 p = &(*p)->rb_right;
235 else {
236 dout("__get_xattr %s: found %.*s\n", name,
237 xattr->val_len, xattr->val);
238 return xattr;
239 }
240 }
241
242 dout("__get_xattr %s: not found\n", name);
243
244 return NULL;
245}
246
247static void __free_xattr(struct ceph_inode_xattr *xattr)
248{
249 BUG_ON(!xattr);
250
251 if (xattr->should_free_name)
252 kfree((void *)xattr->name);
253 if (xattr->should_free_val)
254 kfree((void *)xattr->val);
255
256 kfree(xattr);
257}
258
259static int __remove_xattr(struct ceph_inode_info *ci,
260 struct ceph_inode_xattr *xattr)
261{
262 if (!xattr)
263 return -EOPNOTSUPP;
264
265 rb_erase(&xattr->node, &ci->i_xattrs.index);
266
267 if (xattr->should_free_name)
268 kfree((void *)xattr->name);
269 if (xattr->should_free_val)
270 kfree((void *)xattr->val);
271
272 ci->i_xattrs.names_size -= xattr->name_len;
273 ci->i_xattrs.vals_size -= xattr->val_len;
274 ci->i_xattrs.count--;
275 kfree(xattr);
276
277 return 0;
278}
279
280static int __remove_xattr_by_name(struct ceph_inode_info *ci,
281 const char *name)
282{
283 struct rb_node **p;
284 struct ceph_inode_xattr *xattr;
285 int err;
286
287 p = &ci->i_xattrs.index.rb_node;
288 xattr = __get_xattr(ci, name);
289 err = __remove_xattr(ci, xattr);
290 return err;
291}
292
293static char *__copy_xattr_names(struct ceph_inode_info *ci,
294 char *dest)
295{
296 struct rb_node *p;
297 struct ceph_inode_xattr *xattr = NULL;
298
299 p = rb_first(&ci->i_xattrs.index);
300 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
301
302 while (p) {
303 xattr = rb_entry(p, struct ceph_inode_xattr, node);
304 memcpy(dest, xattr->name, xattr->name_len);
305 dest[xattr->name_len] = '\0';
306
307 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
308 xattr->name_len, ci->i_xattrs.names_size);
309
310 dest += xattr->name_len + 1;
311 p = rb_next(p);
312 }
313
314 return dest;
315}
316
317void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
318{
319 struct rb_node *p, *tmp;
320 struct ceph_inode_xattr *xattr = NULL;
321
322 p = rb_first(&ci->i_xattrs.index);
323
324 dout("__ceph_destroy_xattrs p=%p\n", p);
325
326 while (p) {
327 xattr = rb_entry(p, struct ceph_inode_xattr, node);
328 tmp = p;
329 p = rb_next(tmp);
330 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
331 xattr->name_len, xattr->name);
332 rb_erase(tmp, &ci->i_xattrs.index);
333
334 __free_xattr(xattr);
335 }
336
337 ci->i_xattrs.names_size = 0;
338 ci->i_xattrs.vals_size = 0;
339 ci->i_xattrs.index_version = 0;
340 ci->i_xattrs.count = 0;
341 ci->i_xattrs.index = RB_ROOT;
342}
343
344static int __build_xattrs(struct inode *inode)
345{
346 u32 namelen;
347 u32 numattr = 0;
348 void *p, *end;
349 u32 len;
350 const char *name, *val;
351 struct ceph_inode_info *ci = ceph_inode(inode);
352 int xattr_version;
353 struct ceph_inode_xattr **xattrs = NULL;
354 int err = 0;
355 int i;
356
357 dout("__build_xattrs() len=%d\n",
358 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
359
360 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
361 return 0; /* already built */
362
363 __ceph_destroy_xattrs(ci);
364
365start:
366 /* update the internal xattr rb tree */
367 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
368 p = ci->i_xattrs.blob->vec.iov_base;
369 end = p + ci->i_xattrs.blob->vec.iov_len;
370 ceph_decode_32_safe(&p, end, numattr, bad);
371 xattr_version = ci->i_xattrs.version;
372 spin_unlock(&inode->i_lock);
373
374 xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
375 GFP_NOFS);
376 err = -ENOMEM;
377 if (!xattrs)
378 goto bad_lock;
379 memset(xattrs, 0, numattr*sizeof(struct ceph_inode_xattr *));
380 for (i = 0; i < numattr; i++) {
381 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
382 GFP_NOFS);
383 if (!xattrs[i])
384 goto bad_lock;
385 }
386
387 spin_lock(&inode->i_lock);
388 if (ci->i_xattrs.version != xattr_version) {
389 /* lost a race, retry */
390 for (i = 0; i < numattr; i++)
391 kfree(xattrs[i]);
392 kfree(xattrs);
393 goto start;
394 }
395 err = -EIO;
396 while (numattr--) {
397 ceph_decode_32_safe(&p, end, len, bad);
398 namelen = len;
399 name = p;
400 p += len;
401 ceph_decode_32_safe(&p, end, len, bad);
402 val = p;
403 p += len;
404
405 err = __set_xattr(ci, name, namelen, val, len,
406 0, 0, 0, &xattrs[numattr]);
407
408 if (err < 0)
409 goto bad;
410 }
411 kfree(xattrs);
412 }
413 ci->i_xattrs.index_version = ci->i_xattrs.version;
414 ci->i_xattrs.dirty = false;
415
416 return err;
417bad_lock:
418 spin_lock(&inode->i_lock);
419bad:
420 if (xattrs) {
421 for (i = 0; i < numattr; i++)
422 kfree(xattrs[i]);
423 kfree(xattrs);
424 }
425 ci->i_xattrs.names_size = 0;
426 return err;
427}
428
429static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
430 int val_size)
431{
432 /*
433 * 4 bytes for the count, plus a 4-byte length for each xattr name
434 * and a 4-byte length for each value
435 */
436 int size = 4 + ci->i_xattrs.count*(4 + 4) +
437 ci->i_xattrs.names_size +
438 ci->i_xattrs.vals_size;
439 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
440 ci->i_xattrs.count, ci->i_xattrs.names_size,
441 ci->i_xattrs.vals_size);
442
443 if (name_size)
444 size += 4 + 4 + name_size + val_size;
445
446 return size;
447}
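
Working the formula through for a concrete inode makes the layout clear: with two xattrs already present (names of 6 and 8 bytes, values of 3 and 5 bytes), the base blob is 4 + 2*(4+4) + 14 + 8 = 42 bytes; a pending 9-byte name with a 4-byte value adds 4 + 4 + 9 + 4 = 21, for 63 in total. A trivial check of the same arithmetic, with demo_* names standing in:

#include <stdio.h>

/*
 * mirror of __get_required_blob_size(): a 4-byte count, then a 4-byte
 * length before every name and before every value
 */
static int demo_blob_size(int count, int names_size, int vals_size,
                          int new_name, int new_val)
{
        int size = 4 + count * (4 + 4) + names_size + vals_size;

        if (new_name)
                size += 4 + 4 + new_name + new_val;
        return size;
}

int main(void)
{
        printf("%d\n", demo_blob_size(2, 6 + 8, 3 + 5, 9, 4)); /* 63 */
        return 0;
}
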
448
449/*
450 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
451 * and swap into place.
452 */
453void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
454{
455 struct rb_node *p;
456 struct ceph_inode_xattr *xattr = NULL;
457 void *dest;
458
459 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
460 if (ci->i_xattrs.dirty) {
461 int need = __get_required_blob_size(ci, 0, 0);
462
463 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
464
465 p = rb_first(&ci->i_xattrs.index);
466 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
467
468 ceph_encode_32(&dest, ci->i_xattrs.count);
469 while (p) {
470 xattr = rb_entry(p, struct ceph_inode_xattr, node);
471
472 ceph_encode_32(&dest, xattr->name_len);
473 memcpy(dest, xattr->name, xattr->name_len);
474 dest += xattr->name_len;
475 ceph_encode_32(&dest, xattr->val_len);
476 memcpy(dest, xattr->val, xattr->val_len);
477 dest += xattr->val_len;
478
479 p = rb_next(p);
480 }
481
482 /* adjust buffer len; it may be larger than we need */
483 ci->i_xattrs.prealloc_blob->vec.iov_len =
484 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
485
486 if (ci->i_xattrs.blob)
487 ceph_buffer_put(ci->i_xattrs.blob);
488 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
489 ci->i_xattrs.prealloc_blob = NULL;
490 ci->i_xattrs.dirty = false;
491 }
492}
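
The resulting blob is a flat little-endian encoding: a u32 count followed by (u32 length, bytes) pairs for each name and each value. A hedged userspace sketch of the same wire format, where demo_encode_32 only approximates ceph_encode_32():

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* write a u32 little-endian and advance the cursor, like ceph_encode_32() */
static void demo_encode_32(unsigned char **p, uint32_t v)
{
        (*p)[0] = v & 0xff;
        (*p)[1] = (v >> 8) & 0xff;
        (*p)[2] = (v >> 16) & 0xff;
        (*p)[3] = (v >> 24) & 0xff;
        *p += 4;
}

static void demo_encode_pair(unsigned char **p, const char *s)
{
        uint32_t len = (uint32_t)strlen(s);

        demo_encode_32(p, len);
        memcpy(*p, s, len);
        *p += len;
}

int main(void)
{
        unsigned char blob[64], *p = blob;

        demo_encode_32(&p, 1);              /* one xattr in the blob */
        demo_encode_pair(&p, "user.demo");  /* name */
        demo_encode_pair(&p, "value");      /* value */
        printf("blob is %ld bytes\n", (long)(p - blob)); /* 4+4+9+4+5 = 26 */
        return 0;
}
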
493
494ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
495 size_t size)
496{
497 struct inode *inode = dentry->d_inode;
498 struct ceph_inode_info *ci = ceph_inode(inode);
499 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
500 int err;
501 struct ceph_inode_xattr *xattr;
502 struct ceph_vxattr_cb *vxattr = NULL;
503
504 if (!ceph_is_valid_xattr(name))
505 return -ENODATA;
506
507 /* let's see if a virtual xattr was requested */
508 if (vxattrs)
509 vxattr = ceph_match_vxattr(vxattrs, name);
510
511 spin_lock(&inode->i_lock);
512 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
513 ci->i_xattrs.version, ci->i_xattrs.index_version);
514
515 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
516 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
517 goto get_xattr;
518 } else {
519 spin_unlock(&inode->i_lock);
520 /* get xattrs from mds (if we don't already have them) */
521 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
522 if (err)
523 return err;
524 }
525
526 spin_lock(&inode->i_lock);
527
528 if (vxattr && vxattr->readonly) {
529 err = vxattr->getxattr_cb(ci, value, size);
530 goto out;
531 }
532
533 err = __build_xattrs(inode);
534 if (err < 0)
535 goto out;
536
537get_xattr:
538 err = -ENODATA; /* == ENOATTR */
539 xattr = __get_xattr(ci, name);
540 if (!xattr) {
541 if (vxattr)
542 err = vxattr->getxattr_cb(ci, value, size);
543 goto out;
544 }
545
546 err = -ERANGE;
547 if (size && size < xattr->val_len)
548 goto out;
549
550 err = xattr->val_len;
551 if (size == 0)
552 goto out;
553
554 memcpy(value, xattr->val, xattr->val_len);
555
556out:
557 spin_unlock(&inode->i_lock);
558 return err;
559}
560
561ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
562{
563 struct inode *inode = dentry->d_inode;
564 struct ceph_inode_info *ci = ceph_inode(inode);
565 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
566 u32 vir_namelen = 0;
567 u32 namelen;
568 int err;
569 u32 len;
570 int i;
571
572 spin_lock(&inode->i_lock);
573 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
574 ci->i_xattrs.version, ci->i_xattrs.index_version);
575
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr;
579 } else {
580 spin_unlock(&inode->i_lock);
581 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
582 if (err)
583 return err;
584 }
585
586 spin_lock(&inode->i_lock);
587
588 err = __build_xattrs(inode);
589 if (err < 0)
590 goto out;
591
592list_xattr:
593 vir_namelen = 0;
594 /* include virtual dir xattrs */
595 if (vxattrs)
596 for (i = 0; vxattrs[i].name; i++)
597 vir_namelen += strlen(vxattrs[i].name) + 1;
598 /* plus 1 byte per regular xattr name for its null terminator */
599 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
600 err = -ERANGE;
601 if (size && namelen > size)
602 goto out;
603
604 err = namelen;
605 if (size == 0)
606 goto out;
607
608 names = __copy_xattr_names(ci, names);
609
610 /* virtual xattr names, too */
611 if (vxattrs)
612 for (i = 0; vxattrs[i].name; i++) {
613 len = sprintf(names, "%s", vxattrs[i].name);
614 names += len + 1;
615 }
616
617out:
618 spin_unlock(&inode->i_lock);
619 return err;
620}
621
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags)
624{
625 struct ceph_client *client = ceph_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode;
629 struct ceph_mds_request *req;
630 struct ceph_mds_client *mdsc = &client->mdsc;
631 int err;
632 int i, nr_pages;
633 struct page **pages = NULL;
634 void *kaddr;
635
636 /* copy value into some pages */
637 nr_pages = calc_pages_for(0, size);
638 if (nr_pages) {
639 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
640 if (!pages)
641 return -ENOMEM;
642 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS);
645 if (!pages[i]) {
646 nr_pages = i;
647 goto out;
648 }
649 kaddr = kmap(pages[i]);
650 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
651 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
652 }
653 }
654
655 dout("setxattr value=%.*s\n", (int)size, value);
656
657 /* do request */
658 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
659 USE_AUTH_MDS);
660 if (IS_ERR(req)) {
661 err = PTR_ERR(req);
662 goto out;
663 }
664 req->r_inode = igrab(inode);
665 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
666 req->r_num_caps = 1;
667 req->r_args.setxattr.flags = cpu_to_le32(flags);
668 req->r_path2 = kstrdup(name, GFP_NOFS);
669
670 req->r_pages = pages;
671 req->r_num_pages = nr_pages;
672 req->r_data_len = size;
673
674 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
675 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
676 ceph_mdsc_put_request(req);
677 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
678
679out:
680 if (pages) {
681 for (i = 0; i < nr_pages; i++)
682 __free_page(pages[i]);
683 kfree(pages);
684 }
685 return err;
686}
687
688int ceph_setxattr(struct dentry *dentry, const char *name,
689 const void *value, size_t size, int flags)
690{
691 struct inode *inode = dentry->d_inode;
692 struct ceph_inode_info *ci = ceph_inode(inode);
693 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
694 int err;
695 int name_len = strlen(name);
696 int val_len = size;
697 char *newname = NULL;
698 char *newval = NULL;
699 struct ceph_inode_xattr *xattr = NULL;
700 int issued;
701 int required_blob_size;
702
703 if (ceph_snap(inode) != CEPH_NOSNAP)
704 return -EROFS;
705
706 if (!ceph_is_valid_xattr(name))
707 return -EOPNOTSUPP;
708
709 if (vxattrs) {
710 struct ceph_vxattr_cb *vxattr =
711 ceph_match_vxattr(vxattrs, name);
712 if (vxattr && vxattr->readonly)
713 return -EOPNOTSUPP;
714 }
715
716 /* preallocate memory for xattr name, value, index node */
717 err = -ENOMEM;
718 newname = kmalloc(name_len + 1, GFP_NOFS);
719 if (!newname)
720 goto out;
721 memcpy(newname, name, name_len + 1);
722
723 if (val_len) {
724 newval = kmalloc(val_len + 1, GFP_NOFS);
725 if (!newval)
726 goto out;
727 memcpy(newval, value, val_len);
728 newval[val_len] = '\0';
729 }
730
731 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
732 if (!xattr)
733 goto out;
734
735 spin_lock(&inode->i_lock);
736retry:
737 issued = __ceph_caps_issued(ci, NULL);
738 if (!(issued & CEPH_CAP_XATTR_EXCL))
739 goto do_sync;
740 __build_xattrs(inode);
741
742 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
743
744 if (!ci->i_xattrs.prealloc_blob ||
745 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
746 struct ceph_buffer *blob = NULL;
747
748 spin_unlock(&inode->i_lock);
749 dout(" preallocating new blob size=%d\n", required_blob_size);
750 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
751 if (!blob)
752 goto out;
753 spin_lock(&inode->i_lock);
754 if (ci->i_xattrs.prealloc_blob)
755 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
756 ci->i_xattrs.prealloc_blob = blob;
757 goto retry;
758 }
759
760 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
761 err = __set_xattr(ci, newname, name_len, newval,
762 val_len, 1, 1, 1, &xattr);
763 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
764 ci->i_xattrs.dirty = true;
765 inode->i_ctime = CURRENT_TIME;
766 spin_unlock(&inode->i_lock);
767
768 return err;
769
770do_sync:
771 spin_unlock(&inode->i_lock);
772 err = ceph_sync_setxattr(dentry, name, value, size, flags);
773out:
774 kfree(newname);
775 kfree(newval);
776 kfree(xattr);
777 return err;
778}
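
The retry loop above is a common kernel idiom: memory cannot be allocated while a spinlock is held, so the code drops the lock, allocates, retakes the lock, and re-checks whether the world changed in between. A compressed sketch of the pattern, using a pthread mutex purely for illustration (demo_reserve and the globals are hypothetical):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static char *prealloc;
static size_t prealloc_len;

/* ensure at least `need` bytes are preallocated before the critical work */
static int demo_reserve(size_t need)
{
        pthread_mutex_lock(&lock);
retry:
        if (prealloc_len < need) {
                char *blob;

                pthread_mutex_unlock(&lock);  /* cannot allocate under the lock */
                blob = malloc(need);
                if (!blob)
                        return -1;
                pthread_mutex_lock(&lock);
                free(prealloc);               /* another thread may have raced us */
                prealloc = blob;
                prealloc_len = need;
                goto retry;                   /* re-check under the lock */
        }
        /* ... critical-section work using prealloc ... */
        pthread_mutex_unlock(&lock);
        return 0;
}

int main(void)
{
        return demo_reserve(128);
}
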
779
780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{
782 struct ceph_client *client = ceph_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode;
786 struct ceph_mds_request *req;
787 int err;
788
789 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
790 USE_AUTH_MDS);
791 if (IS_ERR(req))
792 return PTR_ERR(req);
793 req->r_inode = igrab(inode);
794 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
795 req->r_num_caps = 1;
796 req->r_path2 = kstrdup(name, GFP_NOFS);
797
798 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
799 ceph_mdsc_put_request(req);
800 return err;
801}
802
803int ceph_removexattr(struct dentry *dentry, const char *name)
804{
805 struct inode *inode = dentry->d_inode;
806 struct ceph_inode_info *ci = ceph_inode(inode);
807 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
808 int issued;
809 int err;
810
811 if (ceph_snap(inode) != CEPH_NOSNAP)
812 return -EROFS;
813
814 if (!ceph_is_valid_xattr(name))
815 return -EOPNOTSUPP;
816
817 if (vxattrs) {
818 struct ceph_vxattr_cb *vxattr =
819 ceph_match_vxattr(vxattrs, name);
820 if (vxattr && vxattr->readonly)
821 return -EOPNOTSUPP;
822 }
823
824 spin_lock(&inode->i_lock);
825 __build_xattrs(inode);
826 issued = __ceph_caps_issued(ci, NULL);
827 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
828
829 if (!(issued & CEPH_CAP_XATTR_EXCL))
830 goto do_sync;
831
832 err = __remove_xattr_by_name(ceph_inode(inode), name);
833 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
834 ci->i_xattrs.dirty = true;
835 inode->i_ctime = CURRENT_TIME;
836
837 spin_unlock(&inode->i_lock);
838
839 return err;
840do_sync:
841 spin_unlock(&inode->i_lock);
842 err = ceph_send_removexattr(dentry, name);
843 return err;
844}
845
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 20692fbfdb24..cfd1ce34e0bc 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -136,7 +136,7 @@ asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
136 return 0; 136 return 0;
137 } 137 }
138 138
139 ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to lenght octet */ 139 ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
140 if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */ 140 if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */
141 *val = *(++(ctx->pointer)); /* value has enum value */ 141 *val = *(++(ctx->pointer)); /* value has enum value */
142 else 142 else
@@ -492,17 +492,13 @@ compare_oid(unsigned long *oid1, unsigned int oid1len,
492 492
493int 493int
494decode_negTokenInit(unsigned char *security_blob, int length, 494decode_negTokenInit(unsigned char *security_blob, int length,
495 enum securityEnum *secType) 495 struct TCP_Server_Info *server)
496{ 496{
497 struct asn1_ctx ctx; 497 struct asn1_ctx ctx;
498 unsigned char *end; 498 unsigned char *end;
499 unsigned char *sequence_end; 499 unsigned char *sequence_end;
500 unsigned long *oid = NULL; 500 unsigned long *oid = NULL;
501 unsigned int cls, con, tag, oidlen, rc; 501 unsigned int cls, con, tag, oidlen, rc;
502 bool use_ntlmssp = false;
503 bool use_kerberos = false;
504 bool use_kerberosu2u = false;
505 bool use_mskerberos = false;
506 502
507 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ 503 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
508 504
@@ -510,11 +506,11 @@ decode_negTokenInit(unsigned char *security_blob, int length,
510 506
511 /* GSSAPI header */ 507 /* GSSAPI header */
512 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 508 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
513 cFYI(1, ("Error decoding negTokenInit header")); 509 cFYI(1, "Error decoding negTokenInit header");
514 return 0; 510 return 0;
515 } else if ((cls != ASN1_APL) || (con != ASN1_CON) 511 } else if ((cls != ASN1_APL) || (con != ASN1_CON)
516 || (tag != ASN1_EOC)) { 512 || (tag != ASN1_EOC)) {
517 cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag)); 513 cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag);
518 return 0; 514 return 0;
519 } 515 }
520 516
@@ -535,56 +531,52 @@ decode_negTokenInit(unsigned char *security_blob, int length,
535 531
536 /* SPNEGO OID not present or garbled -- bail out */ 532 /* SPNEGO OID not present or garbled -- bail out */
537 if (!rc) { 533 if (!rc) {
538 cFYI(1, ("Error decoding negTokenInit header")); 534 cFYI(1, "Error decoding negTokenInit header");
539 return 0; 535 return 0;
540 } 536 }
541 537
542 /* SPNEGO */ 538 /* SPNEGO */
543 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 539 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
544 cFYI(1, ("Error decoding negTokenInit")); 540 cFYI(1, "Error decoding negTokenInit");
545 return 0; 541 return 0;
546 } else if ((cls != ASN1_CTX) || (con != ASN1_CON) 542 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
547 || (tag != ASN1_EOC)) { 543 || (tag != ASN1_EOC)) {
548 cFYI(1, 544 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
549 ("cls = %d con = %d tag = %d end = %p (%d) exit 0", 545 cls, con, tag, end, *end);
550 cls, con, tag, end, *end));
551 return 0; 546 return 0;
552 } 547 }
553 548
554 /* negTokenInit */ 549 /* negTokenInit */
555 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 550 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
556 cFYI(1, ("Error decoding negTokenInit")); 551 cFYI(1, "Error decoding negTokenInit");
557 return 0; 552 return 0;
558 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 553 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
559 || (tag != ASN1_SEQ)) { 554 || (tag != ASN1_SEQ)) {
560 cFYI(1, 555 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
561 ("cls = %d con = %d tag = %d end = %p (%d) exit 1", 556 cls, con, tag, end, *end);
562 cls, con, tag, end, *end));
563 return 0; 557 return 0;
564 } 558 }
565 559
566 /* sequence */ 560 /* sequence */
567 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 561 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
568 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 562 cFYI(1, "Error decoding 2nd part of negTokenInit");
569 return 0; 563 return 0;
570 } else if ((cls != ASN1_CTX) || (con != ASN1_CON) 564 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
571 || (tag != ASN1_EOC)) { 565 || (tag != ASN1_EOC)) {
572 cFYI(1, 566 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
573 ("cls = %d con = %d tag = %d end = %p (%d) exit 0", 567 cls, con, tag, end, *end);
574 cls, con, tag, end, *end));
575 return 0; 568 return 0;
576 } 569 }
577 570
578 /* sequence of */ 571 /* sequence of */
579 if (asn1_header_decode 572 if (asn1_header_decode
580 (&ctx, &sequence_end, &cls, &con, &tag) == 0) { 573 (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
581 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 574 cFYI(1, "Error decoding 2nd part of negTokenInit");
582 return 0; 575 return 0;
583 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 576 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
584 || (tag != ASN1_SEQ)) { 577 || (tag != ASN1_SEQ)) {
585 cFYI(1, 578 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
586 ("cls = %d con = %d tag = %d end = %p (%d) exit 1", 579 cls, con, tag, end, *end);
587 cls, con, tag, end, *end));
588 return 0; 580 return 0;
589 } 581 }
590 582
@@ -592,37 +584,33 @@ decode_negTokenInit(unsigned char *security_blob, int length,
592 while (!asn1_eoc_decode(&ctx, sequence_end)) { 584 while (!asn1_eoc_decode(&ctx, sequence_end)) {
593 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); 585 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
594 if (!rc) { 586 if (!rc) {
595 cFYI(1, 587 cFYI(1, "Error decoding negTokenInit hdr exit2");
596 ("Error decoding negTokenInit hdr exit2"));
597 return 0; 588 return 0;
598 } 589 }
599 if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { 590 if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
600 if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { 591 if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
601 592
602 cFYI(1, ("OID len = %d oid = 0x%lx 0x%lx " 593 cFYI(1, "OID len = %d oid = 0x%lx 0x%lx "
603 "0x%lx 0x%lx", oidlen, *oid, 594 "0x%lx 0x%lx", oidlen, *oid,
604 *(oid + 1), *(oid + 2), *(oid + 3))); 595 *(oid + 1), *(oid + 2), *(oid + 3));
605 596
606 if (compare_oid(oid, oidlen, MSKRB5_OID, 597 if (compare_oid(oid, oidlen, MSKRB5_OID,
607 MSKRB5_OID_LEN) && 598 MSKRB5_OID_LEN))
608 !use_mskerberos) 599 server->sec_mskerberos = true;
609 use_mskerberos = true;
610 else if (compare_oid(oid, oidlen, KRB5U2U_OID, 600 else if (compare_oid(oid, oidlen, KRB5U2U_OID,
611 KRB5U2U_OID_LEN) && 601 KRB5U2U_OID_LEN))
612 !use_kerberosu2u) 602 server->sec_kerberosu2u = true;
613 use_kerberosu2u = true;
614 else if (compare_oid(oid, oidlen, KRB5_OID, 603 else if (compare_oid(oid, oidlen, KRB5_OID,
615 KRB5_OID_LEN) && 604 KRB5_OID_LEN))
616 !use_kerberos) 605 server->sec_kerberos = true;
617 use_kerberos = true;
618 else if (compare_oid(oid, oidlen, NTLMSSP_OID, 606 else if (compare_oid(oid, oidlen, NTLMSSP_OID,
619 NTLMSSP_OID_LEN)) 607 NTLMSSP_OID_LEN))
620 use_ntlmssp = true; 608 server->sec_ntlmssp = true;
621 609
622 kfree(oid); 610 kfree(oid);
623 } 611 }
624 } else { 612 } else {
625 cFYI(1, ("Should be an oid what is going on?")); 613 cFYI(1, "Should be an oid what is going on?");
626 } 614 }
627 } 615 }
628 616
@@ -632,54 +620,47 @@ decode_negTokenInit(unsigned char *security_blob, int length,
632 no mechListMic (e.g. NTLMSSP instead of KRB5) */ 620 no mechListMic (e.g. NTLMSSP instead of KRB5) */
633 if (ctx.error == ASN1_ERR_DEC_EMPTY) 621 if (ctx.error == ASN1_ERR_DEC_EMPTY)
634 goto decode_negtoken_exit; 622 goto decode_negtoken_exit;
635 cFYI(1, ("Error decoding last part negTokenInit exit3")); 623 cFYI(1, "Error decoding last part negTokenInit exit3");
636 return 0; 624 return 0;
637 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 625 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
638 /* tag = 3 indicating mechListMIC */ 626 /* tag = 3 indicating mechListMIC */
639 cFYI(1, ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)", 627 cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
640 cls, con, tag, end, *end)); 628 cls, con, tag, end, *end);
641 return 0; 629 return 0;
642 } 630 }
643 631
644 /* sequence */ 632 /* sequence */
645 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 633 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
646 cFYI(1, ("Error decoding last part negTokenInit exit5")); 634 cFYI(1, "Error decoding last part negTokenInit exit5");
647 return 0; 635 return 0;
648 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 636 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
649 || (tag != ASN1_SEQ)) { 637 || (tag != ASN1_SEQ)) {
650 cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)", 638 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)",
651 cls, con, tag, end, *end)); 639 cls, con, tag, end, *end);
652 } 640 }
653 641
654 /* sequence of */ 642 /* sequence of */
655 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 643 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
656 cFYI(1, ("Error decoding last part negTokenInit exit 7")); 644 cFYI(1, "Error decoding last part negTokenInit exit 7");
657 return 0; 645 return 0;
658 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 646 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
659 cFYI(1, ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)", 647 cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
660 cls, con, tag, end, *end)); 648 cls, con, tag, end, *end);
661 return 0; 649 return 0;
662 } 650 }
663 651
664 /* general string */ 652 /* general string */
665 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 653 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
666 cFYI(1, ("Error decoding last part negTokenInit exit9")); 654 cFYI(1, "Error decoding last part negTokenInit exit9");
667 return 0; 655 return 0;
668 } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) 656 } else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
669 || (tag != ASN1_GENSTR)) { 657 || (tag != ASN1_GENSTR)) {
670 cFYI(1, ("Exit10 cls = %d con = %d tag = %d end = %p (%d)", 658 cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)",
671 cls, con, tag, end, *end)); 659 cls, con, tag, end, *end);
672 return 0; 660 return 0;
673 } 661 }
674 cFYI(1, ("Need to call asn1_octets_decode() function for %s", 662 cFYI(1, "Need to call asn1_octets_decode() function for %s",
675 ctx.pointer)); /* is this UTF-8 or ASCII? */ 663 ctx.pointer); /* is this UTF-8 or ASCII? */
676decode_negtoken_exit: 664decode_negtoken_exit:
677 if (use_kerberos)
678 *secType = Kerberos;
679 else if (use_mskerberos)
680 *secType = MSKerberos;
681 else if (use_ntlmssp)
682 *secType = RawNTLMSSP;
683
684 return 1; 665 return 1;
685} 666}
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 42cec2a7c0cf..4fce6e61b34e 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -60,10 +60,10 @@ cifs_dump_mem(char *label, void *data, int length)
60#ifdef CONFIG_CIFS_DEBUG2 60#ifdef CONFIG_CIFS_DEBUG2
61void cifs_dump_detail(struct smb_hdr *smb) 61void cifs_dump_detail(struct smb_hdr *smb)
62{ 62{
63 cERROR(1, ("Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", 63 cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
64 smb->Command, smb->Status.CifsError, 64 smb->Command, smb->Status.CifsError,
65 smb->Flags, smb->Flags2, smb->Mid, smb->Pid)); 65 smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
66 cERROR(1, ("smb buf %p len %d", smb, smbCalcSize_LE(smb))); 66 cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb));
67} 67}
68 68
69 69
@@ -75,25 +75,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
75 if (server == NULL) 75 if (server == NULL)
76 return; 76 return;
77 77
78 cERROR(1, ("Dump pending requests:")); 78 cERROR(1, "Dump pending requests:");
79 spin_lock(&GlobalMid_Lock); 79 spin_lock(&GlobalMid_Lock);
80 list_for_each(tmp, &server->pending_mid_q) { 80 list_for_each(tmp, &server->pending_mid_q) {
81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
82 cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", 82 cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
83 mid_entry->midState, 83 mid_entry->midState,
84 (int)mid_entry->command, 84 (int)mid_entry->command,
85 mid_entry->pid, 85 mid_entry->pid,
86 mid_entry->tsk, 86 mid_entry->tsk,
87 mid_entry->mid)); 87 mid_entry->mid);
88#ifdef CONFIG_CIFS_STATS2 88#ifdef CONFIG_CIFS_STATS2
89 cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld", 89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
90 mid_entry->largeBuf, 90 mid_entry->largeBuf,
91 mid_entry->resp_buf, 91 mid_entry->resp_buf,
92 mid_entry->when_received, 92 mid_entry->when_received,
93 jiffies)); 93 jiffies);
94#endif /* STATS2 */ 94#endif /* STATS2 */
95 cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp, 95 cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp,
96 mid_entry->multiEnd)); 96 mid_entry->multiEnd);
97 if (mid_entry->resp_buf) { 97 if (mid_entry->resp_buf) {
98 cifs_dump_detail(mid_entry->resp_buf); 98 cifs_dump_detail(mid_entry->resp_buf);
99 cifs_dump_mem("existing buf: ", 99 cifs_dump_mem("existing buf: ",
@@ -716,7 +716,7 @@ static const struct file_operations cifs_multiuser_mount_proc_fops = {
716 716
717static int cifs_security_flags_proc_show(struct seq_file *m, void *v) 717static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
718{ 718{
719 seq_printf(m, "0x%x\n", extended_security); 719 seq_printf(m, "0x%x\n", global_secflags);
720 return 0; 720 return 0;
721} 721}
722 722
@@ -744,13 +744,13 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
744 /* single char or single char followed by null */ 744 /* single char or single char followed by null */
745 c = flags_string[0]; 745 c = flags_string[0];
746 if (c == '0' || c == 'n' || c == 'N') { 746 if (c == '0' || c == 'n' || c == 'N') {
747 extended_security = CIFSSEC_DEF; /* default */ 747 global_secflags = CIFSSEC_DEF; /* default */
748 return count; 748 return count;
749 } else if (c == '1' || c == 'y' || c == 'Y') { 749 } else if (c == '1' || c == 'y' || c == 'Y') {
750 extended_security = CIFSSEC_MAX; 750 global_secflags = CIFSSEC_MAX;
751 return count; 751 return count;
752 } else if (!isdigit(c)) { 752 } else if (!isdigit(c)) {
753 cERROR(1, ("invalid flag %c", c)); 753 cERROR(1, "invalid flag %c", c);
754 return -EINVAL; 754 return -EINVAL;
755 } 755 }
756 } 756 }
@@ -758,26 +758,26 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
758 758
759 flags = simple_strtoul(flags_string, NULL, 0); 759 flags = simple_strtoul(flags_string, NULL, 0);
760 760
761 cFYI(1, ("sec flags 0x%x", flags)); 761 cFYI(1, "sec flags 0x%x", flags);
762 762
763 if (flags <= 0) { 763 if (flags <= 0) {
764 cERROR(1, ("invalid security flags %s", flags_string)); 764 cERROR(1, "invalid security flags %s", flags_string);
765 return -EINVAL; 765 return -EINVAL;
766 } 766 }
767 767
768 if (flags & ~CIFSSEC_MASK) { 768 if (flags & ~CIFSSEC_MASK) {
769 cERROR(1, ("attempt to set unsupported security flags 0x%x", 769 cERROR(1, "attempt to set unsupported security flags 0x%x",
770 flags & ~CIFSSEC_MASK)); 770 flags & ~CIFSSEC_MASK);
771 return -EINVAL; 771 return -EINVAL;
772 } 772 }
773 /* flags look ok - update the global security flags for cifs module */ 773 /* flags look ok - update the global security flags for cifs module */
774 extended_security = flags; 774 global_secflags = flags;
775 if (extended_security & CIFSSEC_MUST_SIGN) { 775 if (global_secflags & CIFSSEC_MUST_SIGN) {
776 /* requiring signing implies signing is allowed */ 776 /* requiring signing implies signing is allowed */
777 extended_security |= CIFSSEC_MAY_SIGN; 777 global_secflags |= CIFSSEC_MAY_SIGN;
778 cFYI(1, ("packet signing now required")); 778 cFYI(1, "packet signing now required");
779 } else if ((extended_security & CIFSSEC_MAY_SIGN) == 0) { 779 } else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) {
780 cFYI(1, ("packet signing disabled")); 780 cFYI(1, "packet signing disabled");
781 } 781 }
782 /* BB should we turn on MAY flags for other MUST options? */ 782 /* BB should we turn on MAY flags for other MUST options? */
783 return count; 783 return count;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 5eb3b83bbfa7..aa316891ac0c 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -43,34 +43,54 @@ void dump_smb(struct smb_hdr *, int);
43 */ 43 */
44#ifdef CIFS_DEBUG 44#ifdef CIFS_DEBUG
45 45
46
47/* information message: e.g., configuration, major event */ 46/* information message: e.g., configuration, major event */
48extern int cifsFYI; 47extern int cifsFYI;
49#define cifsfyi(format,arg...) if (cifsFYI & CIFS_INFO) printk(KERN_DEBUG " " __FILE__ ": " format "\n" "" , ## arg) 48#define cifsfyi(fmt, arg...) \
49do { \
50 if (cifsFYI & CIFS_INFO) \
51 printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \
52} while (0)
50 53
51#define cFYI(button,prspec) if (button) cifsfyi prspec 54#define cFYI(set, fmt, arg...) \
55do { \
56 if (set) \
57 cifsfyi(fmt, ##arg); \
58} while (0)
52 59
53#define cifswarn(format, arg...) printk(KERN_WARNING ": " format "\n" , ## arg) 60#define cifswarn(fmt, arg...) \
61 printk(KERN_WARNING fmt "\n", ##arg)
54 62
55/* debug event message: */ 63/* debug event message: */
56extern int cifsERROR; 64extern int cifsERROR;
57 65
58#define cEVENT(format,arg...) if (cifsERROR) printk(KERN_EVENT __FILE__ ": " format "\n" , ## arg) 66#define cEVENT(fmt, arg...) \
67do { \
68 if (cifsERROR) \
69 printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \
70} while (0)
59 71
60/* error event message: e.g., i/o error */ 72/* error event message: e.g., i/o error */
61#define cifserror(format,arg...) if (cifsERROR) printk(KERN_ERR " CIFS VFS: " format "\n" "" , ## arg) 73#define cifserror(fmt, arg...) \
74do { \
75 if (cifsERROR) \
76 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \
77} while (0)
62 78
63#define cERROR(button, prspec) if (button) cifserror prspec 79#define cERROR(set, fmt, arg...) \
80do { \
81 if (set) \
82 cifserror(fmt, ##arg); \
83} while (0)
64 84
65/* 85/*
66 * debug OFF 86 * debug OFF
67 * --------- 87 * ---------
68 */ 88 */
69#else /* _CIFS_DEBUG */ 89#else /* _CIFS_DEBUG */
70#define cERROR(button, prspec) 90#define cERROR(set, fmt, arg...)
71#define cEVENT(format, arg...) 91#define cEVENT(fmt, arg...)
72#define cFYI(button, prspec) 92#define cFYI(set, fmt, arg...)
73#define cifserror(format, arg...) 93#define cifserror(fmt, arg...)
74#endif /* _CIFS_DEBUG */ 94#endif /* _CIFS_DEBUG */
75 95
76#endif /* _H_CIFS_DEBUG */ 96#endif /* _H_CIFS_DEBUG */
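
The do { } while (0) wrappers introduced in this hunk are the standard guard for multi-statement macros: without them, the macro body splits apart under an unbraced if/else. A small demonstration (DEMO_LOG is hypothetical, written in the same ##arg variadic style as the macros above):

#include <stdio.h>

/* safe: the whole body is one statement, so the else stays attached */
#define DEMO_LOG(fmt, arg...) \
do { \
        printf(fmt "\n", ##arg); \
        fflush(stdout); \
} while (0)

int main(void)
{
        int err = 1;

        if (err)
                DEMO_LOG("error %d", err);
        else
                printf("ok\n");
        /*
         * Had DEMO_LOG expanded to two bare statements, the fflush()
         * would fall outside the if and the else would fail to compile.
         */
        return 0;
}
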
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b44ce0a0711c..ac19a6f3dae0 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -15,6 +15,7 @@
15#include <linux/dcache.h> 15#include <linux/dcache.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/vfs.h> 19#include <linux/vfs.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include "cifsglob.h" 21#include "cifsglob.h"
@@ -54,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
54 * Extracts sharename form full UNC. 55 * Extracts sharename form full UNC.
55 * i.e. strips from UNC trailing path that is not part of share 56 * i.e. strips from UNC trailing path that is not part of share
56 * name and fixup missing '\' in the begining of DFS node refferal 57 * name and fixup missing '\' in the begining of DFS node refferal
57 * if neccessary. 58 * if necessary.
58 * Returns pointer to share name on success or ERR_PTR on error. 59 * Returns pointer to share name on success or ERR_PTR on error.
59 * Caller is responsible for freeing returned string. 60 * Caller is responsible for freeing returned string.
60 */ 61 */
@@ -84,8 +85,8 @@ static char *cifs_get_share_name(const char *node_name)
84 /* find server name end */ 85 /* find server name end */
85 pSep = memchr(UNC+2, '\\', len-2); 86 pSep = memchr(UNC+2, '\\', len-2);
86 if (!pSep) { 87 if (!pSep) {
87 cERROR(1, ("%s: no server name end in node name: %s", 88 cERROR(1, "%s: no server name end in node name: %s",
88 __func__, node_name)); 89 __func__, node_name);
89 kfree(UNC); 90 kfree(UNC);
90 return ERR_PTR(-EINVAL); 91 return ERR_PTR(-EINVAL);
91 } 92 }
@@ -141,8 +142,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
141 142
142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 143 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
143 if (rc != 0) { 144 if (rc != 0) {
144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", 145 cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
145 __func__, *devname, rc)); 146 __func__, *devname, rc);
146 goto compose_mount_options_err; 147 goto compose_mount_options_err;
147 } 148 }
148 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 149 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
@@ -216,8 +217,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
216 strcat(mountdata, fullpath + ref->path_consumed); 217 strcat(mountdata, fullpath + ref->path_consumed);
217 } 218 }
218 219
219 /*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/ 220 /*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/
220 /*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/ 221 /*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/
221 222
222compose_mount_options_out: 223compose_mount_options_out:
223 kfree(srvIP); 224 kfree(srvIP);
@@ -293,11 +294,11 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
293 294
294static void dump_referral(const struct dfs_info3_param *ref) 295static void dump_referral(const struct dfs_info3_param *ref)
295{ 296{
296 cFYI(1, ("DFS: ref path: %s", ref->path_name)); 297 cFYI(1, "DFS: ref path: %s", ref->path_name);
297 cFYI(1, ("DFS: node path: %s", ref->node_name)); 298 cFYI(1, "DFS: node path: %s", ref->node_name);
298 cFYI(1, ("DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type)); 299 cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type);
299 cFYI(1, ("DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag, 300 cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
300 ref->path_consumed)); 301 ref->path_consumed);
301} 302}
302 303
303 304
@@ -313,7 +314,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
313 int rc = 0; 314 int rc = 0;
314 struct vfsmount *mnt = ERR_PTR(-ENOENT); 315 struct vfsmount *mnt = ERR_PTR(-ENOENT);
315 316
316 cFYI(1, ("in %s", __func__)); 317 cFYI(1, "in %s", __func__);
317 BUG_ON(IS_ROOT(dentry)); 318 BUG_ON(IS_ROOT(dentry));
318 319
319 xid = GetXid(); 320 xid = GetXid();
@@ -351,15 +352,15 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
351 /* connect to a node */ 352 /* connect to a node */
352 len = strlen(referrals[i].node_name); 353 len = strlen(referrals[i].node_name);
353 if (len < 2) { 354 if (len < 2) {
354 cERROR(1, ("%s: Net Address path too short: %s", 355 cERROR(1, "%s: Net Address path too short: %s",
355 __func__, referrals[i].node_name)); 356 __func__, referrals[i].node_name);
356 rc = -EINVAL; 357 rc = -EINVAL;
357 goto out_err; 358 goto out_err;
358 } 359 }
359 mnt = cifs_dfs_do_refmount(nd->path.mnt, 360 mnt = cifs_dfs_do_refmount(nd->path.mnt,
360 nd->path.dentry, referrals + i); 361 nd->path.dentry, referrals + i);
361 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, 362 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
362 referrals[i].node_name, mnt)); 363 referrals[i].node_name, mnt);
363 364
364 /* complete mount procedure if we accured submount */ 365 /* complete mount procedure if we accured submount */
365 if (!IS_ERR(mnt)) 366 if (!IS_ERR(mnt))
@@ -377,7 +378,7 @@ out:
377 FreeXid(xid); 378 FreeXid(xid);
378 free_dfs_info_array(referrals, num_referrals); 379 free_dfs_info_array(referrals, num_referrals);
379 kfree(full_path); 380 kfree(full_path);
380 cFYI(1, ("leaving %s" , __func__)); 381 cFYI(1, "leaving %s" , __func__);
381 return ERR_PTR(rc); 382 return ERR_PTR(rc);
382out_err: 383out_err:
383 path_put(&nd->path); 384 path_put(&nd->path);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 4797787c6a44..246a167cb913 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -18,6 +18,8 @@
18#ifndef _CIFS_FS_SB_H 18#ifndef _CIFS_FS_SB_H
19#define _CIFS_FS_SB_H 19#define _CIFS_FS_SB_H
20 20
21#include <linux/backing-dev.h>
22
21#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */ 23#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */
22#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */ 24#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */
23#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ 25#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */
@@ -50,5 +52,6 @@ struct cifs_sb_info {
50#ifdef CONFIG_CIFS_DFS_UPCALL 52#ifdef CONFIG_CIFS_DFS_UPCALL
51 char *mountdata; /* mount options received at mount time */ 53 char *mountdata; /* mount options received at mount time */
52#endif 54#endif
55 struct backing_dev_info bdi;
53}; 56};
54#endif /* _CIFS_FS_SB_H */ 57#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 8ec7736ce954..379bd7d9c05f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h>
23#include <linux/string.h> 24#include <linux/string.h>
24#include <keys/user-type.h> 25#include <keys/user-type.h>
25#include <linux/key-type.h> 26#include <linux/key-type.h>
@@ -132,9 +133,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
132 dp = description + strlen(description); 133 dp = description + strlen(description);
133 134
134 /* for now, only sec=krb5 and sec=mskrb5 are valid */ 135 /* for now, only sec=krb5 and sec=mskrb5 are valid */
135 if (server->secType == Kerberos) 136 if (server->sec_kerberos)
136 sprintf(dp, ";sec=krb5"); 137 sprintf(dp, ";sec=krb5");
137 else if (server->secType == MSKerberos) 138 else if (server->sec_mskerberos)
138 sprintf(dp, ";sec=mskrb5"); 139 sprintf(dp, ";sec=mskrb5");
139 else 140 else
140 goto out; 141 goto out;
@@ -148,7 +149,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
148 dp = description + strlen(description); 149 dp = description + strlen(description);
149 sprintf(dp, ";pid=0x%x", current->pid); 150 sprintf(dp, ";pid=0x%x", current->pid);
150 151
151 cFYI(1, ("key description = %s", description)); 152 cFYI(1, "key description = %s", description);
152 spnego_key = request_key(&cifs_spnego_key_type, description, ""); 153 spnego_key = request_key(&cifs_spnego_key_type, description, "");
153 154
154#ifdef CONFIG_CIFS_DEBUG2 155#ifdef CONFIG_CIFS_DEBUG2
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 714a542cbafc..430f510a1720 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -19,6 +19,7 @@
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/slab.h>
22#include "cifs_unicode.h" 23#include "cifs_unicode.h"
23#include "cifs_uniupr.h" 24#include "cifs_uniupr.h"
24#include "cifspdu.h" 25#include "cifspdu.h"
@@ -199,9 +200,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
199 /* works for 2.4.0 kernel or later */ 200 /* works for 2.4.0 kernel or later */
200 charlen = codepage->char2uni(from, len, &wchar_to[i]); 201 charlen = codepage->char2uni(from, len, &wchar_to[i]);
201 if (charlen < 1) { 202 if (charlen < 1) {
202 cERROR(1, 203 cERROR(1, "strtoUCS: char2uni of %d returned %d",
203 ("strtoUCS: char2uni of %d returned %d", 204 (int)*from, charlen);
204 (int)*from, charlen));
205 /* A question mark */ 205 /* A question mark */
206 to[i] = cpu_to_le16(0x003f); 206 to[i] = cpu_to_le16(0x003f);
207 charlen = 1; 207 charlen = 1;
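The string-format changes in this file and the ones that follow are all the same mechanical conversion: cFYI()/cERROR() used to take their printf arguments as one extra-parenthesized group, and now take a real variadic list. The new definitions live in cifs_debug.h, which is outside this excerpt, so the exact prefixes below are an assumption; the shape is presumably something like:

	/* sketch: variadic form of the debug macros (cifs_debug.h hunk not
	 * shown here; format prefixes are guesses) */
	#define cFYI(set, fmt, ...)					\
	do {								\
		if ((set) && cifsFYI)					\
			printk(KERN_DEBUG "CIFS: " fmt "\n", ##__VA_ARGS__); \
	} while (0)

	#define cERROR(set, fmt, ...)					\
	do {								\
		if ((set) && cifsERROR)					\
			printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
	} while (0)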
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7dfe0842a6f6..85d7cf7ff2c8 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/slab.h>
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
27#include "cifsacl.h" 28#include "cifsacl.h"
@@ -86,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid)
86 continue; /* all sub_auth values do not match */ 87 continue; /* all sub_auth values do not match */
87 } 88 }
88 89
89 cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname)); 90 cFYI(1, "matching sid: %s\n", wksidarr[i].sidname);
90 return 0; /* sids compare/match */ 91 return 0; /* sids compare/match */
91 } 92 }
92 93
93 cFYI(1, ("No matching sid")); 94 cFYI(1, "No matching sid");
94 return -1; 95 return -1;
95} 96}
96 97
@@ -207,14 +208,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
207 *pbits_to_set &= ~S_IXUGO; 208 *pbits_to_set &= ~S_IXUGO;
208 return; 209 return;
209 } else if (type != ACCESS_ALLOWED) { 210 } else if (type != ACCESS_ALLOWED) {
210 cERROR(1, ("unknown access control type %d", type)); 211 cERROR(1, "unknown access control type %d", type);
211 return; 212 return;
212 } 213 }
213 /* else ACCESS_ALLOWED type */ 214 /* else ACCESS_ALLOWED type */
214 215
215 if (flags & GENERIC_ALL) { 216 if (flags & GENERIC_ALL) {
216 *pmode |= (S_IRWXUGO & (*pbits_to_set)); 217 *pmode |= (S_IRWXUGO & (*pbits_to_set));
217 cFYI(DBG2, ("all perms")); 218 cFYI(DBG2, "all perms");
218 return; 219 return;
219 } 220 }
220 if ((flags & GENERIC_WRITE) || 221 if ((flags & GENERIC_WRITE) ||
@@ -227,7 +228,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
227 ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) 228 ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
228 *pmode |= (S_IXUGO & (*pbits_to_set)); 229 *pmode |= (S_IXUGO & (*pbits_to_set));
229 230
230 cFYI(DBG2, ("access flags 0x%x mode now 0x%x", flags, *pmode)); 231 cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode);
231 return; 232 return;
232} 233}
233 234
@@ -256,7 +257,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
256 if (mode & S_IXUGO) 257 if (mode & S_IXUGO)
257 *pace_flags |= SET_FILE_EXEC_RIGHTS; 258 *pace_flags |= SET_FILE_EXEC_RIGHTS;
258 259
259 cFYI(DBG2, ("mode: 0x%x, access flags now 0x%x", mode, *pace_flags)); 260 cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags);
260 return; 261 return;
261} 262}
262 263
@@ -296,24 +297,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
296 /* validate that we do not go past end of acl */ 297 /* validate that we do not go past end of acl */
297 298
298 if (le16_to_cpu(pace->size) < 16) { 299 if (le16_to_cpu(pace->size) < 16) {
299 cERROR(1, ("ACE too small, %d", le16_to_cpu(pace->size))); 300 cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
300 return; 301 return;
301 } 302 }
302 303
303 if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) { 304 if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
304 cERROR(1, ("ACL too small to parse ACE")); 305 cERROR(1, "ACL too small to parse ACE");
305 return; 306 return;
306 } 307 }
307 308
308 num_subauth = pace->sid.num_subauth; 309 num_subauth = pace->sid.num_subauth;
309 if (num_subauth) { 310 if (num_subauth) {
310 int i; 311 int i;
311 cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d", 312 cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d",
312 pace->sid.revision, pace->sid.num_subauth, pace->type, 313 pace->sid.revision, pace->sid.num_subauth, pace->type,
313 pace->flags, le16_to_cpu(pace->size))); 314 pace->flags, le16_to_cpu(pace->size));
314 for (i = 0; i < num_subauth; ++i) { 315 for (i = 0; i < num_subauth; ++i) {
315 cFYI(1, ("ACE sub_auth[%d]: 0x%x", i, 316 cFYI(1, "ACE sub_auth[%d]: 0x%x", i,
316 le32_to_cpu(pace->sid.sub_auth[i]))); 317 le32_to_cpu(pace->sid.sub_auth[i]));
317 } 318 }
318 319
319 /* BB add length check to make sure that we do not have huge 320 /* BB add length check to make sure that we do not have huge
@@ -346,13 +347,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
346 347
347 /* validate that we do not go past end of acl */ 348 /* validate that we do not go past end of acl */
348 if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { 349 if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
349 cERROR(1, ("ACL too small to parse DACL")); 350 cERROR(1, "ACL too small to parse DACL");
350 return; 351 return;
351 } 352 }
352 353
353 cFYI(DBG2, ("DACL revision %d size %d num aces %d", 354 cFYI(DBG2, "DACL revision %d size %d num aces %d",
354 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), 355 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
355 le32_to_cpu(pdacl->num_aces))); 356 le32_to_cpu(pdacl->num_aces));
356 357
357 /* reset rwx permissions for user/group/other. 358 /* reset rwx permissions for user/group/other.
358 Also, if num_aces is 0 i.e. DACL has no ACEs, 359 Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -436,25 +437,25 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
436 /* validate that we do not go past end of ACL - sid must be at least 8 437 /* validate that we do not go past end of ACL - sid must be at least 8
437 bytes long (assuming no sub-auths - e.g. the null SID */ 438 bytes long (assuming no sub-auths - e.g. the null SID */
438 if (end_of_acl < (char *)psid + 8) { 439 if (end_of_acl < (char *)psid + 8) {
439 cERROR(1, ("ACL too small to parse SID %p", psid)); 440 cERROR(1, "ACL too small to parse SID %p", psid);
440 return -EINVAL; 441 return -EINVAL;
441 } 442 }
442 443
443 if (psid->num_subauth) { 444 if (psid->num_subauth) {
444#ifdef CONFIG_CIFS_DEBUG2 445#ifdef CONFIG_CIFS_DEBUG2
445 int i; 446 int i;
446 cFYI(1, ("SID revision %d num_auth %d", 447 cFYI(1, "SID revision %d num_auth %d",
447 psid->revision, psid->num_subauth)); 448 psid->revision, psid->num_subauth);
448 449
449 for (i = 0; i < psid->num_subauth; i++) { 450 for (i = 0; i < psid->num_subauth; i++) {
450 cFYI(1, ("SID sub_auth[%d]: 0x%x ", i, 451 cFYI(1, "SID sub_auth[%d]: 0x%x ", i,
451 le32_to_cpu(psid->sub_auth[i]))); 452 le32_to_cpu(psid->sub_auth[i]));
452 } 453 }
453 454
454 /* BB add length check to make sure that we do not have huge 455 /* BB add length check to make sure that we do not have huge
455 num auths and therefore go off the end */ 456 num auths and therefore go off the end */
456 cFYI(1, ("RID 0x%x", 457 cFYI(1, "RID 0x%x",
457 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]))); 458 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
458#endif 459#endif
459 } 460 }
460 461
@@ -481,11 +482,11 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
481 le32_to_cpu(pntsd->gsidoffset)); 482 le32_to_cpu(pntsd->gsidoffset));
482 dacloffset = le32_to_cpu(pntsd->dacloffset); 483 dacloffset = le32_to_cpu(pntsd->dacloffset);
483 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); 484 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
484 cFYI(DBG2, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x " 485 cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x "
485 "sacloffset 0x%x dacloffset 0x%x", 486 "sacloffset 0x%x dacloffset 0x%x",
486 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), 487 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
487 le32_to_cpu(pntsd->gsidoffset), 488 le32_to_cpu(pntsd->gsidoffset),
488 le32_to_cpu(pntsd->sacloffset), dacloffset)); 489 le32_to_cpu(pntsd->sacloffset), dacloffset);
489/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ 490/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
490 rc = parse_sid(owner_sid_ptr, end_of_acl); 491 rc = parse_sid(owner_sid_ptr, end_of_acl);
491 if (rc) 492 if (rc)
@@ -499,7 +500,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
499 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, 500 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
500 group_sid_ptr, fattr); 501 group_sid_ptr, fattr);
501 else 502 else
502 cFYI(1, ("no ACL")); /* BB grant all or default perms? */ 503 cFYI(1, "no ACL"); /* BB grant all or default perms? */
503 504
504/* cifscred->uid = owner_sid_ptr->rid; 505/* cifscred->uid = owner_sid_ptr->rid;
505 cifscred->gid = group_sid_ptr->rid; 506 cifscred->gid = group_sid_ptr->rid;
@@ -562,7 +563,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
562 FreeXid(xid); 563 FreeXid(xid);
563 564
564 565
565 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 566 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
566 return pntsd; 567 return pntsd;
567} 568}
568 569
@@ -580,12 +581,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
580 &fid, &oplock, NULL, cifs_sb->local_nls, 581 &fid, &oplock, NULL, cifs_sb->local_nls,
581 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
582 if (rc) { 583 if (rc) {
583 cERROR(1, ("Unable to open file to get ACL")); 584 cERROR(1, "Unable to open file to get ACL");
584 goto out; 585 goto out;
585 } 586 }
586 587
587 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
588 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 589 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
589 590
590 CIFSSMBClose(xid, cifs_sb->tcon, fid); 591 CIFSSMBClose(xid, cifs_sb->tcon, fid);
591 out: 592 out:
@@ -620,7 +621,7 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
620 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
621 FreeXid(xid); 622 FreeXid(xid);
622 623
623 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 624 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
624 return rc; 625 return rc;
625} 626}
626 627
@@ -637,12 +638,12 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
637 &fid, &oplock, NULL, cifs_sb->local_nls, 638 &fid, &oplock, NULL, cifs_sb->local_nls,
638 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
639 if (rc) { 640 if (rc) {
640 cERROR(1, ("Unable to open file to set ACL")); 641 cERROR(1, "Unable to open file to set ACL");
641 goto out; 642 goto out;
642 } 643 }
643 644
644 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
645 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 646 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
646 647
647 CIFSSMBClose(xid, cifs_sb->tcon, fid); 648 CIFSSMBClose(xid, cifs_sb->tcon, fid);
648 out: 649 out:
@@ -658,7 +659,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
658 struct cifsFileInfo *open_file; 659 struct cifsFileInfo *open_file;
659 int rc; 660 int rc;
660 661
661 cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); 662 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
662 663
663 open_file = find_readable_file(CIFS_I(inode)); 664 open_file = find_readable_file(CIFS_I(inode));
664 if (!open_file) 665 if (!open_file)
@@ -678,7 +679,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
678 u32 acllen = 0; 679 u32 acllen = 0;
679 int rc = 0; 680 int rc = 0;
680 681
681 cFYI(DBG2, ("converting ACL to mode for %s", path)); 682 cFYI(DBG2, "converting ACL to mode for %s", path);
682 683
683 if (pfid) 684 if (pfid)
684 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); 685 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
@@ -689,7 +690,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
689 if (pntsd) 690 if (pntsd)
690 rc = parse_sec_desc(pntsd, acllen, fattr); 691 rc = parse_sec_desc(pntsd, acllen, fattr);
691 if (rc) 692 if (rc)
692 cFYI(1, ("parse sec desc failed rc = %d", rc)); 693 cFYI(1, "parse sec desc failed rc = %d", rc);
693 694
694 kfree(pntsd); 695 kfree(pntsd);
695 return; 696 return;
@@ -703,7 +704,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
703 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ 704 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
704 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ 705 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
705 706
706 cFYI(DBG2, ("set ACL from mode for %s", path)); 707 cFYI(DBG2, "set ACL from mode for %s", path);
707 708
708 /* Get the security descriptor */ 709 /* Get the security descriptor */
709 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); 710 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
@@ -720,19 +721,19 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
720 DEFSECDESCLEN : secdesclen; 721 DEFSECDESCLEN : secdesclen;
721 pnntsd = kmalloc(secdesclen, GFP_KERNEL); 722 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
722 if (!pnntsd) { 723 if (!pnntsd) {
723 cERROR(1, ("Unable to allocate security descriptor")); 724 cERROR(1, "Unable to allocate security descriptor");
724 kfree(pntsd); 725 kfree(pntsd);
725 return -ENOMEM; 726 return -ENOMEM;
726 } 727 }
727 728
728 rc = build_sec_desc(pntsd, pnntsd, inode, nmode); 729 rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
729 730
730 cFYI(DBG2, ("build_sec_desc rc: %d", rc)); 731 cFYI(DBG2, "build_sec_desc rc: %d", rc);
731 732
732 if (!rc) { 733 if (!rc) {
733 /* Set the security descriptor */ 734 /* Set the security descriptor */
734 rc = set_cifs_acl(pnntsd, secdesclen, inode, path); 735 rc = set_cifs_acl(pnntsd, secdesclen, inode, path);
735 cFYI(DBG2, ("set_cifs_acl rc: %d", rc)); 736 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
736 } 737 }
737 738
738 kfree(pnntsd); 739 kfree(pnntsd);
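Beyond the message-format conversion, this file is a good illustration of the defensive pattern used when walking a server-supplied security descriptor: every wire-supplied size field is checked against end_of_acl before anything is dereferenced. Condensed from dump_ace() above:

	/* pattern as in dump_ace()/parse_dacl()/parse_sid(): never walk
	 * past the buffer on the strength of a size field from the wire */
	if (le16_to_cpu(pace->size) < 16) {
		cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
		return;
	}
	if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
		cERROR(1, "ACL too small to parse ACE");
		return;
	}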
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7efe1745494d..847628dfdc44 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/slab.h>
23#include "cifspdu.h" 24#include "cifspdu.h"
24#include "cifsglob.h" 25#include "cifsglob.h"
25#include "cifs_debug.h" 26#include "cifs_debug.h"
@@ -102,7 +103,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
102 if (iov[i].iov_len == 0) 103 if (iov[i].iov_len == 0)
103 continue; 104 continue;
104 if (iov[i].iov_base == NULL) { 105 if (iov[i].iov_base == NULL) {
105 cERROR(1, ("null iovec entry")); 106 cERROR(1, "null iovec entry");
106 return -EIO; 107 return -EIO;
107 } 108 }
108 /* The first entry includes a length field (which does not get 109 /* The first entry includes a length field (which does not get
@@ -180,8 +181,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
180 181
181 /* Do not need to verify session setups with signature "BSRSPYL " */ 182 /* Do not need to verify session setups with signature "BSRSPYL " */
182 if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0) 183 if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0)
183 cFYI(1, ("dummy signature received for smb command 0x%x", 184 cFYI(1, "dummy signature received for smb command 0x%x",
184 cifs_pdu->Command)); 185 cifs_pdu->Command);
185 186
186 /* save off the original signature so we can modify the smb and check 187 /* save off the original signature so we can modify the smb and check
187 its signature against what the server sent */ 188 its signature against what the server sent */
@@ -290,7 +291,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
290 if (password) 291 if (password)
291 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); 292 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
292 293
293 if (!encrypt && extended_security & CIFSSEC_MAY_PLNTXT) { 294 if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
294 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); 295 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
295 memcpy(lnm_session_key, password_with_pad, 296 memcpy(lnm_session_key, password_with_pad,
296 CIFS_ENCPWD_SIZE); 297 CIFS_ENCPWD_SIZE);
@@ -397,7 +398,7 @@ void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
397 /* calculate buf->ntlmv2_hash */ 398 /* calculate buf->ntlmv2_hash */
398 rc = calc_ntlmv2_hash(ses, nls_cp); 399 rc = calc_ntlmv2_hash(ses, nls_cp);
399 if (rc) 400 if (rc)
400 cERROR(1, ("could not get v2 hash rc %d", rc)); 401 cERROR(1, "could not get v2 hash rc %d", rc);
401 CalcNTLMv2_response(ses, resp_buf); 402 CalcNTLMv2_response(ses, resp_buf);
402 403
403 /* now calculate the MAC key for NTLMv2 */ 404 /* now calculate the MAC key for NTLMv2 */
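The extended_security global is renamed global_secflags here and in cifsfs.c/cifsglob.h below; the new name matches what the variable actually holds, the full CIFSSEC_* policy bitmask rather than a single extended-security toggle. Usage stays bit-test style, for example this hypothetical condensation of the calc_lanman_hash() gate above:

	/* sketch: plaintext LM session keys only when policy allows */
	static bool lanman_plaintext_allowed(void)
	{
		return (global_secflags & CIFSSEC_MAY_PLNTXT) != 0;
	}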
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c6a03627176..78c02eb4cb1f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -49,10 +49,6 @@
49#include "cifs_spnego.h" 49#include "cifs_spnego.h"
50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ 50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
51 51
52#ifdef CONFIG_CIFS_QUOTA
53static const struct quotactl_ops cifs_quotactl_ops;
54#endif /* QUOTA */
55
56int cifsFYI = 0; 52int cifsFYI = 0;
57int cifsERROR = 1; 53int cifsERROR = 1;
58int traceSMB = 0; 54int traceSMB = 0;
@@ -61,7 +57,7 @@ unsigned int experimEnabled = 0;
61unsigned int linuxExtEnabled = 1; 57unsigned int linuxExtEnabled = 1;
62unsigned int lookupCacheEnabled = 1; 58unsigned int lookupCacheEnabled = 1;
63unsigned int multiuser_mount = 0; 59unsigned int multiuser_mount = 0;
64unsigned int extended_security = CIFSSEC_DEF; 60unsigned int global_secflags = CIFSSEC_DEF;
65/* unsigned int ntlmv2_support = 0; */ 61/* unsigned int ntlmv2_support = 0; */
66unsigned int sign_CIFS_PDUs = 1; 62unsigned int sign_CIFS_PDUs = 1;
67static const struct super_operations cifs_super_ops; 63static const struct super_operations cifs_super_ops;
@@ -86,8 +82,6 @@ extern mempool_t *cifs_sm_req_poolp;
86extern mempool_t *cifs_req_poolp; 82extern mempool_t *cifs_req_poolp;
87extern mempool_t *cifs_mid_poolp; 83extern mempool_t *cifs_mid_poolp;
88 84
89extern struct kmem_cache *cifs_oplock_cachep;
90
91static int 85static int
92cifs_read_super(struct super_block *sb, void *data, 86cifs_read_super(struct super_block *sb, void *data,
93 const char *devname, int silent) 87 const char *devname, int silent)
@@ -103,6 +97,12 @@ cifs_read_super(struct super_block *sb, void *data,
103 if (cifs_sb == NULL) 97 if (cifs_sb == NULL)
104 return -ENOMEM; 98 return -ENOMEM;
105 99
100 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
101 if (rc) {
102 kfree(cifs_sb);
103 return rc;
104 }
105
106#ifdef CONFIG_CIFS_DFS_UPCALL 106#ifdef CONFIG_CIFS_DFS_UPCALL
107 /* copy mount params to sb for use in submounts */ 107 /* copy mount params to sb for use in submounts */
108 /* BB: should we move this after the mount so we 108 /* BB: should we move this after the mount so we
@@ -115,6 +115,7 @@ cifs_read_super(struct super_block *sb, void *data,
115 int len = strlen(data); 115 int len = strlen(data);
116 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); 116 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
117 if (cifs_sb->mountdata == NULL) { 117 if (cifs_sb->mountdata == NULL) {
118 bdi_destroy(&cifs_sb->bdi);
118 kfree(sb->s_fs_info); 119 kfree(sb->s_fs_info);
119 sb->s_fs_info = NULL; 120 sb->s_fs_info = NULL;
120 return -ENOMEM; 121 return -ENOMEM;
@@ -128,19 +129,16 @@ cifs_read_super(struct super_block *sb, void *data,
128 129
129 if (rc) { 130 if (rc) {
130 if (!silent) 131 if (!silent)
131 cERROR(1, 132 cERROR(1, "cifs_mount failed w/return code = %d", rc);
132 ("cifs_mount failed w/return code = %d", rc));
133 goto out_mount_failed; 133 goto out_mount_failed;
134 } 134 }
135 135
136 sb->s_magic = CIFS_MAGIC_NUMBER; 136 sb->s_magic = CIFS_MAGIC_NUMBER;
137 sb->s_op = &cifs_super_ops; 137 sb->s_op = &cifs_super_ops;
138 sb->s_bdi = &cifs_sb->bdi;
138/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512) 139/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
139 sb->s_blocksize = 140 sb->s_blocksize =
140 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */ 141 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
141#ifdef CONFIG_CIFS_QUOTA
142 sb->s_qcop = &cifs_quotactl_ops;
143#endif
144 sb->s_blocksize = CIFS_MAX_MSGSIZE; 142 sb->s_blocksize = CIFS_MAX_MSGSIZE;
145 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 143 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
146 inode = cifs_root_iget(sb, ROOT_I); 144 inode = cifs_root_iget(sb, ROOT_I);
@@ -160,7 +158,7 @@ cifs_read_super(struct super_block *sb, void *data,
160 158
161#ifdef CONFIG_CIFS_EXPERIMENTAL 159#ifdef CONFIG_CIFS_EXPERIMENTAL
162 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 160 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
163 cFYI(1, ("export ops supported")); 161 cFYI(1, "export ops supported");
164 sb->s_export_op = &cifs_export_ops; 162 sb->s_export_op = &cifs_export_ops;
165 } 163 }
166#endif /* EXPERIMENTAL */ 164#endif /* EXPERIMENTAL */
@@ -168,7 +166,7 @@ cifs_read_super(struct super_block *sb, void *data,
168 return 0; 166 return 0;
169 167
170out_no_root: 168out_no_root:
171 cERROR(1, ("cifs_read_super: get root inode failed")); 169 cERROR(1, "cifs_read_super: get root inode failed");
172 if (inode) 170 if (inode)
173 iput(inode); 171 iput(inode);
174 172
@@ -183,6 +181,7 @@ out_mount_failed:
183 } 181 }
184#endif 182#endif
185 unload_nls(cifs_sb->local_nls); 183 unload_nls(cifs_sb->local_nls);
184 bdi_destroy(&cifs_sb->bdi);
186 kfree(cifs_sb); 185 kfree(cifs_sb);
187 } 186 }
188 return rc; 187 return rc;
@@ -194,10 +193,10 @@ cifs_put_super(struct super_block *sb)
194 int rc = 0; 193 int rc = 0;
195 struct cifs_sb_info *cifs_sb; 194 struct cifs_sb_info *cifs_sb;
196 195
197 cFYI(1, ("In cifs_put_super")); 196 cFYI(1, "In cifs_put_super");
198 cifs_sb = CIFS_SB(sb); 197 cifs_sb = CIFS_SB(sb);
199 if (cifs_sb == NULL) { 198 if (cifs_sb == NULL) {
200 cFYI(1, ("Empty cifs superblock info passed to unmount")); 199 cFYI(1, "Empty cifs superblock info passed to unmount");
201 return; 200 return;
202 } 201 }
203 202
@@ -205,7 +204,7 @@ cifs_put_super(struct super_block *sb)
205 204
206 rc = cifs_umount(sb, cifs_sb); 205 rc = cifs_umount(sb, cifs_sb);
207 if (rc) 206 if (rc)
208 cERROR(1, ("cifs_umount failed with return code %d", rc)); 207 cERROR(1, "cifs_umount failed with return code %d", rc);
209#ifdef CONFIG_CIFS_DFS_UPCALL 208#ifdef CONFIG_CIFS_DFS_UPCALL
210 if (cifs_sb->mountdata) { 209 if (cifs_sb->mountdata) {
211 kfree(cifs_sb->mountdata); 210 kfree(cifs_sb->mountdata);
@@ -214,6 +213,7 @@ cifs_put_super(struct super_block *sb)
214#endif 213#endif
215 214
216 unload_nls(cifs_sb->local_nls); 215 unload_nls(cifs_sb->local_nls);
216 bdi_destroy(&cifs_sb->bdi);
217 kfree(cifs_sb); 217 kfree(cifs_sb);
218 218
219 unlock_kernel(); 219 unlock_kernel();
@@ -290,7 +290,6 @@ static int cifs_permission(struct inode *inode, int mask)
290static struct kmem_cache *cifs_inode_cachep; 290static struct kmem_cache *cifs_inode_cachep;
291static struct kmem_cache *cifs_req_cachep; 291static struct kmem_cache *cifs_req_cachep;
292static struct kmem_cache *cifs_mid_cachep; 292static struct kmem_cache *cifs_mid_cachep;
293struct kmem_cache *cifs_oplock_cachep;
294static struct kmem_cache *cifs_sm_req_cachep; 293static struct kmem_cache *cifs_sm_req_cachep;
295mempool_t *cifs_sm_req_poolp; 294mempool_t *cifs_sm_req_poolp;
296mempool_t *cifs_req_poolp; 295mempool_t *cifs_req_poolp;
@@ -312,6 +311,7 @@ cifs_alloc_inode(struct super_block *sb)
312 cifs_inode->clientCanCacheRead = false; 311 cifs_inode->clientCanCacheRead = false;
313 cifs_inode->clientCanCacheAll = false; 312 cifs_inode->clientCanCacheAll = false;
314 cifs_inode->delete_pending = false; 313 cifs_inode->delete_pending = false;
314 cifs_inode->invalid_mapping = false;
315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
316 cifs_inode->server_eof = 0; 316 cifs_inode->server_eof = 0;
317 317
@@ -421,106 +421,6 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
421 return 0; 421 return 0;
422} 422}
423 423
424#ifdef CONFIG_CIFS_QUOTA
425int cifs_xquota_set(struct super_block *sb, int quota_type, qid_t qid,
426 struct fs_disk_quota *pdquota)
427{
428 int xid;
429 int rc = 0;
430 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
431 struct cifsTconInfo *pTcon;
432
433 if (cifs_sb)
434 pTcon = cifs_sb->tcon;
435 else
436 return -EIO;
437
438
439 xid = GetXid();
440 if (pTcon) {
441 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
442 } else
443 rc = -EIO;
444
445 FreeXid(xid);
446 return rc;
447}
448
449int cifs_xquota_get(struct super_block *sb, int quota_type, qid_t qid,
450 struct fs_disk_quota *pdquota)
451{
452 int xid;
453 int rc = 0;
454 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
455 struct cifsTconInfo *pTcon;
456
457 if (cifs_sb)
458 pTcon = cifs_sb->tcon;
459 else
460 return -EIO;
461
462 xid = GetXid();
463 if (pTcon) {
464 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
465 } else
466 rc = -EIO;
467
468 FreeXid(xid);
469 return rc;
470}
471
472int cifs_xstate_set(struct super_block *sb, unsigned int flags, int operation)
473{
474 int xid;
475 int rc = 0;
476 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
477 struct cifsTconInfo *pTcon;
478
479 if (cifs_sb)
480 pTcon = cifs_sb->tcon;
481 else
482 return -EIO;
483
484 xid = GetXid();
485 if (pTcon) {
486 cFYI(1, ("flags: 0x%x operation: 0x%x", flags, operation));
487 } else
488 rc = -EIO;
489
490 FreeXid(xid);
491 return rc;
492}
493
494int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
495{
496 int xid;
497 int rc = 0;
498 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
499 struct cifsTconInfo *pTcon;
500
501 if (cifs_sb)
502 pTcon = cifs_sb->tcon;
503 else
504 return -EIO;
505
506 xid = GetXid();
507 if (pTcon) {
508 cFYI(1, ("pqstats %p", qstats));
509 } else
510 rc = -EIO;
511
512 FreeXid(xid);
513 return rc;
514}
515
516static const struct quotactl_ops cifs_quotactl_ops = {
517 .set_xquota = cifs_xquota_set,
518 .get_xquota = cifs_xquota_get,
519 .set_xstate = cifs_xstate_set,
520 .get_xstate = cifs_xstate_get,
521};
522#endif
523
524static void cifs_umount_begin(struct super_block *sb) 424static void cifs_umount_begin(struct super_block *sb)
525{ 425{
526 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 426 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -547,7 +447,7 @@ static void cifs_umount_begin(struct super_block *sb)
547 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 447 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
548 /* cancel_notify_requests(tcon); */ 448 /* cancel_notify_requests(tcon); */
549 if (tcon->ses && tcon->ses->server) { 449 if (tcon->ses && tcon->ses->server) {
550 cFYI(1, ("wake up tasks now - umount begin not complete")); 450 cFYI(1, "wake up tasks now - umount begin not complete");
551 wake_up_all(&tcon->ses->server->request_q); 451 wake_up_all(&tcon->ses->server->request_q);
552 wake_up_all(&tcon->ses->server->response_q); 452 wake_up_all(&tcon->ses->server->response_q);
553 msleep(1); /* yield */ 453 msleep(1); /* yield */
@@ -598,7 +498,7 @@ cifs_get_sb(struct file_system_type *fs_type,
598 int rc; 498 int rc;
599 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL); 499 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
600 500
601 cFYI(1, ("Devname: %s flags: %d ", dev_name, flags)); 501 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
602 502
603 if (IS_ERR(sb)) 503 if (IS_ERR(sb))
604 return PTR_ERR(sb); 504 return PTR_ERR(sb);
@@ -638,14 +538,13 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
638 setting the revalidate time to zero */ 538 setting the revalidate time to zero */
639 CIFS_I(file->f_path.dentry->d_inode)->time = 0; 539 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
640 540
641 retval = cifs_revalidate(file->f_path.dentry); 541 retval = cifs_revalidate_file(file);
642 if (retval < 0) 542 if (retval < 0)
643 return (loff_t)retval; 543 return (loff_t)retval;
644 } 544 }
645 return generic_file_llseek_unlocked(file, offset, origin); 545 return generic_file_llseek_unlocked(file, offset, origin);
646} 546}
647 547
648#ifdef CONFIG_CIFS_EXPERIMENTAL
649static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 548static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
650{ 549{
651 /* note that this is called by vfs setlease with the BKL held 550 /* note that this is called by vfs setlease with the BKL held
@@ -674,7 +573,6 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
674 else 573 else
675 return -EAGAIN; 574 return -EAGAIN;
676} 575}
677#endif
678 576
679struct file_system_type cifs_fs_type = { 577struct file_system_type cifs_fs_type = {
680 .owner = THIS_MODULE, 578 .owner = THIS_MODULE,
@@ -751,10 +649,7 @@ const struct file_operations cifs_file_ops = {
751#ifdef CONFIG_CIFS_POSIX 649#ifdef CONFIG_CIFS_POSIX
752 .unlocked_ioctl = cifs_ioctl, 650 .unlocked_ioctl = cifs_ioctl,
753#endif /* CONFIG_CIFS_POSIX */ 651#endif /* CONFIG_CIFS_POSIX */
754
755#ifdef CONFIG_CIFS_EXPERIMENTAL
756 .setlease = cifs_setlease, 652 .setlease = cifs_setlease,
757#endif /* CONFIG_CIFS_EXPERIMENTAL */
758}; 653};
759 654
760const struct file_operations cifs_file_direct_ops = { 655const struct file_operations cifs_file_direct_ops = {
@@ -773,9 +668,7 @@ const struct file_operations cifs_file_direct_ops = {
773 .unlocked_ioctl = cifs_ioctl, 668 .unlocked_ioctl = cifs_ioctl,
774#endif /* CONFIG_CIFS_POSIX */ 669#endif /* CONFIG_CIFS_POSIX */
775 .llseek = cifs_llseek, 670 .llseek = cifs_llseek,
776#ifdef CONFIG_CIFS_EXPERIMENTAL
777 .setlease = cifs_setlease, 671 .setlease = cifs_setlease,
778#endif /* CONFIG_CIFS_EXPERIMENTAL */
779}; 672};
780const struct file_operations cifs_file_nobrl_ops = { 673const struct file_operations cifs_file_nobrl_ops = {
781 .read = do_sync_read, 674 .read = do_sync_read,
@@ -792,10 +685,7 @@ const struct file_operations cifs_file_nobrl_ops = {
792#ifdef CONFIG_CIFS_POSIX 685#ifdef CONFIG_CIFS_POSIX
793 .unlocked_ioctl = cifs_ioctl, 686 .unlocked_ioctl = cifs_ioctl,
794#endif /* CONFIG_CIFS_POSIX */ 687#endif /* CONFIG_CIFS_POSIX */
795
796#ifdef CONFIG_CIFS_EXPERIMENTAL
797 .setlease = cifs_setlease, 688 .setlease = cifs_setlease,
798#endif /* CONFIG_CIFS_EXPERIMENTAL */
799}; 689};
800 690
801const struct file_operations cifs_file_direct_nobrl_ops = { 691const struct file_operations cifs_file_direct_nobrl_ops = {
@@ -807,14 +697,13 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
807 .release = cifs_close, 697 .release = cifs_close,
808 .fsync = cifs_fsync, 698 .fsync = cifs_fsync,
809 .flush = cifs_flush, 699 .flush = cifs_flush,
700 .mmap = cifs_file_mmap,
810 .splice_read = generic_file_splice_read, 701 .splice_read = generic_file_splice_read,
811#ifdef CONFIG_CIFS_POSIX 702#ifdef CONFIG_CIFS_POSIX
812 .unlocked_ioctl = cifs_ioctl, 703 .unlocked_ioctl = cifs_ioctl,
813#endif /* CONFIG_CIFS_POSIX */ 704#endif /* CONFIG_CIFS_POSIX */
814 .llseek = cifs_llseek, 705 .llseek = cifs_llseek,
815#ifdef CONFIG_CIFS_EXPERIMENTAL
816 .setlease = cifs_setlease, 706 .setlease = cifs_setlease,
817#endif /* CONFIG_CIFS_EXPERIMENTAL */
818}; 707};
819 708
820const struct file_operations cifs_dir_ops = { 709const struct file_operations cifs_dir_ops = {
@@ -866,7 +755,7 @@ cifs_init_request_bufs(void)
866 } else { 755 } else {
867 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ 756 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
868 } 757 }
869/* cERROR(1,("CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize)); */ 758/* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
870 cifs_req_cachep = kmem_cache_create("cifs_request", 759 cifs_req_cachep = kmem_cache_create("cifs_request",
871 CIFSMaxBufSize + 760 CIFSMaxBufSize +
872 MAX_CIFS_HDR_SIZE, 0, 761 MAX_CIFS_HDR_SIZE, 0,
@@ -878,7 +767,7 @@ cifs_init_request_bufs(void)
878 cifs_min_rcv = 1; 767 cifs_min_rcv = 1;
879 else if (cifs_min_rcv > 64) { 768 else if (cifs_min_rcv > 64) {
880 cifs_min_rcv = 64; 769 cifs_min_rcv = 64;
881 cERROR(1, ("cifs_min_rcv set to maximum (64)")); 770 cERROR(1, "cifs_min_rcv set to maximum (64)");
882 } 771 }
883 772
884 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, 773 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
@@ -909,7 +798,7 @@ cifs_init_request_bufs(void)
909 cifs_min_small = 2; 798 cifs_min_small = 2;
910 else if (cifs_min_small > 256) { 799 else if (cifs_min_small > 256) {
911 cifs_min_small = 256; 800 cifs_min_small = 256;
912 cFYI(1, ("cifs_min_small set to maximum (256)")); 801 cFYI(1, "cifs_min_small set to maximum (256)");
913 } 802 }
914 803
915 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, 804 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
@@ -950,15 +839,6 @@ cifs_init_mids(void)
950 return -ENOMEM; 839 return -ENOMEM;
951 } 840 }
952 841
953 cifs_oplock_cachep = kmem_cache_create("cifs_oplock_structs",
954 sizeof(struct oplock_q_entry), 0,
955 SLAB_HWCACHE_ALIGN, NULL);
956 if (cifs_oplock_cachep == NULL) {
957 mempool_destroy(cifs_mid_poolp);
958 kmem_cache_destroy(cifs_mid_cachep);
959 return -ENOMEM;
960 }
961
962 return 0; 842 return 0;
963} 843}
964 844
@@ -967,7 +847,6 @@ cifs_destroy_mids(void)
967{ 847{
968 mempool_destroy(cifs_mid_poolp); 848 mempool_destroy(cifs_mid_poolp);
969 kmem_cache_destroy(cifs_mid_cachep); 849 kmem_cache_destroy(cifs_mid_cachep);
970 kmem_cache_destroy(cifs_oplock_cachep);
971} 850}
972 851
973static int __init 852static int __init
@@ -1007,10 +886,10 @@ init_cifs(void)
1007 886
1008 if (cifs_max_pending < 2) { 887 if (cifs_max_pending < 2) {
1009 cifs_max_pending = 2; 888 cifs_max_pending = 2;
1010 cFYI(1, ("cifs_max_pending set to min of 2")); 889 cFYI(1, "cifs_max_pending set to min of 2");
1011 } else if (cifs_max_pending > 256) { 890 } else if (cifs_max_pending > 256) {
1012 cifs_max_pending = 256; 891 cifs_max_pending = 256;
1013 cFYI(1, ("cifs_max_pending set to max of 256")); 892 cFYI(1, "cifs_max_pending set to max of 256");
1014 } 893 }
1015 894
1016 rc = cifs_init_inodecache(); 895 rc = cifs_init_inodecache();
@@ -1068,7 +947,7 @@ init_cifs(void)
1068static void __exit 947static void __exit
1069exit_cifs(void) 948exit_cifs(void)
1070{ 949{
1071 cFYI(DBG2, ("exit_cifs")); 950 cFYI(DBG2, "exit_cifs");
1072 cifs_proc_clean(); 951 cifs_proc_clean();
1073#ifdef CONFIG_CIFS_DFS_UPCALL 952#ifdef CONFIG_CIFS_DFS_UPCALL
1074 cifs_dfs_release_automount_timer(); 953 cifs_dfs_release_automount_timer();
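The fiddly part of the cifsfs.c changes is that bdi_setup_and_register() creates a resource which three separate paths must now release: the immediate failure branch, the out_mount_failed unwind, and cifs_put_super(). Condensed, with names taken from the hunks above and teardown ordered relative to unload_nls() as in the patch:

	/* condensed sketch of the BDI pairing added to cifsfs.c above */
	rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
	if (rc) {
		kfree(cifs_sb);		/* nothing else allocated yet */
		return rc;
	}
	/* ... mount proceeds; then on any later mount failure, and in
	 * cifs_put_super() at umount time: */
	unload_nls(cifs_sb->local_nls);
	bdi_destroy(&cifs_sb->bdi);	/* pairs with the setup above */
	kfree(cifs_sb);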
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 78c1b86d55f6..0242ff9cbf41 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,8 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
61extern int cifs_rmdir(struct inode *, struct dentry *); 61extern int cifs_rmdir(struct inode *, struct dentry *);
62extern int cifs_rename(struct inode *, struct dentry *, struct inode *, 62extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
63 struct dentry *); 63 struct dentry *);
64extern int cifs_revalidate(struct dentry *); 64extern int cifs_revalidate_file(struct file *filp);
65extern int cifs_revalidate_dentry(struct dentry *);
65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 66extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
66extern int cifs_setattr(struct dentry *, struct iattr *); 67extern int cifs_setattr(struct dentry *, struct iattr *);
67 68
@@ -113,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
113extern const struct export_operations cifs_export_ops; 114extern const struct export_operations cifs_export_ops;
114#endif /* EXPERIMENTAL */ 115#endif /* EXPERIMENTAL */
115 116
116#define CIFS_VERSION "1.62" 117#define CIFS_VERSION "1.64"
117#endif /* _CIFSFS_H */ 118#endif /* _CIFSFS_H */
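The old cifs_revalidate() prototype splits into file and dentry flavors: with an open file the client can revalidate by handle (the new CIFSSMBQFileInfo/CIFSSMBUnixQFileInfo declarations in cifsproto.h below) instead of re-walking the path. cifs_llseek() in cifsfs.c above already switched over; the usage sketch is simply:

	/* usage sketch, as in cifs_llseek(): with an open file, revalidate
	 * by netfid rather than by path */
	retval = cifs_revalidate_file(file);
	if (retval < 0)
		return (loff_t)retval;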
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a1c817eb291a..a88479ceaad5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
18 */ 18 */
19#include <linux/in.h> 19#include <linux/in.h>
20#include <linux/in6.h> 20#include <linux/in6.h>
21#include <linux/slab.h>
21#include <linux/slow-work.h> 22#include <linux/slow-work.h>
22#include "cifs_fs_sb.h" 23#include "cifs_fs_sb.h"
23#include "cifsacl.h" 24#include "cifsacl.h"
@@ -86,7 +87,6 @@ enum securityEnum {
86 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ 87 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
87/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */ 88/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
88 Kerberos, /* Kerberos via SPNEGO */ 89 Kerberos, /* Kerberos via SPNEGO */
89 MSKerberos, /* MS Kerberos via SPNEGO */
90}; 90};
91 91
92enum protocolEnum { 92enum protocolEnum {
@@ -184,6 +184,12 @@ struct TCP_Server_Info {
184 struct mac_key mac_signing_key; 184 struct mac_key mac_signing_key;
185 char ntlmv2_hash[16]; 185 char ntlmv2_hash[16];
186 unsigned long lstrp; /* when we got last response from this server */ 186 unsigned long lstrp; /* when we got last response from this server */
187 u16 dialect; /* dialect index that server chose */
188 /* extended security flavors that server supports */
189 bool sec_kerberos; /* supports plain Kerberos */
190 bool sec_mskerberos; /* supports legacy MS Kerberos */
191 bool sec_kerberosu2u; /* supports U2U Kerberos */
192 bool sec_ntlmssp; /* supports NTLMSSP */
187}; 193};
188 194
189/* 195/*
@@ -389,6 +395,7 @@ struct cifsInodeInfo {
389 bool clientCanCacheRead:1; /* read oplock */ 395 bool clientCanCacheRead:1; /* read oplock */
390 bool clientCanCacheAll:1; /* read and writebehind oplock */ 396 bool clientCanCacheAll:1; /* read and writebehind oplock */
391 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 397 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
398 bool invalid_mapping:1; /* pagecache is invalid */
392 u64 server_eof; /* current file size on server */ 399 u64 server_eof; /* current file size on server */
393 u64 uniqueid; /* server inode number */ 400 u64 uniqueid; /* server inode number */
394 struct inode vfs_inode; 401 struct inode vfs_inode;
@@ -500,6 +507,7 @@ struct dfs_info3_param {
500#define CIFS_FATTR_DFS_REFERRAL 0x1 507#define CIFS_FATTR_DFS_REFERRAL 0x1
501#define CIFS_FATTR_DELETE_PENDING 0x2 508#define CIFS_FATTR_DELETE_PENDING 0x2
502#define CIFS_FATTR_NEED_REVAL 0x4 509#define CIFS_FATTR_NEED_REVAL 0x4
510#define CIFS_FATTR_INO_COLLISION 0x8
503 511
504struct cifs_fattr { 512struct cifs_fattr {
505 u32 cf_flags; 513 u32 cf_flags;
@@ -715,7 +723,7 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
715GLOBAL_EXTERN unsigned int oplockEnabled; 723GLOBAL_EXTERN unsigned int oplockEnabled;
716GLOBAL_EXTERN unsigned int experimEnabled; 724GLOBAL_EXTERN unsigned int experimEnabled;
717GLOBAL_EXTERN unsigned int lookupCacheEnabled; 725GLOBAL_EXTERN unsigned int lookupCacheEnabled;
718GLOBAL_EXTERN unsigned int extended_security; /* if on, session setup sent 726GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
719 with more secure ntlmssp2 challenge/resp */ 727 with more secure ntlmssp2 challenge/resp */
720GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ 728GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
721GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ 729GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
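With MSKerberos dropped from enum securityEnum, that distinction moves into the new per-server booleans, which decode_negTokenInit() (see the cifsproto.h change below) fills in from the mechanism list in the server's negotiate blob. A hypothetical consumer, only to show the intended shape; the fallback choice is an assumption, not taken from this patch:

	/* hypothetical helper: how the new capability bits are meant
	 * to be read once negTokenInit has been decoded */
	static enum securityEnum pick_auth_mech(const struct TCP_Server_Info *server)
	{
		if (server->sec_kerberos || server->sec_mskerberos)
			return Kerberos;	/* spnego upcall path */
		if (server->sec_ntlmssp)
			return RawNTLMSSP;
		return NTLMv2;	/* assumed default when nothing is advertised */
	}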
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 88e2bc44ac58..fb1657e0fdb8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -39,8 +39,20 @@ extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
39 unsigned int /* length */); 39 unsigned int /* length */);
40extern unsigned int _GetXid(void); 40extern unsigned int _GetXid(void);
41extern void _FreeXid(unsigned int); 41extern void _FreeXid(unsigned int);
42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid())); 42#define GetXid() \
43#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));} 43({ \
44 int __xid = (int)_GetXid(); \
45 cFYI(1, "CIFS VFS: in %s as Xid: %d with uid: %d", \
46 __func__, __xid, current_fsuid()); \
47 __xid; \
48})
49
50#define FreeXid(curr_xid) \
51do { \
52 _FreeXid(curr_xid); \
53 cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d", \
54 __func__, curr_xid, (int)rc); \
55} while (0)
44extern char *build_path_from_dentry(struct dentry *); 56extern char *build_path_from_dentry(struct dentry *);
45extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); 57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
46extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 58extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
@@ -73,7 +85,7 @@ extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
73extern unsigned int smbCalcSize(struct smb_hdr *ptr); 85extern unsigned int smbCalcSize(struct smb_hdr *ptr);
74extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
75extern int decode_negTokenInit(unsigned char *security_blob, int length, 87extern int decode_negTokenInit(unsigned char *security_blob, int length,
76 enum securityEnum *secType); 88 struct TCP_Server_Info *server);
77extern int cifs_convert_address(char *src, void *dst); 89extern int cifs_convert_address(char *src, void *dst);
78extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); 90extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
79extern void header_assemble(struct smb_hdr *, char /* command */ , 91extern void header_assemble(struct smb_hdr *, char /* command */ ,
@@ -83,7 +95,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
83 struct cifsSesInfo *ses, 95 struct cifsSesInfo *ses,
84 void **request_buf); 96 void **request_buf);
85extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, 97extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
86 const int stage,
87 const struct nls_table *nls_cp); 98 const struct nls_table *nls_cp);
88extern __u16 GetNextMid(struct TCP_Server_Info *server); 99extern __u16 GetNextMid(struct TCP_Server_Info *server);
89extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); 100extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -95,8 +106,11 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
95 __u16 fileHandle, struct file *file, 106 __u16 fileHandle, struct file *file,
96 struct vfsmount *mnt, unsigned int oflags); 107 struct vfsmount *mnt, unsigned int oflags);
97extern int cifs_posix_open(char *full_path, struct inode **pinode, 108extern int cifs_posix_open(char *full_path, struct inode **pinode,
98 struct vfsmount *mnt, int mode, int oflags, 109 struct vfsmount *mnt,
99 __u32 *poplock, __u16 *pnetfid, int xid); 110 struct super_block *sb,
111 int mode, int oflags,
112 __u32 *poplock, __u16 *pnetfid, int xid);
113void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
100extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, 114extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
101 FILE_UNIX_BASIC_INFO *info, 115 FILE_UNIX_BASIC_INFO *info,
102 struct cifs_sb_info *cifs_sb); 116 struct cifs_sb_info *cifs_sb);
@@ -104,10 +118,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
104extern struct inode *cifs_iget(struct super_block *sb, 118extern struct inode *cifs_iget(struct super_block *sb,
105 struct cifs_fattr *fattr); 119 struct cifs_fattr *fattr);
106 120
121extern int cifs_get_file_info(struct file *filp);
107extern int cifs_get_inode_info(struct inode **pinode, 122extern int cifs_get_inode_info(struct inode **pinode,
108 const unsigned char *search_path, 123 const unsigned char *search_path,
109 FILE_ALL_INFO *pfile_info, 124 FILE_ALL_INFO *pfile_info,
110 struct super_block *sb, int xid, const __u16 *pfid); 125 struct super_block *sb, int xid, const __u16 *pfid);
126extern int cifs_get_file_info_unix(struct file *filp);
111extern int cifs_get_inode_info_unix(struct inode **pinode, 127extern int cifs_get_inode_info_unix(struct inode **pinode,
112 const unsigned char *search_path, 128 const unsigned char *search_path,
113 struct super_block *sb, int xid); 129 struct super_block *sb, int xid);
@@ -123,7 +139,9 @@ extern void cifs_dfs_release_automount_timer(void);
123void cifs_proc_init(void); 139void cifs_proc_init(void);
124void cifs_proc_clean(void); 140void cifs_proc_clean(void);
125 141
126extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, 142extern int cifs_negotiate_protocol(unsigned int xid,
143 struct cifsSesInfo *ses);
144extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
127 struct nls_table *nls_info); 145 struct nls_table *nls_info);
128extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses); 146extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses);
129 147
@@ -142,6 +160,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
142extern int CIFSFindClose(const int, struct cifsTconInfo *tcon, 160extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
143 const __u16 search_handle); 161 const __u16 search_handle);
144 162
163extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
164 u16 netfid, FILE_ALL_INFO *pFindData);
145extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 165extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
146 const unsigned char *searchName, 166 const unsigned char *searchName,
147 FILE_ALL_INFO *findData, 167 FILE_ALL_INFO *findData,
@@ -152,6 +172,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
152 FILE_ALL_INFO *findData, 172 FILE_ALL_INFO *findData,
153 const struct nls_table *nls_codepage, int remap); 173 const struct nls_table *nls_codepage, int remap);
154 174
175extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
176 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
155extern int CIFSSMBUnixQPathInfo(const int xid, 177extern int CIFSSMBUnixQPathInfo(const int xid,
156 struct cifsTconInfo *tcon, 178 struct cifsTconInfo *tcon,
157 const unsigned char *searchName, 179 const unsigned char *searchName,
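The GetXid()/FreeXid() rewrite above deserves a note, because the old one-liners were not statement-safe: GetXid() expanded to two statements (the cast plus a cFYI() that quietly relied on a variable literally named xid), and FreeXid()'s bare block broke if/else pairing. The new forms use a GNU statement expression so GetXid() yields a value, and do { } while (0) so FreeXid() acts as one statement; note FreeXid() still expects an rc in scope for its exit log. For example:

	/* why do { } while (0) matters: only part of the old FreeXid()
	 * landed under the if in this common shape */
	if (rc)
		FreeXid(xid);	/* now a single statement, safe before else */
	else
		rc = do_more_work();	/* do_more_work() is a placeholder */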
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 9d17df3e0768..c65c3419dd37 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifssmb.c 2 * fs/cifs/cifssmb.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2009 4 * Copyright (C) International Business Machines Corp., 2002,2010
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * Contains the routines for constructing the SMB PDUs themselves 7 * Contains the routines for constructing the SMB PDUs themselves
@@ -30,6 +30,7 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/vfs.h> 32#include <linux/vfs.h>
33#include <linux/slab.h>
33#include <linux/posix_acl_xattr.h> 34#include <linux/posix_acl_xattr.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include "cifspdu.h" 36#include "cifspdu.h"
@@ -129,8 +130,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
129 if (smb_command != SMB_COM_WRITE_ANDX && 130 if (smb_command != SMB_COM_WRITE_ANDX &&
130 smb_command != SMB_COM_OPEN_ANDX && 131 smb_command != SMB_COM_OPEN_ANDX &&
131 smb_command != SMB_COM_TREE_DISCONNECT) { 132 smb_command != SMB_COM_TREE_DISCONNECT) {
132 cFYI(1, ("can not send cmd %d while umounting", 133 cFYI(1, "can not send cmd %d while umounting",
133 smb_command)); 134 smb_command);
134 return -ENODEV; 135 return -ENODEV;
135 } 136 }
136 } 137 }
@@ -156,7 +157,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
156 * back on-line 157 * back on-line
157 */ 158 */
158 if (!tcon->retry || ses->status == CifsExiting) { 159 if (!tcon->retry || ses->status == CifsExiting) {
159 cFYI(1, ("gave up waiting on reconnect in smb_init")); 160 cFYI(1, "gave up waiting on reconnect in smb_init");
160 return -EHOSTDOWN; 161 return -EHOSTDOWN;
161 } 162 }
162 } 163 }
@@ -171,7 +172,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
171 * reconnect the same SMB session 172 * reconnect the same SMB session
172 */ 173 */
173 mutex_lock(&ses->session_mutex); 174 mutex_lock(&ses->session_mutex);
174 if (ses->need_reconnect) 175 rc = cifs_negotiate_protocol(0, ses);
176 if (rc == 0 && ses->need_reconnect)
175 rc = cifs_setup_session(0, ses, nls_codepage); 177 rc = cifs_setup_session(0, ses, nls_codepage);
176 178
177 /* do we need to reconnect tcon? */ 179 /* do we need to reconnect tcon? */
@@ -183,7 +185,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
183 mark_open_files_invalid(tcon); 185 mark_open_files_invalid(tcon);
184 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); 186 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
185 mutex_unlock(&ses->session_mutex); 187 mutex_unlock(&ses->session_mutex);
186 cFYI(1, ("reconnect tcon rc = %d", rc)); 188 cFYI(1, "reconnect tcon rc = %d", rc);
187 189
188 if (rc) 190 if (rc)
189 goto out; 191 goto out;
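The reconnect path above no longer assumes a still-valid NEGOTIATE from the original connection: cifs_negotiate_protocol() is re-run under the session mutex before session setup, since a reset TCP connection discards the negotiated dialect, capabilities, and signing state. In outline:

	/* outline of the reconnect ordering after this change */
	mutex_lock(&ses->session_mutex);
	rc = cifs_negotiate_protocol(0, ses);	/* redo NEGOTIATE if needed */
	if (rc == 0 && ses->need_reconnect)
		rc = cifs_setup_session(0, ses, nls_codepage); /* SESSION_SETUP */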
@@ -354,7 +356,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
354 struct TCP_Server_Info *server; 356 struct TCP_Server_Info *server;
355 u16 count; 357 u16 count;
356 unsigned int secFlags; 358 unsigned int secFlags;
357 u16 dialect;
358 359
359 if (ses->server) 360 if (ses->server)
360 server = ses->server; 361 server = ses->server;
@@ -371,9 +372,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
371 if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) 372 if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
372 secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */ 373 secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */
373 else /* if override flags set only sign/seal OR them with global auth */ 374 else /* if override flags set only sign/seal OR them with global auth */
374 secFlags = extended_security | ses->overrideSecFlg; 375 secFlags = global_secflags | ses->overrideSecFlg;
375 376
376 cFYI(1, ("secFlags 0x%x", secFlags)); 377 cFYI(1, "secFlags 0x%x", secFlags);
377 378
378 pSMB->hdr.Mid = GetNextMid(server); 379 pSMB->hdr.Mid = GetNextMid(server);
379 pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); 380 pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
@@ -381,14 +382,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
381 if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) 382 if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
382 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 383 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
383 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { 384 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
384 cFYI(1, ("Kerberos only mechanism, enable extended security")); 385 cFYI(1, "Kerberos only mechanism, enable extended security");
385 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 386 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
386 } 387 }
387#ifdef CONFIG_CIFS_EXPERIMENTAL 388#ifdef CONFIG_CIFS_EXPERIMENTAL
388 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) 389 else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
389 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 390 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
390 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { 391 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
391 cFYI(1, ("NTLMSSP only mechanism, enable extended security")); 392 cFYI(1, "NTLMSSP only mechanism, enable extended security");
392 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 393 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
393 } 394 }
394#endif 395#endif
@@ -407,10 +408,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
407 if (rc != 0) 408 if (rc != 0)
408 goto neg_err_exit; 409 goto neg_err_exit;
409 410
410 dialect = le16_to_cpu(pSMBr->DialectIndex); 411 server->dialect = le16_to_cpu(pSMBr->DialectIndex);
411 cFYI(1, ("Dialect: %d", dialect)); 412 cFYI(1, "Dialect: %d", server->dialect);
412 /* Check wct = 1 error case */ 413 /* Check wct = 1 error case */
413 if ((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) { 414 if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
414 /* core returns wct = 1, but we do not ask for core - otherwise 415 /* core returns wct = 1, but we do not ask for core - otherwise
415 small wct just comes when dialect index is -1 indicating we 416 small wct just comes when dialect index is -1 indicating we
416 could not negotiate a common dialect */ 417 could not negotiate a common dialect */
@@ -418,8 +419,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
418 goto neg_err_exit; 419 goto neg_err_exit;
419#ifdef CONFIG_CIFS_WEAK_PW_HASH 420#ifdef CONFIG_CIFS_WEAK_PW_HASH
420 } else if ((pSMBr->hdr.WordCount == 13) 421 } else if ((pSMBr->hdr.WordCount == 13)
421 && ((dialect == LANMAN_PROT) 422 && ((server->dialect == LANMAN_PROT)
422 || (dialect == LANMAN2_PROT))) { 423 || (server->dialect == LANMAN2_PROT))) {
423 __s16 tmp; 424 __s16 tmp;
424 struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr; 425 struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
425 426
@@ -427,8 +428,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
427 (secFlags & CIFSSEC_MAY_PLNTXT)) 428 (secFlags & CIFSSEC_MAY_PLNTXT))
428 server->secType = LANMAN; 429 server->secType = LANMAN;
429 else { 430 else {
430 cERROR(1, ("mount failed weak security disabled" 431 cERROR(1, "mount failed weak security disabled"
431 " in /proc/fs/cifs/SecurityFlags")); 432 " in /proc/fs/cifs/SecurityFlags");
432 rc = -EOPNOTSUPP; 433 rc = -EOPNOTSUPP;
433 goto neg_err_exit; 434 goto neg_err_exit;
434 } 435 }
@@ -461,9 +462,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
461 utc = CURRENT_TIME; 462 utc = CURRENT_TIME;
462 ts = cnvrtDosUnixTm(rsp->SrvTime.Date, 463 ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
463 rsp->SrvTime.Time, 0); 464 rsp->SrvTime.Time, 0);
464 cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d", 465 cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d",
465 (int)ts.tv_sec, (int)utc.tv_sec, 466 (int)ts.tv_sec, (int)utc.tv_sec,
466 (int)(utc.tv_sec - ts.tv_sec))); 467 (int)(utc.tv_sec - ts.tv_sec));
467 val = (int)(utc.tv_sec - ts.tv_sec); 468 val = (int)(utc.tv_sec - ts.tv_sec);
468 seconds = abs(val); 469 seconds = abs(val);
469 result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; 470 result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
@@ -477,7 +478,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
477 server->timeAdj = (int)tmp; 478 server->timeAdj = (int)tmp;
478 server->timeAdj *= 60; /* also in seconds */ 479 server->timeAdj *= 60; /* also in seconds */
479 } 480 }
480 cFYI(1, ("server->timeAdj: %d seconds", server->timeAdj)); 481 cFYI(1, "server->timeAdj: %d seconds", server->timeAdj);
481 482
482 483
483 /* BB get server time for time conversions and add 484 /* BB get server time for time conversions and add
@@ -492,15 +493,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
492 goto neg_err_exit; 493 goto neg_err_exit;
493 } 494 }
494 495
495 cFYI(1, ("LANMAN negotiated")); 496 cFYI(1, "LANMAN negotiated");
496 /* we will not end up setting signing flags - as no signing 497 /* we will not end up setting signing flags - as no signing
497 was in LANMAN and server did not return the flags on */ 498 was in LANMAN and server did not return the flags on */
498 goto signing_check; 499 goto signing_check;
499#else /* weak security disabled */ 500#else /* weak security disabled */
500 } else if (pSMBr->hdr.WordCount == 13) { 501 } else if (pSMBr->hdr.WordCount == 13) {
501 cERROR(1, ("mount failed, cifs module not built " 502 cERROR(1, "mount failed, cifs module not built "
502 "with CIFS_WEAK_PW_HASH support")); 503 "with CIFS_WEAK_PW_HASH support");
503 rc = -EOPNOTSUPP; 504 rc = -EOPNOTSUPP;
504#endif /* WEAK_PW_HASH */ 505#endif /* WEAK_PW_HASH */
505 goto neg_err_exit; 506 goto neg_err_exit;
506 } else if (pSMBr->hdr.WordCount != 17) { 507 } else if (pSMBr->hdr.WordCount != 17) {
@@ -511,14 +512,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
511 /* else wct == 17 NTLM */ 512 /* else wct == 17 NTLM */
512 server->secMode = pSMBr->SecurityMode; 513 server->secMode = pSMBr->SecurityMode;
513 if ((server->secMode & SECMODE_USER) == 0) 514 if ((server->secMode & SECMODE_USER) == 0)
514 cFYI(1, ("share mode security")); 515 cFYI(1, "share mode security");
515 516
516 if ((server->secMode & SECMODE_PW_ENCRYPT) == 0) 517 if ((server->secMode & SECMODE_PW_ENCRYPT) == 0)
517#ifdef CONFIG_CIFS_WEAK_PW_HASH 518#ifdef CONFIG_CIFS_WEAK_PW_HASH
518 if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0) 519 if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
519#endif /* CIFS_WEAK_PW_HASH */ 520#endif /* CIFS_WEAK_PW_HASH */
520 cERROR(1, ("Server requests plain text password" 521 cERROR(1, "Server requests plain text password"
521 " but client support disabled")); 522 " but client support disabled");
522 523
523 if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) 524 if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
524 server->secType = NTLMv2; 525 server->secType = NTLMv2;
@@ -538,7 +539,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
538#endif */ 539#endif */
539 else { 540 else {
540 rc = -EOPNOTSUPP; 541 rc = -EOPNOTSUPP;
541 cERROR(1, ("Invalid security type")); 542 cERROR(1, "Invalid security type");
542 goto neg_err_exit; 543 goto neg_err_exit;
543 } 544 }
544 /* else ... any others ...? */ 545 /* else ... any others ...? */
@@ -550,7 +551,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
550 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), 551 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
551 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 552 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
552 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); 553 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
553 cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf)); 554 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
554 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); 555 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
555 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 556 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
556 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 557 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
@@ -581,7 +582,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
581 if (memcmp(server->server_GUID, 582 if (memcmp(server->server_GUID,
582 pSMBr->u.extended_response. 583 pSMBr->u.extended_response.
583 GUID, 16) != 0) { 584 GUID, 16) != 0) {
584 cFYI(1, ("server UID changed")); 585 cFYI(1, "server UID changed");
585 memcpy(server->server_GUID, 586 memcpy(server->server_GUID,
586 pSMBr->u.extended_response.GUID, 587 pSMBr->u.extended_response.GUID,
587 16); 588 16);
@@ -596,13 +597,19 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
596 server->secType = RawNTLMSSP; 597 server->secType = RawNTLMSSP;
597 } else { 598 } else {
598 rc = decode_negTokenInit(pSMBr->u.extended_response. 599 rc = decode_negTokenInit(pSMBr->u.extended_response.
599 SecurityBlob, 600 SecurityBlob, count - 16,
600 count - 16, 601 server);
601 &server->secType);
602 if (rc == 1) 602 if (rc == 1)
603 rc = 0; 603 rc = 0;
604 else 604 else
605 rc = -EINVAL; 605 rc = -EINVAL;
606
607 if (server->sec_kerberos || server->sec_mskerberos)
608 server->secType = Kerberos;
609 else if (server->sec_ntlmssp)
610 server->secType = RawNTLMSSP;
611 else
612 rc = -EOPNOTSUPP;
606 } 613 }
607 } else 614 } else
608 server->capabilities &= ~CAP_EXTENDED_SECURITY; 615 server->capabilities &= ~CAP_EXTENDED_SECURITY;
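In the hunk above, decode_negTokenInit() no longer writes a single secType through an out-pointer; it is assumed to record, as boolean flags on the server structure (sec_kerberos, sec_mskerberos, sec_ntlmssp), which mechanisms appeared in the server's SPNEGO mechType list, and the caller then reduces those flags to one secType. An illustrative user-space sketch of the per-OID flag setting; the struct and function names here are stand-ins, while the OID byte strings are the standard DER encodings:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Kerberos 5 (1.2.840.113554.1.2.2), MS Kerberos (1.2.840.48018.1.2.2)
 * and NTLMSSP (1.3.6.1.4.1.311.2.2.10), DER-encoded. */
static const unsigned char OID_KRB5[]    = {0x2a,0x86,0x48,0x86,0xf7,0x12,0x01,0x02,0x02};
static const unsigned char OID_MSKRB5[]  = {0x2a,0x86,0x48,0x82,0xf7,0x12,0x01,0x02,0x02};
static const unsigned char OID_NTLMSSP[] = {0x2b,0x06,0x01,0x04,0x01,0x82,0x37,0x02,0x02,0x0a};

struct mech_flags {	/* stand-in for the TCP_Server_Info fields */
	bool sec_kerberos, sec_mskerberos, sec_ntlmssp;
};

static void note_mech(struct mech_flags *f, const unsigned char *oid, size_t len)
{
	if (len == sizeof(OID_KRB5) && !memcmp(oid, OID_KRB5, len))
		f->sec_kerberos = true;
	else if (len == sizeof(OID_MSKRB5) && !memcmp(oid, OID_MSKRB5, len))
		f->sec_mskerberos = true;
	else if (len == sizeof(OID_NTLMSSP) && !memcmp(oid, OID_NTLMSSP, len))
		f->sec_ntlmssp = true;
	/* unknown mechanisms are simply ignored */
}

int main(void)
{
	struct mech_flags f = {0};

	note_mech(&f, OID_NTLMSSP, sizeof(OID_NTLMSSP));
	return f.sec_ntlmssp ? 0 : 1;
}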
@@ -613,22 +620,21 @@ signing_check:
613 if ((secFlags & CIFSSEC_MAY_SIGN) == 0) { 620 if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
614 /* MUST_SIGN already includes the MAY_SIGN FLAG 621 /* MUST_SIGN already includes the MAY_SIGN FLAG
615 so if this is zero it means that signing is disabled */ 622 so if this is zero it means that signing is disabled */
616 cFYI(1, ("Signing disabled")); 623 cFYI(1, "Signing disabled");
617 if (server->secMode & SECMODE_SIGN_REQUIRED) { 624 if (server->secMode & SECMODE_SIGN_REQUIRED) {
618 cERROR(1, ("Server requires " 625 cERROR(1, "Server requires "
619 "packet signing to be enabled in " 626 "packet signing to be enabled in "
620 "/proc/fs/cifs/SecurityFlags.")); 627 "/proc/fs/cifs/SecurityFlags.");
621 rc = -EOPNOTSUPP; 628 rc = -EOPNOTSUPP;
622 } 629 }
623 server->secMode &= 630 server->secMode &=
624 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); 631 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
625 } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { 632 } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
626 /* signing required */ 633 /* signing required */
627 cFYI(1, ("Must sign - secFlags 0x%x", secFlags)); 634 cFYI(1, "Must sign - secFlags 0x%x", secFlags);
628 if ((server->secMode & 635 if ((server->secMode &
629 (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) { 636 (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
630 cERROR(1, 637 cERROR(1, "signing required but server lacks support");
631 ("signing required but server lacks support"));
632 rc = -EOPNOTSUPP; 638 rc = -EOPNOTSUPP;
633 } else 639 } else
634 server->secMode |= SECMODE_SIGN_REQUIRED; 640 server->secMode |= SECMODE_SIGN_REQUIRED;
@@ -642,7 +648,7 @@ signing_check:
642neg_err_exit: 648neg_err_exit:
643 cifs_buf_release(pSMB); 649 cifs_buf_release(pSMB);
644 650
645 cFYI(1, ("negprot rc %d", rc)); 651 cFYI(1, "negprot rc %d", rc);
646 return rc; 652 return rc;
647} 653}
648 654
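Most of the churn in this file is the same mechanical conversion seen throughout the function above: cFYI(1, ("fmt", args)) becomes cFYI(1, "fmt", args), dropping the inner parentheses the old non-variadic macros required. A minimal sketch, not the exact cifs_debug.h definitions, of variadic replacements that accept the new call sites:

/* Minimal sketch only; the real macros also gate on cifsFYI and the
 * CIFS debug Kconfig options. With GNU/C99 variadic macros the format
 * string and its arguments forward straight into printk(), so the
 * extra parentheses become unnecessary. */
#define cFYI(set, fmt, ...)						\
do {									\
	if (set)							\
		printk(KERN_DEBUG "CIFS: " fmt "\n", ##__VA_ARGS__);	\
} while (0)

#define cERROR(set, fmt, ...)						\
do {									\
	if (set)							\
		printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__);	\
} while (0)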
@@ -652,7 +658,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
652 struct smb_hdr *smb_buffer; 658 struct smb_hdr *smb_buffer;
653 int rc = 0; 659 int rc = 0;
654 660
655 cFYI(1, ("In tree disconnect")); 661 cFYI(1, "In tree disconnect");
656 662
657 /* BB: do we need to check this? These should never be NULL. */ 663 /* BB: do we need to check this? These should never be NULL. */
658 if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) 664 if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
@@ -674,7 +680,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
674 680
675 rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0); 681 rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
676 if (rc) 682 if (rc)
677 cFYI(1, ("Tree disconnect failed %d", rc)); 683 cFYI(1, "Tree disconnect failed %d", rc);
678 684
679 /* No need to return error on this operation if tid invalidated and 685 /* No need to return error on this operation if tid invalidated and
680 closed on server already e.g. due to tcp session crashing */ 686 closed on server already e.g. due to tcp session crashing */
@@ -690,7 +696,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
690 LOGOFF_ANDX_REQ *pSMB; 696 LOGOFF_ANDX_REQ *pSMB;
691 int rc = 0; 697 int rc = 0;
692 698
693 cFYI(1, ("In SMBLogoff for session disconnect")); 699 cFYI(1, "In SMBLogoff for session disconnect");
694 700
695 /* 701 /*
696 * BB: do we need to check validity of ses and server? They should 702 * BB: do we need to check validity of ses and server? They should
@@ -743,7 +749,7 @@ CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
743 int bytes_returned = 0; 749 int bytes_returned = 0;
744 __u16 params, param_offset, offset, byte_count; 750 __u16 params, param_offset, offset, byte_count;
745 751
746 cFYI(1, ("In POSIX delete")); 752 cFYI(1, "In POSIX delete");
747PsxDelete: 753PsxDelete:
748 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 754 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
749 (void **) &pSMBr); 755 (void **) &pSMBr);
@@ -795,7 +801,7 @@ PsxDelete:
795 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 801 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
796 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 802 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
797 if (rc) 803 if (rc)
798 cFYI(1, ("Posix delete returned %d", rc)); 804 cFYI(1, "Posix delete returned %d", rc);
799 cifs_buf_release(pSMB); 805 cifs_buf_release(pSMB);
800 806
801 cifs_stats_inc(&tcon->num_deletes); 807 cifs_stats_inc(&tcon->num_deletes);
@@ -842,7 +848,7 @@ DelFileRetry:
842 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 848 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
843 cifs_stats_inc(&tcon->num_deletes); 849 cifs_stats_inc(&tcon->num_deletes);
844 if (rc) 850 if (rc)
845 cFYI(1, ("Error in RMFile = %d", rc)); 851 cFYI(1, "Error in RMFile = %d", rc);
846 852
847 cifs_buf_release(pSMB); 853 cifs_buf_release(pSMB);
848 if (rc == -EAGAIN) 854 if (rc == -EAGAIN)
@@ -861,7 +867,7 @@ CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon, const char *dirName,
861 int bytes_returned; 867 int bytes_returned;
862 int name_len; 868 int name_len;
863 869
864 cFYI(1, ("In CIFSSMBRmDir")); 870 cFYI(1, "In CIFSSMBRmDir");
865RmDirRetry: 871RmDirRetry:
866 rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB, 872 rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB,
867 (void **) &pSMBr); 873 (void **) &pSMBr);
@@ -886,7 +892,7 @@ RmDirRetry:
886 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 892 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
887 cifs_stats_inc(&tcon->num_rmdirs); 893 cifs_stats_inc(&tcon->num_rmdirs);
888 if (rc) 894 if (rc)
889 cFYI(1, ("Error in RMDir = %d", rc)); 895 cFYI(1, "Error in RMDir = %d", rc);
890 896
891 cifs_buf_release(pSMB); 897 cifs_buf_release(pSMB);
892 if (rc == -EAGAIN) 898 if (rc == -EAGAIN)
@@ -904,7 +910,7 @@ CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
904 int bytes_returned; 910 int bytes_returned;
905 int name_len; 911 int name_len;
906 912
907 cFYI(1, ("In CIFSSMBMkDir")); 913 cFYI(1, "In CIFSSMBMkDir");
908MkDirRetry: 914MkDirRetry:
909 rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB, 915 rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB,
910 (void **) &pSMBr); 916 (void **) &pSMBr);
@@ -929,7 +935,7 @@ MkDirRetry:
929 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 935 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
930 cifs_stats_inc(&tcon->num_mkdirs); 936 cifs_stats_inc(&tcon->num_mkdirs);
931 if (rc) 937 if (rc)
932 cFYI(1, ("Error in Mkdir = %d", rc)); 938 cFYI(1, "Error in Mkdir = %d", rc);
933 939
934 cifs_buf_release(pSMB); 940 cifs_buf_release(pSMB);
935 if (rc == -EAGAIN) 941 if (rc == -EAGAIN)
@@ -952,7 +958,7 @@ CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
952 OPEN_PSX_REQ *pdata; 958 OPEN_PSX_REQ *pdata;
953 OPEN_PSX_RSP *psx_rsp; 959 OPEN_PSX_RSP *psx_rsp;
954 960
955 cFYI(1, ("In POSIX Create")); 961 cFYI(1, "In POSIX Create");
956PsxCreat: 962PsxCreat:
957 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 963 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
958 (void **) &pSMBr); 964 (void **) &pSMBr);
@@ -1006,11 +1012,11 @@ PsxCreat:
1006 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1012 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1007 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 1013 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
1008 if (rc) { 1014 if (rc) {
1009 cFYI(1, ("Posix create returned %d", rc)); 1015 cFYI(1, "Posix create returned %d", rc);
1010 goto psx_create_err; 1016 goto psx_create_err;
1011 } 1017 }
1012 1018
1013 cFYI(1, ("copying inode info")); 1019 cFYI(1, "copying inode info");
1014 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 1020 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
1015 1021
1016 if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) { 1022 if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
@@ -1032,11 +1038,11 @@ PsxCreat:
1032 /* check to make sure response data is there */ 1038 /* check to make sure response data is there */
1033 if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) { 1039 if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
1034 pRetData->Type = cpu_to_le32(-1); /* unknown */ 1040 pRetData->Type = cpu_to_le32(-1); /* unknown */
1035 cFYI(DBG2, ("unknown type")); 1041 cFYI(DBG2, "unknown type");
1036 } else { 1042 } else {
1037 if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP) 1043 if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
1038 + sizeof(FILE_UNIX_BASIC_INFO)) { 1044 + sizeof(FILE_UNIX_BASIC_INFO)) {
1039 cERROR(1, ("Open response data too small")); 1045 cERROR(1, "Open response data too small");
1040 pRetData->Type = cpu_to_le32(-1); 1046 pRetData->Type = cpu_to_le32(-1);
1041 goto psx_create_err; 1047 goto psx_create_err;
1042 } 1048 }
@@ -1083,7 +1089,7 @@ static __u16 convert_disposition(int disposition)
1083 ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC; 1089 ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
1084 break; 1090 break;
1085 default: 1091 default:
1086 cFYI(1, ("unknown disposition %d", disposition)); 1092 cFYI(1, "unknown disposition %d", disposition);
1087 ofun = SMBOPEN_OAPPEND; /* regular open */ 1093 ofun = SMBOPEN_OAPPEND; /* regular open */
1088 } 1094 }
1089 return ofun; 1095 return ofun;
@@ -1174,7 +1180,7 @@ OldOpenRetry:
1174 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1180 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
1175 cifs_stats_inc(&tcon->num_opens); 1181 cifs_stats_inc(&tcon->num_opens);
1176 if (rc) { 1182 if (rc) {
1177 cFYI(1, ("Error in Open = %d", rc)); 1183 cFYI(1, "Error in Open = %d", rc);
1178 } else { 1184 } else {
1179 /* BB verify if wct == 15 */ 1185 /* BB verify if wct == 15 */
1180 1186
@@ -1287,7 +1293,7 @@ openRetry:
1287 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1293 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
1288 cifs_stats_inc(&tcon->num_opens); 1294 cifs_stats_inc(&tcon->num_opens);
1289 if (rc) { 1295 if (rc) {
1290 cFYI(1, ("Error in Open = %d", rc)); 1296 cFYI(1, "Error in Open = %d", rc);
1291 } else { 1297 } else {
1292 *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */ 1298 *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
1293 *netfid = pSMBr->Fid; /* cifs fid stays in le */ 1299 *netfid = pSMBr->Fid; /* cifs fid stays in le */
@@ -1325,7 +1331,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1325 int resp_buf_type = 0; 1331 int resp_buf_type = 0;
1326 struct kvec iov[1]; 1332 struct kvec iov[1];
1327 1333
1328 cFYI(1, ("Reading %d bytes on fid %d", count, netfid)); 1334 cFYI(1, "Reading %d bytes on fid %d", count, netfid);
1329 if (tcon->ses->capabilities & CAP_LARGE_FILES) 1335 if (tcon->ses->capabilities & CAP_LARGE_FILES)
1330 wct = 12; 1336 wct = 12;
1331 else { 1337 else {
@@ -1370,7 +1376,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1370 cifs_stats_inc(&tcon->num_reads); 1376 cifs_stats_inc(&tcon->num_reads);
1371 pSMBr = (READ_RSP *)iov[0].iov_base; 1377 pSMBr = (READ_RSP *)iov[0].iov_base;
1372 if (rc) { 1378 if (rc) {
1373 cERROR(1, ("Send error in read = %d", rc)); 1379 cERROR(1, "Send error in read = %d", rc);
1374 } else { 1380 } else {
1375 int data_length = le16_to_cpu(pSMBr->DataLengthHigh); 1381 int data_length = le16_to_cpu(pSMBr->DataLengthHigh);
1376 data_length = data_length << 16; 1382 data_length = data_length << 16;
@@ -1380,15 +1386,15 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1380 /*check that DataLength would not go beyond end of SMB */ 1386 /*check that DataLength would not go beyond end of SMB */
1381 if ((data_length > CIFSMaxBufSize) 1387 if ((data_length > CIFSMaxBufSize)
1382 || (data_length > count)) { 1388 || (data_length > count)) {
1383 cFYI(1, ("bad length %d for count %d", 1389 cFYI(1, "bad length %d for count %d",
1384 data_length, count)); 1390 data_length, count);
1385 rc = -EIO; 1391 rc = -EIO;
1386 *nbytes = 0; 1392 *nbytes = 0;
1387 } else { 1393 } else {
1388 pReadData = (char *) (&pSMBr->hdr.Protocol) + 1394 pReadData = (char *) (&pSMBr->hdr.Protocol) +
1389 le16_to_cpu(pSMBr->DataOffset); 1395 le16_to_cpu(pSMBr->DataOffset);
1390/* if (rc = copy_to_user(buf, pReadData, data_length)) { 1396/* if (rc = copy_to_user(buf, pReadData, data_length)) {
1391 cERROR(1,("Faulting on read rc = %d",rc)); 1397 cERROR(1, "Faulting on read rc = %d",rc);
1392 rc = -EFAULT; 1398 rc = -EFAULT;
1393 }*/ /* can not use copy_to_user when using page cache*/ 1399 }*/ /* can not use copy_to_user when using page cache*/
1394 if (*buf) 1400 if (*buf)
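The read path above reassembles a 32-bit length from the response's DataLengthHigh/DataLength halves and rejects anything that would run past either the negotiated buffer or the caller's request. A user-space sketch of that check, with MAX_BUF standing in for the real CIFSMaxBufSize:

#include <stdio.h>

enum { MAX_BUF = 16384 };	/* illustrative, not the real CIFSMaxBufSize */

/* Rebuild the 32-bit data length from its two 16-bit wire fields and
 * refuse anything the server should not have been able to send. */
static int checked_data_length(unsigned short hi, unsigned short lo,
			       unsigned int count)
{
	unsigned int len = ((unsigned int)hi << 16) + lo;

	if (len > MAX_BUF || len > count)
		return -1;	/* would run past the SMB or the request */
	return (int)len;
}

int main(void)
{
	printf("%d\n", checked_data_length(0, 4096, 8192));	/* 4096 */
	printf("%d\n", checked_data_length(2, 0, 8192));	/* -1 */
	return 0;
}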
@@ -1430,7 +1436,9 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1430 __u32 bytes_sent; 1436 __u32 bytes_sent;
1431 __u16 byte_count; 1437 __u16 byte_count;
1432 1438
1433 /* cFYI(1, ("write at %lld %d bytes", offset, count));*/ 1439 *nbytes = 0;
1440
1441 /* cFYI(1, "write at %lld %d bytes", offset, count);*/
1434 if (tcon->ses == NULL) 1442 if (tcon->ses == NULL)
1435 return -ECONNABORTED; 1443 return -ECONNABORTED;
1436 1444
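The hunk above moves *nbytes = 0 to the top of CIFSSMBWrite so that early returns, such as the -ECONNABORTED path, leave the out-parameter defined. A toy illustration of the convention:

#include <errno.h>
#include <stdio.h>

/* Zero the out-parameter before any exit path so callers never read a
 * stale byte count after an error. */
static int write_op(int connected, unsigned int want, unsigned int *nbytes)
{
	*nbytes = 0;		/* defined on every return path */
	if (!connected)
		return -ECONNABORTED;
	*nbytes = want;		/* pretend the whole write landed */
	return 0;
}

int main(void)
{
	unsigned int n = 0xdeadbeef;
	int rc = write_op(0, 512, &n);

	printf("rc=%d nbytes=%u\n", rc, n);	/* rc=-103 nbytes=0 */
	return 0;
}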
@@ -1511,12 +1519,19 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1511 (struct smb_hdr *) pSMBr, &bytes_returned, long_op); 1519 (struct smb_hdr *) pSMBr, &bytes_returned, long_op);
1512 cifs_stats_inc(&tcon->num_writes); 1520 cifs_stats_inc(&tcon->num_writes);
1513 if (rc) { 1521 if (rc) {
1514 cFYI(1, ("Send error in write = %d", rc)); 1522 cFYI(1, "Send error in write = %d", rc);
1515 *nbytes = 0;
1516 } else { 1523 } else {
1517 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1524 *nbytes = le16_to_cpu(pSMBr->CountHigh);
1518 *nbytes = (*nbytes) << 16; 1525 *nbytes = (*nbytes) << 16;
1519 *nbytes += le16_to_cpu(pSMBr->Count); 1526 *nbytes += le16_to_cpu(pSMBr->Count);
1527
1528 /*
1529 * Mask off high 16 bits when bytes written as returned by the
1530 * server is greater than bytes requested by the client. Some
1531 * OS/2 servers are known to set incorrect CountHigh values.
1532 */
1533 if (*nbytes > count)
1534 *nbytes &= 0xFFFF;
1520 } 1535 }
1521 1536
1522 cifs_buf_release(pSMB); 1537 cifs_buf_release(pSMB);
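The masking added above, and repeated in the CIFSSMBWrite2 hunk further down, works around servers (per the comment, some OS/2 ones) that put garbage in CountHigh: when the reassembled count exceeds what was requested, only the low 16 bits are trusted. A runnable sketch:

#include <assert.h>
#include <stdio.h>

/* Rebuild the written-byte count from the response's two 16-bit
 * fields; a total larger than the request means CountHigh is bogus,
 * so fall back to the low 16 bits. */
static unsigned int decode_write_count(unsigned short count_high,
				       unsigned short count_low,
				       unsigned int requested)
{
	unsigned int nbytes = ((unsigned int)count_high << 16) + count_low;

	if (nbytes > requested)
		nbytes &= 0xFFFF;
	return nbytes;
}

int main(void)
{
	/* sane server: 0x12345 bytes written of a 0x20000-byte request */
	assert(decode_write_count(0x0001, 0x2345, 0x20000) == 0x12345);
	/* buggy server: garbage CountHigh on a 4096-byte write */
	assert(decode_write_count(0xFFFF, 4096, 4096) == 4096);
	puts("ok");
	return 0;
}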
@@ -1541,7 +1556,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1541 1556
1542 *nbytes = 0; 1557 *nbytes = 0;
1543 1558
1544 cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count)); 1559 cFYI(1, "write2 at %lld %d bytes", (long long)offset, count);
1545 1560
1546 if (tcon->ses->capabilities & CAP_LARGE_FILES) { 1561 if (tcon->ses->capabilities & CAP_LARGE_FILES) {
1547 wct = 14; 1562 wct = 14;
@@ -1596,7 +1611,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1596 long_op); 1611 long_op);
1597 cifs_stats_inc(&tcon->num_writes); 1612 cifs_stats_inc(&tcon->num_writes);
1598 if (rc) { 1613 if (rc) {
1599 cFYI(1, ("Send error Write2 = %d", rc)); 1614 cFYI(1, "Send error Write2 = %d", rc);
1600 } else if (resp_buf_type == 0) { 1615 } else if (resp_buf_type == 0) {
1601 /* presumably this can not happen, but best to be safe */ 1616 /* presumably this can not happen, but best to be safe */
1602 rc = -EIO; 1617 rc = -EIO;
@@ -1605,6 +1620,14 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1605 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1620 *nbytes = le16_to_cpu(pSMBr->CountHigh);
1606 *nbytes = (*nbytes) << 16; 1621 *nbytes = (*nbytes) << 16;
1607 *nbytes += le16_to_cpu(pSMBr->Count); 1622 *nbytes += le16_to_cpu(pSMBr->Count);
1623
1624 /*
1625 * Mask off high 16 bits when bytes written as returned by the
1626 * server is greater than bytes requested by the client. OS/2
1627 * servers are known to set incorrect CountHigh values.
1628 */
1629 if (*nbytes > count)
1630 *nbytes &= 0xFFFF;
1608 } 1631 }
1609 1632
1610/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ 1633/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
@@ -1633,7 +1656,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1633 int timeout = 0; 1656 int timeout = 0;
1634 __u16 count; 1657 __u16 count;
1635 1658
1636 cFYI(1, ("CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock)); 1659 cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock);
1637 rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); 1660 rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
1638 1661
1639 if (rc) 1662 if (rc)
@@ -1681,7 +1704,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1681 } 1704 }
1682 cifs_stats_inc(&tcon->num_locks); 1705 cifs_stats_inc(&tcon->num_locks);
1683 if (rc) 1706 if (rc)
1684 cFYI(1, ("Send error in Lock = %d", rc)); 1707 cFYI(1, "Send error in Lock = %d", rc);
1685 1708
1686 /* Note: On -EAGAIN error only caller can retry on handle based calls 1709 /* Note: On -EAGAIN error only caller can retry on handle based calls
1687 since file handle passed in no longer valid */ 1710 since file handle passed in no longer valid */
@@ -1704,7 +1727,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1704 __u16 params, param_offset, offset, byte_count, count; 1727 __u16 params, param_offset, offset, byte_count, count;
1705 struct kvec iov[1]; 1728 struct kvec iov[1];
1706 1729
1707 cFYI(1, ("Posix Lock")); 1730 cFYI(1, "Posix Lock");
1708 1731
1709 if (pLockData == NULL) 1732 if (pLockData == NULL)
1710 return -EINVAL; 1733 return -EINVAL;
@@ -1774,7 +1797,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1774 } 1797 }
1775 1798
1776 if (rc) { 1799 if (rc) {
1777 cFYI(1, ("Send error in Posix Lock = %d", rc)); 1800 cFYI(1, "Send error in Posix Lock = %d", rc);
1778 } else if (get_flag) { 1801 } else if (get_flag) {
1779 /* lock structure can be returned on get */ 1802 /* lock structure can be returned on get */
1780 __u16 data_offset; 1803 __u16 data_offset;
@@ -1793,8 +1816,21 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1793 } 1816 }
1794 parm_data = (struct cifs_posix_lock *) 1817 parm_data = (struct cifs_posix_lock *)
1795 ((char *)&pSMBr->hdr.Protocol + data_offset); 1818 ((char *)&pSMBr->hdr.Protocol + data_offset);
1796 if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) 1819 if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK))
1797 pLockData->fl_type = F_UNLCK; 1820 pLockData->fl_type = F_UNLCK;
1821 else {
1822 if (parm_data->lock_type ==
1823 __constant_cpu_to_le16(CIFS_RDLCK))
1824 pLockData->fl_type = F_RDLCK;
1825 else if (parm_data->lock_type ==
1826 __constant_cpu_to_le16(CIFS_WRLCK))
1827 pLockData->fl_type = F_WRLCK;
1828
1829 pLockData->fl_start = parm_data->start;
1830 pLockData->fl_end = parm_data->start +
1831 parm_data->length - 1;
1832 pLockData->fl_pid = parm_data->pid;
1833 }
1798 } 1834 }
1799 1835
1800plk_err_exit: 1836plk_err_exit:
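The GET branch of CIFSSMBPosixLock now copies the whole conflicting lock back to the caller, mapping the wire lock type to an fcntl() type and computing the inclusive end as start + length - 1, where before it only recognised unlock. A user-space sketch, assuming the CIFS Unix Extensions type values 0/1/2 for read/write/unlock:

#include <fcntl.h>
#include <stdio.h>

/* Assumed CIFS Unix Extensions lock-type values */
enum { CIFS_RDLCK = 0, CIFS_WRLCK = 1, CIFS_UNLCK = 2 };

/* Map the wire lock type onto the fcntl() types the patched GET path
 * stores in struct file_lock. */
static short cifs_to_fl_type(unsigned short wire_type)
{
	switch (wire_type) {
	case CIFS_RDLCK: return F_RDLCK;
	case CIFS_WRLCK: return F_WRLCK;
	default:         return F_UNLCK;
	}
}

int main(void)
{
	/* a conflicting write lock covering bytes [100, 149] */
	unsigned long long start = 100, length = 50;

	printf("type=%d start=%llu end=%llu\n",
	       cifs_to_fl_type(CIFS_WRLCK), start, start + length - 1);
	return 0;
}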
@@ -1818,7 +1854,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1818{ 1854{
1819 int rc = 0; 1855 int rc = 0;
1820 CLOSE_REQ *pSMB = NULL; 1856 CLOSE_REQ *pSMB = NULL;
1821 cFYI(1, ("In CIFSSMBClose")); 1857 cFYI(1, "In CIFSSMBClose");
1822 1858
1823/* do not retry on dead session on close */ 1859/* do not retry on dead session on close */
1824 rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB); 1860 rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB);
@@ -1835,7 +1871,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1835 if (rc) { 1871 if (rc) {
1836 if (rc != -EINTR) { 1872 if (rc != -EINTR) {
1837 /* EINTR is expected when user ctl-c to kill app */ 1873 /* EINTR is expected when user ctl-c to kill app */
1838 cERROR(1, ("Send error in Close = %d", rc)); 1874 cERROR(1, "Send error in Close = %d", rc);
1839 } 1875 }
1840 } 1876 }
1841 1877
@@ -1851,7 +1887,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1851{ 1887{
1852 int rc = 0; 1888 int rc = 0;
1853 FLUSH_REQ *pSMB = NULL; 1889 FLUSH_REQ *pSMB = NULL;
1854 cFYI(1, ("In CIFSSMBFlush")); 1890 cFYI(1, "In CIFSSMBFlush");
1855 1891
1856 rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB); 1892 rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
1857 if (rc) 1893 if (rc)
@@ -1862,7 +1898,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
1862 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 1898 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
1863 cifs_stats_inc(&tcon->num_flushes); 1899 cifs_stats_inc(&tcon->num_flushes);
1864 if (rc) 1900 if (rc)
1865 cERROR(1, ("Send error in Flush = %d", rc)); 1901 cERROR(1, "Send error in Flush = %d", rc);
1866 1902
1867 return rc; 1903 return rc;
1868} 1904}
@@ -1879,7 +1915,7 @@ CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
1879 int name_len, name_len2; 1915 int name_len, name_len2;
1880 __u16 count; 1916 __u16 count;
1881 1917
1882 cFYI(1, ("In CIFSSMBRename")); 1918 cFYI(1, "In CIFSSMBRename");
1883renameRetry: 1919renameRetry:
1884 rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB, 1920 rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB,
1885 (void **) &pSMBr); 1921 (void **) &pSMBr);
@@ -1925,7 +1961,7 @@ renameRetry:
1925 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 1961 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
1926 cifs_stats_inc(&tcon->num_renames); 1962 cifs_stats_inc(&tcon->num_renames);
1927 if (rc) 1963 if (rc)
1928 cFYI(1, ("Send error in rename = %d", rc)); 1964 cFYI(1, "Send error in rename = %d", rc);
1929 1965
1930 cifs_buf_release(pSMB); 1966 cifs_buf_release(pSMB);
1931 1967
@@ -1949,7 +1985,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
1949 int len_of_str; 1985 int len_of_str;
1950 __u16 params, param_offset, offset, count, byte_count; 1986 __u16 params, param_offset, offset, count, byte_count;
1951 1987
1952 cFYI(1, ("Rename to File by handle")); 1988 cFYI(1, "Rename to File by handle");
1953 rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB, 1989 rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB,
1954 (void **) &pSMBr); 1990 (void **) &pSMBr);
1955 if (rc) 1991 if (rc)
@@ -2004,7 +2040,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
2004 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2040 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2005 cifs_stats_inc(&pTcon->num_t2renames); 2041 cifs_stats_inc(&pTcon->num_t2renames);
2006 if (rc) 2042 if (rc)
2007 cFYI(1, ("Send error in Rename (by file handle) = %d", rc)); 2043 cFYI(1, "Send error in Rename (by file handle) = %d", rc);
2008 2044
2009 cifs_buf_release(pSMB); 2045 cifs_buf_release(pSMB);
2010 2046
@@ -2026,7 +2062,7 @@ CIFSSMBCopy(const int xid, struct cifsTconInfo *tcon, const char *fromName,
2026 int name_len, name_len2; 2062 int name_len, name_len2;
2027 __u16 count; 2063 __u16 count;
2028 2064
2029 cFYI(1, ("In CIFSSMBCopy")); 2065 cFYI(1, "In CIFSSMBCopy");
2030copyRetry: 2066copyRetry:
2031 rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB, 2067 rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB,
2032 (void **) &pSMBr); 2068 (void **) &pSMBr);
@@ -2071,8 +2107,8 @@ copyRetry:
2071 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2107 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2072 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2108 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2073 if (rc) { 2109 if (rc) {
2074 cFYI(1, ("Send error in copy = %d with %d files copied", 2110 cFYI(1, "Send error in copy = %d with %d files copied",
2075 rc, le16_to_cpu(pSMBr->CopyCount))); 2111 rc, le16_to_cpu(pSMBr->CopyCount));
2076 } 2112 }
2077 cifs_buf_release(pSMB); 2113 cifs_buf_release(pSMB);
2078 2114
@@ -2096,7 +2132,7 @@ CIFSUnixCreateSymLink(const int xid, struct cifsTconInfo *tcon,
2096 int bytes_returned = 0; 2132 int bytes_returned = 0;
2097 __u16 params, param_offset, offset, byte_count; 2133 __u16 params, param_offset, offset, byte_count;
2098 2134
2099 cFYI(1, ("In Symlink Unix style")); 2135 cFYI(1, "In Symlink Unix style");
2100createSymLinkRetry: 2136createSymLinkRetry:
2101 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2137 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2102 (void **) &pSMBr); 2138 (void **) &pSMBr);
@@ -2161,7 +2197,7 @@ createSymLinkRetry:
2161 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2197 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2162 cifs_stats_inc(&tcon->num_symlinks); 2198 cifs_stats_inc(&tcon->num_symlinks);
2163 if (rc) 2199 if (rc)
2164 cFYI(1, ("Send error in SetPathInfo create symlink = %d", rc)); 2200 cFYI(1, "Send error in SetPathInfo create symlink = %d", rc);
2165 2201
2166 cifs_buf_release(pSMB); 2202 cifs_buf_release(pSMB);
2167 2203
@@ -2185,7 +2221,7 @@ CIFSUnixCreateHardLink(const int xid, struct cifsTconInfo *tcon,
2185 int bytes_returned = 0; 2221 int bytes_returned = 0;
2186 __u16 params, param_offset, offset, byte_count; 2222 __u16 params, param_offset, offset, byte_count;
2187 2223
2188 cFYI(1, ("In Create Hard link Unix style")); 2224 cFYI(1, "In Create Hard link Unix style");
2189createHardLinkRetry: 2225createHardLinkRetry:
2190 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2226 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2191 (void **) &pSMBr); 2227 (void **) &pSMBr);
@@ -2247,7 +2283,7 @@ createHardLinkRetry:
2247 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2283 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2248 cifs_stats_inc(&tcon->num_hardlinks); 2284 cifs_stats_inc(&tcon->num_hardlinks);
2249 if (rc) 2285 if (rc)
2250 cFYI(1, ("Send error in SetPathInfo (hard link) = %d", rc)); 2286 cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc);
2251 2287
2252 cifs_buf_release(pSMB); 2288 cifs_buf_release(pSMB);
2253 if (rc == -EAGAIN) 2289 if (rc == -EAGAIN)
@@ -2268,7 +2304,7 @@ CIFSCreateHardLink(const int xid, struct cifsTconInfo *tcon,
2268 int name_len, name_len2; 2304 int name_len, name_len2;
2269 __u16 count; 2305 __u16 count;
2270 2306
2271 cFYI(1, ("In CIFSCreateHardLink")); 2307 cFYI(1, "In CIFSCreateHardLink");
2272winCreateHardLinkRetry: 2308winCreateHardLinkRetry:
2273 2309
2274 rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB, 2310 rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB,
@@ -2319,7 +2355,7 @@ winCreateHardLinkRetry:
2319 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2355 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2320 cifs_stats_inc(&tcon->num_hardlinks); 2356 cifs_stats_inc(&tcon->num_hardlinks);
2321 if (rc) 2357 if (rc)
2322 cFYI(1, ("Send error in hard link (NT rename) = %d", rc)); 2358 cFYI(1, "Send error in hard link (NT rename) = %d", rc);
2323 2359
2324 cifs_buf_release(pSMB); 2360 cifs_buf_release(pSMB);
2325 if (rc == -EAGAIN) 2361 if (rc == -EAGAIN)
@@ -2342,7 +2378,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
2342 __u16 params, byte_count; 2378 __u16 params, byte_count;
2343 char *data_start; 2379 char *data_start;
2344 2380
2345 cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName)); 2381 cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName);
2346 2382
2347querySymLinkRetry: 2383querySymLinkRetry:
2348 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2384 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2389,7 +2425,7 @@ querySymLinkRetry:
2389 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2425 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2390 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2426 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2391 if (rc) { 2427 if (rc) {
2392 cFYI(1, ("Send error in QuerySymLinkInfo = %d", rc)); 2428 cFYI(1, "Send error in QuerySymLinkInfo = %d", rc);
2393 } else { 2429 } else {
2394 /* decode response */ 2430 /* decode response */
2395 2431
@@ -2490,21 +2526,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
2490 2526
2491 /* should we also check that parm and data areas do not overlap? */ 2527 /* should we also check that parm and data areas do not overlap? */
2492 if (*ppparm > end_of_smb) { 2528 if (*ppparm > end_of_smb) {
2493 cFYI(1, ("parms start after end of smb")); 2529 cFYI(1, "parms start after end of smb");
2494 return -EINVAL; 2530 return -EINVAL;
2495 } else if (parm_count + *ppparm > end_of_smb) { 2531 } else if (parm_count + *ppparm > end_of_smb) {
2496 cFYI(1, ("parm end after end of smb")); 2532 cFYI(1, "parm end after end of smb");
2497 return -EINVAL; 2533 return -EINVAL;
2498 } else if (*ppdata > end_of_smb) { 2534 } else if (*ppdata > end_of_smb) {
2499 cFYI(1, ("data starts after end of smb")); 2535 cFYI(1, "data starts after end of smb");
2500 return -EINVAL; 2536 return -EINVAL;
2501 } else if (data_count + *ppdata > end_of_smb) { 2537 } else if (data_count + *ppdata > end_of_smb) {
2502 cFYI(1, ("data %p + count %d (%p) ends after end of smb %p start %p", 2538 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
2503 *ppdata, data_count, (data_count + *ppdata), 2539 *ppdata, data_count, (data_count + *ppdata),
2504 end_of_smb, pSMBr)); 2540 end_of_smb, pSMBr);
2505 return -EINVAL; 2541 return -EINVAL;
2506 } else if (parm_count + data_count > pSMBr->ByteCount) { 2542 } else if (parm_count + data_count > pSMBr->ByteCount) {
2507 cFYI(1, ("parm count and data count larger than SMB")); 2543 cFYI(1, "parm count and data count larger than SMB");
2508 return -EINVAL; 2544 return -EINVAL;
2509 } 2545 }
2510 *pdatalen = data_count; 2546 *pdatalen = data_count;
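validate_ntransact() above treats every offset and count in the response as untrusted, checking both the start and the end of each region against the received SMB. The generic, overflow-safe shape of those checks:

#include <stdbool.h>
#include <stddef.h>

/* A wire-supplied region is usable only if its start and its end both
 * land inside the received buffer. Writing the end test as
 * count <= buflen - off avoids the integer overflow that the naive
 * off + count <= buflen form could hit. */
static bool region_in_buf(size_t buflen, size_t off, size_t count)
{
	return off <= buflen && count <= buflen - off;
}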
@@ -2523,7 +2559,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2523 struct smb_com_transaction_ioctl_req *pSMB; 2559 struct smb_com_transaction_ioctl_req *pSMB;
2524 struct smb_com_transaction_ioctl_rsp *pSMBr; 2560 struct smb_com_transaction_ioctl_rsp *pSMBr;
2525 2561
2526 cFYI(1, ("In Windows reparse style QueryLink for path %s", searchName)); 2562 cFYI(1, "In Windows reparse style QueryLink for path %s", searchName);
2527 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, 2563 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
2528 (void **) &pSMBr); 2564 (void **) &pSMBr);
2529 if (rc) 2565 if (rc)
@@ -2552,7 +2588,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2552 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2588 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2553 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2589 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2554 if (rc) { 2590 if (rc) {
2555 cFYI(1, ("Send error in QueryReparseLinkInfo = %d", rc)); 2591 cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc);
2556 } else { /* decode response */ 2592 } else { /* decode response */
2557 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); 2593 __u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
2558 __u32 data_count = le32_to_cpu(pSMBr->DataCount); 2594 __u32 data_count = le32_to_cpu(pSMBr->DataCount);
@@ -2576,7 +2612,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2576 if ((reparse_buf->LinkNamesBuf + 2612 if ((reparse_buf->LinkNamesBuf +
2577 reparse_buf->TargetNameOffset + 2613 reparse_buf->TargetNameOffset +
2578 reparse_buf->TargetNameLen) > end_of_smb) { 2614 reparse_buf->TargetNameLen) > end_of_smb) {
2579 cFYI(1, ("reparse buf beyond SMB")); 2615 cFYI(1, "reparse buf beyond SMB");
2580 rc = -EIO; 2616 rc = -EIO;
2581 goto qreparse_out; 2617 goto qreparse_out;
2582 } 2618 }
@@ -2597,12 +2633,12 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2597 } 2633 }
2598 } else { 2634 } else {
2599 rc = -EIO; 2635 rc = -EIO;
2600 cFYI(1, ("Invalid return data count on " 2636 cFYI(1, "Invalid return data count on "
2601 "get reparse info ioctl")); 2637 "get reparse info ioctl");
2602 } 2638 }
2603 symlinkinfo[buflen] = 0; /* just in case so the caller 2639 symlinkinfo[buflen] = 0; /* just in case so the caller
2604 does not go off the end of the buffer */ 2640 does not go off the end of the buffer */
2605 cFYI(1, ("readlink result - %s", symlinkinfo)); 2641 cFYI(1, "readlink result - %s", symlinkinfo);
2606 } 2642 }
2607 2643
2608qreparse_out: 2644qreparse_out:
@@ -2625,7 +2661,7 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace,
2625 ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); 2661 ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
2626 ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); 2662 ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag);
2627 ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); 2663 ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
2628 /* cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id)); */ 2664 /* cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
2629 2665
2630 return; 2666 return;
2631} 2667}
@@ -2651,8 +2687,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
2651 size += sizeof(struct cifs_posix_ace) * count; 2687 size += sizeof(struct cifs_posix_ace) * count;
2652 /* check if we would go beyond end of SMB */ 2688 /* check if we would go beyond end of SMB */
2653 if (size_of_data_area < size) { 2689 if (size_of_data_area < size) {
2654 cFYI(1, ("bad CIFS POSIX ACL size %d vs. %d", 2690 cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
2655 size_of_data_area, size)); 2691 size_of_data_area, size);
2656 return -EINVAL; 2692 return -EINVAL;
2657 } 2693 }
2658 } else if (acl_type & ACL_TYPE_DEFAULT) { 2694 } else if (acl_type & ACL_TYPE_DEFAULT) {
@@ -2699,7 +2735,7 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
2699 cifs_ace->cifs_uid = cpu_to_le64(-1); 2735 cifs_ace->cifs_uid = cpu_to_le64(-1);
2700 } else 2736 } else
2701 cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id)); 2737 cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
2702 /*cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id));*/ 2738 /*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
2703 return rc; 2739 return rc;
2704} 2740}
2705 2741
@@ -2717,12 +2753,12 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
2717 return 0; 2753 return 0;
2718 2754
2719 count = posix_acl_xattr_count((size_t)buflen); 2755 count = posix_acl_xattr_count((size_t)buflen);
2720 cFYI(1, ("setting acl with %d entries from buf of length %d and " 2756 cFYI(1, "setting acl with %d entries from buf of length %d and "
2721 "version of %d", 2757 "version of %d",
2722 count, buflen, le32_to_cpu(local_acl->a_version))); 2758 count, buflen, le32_to_cpu(local_acl->a_version));
2723 if (le32_to_cpu(local_acl->a_version) != 2) { 2759 if (le32_to_cpu(local_acl->a_version) != 2) {
2724 cFYI(1, ("unknown POSIX ACL version %d", 2760 cFYI(1, "unknown POSIX ACL version %d",
2725 le32_to_cpu(local_acl->a_version))); 2761 le32_to_cpu(local_acl->a_version));
2726 return 0; 2762 return 0;
2727 } 2763 }
2728 cifs_acl->version = cpu_to_le16(1); 2764 cifs_acl->version = cpu_to_le16(1);
@@ -2731,7 +2767,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
2731 else if (acl_type == ACL_TYPE_DEFAULT) 2767 else if (acl_type == ACL_TYPE_DEFAULT)
2732 cifs_acl->default_entry_count = cpu_to_le16(count); 2768 cifs_acl->default_entry_count = cpu_to_le16(count);
2733 else { 2769 else {
2734 cFYI(1, ("unknown ACL type %d", acl_type)); 2770 cFYI(1, "unknown ACL type %d", acl_type);
2735 return 0; 2771 return 0;
2736 } 2772 }
2737 for (i = 0; i < count; i++) { 2773 for (i = 0; i < count; i++) {
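posix_acl_xattr_count() in the hunk above derives the ACL entry count purely from the xattr blob length. A sketch of that derivation, assuming the standard v2 layout of a 4-byte a_version header followed by fixed 8-byte entries (e_tag, e_perm, e_id):

#include <stddef.h>

/* Entry count falls straight out of the blob length: strip the 4-byte
 * header, then divide by the 8-byte entry size; anything that does not
 * divide evenly is not a well-formed v2 blob. */
static int acl_xattr_count(size_t buflen)
{
	if (buflen < 4 || (buflen - 4) % 8 != 0)
		return -1;
	return (int)((buflen - 4) / 8);
}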
@@ -2764,7 +2800,7 @@ CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
2764 int name_len; 2800 int name_len;
2765 __u16 params, byte_count; 2801 __u16 params, byte_count;
2766 2802
2767 cFYI(1, ("In GetPosixACL (Unix) for path %s", searchName)); 2803 cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
2768 2804
2769queryAclRetry: 2805queryAclRetry:
2770 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2806 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2816,7 +2852,7 @@ queryAclRetry:
2816 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2852 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2817 cifs_stats_inc(&tcon->num_acl_get); 2853 cifs_stats_inc(&tcon->num_acl_get);
2818 if (rc) { 2854 if (rc) {
2819 cFYI(1, ("Send error in Query POSIX ACL = %d", rc)); 2855 cFYI(1, "Send error in Query POSIX ACL = %d", rc);
2820 } else { 2856 } else {
2821 /* decode response */ 2857 /* decode response */
2822 2858
@@ -2853,7 +2889,7 @@ CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
2853 int bytes_returned = 0; 2889 int bytes_returned = 0;
2854 __u16 params, byte_count, data_count, param_offset, offset; 2890 __u16 params, byte_count, data_count, param_offset, offset;
2855 2891
2856 cFYI(1, ("In SetPosixACL (Unix) for path %s", fileName)); 2892 cFYI(1, "In SetPosixACL (Unix) for path %s", fileName);
2857setAclRetry: 2893setAclRetry:
2858 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2894 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2859 (void **) &pSMBr); 2895 (void **) &pSMBr);
@@ -2908,7 +2944,7 @@ setAclRetry:
2908 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2944 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2909 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2945 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2910 if (rc) 2946 if (rc)
2911 cFYI(1, ("Set POSIX ACL returned %d", rc)); 2947 cFYI(1, "Set POSIX ACL returned %d", rc);
2912 2948
2913setACLerrorExit: 2949setACLerrorExit:
2914 cifs_buf_release(pSMB); 2950 cifs_buf_release(pSMB);
@@ -2928,7 +2964,7 @@ CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
2928 int bytes_returned; 2964 int bytes_returned;
2929 __u16 params, byte_count; 2965 __u16 params, byte_count;
2930 2966
2931 cFYI(1, ("In GetExtAttr")); 2967 cFYI(1, "In GetExtAttr");
2932 if (tcon == NULL) 2968 if (tcon == NULL)
2933 return -ENODEV; 2969 return -ENODEV;
2934 2970
@@ -2967,7 +3003,7 @@ GetExtAttrRetry:
2967 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3003 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2968 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3004 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2969 if (rc) { 3005 if (rc) {
2970 cFYI(1, ("error %d in GetExtAttr", rc)); 3006 cFYI(1, "error %d in GetExtAttr", rc);
2971 } else { 3007 } else {
2972 /* decode response */ 3008 /* decode response */
2973 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3009 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -2982,7 +3018,7 @@ GetExtAttrRetry:
2982 struct file_chattr_info *pfinfo; 3018 struct file_chattr_info *pfinfo;
2983 /* BB Do we need a cast or hash here ? */ 3019 /* BB Do we need a cast or hash here ? */
2984 if (count != 16) { 3020 if (count != 16) {
2985 cFYI(1, ("Illegal size ret in GetExtAttr")); 3021 cFYI(1, "Illegal size ret in GetExtAttr");
2986 rc = -EIO; 3022 rc = -EIO;
2987 goto GetExtAttrOut; 3023 goto GetExtAttrOut;
2988 } 3024 }
@@ -3012,7 +3048,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3012 QUERY_SEC_DESC_REQ *pSMB; 3048 QUERY_SEC_DESC_REQ *pSMB;
3013 struct kvec iov[1]; 3049 struct kvec iov[1];
3014 3050
3015 cFYI(1, ("GetCifsACL")); 3051 cFYI(1, "GetCifsACL");
3016 3052
3017 *pbuflen = 0; 3053 *pbuflen = 0;
3018 *acl_inf = NULL; 3054 *acl_inf = NULL;
@@ -3037,7 +3073,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3037 CIFS_STD_OP); 3073 CIFS_STD_OP);
3038 cifs_stats_inc(&tcon->num_acl_get); 3074 cifs_stats_inc(&tcon->num_acl_get);
3039 if (rc) { 3075 if (rc) {
3040 cFYI(1, ("Send error in QuerySecDesc = %d", rc)); 3076 cFYI(1, "Send error in QuerySecDesc = %d", rc);
3041 } else { /* decode response */ 3077 } else { /* decode response */
3042 __le32 *parm; 3078 __le32 *parm;
3043 __u32 parm_len; 3079 __u32 parm_len;
@@ -3052,7 +3088,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3052 goto qsec_out; 3088 goto qsec_out;
3053 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base; 3089 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
3054 3090
3055 cFYI(1, ("smb %p parm %p data %p", pSMBr, parm, *acl_inf)); 3091 cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf);
3056 3092
3057 if (le32_to_cpu(pSMBr->ParameterCount) != 4) { 3093 if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
3058 rc = -EIO; /* bad smb */ 3094 rc = -EIO; /* bad smb */
@@ -3064,8 +3100,8 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3064 3100
3065 acl_len = le32_to_cpu(*parm); 3101 acl_len = le32_to_cpu(*parm);
3066 if (acl_len != *pbuflen) { 3102 if (acl_len != *pbuflen) {
3067 cERROR(1, ("acl length %d does not match %d", 3103 cERROR(1, "acl length %d does not match %d",
3068 acl_len, *pbuflen)); 3104 acl_len, *pbuflen);
3069 if (*pbuflen > acl_len) 3105 if (*pbuflen > acl_len)
3070 *pbuflen = acl_len; 3106 *pbuflen = acl_len;
3071 } 3107 }
@@ -3074,7 +3110,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3074 header followed by the smallest SID */ 3110 header followed by the smallest SID */
3075 if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) || 3111 if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
3076 (*pbuflen >= 64 * 1024)) { 3112 (*pbuflen >= 64 * 1024)) {
3077 cERROR(1, ("bad acl length %d", *pbuflen)); 3113 cERROR(1, "bad acl length %d", *pbuflen);
3078 rc = -EINVAL; 3114 rc = -EINVAL;
3079 *pbuflen = 0; 3115 *pbuflen = 0;
3080 } else { 3116 } else {
@@ -3148,9 +3184,9 @@ setCifsAclRetry:
3148 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3184 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3149 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3185 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3150 3186
3151 cFYI(1, ("SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc)); 3187 cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc);
3152 if (rc) 3188 if (rc)
3153 cFYI(1, ("Set CIFS ACL returned %d", rc)); 3189 cFYI(1, "Set CIFS ACL returned %d", rc);
3154 cifs_buf_release(pSMB); 3190 cifs_buf_release(pSMB);
3155 3191
3156 if (rc == -EAGAIN) 3192 if (rc == -EAGAIN)
@@ -3174,7 +3210,7 @@ int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
3174 int bytes_returned; 3210 int bytes_returned;
3175 int name_len; 3211 int name_len;
3176 3212
3177 cFYI(1, ("In SMBQPath path %s", searchName)); 3213 cFYI(1, "In SMBQPath path %s", searchName);
3178QInfRetry: 3214QInfRetry:
3179 rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB, 3215 rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB,
3180 (void **) &pSMBr); 3216 (void **) &pSMBr);
@@ -3200,7 +3236,7 @@ QInfRetry:
3200 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3236 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3201 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3237 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3202 if (rc) { 3238 if (rc) {
3203 cFYI(1, ("Send error in QueryInfo = %d", rc)); 3239 cFYI(1, "Send error in QueryInfo = %d", rc);
3204 } else if (pFinfo) { 3240 } else if (pFinfo) {
3205 struct timespec ts; 3241 struct timespec ts;
3206 __u32 time = le32_to_cpu(pSMBr->last_write_time); 3242 __u32 time = le32_to_cpu(pSMBr->last_write_time);
@@ -3230,8 +3266,72 @@ QInfRetry:
3230 return rc; 3266 return rc;
3231} 3267}
3232 3268
3269int
3270CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
3271 u16 netfid, FILE_ALL_INFO *pFindData)
3272{
3273 struct smb_t2_qfi_req *pSMB = NULL;
3274 struct smb_t2_qfi_rsp *pSMBr = NULL;
3275 int rc = 0;
3276 int bytes_returned;
3277 __u16 params, byte_count;
3278
3279QFileInfoRetry:
3280 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3281 (void **) &pSMBr);
3282 if (rc)
3283 return rc;
3233 3284
3285 params = 2 /* level */ + 2 /* fid */;
3286 pSMB->t2.TotalDataCount = 0;
3287 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3288 /* BB find exact max data count below from sess structure BB */
3289 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3290 pSMB->t2.MaxSetupCount = 0;
3291 pSMB->t2.Reserved = 0;
3292 pSMB->t2.Flags = 0;
3293 pSMB->t2.Timeout = 0;
3294 pSMB->t2.Reserved2 = 0;
3295 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3296 Fid) - 4);
3297 pSMB->t2.DataCount = 0;
3298 pSMB->t2.DataOffset = 0;
3299 pSMB->t2.SetupCount = 1;
3300 pSMB->t2.Reserved3 = 0;
3301 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3302 byte_count = params + 1 /* pad */ ;
3303 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3304 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3305 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
3306 pSMB->Pad = 0;
3307 pSMB->Fid = netfid;
3308 pSMB->hdr.smb_buf_length += byte_count;
3234 3309
3310 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3311 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3312 if (rc) {
3313 cFYI(1, "Send error in QPathInfo = %d", rc);
3314 } else { /* decode response */
3315 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3316
3317 if (rc) /* BB add auto retry on EOPNOTSUPP? */
3318 rc = -EIO;
3319 else if (pSMBr->ByteCount < 40)
3320 rc = -EIO; /* bad smb */
3321 else if (pFindData) {
3322 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3323 memcpy((char *) pFindData,
3324 (char *) &pSMBr->hdr.Protocol +
3325 data_offset, sizeof(FILE_ALL_INFO));
3326 } else
3327 rc = -ENOMEM;
3328 }
3329 cifs_buf_release(pSMB);
3330 if (rc == -EAGAIN)
3331 goto QFileInfoRetry;
3332
3333 return rc;
3334}
3235 3335
3236int 3336int
3237CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 3337CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
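The hunk above adds CIFSSMBQFileInfo(), a by-handle variant of the path-based query: it issues TRANS2_QUERY_FILE_INFORMATION with SMB_QUERY_FILE_ALL_INFO against an open netfid, which avoids a path round trip and still works on files that are open but already unlinked (the debug string it reuses still says QPathInfo). A hypothetical call site, with xid, tcon and netfid being whatever the caller already holds for the open file:

	FILE_ALL_INFO info;
	int rc = CIFSSMBQFileInfo(xid, tcon, netfid, &info);

	if (rc == 0)	/* EndOfFile is little-endian on the wire */
		cFYI(1, "file size %lld", le64_to_cpu(info.EndOfFile));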
@@ -3248,7 +3348,7 @@ CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
3248 int name_len; 3348 int name_len;
3249 __u16 params, byte_count; 3349 __u16 params, byte_count;
3250 3350
3251/* cFYI(1, ("In QPathInfo path %s", searchName)); */ 3351/* cFYI(1, "In QPathInfo path %s", searchName); */
3252QPathInfoRetry: 3352QPathInfoRetry:
3253 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3353 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3254 (void **) &pSMBr); 3354 (void **) &pSMBr);
@@ -3298,7 +3398,7 @@ QPathInfoRetry:
3298 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3398 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3299 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3399 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3300 if (rc) { 3400 if (rc) {
3301 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3401 cFYI(1, "Send error in QPathInfo = %d", rc);
3302 } else { /* decode response */ 3402 } else { /* decode response */
3303 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3403 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3304 3404
@@ -3335,6 +3435,75 @@ QPathInfoRetry:
3335} 3435}
3336 3436
3337int 3437int
3438CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
3439 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
3440{
3441 struct smb_t2_qfi_req *pSMB = NULL;
3442 struct smb_t2_qfi_rsp *pSMBr = NULL;
3443 int rc = 0;
3444 int bytes_returned;
3445 __u16 params, byte_count;
3446
3447UnixQFileInfoRetry:
3448 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3449 (void **) &pSMBr);
3450 if (rc)
3451 return rc;
3452
3453 params = 2 /* level */ + 2 /* fid */;
3454 pSMB->t2.TotalDataCount = 0;
3455 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3456 /* BB find exact max data count below from sess structure BB */
3457 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3458 pSMB->t2.MaxSetupCount = 0;
3459 pSMB->t2.Reserved = 0;
3460 pSMB->t2.Flags = 0;
3461 pSMB->t2.Timeout = 0;
3462 pSMB->t2.Reserved2 = 0;
3463 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3464 Fid) - 4);
3465 pSMB->t2.DataCount = 0;
3466 pSMB->t2.DataOffset = 0;
3467 pSMB->t2.SetupCount = 1;
3468 pSMB->t2.Reserved3 = 0;
3469 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3470 byte_count = params + 1 /* pad */ ;
3471 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3472 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3473 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
3474 pSMB->Pad = 0;
3475 pSMB->Fid = netfid;
3476 pSMB->hdr.smb_buf_length += byte_count;
3477
3478 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3479 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3480 if (rc) {
3481 cFYI(1, "Send error in QPathInfo = %d", rc);
3482 } else { /* decode response */
3483 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3484
+		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
+				  "Unix Extensions can be disabled on mount "
+				  "by specifying the nosfu mount option.");
+			rc = -EIO;	/* bad smb */
+		} else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset,
+			       sizeof(FILE_UNIX_BASIC_INFO));
+		}
+	}
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto UnixQFileInfoRetry;
+
+	return rc;
+}
+
+int
 CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
 		     const unsigned char *searchName,
 		     FILE_UNIX_BASIC_INFO *pFindData,
@@ -3348,7 +3517,7 @@ CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QPathInfo (Unix) the path %s", searchName));
+	cFYI(1, "In QPathInfo (Unix) the path %s", searchName);
 UnixQPathInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -3395,14 +3564,14 @@ UnixQPathInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QPathInfo = %d", rc));
+		cFYI(1, "Send error in QPathInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
-			cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
-				   "Unix Extensions can be disabled on mount "
-				   "by specifying the nosfu mount option."));
+			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
+				  "Unix Extensions can be disabled on mount "
+				  "by specifying the nosfu mount option.");
 			rc = -EIO;	/* bad smb */
 		} else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3436,7 +3605,7 @@ CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In FindFirst for %s", searchName));
+	cFYI(1, "In FindFirst for %s", searchName);
 
 findFirstRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3513,7 +3682,7 @@ findFirstRetry:
 	if (rc) {/* BB add logic to retry regular search if Unix search
 			rejected unexpectedly by server */
 		/* BB Add code to handle unsupported level rc */
-		cFYI(1, ("Error in FindFirst = %d", rc));
+		cFYI(1, "Error in FindFirst = %d", rc);
 
 		cifs_buf_release(pSMB);
 
@@ -3552,7 +3721,7 @@ findFirstRetry:
 			lnoff = le16_to_cpu(parms->LastNameOffset);
 			if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
 			      lnoff) {
-				cERROR(1, ("ignoring corrupt resume name"));
+				cERROR(1, "ignoring corrupt resume name");
 				psrch_inf->last_entry = NULL;
 				return rc;
 			}
@@ -3580,7 +3749,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned, name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In FindNext"));
+	cFYI(1, "In FindNext");
 
 	if (psrch_inf->endOfSearch)
 		return -ENOENT;
@@ -3644,7 +3813,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 			cifs_buf_release(pSMB);
 			rc = 0; /* search probably was closed at end of search*/
 		} else
-			cFYI(1, ("FindNext returned = %d", rc));
+			cFYI(1, "FindNext returned = %d", rc);
 	} else { /* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -3680,15 +3849,15 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 			lnoff = le16_to_cpu(parms->LastNameOffset);
 			if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
 			      lnoff) {
-				cERROR(1, ("ignoring corrupt resume name"));
+				cERROR(1, "ignoring corrupt resume name");
 				psrch_inf->last_entry = NULL;
 				return rc;
 			} else
 				psrch_inf->last_entry =
 					psrch_inf->srch_entries_start + lnoff;
 
-/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d",
-	    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */
+/* cFYI(1, "fnxt2 entries in buf %d index_of_last %d",
+	    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
 
 			/* BB fixme add unlock here */
 		}
@@ -3713,7 +3882,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	FINDCLOSE_REQ *pSMB = NULL;
 
-	cFYI(1, ("In CIFSSMBFindClose"));
+	cFYI(1, "In CIFSSMBFindClose");
 	rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
 
 	/* no sense returning error if session restarted
@@ -3727,7 +3896,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 	pSMB->ByteCount = 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cERROR(1, ("Send error in FindClose = %d", rc));
+		cERROR(1, "Send error in FindClose = %d", rc);
 
 	cifs_stats_inc(&tcon->num_fclose);
 
@@ -3750,7 +3919,7 @@ CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
 	int name_len, bytes_returned;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In GetSrvInodeNum for %s", searchName));
+	cFYI(1, "In GetSrvInodeNum for %s", searchName);
 	if (tcon == NULL)
 		return -ENODEV;
 
@@ -3800,7 +3969,7 @@ GetInodeNumberRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("error %d in QueryInternalInfo", rc));
+		cFYI(1, "error %d in QueryInternalInfo", rc);
 	} else {
 		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3815,7 +3984,7 @@ GetInodeNumberRetry:
 			struct file_internal_info *pfinfo;
 			/* BB Do we need a cast or hash here ? */
 			if (count < 8) {
-				cFYI(1, ("Illegal size ret in QryIntrnlInf"));
+				cFYI(1, "Illegal size ret in QryIntrnlInf");
 				rc = -EIO;
 				goto GetInodeNumOut;
 			}
@@ -3856,16 +4025,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	*num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
 
 	if (*num_of_nodes < 1) {
-		cERROR(1, ("num_referrals: must be at least > 0,"
-			"but we get num_referrals = %d\n", *num_of_nodes));
+		cERROR(1, "num_referrals: must be at least > 0,"
+			"but we get num_referrals = %d\n", *num_of_nodes);
 		rc = -EINVAL;
 		goto parse_DFS_referrals_exit;
 	}
 
 	ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
 	if (ref->VersionNumber != cpu_to_le16(3)) {
-		cERROR(1, ("Referrals of V%d version are not supported,"
-			"should be V3", le16_to_cpu(ref->VersionNumber)));
+		cERROR(1, "Referrals of V%d version are not supported,"
+			"should be V3", le16_to_cpu(ref->VersionNumber));
 		rc = -EINVAL;
 		goto parse_DFS_referrals_exit;
 	}
@@ -3874,19 +4043,19 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	data_end = (char *)(&(pSMBr->PathConsumed)) +
 				le16_to_cpu(pSMBr->t2.DataCount);
 
-	cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n",
+	cFYI(1, "num_referrals: %d dfs flags: 0x%x ...\n",
 			*num_of_nodes,
-			le32_to_cpu(pSMBr->DFSFlags)));
+			le32_to_cpu(pSMBr->DFSFlags));
 
 	*target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
 			*num_of_nodes, GFP_KERNEL);
 	if (*target_nodes == NULL) {
-		cERROR(1, ("Failed to allocate buffer for target_nodes\n"));
+		cERROR(1, "Failed to allocate buffer for target_nodes\n");
 		rc = -ENOMEM;
 		goto parse_DFS_referrals_exit;
 	}
 
-	/* collect neccessary data from referrals */
+	/* collect necessary data from referrals */
 	for (i = 0; i < *num_of_nodes; i++) {
 		char *temp;
 		int max_len;
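
Note on the allocation in the hunk above: kzalloc(sizeof(struct dfs_info3_param) * *num_of_nodes, GFP_KERNEL) multiplies a fixed size by a server-supplied referral count. An overflow-checked way to spell the same allocation (an idiomatic alternative shown for illustration, not what this patch does) is:

	/* kcalloc(n, size, flags) fails cleanly if n * size would overflow */
	*target_nodes = kcalloc(*num_of_nodes,
				sizeof(struct dfs_info3_param), GFP_KERNEL);
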
@@ -3957,7 +4126,7 @@ CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
 	*num_of_nodes = 0;
 	*target_nodes = NULL;
 
-	cFYI(1, ("In GetDFSRefer the path %s", searchName));
+	cFYI(1, "In GetDFSRefer the path %s", searchName);
 	if (ses == NULL)
 		return -ENODEV;
 getDFSRetry:
@@ -4024,7 +4193,7 @@ getDFSRetry:
 	rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in GetDFSRefer = %d", rc));
+		cFYI(1, "Send error in GetDFSRefer = %d", rc);
 		goto GetDFSRefExit;
 	}
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4035,9 +4204,9 @@ getDFSRetry:
 		goto GetDFSRefExit;
 	}
 
-	cFYI(1, ("Decoding GetDFSRefer response BCC: %d Offset %d",
-		pSMBr->ByteCount,
-		le16_to_cpu(pSMBr->t2.DataOffset)));
+	cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d",
+		pSMBr->ByteCount,
+		le16_to_cpu(pSMBr->t2.DataOffset));
 
 	/* parse returned result into more usable form */
 	rc = parse_DFS_referrals(pSMBr, num_of_nodes,
@@ -4065,7 +4234,7 @@ SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("OldQFSInfo"));
+	cFYI(1, "OldQFSInfo");
 oldQFSInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		(void **) &pSMBr);
@@ -4098,7 +4267,7 @@ oldQFSInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSInfo = %d", rc));
+		cFYI(1, "Send error in QFSInfo = %d", rc);
 	} else {                /* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4106,8 +4275,8 @@ oldQFSInfoRetry:
 			rc = -EIO;      /* bad smb */
 		else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			cFYI(1, ("qfsinf resp BCC: %d Offset %d",
-				 pSMBr->ByteCount, data_offset));
+			cFYI(1, "qfsinf resp BCC: %d Offset %d",
+				 pSMBr->ByteCount, data_offset);
 
 			response_data = (FILE_SYSTEM_ALLOC_INFO *)
 				(((char *) &pSMBr->hdr.Protocol) + data_offset);
@@ -4119,11 +4288,10 @@ oldQFSInfoRetry:
 				le32_to_cpu(response_data->TotalAllocationUnits);
 			FSData->f_bfree = FSData->f_bavail =
 				le32_to_cpu(response_data->FreeAllocationUnits);
-			cFYI(1,
-			     ("Blocks: %lld Free: %lld Block size %ld",
-			      (unsigned long long)FSData->f_blocks,
-			      (unsigned long long)FSData->f_bfree,
-			      FSData->f_bsize));
+			cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
+			     (unsigned long long)FSData->f_blocks,
+			     (unsigned long long)FSData->f_bfree,
+			     FSData->f_bsize);
 		}
 	}
 	cifs_buf_release(pSMB);
@@ -4145,7 +4313,7 @@ CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSInfo"));
+	cFYI(1, "In QFSInfo");
 QFSInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4178,7 +4346,7 @@ QFSInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSInfo = %d", rc));
+		cFYI(1, "Send error in QFSInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4199,11 +4367,10 @@ QFSInfoRetry:
 			    le64_to_cpu(response_data->TotalAllocationUnits);
 			FSData->f_bfree = FSData->f_bavail =
 			    le64_to_cpu(response_data->FreeAllocationUnits);
-			cFYI(1,
-			     ("Blocks: %lld Free: %lld Block size %ld",
-			      (unsigned long long)FSData->f_blocks,
-			      (unsigned long long)FSData->f_bfree,
-			      FSData->f_bsize));
+			cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
+			     (unsigned long long)FSData->f_blocks,
+			     (unsigned long long)FSData->f_bfree,
+			     FSData->f_bsize);
 		}
 	}
 	cifs_buf_release(pSMB);
@@ -4225,7 +4392,7 @@ CIFSSMBQFSAttributeInfo(const int xid, struct cifsTconInfo *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSAttributeInfo"));
+	cFYI(1, "In QFSAttributeInfo");
QFSAttributeRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4259,7 +4426,7 @@ QFSAttributeRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, ("Send error in QFSAttributeInfo = %d", rc));
+		cERROR(1, "Send error in QFSAttributeInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4295,7 +4462,7 @@ CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSDeviceInfo"));
+	cFYI(1, "In QFSDeviceInfo");
QFSDeviceRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4330,7 +4497,7 @@ QFSDeviceRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSDeviceInfo = %d", rc));
+		cFYI(1, "Send error in QFSDeviceInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4365,7 +4532,7 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSUnixInfo"));
+	cFYI(1, "In QFSUnixInfo");
QFSUnixRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4399,7 +4566,7 @@ QFSUnixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, ("Send error in QFSUnixInfo = %d", rc));
+		cERROR(1, "Send error in QFSUnixInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4434,7 +4601,7 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In SETFSUnixInfo"));
+	cFYI(1, "In SETFSUnixInfo");
SETFSUnixRetry:
 	/* BB switch to small buf init to save memory */
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -4482,7 +4649,7 @@ SETFSUnixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, ("Send error in SETFSUnixInfo = %d", rc));
+		cERROR(1, "Send error in SETFSUnixInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 		if (rc)
@@ -4510,7 +4677,7 @@ CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSPosixInfo"));
+	cFYI(1, "In QFSPosixInfo");
QFSPosixRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4544,7 +4711,7 @@ QFSPosixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSUnixInfo = %d", rc));
+		cFYI(1, "Send error in QFSUnixInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4604,7 +4771,7 @@ CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	int bytes_returned = 0;
 	__u16 params, byte_count, data_count, param_offset, offset;
 
-	cFYI(1, ("In SetEOF"));
+	cFYI(1, "In SetEOF");
SetEOFRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4670,7 +4837,7 @@ SetEOFRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (file size) returned %d", rc));
+		cFYI(1, "SetPathInfo (file size) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -4690,8 +4857,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("SetFileSize (via SetFileInfo) %lld",
-		(long long)size));
+	cFYI(1, "SetFileSize (via SetFileInfo) %lld",
+		(long long)size);
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -4750,9 +4917,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc) {
-		cFYI(1,
-		     ("Send error in SetFileInfo (SetFileSize) = %d",
-		      rc));
+		cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc);
 	}
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
@@ -4776,7 +4941,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("Set Times (via SetFileInfo)"));
+	cFYI(1, "Set Times (via SetFileInfo)");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -4821,7 +4986,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
+		cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 	   since file handle passed in no longer valid */
@@ -4838,7 +5003,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("Set File Disposition (via SetFileInfo)"));
+	cFYI(1, "Set File Disposition (via SetFileInfo)");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -4880,7 +5045,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
 	*data_offset = delete_file ? 1 : 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cFYI(1, ("Send error in SetFileDisposition = %d", rc));
+		cFYI(1, "Send error in SetFileDisposition = %d", rc);
 
 	return rc;
 }
@@ -4898,7 +5063,7 @@ CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
 	char *data_offset;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("In SetTimes"));
+	cFYI(1, "In SetTimes");
 
SetTimesRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -4954,7 +5119,7 @@ SetTimesRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (times) returned %d", rc));
+		cFYI(1, "SetPathInfo (times) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -4979,7 +5144,7 @@ CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In SetAttrLegacy"));
+	cFYI(1, "In SetAttrLegacy");
 
SetAttrLgcyRetry:
 	rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
@@ -5005,7 +5170,7 @@ SetAttrLgcyRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("Error in LegacySetAttr = %d", rc));
+		cFYI(1, "Error in LegacySetAttr = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -5067,7 +5232,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("Set Unix Info (via SetFileInfo)"));
+	cFYI(1, "Set Unix Info (via SetFileInfo)");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -5112,7 +5277,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
+		cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 	   since file handle passed in no longer valid */
@@ -5133,7 +5298,7 @@ CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
 	FILE_UNIX_BASIC_INFO *data_offset;
 	__u16 params, param_offset, offset, count, byte_count;
 
-	cFYI(1, ("In SetUID/GID/Mode"));
+	cFYI(1, "In SetUID/GID/Mode");
setPermsRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5189,7 +5354,7 @@ setPermsRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (perms) returned %d", rc));
+		cFYI(1, "SetPathInfo (perms) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -5208,7 +5373,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 	struct dir_notify_req *dnotify_req;
 	int bytes_returned;
 
-	cFYI(1, ("In CIFSSMBNotify for file handle %d", (int)netfid));
+	cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
 	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
 	if (rc)
@@ -5242,7 +5407,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 			 (struct smb_hdr *)pSMBr, &bytes_returned,
 			 CIFS_ASYNC_OP);
 	if (rc) {
-		cFYI(1, ("Error in Notify = %d", rc));
+		cFYI(1, "Error in Notify = %d", rc);
 	} else {
 		/* Add file to outstanding requests */
 		/* BB change to kmem cache alloc */
@@ -5298,7 +5463,7 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
 	char *end_of_smb;
 	__u16 params, byte_count, data_offset;
 
-	cFYI(1, ("In Query All EAs path %s", searchName));
+	cFYI(1, "In Query All EAs path %s", searchName);
QAllEAsRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5345,7 +5510,7 @@ QAllEAsRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QueryAllEAs = %d", rc));
+		cFYI(1, "Send error in QueryAllEAs = %d", rc);
 		goto QAllEAsOut;
 	}
 
@@ -5373,16 +5538,16 @@ QAllEAsRetry:
 		(((char *) &pSMBr->hdr.Protocol) + data_offset);
 
 	list_len = le32_to_cpu(ea_response_data->list_len);
-	cFYI(1, ("ea length %d", list_len));
+	cFYI(1, "ea length %d", list_len);
 	if (list_len <= 8) {
-		cFYI(1, ("empty EA list returned from server"));
+		cFYI(1, "empty EA list returned from server");
 		goto QAllEAsOut;
 	}
 
 	/* make sure list_len doesn't go past end of SMB */
 	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
 	if ((char *)ea_response_data + list_len > end_of_smb) {
-		cFYI(1, ("EA list appears to go beyond SMB"));
+		cFYI(1, "EA list appears to go beyond SMB");
 		rc = -EIO;
 		goto QAllEAsOut;
 	}
@@ -5399,7 +5564,7 @@ QAllEAsRetry:
 		temp_ptr += 4;
 		/* make sure we can read name_len and value_len */
 		if (list_len < 0) {
-			cFYI(1, ("EA entry goes beyond length of list"));
+			cFYI(1, "EA entry goes beyond length of list");
 			rc = -EIO;
 			goto QAllEAsOut;
 		}
@@ -5408,7 +5573,7 @@ QAllEAsRetry:
 		value_len = le16_to_cpu(temp_fea->value_len);
 		list_len -= name_len + 1 + value_len;
 		if (list_len < 0) {
-			cFYI(1, ("EA entry goes beyond length of list"));
+			cFYI(1, "EA entry goes beyond length of list");
 			rc = -EIO;
 			goto QAllEAsOut;
 		}
@@ -5475,7 +5640,7 @@ CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	int bytes_returned = 0;
 	__u16 params, param_offset, byte_count, offset, count;
 
-	cFYI(1, ("In SetEA"));
+	cFYI(1, "In SetEA");
SetEARetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5557,7 +5722,7 @@ SetEARetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (EA) returned %d", rc));
+		cFYI(1, "SetPathInfo (EA) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 
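
Every cifssmb.c hunk above is the same mechanical conversion: the debug macros lose the extra parentheses around their printf-style arguments, so cFYI(1, ("fmt", args)) becomes cFYI(1, "fmt", args). That is only possible if cFYI and cERROR are variadic macros. A minimal sketch of what such definitions look like (the real ones live in fs/cifs/cifs_debug.h; the bodies below illustrate the technique and are not the exact kernel code):

	/* sketch only -- not the exact fs/cifs/cifs_debug.h definitions */
	#define cFYI(set, fmt, ...)						\
	do {									\
		if (set)							\
			printk(KERN_DEBUG "CIFS FYI: " fmt "\n", ##__VA_ARGS__);\
	} while (0)

	#define cERROR(set, fmt, ...)						\
	do {									\
		if (set)							\
			printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__);	\
	} while (0)

With the old non-variadic form, the whole argument list had to be passed as a single parenthesized expression, which is why every call site carried the doubled parentheses being removed here.
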
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 45eb6cba793f..2208f06e4c45 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
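
The new include of linux/slab.h is needed because connect.c calls the slab allocators directly and, with the header cleanups going into this merge window, can no longer rely on picking the declarations up indirectly through other headers. Representative call sites appear further down in this same file, for example:

	ses->password = kstrdup(volume_info->password, GFP_KERNEL);
	ses->domainName = kmalloc(len + 1, GFP_KERNEL);

Both kstrdup() and kmalloc() are declared in linux/slab.h.
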
@@ -101,6 +102,7 @@ struct smb_vol {
 	bool sockopt_tcp_nodelay:1;
 	unsigned short int port;
 	char *prepath;
+	struct nls_table *local_nls;
 };
 
 static int ipv4_connect(struct TCP_Server_Info *server);
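
struct smb_vol gains a local_nls pointer so that the parsed mount options carry the caller's charset table along to the connection helpers. A sketch of the intended flow (the assignment site is outside this excerpt and is an assumption; the consuming call appears verbatim in cifs_get_smb_ses() below):

	/* assumed: the mount path stashes the nls table in the parsed options */
	volume_info->local_nls = cifs_sb->local_nls;

	/* ...so session setup can reach it without an extra parameter */
	rc = cifs_setup_session(xid, ses, volume_info->local_nls);
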
@@ -134,7 +136,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	spin_unlock(&GlobalMid_Lock);
 	server->maxBuf = 0;
 
-	cFYI(1, ("Reconnecting tcp session"));
+	cFYI(1, "Reconnecting tcp session");
 
 	/* before reconnecting the tcp session, mark the smb session (uid)
 	   and the tid bad so they are not used until reconnected */
@@ -152,12 +154,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	/* do not want to be sending data on a socket we are freeing */
 	mutex_lock(&server->srv_mutex);
 	if (server->ssocket) {
-		cFYI(1, ("State: 0x%x Flags: 0x%lx", server->ssocket->state,
-			server->ssocket->flags));
+		cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
+			server->ssocket->flags);
 		kernel_sock_shutdown(server->ssocket, SHUT_WR);
-		cFYI(1, ("Post shutdown state: 0x%x Flags: 0x%lx",
-			server->ssocket->state,
-			server->ssocket->flags));
+		cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx",
+			server->ssocket->state,
+			server->ssocket->flags);
 		sock_release(server->ssocket);
 		server->ssocket = NULL;
 	}
@@ -186,7 +188,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	else
 		rc = ipv4_connect(server);
 	if (rc) {
-		cFYI(1, ("reconnect error %d", rc));
+		cFYI(1, "reconnect error %d", rc);
 		msleep(3000);
 	} else {
 		atomic_inc(&tcpSesReconnectCount);
@@ -222,7 +224,7 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
 	/* check for plausible wct, bcc and t2 data and parm sizes */
 	/* check for parm and data offset going beyond end of smb */
 	if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
-		cFYI(1, ("invalid transact2 word count"));
+		cFYI(1, "invalid transact2 word count");
 		return -EINVAL;
 	}
 
@@ -236,15 +238,15 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
 	if (remaining == 0)
 		return 0;
 	else if (remaining < 0) {
-		cFYI(1, ("total data %d smaller than data in frame %d",
-			total_data_size, data_in_this_rsp));
+		cFYI(1, "total data %d smaller than data in frame %d",
+			total_data_size, data_in_this_rsp);
 		return -EINVAL;
 	} else {
-		cFYI(1, ("missing %d bytes from transact2, check next response",
-			remaining));
+		cFYI(1, "missing %d bytes from transact2, check next response",
+			remaining);
 		if (total_data_size > maxBufSize) {
-			cERROR(1, ("TotalDataSize %d is over maximum buffer %d",
-				total_data_size, maxBufSize));
+			cERROR(1, "TotalDataSize %d is over maximum buffer %d",
+				total_data_size, maxBufSize);
 			return -EINVAL;
 		}
 		return remaining;
@@ -266,7 +268,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 	total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
 
 	if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) {
-		cFYI(1, ("total data size of primary and secondary t2 differ"));
+		cFYI(1, "total data size of primary and secondary t2 differ");
 	}
 
 	total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount);
@@ -281,7 +283,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 
 	total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount);
 	if (remaining < total_in_buf2) {
-		cFYI(1, ("transact2 2nd response contains too much data"));
+		cFYI(1, "transact2 2nd response contains too much data");
 	}
 
 	/* find end of first SMB data area */
@@ -310,7 +312,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 	pTargetSMB->smb_buf_length = byte_count;
 
 	if (remaining == total_in_buf2) {
-		cFYI(1, ("found the last secondary response"));
+		cFYI(1, "found the last secondary response");
 		return 0; /* we are done */
 	} else /* more responses to go */
 		return 1;
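
Taken together, check2ndT2() and coalesce_t2() implement reassembly of multi-part transact2 responses: check2ndT2() validates a follow-on frame and reports how many bytes are still outstanding, while coalesce_t2() splices the secondary frame's data into the first buffer, returning 0 once the response is complete and 1 while more frames are expected. A simplified sketch of how the demultiplex loop consumes that return value (deliver_response() is a hypothetical stand-in for waking the waiting mid; the real logic sits on the multi_t2_fnd path below):

	ret = coalesce_t2(smb_buffer, (struct smb_hdr *)bigbuf);
	if (ret < 0)
		cifs_reconnect(server);	/* malformed multi-part response */
	else if (ret == 0)
		deliver_response();	/* hypothetical: full T2 assembled */
	else
		isMultiRsp = true;	/* wait for the next secondary frame */
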
@@ -338,7 +340,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 	int reconnect;
 
 	current->flags |= PF_MEMALLOC;
-	cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current)));
+	cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
 
 	length = atomic_inc_return(&tcpSesAllocCount);
 	if (length > 1)
@@ -352,7 +354,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		if (bigbuf == NULL) {
 			bigbuf = cifs_buf_get();
 			if (!bigbuf) {
-				cERROR(1, ("No memory for large SMB response"));
+				cERROR(1, "No memory for large SMB response");
 				msleep(3000);
 				/* retry will check if exiting */
 				continue;
@@ -365,7 +367,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		if (smallbuf == NULL) {
 			smallbuf = cifs_small_buf_get();
 			if (!smallbuf) {
-				cERROR(1, ("No memory for SMB response"));
+				cERROR(1, "No memory for SMB response");
 				msleep(1000);
 				/* retry will check if exiting */
 				continue;
@@ -390,9 +392,9 @@ incomplete_rcv:
 		if (server->tcpStatus == CifsExiting) {
 			break;
 		} else if (server->tcpStatus == CifsNeedReconnect) {
-			cFYI(1, ("Reconnect after server stopped responding"));
+			cFYI(1, "Reconnect after server stopped responding");
 			cifs_reconnect(server);
-			cFYI(1, ("call to reconnect done"));
+			cFYI(1, "call to reconnect done");
 			csocket = server->ssocket;
 			continue;
 		} else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) {
@@ -410,7 +412,7 @@ incomplete_rcv:
 			continue;
 		} else if (length <= 0) {
 			if (server->tcpStatus == CifsNew) {
-				cFYI(1, ("tcp session abend after SMBnegprot"));
+				cFYI(1, "tcp session abend after SMBnegprot");
 				/* some servers kill the TCP session rather than
 				   returning an SMB negprot error, in which
 				   case reconnecting here is not going to help,
@@ -418,18 +420,18 @@ incomplete_rcv:
 				break;
 			}
 			if (!try_to_freeze() && (length == -EINTR)) {
-				cFYI(1, ("cifsd thread killed"));
+				cFYI(1, "cifsd thread killed");
 				break;
 			}
-			cFYI(1, ("Reconnect after unexpected peek error %d",
-				length));
+			cFYI(1, "Reconnect after unexpected peek error %d",
+				length);
 			cifs_reconnect(server);
 			csocket = server->ssocket;
 			wake_up(&server->response_q);
 			continue;
 		} else if (length < pdu_length) {
-			cFYI(1, ("requested %d bytes but only got %d bytes",
-				pdu_length, length));
+			cFYI(1, "requested %d bytes but only got %d bytes",
+				pdu_length, length);
 			pdu_length -= length;
 			msleep(1);
 			goto incomplete_rcv;
@@ -449,18 +451,18 @@ incomplete_rcv:
 		pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
 		smb_buffer->smb_buf_length = pdu_length;
 
-		cFYI(1, ("rfc1002 length 0x%x", pdu_length+4));
+		cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
 
 		if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
 			continue;
 		} else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
-			cFYI(1, ("Good RFC 1002 session rsp"));
+			cFYI(1, "Good RFC 1002 session rsp");
 			continue;
 		} else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
 			/* we get this from Windows 98 instead of
 			   an error on SMB negprot response */
-			cFYI(1, ("Negative RFC1002 Session Response Error 0x%x)",
-				pdu_length));
+			cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
+				pdu_length);
 			if (server->tcpStatus == CifsNew) {
 				/* if nack on negprot (rather than
 				   ret of smb negprot error) reconnecting
@@ -483,7 +485,7 @@ incomplete_rcv:
 				continue;
 			}
 		} else if (temp != (char) 0) {
-			cERROR(1, ("Unknown RFC 1002 frame"));
+			cERROR(1, "Unknown RFC 1002 frame");
 			cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
 				      length);
 			cifs_reconnect(server);
@@ -494,8 +496,8 @@ incomplete_rcv:
 		/* else we have an SMB response */
 		if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
 			    (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
-			cERROR(1, ("Invalid size SMB length %d pdu_length %d",
-					length, pdu_length+4));
+			cERROR(1, "Invalid size SMB length %d pdu_length %d",
+					length, pdu_length+4);
 			cifs_reconnect(server);
 			csocket = server->ssocket;
 			wake_up(&server->response_q);
@@ -538,8 +540,8 @@ incomplete_rcv:
 				length = 0;
 				continue;
 			} else if (length <= 0) {
-				cERROR(1, ("Received no data, expecting %d",
-					pdu_length - total_read));
+				cERROR(1, "Received no data, expecting %d",
+					pdu_length - total_read);
 				cifs_reconnect(server);
 				csocket = server->ssocket;
 				reconnect = 1;
@@ -587,7 +589,7 @@ incomplete_rcv:
 			}
 		} else {
 			if (!isLargeBuf) {
-				cERROR(1,("1st trans2 resp needs bigbuf"));
+				cERROR(1, "1st trans2 resp needs bigbuf");
 				/* BB maybe we can fix this up, switch
 				   to already allocated large buffer? */
 			} else {
@@ -629,8 +631,8 @@ multi_t2_fnd:
 				wake_up_process(task_to_wake);
 		} else if (!is_valid_oplock_break(smb_buffer, server) &&
 			   !isMultiRsp) {
-			cERROR(1, ("No task to wake, unknown frame received! "
-				   "NumMids %d", midCount.counter));
+			cERROR(1, "No task to wake, unknown frame received! "
+				   "NumMids %d", midCount.counter);
 			cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
 				      sizeof(struct smb_hdr));
 #ifdef CONFIG_CIFS_DEBUG2
@@ -707,8 +709,8 @@ multi_t2_fnd:
 		list_for_each(tmp, &server->pending_mid_q) {
 			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
 			if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
-				cFYI(1, ("Clearing Mid 0x%x - waking up ",
-					 mid_entry->mid));
+				cFYI(1, "Clearing Mid 0x%x - waking up ",
+					 mid_entry->mid);
 				task_to_wake = mid_entry->tsk;
 				if (task_to_wake)
 					wake_up_process(task_to_wake);
@@ -727,7 +729,7 @@ multi_t2_fnd:
 		   to wait at least 45 seconds before giving up
 		   on a request getting a response and going ahead
 		   and killing cifsd */
-		cFYI(1, ("Wait for exit from demultiplex thread"));
+		cFYI(1, "Wait for exit from demultiplex thread");
 		msleep(46000);
 		/* if threads still have not exited they are probably never
 		   coming home not much else we can do but free the memory */
@@ -848,7 +850,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			separator[0] = options[4];
 			options += 5;
 		} else {
-			cFYI(1, ("Null separator not allowed"));
+			cFYI(1, "Null separator not allowed");
 		}
 	}
 
@@ -973,7 +975,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			}
 		} else if (strnicmp(data, "sec", 3) == 0) {
 			if (!value || !*value) {
-				cERROR(1, ("no security value specified"));
+				cERROR(1, "no security value specified");
 				continue;
 			} else if (strnicmp(value, "krb5i", 5) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_KRB5 |
@@ -981,7 +983,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			} else if (strnicmp(value, "krb5p", 5) == 0) {
 				/* vol->secFlg |= CIFSSEC_MUST_SEAL |
 				   CIFSSEC_MAY_KRB5; */
-				cERROR(1, ("Krb5 cifs privacy not supported"));
+				cERROR(1, "Krb5 cifs privacy not supported");
 				return 1;
 			} else if (strnicmp(value, "krb5", 4) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_KRB5;
@@ -1013,7 +1015,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			} else if (strnicmp(value, "none", 4) == 0) {
 				vol->nullauth = 1;
 			} else {
-				cERROR(1, ("bad security option: %s", value));
+				cERROR(1, "bad security option: %s", value);
 				return 1;
 			}
 		} else if ((strnicmp(data, "unc", 3) == 0)
@@ -1052,7 +1054,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			   a domain name and need special handling? */
 			if (strnlen(value, 256) < 256) {
 				vol->domainname = value;
-				cFYI(1, ("Domain name set"));
+				cFYI(1, "Domain name set");
 			} else {
 				printk(KERN_WARNING "CIFS: domain name too "
 						    "long\n");
@@ -1075,7 +1077,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 					strcpy(vol->prepath+1, value);
 				} else
 					strcpy(vol->prepath, value);
-				cFYI(1, ("prefix path %s", vol->prepath));
+				cFYI(1, "prefix path %s", vol->prepath);
 			} else {
 				printk(KERN_WARNING "CIFS: prefix too long\n");
 				return 1;
@@ -1091,7 +1093,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 				vol->iocharset = value;
 				/* if iocharset not set then load_nls_default
 				   is used by caller */
-				cFYI(1, ("iocharset set to %s", value));
+				cFYI(1, "iocharset set to %s", value);
 			} else {
 				printk(KERN_WARNING "CIFS: iocharset name "
 						    "too long.\n");
@@ -1143,14 +1145,14 @@ cifs_parse_mount_options(char *options, const char *devname,
 			}
 		} else if (strnicmp(data, "sockopt", 5) == 0) {
 			if (!value || !*value) {
-				cERROR(1, ("no socket option specified"));
+				cERROR(1, "no socket option specified");
 				continue;
 			} else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
 				vol->sockopt_tcp_nodelay = 1;
 			}
 		} else if (strnicmp(data, "netbiosname", 4) == 0) {
 			if (!value || !*value || (*value == ' ')) {
-				cFYI(1, ("invalid (empty) netbiosname"));
+				cFYI(1, "invalid (empty) netbiosname");
 			} else {
 				memset(vol->source_rfc1001_name, 0x20, 15);
 				for (i = 0; i < 15; i++) {
@@ -1174,7 +1176,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 		} else if (strnicmp(data, "servern", 7) == 0) {
 			/* servernetbiosname specified override *SMBSERVER */
 			if (!value || !*value || (*value == ' ')) {
-				cFYI(1, ("empty server netbiosname specified"));
+				cFYI(1, "empty server netbiosname specified");
 			} else {
 				/* last byte, type, is 0x20 for servr type */
 				memset(vol->target_rfc1001_name, 0x20, 16);
@@ -1433,7 +1435,7 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
 
 		++server->srv_count;
 		write_unlock(&cifs_tcp_ses_lock);
-		cFYI(1, ("Existing tcp session with server found"));
+		cFYI(1, "Existing tcp session with server found");
 		return server;
 	}
 	write_unlock(&cifs_tcp_ses_lock);
@@ -1474,7 +1476,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	memset(&addr, 0, sizeof(struct sockaddr_storage));
 
-	cFYI(1, ("UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip));
+	cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
 
 	if (volume_info->UNCip && volume_info->UNC) {
 		rc = cifs_convert_address(volume_info->UNCip, &addr);
@@ -1486,13 +1488,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	} else if (volume_info->UNCip) {
 		/* BB using ip addr as tcp_ses name to connect to the
 		   DFS root below */
-		cERROR(1, ("Connecting to DFS root not implemented yet"));
+		cERROR(1, "Connecting to DFS root not implemented yet");
 		rc = -EINVAL;
 		goto out_err;
 	} else /* which tcp_sess DFS root would we conect to */ {
-		cERROR(1,
-		       ("CIFS mount error: No UNC path (e.g. -o "
-			"unc=//192.168.1.100/public) specified"));
+		cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
+			"unc=//192.168.1.100/public) specified");
 		rc = -EINVAL;
 		goto out_err;
 	}
@@ -1539,7 +1540,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	++tcp_ses->srv_count;
 
 	if (addr.ss_family == AF_INET6) {
-		cFYI(1, ("attempting ipv6 connect"));
+		cFYI(1, "attempting ipv6 connect");
 		/* BB should we allow ipv6 on port 139? */
 		/* other OS never observed in Wild doing 139 with v6 */
 		sin_server6->sin6_port = htons(volume_info->port);
@@ -1553,7 +1554,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		rc = ipv4_connect(tcp_ses);
 	}
 	if (rc < 0) {
-		cERROR(1, ("Error connecting to socket. Aborting operation"));
+		cERROR(1, "Error connecting to socket. Aborting operation");
 		goto out_err;
 	}
 
@@ -1566,7 +1567,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 				  tcp_ses, "cifsd");
 	if (IS_ERR(tcp_ses->tsk)) {
 		rc = PTR_ERR(tcp_ses->tsk);
-		cERROR(1, ("error %d create cifsd thread", rc));
+		cERROR(1, "error %d create cifsd thread", rc);
 		module_put(THIS_MODULE);
 		goto out_err;
 	}
@@ -1615,6 +1616,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 	int xid;
 	struct TCP_Server_Info *server = ses->server;
 
+	cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
 	write_lock(&cifs_tcp_ses_lock);
 	if (--ses->ses_count > 0) {
 		write_unlock(&cifs_tcp_ses_lock);
@@ -1633,6 +1635,102 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1633 cifs_put_tcp_session(server); 1635 cifs_put_tcp_session(server);
1634} 1636}
1635 1637
1638static struct cifsSesInfo *
1639cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1640{
1641 int rc = -ENOMEM, xid;
1642 struct cifsSesInfo *ses;
1643
1644 xid = GetXid();
1645
1646 ses = cifs_find_smb_ses(server, volume_info->username);
1647 if (ses) {
1648 cFYI(1, "Existing smb sess found (status=%d)", ses->status);
1649
1650 /* existing SMB ses has a server reference already */
1651 cifs_put_tcp_session(server);
1652
1653 mutex_lock(&ses->session_mutex);
1654 rc = cifs_negotiate_protocol(xid, ses);
1655 if (rc) {
1656 mutex_unlock(&ses->session_mutex);
1657 /* problem -- put our ses reference */
1658 cifs_put_smb_ses(ses);
1659 FreeXid(xid);
1660 return ERR_PTR(rc);
1661 }
1662 if (ses->need_reconnect) {
1663 cFYI(1, "Session needs reconnect");
1664 rc = cifs_setup_session(xid, ses,
1665 volume_info->local_nls);
1666 if (rc) {
1667 mutex_unlock(&ses->session_mutex);
1668 /* problem -- put our reference */
1669 cifs_put_smb_ses(ses);
1670 FreeXid(xid);
1671 return ERR_PTR(rc);
1672 }
1673 }
1674 mutex_unlock(&ses->session_mutex);
1675 FreeXid(xid);
1676 return ses;
1677 }
1678
1679 cFYI(1, "Existing smb sess not found");
1680 ses = sesInfoAlloc();
1681 if (ses == NULL)
1682 goto get_ses_fail;
1683
1684 /* new SMB session uses our server ref */
1685 ses->server = server;
1686 if (server->addr.sockAddr6.sin6_family == AF_INET6)
1687 sprintf(ses->serverName, "%pI6",
1688 &server->addr.sockAddr6.sin6_addr);
1689 else
1690 sprintf(ses->serverName, "%pI4",
1691 &server->addr.sockAddr.sin_addr.s_addr);
1692
1693 if (volume_info->username)
1694 strncpy(ses->userName, volume_info->username,
1695 MAX_USERNAME_SIZE);
1696
1697 /* volume_info->password freed at unmount */
1698 if (volume_info->password) {
1699 ses->password = kstrdup(volume_info->password, GFP_KERNEL);
1700 if (!ses->password)
1701 goto get_ses_fail;
1702 }
1703 if (volume_info->domainname) {
1704 int len = strlen(volume_info->domainname);
1705 ses->domainName = kmalloc(len + 1, GFP_KERNEL);
1706 if (ses->domainName)
1707 strcpy(ses->domainName, volume_info->domainname);
1708 }
1709 ses->linux_uid = volume_info->linux_uid;
1710 ses->overrideSecFlg = volume_info->secFlg;
1711
1712 mutex_lock(&ses->session_mutex);
1713 rc = cifs_negotiate_protocol(xid, ses);
1714 if (!rc)
1715 rc = cifs_setup_session(xid, ses, volume_info->local_nls);
1716 mutex_unlock(&ses->session_mutex);
1717 if (rc)
1718 goto get_ses_fail;
1719
1720 /* success, put it on the list */
1721 write_lock(&cifs_tcp_ses_lock);
1722 list_add(&ses->smb_ses_list, &server->smb_ses_list);
1723 write_unlock(&cifs_tcp_ses_lock);
1724
1725 FreeXid(xid);
1726 return ses;
1727
1728get_ses_fail:
1729 sesInfoFree(ses);
1730 FreeXid(xid);
1731 return ERR_PTR(rc);
1732}
1733
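The cifs_get_smb_ses() helper added above pulls the find-or-create session logic out of cifs_mount(). Its reference-counting contract is worth spelling out: the caller hands in a TCP server reference, and when an existing session is matched, that extra reference is dropped because the session already owns one. A hedged caller-side sketch of the resulting discipline (names as in this patch, error handling inside the helper):

struct cifsSesInfo *ses;

ses = cifs_get_smb_ses(server, volume_info);	/* consumes or drops
						   the server ref */
if (IS_ERR(ses))
	return PTR_ERR(ses);

/* ... use the session ... */

cifs_put_smb_ses(ses);	/* final put also puts the TCP server */
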
1636static struct cifsTconInfo * 1734static struct cifsTconInfo *
1637cifs_find_tcon(struct cifsSesInfo *ses, const char *unc) 1735cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
1638{ 1736{
@@ -1661,6 +1759,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1661 int xid; 1759 int xid;
1662 struct cifsSesInfo *ses = tcon->ses; 1760 struct cifsSesInfo *ses = tcon->ses;
1663 1761
1762 cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
1664 write_lock(&cifs_tcp_ses_lock); 1763 write_lock(&cifs_tcp_ses_lock);
1665 if (--tcon->tc_count > 0) { 1764 if (--tcon->tc_count > 0) {
1666 write_unlock(&cifs_tcp_ses_lock); 1765 write_unlock(&cifs_tcp_ses_lock);
@@ -1678,6 +1777,80 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
1678 cifs_put_smb_ses(ses); 1777 cifs_put_smb_ses(ses);
1679} 1778}
1680 1779
1780static struct cifsTconInfo *
1781cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
1782{
1783 int rc, xid;
1784 struct cifsTconInfo *tcon;
1785
1786 tcon = cifs_find_tcon(ses, volume_info->UNC);
1787 if (tcon) {
1788 cFYI(1, "Found match on UNC path");
1789 /* existing tcon already has a reference */
1790 cifs_put_smb_ses(ses);
1791 if (tcon->seal != volume_info->seal)
1792 cERROR(1, "transport encryption setting "
1793 "conflicts with existing tid");
1794 return tcon;
1795 }
1796
1797 tcon = tconInfoAlloc();
1798 if (tcon == NULL) {
1799 rc = -ENOMEM;
1800 goto out_fail;
1801 }
1802
1803 tcon->ses = ses;
1804 if (volume_info->password) {
1805 tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
1806 if (!tcon->password) {
1807 rc = -ENOMEM;
1808 goto out_fail;
1809 }
1810 }
1811
1812 if (strchr(volume_info->UNC + 3, '\\') == NULL
1813 && strchr(volume_info->UNC + 3, '/') == NULL) {
1814 cERROR(1, "Missing share name");
1815 rc = -ENODEV;
1816 goto out_fail;
1817 }
1818
1819 /* BB Do we need to wrap session_mutex around
1820 * this TCon call and Unix SetFS as
1821 * we do on SessSetup and reconnect? */
1822 xid = GetXid();
1823 rc = CIFSTCon(xid, ses, volume_info->UNC, tcon, volume_info->local_nls);
1824 FreeXid(xid);
1825 cFYI(1, "CIFS Tcon rc = %d", rc);
1826 if (rc)
1827 goto out_fail;
1828
1829 if (volume_info->nodfs) {
1830 tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
1831 cFYI(1, "DFS disabled (%d)", tcon->Flags);
1832 }
1833 tcon->seal = volume_info->seal;
1834 /* we can have only one retry value for a connection
1835 to a share so for resources mounted more than once
1836 to the same server share the last value passed in
1837 for the retry flag is used */
1838 tcon->retry = volume_info->retry;
1839 tcon->nocase = volume_info->nocase;
1840 tcon->local_lease = volume_info->local_lease;
1841
1842 write_lock(&cifs_tcp_ses_lock);
1843 list_add(&tcon->tcon_list, &ses->tcon_list);
1844 write_unlock(&cifs_tcp_ses_lock);
1845
1846 return tcon;
1847
1848out_fail:
1849 tconInfoFree(tcon);
1850 return ERR_PTR(rc);
1851}
1852
1853
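cifs_get_tcon() follows the same pattern one level up, and its ownership rules — as read from the hunk above, they are not separately documented — are asymmetric: on a UNC match the extra session reference is dropped and the existing tcon returned; on failure the half-built tcon is freed but the session reference stays with the caller, which still needs it for its own error path. A sketch:

tcon = cifs_get_tcon(ses, volume_info);
if (IS_ERR(tcon)) {
	rc = PTR_ERR(tcon);
	tcon = NULL;		/* ses is still ours to put */
	goto remote_path_check;
}

Note also the behavioural wrinkle carried over from the old inline code: a conflicting "seal" option against an already-mounted share only logs an error, since the existing tid's encryption setting cannot be changed.
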
1681int 1854int
1682get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, 1855get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
1683 const struct nls_table *nls_codepage, unsigned int *pnum_referrals, 1856 const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
@@ -1702,8 +1875,7 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
1702 strcpy(temp_unc + 2, pSesInfo->serverName); 1875 strcpy(temp_unc + 2, pSesInfo->serverName);
1703 strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$"); 1876 strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$");
1704 rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage); 1877 rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage);
1705 cFYI(1, 1878 cFYI(1, "CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid);
1706 ("CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid));
1707 kfree(temp_unc); 1879 kfree(temp_unc);
1708 } 1880 }
1709 if (rc == 0) 1881 if (rc == 0)
@@ -1776,12 +1948,12 @@ ipv4_connect(struct TCP_Server_Info *server)
1776 rc = sock_create_kern(PF_INET, SOCK_STREAM, 1948 rc = sock_create_kern(PF_INET, SOCK_STREAM,
1777 IPPROTO_TCP, &socket); 1949 IPPROTO_TCP, &socket);
1778 if (rc < 0) { 1950 if (rc < 0) {
1779 cERROR(1, ("Error %d creating socket", rc)); 1951 cERROR(1, "Error %d creating socket", rc);
1780 return rc; 1952 return rc;
1781 } 1953 }
1782 1954
1783 /* BB other socket options to set KEEPALIVE, NODELAY? */ 1955 /* BB other socket options to set KEEPALIVE, NODELAY? */
1784 cFYI(1, ("Socket created")); 1956 cFYI(1, "Socket created");
1785 server->ssocket = socket; 1957 server->ssocket = socket;
1786 socket->sk->sk_allocation = GFP_NOFS; 1958 socket->sk->sk_allocation = GFP_NOFS;
1787 cifs_reclassify_socket4(socket); 1959 cifs_reclassify_socket4(socket);
@@ -1826,7 +1998,7 @@ ipv4_connect(struct TCP_Server_Info *server)
1826 if (!connected) { 1998 if (!connected) {
1827 if (orig_port) 1999 if (orig_port)
1828 server->addr.sockAddr.sin_port = orig_port; 2000 server->addr.sockAddr.sin_port = orig_port;
1829 cFYI(1, ("Error %d connecting to server via ipv4", rc)); 2001 cFYI(1, "Error %d connecting to server via ipv4", rc);
1830 sock_release(socket); 2002 sock_release(socket);
1831 server->ssocket = NULL; 2003 server->ssocket = NULL;
1832 return rc; 2004 return rc;
@@ -1854,12 +2026,12 @@ ipv4_connect(struct TCP_Server_Info *server)
1854 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2026 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
1855 (char *)&val, sizeof(val)); 2027 (char *)&val, sizeof(val));
1856 if (rc) 2028 if (rc)
1857 cFYI(1, ("set TCP_NODELAY socket option error %d", rc)); 2029 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
1858 } 2030 }
1859 2031
1860 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx", 2032 cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
1861 socket->sk->sk_sndbuf, 2033 socket->sk->sk_sndbuf,
1862 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo)); 2034 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
1863 2035
1864 /* send RFC1001 sessinit */ 2036 /* send RFC1001 sessinit */
1865 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) { 2037 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) {
@@ -1937,13 +2109,13 @@ ipv6_connect(struct TCP_Server_Info *server)
1937 rc = sock_create_kern(PF_INET6, SOCK_STREAM, 2109 rc = sock_create_kern(PF_INET6, SOCK_STREAM,
1938 IPPROTO_TCP, &socket); 2110 IPPROTO_TCP, &socket);
1939 if (rc < 0) { 2111 if (rc < 0) {
1940 cERROR(1, ("Error %d creating ipv6 socket", rc)); 2112 cERROR(1, "Error %d creating ipv6 socket", rc);
1941 socket = NULL; 2113 socket = NULL;
1942 return rc; 2114 return rc;
1943 } 2115 }
1944 2116
1945 /* BB other socket options to set KEEPALIVE, NODELAY? */ 2117 /* BB other socket options to set KEEPALIVE, NODELAY? */
1946 cFYI(1, ("ipv6 Socket created")); 2118 cFYI(1, "ipv6 Socket created");
1947 server->ssocket = socket; 2119 server->ssocket = socket;
1948 socket->sk->sk_allocation = GFP_NOFS; 2120 socket->sk->sk_allocation = GFP_NOFS;
1949 cifs_reclassify_socket6(socket); 2121 cifs_reclassify_socket6(socket);
@@ -1987,7 +2159,7 @@ ipv6_connect(struct TCP_Server_Info *server)
1987 if (!connected) { 2159 if (!connected) {
1988 if (orig_port) 2160 if (orig_port)
1989 server->addr.sockAddr6.sin6_port = orig_port; 2161 server->addr.sockAddr6.sin6_port = orig_port;
1990 cFYI(1, ("Error %d connecting to server via ipv6", rc)); 2162 cFYI(1, "Error %d connecting to server via ipv6", rc);
1991 sock_release(socket); 2163 sock_release(socket);
1992 server->ssocket = NULL; 2164 server->ssocket = NULL;
1993 return rc; 2165 return rc;
@@ -2006,7 +2178,7 @@ ipv6_connect(struct TCP_Server_Info *server)
2006 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, 2178 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2007 (char *)&val, sizeof(val)); 2179 (char *)&val, sizeof(val));
2008 if (rc) 2180 if (rc)
2009 cFYI(1, ("set TCP_NODELAY socket option error %d", rc)); 2181 cFYI(1, "set TCP_NODELAY socket option error %d", rc);
2010 } 2182 }
2011 2183
2012 server->ssocket = socket; 2184 server->ssocket = socket;
@@ -2031,13 +2203,13 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2031 if (vol_info && vol_info->no_linux_ext) { 2203 if (vol_info && vol_info->no_linux_ext) {
2032 tcon->fsUnixInfo.Capability = 0; 2204 tcon->fsUnixInfo.Capability = 0;
2033 tcon->unix_ext = 0; /* Unix Extensions disabled */ 2205 tcon->unix_ext = 0; /* Unix Extensions disabled */
2034 cFYI(1, ("Linux protocol extensions disabled")); 2206 cFYI(1, "Linux protocol extensions disabled");
2035 return; 2207 return;
2036 } else if (vol_info) 2208 } else if (vol_info)
2037 tcon->unix_ext = 1; /* Unix Extensions supported */ 2209 tcon->unix_ext = 1; /* Unix Extensions supported */
2038 2210
2039 if (tcon->unix_ext == 0) { 2211 if (tcon->unix_ext == 0) {
2040 cFYI(1, ("Unix extensions disabled so not set on reconnect")); 2212 cFYI(1, "Unix extensions disabled so not set on reconnect");
2041 return; 2213 return;
2042 } 2214 }
2043 2215
@@ -2053,12 +2225,11 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2053 cap &= ~CIFS_UNIX_POSIX_ACL_CAP; 2225 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
2054 if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { 2226 if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
2055 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) 2227 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
2056 cERROR(1, ("POSIXPATH support change")); 2228 cERROR(1, "POSIXPATH support change");
2057 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; 2229 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
2058 } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { 2230 } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
2059 cERROR(1, ("possible reconnect error")); 2231 cERROR(1, "possible reconnect error");
2060 cERROR(1, 2232 cERROR(1, "server disabled POSIX path support");
2061 ("server disabled POSIX path support"));
2062 } 2233 }
2063 } 2234 }
2064 2235
@@ -2066,7 +2237,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2066 if (vol_info && vol_info->no_psx_acl) 2237 if (vol_info && vol_info->no_psx_acl)
2067 cap &= ~CIFS_UNIX_POSIX_ACL_CAP; 2238 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
2068 else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { 2239 else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
2069 cFYI(1, ("negotiated posix acl support")); 2240 cFYI(1, "negotiated posix acl support");
2070 if (sb) 2241 if (sb)
2071 sb->s_flags |= MS_POSIXACL; 2242 sb->s_flags |= MS_POSIXACL;
2072 } 2243 }
@@ -2074,7 +2245,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2074 if (vol_info && vol_info->posix_paths == 0) 2245 if (vol_info && vol_info->posix_paths == 0)
2075 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; 2246 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
2076 else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { 2247 else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
2077 cFYI(1, ("negotiate posix pathnames")); 2248 cFYI(1, "negotiate posix pathnames");
2078 if (sb) 2249 if (sb)
2079 CIFS_SB(sb)->mnt_cifs_flags |= 2250 CIFS_SB(sb)->mnt_cifs_flags |=
2080 CIFS_MOUNT_POSIX_PATHS; 2251 CIFS_MOUNT_POSIX_PATHS;
@@ -2089,39 +2260,38 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
2089 if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) { 2260 if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
2090 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { 2261 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
2091 CIFS_SB(sb)->rsize = 127 * 1024; 2262 CIFS_SB(sb)->rsize = 127 * 1024;
2092 cFYI(DBG2, 2263 cFYI(DBG2, "larger reads not supported by srv");
2093 ("larger reads not supported by srv"));
2094 } 2264 }
2095 } 2265 }
2096 2266
2097 2267
2098 cFYI(1, ("Negotiate caps 0x%x", (int)cap)); 2268 cFYI(1, "Negotiate caps 0x%x", (int)cap);
2099#ifdef CONFIG_CIFS_DEBUG2 2269#ifdef CONFIG_CIFS_DEBUG2
2100 if (cap & CIFS_UNIX_FCNTL_CAP) 2270 if (cap & CIFS_UNIX_FCNTL_CAP)
2101 cFYI(1, ("FCNTL cap")); 2271 cFYI(1, "FCNTL cap");
2102 if (cap & CIFS_UNIX_EXTATTR_CAP) 2272 if (cap & CIFS_UNIX_EXTATTR_CAP)
2103 cFYI(1, ("EXTATTR cap")); 2273 cFYI(1, "EXTATTR cap");
2104 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) 2274 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
2105 cFYI(1, ("POSIX path cap")); 2275 cFYI(1, "POSIX path cap");
2106 if (cap & CIFS_UNIX_XATTR_CAP) 2276 if (cap & CIFS_UNIX_XATTR_CAP)
2107 cFYI(1, ("XATTR cap")); 2277 cFYI(1, "XATTR cap");
2108 if (cap & CIFS_UNIX_POSIX_ACL_CAP) 2278 if (cap & CIFS_UNIX_POSIX_ACL_CAP)
2109 cFYI(1, ("POSIX ACL cap")); 2279 cFYI(1, "POSIX ACL cap");
2110 if (cap & CIFS_UNIX_LARGE_READ_CAP) 2280 if (cap & CIFS_UNIX_LARGE_READ_CAP)
2111 cFYI(1, ("very large read cap")); 2281 cFYI(1, "very large read cap");
2112 if (cap & CIFS_UNIX_LARGE_WRITE_CAP) 2282 if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
2113 cFYI(1, ("very large write cap")); 2283 cFYI(1, "very large write cap");
2114#endif /* CIFS_DEBUG2 */ 2284#endif /* CIFS_DEBUG2 */
2115 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { 2285 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
2116 if (vol_info == NULL) { 2286 if (vol_info == NULL) {
2117 cFYI(1, ("resetting capabilities failed")); 2287 cFYI(1, "resetting capabilities failed");
2118 } else 2288 } else
2119 cERROR(1, ("Negotiating Unix capabilities " 2289 cERROR(1, "Negotiating Unix capabilities "
2120 "with the server failed. Consider " 2290 "with the server failed. Consider "
2121 "mounting with the Unix Extensions\n" 2291 "mounting with the Unix Extensions\n"
2122 "disabled, if problems are found, " 2292 "disabled, if problems are found, "
2123 "by specifying the nounix mount " 2293 "by specifying the nounix mount "
2124 "option.")); 2294 "option.");
2125 2295
2126 } 2296 }
2127 } 2297 }
@@ -2151,8 +2321,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2151 struct cifs_sb_info *cifs_sb) 2321 struct cifs_sb_info *cifs_sb)
2152{ 2322{
2153 if (pvolume_info->rsize > CIFSMaxBufSize) { 2323 if (pvolume_info->rsize > CIFSMaxBufSize) {
2154 cERROR(1, ("rsize %d too large, using MaxBufSize", 2324 cERROR(1, "rsize %d too large, using MaxBufSize",
2155 pvolume_info->rsize)); 2325 pvolume_info->rsize);
2156 cifs_sb->rsize = CIFSMaxBufSize; 2326 cifs_sb->rsize = CIFSMaxBufSize;
2157 } else if ((pvolume_info->rsize) && 2327 } else if ((pvolume_info->rsize) &&
2158 (pvolume_info->rsize <= CIFSMaxBufSize)) 2328 (pvolume_info->rsize <= CIFSMaxBufSize))
@@ -2161,8 +2331,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2161 cifs_sb->rsize = CIFSMaxBufSize; 2331 cifs_sb->rsize = CIFSMaxBufSize;
2162 2332
2163 if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) { 2333 if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
2164 cERROR(1, ("wsize %d too large, using 4096 instead", 2334 cERROR(1, "wsize %d too large, using 4096 instead",
2165 pvolume_info->wsize)); 2335 pvolume_info->wsize);
2166 cifs_sb->wsize = 4096; 2336 cifs_sb->wsize = 4096;
2167 } else if (pvolume_info->wsize) 2337 } else if (pvolume_info->wsize)
2168 cifs_sb->wsize = pvolume_info->wsize; 2338 cifs_sb->wsize = pvolume_info->wsize;
@@ -2180,7 +2350,7 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2180 if (cifs_sb->rsize < 2048) { 2350 if (cifs_sb->rsize < 2048) {
2181 cifs_sb->rsize = 2048; 2351 cifs_sb->rsize = 2048;
2182 /* Windows ME may prefer this */ 2352 /* Windows ME may prefer this */
2183 cFYI(1, ("readsize set to minimum: 2048")); 2353 cFYI(1, "readsize set to minimum: 2048");
2184 } 2354 }
2185 /* calculate prepath */ 2355 /* calculate prepath */
2186 cifs_sb->prepath = pvolume_info->prepath; 2356 cifs_sb->prepath = pvolume_info->prepath;
@@ -2198,8 +2368,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2198 cifs_sb->mnt_gid = pvolume_info->linux_gid; 2368 cifs_sb->mnt_gid = pvolume_info->linux_gid;
2199 cifs_sb->mnt_file_mode = pvolume_info->file_mode; 2369 cifs_sb->mnt_file_mode = pvolume_info->file_mode;
2200 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; 2370 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
2201 cFYI(1, ("file mode: 0x%x dir mode: 0x%x", 2371 cFYI(1, "file mode: 0x%x dir mode: 0x%x",
2202 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode)); 2372 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
2203 2373
2204 if (pvolume_info->noperm) 2374 if (pvolume_info->noperm)
2205 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 2375 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
@@ -2228,13 +2398,13 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2228 if (pvolume_info->dynperm) 2398 if (pvolume_info->dynperm)
2229 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; 2399 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
2230 if (pvolume_info->direct_io) { 2400 if (pvolume_info->direct_io) {
2231 cFYI(1, ("mounting share using direct i/o")); 2401 cFYI(1, "mounting share using direct i/o");
2232 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2402 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
2233 } 2403 }
2234 2404
2235 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) 2405 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
2236 cERROR(1, ("mount option dynperm ignored if cifsacl " 2406 cERROR(1, "mount option dynperm ignored if cifsacl "
2237 "mount option supported")); 2407 "mount option supported");
2238} 2408}
2239 2409
2240static int 2410static int
@@ -2261,7 +2431,7 @@ cleanup_volume_info(struct smb_vol **pvolume_info)
2261{ 2431{
2262 struct smb_vol *volume_info; 2432 struct smb_vol *volume_info;
2263 2433
2264 if (!pvolume_info && !*pvolume_info) 2434 if (!pvolume_info || !*pvolume_info)
2265 return; 2435 return;
2266 2436
2267 volume_info = *pvolume_info; 2437 volume_info = *pvolume_info;
@@ -2343,11 +2513,11 @@ try_mount_again:
2343 } 2513 }
2344 2514
2345 if (volume_info->nullauth) { 2515 if (volume_info->nullauth) {
2346 cFYI(1, ("null user")); 2516 cFYI(1, "null user");
2347 volume_info->username = ""; 2517 volume_info->username = "";
2348 } else if (volume_info->username) { 2518 } else if (volume_info->username) {
2349 /* BB fixme parse for domain name here */ 2519 /* BB fixme parse for domain name here */
2350 cFYI(1, ("Username: %s", volume_info->username)); 2520 cFYI(1, "Username: %s", volume_info->username);
2351 } else { 2521 } else {
2352 cifserror("No username specified"); 2522 cifserror("No username specified");
2353 /* In userspace mount helper we can get user name from alternate 2523 /* In userspace mount helper we can get user name from alternate
@@ -2356,20 +2526,20 @@ try_mount_again:
2356 goto out; 2526 goto out;
2357 } 2527 }
2358 2528
2359
2360 /* this is needed for ASCII cp to Unicode converts */ 2529 /* this is needed for ASCII cp to Unicode converts */
2361 if (volume_info->iocharset == NULL) { 2530 if (volume_info->iocharset == NULL) {
2362 cifs_sb->local_nls = load_nls_default(); 2531 /* load_nls_default cannot return null */
2363 /* load_nls_default can not return null */ 2532 volume_info->local_nls = load_nls_default();
2364 } else { 2533 } else {
2365 cifs_sb->local_nls = load_nls(volume_info->iocharset); 2534 volume_info->local_nls = load_nls(volume_info->iocharset);
2366 if (cifs_sb->local_nls == NULL) { 2535 if (volume_info->local_nls == NULL) {
2367 cERROR(1, ("CIFS mount error: iocharset %s not found", 2536 cERROR(1, "CIFS mount error: iocharset %s not found",
2368 volume_info->iocharset)); 2537 volume_info->iocharset);
2369 rc = -ELIBACC; 2538 rc = -ELIBACC;
2370 goto out; 2539 goto out;
2371 } 2540 }
2372 } 2541 }
2542 cifs_sb->local_nls = volume_info->local_nls;
2373 2543
2374 /* get a reference to a tcp session */ 2544 /* get a reference to a tcp session */
2375 srvTcp = cifs_get_tcp_session(volume_info); 2545 srvTcp = cifs_get_tcp_session(volume_info);
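
A small but real behavioural change hides in the iocharset hunk above: the nls table is now resolved into volume_info before any session work — presumably because cifs_get_smb_ses() only receives volume_info and reads volume_info->local_nls for authentication — while cifs_sb afterwards keeps a shared pointer. Condensed (early return stands in for the function's goto out):

if (volume_info->iocharset == NULL)
	volume_info->local_nls = load_nls_default();	/* never NULL */
else {
	volume_info->local_nls = load_nls(volume_info->iocharset);
	if (volume_info->local_nls == NULL)
		return -ELIBACC;	/* iocharset not found */
}
cifs_sb->local_nls = volume_info->local_nls;	/* shared, not copied */
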
@@ -2378,148 +2548,30 @@ try_mount_again:
2378 goto out; 2548 goto out;
2379 } 2549 }
2380 2550
2381 pSesInfo = cifs_find_smb_ses(srvTcp, volume_info->username); 2551 /* get a reference to a SMB session */
2382 if (pSesInfo) { 2552 pSesInfo = cifs_get_smb_ses(srvTcp, volume_info);
2383 cFYI(1, ("Existing smb sess found (status=%d)", 2553 if (IS_ERR(pSesInfo)) {
2384 pSesInfo->status)); 2554 rc = PTR_ERR(pSesInfo);
2385 /* 2555 pSesInfo = NULL;
2386 * The existing SMB session already has a reference to srvTcp, 2556 goto mount_fail_check;
2387 * so we can put back the extra one we got before
2388 */
2389 cifs_put_tcp_session(srvTcp);
2390
2391 mutex_lock(&pSesInfo->session_mutex);
2392 if (pSesInfo->need_reconnect) {
2393 cFYI(1, ("Session needs reconnect"));
2394 rc = cifs_setup_session(xid, pSesInfo,
2395 cifs_sb->local_nls);
2396 }
2397 mutex_unlock(&pSesInfo->session_mutex);
2398 } else if (!rc) {
2399 cFYI(1, ("Existing smb sess not found"));
2400 pSesInfo = sesInfoAlloc();
2401 if (pSesInfo == NULL) {
2402 rc = -ENOMEM;
2403 goto mount_fail_check;
2404 }
2405
2406 /* new SMB session uses our srvTcp ref */
2407 pSesInfo->server = srvTcp;
2408 if (srvTcp->addr.sockAddr6.sin6_family == AF_INET6)
2409 sprintf(pSesInfo->serverName, "%pI6",
2410 &srvTcp->addr.sockAddr6.sin6_addr);
2411 else
2412 sprintf(pSesInfo->serverName, "%pI4",
2413 &srvTcp->addr.sockAddr.sin_addr.s_addr);
2414
2415 write_lock(&cifs_tcp_ses_lock);
2416 list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
2417 write_unlock(&cifs_tcp_ses_lock);
2418
2419 /* volume_info->password freed at unmount */
2420 if (volume_info->password) {
2421 pSesInfo->password = kstrdup(volume_info->password,
2422 GFP_KERNEL);
2423 if (!pSesInfo->password) {
2424 rc = -ENOMEM;
2425 goto mount_fail_check;
2426 }
2427 }
2428 if (volume_info->username)
2429 strncpy(pSesInfo->userName, volume_info->username,
2430 MAX_USERNAME_SIZE);
2431 if (volume_info->domainname) {
2432 int len = strlen(volume_info->domainname);
2433 pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
2434 if (pSesInfo->domainName)
2435 strcpy(pSesInfo->domainName,
2436 volume_info->domainname);
2437 }
2438 pSesInfo->linux_uid = volume_info->linux_uid;
2439 pSesInfo->overrideSecFlg = volume_info->secFlg;
2440 mutex_lock(&pSesInfo->session_mutex);
2441
2442 /* BB FIXME need to pass vol->secFlgs BB */
2443 rc = cifs_setup_session(xid, pSesInfo,
2444 cifs_sb->local_nls);
2445 mutex_unlock(&pSesInfo->session_mutex);
2446 } 2557 }
2447 2558
2448 /* search for existing tcon to this server share */ 2559 setup_cifs_sb(volume_info, cifs_sb);
2449 if (!rc) { 2560 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2450 setup_cifs_sb(volume_info, cifs_sb); 2561 sb->s_maxbytes = MAX_LFS_FILESIZE;
2451 2562 else
2452 tcon = cifs_find_tcon(pSesInfo, volume_info->UNC); 2563 sb->s_maxbytes = MAX_NON_LFS;
2453 if (tcon) {
2454 cFYI(1, ("Found match on UNC path"));
2455 /* existing tcon already has a reference */
2456 cifs_put_smb_ses(pSesInfo);
2457 if (tcon->seal != volume_info->seal)
2458 cERROR(1, ("transport encryption setting "
2459 "conflicts with existing tid"));
2460 } else {
2461 tcon = tconInfoAlloc();
2462 if (tcon == NULL) {
2463 rc = -ENOMEM;
2464 goto mount_fail_check;
2465 }
2466
2467 tcon->ses = pSesInfo;
2468 if (volume_info->password) {
2469 tcon->password = kstrdup(volume_info->password,
2470 GFP_KERNEL);
2471 if (!tcon->password) {
2472 rc = -ENOMEM;
2473 goto mount_fail_check;
2474 }
2475 }
2476
2477 if ((strchr(volume_info->UNC + 3, '\\') == NULL)
2478 && (strchr(volume_info->UNC + 3, '/') == NULL)) {
2479 cERROR(1, ("Missing share name"));
2480 rc = -ENODEV;
2481 goto mount_fail_check;
2482 } else {
2483 /* BB Do we need to wrap sesSem around
2484 * this TCon call and Unix SetFS as
2485 * we do on SessSetup and reconnect? */
2486 rc = CIFSTCon(xid, pSesInfo, volume_info->UNC,
2487 tcon, cifs_sb->local_nls);
2488 cFYI(1, ("CIFS Tcon rc = %d", rc));
2489 if (volume_info->nodfs) {
2490 tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
2491 cFYI(1, ("DFS disabled (%d)",
2492 tcon->Flags));
2493 }
2494 }
2495 if (rc)
2496 goto remote_path_check;
2497 tcon->seal = volume_info->seal;
2498 write_lock(&cifs_tcp_ses_lock);
2499 list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
2500 write_unlock(&cifs_tcp_ses_lock);
2501 }
2502
2503 /* we can have only one retry value for a connection
2504 to a share so for resources mounted more than once
2505 to the same server share the last value passed in
2506 for the retry flag is used */
2507 tcon->retry = volume_info->retry;
2508 tcon->nocase = volume_info->nocase;
2509 tcon->local_lease = volume_info->local_lease;
2510 }
2511 if (pSesInfo) {
2512 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2513 sb->s_maxbytes = MAX_LFS_FILESIZE;
2514 else
2515 sb->s_maxbytes = MAX_NON_LFS;
2516 }
2517 2564
2518 /* BB FIXME fix time_gran to be larger for LANMAN sessions */ 2565 /* BB FIXME fix time_gran to be larger for LANMAN sessions */
2519 sb->s_time_gran = 100; 2566 sb->s_time_gran = 100;
2520 2567
2521 if (rc) 2568 /* search for existing tcon to this server share */
2569 tcon = cifs_get_tcon(pSesInfo, volume_info);
2570 if (IS_ERR(tcon)) {
2571 rc = PTR_ERR(tcon);
2572 tcon = NULL;
2522 goto remote_path_check; 2573 goto remote_path_check;
2574 }
2523 2575
2524 cifs_sb->tcon = tcon; 2576 cifs_sb->tcon = tcon;
2525 2577
@@ -2543,7 +2595,7 @@ try_mount_again:
2543 2595
2544 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { 2596 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
2545 cifs_sb->rsize = 1024 * 127; 2597 cifs_sb->rsize = 1024 * 127;
2546 cFYI(DBG2, ("no very large read support, rsize now 127K")); 2598 cFYI(DBG2, "no very large read support, rsize now 127K");
2547 } 2599 }
2548 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X)) 2600 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
2549 cifs_sb->wsize = min(cifs_sb->wsize, 2601 cifs_sb->wsize = min(cifs_sb->wsize,
@@ -2592,7 +2644,7 @@ remote_path_check:
2592 goto mount_fail_check; 2644 goto mount_fail_check;
2593 } 2645 }
2594 2646
2595 cFYI(1, ("Getting referral for: %s", full_path)); 2647 cFYI(1, "Getting referral for: %s", full_path);
2596 rc = get_dfs_path(xid, pSesInfo , full_path + 1, 2648 rc = get_dfs_path(xid, pSesInfo , full_path + 1,
2597 cifs_sb->local_nls, &num_referrals, &referrals, 2649 cifs_sb->local_nls, &num_referrals, &referrals,
2598 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 2650 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -2706,7 +2758,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2706 by Samba (not sure whether other servers allow 2758 by Samba (not sure whether other servers allow
2707 NTLMv2 password here) */ 2759 NTLMv2 password here) */
2708#ifdef CONFIG_CIFS_WEAK_PW_HASH 2760#ifdef CONFIG_CIFS_WEAK_PW_HASH
2709 if ((extended_security & CIFSSEC_MAY_LANMAN) && 2761 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
2710 (ses->server->secType == LANMAN)) 2762 (ses->server->secType == LANMAN))
2711 calc_lanman_hash(tcon->password, ses->server->cryptKey, 2763 calc_lanman_hash(tcon->password, ses->server->cryptKey,
2712 ses->server->secMode & 2764 ses->server->secMode &
@@ -2777,13 +2829,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2777 if (length == 3) { 2829 if (length == 3) {
2778 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && 2830 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
2779 (bcc_ptr[2] == 'C')) { 2831 (bcc_ptr[2] == 'C')) {
2780 cFYI(1, ("IPC connection")); 2832 cFYI(1, "IPC connection");
2781 tcon->ipc = 1; 2833 tcon->ipc = 1;
2782 } 2834 }
2783 } else if (length == 2) { 2835 } else if (length == 2) {
2784 if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { 2836 if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
2785 /* the most common case */ 2837 /* the most common case */
2786 cFYI(1, ("disk share connection")); 2838 cFYI(1, "disk share connection");
2787 } 2839 }
2788 } 2840 }
2789 bcc_ptr += length + 1; 2841 bcc_ptr += length + 1;
@@ -2796,7 +2848,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2796 bytes_left, is_unicode, 2848 bytes_left, is_unicode,
2797 nls_codepage); 2849 nls_codepage);
2798 2850
2799 cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem)); 2851 cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem);
2800 2852
2801 if ((smb_buffer_response->WordCount == 3) || 2853 if ((smb_buffer_response->WordCount == 3) ||
2802 (smb_buffer_response->WordCount == 7)) 2854 (smb_buffer_response->WordCount == 7))
@@ -2804,7 +2856,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2804 tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); 2856 tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
2805 else 2857 else
2806 tcon->Flags = 0; 2858 tcon->Flags = 0;
2807 cFYI(1, ("Tcon flags: 0x%x ", tcon->Flags)); 2859 cFYI(1, "Tcon flags: 0x%x ", tcon->Flags);
2808 } else if ((rc == 0) && tcon == NULL) { 2860 } else if ((rc == 0) && tcon == NULL) {
2809 /* all we need to save for IPC$ connection */ 2861 /* all we need to save for IPC$ connection */
2810 ses->ipc_tid = smb_buffer_response->Tid; 2862 ses->ipc_tid = smb_buffer_response->Tid;
@@ -2832,57 +2884,61 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
2832 return rc; 2884 return rc;
2833} 2885}
2834 2886
2835int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, 2887int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
2836 struct nls_table *nls_info)
2837{ 2888{
2838 int rc = 0; 2889 int rc = 0;
2839 int first_time = 0; 2890 struct TCP_Server_Info *server = ses->server;
2840 struct TCP_Server_Info *server = pSesInfo->server; 2891
2841 2892 /* only send once per connect */
2842 /* what if server changes its buffer size after dropping the session? */ 2893 if (server->maxBuf != 0)
2843 if (server->maxBuf == 0) /* no need to send on reconnect */ { 2894 return 0;
2844 rc = CIFSSMBNegotiate(xid, pSesInfo); 2895
2845 if (rc == -EAGAIN) { 2896 rc = CIFSSMBNegotiate(xid, ses);
2846 /* retry only once on 1st time connection */ 2897 if (rc == -EAGAIN) {
2847 rc = CIFSSMBNegotiate(xid, pSesInfo); 2898 /* retry only once on 1st time connection */
2848 if (rc == -EAGAIN) 2899 rc = CIFSSMBNegotiate(xid, ses);
2849 rc = -EHOSTDOWN; 2900 if (rc == -EAGAIN)
2850 } 2901 rc = -EHOSTDOWN;
2851 if (rc == 0) { 2902 }
2852 spin_lock(&GlobalMid_Lock); 2903 if (rc == 0) {
2853 if (server->tcpStatus != CifsExiting) 2904 spin_lock(&GlobalMid_Lock);
2854 server->tcpStatus = CifsGood; 2905 if (server->tcpStatus != CifsExiting)
2855 else 2906 server->tcpStatus = CifsGood;
2856 rc = -EHOSTDOWN; 2907 else
2857 spin_unlock(&GlobalMid_Lock); 2908 rc = -EHOSTDOWN;
2909 spin_unlock(&GlobalMid_Lock);
2858 2910
2859 }
2860 first_time = 1;
2861 } 2911 }
2862 2912
2863 if (rc) 2913 return rc;
2864 goto ss_err_exit; 2914}
2915
2916
2917int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
2918 struct nls_table *nls_info)
2919{
2920 int rc = 0;
2921 struct TCP_Server_Info *server = ses->server;
2865 2922
2866 pSesInfo->flags = 0; 2923 ses->flags = 0;
2867 pSesInfo->capabilities = server->capabilities; 2924 ses->capabilities = server->capabilities;
2868 if (linuxExtEnabled == 0) 2925 if (linuxExtEnabled == 0)
2869 pSesInfo->capabilities &= (~CAP_UNIX); 2926 ses->capabilities &= (~CAP_UNIX);
2870 2927
2871 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", 2928 cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
2872 server->secMode, server->capabilities, server->timeAdj)); 2929 server->secMode, server->capabilities, server->timeAdj);
2873 2930
2874 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); 2931 rc = CIFS_SessSetup(xid, ses, nls_info);
2875 if (rc) { 2932 if (rc) {
2876 cERROR(1, ("Send error in SessSetup = %d", rc)); 2933 cERROR(1, "Send error in SessSetup = %d", rc);
2877 } else { 2934 } else {
2878 cFYI(1, ("CIFS Session Established successfully")); 2935 cFYI(1, "CIFS Session Established successfully");
2879 spin_lock(&GlobalMid_Lock); 2936 spin_lock(&GlobalMid_Lock);
2880 pSesInfo->status = CifsGood; 2937 ses->status = CifsGood;
2881 pSesInfo->need_reconnect = false; 2938 ses->need_reconnect = false;
2882 spin_unlock(&GlobalMid_Lock); 2939 spin_unlock(&GlobalMid_Lock);
2883 } 2940 }
2884 2941
2885ss_err_exit:
2886 return rc; 2942 return rc;
2887} 2943}
2888 2944
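The old cifs_setup_session() mixed per-TCP-connection protocol negotiation with per-session authentication, using a first_time flag to tell the two apart (note that CIFS_SessSetup() loses that argument above). The split makes the ordering explicit and idempotent: cifs_negotiate_protocol() returns immediately once server->maxBuf is non-zero. Call sites now follow the shape already seen in the cifs_get_smb_ses() hunk earlier in this diff:

mutex_lock(&ses->session_mutex);
rc = cifs_negotiate_protocol(xid, ses);	/* no-op after first success */
if (!rc)
	rc = cifs_setup_session(xid, ses, volume_info->local_nls);
mutex_unlock(&ses->session_mutex);
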
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6ccf7262d1b7..391816b461ca 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -73,7 +73,7 @@ cifs_bp_rename_retry:
73 namelen += (1 + temp->d_name.len); 73 namelen += (1 + temp->d_name.len);
74 temp = temp->d_parent; 74 temp = temp->d_parent;
75 if (temp == NULL) { 75 if (temp == NULL) {
76 cERROR(1, ("corrupt dentry")); 76 cERROR(1, "corrupt dentry");
77 return NULL; 77 return NULL;
78 } 78 }
79 } 79 }
@@ -90,19 +90,18 @@ cifs_bp_rename_retry:
90 full_path[namelen] = dirsep; 90 full_path[namelen] = dirsep;
91 strncpy(full_path + namelen + 1, temp->d_name.name, 91 strncpy(full_path + namelen + 1, temp->d_name.name,
92 temp->d_name.len); 92 temp->d_name.len);
93 cFYI(0, ("name: %s", full_path + namelen)); 93 cFYI(0, "name: %s", full_path + namelen);
94 } 94 }
95 temp = temp->d_parent; 95 temp = temp->d_parent;
96 if (temp == NULL) { 96 if (temp == NULL) {
97 cERROR(1, ("corrupt dentry")); 97 cERROR(1, "corrupt dentry");
98 kfree(full_path); 98 kfree(full_path);
99 return NULL; 99 return NULL;
100 } 100 }
101 } 101 }
102 if (namelen != pplen + dfsplen) { 102 if (namelen != pplen + dfsplen) {
103 cERROR(1, 103 cERROR(1, "did not end path lookup where expected namelen is %d",
104 ("did not end path lookup where expected namelen is %d", 104 namelen);
105 namelen));
106 /* presumably this is only possible if racing with a rename 105 /* presumably this is only possible if racing with a rename
107 of one of the parent directories (we can not lock the dentries 106 of one of the parent directories (we can not lock the dentries
108 above us to prevent this, but retrying should be harmless) */ 107 above us to prevent this, but retrying should be harmless) */
@@ -130,6 +129,12 @@ cifs_bp_rename_retry:
130 return full_path; 129 return full_path;
131} 130}
132 131
132/*
133 * When called with struct file pointer set to NULL, there is no way we could
134 * update file->private_data, but getting it stuck on openFileList provides a
135 * way to access it from cifs_fill_filedata and thereby set file->private_data
136 * from cifs_open.
137 */
133struct cifsFileInfo * 138struct cifsFileInfo *
134cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, 139cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
135 struct file *file, struct vfsmount *mnt, unsigned int oflags) 140 struct file *file, struct vfsmount *mnt, unsigned int oflags)
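
The new comment above documents a deliberate trick: when cifs_new_fileinfo() is called with file == NULL (create runs before the VFS hands us a struct file), the entry is still linked onto openFileList so the open path can find it later. A hedged sketch of the rendezvous — the flow as described in the comments, not a new API:

/* create time: no struct file exists yet */
cifs_new_fileinfo(newinode, fileHandle, NULL, nd->path.mnt, oflags);

/* later, cifs_open() on the same inode */
pCifsFile = cifs_fill_filedata(file);	/* matches the openFileList entry
					   and sets file->private_data */
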
@@ -173,7 +178,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 178 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
174 pCifsInode->clientCanCacheAll = true; 179 pCifsInode->clientCanCacheAll = true;
175 pCifsInode->clientCanCacheRead = true; 180 pCifsInode->clientCanCacheRead = true;
176 cFYI(1, ("Exclusive Oplock inode %p", newinode)); 181 cFYI(1, "Exclusive Oplock inode %p", newinode);
177 } else if ((oplock & 0xF) == OPLOCK_READ) 182 } else if ((oplock & 0xF) == OPLOCK_READ)
178 pCifsInode->clientCanCacheRead = true; 183 pCifsInode->clientCanCacheRead = true;
179 } 184 }
@@ -183,16 +188,17 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
183} 188}
184 189
185int cifs_posix_open(char *full_path, struct inode **pinode, 190int cifs_posix_open(char *full_path, struct inode **pinode,
186 struct vfsmount *mnt, int mode, int oflags, 191 struct vfsmount *mnt, struct super_block *sb,
187 __u32 *poplock, __u16 *pnetfid, int xid) 192 int mode, int oflags,
193 __u32 *poplock, __u16 *pnetfid, int xid)
188{ 194{
189 int rc; 195 int rc;
190 FILE_UNIX_BASIC_INFO *presp_data; 196 FILE_UNIX_BASIC_INFO *presp_data;
191 __u32 posix_flags = 0; 197 __u32 posix_flags = 0;
192 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb); 198 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
193 struct cifs_fattr fattr; 199 struct cifs_fattr fattr;
194 200
195 cFYI(1, ("posix open %s", full_path)); 201 cFYI(1, "posix open %s", full_path);
196 202
197 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); 203 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
198 if (presp_data == NULL) 204 if (presp_data == NULL)
@@ -242,7 +248,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
242 248
243 /* get new inode and set it up */ 249 /* get new inode and set it up */
244 if (*pinode == NULL) { 250 if (*pinode == NULL) {
245 *pinode = cifs_iget(mnt->mnt_sb, &fattr); 251 cifs_fill_uniqueid(sb, &fattr);
252 *pinode = cifs_iget(sb, &fattr);
246 if (!*pinode) { 253 if (!*pinode) {
247 rc = -ENOMEM; 254 rc = -ENOMEM;
248 goto posix_open_ret; 255 goto posix_open_ret;
@@ -251,7 +258,18 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
251 cifs_fattr_to_inode(*pinode, &fattr); 258 cifs_fattr_to_inode(*pinode, &fattr);
252 } 259 }
253 260
254 cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags); 261 /*
262 * cifs_fill_filedata() takes care of setting cifsFileInfo pointer to
263 * file->private_data.
264 */
265 if (mnt) {
266 struct cifsFileInfo *pfile_info;
267
268 pfile_info = cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt,
269 oflags);
270 if (pfile_info == NULL)
271 rc = -ENOMEM;
272 }
255 273
256posix_open_ret: 274posix_open_ret:
257 kfree(presp_data); 275 kfree(presp_data);
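
cifs_posix_open() now takes the superblock explicitly and only instantiates a cifsFileInfo when a vfsmount is actually available, which lets callers without one (the nd == NULL nfsd-style paths) still perform the open. The two call shapes, as they appear later in this diff:

/* regular open: mnt present, fileinfo created and linked */
rc = cifs_posix_open(full_path, &inode, file->f_path.mnt, inode->i_sb,
		     cifs_sb->mnt_file_mode /* ignored */,
		     oflags, &oplock, &netfid, xid);

/* create path where nd may be NULL: pass NULL mnt, skip fileinfo */
rc = cifs_posix_open(full_path, &newinode, nd ? nd->path.mnt : NULL,
		     inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
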
@@ -315,13 +333,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
315 if (nd && (nd->flags & LOOKUP_OPEN)) 333 if (nd && (nd->flags & LOOKUP_OPEN))
316 oflags = nd->intent.open.flags; 334 oflags = nd->intent.open.flags;
317 else 335 else
318 oflags = FMODE_READ; 336 oflags = FMODE_READ | SMB_O_CREAT;
319 337
320 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 338 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
321 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 339 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
322 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 340 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
323 rc = cifs_posix_open(full_path, &newinode, nd->path.mnt, 341 rc = cifs_posix_open(full_path, &newinode,
324 mode, oflags, &oplock, &fileHandle, xid); 342 nd ? nd->path.mnt : NULL,
343 inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
325 /* EIO could indicate that (posix open) operation is not 344 /* EIO could indicate that (posix open) operation is not
326 supported, despite what server claimed in capability 345 supported, despite what server claimed in capability
 327 negotiation. EREMOTE indicates DFS junction, which is not 346
@@ -358,7 +377,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
358 else if ((oflags & O_CREAT) == O_CREAT) 377 else if ((oflags & O_CREAT) == O_CREAT)
359 disposition = FILE_OPEN_IF; 378 disposition = FILE_OPEN_IF;
360 else 379 else
361 cFYI(1, ("Create flag not set in create function")); 380 cFYI(1, "Create flag not set in create function");
362 } 381 }
363 382
364 /* BB add processing to set equivalent of mode - e.g. via CreateX with 383 /* BB add processing to set equivalent of mode - e.g. via CreateX with
@@ -394,7 +413,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
394 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 413 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
395 } 414 }
396 if (rc) { 415 if (rc) {
397 cFYI(1, ("cifs_create returned 0x%x", rc)); 416 cFYI(1, "cifs_create returned 0x%x", rc);
398 goto cifs_create_out; 417 goto cifs_create_out;
399 } 418 }
400 419
@@ -457,15 +476,22 @@ cifs_create_set_dentry:
457 if (rc == 0) 476 if (rc == 0)
458 setup_cifs_dentry(tcon, direntry, newinode); 477 setup_cifs_dentry(tcon, direntry, newinode);
459 else 478 else
460 cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc)); 479 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
461 480
462 /* nfsd case - nfs srv does not set nd */ 481 /* nfsd case - nfs srv does not set nd */
463 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { 482 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
464 /* mknod case - do not leave file open */ 483 /* mknod case - do not leave file open */
465 CIFSSMBClose(xid, tcon, fileHandle); 484 CIFSSMBClose(xid, tcon, fileHandle);
466 } else if (!(posix_create) && (newinode)) { 485 } else if (!(posix_create) && (newinode)) {
467 cifs_new_fileinfo(newinode, fileHandle, NULL, 486 struct cifsFileInfo *pfile_info;
468 nd->path.mnt, oflags); 487 /*
488 * cifs_fill_filedata() takes care of setting cifsFileInfo
489 * pointer to file->private_data.
490 */
491 pfile_info = cifs_new_fileinfo(newinode, fileHandle, NULL,
492 nd->path.mnt, oflags);
493 if (pfile_info == NULL)
494 rc = -ENOMEM;
469 } 495 }
470cifs_create_out: 496cifs_create_out:
471 kfree(buf); 497 kfree(buf);
@@ -531,7 +557,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
531 u16 fileHandle; 557 u16 fileHandle;
532 FILE_ALL_INFO *buf; 558 FILE_ALL_INFO *buf;
533 559
534 cFYI(1, ("sfu compat create special file")); 560 cFYI(1, "sfu compat create special file");
535 561
536 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 562 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
537 if (buf == NULL) { 563 if (buf == NULL) {
@@ -616,8 +642,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
616 642
617 xid = GetXid(); 643 xid = GetXid();
618 644
619 cFYI(1, ("parent inode = 0x%p name is: %s and dentry = 0x%p", 645 cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
620 parent_dir_inode, direntry->d_name.name, direntry)); 646 parent_dir_inode, direntry->d_name.name, direntry);
621 647
622 /* check whether path exists */ 648 /* check whether path exists */
623 649
@@ -632,7 +658,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
632 int i; 658 int i;
633 for (i = 0; i < direntry->d_name.len; i++) 659 for (i = 0; i < direntry->d_name.len; i++)
634 if (direntry->d_name.name[i] == '\\') { 660 if (direntry->d_name.name[i] == '\\') {
635 cFYI(1, ("Invalid file name")); 661 cFYI(1, "Invalid file name");
636 FreeXid(xid); 662 FreeXid(xid);
637 return ERR_PTR(-EINVAL); 663 return ERR_PTR(-EINVAL);
638 } 664 }
@@ -657,11 +683,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
657 } 683 }
658 684
659 if (direntry->d_inode != NULL) { 685 if (direntry->d_inode != NULL) {
660 cFYI(1, ("non-NULL inode in lookup")); 686 cFYI(1, "non-NULL inode in lookup");
661 } else { 687 } else {
662 cFYI(1, ("NULL inode in lookup")); 688 cFYI(1, "NULL inode in lookup");
663 } 689 }
664 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); 690 cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode);
665 691
666 /* Posix open is only called (at lookup time) for file create now. 692 /* Posix open is only called (at lookup time) for file create now.
667 * For opens (rather than creates), because we do not know if it 693 * For opens (rather than creates), because we do not know if it
@@ -678,6 +704,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
678 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 704 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
679 (nd->intent.open.flags & O_CREAT)) { 705 (nd->intent.open.flags & O_CREAT)) {
680 rc = cifs_posix_open(full_path, &newInode, nd->path.mnt, 706 rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
707 parent_dir_inode->i_sb,
681 nd->intent.open.create_mode, 708 nd->intent.open.create_mode,
682 nd->intent.open.flags, &oplock, 709 nd->intent.open.flags, &oplock,
683 &fileHandle, xid); 710 &fileHandle, xid);
@@ -723,7 +750,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
723 /* if it was once a directory (but how can we tell?) we could do 750 /* if it was once a directory (but how can we tell?) we could do
724 shrink_dcache_parent(direntry); */ 751 shrink_dcache_parent(direntry); */
725 } else if (rc != -EACCES) { 752 } else if (rc != -EACCES) {
726 cERROR(1, ("Unexpected lookup error %d", rc)); 753 cERROR(1, "Unexpected lookup error %d", rc);
727 /* We special case check for Access Denied - since that 754 /* We special case check for Access Denied - since that
728 is a common return code */ 755 is a common return code */
729 } 756 }
@@ -739,11 +766,11 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
739 int isValid = 1; 766 int isValid = 1;
740 767
741 if (direntry->d_inode) { 768 if (direntry->d_inode) {
742 if (cifs_revalidate(direntry)) 769 if (cifs_revalidate_dentry(direntry))
743 return 0; 770 return 0;
744 } else { 771 } else {
745 cFYI(1, ("neg dentry 0x%p name = %s", 772 cFYI(1, "neg dentry 0x%p name = %s",
746 direntry, direntry->d_name.name)); 773 direntry, direntry->d_name.name);
747 if (time_after(jiffies, direntry->d_time + HZ) || 774 if (time_after(jiffies, direntry->d_time + HZ) ||
748 !lookupCacheEnabled) { 775 !lookupCacheEnabled) {
749 d_drop(direntry); 776 d_drop(direntry);
@@ -758,7 +785,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
758{ 785{
759 int rc = 0; 786 int rc = 0;
760 787
761 cFYI(1, ("In cifs d_delete, name = %s", direntry->d_name.name)); 788 cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name);
762 789
763 return rc; 790 return rc;
764} */ 791} */
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 87948147d7ec..4db2c5e7283f 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -23,6 +23,7 @@
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */ 24 */
25 25
26#include <linux/slab.h>
26#include <keys/user-type.h> 27#include <keys/user-type.h>
27#include "dns_resolve.h" 28#include "dns_resolve.h"
28#include "cifsglob.h" 29#include "cifsglob.h"
@@ -105,14 +106,14 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
105 /* search for server name delimiter */ 106 /* search for server name delimiter */
106 len = strlen(unc); 107 len = strlen(unc);
107 if (len < 3) { 108 if (len < 3) {
108 cFYI(1, ("%s: unc is too short: %s", __func__, unc)); 109 cFYI(1, "%s: unc is too short: %s", __func__, unc);
109 return -EINVAL; 110 return -EINVAL;
110 } 111 }
111 len -= 2; 112 len -= 2;
112 name = memchr(unc+2, '\\', len); 113 name = memchr(unc+2, '\\', len);
113 if (!name) { 114 if (!name) {
114 cFYI(1, ("%s: probably server name is whole unc: %s", 115 cFYI(1, "%s: probably server name is whole unc: %s",
115 __func__, unc)); 116 __func__, unc);
116 } else { 117 } else {
117 len = (name - unc) - 2/* leading // */; 118 len = (name - unc) - 2/* leading // */;
118 } 119 }
@@ -126,8 +127,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
126 name[len] = 0; 127 name[len] = 0;
127 128
128 if (is_ip(name)) { 129 if (is_ip(name)) {
129 cFYI(1, ("%s: it is IP, skipping dns upcall: %s", 130 cFYI(1, "%s: it is IP, skipping dns upcall: %s",
130 __func__, name)); 131 __func__, name);
131 data = name; 132 data = name;
132 goto skip_upcall; 133 goto skip_upcall;
133 } 134 }
@@ -137,7 +138,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
137 len = rkey->type_data.x[0]; 138 len = rkey->type_data.x[0];
138 data = rkey->payload.data; 139 data = rkey->payload.data;
139 } else { 140 } else {
140 cERROR(1, ("%s: unable to resolve: %s", __func__, name)); 141 cERROR(1, "%s: unable to resolve: %s", __func__, name);
141 goto out; 142 goto out;
142 } 143 }
143 144
@@ -147,10 +148,10 @@ skip_upcall:
147 if (*ip_addr) { 148 if (*ip_addr) {
148 memcpy(*ip_addr, data, len + 1); 149 memcpy(*ip_addr, data, len + 1);
149 if (!IS_ERR(rkey)) 150 if (!IS_ERR(rkey))
150 cFYI(1, ("%s: resolved: %s to %s", __func__, 151 cFYI(1, "%s: resolved: %s to %s", __func__,
151 name, 152 name,
152 *ip_addr 153 *ip_addr
153 )); 154 );
154 rc = 0; 155 rc = 0;
155 } else { 156 } else {
156 rc = -ENOMEM; 157 rc = -ENOMEM;
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 6177f7cca16a..993f82045bf6 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -49,7 +49,7 @@
49static struct dentry *cifs_get_parent(struct dentry *dentry) 49static struct dentry *cifs_get_parent(struct dentry *dentry)
50{ 50{
51 /* BB need to add code here eventually to enable export via NFSD */ 51 /* BB need to add code here eventually to enable export via NFSD */
52 cFYI(1, ("get parent for %p", dentry)); 52 cFYI(1, "get parent for %p", dentry);
53 return ERR_PTR(-EACCES); 53 return ERR_PTR(-EACCES);
54} 54}
55 55
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3d8f8a96f5a3..a83541ec9713 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * vfs operations that deal with files 4 * vfs operations that deal with files
5 * 5 *
6 * Copyright (C) International Business Machines Corp., 2002,2007 6 * Copyright (C) International Business Machines Corp., 2002,2010
7 * Author(s): Steve French (sfrench@us.ibm.com) 7 * Author(s): Steve French (sfrench@us.ibm.com)
8 * Jeremy Allison (jra@samba.org) 8 * Jeremy Allison (jra@samba.org)
9 * 9 *
@@ -31,6 +31,7 @@
31#include <linux/task_io_accounting_ops.h> 31#include <linux/task_io_accounting_ops.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/mount.h> 33#include <linux/mount.h>
34#include <linux/slab.h>
34#include <asm/div64.h> 35#include <asm/div64.h>
35#include "cifsfs.h" 36#include "cifsfs.h"
36#include "cifspdu.h" 37#include "cifspdu.h"
@@ -107,8 +108,7 @@ static inline int cifs_get_disposition(unsigned int flags)
107/* all arguments to this function must be checked for validity in caller */ 108/* all arguments to this function must be checked for validity in caller */
108static inline int 109static inline int
109cifs_posix_open_inode_helper(struct inode *inode, struct file *file, 110cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
110 struct cifsInodeInfo *pCifsInode, 111 struct cifsInodeInfo *pCifsInode, __u32 oplock,
111 struct cifsFileInfo *pCifsFile, __u32 oplock,
112 u16 netfid) 112 u16 netfid)
113{ 113{
114 114
@@ -135,15 +135,15 @@ cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
135 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 135 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
136 (file->f_path.dentry->d_inode->i_size == 136 (file->f_path.dentry->d_inode->i_size ==
137 (loff_t)le64_to_cpu(buf->EndOfFile))) { 137 (loff_t)le64_to_cpu(buf->EndOfFile))) {
138 cFYI(1, ("inode unchanged on server")); 138 cFYI(1, "inode unchanged on server");
139 } else { 139 } else {
140 if (file->f_path.dentry->d_inode->i_mapping) { 140 if (file->f_path.dentry->d_inode->i_mapping) {
141 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 141 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
142 if (rc != 0) 142 if (rc != 0)
143 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 143 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
144 } 144 }
145 cFYI(1, ("invalidating remote inode since open detected it " 145 cFYI(1, "invalidating remote inode since open detected it "
146 "changed")); 146 "changed");
147 invalidate_remote_inode(file->f_path.dentry->d_inode); 147 invalidate_remote_inode(file->f_path.dentry->d_inode);
148 } */ 148 } */
149 149
@@ -151,8 +151,8 @@ psx_client_can_cache:
151 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 151 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
152 pCifsInode->clientCanCacheAll = true; 152 pCifsInode->clientCanCacheAll = true;
153 pCifsInode->clientCanCacheRead = true; 153 pCifsInode->clientCanCacheRead = true;
154 cFYI(1, ("Exclusive Oplock granted on inode %p", 154 cFYI(1, "Exclusive Oplock granted on inode %p",
155 file->f_path.dentry->d_inode)); 155 file->f_path.dentry->d_inode);
156 } else if ((oplock & 0xF) == OPLOCK_READ) 156 } else if ((oplock & 0xF) == OPLOCK_READ)
157 pCifsInode->clientCanCacheRead = true; 157 pCifsInode->clientCanCacheRead = true;
158 158
@@ -189,8 +189,8 @@ cifs_fill_filedata(struct file *file)
189 if (file->private_data != NULL) { 189 if (file->private_data != NULL) {
190 return pCifsFile; 190 return pCifsFile;
191 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) 191 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
192 cERROR(1, ("could not find file instance for " 192 cERROR(1, "could not find file instance for "
193 "new file %p", file)); 193 "new file %p", file);
194 return NULL; 194 return NULL;
195} 195}
196 196
@@ -216,17 +216,17 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
 	    (file->f_path.dentry->d_inode->i_size ==
 	     (loff_t)le64_to_cpu(buf->EndOfFile))) {
-		cFYI(1, ("inode unchanged on server"));
+		cFYI(1, "inode unchanged on server");
 	} else {
 		if (file->f_path.dentry->d_inode->i_mapping) {
 			/* BB no need to lock inode until after invalidate
 			   since namei code should already have it locked? */
 			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
 			if (rc != 0)
 				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
 		}
-		cFYI(1, ("invalidating remote inode since open detected it "
-			 "changed"));
+		cFYI(1, "invalidating remote inode since open detected it "
+			 "changed");
 		invalidate_remote_inode(file->f_path.dentry->d_inode);
 	}
 
@@ -241,8 +241,8 @@ client_can_cache:
 	if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 		pCifsInode->clientCanCacheAll = true;
 		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, ("Exclusive Oplock granted on inode %p",
-			 file->f_path.dentry->d_inode));
+		cFYI(1, "Exclusive Oplock granted on inode %p",
+			 file->f_path.dentry->d_inode);
 	} else if ((*oplock & 0xF) == OPLOCK_READ)
 		pCifsInode->clientCanCacheRead = true;
 
@@ -284,8 +284,8 @@ int cifs_open(struct inode *inode, struct file *file)
 		return rc;
 	}
 
-	cFYI(1, ("inode = 0x%p file flags are 0x%x for %s",
-		 inode, file->f_flags, full_path));
+	cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
+		 inode, file->f_flags, full_path);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
@@ -297,27 +297,29 @@ int cifs_open(struct inode *inode, struct file *file)
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
 		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+		oflags |= SMB_O_CREAT;
 		/* can not refresh inode info since size could be stale */
 		rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
-				cifs_sb->mnt_file_mode /* ignored */,
-				oflags, &oplock, &netfid, xid);
+				inode->i_sb,
+				cifs_sb->mnt_file_mode /* ignored */,
+				oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
-			cFYI(1, ("posix open succeeded"));
+			cFYI(1, "posix open succeeded");
 			/* no need for special case handling of setting mode
 			   on read only files needed here */
 
 			pCifsFile = cifs_fill_filedata(file);
 			cifs_posix_open_inode_helper(inode, file, pCifsInode,
-						     pCifsFile, oplock, netfid);
+						     oplock, netfid);
 			goto out;
 		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			if (tcon->ses->serverNOS)
-				cERROR(1, ("server %s of type %s returned"
+				cERROR(1, "server %s of type %s returned"
 					   " unexpected error on SMB posix open"
 					   ", disabling posix open support."
 					   " Check if server update available.",
 					   tcon->ses->serverName,
-					   tcon->ses->serverNOS));
+					   tcon->ses->serverNOS);
 			tcon->broken_posix_open = true;
 		} else if ((rc != -EIO) && (rc != -EREMOTE) &&
 			 (rc != -EOPNOTSUPP)) /* path not found or net err */
@@ -385,7 +387,7 @@ int cifs_open(struct inode *inode, struct file *file)
 					    & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	}
 	if (rc) {
-		cFYI(1, ("cifs_open returned 0x%x", rc));
+		cFYI(1, "cifs_open returned 0x%x", rc);
 		goto out;
 	}
 
@@ -468,7 +470,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	}
 
 	if (file->f_path.dentry == NULL) {
-		cERROR(1, ("no valid name if dentry freed"));
+		cERROR(1, "no valid name if dentry freed");
 		dump_stack();
 		rc = -EBADF;
 		goto reopen_error_exit;
@@ -476,7 +478,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 
 	inode = file->f_path.dentry->d_inode;
 	if (inode == NULL) {
-		cERROR(1, ("inode not valid"));
+		cERROR(1, "inode not valid");
 		dump_stack();
 		rc = -EBADF;
 		goto reopen_error_exit;
@@ -498,8 +500,8 @@ reopen_error_exit:
 		return rc;
 	}
 
-	cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
-		 inode, file->f_flags, full_path));
+	cFYI(1, "inode = 0x%p file flags 0x%x for %s",
+		 inode, file->f_flags, full_path);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
@@ -512,10 +514,11 @@ reopen_error_exit:
 		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
 		/* can not refresh inode info since size could be stale */
 		rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
-				cifs_sb->mnt_file_mode /* ignored */,
-				oflags, &oplock, &netfid, xid);
+				inode->i_sb,
+				cifs_sb->mnt_file_mode /* ignored */,
+				oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
-			cFYI(1, ("posix reopen succeeded"));
+			cFYI(1, "posix reopen succeeded");
 			goto reopen_success;
 		}
 		/* fallthrough to retry open the old way on errors, especially
@@ -536,8 +539,8 @@ reopen_error_exit:
 				 CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
 		mutex_unlock(&pCifsFile->fh_mutex);
-		cFYI(1, ("cifs_open returned 0x%x", rc));
-		cFYI(1, ("oplock: %d", oplock));
+		cFYI(1, "cifs_open returned 0x%x", rc);
+		cFYI(1, "oplock: %d", oplock);
 	} else {
 reopen_success:
 		pCifsFile->netfid = netfid;
@@ -569,8 +572,8 @@ reopen_success:
 	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 		pCifsInode->clientCanCacheAll = true;
 		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, ("Exclusive Oplock granted on inode %p",
-			 file->f_path.dentry->d_inode));
+		cFYI(1, "Exclusive Oplock granted on inode %p",
+			 file->f_path.dentry->d_inode);
 	} else if ((oplock & 0xF) == OPLOCK_READ) {
 		pCifsInode->clientCanCacheRead = true;
 		pCifsInode->clientCanCacheAll = false;
@@ -618,8 +621,7 @@ int cifs_close(struct inode *inode, struct file *file)
 			   the struct would be in each open file,
 			   but this should give enough time to
 			   clear the socket */
-			cFYI(DBG2,
-				("close delay, write pending"));
+			cFYI(DBG2, "close delay, write pending");
 			msleep(timeout);
 			timeout *= 4;
 		}
@@ -652,7 +654,7 @@ int cifs_close(struct inode *inode, struct file *file)
 
 	read_lock(&GlobalSMBSeslock);
 	if (list_empty(&(CIFS_I(inode)->openFileList))) {
-		cFYI(1, ("closing last open instance for inode %p", inode));
+		cFYI(1, "closing last open instance for inode %p", inode);
 		/* if the file is not open we do not know if we can cache info
 		   on this inode, much less write behind and read ahead */
 		CIFS_I(inode)->clientCanCacheRead = false;
@@ -673,7 +675,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 		(struct cifsFileInfo *)file->private_data;
 	char *ptmp;
 
-	cFYI(1, ("Closedir inode = 0x%p", inode));
+	cFYI(1, "Closedir inode = 0x%p", inode);
 
 	xid = GetXid();
 
@@ -684,22 +686,22 @@ int cifs_closedir(struct inode *inode, struct file *file)
 
 	pTcon = cifs_sb->tcon;
 
-	cFYI(1, ("Freeing private data in close dir"));
+	cFYI(1, "Freeing private data in close dir");
 	write_lock(&GlobalSMBSeslock);
 	if (!pCFileStruct->srch_inf.endOfSearch &&
 	    !pCFileStruct->invalidHandle) {
 		pCFileStruct->invalidHandle = true;
 		write_unlock(&GlobalSMBSeslock);
 		rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
-		cFYI(1, ("Closing uncompleted readdir with rc %d",
-			 rc));
+		cFYI(1, "Closing uncompleted readdir with rc %d",
+			 rc);
 		/* not much we can do if it fails anyway, ignore rc */
 		rc = 0;
 	} else
 		write_unlock(&GlobalSMBSeslock);
 	ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
 	if (ptmp) {
-		cFYI(1, ("closedir free smb buf in srch struct"));
+		cFYI(1, "closedir free smb buf in srch struct");
 		pCFileStruct->srch_inf.ntwrk_buf_start = NULL;
 		if (pCFileStruct->srch_inf.smallBuf)
 			cifs_small_buf_release(ptmp);
@@ -747,49 +749,49 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 	rc = -EACCES;
 	xid = GetXid();
 
-	cFYI(1, ("Lock parm: 0x%x flockflags: "
+	cFYI(1, "Lock parm: 0x%x flockflags: "
 		 "0x%x flocktype: 0x%x start: %lld end: %lld",
 		cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start,
-		pfLock->fl_end));
+		pfLock->fl_end);
 
 	if (pfLock->fl_flags & FL_POSIX)
-		cFYI(1, ("Posix"));
+		cFYI(1, "Posix");
 	if (pfLock->fl_flags & FL_FLOCK)
-		cFYI(1, ("Flock"));
+		cFYI(1, "Flock");
 	if (pfLock->fl_flags & FL_SLEEP) {
-		cFYI(1, ("Blocking lock"));
+		cFYI(1, "Blocking lock");
 		wait_flag = true;
 	}
 	if (pfLock->fl_flags & FL_ACCESS)
-		cFYI(1, ("Process suspended by mandatory locking - "
-			 "not implemented yet"));
+		cFYI(1, "Process suspended by mandatory locking - "
+			 "not implemented yet");
 	if (pfLock->fl_flags & FL_LEASE)
-		cFYI(1, ("Lease on file - not implemented yet"));
+		cFYI(1, "Lease on file - not implemented yet");
 	if (pfLock->fl_flags &
 	    (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
-		cFYI(1, ("Unknown lock flags 0x%x", pfLock->fl_flags));
+		cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags);
 
 	if (pfLock->fl_type == F_WRLCK) {
-		cFYI(1, ("F_WRLCK "));
+		cFYI(1, "F_WRLCK ");
 		numLock = 1;
 	} else if (pfLock->fl_type == F_UNLCK) {
-		cFYI(1, ("F_UNLCK"));
+		cFYI(1, "F_UNLCK");
 		numUnlock = 1;
 		/* Check if unlock includes more than
 		   one lock range */
 	} else if (pfLock->fl_type == F_RDLCK) {
-		cFYI(1, ("F_RDLCK"));
+		cFYI(1, "F_RDLCK");
 		lockType |= LOCKING_ANDX_SHARED_LOCK;
 		numLock = 1;
 	} else if (pfLock->fl_type == F_EXLCK) {
-		cFYI(1, ("F_EXLCK"));
+		cFYI(1, "F_EXLCK");
 		numLock = 1;
 	} else if (pfLock->fl_type == F_SHLCK) {
-		cFYI(1, ("F_SHLCK"));
+		cFYI(1, "F_SHLCK");
 		lockType |= LOCKING_ANDX_SHARED_LOCK;
 		numLock = 1;
 	} else
-		cFYI(1, ("Unknown type of lock"));
+		cFYI(1, "Unknown type of lock");
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	tcon = cifs_sb->tcon;
@@ -832,14 +834,38 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 					 0 /* wait flag */ );
 			pfLock->fl_type = F_UNLCK;
 			if (rc != 0)
-				cERROR(1, ("Error unlocking previously locked "
-					   "range %d during test of lock", rc));
+				cERROR(1, "Error unlocking previously locked "
+					   "range %d during test of lock", rc);
 			rc = 0;
 
 		} else {
 			/* if rc == ERR_SHARING_VIOLATION ? */
-			rc = 0;	/* do not change lock type to unlock
-				   since range in use */
+			rc = 0;
+
+			if (lockType & LOCKING_ANDX_SHARED_LOCK) {
+				pfLock->fl_type = F_WRLCK;
+			} else {
+				rc = CIFSSMBLock(xid, tcon, netfid, length,
+					pfLock->fl_start, 0, 1,
+					lockType | LOCKING_ANDX_SHARED_LOCK,
+					0 /* wait flag */);
+				if (rc == 0) {
+					rc = CIFSSMBLock(xid, tcon, netfid,
+						length, pfLock->fl_start, 1, 0,
+						lockType |
+						LOCKING_ANDX_SHARED_LOCK,
+						0 /* wait flag */);
+					pfLock->fl_type = F_RDLCK;
+					if (rc != 0)
+						cERROR(1, "Error unlocking "
+						"previously locked range %d "
+						"during test of lock", rc);
+					rc = 0;
+				} else {
+					pfLock->fl_type = F_WRLCK;
+					rc = 0;
+				}
+			}
 		}
 
 		FreeXid(xid);
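Note: the block added above changes how a conflicting lock is reported when an F_WRLCK test finds the range in use. Instead of leaving fl_type unset, the code probes the range with a shared lock: if the shared lock succeeds (and is immediately unlocked again), the conflicting holder can only have a read lock, so F_RDLCK is reported; if even a shared lock fails, F_WRLCK is reported. A user-space sketch of that decision, with try_shared_lock()/unlock_shared() as hypothetical stand-ins for the CIFSSMBLock() probe and its matching unlock:

    #include <stdbool.h>
    #include <stdio.h>

    /* pretend the contested server range is only read-locked */
    static bool range_is_write_locked = false;

    /* stand-ins for the CIFSSMBLock() probe/unlock pair above */
    static bool try_shared_lock(void) { return !range_is_write_locked; }
    static void unlock_shared(void)   { /* undo the probe lock */ }

    /* classify the lock that blocked an F_WRLCK test */
    static const char *conflicting_lock_type(void)
    {
        if (try_shared_lock()) {
            unlock_shared();      /* probe only -- release it */
            return "F_RDLCK";     /* holder has a read lock */
        }
        return "F_WRLCK";         /* holder has a write lock */
    }

    int main(void)
    {
        printf("conflicting lock reported as %s\n",
               conflicting_lock_type());
        return 0;
    }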
@@ -898,9 +924,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 						1, 0, li->type, false);
 			if (stored_rc)
 				rc = stored_rc;
-
+			else {
 				list_del(&li->llist);
 				kfree(li);
+			}
 		}
 	}
 	mutex_unlock(&fid->lock_mutex);
@@ -963,9 +990,8 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 
 	pTcon = cifs_sb->tcon;
 
-	/* cFYI(1,
-	   (" write %d bytes to offset %lld of %s", write_size,
-	   *poffset, file->f_path.dentry->d_name.name)); */
+	/* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
+	   *poffset, file->f_path.dentry->d_name.name); */
 
 	if (file->private_data == NULL)
 		return -EBADF;
@@ -1066,8 +1092,8 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 
 	pTcon = cifs_sb->tcon;
 
-	cFYI(1, ("write %zd bytes to offset %lld of %s", write_size,
-	   *poffset, file->f_path.dentry->d_name.name));
+	cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
+	   *poffset, file->f_path.dentry->d_name.name);
 
 	if (file->private_data == NULL)
 		return -EBADF;
@@ -1208,7 +1234,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
    it being zero) during stress testcases so we need to check for it */
 
 	if (cifs_inode == NULL) {
-		cERROR(1, ("Null inode passed to cifs_writeable_file"));
+		cERROR(1, "Null inode passed to cifs_writeable_file");
 		dump_stack();
 		return NULL;
 	}
@@ -1252,7 +1278,7 @@ refind_writable:
 				   again. Note that it would be bad
 				   to hold up writepages here (rather than
 				   in caller) with continuous retries */
-				cFYI(1, ("wp failed on reopen file"));
+				cFYI(1, "wp failed on reopen file");
 				read_lock(&GlobalSMBSeslock);
 				/* can not use this handle, no write
 				   pending on this one after all */
@@ -1328,7 +1354,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 	else if (bytes_written < 0)
 		rc = bytes_written;
 	} else {
-		cFYI(1, ("No writeable filehandles for inode"));
+		cFYI(1, "No writeable filehandles for inode");
 		rc = -EIO;
 	}
 
@@ -1500,7 +1526,7 @@ retry:
 			 */
 			open_file = find_writable_file(CIFS_I(mapping->host));
 			if (!open_file) {
-				cERROR(1, ("No writable handles for inode"));
+				cERROR(1, "No writable handles for inode");
 				rc = -EBADF;
 			} else {
 				long_op = cifs_write_timeout(cifsi, offset);
@@ -1513,8 +1539,8 @@ retry:
 					cifs_update_eof(cifsi, offset, bytes_written);
 
 				if (rc || bytes_written < bytes_to_write) {
-					cERROR(1, ("Write2 ret %d, wrote %d",
-						  rc, bytes_written));
+					cERROR(1, "Write2 ret %d, wrote %d",
+						  rc, bytes_written);
 					/* BB what if continued retry is
 					   requested via mount flags? */
 					if (rc == -ENOSPC)
@@ -1575,7 +1601,7 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
 /* BB add check for wbc flags */
 	page_cache_get(page);
 	if (!PageUptodate(page))
-		cFYI(1, ("ppw - page not up to date"));
+		cFYI(1, "ppw - page not up to date");
 
 	/*
 	 * Set the "writeback" flag, and clear "dirty" in the radix tree.
@@ -1604,8 +1630,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	int rc;
 	struct inode *inode = mapping->host;
 
-	cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
-		 page, pos, copied));
+	cFYI(1, "write_end for page %p from pos %lld with %d bytes",
+		 page, pos, copied);
 
 	if (PageChecked(page)) {
 		if (copied == len)
@@ -1661,8 +1687,8 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 
 	xid = GetXid();
 
-	cFYI(1, ("Sync file - name: %s datasync: 0x%x",
-		dentry->d_name.name, datasync));
+	cFYI(1, "Sync file - name: %s datasync: 0x%x",
+		dentry->d_name.name, datasync);
 
 	rc = filemap_write_and_wait(inode->i_mapping);
 	if (rc == 0) {
@@ -1686,7 +1712,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	unsigned int rpages = 0;
 	int rc = 0;
 
-	cFYI(1, ("sync page %p",page));
+	cFYI(1, "sync page %p", page);
 	mapping = page->mapping;
 	if (!mapping)
 		return 0;
@@ -1697,7 +1723,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 /* fill in rpages then
    result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
 
-/* cFYI(1, ("rpages is %d for sync page of Index %ld", rpages, index));
+/* cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
 
 #if 0
 	if (rc < 0)
@@ -1731,7 +1757,7 @@ int cifs_flush(struct file *file, fl_owner_t id)
 		CIFS_I(inode)->write_behind_rc = 0;
 	}
 
-	cFYI(1, ("Flush inode %p file %p rc %d", inode, file, rc));
+	cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
 
 	return rc;
 }
@@ -1763,7 +1789,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 	open_file = (struct cifsFileInfo *)file->private_data;
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
-		cFYI(1, ("attempting read on write only file instance"));
+		cFYI(1, "attempting read on write only file instance");
 
 	for (total_read = 0, current_offset = read_data;
 	     read_size > total_read;
@@ -1844,7 +1870,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 	open_file = (struct cifsFileInfo *)file->private_data;
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
-		cFYI(1, ("attempting read on write only file instance"));
+		cFYI(1, "attempting read on write only file instance");
 
 	for (total_read = 0, current_offset = read_data;
 	     read_size > total_read;
@@ -1890,13 +1916,12 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 
 int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct dentry *dentry = file->f_path.dentry;
 	int rc, xid;
 
 	xid = GetXid();
-	rc = cifs_revalidate(dentry);
+	rc = cifs_revalidate_file(file);
 	if (rc) {
-		cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
+		cFYI(1, "Validation prior to mmap failed, error=%d", rc);
 		FreeXid(xid);
 		return rc;
 	}
@@ -1907,8 +1932,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 
 static void cifs_copy_cache_pages(struct address_space *mapping,
-	struct list_head *pages, int bytes_read, char *data,
-	struct pagevec *plru_pvec)
+	struct list_head *pages, int bytes_read, char *data)
 {
 	struct page *page;
 	char *target;
@@ -1920,10 +1944,10 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		page = list_entry(pages->prev, struct page, lru);
 		list_del(&page->lru);
 
-		if (add_to_page_cache(page, mapping, page->index,
+		if (add_to_page_cache_lru(page, mapping, page->index,
 				      GFP_KERNEL)) {
 			page_cache_release(page);
-			cFYI(1, ("Add page cache failed"));
+			cFYI(1, "Add page cache failed");
 			data += PAGE_CACHE_SIZE;
 			bytes_read -= PAGE_CACHE_SIZE;
 			continue;
@@ -1946,8 +1970,6 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		flush_dcache_page(page);
 		SetPageUptodate(page);
 		unlock_page(page);
-		if (!pagevec_add(plru_pvec, page))
-			__pagevec_lru_add_file(plru_pvec);
 		data += PAGE_CACHE_SIZE;
 	}
 	return;
@@ -1966,7 +1988,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	unsigned int read_size, i;
 	char *smb_read_data = NULL;
 	struct smb_com_read_rsp *pSMBr;
-	struct pagevec lru_pvec;
 	struct cifsFileInfo *open_file;
 	int buf_type = CIFS_NO_BUFFER;
 
@@ -1980,8 +2001,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	pTcon = cifs_sb->tcon;
 
-	pagevec_init(&lru_pvec, 0);
-	cFYI(DBG2, ("rpages: num pages %d", num_pages));
+	cFYI(DBG2, "rpages: num pages %d", num_pages);
 	for (i = 0; i < num_pages; ) {
 		unsigned contig_pages;
 		struct page *tmp_page;
@@ -2014,8 +2034,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		/* Read size needs to be in multiples of one page */
 		read_size = min_t(const unsigned int, read_size,
 				  cifs_sb->rsize & PAGE_CACHE_MASK);
-		cFYI(DBG2, ("rpages: read size 0x%x contiguous pages %d",
-				read_size, contig_pages));
+		cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d",
+				read_size, contig_pages);
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
 			if ((open_file->invalidHandle) &&
@@ -2042,14 +2062,14 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 			}
 		}
 		if ((rc < 0) || (smb_read_data == NULL)) {
-			cFYI(1, ("Read error in readpages: %d", rc));
+			cFYI(1, "Read error in readpages: %d", rc);
 			break;
 		} else if (bytes_read > 0) {
 			task_io_account_read(bytes_read);
 			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
 			cifs_copy_cache_pages(mapping, page_list, bytes_read,
 				smb_read_data + 4 /* RFC1001 hdr */ +
-				le16_to_cpu(pSMBr->DataOffset), &lru_pvec);
+				le16_to_cpu(pSMBr->DataOffset));
 
 			i += bytes_read >> PAGE_CACHE_SHIFT;
 			cifs_stats_bytes_read(pTcon, bytes_read);
@@ -2065,9 +2085,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 			/* break; */
 		}
 	} else {
-		cFYI(1, ("No bytes read (%d) at offset %lld . "
+		cFYI(1, "No bytes read (%d) at offset %lld . "
 			 "Cleaning remaining pages from readahead list",
-			 bytes_read, offset));
+			 bytes_read, offset);
 		/* BB turn off caching and do new lookup on
 		   file size at server? */
 		break;
@@ -2082,8 +2102,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		bytes_read = 0;
 	}
 
-	pagevec_lru_add_file(&lru_pvec);
-
 /* need to free smb_read_data buf before exit */
 	if (smb_read_data) {
 		if (buf_type == CIFS_SMALL_BUFFER)
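Note: the hunks above drop the open-coded LRU batching from the readpages path. Previously each page was inserted with add_to_page_cache() and collected in a local struct pagevec that was drained onto the file LRU via __pagevec_lru_add_file()/pagevec_lru_add_file(); add_to_page_cache_lru() does the page-cache insertion and the LRU accounting in a single call, so the pagevec plumbing can go away. A kernel-style sketch contrasting the two shapes (compilable only in-tree; shown just to compare the call patterns):

    #include <linux/pagemap.h>
    #include <linux/pagevec.h>

    /* before: caller batches pages and drains them to the LRU itself */
    static void cache_page_old(struct pagevec *pvec, struct page *page,
                               struct address_space *mapping)
    {
        if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL))
            if (!pagevec_add(pvec, page))        /* batch now full */
                __pagevec_lru_add_file(pvec);    /* drain to file LRU */
    }

    /* after: one helper inserts into the page cache and the LRU */
    static void cache_page_new(struct page *page,
                               struct address_space *mapping)
    {
        add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL);
    }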
@@ -2112,7 +2130,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	if (rc < 0)
 		goto io_error;
 	else
-		cFYI(1, ("Bytes read %d", rc));
+		cFYI(1, "Bytes read %d", rc);
 
 	file->f_path.dentry->d_inode->i_atime =
 		current_fs_time(file->f_path.dentry->d_inode->i_sb);
@@ -2144,8 +2162,8 @@ static int cifs_readpage(struct file *file, struct page *page)
 		return rc;
 	}
 
-	cFYI(1, ("readpage %p at offset %d 0x%x\n",
-		 page, (int)offset, (int)offset));
+	cFYI(1, "readpage %p at offset %d 0x%x\n",
+		 page, (int)offset, (int)offset);
 
 	rc = cifs_readpage_worker(file, page, &offset);
 
@@ -2215,7 +2233,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 	struct page *page;
 	int rc = 0;
 
-	cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
+	cFYI(1, "write_begin from %lld len %d", (long long)pos, len);
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
@@ -2287,12 +2305,10 @@ cifs_oplock_break(struct slow_work *work)
 	int rc, waitrc = 0;
 
 	if (inode && S_ISREG(inode->i_mode)) {
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-		if (cinode->clientCanCacheAll == 0)
+		if (cinode->clientCanCacheRead)
 			break_lease(inode, O_RDONLY);
-		else if (cinode->clientCanCacheRead == 0)
+		else
 			break_lease(inode, O_WRONLY);
-#endif
 		rc = filemap_fdatawrite(inode->i_mapping);
 		if (cinode->clientCanCacheRead == 0) {
 			waitrc = filemap_fdatawait(inode->i_mapping);
@@ -2302,7 +2318,7 @@ cifs_oplock_break(struct slow_work *work)
 			rc = waitrc;
 		if (rc)
 			cinode->write_behind_rc = rc;
-		cFYI(1, ("Oplock flush inode %p rc %d", inode, rc));
+		cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
 	}
 
 	/*
@@ -2314,7 +2330,7 @@ cifs_oplock_break(struct slow_work *work)
 	if (!cfile->closePend && !cfile->oplock_break_cancelled) {
 		rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
 				 LOCKING_ANDX_OPLOCK_RELEASE, false);
-		cFYI(1, ("Oplock release rc = %d", rc));
+		cFYI(1, "Oplock release rc = %d", rc);
 	}
 }
 
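Note: the oplock-break path above now calls break_lease() unconditionally (the CONFIG_CIFS_EXPERIMENTAL guard is gone): if the client may still cache reads it breaks conflicting leases as a read opener would (O_RDONLY), otherwise as a writer (O_WRONLY). What gets broken are the leases processes take with fcntl(); a user-space view of the other end of that mechanism (error handling trimmed, signal delivery on break not shown):

    #include <fcntl.h>
    #include <stdio.h>

    /* a process holding an F_RDLCK lease on fd is notified (SIGIO by
       default) when the kernel calls break_lease() against the inode,
       e.g. from an oplock break like the one above */
    int take_read_lease(int fd)
    {
        if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1) {
            perror("F_SETLEASE");
            return -1;
        }
        return 0;
    }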
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8bdbc818164c..62b324f26a56 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
 /*
  * fs/cifs/inode.c
  *
- * Copyright (C) International Business Machines Corp., 2002,2008
+ * Copyright (C) International Business Machines Corp., 2002,2010
  * Author(s): Steve French (sfrench@us.ibm.com)
  *
  * This library is free software; you can redistribute it and/or modify
@@ -20,6 +20,7 @@
  */
 #include <linux/fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
@@ -77,6 +78,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 	}
 }
 
+/* check inode attributes against fattr. If they don't match, tag the
+ * inode for cache invalidation
+ */
+static void
+cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
+{
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid);
+
+	if (inode->i_state & I_NEW) {
+		cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid);
+		return;
+	}
+
+	/* don't bother with revalidation if we have an oplock */
+	if (cifs_i->clientCanCacheRead) {
+		cFYI(1, "%s: inode %llu is oplocked", __func__,
+			 cifs_i->uniqueid);
+		return;
+	}
+
+	/* revalidate if mtime or size have changed */
+	if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+	    cifs_i->server_eof == fattr->cf_eof) {
+		cFYI(1, "%s: inode %llu is unchanged", __func__,
+			 cifs_i->uniqueid);
+		return;
+	}
+
+	cFYI(1, "%s: invalidating inode %llu mapping", __func__,
+		 cifs_i->uniqueid);
+	cifs_i->invalid_mapping = true;
+}
+
 /* populate an inode with info from a cifs_fattr struct */
 void
 cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
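Note: cifs_revalidate_cache() centralizes the staleness test that used to be open-coded at each attribute refresh: a brand-new inode or one protected by a read oplock is trusted as-is; otherwise a change in either mtime or end-of-file tags the inode with invalid_mapping so the cached pages can be zapped later. A rough user-space analogue of the test (the struct and names are illustrative, not the kernel's):

    #include <stdbool.h>
    #include <stdint.h>
    #include <time.h>

    struct cached_attrs {
        struct timespec mtime;   /* last mtime seen from the server */
        uint64_t eof;            /* last known end-of-file */
        bool oplocked;           /* server promised change notification */
        bool invalid_mapping;    /* cached pages need to be dropped */
    };

    static bool ts_equal(const struct timespec *a, const struct timespec *b)
    {
        return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
    }

    static void revalidate_cache(struct cached_attrs *c,
                                 const struct timespec *srv_mtime,
                                 uint64_t srv_eof)
    {
        if (c->oplocked)
            return;                      /* oplock: cache stays valid */
        if (ts_equal(&c->mtime, srv_mtime) && c->eof == srv_eof)
            return;                      /* unchanged on the server */
        c->invalid_mapping = true;       /* zap the mapping later */
    }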
@@ -85,6 +121,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	unsigned long oldtime = cifs_i->time;
 
+	cifs_revalidate_cache(inode, fattr);
+
 	inode->i_atime = fattr->cf_atime;
 	inode->i_mtime = fattr->cf_mtime;
 	inode->i_ctime = fattr->cf_ctime;
@@ -99,15 +137,14 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	inode->i_mode = fattr->cf_mode;
 
 	cifs_i->cifsAttrs = fattr->cf_cifsattrs;
-	cifs_i->uniqueid = fattr->cf_uniqueid;
 
 	if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
 		cifs_i->time = 0;
 	else
 		cifs_i->time = jiffies;
 
-	cFYI(1, ("inode 0x%p old_time=%ld new_time=%ld", inode,
-		 oldtime, cifs_i->time));
+	cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode,
+		 oldtime, cifs_i->time);
 
 	cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
 
@@ -132,6 +169,17 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
 }
 
+void
+cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+		return;
+
+	fattr->cf_uniqueid = iunique(sb, ROOT_I);
+}
+
 /* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */
 void
 cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
@@ -189,7 +237,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
 		/* safest to call it a file if we do not know */
 		fattr->cf_mode |= S_IFREG;
 		fattr->cf_dtype = DT_REG;
-		cFYI(1, ("unknown type %d", le32_to_cpu(info->Type)));
+		cFYI(1, "unknown type %d", le32_to_cpu(info->Type));
 		break;
 	}
 
@@ -218,7 +266,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	cFYI(1, ("creating fake fattr for DFS referral"));
+	cFYI(1, "creating fake fattr for DFS referral");
 
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
@@ -231,6 +279,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 	fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
 }
 
+int cifs_get_file_info_unix(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_UNIX_BASIC_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (!rc) {
+		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	}
+
+	cifs_fattr_to_inode(inode, &fattr);
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info_unix(struct inode **pinode,
 			     const unsigned char *full_path,
 			     struct super_block *sb, int xid)
@@ -242,7 +315,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
 	tcon = cifs_sb->tcon;
-	cFYI(1, ("Getting info on %s", full_path));
+	cFYI(1, "Getting info on %s", full_path);
 
 	/* could have done a find first instead but this returns more info */
 	rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
@@ -260,6 +333,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	if (*pinode == NULL) {
 		/* get new inode */
+		cifs_fill_uniqueid(sb, &fattr);
 		*pinode = cifs_iget(sb, &fattr);
 		if (!*pinode)
 			rc = -ENOMEM;
@@ -310,7 +384,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 			 &bytes_read, &pbuf, &buf_type);
 	if ((rc == 0) && (bytes_read >= 8)) {
 		if (memcmp("IntxBLK", pbuf, 8) == 0) {
-			cFYI(1, ("Block device"));
+			cFYI(1, "Block device");
 			fattr->cf_mode |= S_IFBLK;
 			fattr->cf_dtype = DT_BLK;
 			if (bytes_read == 24) {
@@ -322,7 +396,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 				fattr->cf_rdev = MKDEV(mjr, mnr);
 			}
 		} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
-			cFYI(1, ("Char device"));
+			cFYI(1, "Char device");
 			fattr->cf_mode |= S_IFCHR;
 			fattr->cf_dtype = DT_CHR;
 			if (bytes_read == 24) {
@@ -334,7 +408,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 				fattr->cf_rdev = MKDEV(mjr, mnr);
 			}
 		} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
-			cFYI(1, ("Symlink"));
+			cFYI(1, "Symlink");
 			fattr->cf_mode |= S_IFLNK;
 			fattr->cf_dtype = DT_LNK;
 		} else {
@@ -376,10 +450,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	else if (rc > 3) {
 		mode = le32_to_cpu(*((__le32 *)ea_value));
 		fattr->cf_mode &= ~SFBITS_MASK;
-		cFYI(1, ("special bits 0%o org mode 0%o", mode,
-			 fattr->cf_mode));
+		cFYI(1, "special bits 0%o org mode 0%o", mode,
+			 fattr->cf_mode);
 		fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
-		cFYI(1, ("special mode bits 0%o", mode));
+		cFYI(1, "special mode bits 0%o", mode);
 	}
 
 	return 0;
@@ -432,6 +506,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_gid = cifs_sb->mnt_gid;
 }
 
+int cifs_get_file_info(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_ALL_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (rc == -EOPNOTSUPP || rc == -EINVAL) {
+		/*
+		 * FIXME: legacy server -- fall back to path-based call?
+		 * for now, just skip revalidating and mark inode for
+		 * immediate reval.
+		 */
+		rc = 0;
+		CIFS_I(inode)->time = 0;
+		goto cgfi_exit;
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	} else if (rc)
+		goto cgfi_exit;
+
+	/*
+	 * don't bother with SFU junk here -- just mark inode as needing
+	 * revalidation.
+	 */
+	cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
+	fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
+	fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
+	cifs_fattr_to_inode(inode, &fattr);
+cgfi_exit:
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info(struct inode **pinode,
 	const unsigned char *full_path, FILE_ALL_INFO *pfindData,
 	struct super_block *sb, int xid, const __u16 *pfid)
@@ -444,11 +559,11 @@ int cifs_get_inode_info(struct inode **pinode,
 	struct cifs_fattr fattr;
 
 	pTcon = cifs_sb->tcon;
-	cFYI(1, ("Getting info on %s", full_path));
+	cFYI(1, "Getting info on %s", full_path);
 
 	if ((pfindData == NULL) && (*pinode != NULL)) {
 		if (CIFS_I(*pinode)->clientCanCacheRead) {
-			cFYI(1, ("No need to revalidate cached inode sizes"));
+			cFYI(1, "No need to revalidate cached inode sizes");
 			return rc;
 		}
 	}
@@ -514,7 +629,7 @@ int cifs_get_inode_info(struct inode **pinode,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 		if (rc1 || !fattr.cf_uniqueid) {
-			cFYI(1, ("GetSrvInodeNum rc %d", rc1));
+			cFYI(1, "GetSrvInodeNum rc %d", rc1);
 			fattr.cf_uniqueid = iunique(sb, ROOT_I);
 			cifs_autodisable_serverino(cifs_sb);
 		}
@@ -530,13 +645,13 @@ int cifs_get_inode_info(struct inode **pinode,
 	    cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
 		tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
 		if (tmprc)
-			cFYI(1, ("cifs_sfu_type failed: %d", tmprc));
+			cFYI(1, "cifs_sfu_type failed: %d", tmprc);
 	}
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	/* fill in 0777 bits from ACL */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-		cFYI(1, ("Getting mode bits from ACL"));
+		cFYI(1, "Getting mode bits from ACL");
 		cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
 	}
 #endif
@@ -611,6 +726,16 @@ cifs_find_inode(struct inode *inode, void *opaque)
 	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
 		return 0;
 
+	/*
+	 * uh oh -- it's a directory. We can't use it since hardlinked dirs are
+	 * verboten. Disable serverino and return it as if it were found, the
+	 * caller can discard it, generate a uniqueid and retry the find
+	 */
+	if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
+		fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
+		cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
+	}
+
 	return 1;
 }
 
@@ -630,15 +755,22 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
 	unsigned long hash;
 	struct inode *inode;
 
-	cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid));
+retry_iget5_locked:
+	cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid);
 
 	/* hash down to 32-bits on 32-bit arch */
 	hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
 
 	inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
-
-	/* we have fattrs in hand, update the inode */
 	if (inode) {
+		/* was there a problematic inode number collision? */
+		if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
+			iput(inode);
+			fattr->cf_uniqueid = iunique(sb, ROOT_I);
+			fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
+			goto retry_iget5_locked;
+		}
+
 		cifs_fattr_to_inode(inode, fattr);
 		if (sb->s_flags & MS_NOATIME)
 			inode->i_flags |= S_NOATIME | S_NOCMTIME;
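Note: the pair of hunks above handles servers that hand out colliding inode numbers for directories. cifs_find_inode() refuses to share an in-use directory inode (hardlinked directories would corrupt the dcache), tags the fattr with CIFS_FATTR_INO_COLLISION and disables serverino; cifs_iget() then drops the colliding match, substitutes a locally generated number, and retries the lookup. A stand-alone sketch of that retry shape, where lookup_or_alloc() and fresh_id() are hypothetical stand-ins for iget5_locked() and iunique():

    #include <stdbool.h>
    #include <stdio.h>

    struct fattr {
        unsigned long long uniqueid;
        bool collision;          /* set by the find callback */
    };

    static int fake_inode;       /* placeholder "inode" */
    static unsigned long long next_local_id = 1000;

    static void *lookup_or_alloc(struct fattr *f)
    {
        /* simulate one colliding directory match, then success */
        f->collision = (f->uniqueid < 1000);
        return &fake_inode;
    }

    static unsigned long long fresh_id(void) { return next_local_id++; }

    static void *get_inode(struct fattr *f)
    {
        void *inode;
    retry:
        inode = lookup_or_alloc(f);
        if (inode && f->collision) {
            f->collision = false;
            f->uniqueid = fresh_id();   /* fall back to a local id */
            goto retry;                 /* redo the lookup */
        }
        return inode;
    }

    int main(void)
    {
        struct fattr f = { .uniqueid = 42, .collision = false };
        void *ino = get_inode(&f);
        printf("got inode %p with uniqueid %llu\n", ino, f.uniqueid);
        return 0;
    }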
@@ -676,7 +808,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 		return ERR_PTR(-ENOMEM);
 
 	if (rc && cifs_sb->tcon->ipc) {
-		cFYI(1, ("ipc connection - fake read inode"));
+		cFYI(1, "ipc connection - fake read inode");
 		inode->i_mode |= S_IFDIR;
 		inode->i_nlink = 2;
 		inode->i_op = &cifs_ipc_inode_ops;
@@ -738,7 +870,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	 * server times.
 	 */
 	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
-		cFYI(1, ("CIFS - CTIME changed"));
+		cFYI(1, "CIFS - CTIME changed");
 		info_buf.ChangeTime =
 		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
 	} else
@@ -773,8 +905,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 		goto out;
 	}
 
-	cFYI(1, ("calling SetFileInfo since SetPathInfo for "
-		 "times not supported by this server"));
+	cFYI(1, "calling SetFileInfo since SetPathInfo for "
+		 "times not supported by this server");
 	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
 			 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
 			 CREATE_NOT_DIR, &netfid, &oplock,
@@ -932,7 +1064,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	struct iattr *attrs = NULL;
 	__u32 dosattr = 0, origattr = 0;
 
-	cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
+	cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
 
 	xid = GetXid();
 
@@ -951,7 +1083,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 		rc = CIFSPOSIXDelFile(xid, tcon, full_path,
 			SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-		cFYI(1, ("posix del rc %d", rc));
+		cFYI(1, "posix del rc %d", rc);
 		if ((rc == 0) || (rc == -ENOENT))
 			goto psx_del_no_retry;
 	}
@@ -1025,7 +1157,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	struct inode *newinode = NULL;
 	struct cifs_fattr fattr;
 
-	cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode));
+	cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
 
 	xid = GetXid();
 
@@ -1060,7 +1192,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			kfree(pInfo);
 			goto mkdir_retry_old;
 		} else if (rc) {
-			cFYI(1, ("posix mkdir returned 0x%x", rc));
+			cFYI(1, "posix mkdir returned 0x%x", rc);
 			d_drop(direntry);
 		} else {
 			if (pInfo->Type == cpu_to_le32(-1)) {
@@ -1077,6 +1209,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 				direntry->d_op = &cifs_dentry_ops;
 
 			cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
+			cifs_fill_uniqueid(inode->i_sb, &fattr);
 			newinode = cifs_iget(inode->i_sb, &fattr);
 			if (!newinode) {
 				kfree(pInfo);
@@ -1086,12 +1219,12 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			d_instantiate(direntry, newinode);
 
 #ifdef CONFIG_CIFS_DEBUG2
-			cFYI(1, ("instantiated dentry %p %s to inode %p",
-				direntry, direntry->d_name.name, newinode));
+			cFYI(1, "instantiated dentry %p %s to inode %p",
+				direntry, direntry->d_name.name, newinode);
 
 			if (newinode->i_nlink != 2)
-				cFYI(1, ("unexpected number of links %d",
-					newinode->i_nlink));
+				cFYI(1, "unexpected number of links %d",
+					newinode->i_nlink);
 #endif
 		}
 		kfree(pInfo);
@@ -1102,7 +1235,7 @@ mkdir_retry_old:
 	rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
-		cFYI(1, ("cifs_mkdir returned 0x%x", rc));
+		cFYI(1, "cifs_mkdir returned 0x%x", rc);
 		d_drop(direntry);
 	} else {
 mkdir_get_info:
@@ -1205,7 +1338,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	char *full_path = NULL;
 	struct cifsInodeInfo *cifsInode;
 
-	cFYI(1, ("cifs_rmdir, inode = 0x%p", inode));
+	cFYI(1, "cifs_rmdir, inode = 0x%p", inode);
 
 	xid = GetXid();
 
@@ -1389,135 +1522,108 @@ cifs_rename_exit:
 	return rc;
 }
 
-int cifs_revalidate(struct dentry *direntry)
+static bool
+cifs_inode_needs_reval(struct inode *inode)
 {
-	int xid;
-	int rc = 0, wbrc = 0;
-	char *full_path;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsInodeInfo *cifsInode;
-	loff_t local_size;
-	struct timespec local_mtime;
-	bool invalidate_inode = false;
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 
-	if (direntry->d_inode == NULL)
-		return -ENOENT;
+	if (cifs_i->clientCanCacheRead)
+		return false;
 
-	cifsInode = CIFS_I(direntry->d_inode);
+	if (!lookupCacheEnabled)
+		return true;
 
-	if (cifsInode == NULL)
-		return -ENOENT;
+	if (cifs_i->time == 0)
+		return true;
 
-	/* no sense revalidating inode info on file that no one can write */
-	if (CIFS_I(direntry->d_inode)->clientCanCacheRead)
-		return rc;
+	/* FIXME: the actimeo should be tunable */
+	if (time_after_eq(jiffies, cifs_i->time + HZ))
+		return true;
+
+	/* hardlinked files w/ noserverino get "special" treatment */
+	if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+	    S_ISREG(inode->i_mode) && inode->i_nlink != 1)
+		return true;
+
+	return false;
+}
+
+/* check invalid_mapping flag and zap the cache if it's set */
+static void
+cifs_invalidate_mapping(struct inode *inode)
+{
+	int rc;
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cifs_i->invalid_mapping = false;
+
+	/* write back any cached data */
+	if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
+		rc = filemap_write_and_wait(inode->i_mapping);
+		if (rc)
+			cifs_i->write_behind_rc = rc;
+	}
+	invalidate_remote_inode(inode);
+}
+
+int cifs_revalidate_file(struct file *filp)
+{
+	int rc = 0;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+	if (!cifs_inode_needs_reval(inode))
+		goto check_inval;
+
+	if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
+		rc = cifs_get_file_info_unix(filp);
+	else
+		rc = cifs_get_file_info(filp);
+
+check_inval:
+	if (CIFS_I(inode)->invalid_mapping)
+		cifs_invalidate_mapping(inode);
+
+	return rc;
+}
+
+/* revalidate a dentry's inode attributes */
+int cifs_revalidate_dentry(struct dentry *dentry)
+{
+	int xid;
+	int rc = 0;
+	char *full_path = NULL;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dentry->d_sb;
+
+	if (inode == NULL)
+		return -ENOENT;
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(direntry->d_sb);
+	if (!cifs_inode_needs_reval(inode))
+		goto check_inval;
 
 	/* can not safely grab the rename sem here if rename calls revalidate
 	   since that would deadlock */
-	full_path = build_path_from_dentry(direntry);
+	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
-	}
-	cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
-		 "jiffies %ld", full_path, direntry->d_inode,
-		 direntry->d_inode->i_count.counter, direntry,
-		 direntry->d_time, jiffies));
-
-	if (cifsInode->time == 0) {
-		/* was set to zero previously to force revalidate */
-	} else if (time_before(jiffies, cifsInode->time + HZ) &&
-		   lookupCacheEnabled) {
-		if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
-		    (direntry->d_inode->i_nlink == 1)) {
-			kfree(full_path);
-			FreeXid(xid);
-			return rc;
-		} else {
-			cFYI(1, ("Have to revalidate file due to hardlinks"));
-		}
+		goto check_inval;
 	}
 
-	/* save mtime and size */
-	local_mtime = direntry->d_inode->i_mtime;
-	local_size = direntry->d_inode->i_size;
+	cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
+		 "jiffies %ld", full_path, inode, inode->i_count.counter,
+		 dentry, dentry->d_time, jiffies);
 
-	if (cifs_sb->tcon->unix_ext) {
-		rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
-					      direntry->d_sb, xid);
-		if (rc) {
-			cFYI(1, ("error on getting revalidate info %d", rc));
+	if (CIFS_SB(sb)->tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
+	else
+		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
+					 xid, NULL);
-/*			if (rc != -ENOENT)
-				rc = 0; */	/* BB should we cache info on
-						   certain errors? */
-		}
-	} else {
-		rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
-					 direntry->d_sb, xid, NULL);
-		if (rc) {
-			cFYI(1, ("error on getting revalidate info %d", rc));
-/*			if (rc != -ENOENT)
-				rc = 0; */	/* BB should we cache info on
-						   certain errors? */
-		}
-	}
-	/* should we remap certain errors, access denied?, to zero */
-
-	/* if not oplocked, we invalidate inode pages if mtime or file size
-	   had changed on server */
-
-	if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) &&
-	    (local_size == direntry->d_inode->i_size)) {
-		cFYI(1, ("cifs_revalidate - inode unchanged"));
-	} else {
-		/* file may have changed on server */
-		if (cifsInode->clientCanCacheRead) {
-			/* no need to invalidate inode pages since we were the
1481 only ones who could have modified the file and the
1482 server copy is staler than ours */
1483 } else {
1484 invalidate_inode = true;
1485 }
1486 }
1487 1623
1488 /* can not grab this sem since kernel filesys locking documentation 1624check_inval:
1489 indicates i_mutex may be taken by the kernel on lookup and rename 1625 if (CIFS_I(inode)->invalid_mapping)
1490 which could deadlock if we grab the i_mutex here as well */ 1626 cifs_invalidate_mapping(inode);
1491/* mutex_lock(&direntry->d_inode->i_mutex);*/
1492 /* need to write out dirty pages here */
1493 if (direntry->d_inode->i_mapping) {
1494 /* do we need to lock inode until after invalidate completes
1495 below? */
1496 wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
1497 if (wbrc)
1498 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1499 }
1500 if (invalidate_inode) {
1501 /* shrink_dcache not necessary now that cifs dentry ops
1502 are exported for negative dentries */
1503/* if (S_ISDIR(direntry->d_inode->i_mode))
1504 shrink_dcache_parent(direntry); */
1505 if (S_ISREG(direntry->d_inode->i_mode)) {
1506 if (direntry->d_inode->i_mapping) {
1507 wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
1508 if (wbrc)
1509 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1510 }
1511 /* may eventually have to do this for open files too */
1512 if (list_empty(&(cifsInode->openFileList))) {
1513 /* changed on server - flush read ahead pages */
1514 cFYI(1, ("Invalidating read ahead data on "
1515 "closed file"));
1516 invalidate_remote_inode(direntry->d_inode);
1517 }
1518 }
1519 }
1520/* mutex_unlock(&direntry->d_inode->i_mutex); */
1521 1627
1522 kfree(full_path); 1628 kfree(full_path);
1523 FreeXid(xid); 1629 FreeXid(xid);
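This hunk replaces the open-coded cifs_revalidate() with three pieces: cifs_inode_needs_reval() decides whether cached attributes can still be trusted, cifs_invalidate_mapping() writes back and drops cached pages when the invalid_mapping flag is set, and cifs_revalidate_file()/cifs_revalidate_dentry() drive the two for file- and dentry-based callers. The staleness test is a plain jiffies timeout of HZ (one second; the FIXME notes it should become a tunable actimeo). A minimal stand-alone sketch of that expiry idiom, with illustrative names:

#include <linux/types.h>
#include <linux/jiffies.h>

struct attr_cache {
	unsigned long refreshed;	/* jiffies at last fetch; 0 = force reval */
};

static bool attr_cache_expired(const struct attr_cache *c)
{
	if (c->refreshed == 0)
		return true;
	/* time_after_eq() copes with jiffies wraparound */
	return time_after_eq(jiffies, c->refreshed + HZ);
}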
@@ -1527,7 +1633,7 @@ int cifs_revalidate(struct dentry *direntry)
1527int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1633int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1528 struct kstat *stat) 1634 struct kstat *stat)
1529{ 1635{
1530 int err = cifs_revalidate(dentry); 1636 int err = cifs_revalidate_dentry(dentry);
1531 if (!err) { 1637 if (!err) {
1532 generic_fillattr(dentry->d_inode, stat); 1638 generic_fillattr(dentry->d_inode, stat);
1533 stat->blksize = CIFS_MAX_MSGSIZE; 1639 stat->blksize = CIFS_MAX_MSGSIZE;
@@ -1601,12 +1707,12 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1601 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1707 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1602 npid, false); 1708 npid, false);
1603 cifsFileInfo_put(open_file); 1709 cifsFileInfo_put(open_file);
1604 cFYI(1, ("SetFSize for attrs rc = %d", rc)); 1710 cFYI(1, "SetFSize for attrs rc = %d", rc);
1605 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1711 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1606 unsigned int bytes_written; 1712 unsigned int bytes_written;
1607 rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size, 1713 rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
1608 &bytes_written, NULL, NULL, 1); 1714 &bytes_written, NULL, NULL, 1);
1609 cFYI(1, ("Wrt seteof rc %d", rc)); 1715 cFYI(1, "Wrt seteof rc %d", rc);
1610 } 1716 }
1611 } else 1717 } else
1612 rc = -EINVAL; 1718 rc = -EINVAL;
@@ -1620,7 +1726,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1620 false, cifs_sb->local_nls, 1726 false, cifs_sb->local_nls,
1621 cifs_sb->mnt_cifs_flags & 1727 cifs_sb->mnt_cifs_flags &
1622 CIFS_MOUNT_MAP_SPECIAL_CHR); 1728 CIFS_MOUNT_MAP_SPECIAL_CHR);
1623 cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); 1729 cFYI(1, "SetEOF by path (setattrs) rc = %d", rc);
1624 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1730 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1625 __u16 netfid; 1731 __u16 netfid;
1626 int oplock = 0; 1732 int oplock = 0;
@@ -1637,7 +1743,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1637 attrs->ia_size, 1743 attrs->ia_size,
1638 &bytes_written, NULL, 1744 &bytes_written, NULL,
1639 NULL, 1); 1745 NULL, 1);
1640 cFYI(1, ("wrt seteof rc %d", rc)); 1746 cFYI(1, "wrt seteof rc %d", rc);
1641 CIFSSMBClose(xid, pTcon, netfid); 1747 CIFSSMBClose(xid, pTcon, netfid);
1642 } 1748 }
1643 } 1749 }
@@ -1665,8 +1771,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1665 struct cifs_unix_set_info_args *args = NULL; 1771 struct cifs_unix_set_info_args *args = NULL;
1666 struct cifsFileInfo *open_file; 1772 struct cifsFileInfo *open_file;
1667 1773
1668 cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x", 1774 cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x",
1669 direntry->d_name.name, attrs->ia_valid)); 1775 direntry->d_name.name, attrs->ia_valid);
1670 1776
1671 xid = GetXid(); 1777 xid = GetXid();
1672 1778
@@ -1796,8 +1902,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1796 1902
1797 xid = GetXid(); 1903 xid = GetXid();
1798 1904
 1799 cFYI(1, ("setattr on file %s attrs->ia_valid 0x%x", 1905 cFYI(1, "setattr on file %s attrs->ia_valid 0x%x",
1800 direntry->d_name.name, attrs->ia_valid)); 1906 direntry->d_name.name, attrs->ia_valid);
1801 1907
1802 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { 1908 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
1803 /* check if we have permission to change attrs */ 1909 /* check if we have permission to change attrs */
@@ -1854,7 +1960,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1854 attrs->ia_valid &= ~ATTR_MODE; 1960 attrs->ia_valid &= ~ATTR_MODE;
1855 1961
1856 if (attrs->ia_valid & ATTR_MODE) { 1962 if (attrs->ia_valid & ATTR_MODE) {
1857 cFYI(1, ("Mode changed to 0%o", attrs->ia_mode)); 1963 cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
1858 mode = attrs->ia_mode; 1964 mode = attrs->ia_mode;
1859 } 1965 }
1860 1966
@@ -1940,7 +2046,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
1940#if 0 2046#if 0
1941void cifs_delete_inode(struct inode *inode) 2047void cifs_delete_inode(struct inode *inode)
1942{ 2048{
1943 cFYI(1, ("In cifs_delete_inode, inode = 0x%p", inode)); 2049 cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode);
1944 /* may have to add back in if and when safe distributed caching of 2050 /* may have to add back in if and when safe distributed caching of
1945 directories added e.g. via FindNotify */ 2051 directories added e.g. via FindNotify */
1946} 2052}
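Most hunks in this patch are one mechanical conversion: cFYI(1, ("fmt", args)) becomes cFYI(1, "fmt", args), and cERROR likewise loses its inner parentheses. The old style wrapped the whole printk argument list in parentheses so a single-argument macro could forward it verbatim; the new style uses a variadic macro instead, which also lets the macro append the log level and trailing newline itself. The real definitions live in fs/cifs/cifs_debug.h; the pair below is a simplified stand-in to show why every call site changes shape:

#include <linux/kernel.h>

/* old style: one macro argument, caller supplies the extra parentheses */
#define cFYI_OLD(set, args)						\
do {									\
	if (set)							\
		printk args;						\
} while (0)

/* new style: variadic, call sites drop the inner parentheses */
#define cFYI_NEW(set, fmt, ...)						\
do {									\
	if (set)							\
		printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__);		\
} while (0)

/* cFYI_OLD(1, (KERN_DEBUG "rc %d\n", rc));  vs.  cFYI_NEW(1, "rc %d", rc); */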
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index f94650683a00..505926f1ee6b 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -47,7 +47,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
47 47
48 xid = GetXid(); 48 xid = GetXid();
49 49
50 cFYI(1, ("ioctl file %p cmd %u arg %lu", filep, command, arg)); 50 cFYI(1, "ioctl file %p cmd %u arg %lu", filep, command, arg);
51 51
52 cifs_sb = CIFS_SB(inode->i_sb); 52 cifs_sb = CIFS_SB(inode->i_sb);
53 53
@@ -64,12 +64,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
64 64
65 switch (command) { 65 switch (command) {
66 case CIFS_IOC_CHECKUMOUNT: 66 case CIFS_IOC_CHECKUMOUNT:
67 cFYI(1, ("User unmount attempted")); 67 cFYI(1, "User unmount attempted");
68 if (cifs_sb->mnt_uid == current_uid()) 68 if (cifs_sb->mnt_uid == current_uid())
69 rc = 0; 69 rc = 0;
70 else { 70 else {
71 rc = -EACCES; 71 rc = -EACCES;
72 cFYI(1, ("uids do not match")); 72 cFYI(1, "uids do not match");
73 } 73 }
74 break; 74 break;
75#ifdef CONFIG_CIFS_POSIX 75#ifdef CONFIG_CIFS_POSIX
@@ -97,11 +97,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
97 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 97 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
98 extAttrBits, &ExtAttrMask);*/ 98 extAttrBits, &ExtAttrMask);*/
99 } 99 }
100 cFYI(1, ("set flags not implemented yet")); 100 cFYI(1, "set flags not implemented yet");
101 break; 101 break;
102#endif /* CONFIG_CIFS_POSIX */ 102#endif /* CONFIG_CIFS_POSIX */
103 default: 103 default:
104 cFYI(1, ("unsupported ioctl")); 104 cFYI(1, "unsupported ioctl");
105 break; 105 break;
106 } 106 }
107 107
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fc1e0487eaee..473ca8033656 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/stat.h> 22#include <linux/stat.h>
23#include <linux/slab.h>
23#include <linux/namei.h> 24#include <linux/namei.h>
24#include "cifsfs.h" 25#include "cifsfs.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
@@ -138,7 +139,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
138 if (!full_path) 139 if (!full_path)
139 goto out; 140 goto out;
140 141
141 cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode)); 142 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
142 143
143 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, 144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
144 cifs_sb->local_nls); 145 cifs_sb->local_nls);
@@ -177,8 +178,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
177 return rc; 178 return rc;
178 } 179 }
179 180
180 cFYI(1, ("Full path: %s", full_path)); 181 cFYI(1, "Full path: %s", full_path);
181 cFYI(1, ("symname is %s", symname)); 182 cFYI(1, "symname is %s", symname);
182 183
183 /* BB what if DFS and this volume is on different share? BB */ 184 /* BB what if DFS and this volume is on different share? BB */
184 if (pTcon->unix_ext) 185 if (pTcon->unix_ext)
@@ -197,8 +198,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
197 inode->i_sb, xid, NULL); 198 inode->i_sb, xid, NULL);
198 199
199 if (rc != 0) { 200 if (rc != 0) {
200 cFYI(1, ("Create symlink ok, getinodeinfo fail rc = %d", 201 cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
201 rc)); 202 rc);
202 } else { 203 } else {
203 if (pTcon->nocase) 204 if (pTcon->nocase)
204 direntry->d_op = &cifs_ci_dentry_ops; 205 direntry->d_op = &cifs_ci_dentry_ops;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d1474996a812..1394aa37f26c 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -51,7 +51,7 @@ _GetXid(void)
51 if (GlobalTotalActiveXid > GlobalMaxActiveXid) 51 if (GlobalTotalActiveXid > GlobalMaxActiveXid)
52 GlobalMaxActiveXid = GlobalTotalActiveXid; 52 GlobalMaxActiveXid = GlobalTotalActiveXid;
53 if (GlobalTotalActiveXid > 65000) 53 if (GlobalTotalActiveXid > 65000)
54 cFYI(1, ("warning: more than 65000 requests active")); 54 cFYI(1, "warning: more than 65000 requests active");
55 xid = GlobalCurrentXid++; 55 xid = GlobalCurrentXid++;
56 spin_unlock(&GlobalMid_Lock); 56 spin_unlock(&GlobalMid_Lock);
57 return xid; 57 return xid;
@@ -88,7 +88,7 @@ void
88sesInfoFree(struct cifsSesInfo *buf_to_free) 88sesInfoFree(struct cifsSesInfo *buf_to_free)
89{ 89{
90 if (buf_to_free == NULL) { 90 if (buf_to_free == NULL) {
91 cFYI(1, ("Null buffer passed to sesInfoFree")); 91 cFYI(1, "Null buffer passed to sesInfoFree");
92 return; 92 return;
93 } 93 }
94 94
@@ -126,7 +126,7 @@ void
126tconInfoFree(struct cifsTconInfo *buf_to_free) 126tconInfoFree(struct cifsTconInfo *buf_to_free)
127{ 127{
128 if (buf_to_free == NULL) { 128 if (buf_to_free == NULL) {
129 cFYI(1, ("Null buffer passed to tconInfoFree")); 129 cFYI(1, "Null buffer passed to tconInfoFree");
130 return; 130 return;
131 } 131 }
132 atomic_dec(&tconInfoAllocCount); 132 atomic_dec(&tconInfoAllocCount);
@@ -166,7 +166,7 @@ void
166cifs_buf_release(void *buf_to_free) 166cifs_buf_release(void *buf_to_free)
167{ 167{
168 if (buf_to_free == NULL) { 168 if (buf_to_free == NULL) {
169 /* cFYI(1, ("Null buffer passed to cifs_buf_release"));*/ 169 /* cFYI(1, "Null buffer passed to cifs_buf_release");*/
170 return; 170 return;
171 } 171 }
172 mempool_free(buf_to_free, cifs_req_poolp); 172 mempool_free(buf_to_free, cifs_req_poolp);
@@ -202,7 +202,7 @@ cifs_small_buf_release(void *buf_to_free)
202{ 202{
203 203
204 if (buf_to_free == NULL) { 204 if (buf_to_free == NULL) {
205 cFYI(1, ("Null buffer passed to cifs_small_buf_release")); 205 cFYI(1, "Null buffer passed to cifs_small_buf_release");
206 return; 206 return;
207 } 207 }
208 mempool_free(buf_to_free, cifs_sm_req_poolp); 208 mempool_free(buf_to_free, cifs_sm_req_poolp);
@@ -345,19 +345,19 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
345 /* with userid/password pairs found on the smb session */ 345 /* with userid/password pairs found on the smb session */
346 /* for other target tcp/ip addresses BB */ 346 /* for other target tcp/ip addresses BB */
347 if (current_fsuid() != treeCon->ses->linux_uid) { 347 if (current_fsuid() != treeCon->ses->linux_uid) {
348 cFYI(1, ("Multiuser mode and UID " 348 cFYI(1, "Multiuser mode and UID "
349 "did not match tcon uid")); 349 "did not match tcon uid");
350 read_lock(&cifs_tcp_ses_lock); 350 read_lock(&cifs_tcp_ses_lock);
351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
353 if (ses->linux_uid == current_fsuid()) { 353 if (ses->linux_uid == current_fsuid()) {
354 if (ses->server == treeCon->ses->server) { 354 if (ses->server == treeCon->ses->server) {
355 cFYI(1, ("found matching uid substitute right smb_uid")); 355 cFYI(1, "found matching uid substitute right smb_uid");
356 buffer->Uid = ses->Suid; 356 buffer->Uid = ses->Suid;
357 break; 357 break;
358 } else { 358 } else {
359 /* BB eventually call cifs_setup_session here */ 359 /* BB eventually call cifs_setup_session here */
360 cFYI(1, ("local UID found but no smb sess with this server exists")); 360 cFYI(1, "local UID found but no smb sess with this server exists");
361 } 361 }
362 } 362 }
363 } 363 }
@@ -394,17 +394,16 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
394 if (smb->Command == SMB_COM_LOCKING_ANDX) 394 if (smb->Command == SMB_COM_LOCKING_ANDX)
395 return 0; 395 return 0;
396 else 396 else
397 cERROR(1, ("Received Request not response")); 397 cERROR(1, "Received Request not response");
398 } 398 }
399 } else { /* bad signature or mid */ 399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) 400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, 401 cERROR(1, "Bad protocol string signature header %x",
402 ("Bad protocol string signature header %x", 402 *(unsigned int *) smb->Protocol);
403 *(unsigned int *) smb->Protocol));
404 if (mid != smb->Mid) 403 if (mid != smb->Mid)
405 cERROR(1, ("Mids do not match")); 404 cERROR(1, "Mids do not match");
406 } 405 }
407 cERROR(1, ("bad smb detected. The Mid=%d", smb->Mid)); 406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid);
408 return 1; 407 return 1;
409} 408}
410 409
@@ -413,7 +412,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
413{ 412{
414 __u32 len = smb->smb_buf_length; 413 __u32 len = smb->smb_buf_length;
415 __u32 clc_len; /* calculated length */ 414 __u32 clc_len; /* calculated length */
416 cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len)); 415 cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
417 416
418 if (length < 2 + sizeof(struct smb_hdr)) { 417 if (length < 2 + sizeof(struct smb_hdr)) {
419 if ((length >= sizeof(struct smb_hdr) - 1) 418 if ((length >= sizeof(struct smb_hdr) - 1)
@@ -437,15 +436,15 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
437 tmp[sizeof(struct smb_hdr)+1] = 0; 436 tmp[sizeof(struct smb_hdr)+1] = 0;
438 return 0; 437 return 0;
439 } 438 }
440 cERROR(1, ("rcvd invalid byte count (bcc)")); 439 cERROR(1, "rcvd invalid byte count (bcc)");
441 } else { 440 } else {
442 cERROR(1, ("Length less than smb header size")); 441 cERROR(1, "Length less than smb header size");
443 } 442 }
444 return 1; 443 return 1;
445 } 444 }
446 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 445 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
447 cERROR(1, ("smb length greater than MaxBufSize, mid=%d", 446 cERROR(1, "smb length greater than MaxBufSize, mid=%d",
448 smb->Mid)); 447 smb->Mid);
449 return 1; 448 return 1;
450 } 449 }
451 450
@@ -454,8 +453,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
454 clc_len = smbCalcSize_LE(smb); 453 clc_len = smbCalcSize_LE(smb);
455 454
456 if (4 + len != length) { 455 if (4 + len != length) {
457 cERROR(1, ("Length read does not match RFC1001 length %d", 456 cERROR(1, "Length read does not match RFC1001 length %d",
458 len)); 457 len);
459 return 1; 458 return 1;
460 } 459 }
461 460
@@ -466,8 +465,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
466 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
467 return 0; /* bcc wrapped */ 466 return 0; /* bcc wrapped */
468 } 467 }
469 cFYI(1, ("Calculated size %d vs length %d mismatch for mid %d", 468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d",
470 clc_len, 4 + len, smb->Mid)); 469 clc_len, 4 + len, smb->Mid);
471 /* Windows XP can return a few bytes too much, presumably 470 /* Windows XP can return a few bytes too much, presumably
472 an illegal pad, at the end of byte range lock responses 471 an illegal pad, at the end of byte range lock responses
473 so we allow for that three byte pad, as long as actual 472 so we allow for that three byte pad, as long as actual
@@ -482,8 +481,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
482 if ((4+len > clc_len) && (len <= clc_len + 512)) 481 if ((4+len > clc_len) && (len <= clc_len + 512))
483 return 0; 482 return 0;
484 else { 483 else {
485 cERROR(1, ("RFC1001 size %d bigger than SMB for Mid=%d", 484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
486 len, smb->Mid)); 485 len, smb->Mid);
487 return 1; 486 return 1;
488 } 487 }
489 } 488 }
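checkSMBhdr() and checkSMB() above validate two layers of framing: the SMB signature, whose first four bytes are 0xFF 'S' 'M' 'B' and read as the little-endian word 0x424d53ff, and the 4-byte RFC1001 length prefix, which must account for the bytes actually received (with a few tolerance cases for buggy servers that the sketch below omits). Simplified, assuming the smb_hdr layout from cifspdu.h:

static int smb_frame_ok(const struct smb_hdr *smb, unsigned int rcvd_len)
{
	/* "\xffSMB" viewed as one little-endian 32-bit word */
	if (*(__le32 *)smb->Protocol != cpu_to_le32(0x424d53ff))
		return 0;
	/* 4-byte RFC1001 header, then smb_buf_length bytes of SMB payload */
	if (4 + smb->smb_buf_length != rcvd_len)
		return 0;
	return 1;
}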
@@ -501,7 +500,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
501 struct cifsFileInfo *netfile; 500 struct cifsFileInfo *netfile;
502 int rc; 501 int rc;
503 502
504 cFYI(1, ("Checking for oplock break or dnotify response")); 503 cFYI(1, "Checking for oplock break or dnotify response");
505 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && 504 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
506 (pSMB->hdr.Flags & SMBFLG_RESPONSE)) { 505 (pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
507 struct smb_com_transaction_change_notify_rsp *pSMBr = 506 struct smb_com_transaction_change_notify_rsp *pSMBr =
@@ -513,15 +512,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
513 512
514 pnotify = (struct file_notify_information *) 513 pnotify = (struct file_notify_information *)
515 ((char *)&pSMBr->hdr.Protocol + data_offset); 514 ((char *)&pSMBr->hdr.Protocol + data_offset);
516 cFYI(1, ("dnotify on %s Action: 0x%x", 515 cFYI(1, "dnotify on %s Action: 0x%x",
517 pnotify->FileName, pnotify->Action)); 516 pnotify->FileName, pnotify->Action);
518 /* cifs_dump_mem("Rcvd notify Data: ",buf, 517 /* cifs_dump_mem("Rcvd notify Data: ",buf,
519 sizeof(struct smb_hdr)+60); */ 518 sizeof(struct smb_hdr)+60); */
520 return true; 519 return true;
521 } 520 }
522 if (pSMBr->hdr.Status.CifsError) { 521 if (pSMBr->hdr.Status.CifsError) {
523 cFYI(1, ("notify err 0x%d", 522 cFYI(1, "notify err 0x%d",
524 pSMBr->hdr.Status.CifsError)); 523 pSMBr->hdr.Status.CifsError);
525 return true; 524 return true;
526 } 525 }
527 return false; 526 return false;
@@ -535,7 +534,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
535 large dirty files cached on the client */ 534 large dirty files cached on the client */
536 if ((NT_STATUS_INVALID_HANDLE) == 535 if ((NT_STATUS_INVALID_HANDLE) ==
537 le32_to_cpu(pSMB->hdr.Status.CifsError)) { 536 le32_to_cpu(pSMB->hdr.Status.CifsError)) {
538 cFYI(1, ("invalid handle on oplock break")); 537 cFYI(1, "invalid handle on oplock break");
539 return true; 538 return true;
540 } else if (ERRbadfid == 539 } else if (ERRbadfid ==
541 le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { 540 le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
@@ -547,8 +546,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
547 if (pSMB->hdr.WordCount != 8) 546 if (pSMB->hdr.WordCount != 8)
548 return false; 547 return false;
549 548
550 cFYI(1, ("oplock type 0x%d level 0x%d", 549 cFYI(1, "oplock type 0x%d level 0x%d",
551 pSMB->LockType, pSMB->OplockLevel)); 550 pSMB->LockType, pSMB->OplockLevel);
552 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) 551 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
553 return false; 552 return false;
554 553
@@ -579,15 +578,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
579 return true; 578 return true;
580 } 579 }
581 580
582 cFYI(1, ("file id match, oplock break")); 581 cFYI(1, "file id match, oplock break");
583 pCifsInode = CIFS_I(netfile->pInode); 582 pCifsInode = CIFS_I(netfile->pInode);
584 pCifsInode->clientCanCacheAll = false; 583 pCifsInode->clientCanCacheAll = false;
585 if (pSMB->OplockLevel == 0) 584 if (pSMB->OplockLevel == 0)
586 pCifsInode->clientCanCacheRead = false; 585 pCifsInode->clientCanCacheRead = false;
587 rc = slow_work_enqueue(&netfile->oplock_break); 586 rc = slow_work_enqueue(&netfile->oplock_break);
588 if (rc) { 587 if (rc) {
589 cERROR(1, ("failed to enqueue oplock " 588 cERROR(1, "failed to enqueue oplock "
590 "break: %d\n", rc)); 589 "break: %d\n", rc);
591 } else { 590 } else {
592 netfile->oplock_break_cancelled = false; 591 netfile->oplock_break_cancelled = false;
593 } 592 }
@@ -597,12 +596,12 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
597 } 596 }
598 read_unlock(&GlobalSMBSeslock); 597 read_unlock(&GlobalSMBSeslock);
599 read_unlock(&cifs_tcp_ses_lock); 598 read_unlock(&cifs_tcp_ses_lock);
600 cFYI(1, ("No matching file for oplock break")); 599 cFYI(1, "No matching file for oplock break");
601 return true; 600 return true;
602 } 601 }
603 } 602 }
604 read_unlock(&cifs_tcp_ses_lock); 603 read_unlock(&cifs_tcp_ses_lock);
605 cFYI(1, ("Can not process oplock break for non-existent connection")); 604 cFYI(1, "Can not process oplock break for non-existent connection");
606 return true; 605 return true;
607} 606}
608 607
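The oplock-break path above downgrades the client's caching rights in two steps: any break revokes write caching (clientCanCacheAll), and a break all the way to "none" (OplockLevel == 0, as opposed to a downgrade to level II) revokes read caching too, before the acknowledgement work is queued via slow_work_enqueue(). The same logic, isolated; field names follow the cifsInodeInfo flags used in the hunk:

/* assumes struct cifsInodeInfo from fs/cifs/cifsglob.h */
static void downgrade_oplock_caching(struct cifsInodeInfo *ci,
				     unsigned char oplock_level)
{
	ci->clientCanCacheAll = false;		/* write caching always lost */
	if (oplock_level == 0)
		ci->clientCanCacheRead = false;	/* broken to "none" */
}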
@@ -721,11 +720,11 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
721{ 720{
722 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 721 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
723 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 722 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
724 cERROR(1, ("Autodisabling the use of server inode numbers on " 723 cERROR(1, "Autodisabling the use of server inode numbers on "
725 "%s. This server doesn't seem to support them " 724 "%s. This server doesn't seem to support them "
726 "properly. Hardlinks will not be recognized on this " 725 "properly. Hardlinks will not be recognized on this "
727 "mount. Consider mounting with the \"noserverino\" " 726 "mount. Consider mounting with the \"noserverino\" "
728 "option to silence this message.", 727 "option to silence this message.",
729 cifs_sb->tcon->treeName)); 728 cifs_sb->tcon->treeName);
730 } 729 }
731} 730}
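cifs_autodisable_serverino() fires when server-supplied inode numbers turn out to be unusable. With CIFS_MOUNT_SERVER_INUM the client derives i_ino from the server's UniqueId, so hardlinked names resolve to one inode; once autodisabled, inode numbers must be generated locally and hardlinks can no longer be recognized — which is also why cifs_inode_needs_reval() gives hardlinked regular files special treatment under noserverino. A rough sketch of the fallback, assuming iunique() as the local generator (the literal 2 stands in for CIFS's reserved low inode numbers):

#include <linux/fs.h>

static ino_t pick_ino(struct super_block *sb, bool server_inum_usable,
		      u64 server_unique_id)
{
	if (server_inum_usable)
		return (ino_t)server_unique_id;	/* stable across hardlinks */
	return iunique(sb, 2);			/* locally unique only */
}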
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index bd6d6895730d..d35d52889cb5 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -149,7 +149,7 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
149 else if (address_family == AF_INET6) 149 else if (address_family == AF_INET6)
150 ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL); 150 ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
151 151
152 cFYI(DBG2, ("address conversion returned %d for %s", ret, cp)); 152 cFYI(DBG2, "address conversion returned %d for %s", ret, cp);
153 if (ret > 0) 153 if (ret > 0)
154 ret = 1; 154 ret = 1;
155 return ret; 155 return ret;
@@ -870,8 +870,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
870 } 870 }
871 /* else ERRHRD class errors or junk - return EIO */ 871 /* else ERRHRD class errors or junk - return EIO */
872 872
873 cFYI(1, ("Mapping smb error code %d to POSIX err %d", 873 cFYI(1, "Mapping smb error code %d to POSIX err %d",
874 smberrcode, rc)); 874 smberrcode, rc);
875 875
876 /* generic corrective action e.g. reconnect SMB session on 876 /* generic corrective action e.g. reconnect SMB session on
877 * ERRbaduid could be added */ 877 * ERRbaduid could be added */
@@ -940,20 +940,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
940 SMB_TIME *st = (SMB_TIME *)&time; 940 SMB_TIME *st = (SMB_TIME *)&time;
941 SMB_DATE *sd = (SMB_DATE *)&date; 941 SMB_DATE *sd = (SMB_DATE *)&date;
942 942
943 cFYI(1, ("date %d time %d", date, time)); 943 cFYI(1, "date %d time %d", date, time);
944 944
945 sec = 2 * st->TwoSeconds; 945 sec = 2 * st->TwoSeconds;
946 min = st->Minutes; 946 min = st->Minutes;
947 if ((sec > 59) || (min > 59)) 947 if ((sec > 59) || (min > 59))
948 cERROR(1, ("illegal time min %d sec %d", min, sec)); 948 cERROR(1, "illegal time min %d sec %d", min, sec);
949 sec += (min * 60); 949 sec += (min * 60);
950 sec += 60 * 60 * st->Hours; 950 sec += 60 * 60 * st->Hours;
951 if (st->Hours > 24) 951 if (st->Hours > 24)
952 cERROR(1, ("illegal hours %d", st->Hours)); 952 cERROR(1, "illegal hours %d", st->Hours);
953 days = sd->Day; 953 days = sd->Day;
954 month = sd->Month; 954 month = sd->Month;
955 if ((days > 31) || (month > 12)) { 955 if ((days > 31) || (month > 12)) {
956 cERROR(1, ("illegal date, month %d day: %d", month, days)); 956 cERROR(1, "illegal date, month %d day: %d", month, days);
957 if (month > 12) 957 if (month > 12)
958 month = 12; 958 month = 12;
959 } 959 }
@@ -979,7 +979,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
979 979
980 ts.tv_sec = sec + offset; 980 ts.tv_sec = sec + offset;
981 981
982 /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ 982 /* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */
983 983
984 ts.tv_nsec = 0; 984 ts.tv_nsec = 0;
985 return ts; 985 return ts;
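cnvrtDosUnixTm() above unpacks the classic 16-bit DOS (FAT) date and time words through the SMB_TIME/SMB_DATE bitfields: seconds are stored halved in the low five bits of the time word, and the year counts from 1980 in the top seven bits of the date word — hence the range checks on minutes, hours, days and months. An equivalent stand-alone decoder using shifts instead of bitfields:

#include <linux/types.h>

struct dos_tm {
	unsigned int sec, min, hour;	/* from the time word */
	unsigned int day, mon, year;	/* from the date word; year is A.D. */
};

static struct dos_tm decode_dos_tm(u16 time, u16 date)
{
	struct dos_tm t;

	t.sec  = (time & 0x1f) * 2;		/* bits 0-4: 2-second units */
	t.min  = (time >> 5) & 0x3f;		/* bits 5-10 */
	t.hour = (time >> 11) & 0x1f;		/* bits 11-15 */
	t.day  = date & 0x1f;			/* bits 0-4 */
	t.mon  = (date >> 5) & 0x0f;		/* bits 5-8 */
	t.year = ((date >> 9) & 0x7f) + 1980;	/* bits 9-15 */
	return t;
}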
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c343b14ba2d3..daf1753af674 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -22,6 +22,7 @@
22 */ 22 */
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/pagemap.h> 24#include <linux/pagemap.h>
25#include <linux/slab.h>
25#include <linux/stat.h> 26#include <linux/stat.h>
26#include "cifspdu.h" 27#include "cifspdu.h"
27#include "cifsglob.h" 28#include "cifsglob.h"
@@ -46,15 +47,15 @@ static void dump_cifs_file_struct(struct file *file, char *label)
46 if (file) { 47 if (file) {
47 cf = file->private_data; 48 cf = file->private_data;
48 if (cf == NULL) { 49 if (cf == NULL) {
49 cFYI(1, ("empty cifs private file data")); 50 cFYI(1, "empty cifs private file data");
50 return; 51 return;
51 } 52 }
52 if (cf->invalidHandle) 53 if (cf->invalidHandle)
53 cFYI(1, ("invalid handle")); 54 cFYI(1, "invalid handle");
54 if (cf->srch_inf.endOfSearch) 55 if (cf->srch_inf.endOfSearch)
55 cFYI(1, ("end of search")); 56 cFYI(1, "end of search");
56 if (cf->srch_inf.emptyDir) 57 if (cf->srch_inf.emptyDir)
57 cFYI(1, ("empty dir")); 58 cFYI(1, "empty dir");
58 } 59 }
59} 60}
60#else 61#else
@@ -75,7 +76,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
75 struct inode *inode; 76 struct inode *inode;
76 struct super_block *sb = parent->d_inode->i_sb; 77 struct super_block *sb = parent->d_inode->i_sb;
77 78
78 cFYI(1, ("For %s", name->name)); 79 cFYI(1, "For %s", name->name);
79 80
80 if (parent->d_op && parent->d_op->d_hash) 81 if (parent->d_op && parent->d_op->d_hash)
81 parent->d_op->d_hash(parent, name); 82 parent->d_op->d_hash(parent, name);
@@ -213,7 +214,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
213 fid, 214 fid,
214 cifs_sb->local_nls); 215 cifs_sb->local_nls);
215 if (CIFSSMBClose(xid, ptcon, fid)) { 216 if (CIFSSMBClose(xid, ptcon, fid)) {
216 cFYI(1, ("Error closing temporary reparsepoint open)")); 217 cFYI(1, "Error closing temporary reparsepoint open");
217 } 218 }
218 } 219 }
219} 220}
@@ -251,7 +252,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
251 if (full_path == NULL) 252 if (full_path == NULL)
252 return -ENOMEM; 253 return -ENOMEM;
253 254
254 cFYI(1, ("Full path: %s start at: %lld", full_path, file->f_pos)); 255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
255 256
256ffirst_retry: 257ffirst_retry:
257 /* test for Unix extensions */ 258 /* test for Unix extensions */
@@ -296,7 +297,7 @@ static int cifs_unicode_bytelen(char *str)
296 if (ustr[len] == 0) 297 if (ustr[len] == 0)
297 return len << 1; 298 return len << 1;
298 } 299 }
299 cFYI(1, ("Unicode string longer than PATH_MAX found")); 300 cFYI(1, "Unicode string longer than PATH_MAX found");
300 return len << 1; 301 return len << 1;
301} 302}
302 303
@@ -313,19 +314,18 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
313 pfData->FileNameLength; 314 pfData->FileNameLength;
314 } else 315 } else
315 new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); 316 new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
316 cFYI(1, ("new entry %p old entry %p", new_entry, old_entry)); 317 cFYI(1, "new entry %p old entry %p", new_entry, old_entry);
317 /* validate that new_entry is not past end of SMB */ 318 /* validate that new_entry is not past end of SMB */
318 if (new_entry >= end_of_smb) { 319 if (new_entry >= end_of_smb) {
319 cERROR(1, 320 cERROR(1, "search entry %p began after end of SMB %p old entry %p",
320 ("search entry %p began after end of SMB %p old entry %p", 321 new_entry, end_of_smb, old_entry);
321 new_entry, end_of_smb, old_entry));
322 return NULL; 322 return NULL;
323 } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && 323 } else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
324 (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) 324 (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
325 || ((level != SMB_FIND_FILE_INFO_STANDARD) && 325 || ((level != SMB_FIND_FILE_INFO_STANDARD) &&
326 (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { 326 (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) {
327 cERROR(1, ("search entry %p extends after end of SMB %p", 327 cERROR(1, "search entry %p extends after end of SMB %p",
328 new_entry, end_of_smb)); 328 new_entry, end_of_smb);
329 return NULL; 329 return NULL;
330 } else 330 } else
331 return new_entry; 331 return new_entry;
@@ -379,8 +379,8 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
379 filename = &pFindData->FileName[0]; 379 filename = &pFindData->FileName[0];
380 len = pFindData->FileNameLength; 380 len = pFindData->FileNameLength;
381 } else { 381 } else {
382 cFYI(1, ("Unknown findfirst level %d", 382 cFYI(1, "Unknown findfirst level %d",
383 cfile->srch_inf.info_level)); 383 cfile->srch_inf.info_level);
384 } 384 }
385 385
386 if (filename) { 386 if (filename) {
@@ -480,7 +480,7 @@ static int cifs_save_resume_key(const char *current_entry,
480 len = (unsigned int)pFindData->FileNameLength; 480 len = (unsigned int)pFindData->FileNameLength;
481 cifsFile->srch_inf.resume_key = pFindData->ResumeKey; 481 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
482 } else { 482 } else {
483 cFYI(1, ("Unknown findfirst level %d", level)); 483 cFYI(1, "Unknown findfirst level %d", level);
484 return -EINVAL; 484 return -EINVAL;
485 } 485 }
486 cifsFile->srch_inf.resume_name_len = len; 486 cifsFile->srch_inf.resume_name_len = len;
@@ -524,7 +524,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
524 is_dir_changed(file)) || 524 is_dir_changed(file)) ||
525 (index_to_find < first_entry_in_buffer)) { 525 (index_to_find < first_entry_in_buffer)) {
526 /* close and restart search */ 526 /* close and restart search */
527 cFYI(1, ("search backing up - close and restart search")); 527 cFYI(1, "search backing up - close and restart search");
528 write_lock(&GlobalSMBSeslock); 528 write_lock(&GlobalSMBSeslock);
529 if (!cifsFile->srch_inf.endOfSearch && 529 if (!cifsFile->srch_inf.endOfSearch &&
530 !cifsFile->invalidHandle) { 530 !cifsFile->invalidHandle) {
@@ -534,7 +534,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
534 } else 534 } else
535 write_unlock(&GlobalSMBSeslock); 535 write_unlock(&GlobalSMBSeslock);
536 if (cifsFile->srch_inf.ntwrk_buf_start) { 536 if (cifsFile->srch_inf.ntwrk_buf_start) {
537 cFYI(1, ("freeing SMB ff cache buf on search rewind")); 537 cFYI(1, "freeing SMB ff cache buf on search rewind");
538 if (cifsFile->srch_inf.smallBuf) 538 if (cifsFile->srch_inf.smallBuf)
539 cifs_small_buf_release(cifsFile->srch_inf. 539 cifs_small_buf_release(cifsFile->srch_inf.
540 ntwrk_buf_start); 540 ntwrk_buf_start);
@@ -545,8 +545,8 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
545 } 545 }
546 rc = initiate_cifs_search(xid, file); 546 rc = initiate_cifs_search(xid, file);
547 if (rc) { 547 if (rc) {
548 cFYI(1, ("error %d reinitiating a search on rewind", 548 cFYI(1, "error %d reinitiating a search on rewind",
549 rc)); 549 rc);
550 return rc; 550 return rc;
551 } 551 }
552 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 552 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -554,7 +554,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
554 554
555 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && 555 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
556 (rc == 0) && !cifsFile->srch_inf.endOfSearch) { 556 (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
557 cFYI(1, ("calling findnext2")); 557 cFYI(1, "calling findnext2");
558 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, 558 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
559 &cifsFile->srch_inf); 559 &cifsFile->srch_inf);
560 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 560 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -574,7 +574,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
574 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry 574 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
575 - cifsFile->srch_inf.entries_in_buffer; 575 - cifsFile->srch_inf.entries_in_buffer;
576 pos_in_buf = index_to_find - first_entry_in_buffer; 576 pos_in_buf = index_to_find - first_entry_in_buffer;
577 cFYI(1, ("found entry - pos_in_buf %d", pos_in_buf)); 577 cFYI(1, "found entry - pos_in_buf %d", pos_in_buf);
578 578
579 for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) { 579 for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) {
580 /* go entry by entry figuring out which is first */ 580 /* go entry by entry figuring out which is first */
@@ -583,19 +583,19 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
583 } 583 }
584 if ((current_entry == NULL) && (i < pos_in_buf)) { 584 if ((current_entry == NULL) && (i < pos_in_buf)) {
585 /* BB fixme - check if we should flag this error */ 585 /* BB fixme - check if we should flag this error */
586 cERROR(1, ("reached end of buf searching for pos in buf" 586 cERROR(1, "reached end of buf searching for pos in buf"
587 " %d index to find %lld rc %d", 587 " %d index to find %lld rc %d",
588 pos_in_buf, index_to_find, rc)); 588 pos_in_buf, index_to_find, rc);
589 } 589 }
590 rc = 0; 590 rc = 0;
591 *ppCurrentEntry = current_entry; 591 *ppCurrentEntry = current_entry;
592 } else { 592 } else {
593 cFYI(1, ("index not in buffer - could not findnext into it")); 593 cFYI(1, "index not in buffer - could not findnext into it");
594 return 0; 594 return 0;
595 } 595 }
596 596
597 if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) { 597 if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) {
598 cFYI(1, ("can not return entries pos_in_buf beyond last")); 598 cFYI(1, "can not return entries pos_in_buf beyond last");
599 *num_to_ret = 0; 599 *num_to_ret = 0;
600 } else 600 } else
601 *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf; 601 *num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf;
@@ -655,12 +655,12 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
655 /* one byte length, no name conversion */ 655 /* one byte length, no name conversion */
656 len = (unsigned int)pFindData->FileNameLength; 656 len = (unsigned int)pFindData->FileNameLength;
657 } else { 657 } else {
658 cFYI(1, ("Unknown findfirst level %d", level)); 658 cFYI(1, "Unknown findfirst level %d", level);
659 return -EINVAL; 659 return -EINVAL;
660 } 660 }
661 661
662 if (len > max_len) { 662 if (len > max_len) {
663 cERROR(1, ("bad search response length %d past smb end", len)); 663 cERROR(1, "bad search response length %d past smb end", len);
664 return -EINVAL; 664 return -EINVAL;
665 } 665 }
666 666
@@ -753,7 +753,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
753 * case already. Why should we be clobbering other errors from it? 753 * case already. Why should we be clobbering other errors from it?
754 */ 754 */
755 if (rc) { 755 if (rc) {
756 cFYI(1, ("filldir rc = %d", rc)); 756 cFYI(1, "filldir rc = %d", rc);
757 rc = -EOVERFLOW; 757 rc = -EOVERFLOW;
758 } 758 }
759 dput(tmp_dentry); 759 dput(tmp_dentry);
@@ -785,7 +785,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
785 case 0: 785 case 0:
786 if (filldir(direntry, ".", 1, file->f_pos, 786 if (filldir(direntry, ".", 1, file->f_pos,
787 file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) { 787 file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) {
788 cERROR(1, ("Filldir for current dir failed")); 788 cERROR(1, "Filldir for current dir failed");
789 rc = -ENOMEM; 789 rc = -ENOMEM;
790 break; 790 break;
791 } 791 }
@@ -793,7 +793,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
793 case 1: 793 case 1:
794 if (filldir(direntry, "..", 2, file->f_pos, 794 if (filldir(direntry, "..", 2, file->f_pos,
795 file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { 795 file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
796 cERROR(1, ("Filldir for parent dir failed")); 796 cERROR(1, "Filldir for parent dir failed");
797 rc = -ENOMEM; 797 rc = -ENOMEM;
798 break; 798 break;
799 } 799 }
@@ -806,7 +806,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
806 806
807 if (file->private_data == NULL) { 807 if (file->private_data == NULL) {
808 rc = initiate_cifs_search(xid, file); 808 rc = initiate_cifs_search(xid, file);
809 cFYI(1, ("initiate cifs search rc %d", rc)); 809 cFYI(1, "initiate cifs search rc %d", rc);
810 if (rc) { 810 if (rc) {
811 FreeXid(xid); 811 FreeXid(xid);
812 return rc; 812 return rc;
@@ -820,7 +820,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
820 cifsFile = file->private_data; 820 cifsFile = file->private_data;
821 if (cifsFile->srch_inf.endOfSearch) { 821 if (cifsFile->srch_inf.endOfSearch) {
822 if (cifsFile->srch_inf.emptyDir) { 822 if (cifsFile->srch_inf.emptyDir) {
823 cFYI(1, ("End of search, empty dir")); 823 cFYI(1, "End of search, empty dir");
824 rc = 0; 824 rc = 0;
825 break; 825 break;
826 } 826 }
@@ -832,16 +832,16 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
832 rc = find_cifs_entry(xid, pTcon, file, 832 rc = find_cifs_entry(xid, pTcon, file,
833 &current_entry, &num_to_fill); 833 &current_entry, &num_to_fill);
834 if (rc) { 834 if (rc) {
835 cFYI(1, ("fce error %d", rc)); 835 cFYI(1, "fce error %d", rc);
836 goto rddir2_exit; 836 goto rddir2_exit;
837 } else if (current_entry != NULL) { 837 } else if (current_entry != NULL) {
838 cFYI(1, ("entry %lld found", file->f_pos)); 838 cFYI(1, "entry %lld found", file->f_pos);
839 } else { 839 } else {
840 cFYI(1, ("could not find entry")); 840 cFYI(1, "could not find entry");
841 goto rddir2_exit; 841 goto rddir2_exit;
842 } 842 }
843 cFYI(1, ("loop through %d times filling dir for net buf %p", 843 cFYI(1, "loop through %d times filling dir for net buf %p",
844 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start)); 844 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
845 max_len = smbCalcSize((struct smb_hdr *) 845 max_len = smbCalcSize((struct smb_hdr *)
846 cifsFile->srch_inf.ntwrk_buf_start); 846 cifsFile->srch_inf.ntwrk_buf_start);
847 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; 847 end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
@@ -850,8 +850,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
850 for (i = 0; (i < num_to_fill) && (rc == 0); i++) { 850 for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
851 if (current_entry == NULL) { 851 if (current_entry == NULL) {
852 /* evaluate whether this case is an error */ 852 /* evaluate whether this case is an error */
853 cERROR(1, ("past SMB end, num to fill %d i %d", 853 cERROR(1, "past SMB end, num to fill %d i %d",
854 num_to_fill, i)); 854 num_to_fill, i);
855 break; 855 break;
856 } 856 }
857 /* if buggy server returns . and .. late do 857 /* if buggy server returns . and .. late do
@@ -866,8 +866,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
866 file->f_pos++; 866 file->f_pos++;
867 if (file->f_pos == 867 if (file->f_pos ==
868 cifsFile->srch_inf.index_of_last_entry) { 868 cifsFile->srch_inf.index_of_last_entry) {
869 cFYI(1, ("last entry in buf at pos %lld %s", 869 cFYI(1, "last entry in buf at pos %lld %s",
870 file->f_pos, tmp_buf)); 870 file->f_pos, tmp_buf);
871 cifs_save_resume_key(current_entry, cifsFile); 871 cifs_save_resume_key(current_entry, cifsFile);
872 break; 872 break;
873 } else 873 } else
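find_cifs_entry() above implements forward-only paging over the server's search results: the readdir position is mapped to a search index, FindNext is issued until the buffer containing that index arrives, and a seek backwards (or a changed directory) forces the search to be closed and restarted from scratch, with the resume key saved after every buffer so the search can continue where it left off. The control flow, reduced to its shape — restart_search() and fetch_next_buffer() are hypothetical stand-ins for the CIFSFindFirst/CIFSFindNext calls:

struct search_state {
	long long first_index;	/* index of first entry in current buffer */
	long long last_index;	/* one past the last entry in the buffer */
	bool end_of_search;
};

/* returns 0 once the buffer holding 'want' is loaded, -1 otherwise */
static int page_to_index(struct search_state *s, long long want)
{
	if (want < s->first_index && restart_search(s))
		return -1;	/* servers only page forward: reopen search */
	while (want >= s->last_index && !s->end_of_search)
		if (fetch_next_buffer(s))
			return -1;
	return want < s->last_index ? 0 : -1;	/* -1: past end of dir */
}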
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index aaa9c1c5a5bd..7707389bdf2c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,14 +29,17 @@
29#include "ntlmssp.h" 29#include "ntlmssp.h"
30#include "nterr.h" 30#include "nterr.h"
31#include <linux/utsname.h> 31#include <linux/utsname.h>
32#include <linux/slab.h>
32#include "cifs_spnego.h" 33#include "cifs_spnego.h"
33 34
34extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 35extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
35 unsigned char *p24); 36 unsigned char *p24);
36 37
37/* Checks if this is the first smb session to be reconnected after 38/*
38 the socket has been reestablished (so we know whether to use vc 0). 39 * Checks if this is the first smb session to be reconnected after
39 Called while holding the cifs_tcp_ses_lock, so do not block */ 40 * the socket has been reestablished (so we know whether to use vc 0).
41 * Called while holding the cifs_tcp_ses_lock, so do not block
42 */
40static bool is_first_ses_reconnect(struct cifsSesInfo *ses) 43static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
41{ 44{
42 struct list_head *tmp; 45 struct list_head *tmp;
@@ -283,7 +286,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
283 int len; 286 int len;
284 char *data = *pbcc_area; 287 char *data = *pbcc_area;
285 288
286 cFYI(1, ("bleft %d", bleft)); 289 cFYI(1, "bleft %d", bleft);
287 290
288 /* 291 /*
289 * Windows servers do not always double null terminate their final 292 * Windows servers do not always double null terminate their final
@@ -300,7 +303,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
300 303
301 kfree(ses->serverOS); 304 kfree(ses->serverOS);
302 ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); 305 ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
303 cFYI(1, ("serverOS=%s", ses->serverOS)); 306 cFYI(1, "serverOS=%s", ses->serverOS);
304 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; 307 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
305 data += len; 308 data += len;
306 bleft -= len; 309 bleft -= len;
@@ -309,7 +312,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
309 312
310 kfree(ses->serverNOS); 313 kfree(ses->serverNOS);
311 ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); 314 ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
312 cFYI(1, ("serverNOS=%s", ses->serverNOS)); 315 cFYI(1, "serverNOS=%s", ses->serverNOS);
313 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; 316 len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
314 data += len; 317 data += len;
315 bleft -= len; 318 bleft -= len;
@@ -318,7 +321,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
318 321
319 kfree(ses->serverDomain); 322 kfree(ses->serverDomain);
320 ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp); 323 ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
321 cFYI(1, ("serverDomain=%s", ses->serverDomain)); 324 cFYI(1, "serverDomain=%s", ses->serverDomain);
322 325
323 return; 326 return;
324} 327}
@@ -331,7 +334,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
331 int len; 334 int len;
332 char *bcc_ptr = *pbcc_area; 335 char *bcc_ptr = *pbcc_area;
333 336
334 cFYI(1, ("decode sessetup ascii. bleft %d", bleft)); 337 cFYI(1, "decode sessetup ascii. bleft %d", bleft);
335 338
336 len = strnlen(bcc_ptr, bleft); 339 len = strnlen(bcc_ptr, bleft);
337 if (len >= bleft) 340 if (len >= bleft)
@@ -343,7 +346,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
343 if (ses->serverOS) 346 if (ses->serverOS)
344 strncpy(ses->serverOS, bcc_ptr, len); 347 strncpy(ses->serverOS, bcc_ptr, len);
345 if (strncmp(ses->serverOS, "OS/2", 4) == 0) { 348 if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
346 cFYI(1, ("OS/2 server")); 349 cFYI(1, "OS/2 server");
347 ses->flags |= CIFS_SES_OS2; 350 ses->flags |= CIFS_SES_OS2;
348 } 351 }
349 352
@@ -372,7 +375,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
372 /* BB For newer servers which do not support Unicode, 375 /* BB For newer servers which do not support Unicode,
373 but thus do return domain here we could add parsing 376 but thus do return domain here we could add parsing
374 for it later, but it is not very important */ 377 for it later, but it is not very important */
375 cFYI(1, ("ascii: bytes left %d", bleft)); 378 cFYI(1, "ascii: bytes left %d", bleft);
376 379
377 return rc; 380 return rc;
378} 381}
@@ -383,16 +386,16 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
383 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; 386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
384 387
385 if (blob_len < sizeof(CHALLENGE_MESSAGE)) { 388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
386 cERROR(1, ("challenge blob len %d too small", blob_len)); 389 cERROR(1, "challenge blob len %d too small", blob_len);
387 return -EINVAL; 390 return -EINVAL;
388 } 391 }
389 392
390 if (memcmp(pblob->Signature, "NTLMSSP", 8)) { 393 if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
391 cERROR(1, ("blob signature incorrect %s", pblob->Signature)); 394 cERROR(1, "blob signature incorrect %s", pblob->Signature);
392 return -EINVAL; 395 return -EINVAL;
393 } 396 }
394 if (pblob->MessageType != NtLmChallenge) { 397 if (pblob->MessageType != NtLmChallenge) {
395 cERROR(1, ("Incorrect message type %d", pblob->MessageType)); 398 cERROR(1, "Incorrect message type %d", pblob->MessageType);
396 return -EINVAL; 399 return -EINVAL;
397 } 400 }
398 401
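decode_ntlmssp_challenge() above is the middle leg of RawNTLMSSP's three-message exchange: the client sends a Negotiate blob, the server answers with NT_STATUS_MORE_PROCESSING_REQUIRED carrying a Challenge blob, and the client repeats session setup with an Authenticate blob — which is why CIFS_SessSetup() below loops back to ssetup_ntlmssp_authenticate after the first pass. The state machine, reduced to a sketch with hypothetical helper names:

enum ntlmssp_phase { NTLMSSP_NEGOTIATE, NTLMSSP_AUTHENTICATE };

static int ntlmssp_setup(struct cifsSesInfo *ses)
{
	enum ntlmssp_phase phase = NTLMSSP_NEGOTIATE;
	int rc;

	for (;;) {
		rc = send_sess_setup(ses, phase);	/* hypothetical */
		if (phase == NTLMSSP_NEGOTIATE && rc == -EAGAIN) {
			/* server replied MORE_PROCESSING_REQUIRED + blob */
			if (decode_challenge_blob(ses))	/* hypothetical */
				return -EINVAL;
			phase = NTLMSSP_AUTHENTICATE;
			continue;
		}
		return rc;
	}
}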
@@ -446,7 +449,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
446 This function returns the length of the data in the blob */ 449 This function returns the length of the data in the blob */
447static int build_ntlmssp_auth_blob(unsigned char *pbuffer, 450static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
448 struct cifsSesInfo *ses, 451 struct cifsSesInfo *ses,
449 const struct nls_table *nls_cp, int first) 452 const struct nls_table *nls_cp, bool first)
450{ 453{
451 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; 454 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
452 __u32 flags; 455 __u32 flags;
@@ -545,7 +548,7 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
545 548
546static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB, 549static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
547 struct cifsSesInfo *ses, 550 struct cifsSesInfo *ses,
548 const struct nls_table *nls, int first_time) 551 const struct nls_table *nls, bool first_time)
549{ 552{
550 int bloblen; 553 int bloblen;
551 554
@@ -558,8 +561,8 @@ static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
558#endif 561#endif
559 562
560int 563int
561CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, 564CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
562 const struct nls_table *nls_cp) 565 const struct nls_table *nls_cp)
563{ 566{
564 int rc = 0; 567 int rc = 0;
565 int wct; 568 int wct;
@@ -576,13 +579,18 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
576 int bytes_remaining; 579 int bytes_remaining;
577 struct key *spnego_key = NULL; 580 struct key *spnego_key = NULL;
578 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 581 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
582 bool first_time;
579 583
580 if (ses == NULL) 584 if (ses == NULL)
581 return -EINVAL; 585 return -EINVAL;
582 586
587 read_lock(&cifs_tcp_ses_lock);
588 first_time = is_first_ses_reconnect(ses);
589 read_unlock(&cifs_tcp_ses_lock);
590
583 type = ses->server->secType; 591 type = ses->server->secType;
584 592
585 cFYI(1, ("sess setup type %d", type)); 593 cFYI(1, "sess setup type %d", type);
586ssetup_ntlmssp_authenticate: 594ssetup_ntlmssp_authenticate:
587 if (phase == NtLmChallenge) 595 if (phase == NtLmChallenge)
588 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ 596 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -663,7 +671,7 @@ ssetup_ntlmssp_authenticate:
663 changed to do higher than lanman dialect and 671 changed to do higher than lanman dialect and
664 we reconnected would we ever calc signing_key? */ 672 we reconnected would we ever calc signing_key? */
665 673
666 cFYI(1, ("Negotiating LANMAN setting up strings")); 674 cFYI(1, "Negotiating LANMAN setting up strings");
667 /* Unicode not allowed for LANMAN dialects */ 675 /* Unicode not allowed for LANMAN dialects */
668 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 676 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
669#endif 677#endif
@@ -743,7 +751,7 @@ ssetup_ntlmssp_authenticate:
743 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); 751 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
744 } else 752 } else
745 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 753 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
746 } else if (type == Kerberos || type == MSKerberos) { 754 } else if (type == Kerberos) {
747#ifdef CONFIG_CIFS_UPCALL 755#ifdef CONFIG_CIFS_UPCALL
748 struct cifs_spnego_msg *msg; 756 struct cifs_spnego_msg *msg;
749 spnego_key = cifs_get_spnego_key(ses); 757 spnego_key = cifs_get_spnego_key(ses);
@@ -757,17 +765,17 @@ ssetup_ntlmssp_authenticate:
757 /* check version field to make sure that cifs.upcall is 765 /* check version field to make sure that cifs.upcall is
758 sending us a response in an expected form */ 766 sending us a response in an expected form */
759 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { 767 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
760 cERROR(1, ("incorrect version of cifs.upcall (expected" 768 cERROR(1, "incorrect version of cifs.upcall (expected"
761 " %d but got %d)", 769 " %d but got %d)",
762 CIFS_SPNEGO_UPCALL_VERSION, msg->version)); 770 CIFS_SPNEGO_UPCALL_VERSION, msg->version);
763 rc = -EKEYREJECTED; 771 rc = -EKEYREJECTED;
764 goto ssetup_exit; 772 goto ssetup_exit;
765 } 773 }
766 /* bail out if key is too long */ 774 /* bail out if key is too long */
767 if (msg->sesskey_len > 775 if (msg->sesskey_len >
768 sizeof(ses->server->mac_signing_key.data.krb5)) { 776 sizeof(ses->server->mac_signing_key.data.krb5)) {
769 cERROR(1, ("Kerberos signing key too long (%u bytes)", 777 cERROR(1, "Kerberos signing key too long (%u bytes)",
770 msg->sesskey_len)); 778 msg->sesskey_len);
771 rc = -EOVERFLOW; 779 rc = -EOVERFLOW;
772 goto ssetup_exit; 780 goto ssetup_exit;
773 } 781 }
@@ -795,7 +803,7 @@ ssetup_ntlmssp_authenticate:
795 /* BB: is this right? */ 803 /* BB: is this right? */
796 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 804 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
797#else /* ! CONFIG_CIFS_UPCALL */ 805#else /* ! CONFIG_CIFS_UPCALL */
798 cERROR(1, ("Kerberos negotiated but upcall support disabled!")); 806 cERROR(1, "Kerberos negotiated but upcall support disabled!");
799 rc = -ENOSYS; 807 rc = -ENOSYS;
800 goto ssetup_exit; 808 goto ssetup_exit;
801#endif /* CONFIG_CIFS_UPCALL */ 809#endif /* CONFIG_CIFS_UPCALL */
@@ -803,12 +811,12 @@ ssetup_ntlmssp_authenticate:
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	if (type == RawNTLMSSP) {
 		if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
-			cERROR(1, ("NTLMSSP requires Unicode support"));
+			cERROR(1, "NTLMSSP requires Unicode support");
 			rc = -ENOSYS;
 			goto ssetup_exit;
 		}

-		cFYI(1, ("ntlmssp session setup phase %d", phase));
+		cFYI(1, "ntlmssp session setup phase %d", phase);
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 		capabilities |= CAP_EXTENDED_SECURITY;
 		pSMB->req.Capabilities |= cpu_to_le32(capabilities);
@@ -826,7 +834,7 @@ ssetup_ntlmssp_authenticate:
 			   on the response (challenge) */
 			smb_buf->Uid = ses->Suid;
 		} else {
-			cERROR(1, ("invalid phase %d", phase));
+			cERROR(1, "invalid phase %d", phase);
 			rc = -ENOSYS;
 			goto ssetup_exit;
 		}
@@ -838,12 +846,12 @@ ssetup_ntlmssp_authenticate:
 		}
 		unicode_oslm_strings(&bcc_ptr, nls_cp);
 	} else {
-		cERROR(1, ("secType %d not supported!", type));
+		cERROR(1, "secType %d not supported!", type);
 		rc = -ENOSYS;
 		goto ssetup_exit;
 	}
 #else
-	cERROR(1, ("secType %d not supported!", type));
+	cERROR(1, "secType %d not supported!", type);
 	rc = -ENOSYS;
 	goto ssetup_exit;
 #endif
@@ -861,7 +869,7 @@ ssetup_ntlmssp_authenticate:
 			  CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
 	/* SMB request buf freed in SendReceive2 */

-	cFYI(1, ("ssetup rc from sendrecv2 is %d", rc));
+	cFYI(1, "ssetup rc from sendrecv2 is %d", rc);

 	pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
 	smb_buf = (struct smb_hdr *)iov[0].iov_base;
@@ -869,7 +877,7 @@ ssetup_ntlmssp_authenticate:
 	if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError ==
 			cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
 		if (phase != NtLmNegotiate) {
-			cERROR(1, ("Unexpected more processing error"));
+			cERROR(1, "Unexpected more processing error");
 			goto ssetup_exit;
 		}
 		/* NTLMSSP Negotiate sent now processing challenge (response) */
@@ -881,14 +889,14 @@ ssetup_ntlmssp_authenticate:

 	if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
 		rc = -EIO;
-		cERROR(1, ("bad word count %d", smb_buf->WordCount));
+		cERROR(1, "bad word count %d", smb_buf->WordCount);
 		goto ssetup_exit;
 	}
 	action = le16_to_cpu(pSMB->resp.Action);
 	if (action & GUEST_LOGIN)
-		cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */
+		cFYI(1, "Guest login"); /* BB mark SesInfo struct? */
 	ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
-	cFYI(1, ("UID = %d ", ses->Suid));
+	cFYI(1, "UID = %d ", ses->Suid);
 	/* response can have either 3 or 4 word count - Samba sends 3 */
 	/* and lanman response is 3 */
 	bytes_remaining = BCC(smb_buf);
@@ -898,7 +906,7 @@ ssetup_ntlmssp_authenticate:
 		__u16 blob_len;
 		blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 		if (blob_len > bytes_remaining) {
-			cERROR(1, ("bad security blob length %d", blob_len));
+			cERROR(1, "bad security blob length %d", blob_len);
 			rc = -EINVAL;
 			goto ssetup_exit;
 		}
@@ -932,7 +940,7 @@ ssetup_exit:
 	}
 	kfree(str_area);
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
-		cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
+		cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
 		cifs_small_buf_release(iov[0].iov_base);
 	} else if (resp_buf_type == CIFS_LARGE_BUFFER)
 		cifs_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 93fb09a99c69..192ea51af20f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -24,6 +24,7 @@
 */

 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 07b8e71544ee..82f78c4d6978 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -22,6 +22,7 @@

 #include <linux/fs.h>
 #include <linux/list.h>
+#include <linux/gfp.h>
 #include <linux/wait.h>
 #include <linux/net.h>
 #include <linux/delay.h>
@@ -34,7 +35,6 @@
 #include "cifs_debug.h"

 extern mempool_t *cifs_mid_poolp;
-extern struct kmem_cache *cifs_oplock_cachep;

 static struct mid_q_entry *
 AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
@@ -42,7 +42,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 	struct mid_q_entry *temp;

 	if (server == NULL) {
-		cERROR(1, ("Null TCP session in AllocMidQEntry"));
+		cERROR(1, "Null TCP session in AllocMidQEntry");
 		return NULL;
 	}

@@ -54,7 +54,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 	temp->mid = smb_buffer->Mid;	/* always LE */
 	temp->pid = current->pid;
 	temp->command = smb_buffer->Command;
-	cFYI(1, ("For smb_command %d", temp->command));
+	cFYI(1, "For smb_command %d", temp->command);
 	/* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
 	/* when mid allocated can be before when sent */
 	temp->when_alloc = jiffies;
@@ -139,7 +139,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 		total_len += iov[i].iov_len;

 	smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
-	cFYI(1, ("Sending smb: total_len %d", total_len));
+	cFYI(1, "Sending smb: total_len %d", total_len);
 	dump_smb(smb_buffer, len);

 	i = 0;
@@ -167,9 +167,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 		   reconnect which may clear the network problem.
 		*/
 		if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
-			cERROR(1,
-				("sends on sock %p stuck for 15 seconds",
-				 ssocket));
+			cERROR(1, "sends on sock %p stuck for 15 seconds",
+				  ssocket);
 			rc = -EAGAIN;
 			break;
 		}
@@ -183,13 +182,13 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 			total_len = 0;
 			break;
 		} else if (rc > total_len) {
-			cERROR(1, ("sent %d requested %d", rc, total_len));
+			cERROR(1, "sent %d requested %d", rc, total_len);
 			break;
 		}
 		if (rc == 0) {
 			/* should never happen, letting socket clear before
 			   retrying is our only obvious option here */
-			cERROR(1, ("tcp sent no data"));
+			cERROR(1, "tcp sent no data");
 			msleep(500);
 			continue;
 		}
@@ -212,8 +211,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 	}

 	if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
-		cFYI(1, ("partial send (%d remaining), terminating session",
-			total_len));
+		cFYI(1, "partial send (%d remaining), terminating session",
+			total_len);
 		/* If we have only sent part of an SMB then the next SMB
 		   could be taken as the remainder of this one. We need
 		   to kill the socket so the server throws away the partial
@@ -222,7 +221,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 	}

 	if (rc < 0) {
-		cERROR(1, ("Error %d sending data on socket to server", rc));
+		cERROR(1, "Error %d sending data on socket to server", rc);
 	} else
 		rc = 0;

@@ -295,7 +294,7 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
 	}

 	if (ses->server->tcpStatus == CifsNeedReconnect) {
-		cFYI(1, ("tcp session dead - return to caller to retry"));
+		cFYI(1, "tcp session dead - return to caller to retry");
 		return -EAGAIN;
 	}

@@ -347,7 +346,7 @@ static int wait_for_response(struct cifsSesInfo *ses,
 		lrt += time_to_wait;
 		if (time_after(jiffies, lrt)) {
 			/* No replies for time_to_wait. */
-			cERROR(1, ("server not responding"));
+			cERROR(1, "server not responding");
 			return -1;
 		}
 	} else {
@@ -378,7 +377,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
 	iov[0].iov_len = in_buf->smb_buf_length + 4;
 	flags |= CIFS_NO_RESP;
 	rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
-	cFYI(DBG2, ("SendRcvNoRsp flags %d rc %d", flags, rc));
+	cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);

 	return rc;
 }
@@ -401,7 +400,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,

 	if ((ses == NULL) || (ses->server == NULL)) {
 		cifs_small_buf_release(in_buf);
-		cERROR(1, ("Null session"));
+		cERROR(1, "Null session");
 		return -EIO;
 	}

@@ -470,7 +469,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	else if (long_op == CIFS_BLOCKING_OP)
 		timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
 	else {
-		cERROR(1, ("unknown timeout flag %d", long_op));
+		cERROR(1, "unknown timeout flag %d", long_op);
 		rc = -EIO;
 		goto out;
 	}
@@ -489,8 +488,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	spin_lock(&GlobalMid_Lock);

 	if (midQ->resp_buf == NULL) {
-		cERROR(1, ("No response to cmd %d mid %d",
-			midQ->command, midQ->mid));
+		cERROR(1, "No response to cmd %d mid %d",
+			midQ->command, midQ->mid);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			if (ses->server->tcpStatus == CifsExiting)
 				rc = -EHOSTDOWN;
@@ -503,7 +502,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		if (rc != -EHOSTDOWN) {
 			if (midQ->midState == MID_RETRY_NEEDED) {
 				rc = -EAGAIN;
-				cFYI(1, ("marking request for retry"));
+				cFYI(1, "marking request for retry");
 			} else {
 				rc = -EIO;
 			}
@@ -520,8 +519,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	receive_len = midQ->resp_buf->smb_buf_length;

 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-		cERROR(1, ("Frame too large received. Length: %d Xid: %d",
-			receive_len, xid));
+		cERROR(1, "Frame too large received. Length: %d Xid: %d",
+			receive_len, xid);
 		rc = -EIO;
 		goto out;
 	}
@@ -547,7 +546,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 					     &ses->server->mac_signing_key,
 					     midQ->sequence_number+1);
 		if (rc) {
-			cERROR(1, ("Unexpected SMB signature"));
+			cERROR(1, "Unexpected SMB signature");
 			/* BB FIXME add code to kill session */
 		}
 	}
@@ -568,7 +567,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		    DeleteMidQEntry */
 	} else {
 		rc = -EIO;
-		cFYI(1, ("Bad MID state?"));
+		cFYI(1, "Bad MID state?");
 	}

 out:
@@ -590,11 +589,11 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	struct mid_q_entry *midQ;

 	if (ses == NULL) {
-		cERROR(1, ("Null smb session"));
+		cERROR(1, "Null smb session");
 		return -EIO;
 	}
 	if (ses->server == NULL) {
-		cERROR(1, ("Null tcp session"));
+		cERROR(1, "Null tcp session");
 		return -EIO;
 	}

@@ -606,8 +605,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	   use ses->maxReq */

 	if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, ("Illegal length, greater than maximum frame, %d",
-			in_buf->smb_buf_length));
+		cERROR(1, "Illegal length, greater than maximum frame, %d",
+			in_buf->smb_buf_length);
 		return -EIO;
 	}

@@ -664,7 +663,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	else if (long_op == CIFS_BLOCKING_OP)
 		timeout = 0x7FFFFFFF; /* large but no so large as to wrap */
 	else {
-		cERROR(1, ("unknown timeout flag %d", long_op));
+		cERROR(1, "unknown timeout flag %d", long_op);
 		rc = -EIO;
 		goto out;
 	}
@@ -680,8 +679,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,

 	spin_lock(&GlobalMid_Lock);
 	if (midQ->resp_buf == NULL) {
-		cERROR(1, ("No response for cmd %d mid %d",
-			midQ->command, midQ->mid));
+		cERROR(1, "No response for cmd %d mid %d",
+			midQ->command, midQ->mid);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			if (ses->server->tcpStatus == CifsExiting)
 				rc = -EHOSTDOWN;
@@ -694,7 +693,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		if (rc != -EHOSTDOWN) {
 			if (midQ->midState == MID_RETRY_NEEDED) {
 				rc = -EAGAIN;
-				cFYI(1, ("marking request for retry"));
+				cFYI(1, "marking request for retry");
 			} else {
 				rc = -EIO;
 			}
@@ -711,8 +710,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	receive_len = midQ->resp_buf->smb_buf_length;

 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-		cERROR(1, ("Frame too large received. Length: %d Xid: %d",
-			receive_len, xid));
+		cERROR(1, "Frame too large received. Length: %d Xid: %d",
+			receive_len, xid);
 		rc = -EIO;
 		goto out;
 	}
@@ -735,7 +734,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 					   &ses->server->mac_signing_key,
 					   midQ->sequence_number+1);
 		if (rc) {
-			cERROR(1, ("Unexpected SMB signature"));
+			cERROR(1, "Unexpected SMB signature");
 			/* BB FIXME add code to kill session */
 		}
 	}
@@ -752,7 +751,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
 	} else {
 		rc = -EIO;
-		cERROR(1, ("Bad MID state?"));
+		cERROR(1, "Bad MID state?");
 	}

 out:
@@ -823,13 +822,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	struct cifsSesInfo *ses;

 	if (tcon == NULL || tcon->ses == NULL) {
-		cERROR(1, ("Null smb session"));
+		cERROR(1, "Null smb session");
 		return -EIO;
 	}
 	ses = tcon->ses;

 	if (ses->server == NULL) {
-		cERROR(1, ("Null tcp session"));
+		cERROR(1, "Null tcp session");
 		return -EIO;
 	}

@@ -841,8 +840,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	   use ses->maxReq */

 	if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, ("Illegal length, greater than maximum frame, %d",
-			in_buf->smb_buf_length));
+		cERROR(1, "Illegal length, greater than maximum frame, %d",
+			in_buf->smb_buf_length);
 		return -EIO;
 	}

@@ -932,8 +931,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		spin_unlock(&GlobalMid_Lock);
 		receive_len = midQ->resp_buf->smb_buf_length;
 	} else {
-		cERROR(1, ("No response for cmd %d mid %d",
-			midQ->command, midQ->mid));
+		cERROR(1, "No response for cmd %d mid %d",
+			midQ->command, midQ->mid);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			if (ses->server->tcpStatus == CifsExiting)
 				rc = -EHOSTDOWN;
@@ -946,7 +945,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		if (rc != -EHOSTDOWN) {
 			if (midQ->midState == MID_RETRY_NEEDED) {
 				rc = -EAGAIN;
-				cFYI(1, ("marking request for retry"));
+				cFYI(1, "marking request for retry");
 			} else {
 				rc = -EIO;
 			}
@@ -957,8 +956,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	}

 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-		cERROR(1, ("Frame too large received. Length: %d Xid: %d",
-			receive_len, xid));
+		cERROR(1, "Frame too large received. Length: %d Xid: %d",
+			receive_len, xid);
 		rc = -EIO;
 		goto out;
 	}
@@ -967,7 +966,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,

 	if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) {
 		rc = -EIO;
-		cERROR(1, ("Bad MID state?"));
+		cERROR(1, "Bad MID state?");
 		goto out;
 	}

@@ -985,7 +984,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 					   &ses->server->mac_signing_key,
 					   midQ->sequence_number+1);
 		if (rc) {
-			cERROR(1, ("Unexpected SMB signature"));
+			cERROR(1, "Unexpected SMB signature");
 			/* BB FIXME add code to kill session */
 		}
 	}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 3e2ef0de1209..a1509207bfa6 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -21,6 +21,7 @@

 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -69,12 +70,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 		return rc;
 	}
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
 		&& (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
 		cFYI(1,
-		     ("illegal xattr request %s (only user namespace supported)",
-		     ea_name));
+		     "illegal xattr request %s (only user namespace supported)",
+		     ea_name);
 		/* BB what if no namespace prefix? */
 		/* Should we just pass them to server, except for
 		   system and perhaps security prefixes? */
@@ -130,19 +131,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 	   search server for EAs or streams to
 	   returns as xattrs */
 	if (value_size > MAX_EA_VALUE_SIZE) {
-		cFYI(1, ("size of EA value too large"));
+		cFYI(1, "size of EA value too large");
 		kfree(full_path);
 		FreeXid(xid);
 		return -EOPNOTSUPP;
 	}

 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto set_ea_exit;
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
-			cFYI(1, ("attempt to set cifs inode metadata"));
+			cFYI(1, "attempt to set cifs inode metadata");

 		ea_name += 5; /* skip past user. prefix */
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
@@ -168,9 +169,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 				ACL_TYPE_ACCESS, cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-			cFYI(1, ("set POSIX ACL rc %d", rc));
+			cFYI(1, "set POSIX ACL rc %d", rc);
 #else
-			cFYI(1, ("set POSIX ACL not supported"));
+			cFYI(1, "set POSIX ACL not supported");
 #endif
 		} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 				   strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -181,13 +182,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 				ACL_TYPE_DEFAULT, cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-			cFYI(1, ("set POSIX default ACL rc %d", rc));
+			cFYI(1, "set POSIX default ACL rc %d", rc);
 #else
-			cFYI(1, ("set default POSIX ACL not supported"));
+			cFYI(1, "set default POSIX ACL not supported");
 #endif
 		} else {
-			cFYI(1, ("illegal xattr request %s (only user namespace"
-				" supported)", ea_name));
+			cFYI(1, "illegal xattr request %s (only user namespace"
+				" supported)", ea_name);
 		/* BB what if no namespace prefix? */
 		/* Should we just pass them to server, except for
 		   system and perhaps security prefixes? */
@@ -234,13 +235,13 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto get_ea_exit;

 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
-			cFYI(1, ("attempt to query cifs inode metadata"));
+			cFYI(1, "attempt to query cifs inode metadata");
 			/* revalidate/getattr then populate from inode */
 		} /* BB add else when above is implemented */
 		ea_name += 5; /* skip past user. prefix */
@@ -286,7 +287,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 		}
 #endif /* EXPERIMENTAL */
 #else
-		cFYI(1, ("query POSIX ACL not supported yet"));
+		cFYI(1, "query POSIX ACL not supported yet");
 #endif /* CONFIG_CIFS_POSIX */
 	} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 			  strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -298,18 +299,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 #else
-		cFYI(1, ("query POSIX default ACL not supported yet"));
+		cFYI(1, "query POSIX default ACL not supported yet");
 #endif
 	} else if (strncmp(ea_name,
 		  CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
-		cFYI(1, ("Trusted xattr namespace not supported yet"));
+		cFYI(1, "Trusted xattr namespace not supported yet");
 	} else if (strncmp(ea_name,
 		  CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
-		cFYI(1, ("Security xattr namespace not supported yet"));
+		cFYI(1, "Security xattr namespace not supported yet");
 	} else
 		cFYI(1,
-		    ("illegal xattr request %s (only user namespace supported)",
-		    ea_name));
+		    "illegal xattr request %s (only user namespace supported)",
+		    ea_name);

 	/* We could add an additional check for streams ie
 	   if proc/fs/cifs/streamstoxattr is set then
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4bb9d0a5decc..ccd98b0f2b0b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/time.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ffd42815fda1..4c813f2cdc52 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>

 #include <linux/coda.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 830f51abb971..d97f9935a028 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -18,6 +18,7 @@
 #include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/vfs.h>
+#include <linux/slab.h>

 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -166,6 +167,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 		return -EBUSY;
 	}

+	error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
+	if (error)
+		goto bdi_err;
+
 	vc->vc_sb = sb;

 	sb->s_fs_info = vc;
@@ -174,6 +179,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_blocksize_bits = 12;
 	sb->s_magic = CODA_SUPER_MAGIC;
 	sb->s_op = &coda_super_operations;
+	sb->s_bdi = &vc->bdi;

 	/* get root fid from Venus: this needs the root inode */
 	error = venus_rootfid(sb, &fid);
@@ -199,6 +205,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;

 error:
+	bdi_destroy(&vc->bdi);
+bdi_err:
 	if (root)
 		iput(root);
 	if (vc)
@@ -209,6 +217,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)

 static void coda_put_super(struct super_block *sb)
 {
+	bdi_destroy(&coda_vcp(sb)->bdi);
 	coda_vcp(sb)->vc_sb = NULL;
 	sb->s_fs_info = NULL;

diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c274d949179d..f09c5ed76f6c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -26,6 +26,7 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
diff --git a/fs/compat.c b/fs/compat.c
index 00d90c2e66f0..05448730f840 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -49,6 +49,7 @@
 #include <linux/mm.h>
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
+#include <linux/slab.h>

 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1530,8 +1531,6 @@ int compat_do_execve(char * filename,
 	if (retval < 0)
 		goto out;

-	current->stack_start = current->mm->start_stack;
-
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
@@ -1795,6 +1794,24 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 	return ret;
 }

+struct compat_sel_arg_struct {
+	compat_ulong_t n;
+	compat_uptr_t inp;
+	compat_uptr_t outp;
+	compat_uptr_t exp;
+	compat_uptr_t tvp;
+};
+
+asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
+{
+	struct compat_sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
+				 compat_ptr(a.exp), compat_ptr(a.tvp));
+}
+
 #ifdef HAVE_SET_RESTORE_SIGMASK
 static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6d55b61bfa79..641640dc7ae5 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,6 @@
 #include <linux/ioctl.h>
 #include <linux/if.h>
 #include <linux/if_bridge.h>
-#include <linux/slab.h>
 #include <linux/raid/md_u.h>
 #include <linux/kd.h>
 #include <linux/route.h>
@@ -60,6 +59,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
 #include <linux/atalk.h>
+#include <linux/gfp.h>

 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci.h>
@@ -102,7 +102,6 @@
 #include <linux/nbd.h>
 #include <linux/random.h>
 #include <linux/filter.h>
-#include <linux/pktcdvd.h>

 #include <linux/hiddev.h>

@@ -1126,8 +1125,6 @@ COMPATIBLE_IOCTL(PPGETMODE)
 COMPATIBLE_IOCTL(PPGETPHASE)
 COMPATIBLE_IOCTL(PPGETFLAGS)
 COMPATIBLE_IOCTL(PPSETFLAGS)
-/* pktcdvd */
-COMPATIBLE_IOCTL(PACKET_CTRL_CMD)
 /* Big A */
 /* sparc only */
 /* Big Q for sound/OSS */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e48b52205aa..0b502f80c691 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -645,6 +645,7 @@ static void detach_groups(struct config_group *group)

 		configfs_detach_group(sd->s_element);
 		child->d_inode->i_flags |= S_DEAD;
+		dont_mount(child);

 		mutex_unlock(&child->d_inode->i_mutex);

@@ -840,6 +841,7 @@ static int configfs_attach_item(struct config_item *parent_item,
 		mutex_lock(&dentry->d_inode->i_mutex);
 		configfs_remove_dir(item);
 		dentry->d_inode->i_flags |= S_DEAD;
+		dont_mount(dentry);
 		mutex_unlock(&dentry->d_inode->i_mutex);
 		d_delete(dentry);
 	}
@@ -882,6 +884,7 @@ static int configfs_attach_group(struct config_item *parent_item,
 	if (ret) {
 		configfs_detach_item(item);
 		dentry->d_inode->i_flags |= S_DEAD;
+		dont_mount(dentry);
 	}
 	configfs_adjust_dir_dirent_depth_after_populate(sd);
 	mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1725,6 +1728,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
 	mutex_unlock(&configfs_symlink_mutex);
 	configfs_detach_group(&group->cg_item);
 	dentry->d_inode->i_flags |= S_DEAD;
+	dont_mount(dentry);
 	mutex_unlock(&dentry->d_inode->i_mutex);

 	d_delete(dentry);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a2f746066c5d..c8af2d91174b 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -34,6 +34,7 @@
 #include <linux/capability.h>
 #include <linux/sched.h>
 #include <linux/lockdep.h>
+#include <linux/slab.h>

 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8421cea7d8c7..8c8d64230c2d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -29,6 +29,7 @@
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
+#include <linux/slab.h>

 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 32a5f46b1157..0f3eb41d9201 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/namei.h>
+#include <linux/slab.h>

 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 049d6c36da09..30a87b3dbcac 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,6 +27,7 @@
 #include <linux/fsnotify.h>
 #include <linux/string.h>
 #include <linux/magic.h>
+#include <linux/slab.h>

 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8882ecc0f1bf..0120247b41c0 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/tty.h>
 #include <linux/mutex.h>
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0df243850818..b54bca03d92f 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
+#include <linux/slab.h>
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <net/ipv6.h>
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 29d6139c35fc..c6cf25158746 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
+#include <linux/slab.h>

 #include "dlm_internal.h"
 #include "lock.h"
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 46ffd3eeaaf7..031dbe3a15ca 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -56,6 +56,7 @@
    L: receive_xxxx_reply()     <-  R: send_xxxx_reply()
 */
 #include <linux/types.h>
+#include <linux/slab.h>
 #include "dlm_internal.h"
 #include <linux/dlm_device.h>
 #include "memory.h"
@@ -732,10 +733,7 @@ static void lkb_add_ordered(struct list_head *new, struct list_head *head,
 		if (lkb->lkb_rqmode < mode)
 			break;

-	if (!lkb)
-		list_add_tail(new, head);
-	else
-		__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+	__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
 }

 /* add/remove lkb to rsb's grant/convert/wait queue */
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 26a8bd40400a..f994a7dfda85 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -148,7 +148,7 @@ static void lockspace_kobj_release(struct kobject *k)
 	kfree(ls);
 }

-static struct sysfs_ops dlm_attr_ops = {
+static const struct sysfs_ops dlm_attr_ops = {
 	.show = dlm_attr_show,
 	.store = dlm_attr_store,
 };
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 52cab160893c..c0d35c620526 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -51,6 +51,7 @@
 #include <linux/file.h>
 #include <linux/mutex.h>
 #include <linux/sctp.h>
+#include <linux/slab.h>
 #include <net/sctp/user.h>
 #include <net/ipv6.h>

diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 84f70bfb0baf..b12532e553f8 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -312,7 +312,7 @@ int dlm_ls_stop(struct dlm_ls *ls)
 	/*
 	 * This in_recovery lock does two things:
 	 * 1) Keeps this function from returning until all threads are out
-	 *    of locking routines and locking is truely stopped.
+	 *    of locking routines and locking is truly stopped.
 	 * 2) Keeps any new requests from being processed until it's unlocked
 	 *    when recovery is complete.
 	 */
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 052095cd592f..2c6ad518100d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -9,6 +9,7 @@
 #include <net/genetlink.h>
 #include <linux/dlm.h>
 #include <linux/dlm_netlink.h>
+#include <linux/gfp.h>

 #include "dlm_internal.h"

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index b5f89aef3b29..d45c02db6943 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -11,6 +11,7 @@
 #include <linux/poll.h>
 #include <linux/dlm.h>
 #include <linux/dlm_plock.h>
+#include <linux/slab.h>

 #include "dlm_internal.h"
 #include "lockspace.h"
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index a4bfd31ac45b..b6272853130c 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
+#include <linux/slab.h>

 #include "dlm_internal.h"
 #include "lockspace.h"
@@ -214,6 +215,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
 	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &proc->asts);
+		lkb->lkb_ast_first = type;
 		wake_up_interruptible(&proc->wait);
 	}
 	if (type == AST_COMP && (ast_type & AST_COMP))
@@ -222,7 +224,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)

 	eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
 	if (eol) {
-		lkb->lkb_ast_type &= ~AST_BAST;
 		lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
 	}

@@ -705,7 +706,7 @@ static int device_close(struct inode *inode, struct file *file)
 }

 static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
-			       int bmode, char __user *buf, size_t count)
+			       int mode, char __user *buf, size_t count)
 {
 #ifdef CONFIG_COMPAT
 	struct dlm_lock_result32 result32;
@@ -732,7 +733,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
 	if (type == AST_BAST) {
 		result.user_astaddr = ua->bastaddr;
 		result.user_astparam = ua->bastparam;
-		result.bast_mode = bmode;
+		result.bast_mode = mode;
 	} else {
 		result.user_astaddr = ua->castaddr;
 		result.user_astparam = ua->castparam;
@@ -800,7 +801,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	struct dlm_user_proc *proc = file->private_data;
 	struct dlm_lkb *lkb;
 	DECLARE_WAITQUEUE(wait, current);
-	int error, type=0, bmode=0, removed = 0;
+	int error = 0, removed;
+	int ret_type, ret_mode;
+	int bastmode, castmode, do_bast, do_cast;

 	if (count == sizeof(struct dlm_device_version)) {
 		error = copy_version_to_user(buf, count);
@@ -819,6 +822,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 #endif
 		return -EINVAL;

+ try_another:
+
 	/* do we really need this? can a read happen after a close? */
 	if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
 		return -EINVAL;
@@ -854,13 +859,55 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,

 	lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);

-	if (lkb->lkb_ast_type & AST_COMP) {
-		lkb->lkb_ast_type &= ~AST_COMP;
-		type = AST_COMP;
-	} else if (lkb->lkb_ast_type & AST_BAST) {
-		lkb->lkb_ast_type &= ~AST_BAST;
-		type = AST_BAST;
-		bmode = lkb->lkb_bastmode;
+	removed = 0;
+	ret_type = 0;
+	ret_mode = 0;
+	do_bast = lkb->lkb_ast_type & AST_BAST;
+	do_cast = lkb->lkb_ast_type & AST_COMP;
+	bastmode = lkb->lkb_bastmode;
+	castmode = lkb->lkb_castmode;
+
+	/* when both are queued figure out which to do first and
+	   switch first so the other goes in the next read */
+
+	if (do_cast && do_bast) {
+		if (lkb->lkb_ast_first == AST_COMP) {
+			ret_type = AST_COMP;
+			ret_mode = castmode;
+			lkb->lkb_ast_type &= ~AST_COMP;
+			lkb->lkb_ast_first = AST_BAST;
+		} else {
+			ret_type = AST_BAST;
+			ret_mode = bastmode;
+			lkb->lkb_ast_type &= ~AST_BAST;
+			lkb->lkb_ast_first = AST_COMP;
+		}
+	} else {
+		ret_type = lkb->lkb_ast_first;
+		ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
+		lkb->lkb_ast_type &= ~ret_type;
+		lkb->lkb_ast_first = 0;
+	}
+
+	/* if we're doing a bast but the bast is unnecessary, then
+	   switch to do nothing or do a cast if that was needed next */
+
+	if ((ret_type == AST_BAST) &&
+	    dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) {
+		ret_type = 0;
+		ret_mode = 0;
+
+		if (do_cast) {
+			ret_type = AST_COMP;
+			ret_mode = castmode;
+			lkb->lkb_ast_type &= ~AST_COMP;
+			lkb->lkb_ast_first = 0;
+		}
+	}
+
+	if (lkb->lkb_ast_first != lkb->lkb_ast_type) {
+		log_print("device_read %x ast_first %x ast_type %x",
+			  lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
 	}

 	if (!lkb->lkb_ast_type) {
@@ -869,15 +916,29 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	}
 	spin_unlock(&proc->asts_spin);

-	error = copy_result_to_user(lkb->lkb_ua,
-			test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
-			type, bmode, buf, count);
+	if (ret_type) {
+		error = copy_result_to_user(lkb->lkb_ua,
+				test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+				ret_type, ret_mode, buf, count);
+
+		if (ret_type == AST_COMP)
+			lkb->lkb_castmode_done = castmode;
+		if (ret_type == AST_BAST)
+			lkb->lkb_bastmode_done = bastmode;
+	}

 	/* removes reference for the proc->asts lists added by
 	   dlm_user_add_ast() and may result in the lkb being freed */
+
 	if (removed)
 		dlm_put_lkb(lkb);

+	/* the bast that was queued was eliminated (see unnecessary above),
+	   leaving nothing to return */
+
+	if (!ret_type)
+		goto try_another;
+
 	return error;
 }

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 7cb0a59f4b9d..1cc087635a5e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
 #include <linux/crypto.h>
 #include <linux/file.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"

@@ -381,8 +382,8 @@ out:
 static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num,
 					     struct ecryptfs_crypt_stat *crypt_stat)
 {
-	(*offset) = (crypt_stat->num_header_bytes_at_front
-		     + (crypt_stat->extent_size * extent_num));
+	(*offset) = ecryptfs_lower_header_size(crypt_stat)
+		    + (crypt_stat->extent_size * extent_num);
 }

 /**
@@ -834,13 +835,13 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
 	set_extent_mask_and_shift(crypt_stat);
 	crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-		crypt_stat->num_header_bytes_at_front = 0;
+		crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 	else {
 		if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
-			crypt_stat->num_header_bytes_at_front =
+			crypt_stat->metadata_size =
 				ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 		else
-			crypt_stat->num_header_bytes_at_front = PAGE_CACHE_SIZE;
+			crypt_stat->metadata_size = PAGE_CACHE_SIZE;
 	}
 }
846 847
@@ -1107,9 +1108,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written)
1107 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; 1108 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
1108} 1109}
1109 1110
1110static void 1111void ecryptfs_write_crypt_stat_flags(char *page_virt,
1111write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, 1112 struct ecryptfs_crypt_stat *crypt_stat,
1112 size_t *written) 1113 size_t *written)
1113{ 1114{
1114 u32 flags = 0; 1115 u32 flags = 0;
1115 int i; 1116 int i;
@@ -1237,8 +1238,7 @@ ecryptfs_write_header_metadata(char *virt,

 	header_extent_size = (u32)crypt_stat->extent_size;
 	num_header_extents_at_front =
-		(u16)(crypt_stat->num_header_bytes_at_front
-		      / crypt_stat->extent_size);
+		(u16)(crypt_stat->metadata_size / crypt_stat->extent_size);
 	put_unaligned_be32(header_extent_size, virt);
 	virt += 4;
 	put_unaligned_be16(num_header_extents_at_front, virt);
@@ -1291,7 +1291,8 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
 	offset = ECRYPTFS_FILE_SIZE_BYTES;
 	write_ecryptfs_marker((page_virt + offset), &written);
 	offset += written;
-	write_ecryptfs_flags((page_virt + offset), crypt_stat, &written);
+	ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat,
+					&written);
 	offset += written;
 	ecryptfs_write_header_metadata((page_virt + offset), crypt_stat,
 				       &written);
@@ -1381,7 +1382,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1381 rc = -EINVAL; 1382 rc = -EINVAL;
1382 goto out; 1383 goto out;
1383 } 1384 }
1384 virt_len = crypt_stat->num_header_bytes_at_front; 1385 virt_len = crypt_stat->metadata_size;
1385 order = get_order(virt_len); 1386 order = get_order(virt_len);
1386 /* Released in this function */ 1387 /* Released in this function */
1387 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); 1388 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
@@ -1427,16 +1428,15 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1427 header_extent_size = get_unaligned_be32(virt); 1428 header_extent_size = get_unaligned_be32(virt);
1428 virt += sizeof(__be32); 1429 virt += sizeof(__be32);
1429 num_header_extents_at_front = get_unaligned_be16(virt); 1430 num_header_extents_at_front = get_unaligned_be16(virt);
1430 crypt_stat->num_header_bytes_at_front = 1431 crypt_stat->metadata_size = (((size_t)num_header_extents_at_front
1431 (((size_t)num_header_extents_at_front 1432 * (size_t)header_extent_size));
1432 * (size_t)header_extent_size));
1433 (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); 1433 (*bytes_read) = (sizeof(__be32) + sizeof(__be16));
1434 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) 1434 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
1435 && (crypt_stat->num_header_bytes_at_front 1435 && (crypt_stat->metadata_size
1436 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { 1436 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
1437 rc = -EINVAL; 1437 rc = -EINVAL;
1438 printk(KERN_WARNING "Invalid header size: [%zd]\n", 1438 printk(KERN_WARNING "Invalid header size: [%zd]\n",
1439 crypt_stat->num_header_bytes_at_front); 1439 crypt_stat->metadata_size);
1440 } 1440 }
1441 return rc; 1441 return rc;
1442} 1442}
@@ -1451,8 +1451,7 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1451 */ 1451 */
1452static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) 1452static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
1453{ 1453{
1454 crypt_stat->num_header_bytes_at_front = 1454 crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
1455 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
1456} 1455}
1457 1456
1458/** 1457/**
@@ -1606,6 +1605,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1606 ecryptfs_dentry, 1605 ecryptfs_dentry,
1607 ECRYPTFS_VALIDATE_HEADER_SIZE); 1606 ECRYPTFS_VALIDATE_HEADER_SIZE);
1608 if (rc) { 1607 if (rc) {
1608 memset(page_virt, 0, PAGE_CACHE_SIZE);
1609 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); 1609 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
1610 if (rc) { 1610 if (rc) {
1611 printk(KERN_DEBUG "Valid eCryptfs headers not found in " 1611 printk(KERN_DEBUG "Valid eCryptfs headers not found in "
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8f006a0d6076..906e803f7f79 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -26,6 +26,7 @@
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/fs_stack.h> 28#include <linux/fs_stack.h>
29#include <linux/slab.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
30 31
31/** 32/**
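The single #include <linux/slab.h> added here recurs throughout this merge (file.c, inode.c, keystore.c, kthread.c, main.c, messaging.c, miscdev.c, mmap.c and super.c below, plus the eventfd, exofs, ext2/3/4 and fat files): it is the tree-wide preparation for breaking the implicit inclusion of slab.h via percpu.h. From this point on, any file that calls the allocator must include the header itself; the rule these one-line hunks enforce, as a minimal sketch:

	#include <linux/slab.h>	/* kmalloc, kzalloc, kfree, kmem_cache_* */

	static void *example_alloc(size_t n)
	{
		/* would no longer compile without the explicit include */
		return kmalloc(n, GFP_KERNEL);
	}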
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 542f625312f3..bfc2e0f78f00 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -35,6 +35,7 @@
35#include <linux/scatterlist.h> 35#include <linux/scatterlist.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/nsproxy.h> 37#include <linux/nsproxy.h>
38#include <linux/backing-dev.h>
38 39
39/* Version verification for shared data structures w/ userspace */ 40/* Version verification for shared data structures w/ userspace */
40#define ECRYPTFS_VERSION_MAJOR 0x00 41#define ECRYPTFS_VERSION_MAJOR 0x00
@@ -273,7 +274,7 @@ struct ecryptfs_crypt_stat {
273 u32 flags; 274 u32 flags;
274 unsigned int file_version; 275 unsigned int file_version;
275 size_t iv_bytes; 276 size_t iv_bytes;
276 size_t num_header_bytes_at_front; 277 size_t metadata_size;
277 size_t extent_size; /* Data extent size; default is 4096 */ 278 size_t extent_size; /* Data extent size; default is 4096 */
278 size_t key_size; 279 size_t key_size;
279 size_t extent_shift; 280 size_t extent_shift;
@@ -393,6 +394,7 @@ struct ecryptfs_mount_crypt_stat {
393struct ecryptfs_sb_info { 394struct ecryptfs_sb_info {
394 struct super_block *wsi_sb; 395 struct super_block *wsi_sb;
395 struct ecryptfs_mount_crypt_stat mount_crypt_stat; 396 struct ecryptfs_mount_crypt_stat mount_crypt_stat;
397 struct backing_dev_info bdi;
396}; 398};
397 399
398/* file private data. */ 400/* file private data. */
@@ -464,6 +466,14 @@ struct ecryptfs_daemon {
464 466
465extern struct mutex ecryptfs_daemon_hash_mux; 467extern struct mutex ecryptfs_daemon_hash_mux;
466 468
469static inline size_t
470ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
471{
472 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
473 return 0;
474 return crypt_stat->metadata_size;
475}
476
467static inline struct ecryptfs_file_info * 477static inline struct ecryptfs_file_info *
468ecryptfs_file_to_private(struct file *file) 478ecryptfs_file_to_private(struct file *file)
469{ 479{
@@ -651,6 +661,9 @@ int ecryptfs_decrypt_page(struct page *page);
651int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); 661int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry);
652int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); 662int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
653int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); 663int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
664void ecryptfs_write_crypt_stat_flags(char *page_virt,
665 struct ecryptfs_crypt_stat *crypt_stat,
666 size_t *written);
654int ecryptfs_read_and_validate_header_region(char *data, 667int ecryptfs_read_and_validate_header_region(char *data,
655 struct inode *ecryptfs_inode); 668 struct inode *ecryptfs_inode);
656int ecryptfs_read_and_validate_xattr_region(char *page_virt, 669int ecryptfs_read_and_validate_xattr_region(char *page_virt,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 678172b61be2..e7440a6f5ebf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -25,6 +25,7 @@
25 25
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/poll.h> 27#include <linux/poll.h>
28#include <linux/slab.h>
28#include <linux/mount.h> 29#include <linux/mount.h>
29#include <linux/pagemap.h> 30#include <linux/pagemap.h>
30#include <linux/security.h> 31#include <linux/security.h>
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4a430ab4115c..e2d4418affac 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/mount.h> 31#include <linux/mount.h>
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/slab.h>
34#include <asm/unaligned.h> 35#include <asm/unaligned.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
@@ -323,6 +324,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
323 rc = ecryptfs_read_and_validate_header_region(page_virt, 324 rc = ecryptfs_read_and_validate_header_region(page_virt,
324 ecryptfs_dentry->d_inode); 325 ecryptfs_dentry->d_inode);
325 if (rc) { 326 if (rc) {
327 memset(page_virt, 0, PAGE_CACHE_SIZE);
326 rc = ecryptfs_read_and_validate_xattr_region(page_virt, 328 rc = ecryptfs_read_and_validate_xattr_region(page_virt,
327 ecryptfs_dentry); 329 ecryptfs_dentry);
328 if (rc) { 330 if (rc) {
@@ -335,7 +337,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
335 ecryptfs_dentry->d_sb)->mount_crypt_stat; 337 ecryptfs_dentry->d_sb)->mount_crypt_stat;
336 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { 338 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
337 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 339 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
338 file_size = (crypt_stat->num_header_bytes_at_front 340 file_size = (crypt_stat->metadata_size
339 + i_size_read(lower_dentry->d_inode)); 341 + i_size_read(lower_dentry->d_inode));
340 else 342 else
341 file_size = i_size_read(lower_dentry->d_inode); 343 file_size = i_size_read(lower_dentry->d_inode);
@@ -387,9 +389,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
387 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 389 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
388 if (IS_ERR(lower_dentry)) { 390 if (IS_ERR(lower_dentry)) {
389 rc = PTR_ERR(lower_dentry); 391 rc = PTR_ERR(lower_dentry);
390 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 392 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
391 "lower_dentry = [%s]\n", __func__, rc, 393 "[%d] on lower_dentry = [%s]\n", __func__, rc,
392 ecryptfs_dentry->d_name.name); 394 encrypted_and_encoded_name);
393 goto out_d_drop; 395 goto out_d_drop;
394 } 396 }
395 if (lower_dentry->d_inode) 397 if (lower_dentry->d_inode)
@@ -416,9 +418,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
416 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 418 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
417 if (IS_ERR(lower_dentry)) { 419 if (IS_ERR(lower_dentry)) {
418 rc = PTR_ERR(lower_dentry); 420 rc = PTR_ERR(lower_dentry);
419 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 421 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
420 "lower_dentry = [%s]\n", __func__, rc, 422 "[%d] on lower_dentry = [%s]\n", __func__, rc,
421 encrypted_and_encoded_name); 423 encrypted_and_encoded_name);
422 goto out_d_drop; 424 goto out_d_drop;
423 } 425 }
424lookup_and_interpose: 426lookup_and_interpose:
@@ -455,8 +457,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
455 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); 457 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
456 if (rc) 458 if (rc)
457 goto out_lock; 459 goto out_lock;
458 fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); 460 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
459 fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); 461 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
460 old_dentry->d_inode->i_nlink = 462 old_dentry->d_inode->i_nlink =
461 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; 463 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
462 i_size_write(new_dentry->d_inode, file_size_save); 464 i_size_write(new_dentry->d_inode, file_size_save);
@@ -647,38 +649,17 @@ out_lock:
647 return rc; 649 return rc;
648} 650}
649 651
650static int 652static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
651ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) 653 size_t *bufsiz)
652{ 654{
655 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
653 char *lower_buf; 656 char *lower_buf;
654 size_t lower_bufsiz; 657 size_t lower_bufsiz = PATH_MAX;
655 struct dentry *lower_dentry;
656 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
657 char *plaintext_name;
658 size_t plaintext_name_size;
659 mm_segment_t old_fs; 658 mm_segment_t old_fs;
660 int rc; 659 int rc;
661 660
662 lower_dentry = ecryptfs_dentry_to_lower(dentry);
663 if (!lower_dentry->d_inode->i_op->readlink) {
664 rc = -EINVAL;
665 goto out;
666 }
667 mount_crypt_stat = &ecryptfs_superblock_to_private(
668 dentry->d_sb)->mount_crypt_stat;
669 /*
670 * If the lower filename is encrypted, it will result in a significantly
671 * longer name. If needed, truncate the name after decode and decrypt.
672 */
673 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
674 lower_bufsiz = PATH_MAX;
675 else
676 lower_bufsiz = bufsiz;
677 /* Released in this function */
678 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); 661 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
679 if (lower_buf == NULL) { 662 if (!lower_buf) {
680 printk(KERN_ERR "%s: Out of memory whilst attempting to "
681 "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
682 rc = -ENOMEM; 663 rc = -ENOMEM;
683 goto out; 664 goto out;
684 } 665 }
@@ -688,29 +669,31 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
688 (char __user *)lower_buf, 669 (char __user *)lower_buf,
689 lower_bufsiz); 670 lower_bufsiz);
690 set_fs(old_fs); 671 set_fs(old_fs);
691 if (rc >= 0) { 672 if (rc < 0)
692 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, 673 goto out;
693 &plaintext_name_size, 674 lower_bufsiz = rc;
694 dentry, lower_buf, 675 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
695 rc); 676 lower_buf, lower_bufsiz);
696 if (rc) { 677out:
697 printk(KERN_ERR "%s: Error attempting to decode and "
698 "decrypt filename; rc = [%d]\n", __func__,
699 rc);
700 goto out_free_lower_buf;
701 }
702 /* Check for bufsiz <= 0 done in sys_readlinkat() */
703 rc = copy_to_user(buf, plaintext_name,
704 min((size_t) bufsiz, plaintext_name_size));
705 if (rc)
706 rc = -EFAULT;
707 else
708 rc = plaintext_name_size;
709 kfree(plaintext_name);
710 fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
711 }
712out_free_lower_buf:
713 kfree(lower_buf); 678 kfree(lower_buf);
679 return rc;
680}
681
682static int
683ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
684{
685 char *kbuf;
686 size_t kbufsiz, copied;
687 int rc;
688
689 rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
690 if (rc)
691 goto out;
692 copied = min_t(size_t, bufsiz, kbufsiz);
693 rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
694 kfree(kbuf);
695 fsstack_copy_attr_atime(dentry->d_inode,
696 ecryptfs_dentry_to_lower(dentry)->d_inode);
714out: 697out:
715 return rc; 698 return rc;
716} 699}
@@ -768,7 +751,7 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
768{ 751{
769 loff_t lower_size; 752 loff_t lower_size;
770 753
771 lower_size = crypt_stat->num_header_bytes_at_front; 754 lower_size = ecryptfs_lower_header_size(crypt_stat);
772 if (upper_size != 0) { 755 if (upper_size != 0) {
773 loff_t num_extents; 756 loff_t num_extents;
774 757
@@ -1015,6 +998,28 @@ out:
1015 return rc; 998 return rc;
1016} 999}
1017 1000
1001int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
1002 struct kstat *stat)
1003{
1004 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
1005 int rc = 0;
1006
1007 mount_crypt_stat = &ecryptfs_superblock_to_private(
1008 dentry->d_sb)->mount_crypt_stat;
1009 generic_fillattr(dentry->d_inode, stat);
1010 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
1011 char *target;
1012 size_t targetsiz;
1013
1014 rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz);
1015 if (!rc) {
1016 kfree(target);
1017 stat->size = targetsiz;
1018 }
1019 }
1020 return rc;
1021}
1022
1018int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1023int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1019 struct kstat *stat) 1024 struct kstat *stat)
1020{ 1025{
@@ -1039,7 +1044,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1039 1044
1040 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1045 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1041 if (!lower_dentry->d_inode->i_op->setxattr) { 1046 if (!lower_dentry->d_inode->i_op->setxattr) {
1042 rc = -ENOSYS; 1047 rc = -EOPNOTSUPP;
1043 goto out; 1048 goto out;
1044 } 1049 }
1045 mutex_lock(&lower_dentry->d_inode->i_mutex); 1050 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1057,7 +1062,7 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
1057 int rc = 0; 1062 int rc = 0;
1058 1063
1059 if (!lower_dentry->d_inode->i_op->getxattr) { 1064 if (!lower_dentry->d_inode->i_op->getxattr) {
1060 rc = -ENOSYS; 1065 rc = -EOPNOTSUPP;
1061 goto out; 1066 goto out;
1062 } 1067 }
1063 mutex_lock(&lower_dentry->d_inode->i_mutex); 1068 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1084,7 +1089,7 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
1084 1089
1085 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1090 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1086 if (!lower_dentry->d_inode->i_op->listxattr) { 1091 if (!lower_dentry->d_inode->i_op->listxattr) {
1087 rc = -ENOSYS; 1092 rc = -EOPNOTSUPP;
1088 goto out; 1093 goto out;
1089 } 1094 }
1090 mutex_lock(&lower_dentry->d_inode->i_mutex); 1095 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1101,7 +1106,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
1101 1106
1102 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1107 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1103 if (!lower_dentry->d_inode->i_op->removexattr) { 1108 if (!lower_dentry->d_inode->i_op->removexattr) {
1104 rc = -ENOSYS; 1109 rc = -EOPNOTSUPP;
1105 goto out; 1110 goto out;
1106 } 1111 }
1107 mutex_lock(&lower_dentry->d_inode->i_mutex); 1112 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1132,6 +1137,7 @@ const struct inode_operations ecryptfs_symlink_iops = {
1132 .put_link = ecryptfs_put_link, 1137 .put_link = ecryptfs_put_link,
1133 .permission = ecryptfs_permission, 1138 .permission = ecryptfs_permission,
1134 .setattr = ecryptfs_setattr, 1139 .setattr = ecryptfs_setattr,
1140 .getattr = ecryptfs_getattr_link,
1135 .setxattr = ecryptfs_setxattr, 1141 .setxattr = ecryptfs_setxattr,
1136 .getxattr = ecryptfs_getxattr, 1142 .getxattr = ecryptfs_getxattr,
1137 .listxattr = ecryptfs_listxattr, 1143 .listxattr = ecryptfs_listxattr,
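Three distinct fixes land in the inode.c section above. First, readlink is split in two: ecryptfs_readlink_lower() reads and decrypts the lower target into a kernel buffer, and both ecryptfs_readlink() and the new ecryptfs_getattr_link() are built on it, so stat() on a symlink reports the decrypted target length when filename encryption is enabled (hence .getattr = ecryptfs_getattr_link wired into ecryptfs_symlink_iops). Reading through a __user-typed op into a kernel buffer uses the set_fs() idiom of this era; a self-contained sketch of the pattern, the wrapper name being hypothetical:

	static int read_lower_link(struct dentry *lower_dentry,
				   char *kbuf, int kbufsiz)
	{
		mm_segment_t old_fs = get_fs();
		int rc;

		/* Widen the address limit so an API that expects a
		 * __user pointer accepts a kernel buffer. */
		set_fs(get_ds());
		rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
					(char __user *)kbuf, kbufsiz);
		set_fs(old_fs);
		return rc;
	}

Second, the xattr stubs now fail with -EOPNOTSUPP instead of -ENOSYS: ENOSYS claims the system call itself is unimplemented, while EOPNOTSUPP correctly says that this particular object does not support the operation. Third, ecryptfs_link() copies times and size from the lower *directory* inode rather than from the lower dentry of the new link, matching the other directory-modifying operations, and the lookup failure message drops from KERN_ERR to KERN_DEBUG since a failed lookup is a perfectly normal event.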
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index a0a7847567e9..89c5476506ef 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -32,6 +32,7 @@
32#include <linux/random.h> 32#include <linux/random.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
37/** 38/**
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e14cf7e588db..d8c3a373aafa 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/slab.h>
25#include <linux/wait.h> 26#include <linux/wait.h>
26#include <linux/mount.h> 27#include <linux/mount.h>
27#include "ecryptfs_kernel.h" 28#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index ea2f92101dfe..760983d0f25e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
35#include <linux/key.h> 35#include <linux/key.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/fs_stack.h> 37#include <linux/fs_stack.h>
38#include <linux/slab.h>
38#include "ecryptfs_kernel.h" 39#include "ecryptfs_kernel.h"
39 40
40/** 41/**
@@ -496,17 +497,25 @@ struct kmem_cache *ecryptfs_sb_info_cache;
496static int 497static int
497ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) 498ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
498{ 499{
500 struct ecryptfs_sb_info *esi;
499 int rc = 0; 501 int rc = 0;
500 502
501 /* Released in ecryptfs_put_super() */ 503 /* Released in ecryptfs_put_super() */
502 ecryptfs_set_superblock_private(sb, 504 ecryptfs_set_superblock_private(sb,
503 kmem_cache_zalloc(ecryptfs_sb_info_cache, 505 kmem_cache_zalloc(ecryptfs_sb_info_cache,
504 GFP_KERNEL)); 506 GFP_KERNEL));
505 if (!ecryptfs_superblock_to_private(sb)) { 507 esi = ecryptfs_superblock_to_private(sb);
508 if (!esi) {
506 ecryptfs_printk(KERN_WARNING, "Out of memory\n"); 509 ecryptfs_printk(KERN_WARNING, "Out of memory\n");
507 rc = -ENOMEM; 510 rc = -ENOMEM;
508 goto out; 511 goto out;
509 } 512 }
513
514 rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
515 if (rc)
516 goto out;
517
518 sb->s_bdi = &esi->bdi;
510 sb->s_op = &ecryptfs_sops; 519 sb->s_op = &ecryptfs_sops;
511 /* Released through deactivate_super(sb) from get_sb_nodev */ 520 /* Released through deactivate_super(sb) from get_sb_nodev */
512 sb->s_root = d_alloc(NULL, &(const struct qstr) { 521 sb->s_root = d_alloc(NULL, &(const struct qstr) {
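eCryptfs now owns a backing_dev_info: per-bdi writeback wants every superblock to point at a real bdi rather than the shared default, and a stacked filesystem has no block device to borrow one from. The part worth internalizing is the lifecycle pairing this diff establishes; a condensed sketch (error handling trimmed):

	/* ecryptfs_fill_super() */
	rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
	if (rc)
		goto out;
	sb->s_bdi = &esi->bdi;

	/* ecryptfs_put_super() */
	bdi_destroy(&sb_info->bdi);

bdi_setup_and_register() appends a per-call sequence number to the name, so multiple eCryptfs mounts register distinct, individually visible bdi instances.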
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index f1c17e87c5fb..2d8dbce9d485 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -20,6 +20,7 @@
20 * 02111-1307, USA. 20 * 02111-1307, USA.
21 */ 21 */
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/slab.h>
23#include <linux/user_namespace.h> 24#include <linux/user_namespace.h>
24#include <linux/nsproxy.h> 25#include <linux/nsproxy.h>
25#include "ecryptfs_kernel.h" 26#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 4ec8f61ccf5a..3745f612bcd4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -24,6 +24,7 @@
24#include <linux/random.h> 24#include <linux/random.h>
25#include <linux/miscdevice.h> 25#include <linux/miscdevice.h>
26#include <linux/poll.h> 26#include <linux/poll.h>
27#include <linux/slab.h>
27#include <linux/wait.h> 28#include <linux/wait.h>
28#include <linux/module.h> 29#include <linux/module.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index df4ce99d0597..2ee9a3a7b68c 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include <asm/unaligned.h> 36#include <asm/unaligned.h>
36#include "ecryptfs_kernel.h" 37#include "ecryptfs_kernel.h"
37 38
@@ -82,6 +83,19 @@ out:
82 return rc; 83 return rc;
83} 84}
84 85
86static void strip_xattr_flag(char *page_virt,
87 struct ecryptfs_crypt_stat *crypt_stat)
88{
89 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
90 size_t written;
91
92 crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR;
93 ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat,
94 &written);
95 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
96 }
97}
98
85/** 99/**
86 * Header Extent: 100 * Header Extent:
87 * Octets 0-7: Unencrypted file size (big-endian) 101 * Octets 0-7: Unencrypted file size (big-endian)
@@ -97,19 +111,6 @@ out:
97 * (big-endian) 111 * (big-endian)
98 * Octet 26: Begin RFC 2440 authentication token packet set 112 * Octet 26: Begin RFC 2440 authentication token packet set
99 */ 113 */
100static void set_header_info(char *page_virt,
101 struct ecryptfs_crypt_stat *crypt_stat)
102{
103 size_t written;
104 size_t save_num_header_bytes_at_front =
105 crypt_stat->num_header_bytes_at_front;
106
107 crypt_stat->num_header_bytes_at_front =
108 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
109 ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written);
110 crypt_stat->num_header_bytes_at_front =
111 save_num_header_bytes_at_front;
112}
113 114
114/** 115/**
115 * ecryptfs_copy_up_encrypted_with_header 116 * ecryptfs_copy_up_encrypted_with_header
@@ -135,8 +136,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
135 * num_extents_per_page) 136 * num_extents_per_page)
136 + extent_num_in_page); 137 + extent_num_in_page);
137 size_t num_header_extents_at_front = 138 size_t num_header_extents_at_front =
138 (crypt_stat->num_header_bytes_at_front 139 (crypt_stat->metadata_size / crypt_stat->extent_size);
139 / crypt_stat->extent_size);
140 140
141 if (view_extent_num < num_header_extents_at_front) { 141 if (view_extent_num < num_header_extents_at_front) {
142 /* This is a header extent */ 142 /* This is a header extent */
@@ -146,9 +146,14 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
146 memset(page_virt, 0, PAGE_CACHE_SIZE); 146 memset(page_virt, 0, PAGE_CACHE_SIZE);
147 /* TODO: Support more than one header extent */ 147 /* TODO: Support more than one header extent */
148 if (view_extent_num == 0) { 148 if (view_extent_num == 0) {
149 size_t written;
150
149 rc = ecryptfs_read_xattr_region( 151 rc = ecryptfs_read_xattr_region(
150 page_virt, page->mapping->host); 152 page_virt, page->mapping->host);
151 set_header_info(page_virt, crypt_stat); 153 strip_xattr_flag(page_virt + 16, crypt_stat);
154 ecryptfs_write_header_metadata(page_virt + 20,
155 crypt_stat,
156 &written);
152 } 157 }
153 kunmap_atomic(page_virt, KM_USER0); 158 kunmap_atomic(page_virt, KM_USER0);
154 flush_dcache_page(page); 159 flush_dcache_page(page);
@@ -161,7 +166,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
161 /* This is an encrypted data extent */ 166 /* This is an encrypted data extent */
162 loff_t lower_offset = 167 loff_t lower_offset =
163 ((view_extent_num * crypt_stat->extent_size) 168 ((view_extent_num * crypt_stat->extent_size)
164 - crypt_stat->num_header_bytes_at_front); 169 - crypt_stat->metadata_size);
165 170
166 rc = ecryptfs_read_lower_page_segment( 171 rc = ecryptfs_read_lower_page_segment(
167 page, (lower_offset >> PAGE_CACHE_SHIFT), 172 page, (lower_offset >> PAGE_CACHE_SHIFT),
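In mmap.c, set_header_info() is replaced by strip_xattr_flag() plus a direct call to the now-exported ecryptfs_write_header_metadata(). The purpose: when synthesizing the "encrypted view" of a file whose metadata lives in an xattr, the generated header must look like an ordinary in-file header, so the METADATA_IN_XATTR bit has to be cleared in the serialized flag word while the in-memory flag survives. The save/clear/serialize/restore idiom, as introduced above:

	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
		size_t written;

		crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR;
		ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat,
						&written);
		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
	}

The old helper's save/restore of num_header_bytes_at_front becomes unnecessary once metadata_size holds the real metadata size even for xattr-backed files, which is why set_header_info() can simply disappear.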
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..0c0ae491d231 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -26,6 +26,7 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/slab.h>
29#include <linux/seq_file.h> 30#include <linux/seq_file.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/file.h> 32#include <linux/file.h>
@@ -85,7 +86,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
85 if (lower_dentry->d_inode) { 86 if (lower_dentry->d_inode) {
86 fput(inode_info->lower_file); 87 fput(inode_info->lower_file);
87 inode_info->lower_file = NULL; 88 inode_info->lower_file = NULL;
88 d_drop(lower_dentry);
89 } 89 }
90 } 90 }
91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); 91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
@@ -122,6 +122,7 @@ static void ecryptfs_put_super(struct super_block *sb)
122 lock_kernel(); 122 lock_kernel();
123 123
124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); 124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
125 bdi_destroy(&sb_info->bdi);
125 kmem_cache_free(ecryptfs_sb_info_cache, sb_info); 126 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
126 ecryptfs_set_superblock_private(sb, NULL); 127 ecryptfs_set_superblock_private(sb, NULL);
127 128
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 7758cc382ef0..6bd3f76fdf88 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -11,6 +11,7 @@
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
14#include <linux/list.h> 15#include <linux/list.h>
15#include <linux/spinlock.h> 16#include <linux/spinlock.h>
16#include <linux/anon_inodes.h> 17#include <linux/anon_inodes.h>
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5b4efc..3817149919cb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
1140 * ep_poll_callback() when events will become available. 1140 * ep_poll_callback() when events will become available.
1141 */ 1141 */
1142 init_waitqueue_entry(&wait, current); 1142 init_waitqueue_entry(&wait, current);
1143 wait.flags |= WQ_FLAG_EXCLUSIVE; 1143 __add_wait_queue_exclusive(&ep->wq, &wait);
1144 __add_wait_queue(&ep->wq, &wait);
1145 1144
1146 for (;;) { 1145 for (;;) {
1147 /* 1146 /*
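The eventpoll change is a straight substitution for a recently added helper. __add_wait_queue_exclusive() is, modulo naming, exactly the two lines it replaces:

	static inline void
	__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
	{
		wait->flags |= WQ_FLAG_EXCLUSIVE;
		__add_wait_queue(q, wait);
	}

The exclusive flag is what keeps one ready event from waking every thread blocked in epoll_wait() on the same epoll descriptor: a wake-up stops after the first exclusive waiter it satisfies.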
diff --git a/fs/exec.c b/fs/exec.c
index 49cdaa19e5b9..e6e94c626c2c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1387,8 +1387,6 @@ int do_execve(char * filename,
1387 if (retval < 0) 1387 if (retval < 0)
1388 goto out; 1388 goto out;
1389 1389
1390 current->stack_start = current->mm->start_stack;
1391
1392 /* execve succeeded */ 1390 /* execve succeeded */
1393 current->fs->in_exec = 0; 1391 current->fs->in_exec = 0;
1394 current->in_execve = 0; 1392 current->in_execve = 0;
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 8442e353309f..22721b2fd890 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -35,6 +35,7 @@
35 35
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/backing-dev.h>
38#include "common.h" 39#include "common.h"
39 40
40/* FIXME: Remove once pnfs hits mainline 41/* FIXME: Remove once pnfs hits mainline
@@ -84,6 +85,7 @@ struct exofs_sb_info {
84 u32 s_next_generation; /* next gen # to use */ 85 u32 s_next_generation; /* next gen # to use */
85 atomic_t s_curr_pending; /* number of pending commands */ 86 atomic_t s_curr_pending; /* number of pending commands */
86 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ 87 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
88 struct backing_dev_info bdi; /* register our bdi with VFS */
87 89
88 struct pnfs_osd_data_map data_map; /* Default raid to use 90 struct pnfs_osd_data_map data_map; /* Default raid to use
89 * FIXME: Needed ? 91 * FIXME: Needed ?
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a17e4b733e35..76d2a79ef93e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -31,6 +31,7 @@
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33 33
34#include <linux/slab.h>
34#include <linux/writeback.h> 35#include <linux/writeback.h>
35#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
36#include <scsi/scsi_device.h> 37#include <scsi/scsi_device.h>
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5293bc411d17..4337cad7777b 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -22,6 +22,7 @@
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */ 23 */
24 24
25#include <linux/slab.h>
25#include <scsi/scsi_device.h> 26#include <scsi/scsi_device.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27 28
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 6cf5e4e84d61..03149b9a5178 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -37,6 +37,7 @@
37#include <linux/vfs.h> 37#include <linux/vfs.h>
38#include <linux/random.h> 38#include <linux/random.h>
39#include <linux/exportfs.h> 39#include <linux/exportfs.h>
40#include <linux/slab.h>
40 41
41#include "exofs.h" 42#include "exofs.h"
42 43
@@ -301,6 +302,7 @@ static void exofs_put_super(struct super_block *sb)
301 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], 302 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
302 sbi->layout.s_pid); 303 sbi->layout.s_pid);
303 304
305 bdi_destroy(&sbi->bdi);
304 exofs_free_sbi(sbi); 306 exofs_free_sbi(sbi);
305 sb->s_fs_info = NULL; 307 sb->s_fs_info = NULL;
306} 308}
@@ -545,6 +547,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
545 if (!sbi) 547 if (!sbi)
546 return -ENOMEM; 548 return -ENOMEM;
547 549
550 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
551 if (ret)
552 goto free_bdi;
553
548 /* use mount options to fill superblock */ 554 /* use mount options to fill superblock */
549 od = osduld_path_lookup(opts->dev_name); 555 od = osduld_path_lookup(opts->dev_name);
550 if (IS_ERR(od)) { 556 if (IS_ERR(od)) {
@@ -611,6 +617,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
611 } 617 }
612 618
613 /* set up operation vectors */ 619 /* set up operation vectors */
620 sb->s_bdi = &sbi->bdi;
614 sb->s_fs_info = sbi; 621 sb->s_fs_info = sbi;
615 sb->s_op = &exofs_sops; 622 sb->s_op = &exofs_sops;
616 sb->s_export_op = &exofs_export_ops; 623 sb->s_export_op = &exofs_export_ops;
@@ -642,6 +649,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
642 return 0; 649 return 0;
643 650
644free_sbi: 651free_sbi:
652 bdi_destroy(&sbi->bdi);
653free_bdi:
645 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 654 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
646 opts->dev_name, sbi->layout.s_pid, ret); 655 opts->dev_name, sbi->layout.s_pid, ret);
647 exofs_free_sbi(sbi); 656 exofs_free_sbi(sbi);
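exofs gets the same per-superblock bdi treatment as eCryptfs above; the detail specific to this hunk is the error ladder. bdi_setup_and_register() now runs before anything else in exofs_fill_super(), so the pre-existing free_sbi label acquires a bdi_destroy() and a new free_bdi label covers failures that happen before registration succeeded. The shape, as a sketch:

	ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
	if (ret)
		goto free_bdi;		/* registration failed: nothing to undo */
	/* ... */
free_sbi:
	bdi_destroy(&sbi->bdi);		/* undo the successful registration */
free_bdi:
	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
		  opts->dev_name, sbi->layout.s_pid, ret);
	exofs_free_sbi(sbi);

The matching teardown for a healthy mount sits in exofs_put_super().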
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 1d081f0cfec2..3cf038c055d7 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -13,6 +13,7 @@
13 13
14#include "ext2.h" 14#include "ext2.h"
15#include <linux/quotaops.h> 15#include <linux/quotaops.h>
16#include <linux/slab.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
18#include <linux/capability.h> 19#include <linux/capability.h>
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 4e2426e22bbe..565cf817bbf1 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -32,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
32 .readlink = generic_readlink, 32 .readlink = generic_readlink,
33 .follow_link = page_follow_link_light, 33 .follow_link = page_follow_link_light,
34 .put_link = page_put_link, 34 .put_link = page_put_link,
35 .setattr = ext2_setattr,
35#ifdef CONFIG_EXT2_FS_XATTR 36#ifdef CONFIG_EXT2_FS_XATTR
36 .setxattr = generic_setxattr, 37 .setxattr = generic_setxattr,
37 .getxattr = generic_getxattr, 38 .getxattr = generic_getxattr,
@@ -43,6 +44,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
43const struct inode_operations ext2_fast_symlink_inode_operations = { 44const struct inode_operations ext2_fast_symlink_inode_operations = {
44 .readlink = generic_readlink, 45 .readlink = generic_readlink,
45 .follow_link = ext2_follow_link, 46 .follow_link = ext2_follow_link,
47 .setattr = ext2_setattr,
46#ifdef CONFIG_EXT2_FS_XATTR 48#ifdef CONFIG_EXT2_FS_XATTR
47 .setxattr = generic_setxattr, 49 .setxattr = generic_setxattr,
48 .getxattr = generic_getxattr, 50 .getxattr = generic_getxattr,
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c8155845ac05..b118c6383c6d 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -4,6 +4,7 @@
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/slab.h>
7#include <linux/string.h> 8#include <linux/string.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/ext2_fs.h> 10#include <linux/ext2_fs.h>
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 161da2d3f890..a177122a1b25 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -14,6 +14,7 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/slab.h>
17#include <linux/jbd.h> 18#include <linux/jbd.h>
18#include <linux/ext3_fs.h> 19#include <linux/ext3_fs.h>
19#include <linux/ext3_jbd.h> 20#include <linux/ext3_jbd.h>
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index ef9008b885b5..0d0e97ed3ff6 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -582,7 +582,9 @@ got:
582 inode->i_generation = sbi->s_next_generation++; 582 inode->i_generation = sbi->s_next_generation++;
583 spin_unlock(&sbi->s_next_gen_lock); 583 spin_unlock(&sbi->s_next_gen_lock);
584 584
585 ei->i_state = EXT3_STATE_NEW; 585 ei->i_state_flags = 0;
586 ext3_set_inode_state(inode, EXT3_STATE_NEW);
587
586 ei->i_extra_isize = 588 ei->i_extra_isize =
587 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 589 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
588 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 590 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
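ext3 is mid-conversion here from a directly assigned ei->i_state to an i_state_flags word driven through bit accessors, mirroring what ext4 already did; that is why both the allocation path above and ext3_iget() below start by zeroing i_state_flags, with EXT3_STATE_NEW then set via a helper. Assuming the accessors follow the ext4 pattern, they are thin wrappers over the atomic bitops:

	static inline void ext3_set_inode_state(struct inode *inode, int bit)
	{
		set_bit(bit, &EXT3_I(inode)->i_state_flags);
	}

	static inline int ext3_test_inode_state(struct inode *inode, int bit)
	{
		return test_bit(bit, &EXT3_I(inode)->i_state_flags);
	}

The atomicity is the point: a plain |= on a shared state word can race with another CPU updating a different bit of the same inode.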
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 7f920b7263a4..ea33bdf0a300 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2811,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2811 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 2811 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2812 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; 2812 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2813 2813
2814 ei->i_state = 0; 2814 ei->i_state_flags = 0;
2815 ei->i_dir_start_lookup = 0; 2815 ei->i_dir_start_lookup = 0;
2816 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 2816 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2817 /* We now have enough fields to check if the inode was active or not. 2817 /* We now have enough fields to check if the inode was active or not.
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e844accbf55d..1bee604cc6cd 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -164,7 +164,7 @@ void ext3_msg(struct super_block *sb, const char *prefix,
164 * write out the superblock safely. 164 * write out the superblock safely.
165 * 165 *
166 * We'll just use the journal_abort() error code to record an error in 166 * We'll just use the journal_abort() error code to record an error in
167 * the journal instead. On recovery, the journal will compain about 167 * the journal instead. On recovery, the journal will complain about
168 * that error until we've noted it down and cleared it. 168 * that error until we've noted it down and cleared it.
169 */ 169 */
170 170
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index ff7b4ccd8983..7c4898207776 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext3_setattr,
37#ifdef CONFIG_EXT3_FS_XATTR 38#ifdef CONFIG_EXT3_FS_XATTR
38 .setxattr = generic_setxattr, 39 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 40 .getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
45const struct inode_operations ext3_fast_symlink_inode_operations = { 46const struct inode_operations ext3_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 47 .readlink = generic_readlink,
47 .follow_link = ext3_follow_link, 48 .follow_link = ext3_follow_link,
49 .setattr = ext3_setattr,
48#ifdef CONFIG_EXT3_FS_XATTR 50#ifdef CONFIG_EXT3_FS_XATTR
49 .setxattr = generic_setxattr, 51 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 52 .getxattr = generic_getxattr,
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 474348788dd9..3af91f476dff 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -4,6 +4,7 @@
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/slab.h>
7#include <linux/string.h> 8#include <linux/string.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/ext3_jbd.h> 10#include <linux/ext3_jbd.h>
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 983f0e127493..538c48655084 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -18,6 +18,7 @@
18#include <linux/pagemap.h> 18#include <linux/pagemap.h>
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/slab.h>
21#include "ext4.h" 22#include "ext4.h"
22 23
23struct ext4_system_zone { 24struct ext4_system_zone {
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 94c8ee81f5e1..236b834b4ca8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3879,6 +3879,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
3879 physical += offset; 3879 physical += offset;
3880 length = EXT4_SB(inode->i_sb)->s_inode_size - offset; 3880 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3881 flags |= FIEMAP_EXTENT_DATA_INLINE; 3881 flags |= FIEMAP_EXTENT_DATA_INLINE;
3882 brelse(iloc.bh);
3882 } else { /* external block */ 3883 } else { /* external block */
3883 physical = EXT4_I(inode)->i_file_acl << blockbits; 3884 physical = EXT4_I(inode)->i_file_acl << blockbits;
3884 length = inode->i_sb->s_blocksize; 3885 length = inode->i_sb->s_blocksize;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 361c0b9962a8..57f6eef6ccd6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -263,7 +263,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
263 ext4_group_t f; 263 ext4_group_t f;
264 264
265 f = ext4_flex_group(sbi, block_group); 265 f = ext4_flex_group(sbi, block_group);
266 atomic_dec(&sbi->s_flex_groups[f].free_inodes); 266 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
267 } 267 }
268 268
269 } 269 }
@@ -773,7 +773,7 @@ static int ext4_claim_inode(struct super_block *sb,
773 if (sbi->s_log_groups_per_flex) { 773 if (sbi->s_log_groups_per_flex) {
774 ext4_group_t f = ext4_flex_group(sbi, group); 774 ext4_group_t f = ext4_flex_group(sbi, group);
775 775
776 atomic_inc(&sbi->s_flex_groups[f].free_inodes); 776 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
777 } 777 }
778 } 778 }
779 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 779 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
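The two ialloc.c hunks fix mirrored accounting bugs: when a directory inode was freed or claimed on a flex-group filesystem, the code adjusted free_inodes where it meant used_dirs, so the per-flex-group directory count drifted while free inodes were double-counted. The corrected pair is symmetric:

	/* ext4_free_inode(), directory case */
	atomic_dec(&sbi->s_flex_groups[f].used_dirs);

	/* ext4_claim_inode(), directory case */
	atomic_inc(&sbi->s_flex_groups[f].used_dirs);

The surrounding conditions (not visible in these hunks) apply the update only for directory inodes, which is what makes used_dirs the right counter on both paths.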
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986120f30066..81d605412844 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,6 +39,7 @@
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/slab.h>
42 43
43#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
44#include "xattr.h" 45#include "xattr.h"
@@ -1035,7 +1036,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1035 sector_t lblock) 1036 sector_t lblock)
1036{ 1037{
1037 struct ext4_inode_info *ei = EXT4_I(inode); 1038 struct ext4_inode_info *ei = EXT4_I(inode);
1038 int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1; 1039 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1039 int blk_bits; 1040 int blk_bits;
1040 1041
1041 if (lblock < EXT4_NDIR_BLOCKS) 1042 if (lblock < EXT4_NDIR_BLOCKS)
@@ -1050,7 +1051,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1050 } 1051 }
1051 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; 1052 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1052 ei->i_da_metadata_calc_len = 1; 1053 ei->i_da_metadata_calc_len = 1;
1053 blk_bits = roundup_pow_of_two(lblock + 1); 1054 blk_bits = order_base_2(lblock);
1054 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 1055 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1055} 1056}
1056 1057
@@ -5374,7 +5375,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5374 } else { 5375 } else {
5375 struct ext4_iloc iloc; 5376 struct ext4_iloc iloc;
5376 5377
5377 err = ext4_get_inode_loc(inode, &iloc); 5378 err = __ext4_get_inode_loc(inode, &iloc, 0);
5378 if (err) 5379 if (err)
5379 return err; 5380 return err;
5380 if (wbc->sync_mode == WB_SYNC_ALL) 5381 if (wbc->sync_mode == WB_SYNC_ALL)
@@ -5385,6 +5386,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5385 (unsigned long long)iloc.bh->b_blocknr); 5386 (unsigned long long)iloc.bh->b_blocknr);
5386 err = -EIO; 5387 err = -EIO;
5387 } 5388 }
5389 brelse(iloc.bh);
5388 } 5390 }
5389 return err; 5391 return err;
5390} 5392}
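Two buffer_head leaks are plugged on the ext4 side: ext4_xattr_fiemap() (extents.c above) and the no-journal branch of ext4_write_inode() both obtain an inode location whose iloc.bh carries a reference, and neither released it. The contract, sketched:

	struct ext4_iloc iloc;
	int err;

	err = ext4_get_inode_loc(inode, &iloc);
	if (err)
		return err;
	/* ... read or dirty the raw inode through iloc.bh ... */
	brelse(iloc.bh);	/* mandatory: the lookup took a reference */

ext4_write_inode() also switches to __ext4_get_inode_loc(inode, &iloc, 0); the zero appears to disable the in-memory shortcut, ensuring the inode table block is genuinely read before being synced rather than fabricated from cached state. The dind_mask and order_base_2() hunks in the same file correct the delayed-allocation metadata estimate: the mask must be sector_t-wide and inverted so lblock is rounded *down* to a double-indirect boundary, and order_base_2(lblock) yields the base-2 order that the old roundup_pow_of_two() call (which returns a value, not an exponent) was misused for.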
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 506713a2ebd8..b423a364dca3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,6 +23,7 @@
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/slab.h>
26#include <trace/events/ext4.h> 27#include <trace/events/ext4.h>
27 28
28/* 29/*
@@ -69,7 +70,7 @@
69 * 70 *
70 * pa_lstart -> the logical start block for this prealloc space 71 * pa_lstart -> the logical start block for this prealloc space
71 * pa_pstart -> the physical start block for this prealloc space 72 * pa_pstart -> the physical start block for this prealloc space
72 * pa_len -> lenght for this prealloc space 73 * pa_len -> length for this prealloc space
73 * pa_free -> free space available in this prealloc space 74 * pa_free -> free space available in this prealloc space
74 * 75 *
75 * The inode preallocation space is used looking at the _logical_ start 76 * The inode preallocation space is used looking at the _logical_ start
@@ -2534,6 +2535,17 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2534 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2535 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2535 entry->count, entry->group, entry); 2536 entry->count, entry->group, entry);
2536 2537
2538 if (test_opt(sb, DISCARD)) {
2539 ext4_fsblk_t discard_block;
2540
2541 discard_block = entry->start_blk +
2542 ext4_group_first_block_no(sb, entry->group);
2543 trace_ext4_discard_blocks(sb,
2544 (unsigned long long)discard_block,
2545 entry->count);
2546 sb_issue_discard(sb, discard_block, entry->count);
2547 }
2548
2537 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2549 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2538 /* we expect to find existing buddy because it's pinned */ 2550 /* we expect to find existing buddy because it's pinned */
2539 BUG_ON(err != 0); 2551 BUG_ON(err != 0);
@@ -2555,16 +2567,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2555 page_cache_release(e4b.bd_bitmap_page); 2567 page_cache_release(e4b.bd_bitmap_page);
2556 } 2568 }
2557 ext4_unlock_group(sb, entry->group); 2569 ext4_unlock_group(sb, entry->group);
2558 if (test_opt(sb, DISCARD)) {
2559 ext4_fsblk_t discard_block;
2560
2561 discard_block = entry->start_blk +
2562 ext4_group_first_block_no(sb, entry->group);
2563 trace_ext4_discard_blocks(sb,
2564 (unsigned long long)discard_block,
2565 entry->count);
2566 sb_issue_discard(sb, discard_block, entry->count);
2567 }
2568 kmem_cache_free(ext4_free_ext_cachep, entry); 2570 kmem_cache_free(ext4_free_ext_cachep, entry);
2569 ext4_mb_release_desc(&e4b); 2571 ext4_mb_release_desc(&e4b);
2570 } 2572 }
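The mballoc.c change is purely a move: the sb_issue_discard() block migrates from after the buddy-bitmap release to before ext4_mb_load_buddy(). Discarding while the blocks are still marked in-use closes a race in which another allocator grabs and writes the just-freed range before the (possibly asynchronous) discard reaches the device, erasing fresh data. Schematically:

	/* after this patch */
	if (test_opt(sb, DISCARD))
		sb_issue_discard(sb, discard_block, entry->count);

	err = ext4_mb_load_buddy(sb, entry->group, &e4b);
	/* ... mb_free_blocks() etc.: only past this point do the
	 * blocks become allocatable again ... */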
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 8b87bd0eac95..34dcfc52ef44 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -13,6 +13,7 @@
13 */ 13 */
14 14
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
16#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
17#include "ext4_extents.h" 18#include "ext4_extents.h"
18 19
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index aa5fe28d180f..d1fc662cc311 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/slab.h>
18#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
19#include "ext4_extents.h" 20#include "ext4_extents.h"
20#include "ext4.h" 21#include "ext4.h"
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2b83b96cb2eb..e14d22c170d5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 68static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 69static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 70static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt);
71 73
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE,
77 .name = "ext3",
78 .get_sb = ext4_get_sb,
79 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV,
81};
82#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
83#else
84#define IS_EXT3_SB(sb) (0)
85#endif
72 86
73ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 87ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
74 struct ext4_group_desc *bg) 88 struct ext4_group_desc *bg)
@@ -302,7 +316,7 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
302 * write out the superblock safely. 316 * write out the superblock safely.
303 * 317 *
304 * We'll just use the jbd2_journal_abort() error code to record an error in 318 * We'll just use the jbd2_journal_abort() error code to record an error in
305 * the journal instead. On recovery, the journal will compain about 319 * the journal instead. On recovery, the journal will complain about
306 * that error until we've noted it down and cleared it. 320 * that error until we've noted it down and cleared it.
307 */ 321 */
308 322
@@ -2358,7 +2372,7 @@ static void ext4_sb_release(struct kobject *kobj)
2358} 2372}
2359 2373
2360 2374
2361static struct sysfs_ops ext4_attr_ops = { 2375static const struct sysfs_ops ext4_attr_ops = {
2362 .show = ext4_attr_show, 2376 .show = ext4_attr_show,
2363 .store = ext4_attr_store, 2377 .store = ext4_attr_store,
2364}; 2378};
@@ -2539,7 +2553,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2539 * enable delayed allocation by default 2553 * enable delayed allocation by default
2540 * Use -o nodelalloc to turn it off 2554 * Use -o nodelalloc to turn it off
2541 */ 2555 */
2542 set_opt(sbi->s_mount_opt, DELALLOC); 2556 if (!IS_EXT3_SB(sb))
2557 set_opt(sbi->s_mount_opt, DELALLOC);
2543 2558
2544 if (!parse_options((char *) data, sb, &journal_devnum, 2559 if (!parse_options((char *) data, sb, &journal_devnum,
2545 &journal_ioprio, NULL, 0)) 2560 &journal_ioprio, NULL, 0))
@@ -4068,7 +4083,7 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
4068 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 4083 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
4069} 4084}
4070 4085
4071#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4086#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4072static struct file_system_type ext2_fs_type = { 4087static struct file_system_type ext2_fs_type = {
4073 .owner = THIS_MODULE, 4088 .owner = THIS_MODULE,
4074 .name = "ext2", 4089 .name = "ext2",
@@ -4095,15 +4110,7 @@ static inline void register_as_ext2(void) { }
4095static inline void unregister_as_ext2(void) { } 4110static inline void unregister_as_ext2(void) { }
4096#endif 4111#endif
4097 4112
4098#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4113#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4099static struct file_system_type ext3_fs_type = {
4100 .owner = THIS_MODULE,
4101 .name = "ext3",
4102 .get_sb = ext4_get_sb,
4103 .kill_sb = kill_block_super,
4104 .fs_flags = FS_REQUIRES_DEV,
4105};
4106
4107static inline void register_as_ext3(void) 4114static inline void register_as_ext3(void)
4108{ 4115{
4109 int err = register_filesystem(&ext3_fs_type); 4116 int err = register_filesystem(&ext3_fs_type);
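The super.c churn serves one behavioral change. ext3_fs_type is hoisted above ext4_fill_super() so the new IS_EXT3_SB() macro can compare the block device's holder against it; get_sb_bdev() records the file_system_type pointer as bd_holder when claiming the device, which is what makes the comparison a reliable "was this mounted via the ext3 name?" test:

	#define IS_EXT3_SB(sb)	((sb)->s_bdev->bd_holder == &ext3_fs_type)

	/* in ext4_fill_super(): delalloc stays off for borrowed mounts */
	if (!IS_EXT3_SB(sb))
		set_opt(sbi->s_mount_opt, DELALLOC);

So when ext4 stands in for ext3 (CONFIG_EXT4_USE_FOR_EXT23), delayed allocation is no longer enabled by default, preserving ext3's expected allocation behavior. The same section fixes the CONTIG_EXT2_FS/CONTIG_EXT3_FS misspellings of CONFIG_, which had made the "ext2/ext3 not built here" half of those #if tests vacuously true, and constifies ext4_attr_ops to match the sysfs_ops API change elsewhere in this merge.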
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 983c253999a7..8b145e98df07 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -7,6 +7,7 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/security.h> 9#include <linux/security.h>
10#include <linux/slab.h>
10#include "ext4_jbd2.h" 11#include "ext4_jbd2.h"
11#include "ext4.h" 12#include "ext4.h"
12#include "xattr.h" 13#include "xattr.h"
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 923990e4f16e..113f0a1e565d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/slab.h>
12#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
13#include "fat.h" 14#include "fat.h"
14 15
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index fbeecdc194dc..0ce143bd7d56 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -558,7 +558,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
558 buf->f_bavail = sbi->free_clusters; 558 buf->f_bavail = sbi->free_clusters;
559 buf->f_fsid.val[0] = (u32)id; 559 buf->f_fsid.val[0] = (u32)id;
560 buf->f_fsid.val[1] = (u32)(id >> 32); 560 buf->f_fsid.val[1] = (u32)(id >> 32);
561 buf->f_namelen = sbi->options.isvfat ? 260 : 12; 561 buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12;
562 562
563 return 0; 563 return 0;
564} 564}
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f565f24019b5..6fcc7e71fbaa 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
309{ 309{
310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options; 310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
311 wchar_t *ip, *ext_start, *end, *name_start; 311 wchar_t *ip, *ext_start, *end, *name_start;
312 unsigned char base[9], ext[4], buf[8], *p; 312 unsigned char base[9], ext[4], buf[5], *p;
313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE]; 313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
314 int chl, chi; 314 int chl, chi;
315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen; 315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
467 return 0; 467 return 0;
468 } 468 }
469 469
470 i = jiffies & 0xffff; 470 i = jiffies;
471 sz = (jiffies >> 16) & 0x7; 471 sz = (jiffies >> 16) & 0x7;
472 if (baselen > 2) { 472 if (baselen > 2) {
473 baselen = numtail2_baselen; 473 baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
476 name_res[baselen + 4] = '~'; 476 name_res[baselen + 4] = '~';
477 name_res[baselen + 5] = '1' + sz; 477 name_res[baselen + 5] = '1' + sz;
478 while (1) { 478 while (1) {
479 sprintf(buf, "%04X", i); 479 snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
480 memcpy(&name_res[baselen], buf, 4); 480 memcpy(&name_res[baselen], buf, 4);
481 if (vfat_find_form(dir, name_res) < 0) 481 if (vfat_find_form(dir, name_res) < 0)
482 break; 482 break;
@@ -502,14 +502,14 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); 502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
503 if (*outlen < 0) 503 if (*outlen < 0)
504 return *outlen; 504 return *outlen;
505 else if (*outlen > 255) 505 else if (*outlen > FAT_LFN_LEN)
506 return -ENAMETOOLONG; 506 return -ENAMETOOLONG;
507 507
508 op = &outname[*outlen * sizeof(wchar_t)]; 508 op = &outname[*outlen * sizeof(wchar_t)];
509 } else { 509 } else {
510 if (nls) { 510 if (nls) {
511 for (i = 0, ip = name, op = outname, *outlen = 0; 511 for (i = 0, ip = name, op = outname, *outlen = 0;
512 i < len && *outlen <= 255; 512 i < len && *outlen <= FAT_LFN_LEN;
513 *outlen += 1) 513 *outlen += 1)
514 { 514 {
515 if (escape && (*ip == ':')) { 515 if (escape && (*ip == ':')) {
@@ -549,7 +549,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
549 return -ENAMETOOLONG; 549 return -ENAMETOOLONG;
550 } else { 550 } else {
551 for (i = 0, ip = name, op = outname, *outlen = 0; 551 for (i = 0, ip = name, op = outname, *outlen = 0;
552 i < len && *outlen <= 255; 552 i < len && *outlen <= FAT_LFN_LEN;
553 i++, *outlen += 1) 553 i++, *outlen += 1)
554 { 554 {
555 *op++ = *ip++; 555 *op++ = *ip++;
@@ -701,6 +701,15 @@ static int vfat_find(struct inode *dir, struct qstr *qname,
701 return fat_search_long(dir, qname->name, len, sinfo); 701 return fat_search_long(dir, qname->name, len, sinfo);
702} 702}
703 703
704/*
705 * (nfsd's) anonymous disconnected dentry?
706 * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job).
707 */
708static int vfat_d_anon_disconn(struct dentry *dentry)
709{
710 return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
711}
712
704static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, 713static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
705 struct nameidata *nd) 714 struct nameidata *nd)
706{ 715{
@@ -729,11 +738,11 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
729 } 738 }
730 739
731 alias = d_find_alias(inode); 740 alias = d_find_alias(inode);
732 if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { 741 if (alias && !vfat_d_anon_disconn(alias)) {
733 /* 742 /*
734 * This inode has a non-DCACHE_DISCONNECTED dentry. This 743 * This inode has a dentry that is not anonymous-
735 * means the user did ->lookup() by another name 744 * DCACHE_DISCONNECTED. This means the user did ->lookup()
736 * (the long name vs. its 8.3 alias) in the past. 745 * by another name (the long name vs. its 8.3 alias) in the past.
737 * 746 *
738 * Switch to the new one for locality, if possible. 747 * Switch to the new one for locality, if possible.
739 */ 748 */
@@ -743,7 +752,9 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
743 iput(inode); 752 iput(inode);
744 unlock_super(sb); 753 unlock_super(sb);
745 return alias; 754 return alias;
746 } 755 } else
756 dput(alias);
757
747out: 758out:
748 unlock_super(sb); 759 unlock_super(sb);
749 dentry->d_op = sb->s_root->d_op; 760 dentry->d_op = sb->s_root->d_op;
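
The added dput(alias) above closes a dentry reference leak: d_find_alias() returns its result with a reference held, so a caller that decides not to use the alias must drop it. A condensed, illustrative restatement of that discipline (assumes the vfat_d_anon_disconn() helper added above is visible; not a drop-in for vfat_lookup()):

#include <linux/fs.h>
#include <linux/dcache.h>

static struct dentry *pick_alias(struct inode *inode)
{
	struct dentry *alias = d_find_alias(inode);	/* reference held */

	if (alias && !vfat_d_anon_disconn(alias))
		return alias;	/* reference passes on to our caller */

	dput(alias);		/* unused (dput() tolerates NULL): drop it */
	return NULL;
}
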
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f9075e..0a140741b39e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -614,9 +614,15 @@ int send_sigurg(struct fown_struct *fown)
614 return ret; 614 return ret;
615} 615}
616 616
617static DEFINE_RWLOCK(fasync_lock); 617static DEFINE_SPINLOCK(fasync_lock);
618static struct kmem_cache *fasync_cache __read_mostly; 618static struct kmem_cache *fasync_cache __read_mostly;
619 619
620static void fasync_free_rcu(struct rcu_head *head)
621{
622 kmem_cache_free(fasync_cache,
623 container_of(head, struct fasync_struct, fa_rcu));
624}
625
620/* 626/*
621 * Remove a fasync entry. If successfully removed, return 627 * Remove a fasync entry. If successfully removed, return
622 * positive and clear the FASYNC flag. If no entry exists, 628 * positive and clear the FASYNC flag. If no entry exists,
@@ -625,8 +631,6 @@ static struct kmem_cache *fasync_cache __read_mostly;
625 * NOTE! It is very important that the FASYNC flag always 631 * NOTE! It is very important that the FASYNC flag always
626 * match the state "is the filp on a fasync list". 632 * match the state "is the filp on a fasync list".
627 * 633 *
628 * We always take the 'filp->f_lock' first, since fasync_lock
629 * needs to be irq-safe.
630 */ 634 */
631static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) 635static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
632{ 636{
@@ -634,17 +638,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
634 int result = 0; 638 int result = 0;
635 639
636 spin_lock(&filp->f_lock); 640 spin_lock(&filp->f_lock);
637 write_lock_irq(&fasync_lock); 641 spin_lock(&fasync_lock);
638 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 642 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
639 if (fa->fa_file != filp) 643 if (fa->fa_file != filp)
640 continue; 644 continue;
645
646 spin_lock_irq(&fa->fa_lock);
647 fa->fa_file = NULL;
648 spin_unlock_irq(&fa->fa_lock);
649
641 *fp = fa->fa_next; 650 *fp = fa->fa_next;
642 kmem_cache_free(fasync_cache, fa); 651 call_rcu(&fa->fa_rcu, fasync_free_rcu);
643 filp->f_flags &= ~FASYNC; 652 filp->f_flags &= ~FASYNC;
644 result = 1; 653 result = 1;
645 break; 654 break;
646 } 655 }
647 write_unlock_irq(&fasync_lock); 656 spin_unlock(&fasync_lock);
648 spin_unlock(&filp->f_lock); 657 spin_unlock(&filp->f_lock);
649 return result; 658 return result;
650} 659}
@@ -666,25 +675,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
666 return -ENOMEM; 675 return -ENOMEM;
667 676
668 spin_lock(&filp->f_lock); 677 spin_lock(&filp->f_lock);
669 write_lock_irq(&fasync_lock); 678 spin_lock(&fasync_lock);
670 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 679 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
671 if (fa->fa_file != filp) 680 if (fa->fa_file != filp)
672 continue; 681 continue;
682
683 spin_lock_irq(&fa->fa_lock);
673 fa->fa_fd = fd; 684 fa->fa_fd = fd;
685 spin_unlock_irq(&fa->fa_lock);
686
674 kmem_cache_free(fasync_cache, new); 687 kmem_cache_free(fasync_cache, new);
675 goto out; 688 goto out;
676 } 689 }
677 690
691 spin_lock_init(&new->fa_lock);
678 new->magic = FASYNC_MAGIC; 692 new->magic = FASYNC_MAGIC;
679 new->fa_file = filp; 693 new->fa_file = filp;
680 new->fa_fd = fd; 694 new->fa_fd = fd;
681 new->fa_next = *fapp; 695 new->fa_next = *fapp;
682 *fapp = new; 696 rcu_assign_pointer(*fapp, new);
683 result = 1; 697 result = 1;
684 filp->f_flags |= FASYNC; 698 filp->f_flags |= FASYNC;
685 699
686out: 700out:
687 write_unlock_irq(&fasync_lock); 701 spin_unlock(&fasync_lock);
688 spin_unlock(&filp->f_lock); 702 spin_unlock(&filp->f_lock);
689 return result; 703 return result;
690} 704}
@@ -704,37 +718,41 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
704 718
705EXPORT_SYMBOL(fasync_helper); 719EXPORT_SYMBOL(fasync_helper);
706 720
707void __kill_fasync(struct fasync_struct *fa, int sig, int band) 721/*
722 * rcu_read_lock() is held
723 */
724static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
708{ 725{
709 while (fa) { 726 while (fa) {
710 struct fown_struct * fown; 727 struct fown_struct *fown;
711 if (fa->magic != FASYNC_MAGIC) { 728 if (fa->magic != FASYNC_MAGIC) {
712 printk(KERN_ERR "kill_fasync: bad magic number in " 729 printk(KERN_ERR "kill_fasync: bad magic number in "
713 "fasync_struct!\n"); 730 "fasync_struct!\n");
714 return; 731 return;
715 } 732 }
716 fown = &fa->fa_file->f_owner; 733 spin_lock(&fa->fa_lock);
717 /* Don't send SIGURG to processes which have not set a 734 if (fa->fa_file) {
718 queued signum: SIGURG has its own default signalling 735 fown = &fa->fa_file->f_owner;
719 mechanism. */ 736 /* Don't send SIGURG to processes which have not set a
720 if (!(sig == SIGURG && fown->signum == 0)) 737 queued signum: SIGURG has its own default signalling
721 send_sigio(fown, fa->fa_fd, band); 738 mechanism. */
722 fa = fa->fa_next; 739 if (!(sig == SIGURG && fown->signum == 0))
740 send_sigio(fown, fa->fa_fd, band);
741 }
742 spin_unlock(&fa->fa_lock);
743 fa = rcu_dereference(fa->fa_next);
723 } 744 }
724} 745}
725 746
726EXPORT_SYMBOL(__kill_fasync);
727
728void kill_fasync(struct fasync_struct **fp, int sig, int band) 747void kill_fasync(struct fasync_struct **fp, int sig, int band)
729{ 748{
730 /* First a quick test without locking: usually 749 /* First a quick test without locking: usually
731 * the list is empty. 750 * the list is empty.
732 */ 751 */
733 if (*fp) { 752 if (*fp) {
734 read_lock(&fasync_lock); 753 rcu_read_lock();
735 /* reread *fp after obtaining the lock */ 754 kill_fasync_rcu(rcu_dereference(*fp), sig, band);
736 __kill_fasync(*fp, sig, band); 755 rcu_read_unlock();
737 read_unlock(&fasync_lock);
738 } 756 }
739} 757}
740EXPORT_SYMBOL(kill_fasync); 758EXPORT_SYMBOL(kill_fasync);
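
Taken together, the fcntl.c changes above are the standard RCU list conversion: writers serialize on a plain (no longer IRQ-disabling) spinlock, publish new entries with rcu_assign_pointer(), and defer frees through call_rcu(), while kill_fasync() walks the list under rcu_read_lock()/rcu_dereference() without ever blocking writers. A condensed kernel-style sketch of the same shape (illustrative module code, not the patched file; the per-entry fa_lock taken above around fa_file/fa_fd is omitted for brevity):

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct node {
	int		val;
	struct node	*next;
	struct rcu_head	rcu;
};

static struct node *head;
static DEFINE_SPINLOCK(list_lock);	/* serializes writers; readers use RCU */

static void node_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct node, rcu));
}

/* Writer: initialise fully, then publish with rcu_assign_pointer(). */
static void list_add_front(struct node *new)
{
	spin_lock(&list_lock);
	new->next = head;
	rcu_assign_pointer(head, new);
	spin_unlock(&list_lock);
}

/* Writer: unlink under the lock, free only after a grace period. */
static void list_remove(int val)
{
	struct node **fp, *n;

	spin_lock(&list_lock);
	for (fp = &head; (n = *fp) != NULL; fp = &n->next) {
		if (n->val != val)
			continue;
		*fp = n->next;			/* readers may still hold n */
		call_rcu(&n->rcu, node_free_rcu);
		break;
	}
	spin_unlock(&list_lock);
}

/* Reader: lockless walk, like kill_fasync_rcu() above. */
static bool list_contains(int val)
{
	struct node *n;
	bool hit = false;

	rcu_read_lock();
	for (n = rcu_dereference(head); n; n = rcu_dereference(n->next))
		if (n->val == val) {
			hit = true;
			break;
		}
	rcu_read_unlock();
	return hit;
}
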
diff --git a/fs/fifo.c b/fs/fifo.c
index f8f97b8b6d44..5d6606ffc2d2 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/sched.h> 14#include <linux/sched.h>
16#include <linux/pipe_fs_i.h> 15#include <linux/pipe_fs_i.h>
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a24c58e181db..68ba492d8eef 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -10,10 +10,10 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/kmod.h> 13#include <linux/kmod.h>
15#include <linux/init.h> 14#include <linux/init.h>
16#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18 18
19/* 19/*
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index ed8f0b0dd880..1429f3ae1e86 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -33,7 +33,6 @@
33#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/buffer_head.h> 34#include <linux/buffer_head.h>
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/pagemap.h> 36#include <linux/pagemap.h>
38 37
39#include "vxfs_extern.h" 38#include "vxfs_extern.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76fc4d594acb..4b37f7cea4dd 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -16,6 +16,7 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/slab.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/mm.h> 22#include <linux/mm.h>
@@ -553,108 +554,85 @@ select_queue:
553 return ret; 554 return ret;
554} 555}
555 556
556static void unpin_sb_for_writeback(struct super_block **psb) 557static void unpin_sb_for_writeback(struct super_block *sb)
557{ 558{
558 struct super_block *sb = *psb; 559 up_read(&sb->s_umount);
559 560 put_super(sb);
560 if (sb) {
561 up_read(&sb->s_umount);
562 put_super(sb);
563 *psb = NULL;
564 }
565} 561}
566 562
563enum sb_pin_state {
564 SB_PINNED,
565 SB_NOT_PINNED,
566 SB_PIN_FAILED
567};
568
567/* 569/*
568 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned 570 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
569 * before calling writeback. So make sure that we do pin it, so it doesn't 571 * before calling writeback. So make sure that we do pin it, so it doesn't
570 * go away while we are writing inodes from it. 572 * go away while we are writing inodes from it.
571 *
572 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
573 * 1 if we failed.
574 */ 573 */
575static int pin_sb_for_writeback(struct writeback_control *wbc, 574static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
576 struct inode *inode, struct super_block **psb) 575 struct super_block *sb)
577{ 576{
578 struct super_block *sb = inode->i_sb;
579
580 /*
581 * If this sb is already pinned, nothing more to do. If not and
582 * *psb is non-NULL, unpin the old one first
583 */
584 if (sb == *psb)
585 return 0;
586 else if (*psb)
587 unpin_sb_for_writeback(psb);
588
589 /* 577 /*
590 * Caller must already hold the ref for this 578 * Caller must already hold the ref for this
591 */ 579 */
592 if (wbc->sync_mode == WB_SYNC_ALL) { 580 if (wbc->sync_mode == WB_SYNC_ALL) {
593 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 581 WARN_ON(!rwsem_is_locked(&sb->s_umount));
594 return 0; 582 return SB_NOT_PINNED;
595 } 583 }
596
597 spin_lock(&sb_lock); 584 spin_lock(&sb_lock);
598 sb->s_count++; 585 sb->s_count++;
599 if (down_read_trylock(&sb->s_umount)) { 586 if (down_read_trylock(&sb->s_umount)) {
600 if (sb->s_root) { 587 if (sb->s_root) {
601 spin_unlock(&sb_lock); 588 spin_unlock(&sb_lock);
602 goto pinned; 589 return SB_PINNED;
603 } 590 }
604 /* 591 /*
605 * umounted, drop rwsem again and fall through to failure 592 * umounted, drop rwsem again and fall through to failure
606 */ 593 */
607 up_read(&sb->s_umount); 594 up_read(&sb->s_umount);
608 } 595 }
609
610 sb->s_count--; 596 sb->s_count--;
611 spin_unlock(&sb_lock); 597 spin_unlock(&sb_lock);
612 return 1; 598 return SB_PIN_FAILED;
613pinned:
614 *psb = sb;
615 return 0;
616} 599}
617 600
618static void writeback_inodes_wb(struct bdi_writeback *wb, 601/*
619 struct writeback_control *wbc) 602 * Write a portion of b_io inodes which belong to @sb.
603 * If @wbc->sb != NULL, then find and write all such
604 * inodes. Otherwise, write only the ones that occur
605 * sequentially, in reverse order.
606 * Return 1 if the caller's writeback routine should be
607 * interrupted; otherwise return 0.
608 */
609static int writeback_sb_inodes(struct super_block *sb,
610 struct bdi_writeback *wb,
611 struct writeback_control *wbc)
620{ 612{
621 struct super_block *sb = wbc->sb, *pin_sb = NULL;
622 const unsigned long start = jiffies; /* livelock avoidance */
623
624 spin_lock(&inode_lock);
625
626 if (!wbc->for_kupdate || list_empty(&wb->b_io))
627 queue_io(wb, wbc->older_than_this);
628
629 while (!list_empty(&wb->b_io)) { 613 while (!list_empty(&wb->b_io)) {
630 struct inode *inode = list_entry(wb->b_io.prev,
631 struct inode, i_list);
632 long pages_skipped; 614 long pages_skipped;
633 615 struct inode *inode = list_entry(wb->b_io.prev,
634 /* 616 struct inode, i_list);
635 * super block given and doesn't match, skip this inode 617 if (wbc->sb && sb != inode->i_sb) {
636 */ 618 /* super block given and doesn't
637 if (sb && sb != inode->i_sb) { 619 match, skip this inode */
638 redirty_tail(inode); 620 redirty_tail(inode);
639 continue; 621 continue;
640 } 622 }
641 623 if (sb != inode->i_sb)
624 /* finish with this superblock */
625 return 0;
642 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 626 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
643 requeue_io(inode); 627 requeue_io(inode);
644 continue; 628 continue;
645 } 629 }
646
647 /* 630 /*
648 * Was this inode dirtied after sync_sb_inodes was called? 631 * Was this inode dirtied after sync_sb_inodes was called?
649 * This keeps sync from extra jobs and livelock. 632 * This keeps sync from extra jobs and livelock.
650 */ 633 */
651 if (inode_dirtied_after(inode, start)) 634 if (inode_dirtied_after(inode, wbc->wb_start))
652 break; 635 return 1;
653
654 if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
655 requeue_io(inode);
656 continue;
657 }
658 636
659 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 637 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
660 __iget(inode); 638 __iget(inode);
@@ -673,14 +651,50 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
673 spin_lock(&inode_lock); 651 spin_lock(&inode_lock);
674 if (wbc->nr_to_write <= 0) { 652 if (wbc->nr_to_write <= 0) {
675 wbc->more_io = 1; 653 wbc->more_io = 1;
676 break; 654 return 1;
677 } 655 }
678 if (!list_empty(&wb->b_more_io)) 656 if (!list_empty(&wb->b_more_io))
679 wbc->more_io = 1; 657 wbc->more_io = 1;
680 } 658 }
659 /* b_io is empty */
660 return 1;
661}
662
663static void writeback_inodes_wb(struct bdi_writeback *wb,
664 struct writeback_control *wbc)
665{
666 int ret = 0;
681 667
682 unpin_sb_for_writeback(&pin_sb); 668 wbc->wb_start = jiffies; /* livelock avoidance */
669 spin_lock(&inode_lock);
670 if (!wbc->for_kupdate || list_empty(&wb->b_io))
671 queue_io(wb, wbc->older_than_this);
672
673 while (!list_empty(&wb->b_io)) {
674 struct inode *inode = list_entry(wb->b_io.prev,
675 struct inode, i_list);
676 struct super_block *sb = inode->i_sb;
677 enum sb_pin_state state;
678
679 if (wbc->sb && sb != wbc->sb) {
680 /* super block given and doesn't
681 match, skip this inode */
682 redirty_tail(inode);
683 continue;
684 }
685 state = pin_sb_for_writeback(wbc, sb);
686
687 if (state == SB_PIN_FAILED) {
688 requeue_io(inode);
689 continue;
690 }
691 ret = writeback_sb_inodes(sb, wb, wbc);
683 692
693 if (state == SB_PINNED)
694 unpin_sb_for_writeback(sb);
695 if (ret)
696 break;
697 }
684 spin_unlock(&inode_lock); 698 spin_unlock(&inode_lock);
685 /* Leave any unwritten inodes on b_io */ 699 /* Leave any unwritten inodes on b_io */
686} 700}
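
The writeback refactor above replaces the old out-parameter contract of pin_sb_for_writeback() with an explicit tri-state, so the caller can tell "pinned, must unpin" apart from "no pin was needed" (the WB_SYNC_ALL case) and "failed, requeue". A userspace sketch of that shape, with a pthread rwlock standing in for s_umount and the sb_lock/s_count refcounting elided (all names illustrative):

#include <pthread.h>

enum pin_state { PINNED, NOT_PINNED, PIN_FAILED };

struct resource {
	pthread_rwlock_t umount;	/* stands in for sb->s_umount */
	int alive;			/* stands in for sb->s_root != NULL */
};

static enum pin_state pin(struct resource *r, int caller_already_holds)
{
	if (caller_already_holds)
		return NOT_PINNED;	/* nothing for the caller to undo */
	if (pthread_rwlock_tryrdlock(&r->umount) == 0) {
		if (r->alive)
			return PINNED;
		pthread_rwlock_unlock(&r->umount);	/* torn down under us */
	}
	return PIN_FAILED;
}

static void unpin(struct resource *r)
{
	pthread_rwlock_unlock(&r->umount);
}

int main(void)
{
	struct resource r = { .alive = 1 };
	enum pin_state st;

	pthread_rwlock_init(&r.umount, NULL);
	st = pin(&r, 0);
	if (st == PIN_FAILED)
		return 1;	/* requeue_io() in the original */
	/* ... writeback_sb_inodes() would run here ... */
	if (st == PINNED)
		unpin(&r);	/* undo only what we ourselves took */
	return 0;
}
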
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 864dac20a242..cc94bb9563f2 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
1 1
2config FSCACHE 2config FSCACHE
3 tristate "General filesystem local caching manager" 3 tristate "General filesystem local caching manager"
4 depends on EXPERIMENTAL
5 select SLOW_WORK 4 select SLOW_WORK
6 help 5 help
7 This option enables a generic filesystem caching manager that can be 6 This option enables a generic filesystem caching manager that can be
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 3221a0c7944e..1e1f286dd70e 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -12,6 +12,7 @@
12#define FSCACHE_DEBUG_LEVEL COOKIE 12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/seq_file.h> 14#include <linux/seq_file.h>
15#include <linux/slab.h>
15#include <linux/key.h> 16#include <linux/key.h>
16#include <keys/user-type.h> 17#include <keys/user-type.h>
17#include "internal.h" 18#include "internal.h"
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8e..0b589a9b4ffc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
53static void fscache_object_slow_work_put_ref(struct slow_work *); 53static void fscache_object_slow_work_put_ref(struct slow_work *);
54static int fscache_object_slow_work_get_ref(struct slow_work *); 54static int fscache_object_slow_work_get_ref(struct slow_work *);
55static void fscache_object_slow_work_execute(struct slow_work *); 55static void fscache_object_slow_work_execute(struct slow_work *);
56#ifdef CONFIG_SLOW_WORK_PROC 56#ifdef CONFIG_SLOW_WORK_DEBUG
57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); 57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
58#endif 58#endif
59static void fscache_initialise_object(struct fscache_object *); 59static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
69 .get_ref = fscache_object_slow_work_get_ref, 69 .get_ref = fscache_object_slow_work_get_ref,
70 .put_ref = fscache_object_slow_work_put_ref, 70 .put_ref = fscache_object_slow_work_put_ref,
71 .execute = fscache_object_slow_work_execute, 71 .execute = fscache_object_slow_work_execute,
72#ifdef CONFIG_SLOW_WORK_PROC 72#ifdef CONFIG_SLOW_WORK_DEBUG
73 .desc = fscache_object_slow_work_desc, 73 .desc = fscache_object_slow_work_desc,
74#endif 74#endif
75}; 75};
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
364/* 364/*
365 * describe an object for slow-work debugging 365 * describe an object for slow-work debugging
366 */ 366 */
367#ifdef CONFIG_SLOW_WORK_PROC 367#ifdef CONFIG_SLOW_WORK_DEBUG
368static void fscache_object_slow_work_desc(struct slow_work *work, 368static void fscache_object_slow_work_desc(struct slow_work *work,
369 struct seq_file *m) 369 struct seq_file *m)
370{ 370{
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a14266..f17cecafae44 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -14,6 +14,7 @@
14#define FSCACHE_DEBUG_LEVEL OPERATION 14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19atomic_t fscache_op_debug_id; 20atomic_t fscache_op_debug_id;
@@ -500,7 +501,7 @@ static void fscache_op_execute(struct slow_work *work)
500/* 501/*
501 * describe an operation for slow-work debugging 502 * describe an operation for slow-work debugging
502 */ 503 */
503#ifdef CONFIG_SLOW_WORK_PROC 504#ifdef CONFIG_SLOW_WORK_DEBUG
504static void fscache_op_desc(struct slow_work *work, struct seq_file *m) 505static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
505{ 506{
506 struct fscache_operation *op = 507 struct fscache_operation *op =
@@ -517,7 +518,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
517 .get_ref = fscache_op_get_ref, 518 .get_ref = fscache_op_get_ref,
518 .put_ref = fscache_op_put_ref, 519 .put_ref = fscache_op_put_ref,
519 .execute = fscache_op_execute, 520 .execute = fscache_op_execute,
520#ifdef CONFIG_SLOW_WORK_PROC 521#ifdef CONFIG_SLOW_WORK_DEBUG
521 .desc = fscache_op_desc, 522 .desc = fscache_op_desc,
522#endif 523#endif
523}; 524};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..47aefd376e54 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -14,6 +14,7 @@
14#include <linux/fscache-cache.h> 14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19/* 20/*
@@ -881,6 +882,7 @@ submit_failed:
881 goto nobufs; 882 goto nobufs;
882 883
883nobufs_unlock_obj: 884nobufs_unlock_obj:
885 spin_unlock(&cookie->stores_lock);
884 spin_unlock(&object->lock); 886 spin_unlock(&object->lock);
885nobufs: 887nobufs:
886 spin_unlock(&cookie->lock); 888 spin_unlock(&cookie->lock);
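
The one-line fscache fix above adds the unlock of cookie->stores_lock that the nobufs_unlock_obj error path was missing; each goto label must release exactly the locks taken before the failure point, in reverse order. A kernel-style sketch of the idiom under assumed minimal types (xcookie, xobject, and the two checks are hypothetical):

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/spinlock.h>

struct xcookie { spinlock_t lock; spinlock_t stores_lock; };
struct xobject { spinlock_t lock; };

static bool object_live(struct xobject *object) { return true; }  /* hypothetical */
static bool queue_store(struct xobject *object) { return true; }  /* hypothetical */

static int do_store(struct xobject *object, struct xcookie *cookie)
{
	spin_lock(&cookie->lock);
	if (!object_live(object))
		goto nobufs;

	spin_lock(&object->lock);
	spin_lock(&cookie->stores_lock);
	if (!queue_store(object))
		goto nobufs_unlock_obj;

	spin_unlock(&cookie->stores_lock);
	spin_unlock(&object->lock);
	spin_unlock(&cookie->lock);
	return 0;

nobufs_unlock_obj:
	spin_unlock(&cookie->stores_lock);	/* the unlock the hunk adds */
	spin_unlock(&object->lock);
nobufs:
	spin_unlock(&cookie->lock);
	return -ENOBUFS;
}
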
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 46435f3aae68..4765190d537f 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -165,8 +165,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
165 atomic_read(&fscache_n_object_lookups), 165 atomic_read(&fscache_n_object_lookups),
166 atomic_read(&fscache_n_object_lookups_negative), 166 atomic_read(&fscache_n_object_lookups_negative),
167 atomic_read(&fscache_n_object_lookups_positive), 167 atomic_read(&fscache_n_object_lookups_positive),
168 atomic_read(&fscache_n_object_lookups_timed_out), 168 atomic_read(&fscache_n_object_created),
169 atomic_read(&fscache_n_object_created)); 169 atomic_read(&fscache_n_object_lookups_timed_out));
170 170
171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n", 171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
172 atomic_read(&fscache_n_updates), 172 atomic_read(&fscache_n_updates),
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de792dcf3274..e1f8171278bd 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -44,6 +44,7 @@
44#include <linux/magic.h> 44#include <linux/magic.h>
45#include <linux/miscdevice.h> 45#include <linux/miscdevice.h>
46#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/slab.h>
47#include <linux/spinlock.h> 48#include <linux/spinlock.h>
48#include <linux/stat.h> 49#include <linux/stat.h>
49 50
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1a822ce2b24b..ec14d19ce501 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -850,7 +850,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
850 req->in.args[0].size = sizeof(*arg); 850 req->in.args[0].size = sizeof(*arg);
851 req->in.args[0].value = arg; 851 req->in.args[0].value = arg;
852 req->out.numargs = 1; 852 req->out.numargs = 1;
853 /* Variable length arguement used for backward compatibility 853 /* Variable length argument used for backward compatibility
854 with interface version < 7.5. Rest of init_out is zeroed 854 with interface version < 7.5. Rest of init_out is zeroed
855 by do_get_request(), so a short reply is not a problem */ 855 by do_get_request(), so a short reply is not a problem */
856 req->out.argvar = 1; 856 req->out.argvar = 1;
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 55458031e501..fe5df5457656 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/gfp.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/generic_acl.h> 12#include <linux/generic_acl.h>
12#include <linux/posix_acl.h> 13#include <linux/posix_acl.h>
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 4dcddf83326f..a47b43107112 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,7 +8,6 @@ config GFS2_FS
8 select FS_POSIX_ACL 8 select FS_POSIX_ACL
9 select CRC32 9 select CRC32
10 select SLOW_WORK 10 select SLOW_WORK
11 select QUOTA
12 select QUOTACTL 11 select QUOTACTL
13 help 12 help
14 A cluster filesystem. 13 A cluster filesystem.
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0c1d0b82dcf1..a739a0a48067 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -418,6 +418,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
418static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) 418static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
419{ 419{
420 struct buffer_head *dibh; 420 struct buffer_head *dibh;
421 u64 dsize = i_size_read(&ip->i_inode);
421 void *kaddr; 422 void *kaddr;
422 int error; 423 int error;
423 424
@@ -437,9 +438,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
437 return error; 438 return error;
438 439
439 kaddr = kmap_atomic(page, KM_USER0); 440 kaddr = kmap_atomic(page, KM_USER0);
440 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 441 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
441 ip->i_disksize); 442 dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
442 memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); 443 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
444 memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
443 kunmap_atomic(kaddr, KM_USER0); 445 kunmap_atomic(kaddr, KM_USER0);
444 flush_dcache_page(page); 446 flush_dcache_page(page);
445 brelse(dibh); 447 brelse(dibh);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 583e823307ae..4a48c0f4b402 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -72,11 +71,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
72 71
73 if (!PageUptodate(page)) { 72 if (!PageUptodate(page)) {
74 void *kaddr = kmap(page); 73 void *kaddr = kmap(page);
74 u64 dsize = i_size_read(inode);
75
76 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
77 dsize = dibh->b_size - sizeof(struct gfs2_dinode);
75 78
76 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 79 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
77 ip->i_disksize); 80 memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
78 memset(kaddr + ip->i_disksize, 0,
79 PAGE_CACHE_SIZE - ip->i_disksize);
80 kunmap(page); 81 kunmap(page);
81 82
82 SetPageUptodate(page); 83 SetPageUptodate(page);
@@ -1039,13 +1040,14 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1039 goto out; 1040 goto out;
1040 1041
1041 if (gfs2_is_stuffed(ip)) { 1042 if (gfs2_is_stuffed(ip)) {
1042 ip->i_disksize = size; 1043 u64 dsize = size + sizeof(struct gfs2_dinode);
1043 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1044 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1044 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1045 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1045 gfs2_dinode_out(ip, dibh->b_data); 1046 gfs2_dinode_out(ip, dibh->b_data);
1046 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); 1047 if (dsize > dibh->b_size)
1048 dsize = dibh->b_size;
1049 gfs2_buffer_clear_tail(dibh, dsize);
1047 error = 1; 1050 error = 1;
1048
1049 } else { 1051 } else {
1050 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 1052 if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
1051 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 1053 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
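
The three gfs2 hunks above (stuffed_readpage(), gfs2_unstuffer_page(), trunc_start()) all add the same guard: the byte count derived from the inode size is clamped to what the dinode block can actually hold past its on-disk header before any copy or clear-tail touches it. A userspace sketch of the clamp, assuming block size does not exceed page size as in gfs2 (HDR_SIZE is an illustrative constant, not the real sizeof(struct gfs2_dinode)):

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096
#define HDR_SIZE  232	/* illustrative stand-in for the on-disk header size */

/* Copy stuffed data into a page, zero-filling the tail. dsize derives
 * from the inode size and is not trusted: clamp it to the block first. */
static void fill_page(uint8_t *page, const uint8_t *block,
		      size_t block_size, uint64_t dsize)
{
	if (dsize > block_size - HDR_SIZE)
		dsize = block_size - HDR_SIZE;	/* never read past the block */

	memcpy(page, block + HDR_SIZE, (size_t)dsize);
	memset(page + dsize, 0, PAGE_SIZE - (size_t)dsize);
}
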
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 91beddadd388..bb7907bde3d8 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 25fddc100f18..8295c5b5d4a9 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1475,7 +1475,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
1475 inode = gfs2_inode_lookup(dir->i_sb, 1475 inode = gfs2_inode_lookup(dir->i_sb,
1476 be16_to_cpu(dent->de_type), 1476 be16_to_cpu(dent->de_type),
1477 be64_to_cpu(dent->de_inum.no_addr), 1477 be64_to_cpu(dent->de_inum.no_addr),
1478 be64_to_cpu(dent->de_inum.no_formal_ino), 0); 1478 be64_to_cpu(dent->de_inum.no_formal_ino));
1479 brelse(bh); 1479 brelse(bh);
1480 return inode; 1480 return inode;
1481 } 1481 }
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d15876e9aa26..dfe237a3f8ad 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -169,7 +168,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
169 if (error) 168 if (error)
170 goto fail; 169 goto fail;
171 170
172 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); 171 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
173 if (IS_ERR(inode)) { 172 if (IS_ERR(inode)) {
174 error = PTR_ERR(inode); 173 error = PTR_ERR(inode);
175 goto fail; 174 goto fail;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a6abbae8a278..e6dd2aec6f82 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -640,7 +640,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
640 640
641 if (!(fl->fl_flags & FL_POSIX)) 641 if (!(fl->fl_flags & FL_POSIX))
642 return -ENOLCK; 642 return -ENOLCK;
643 if (__mandatory_lock(&ip->i_inode)) 643 if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
644 return -ENOLCK; 644 return -ENOLCK;
645 645
646 if (cmd == F_CANCELLK) { 646 if (cmd == F_CANCELLK) {
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 454d4b4eb36b..ddcdbf493536 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -855,6 +855,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
855 gh->gh_flags = flags; 855 gh->gh_flags = flags;
856 gh->gh_iflags = 0; 856 gh->gh_iflags = 0;
857 gh->gh_ip = (unsigned long)__builtin_return_address(0); 857 gh->gh_ip = (unsigned long)__builtin_return_address(0);
858 if (gh->gh_owner_pid)
859 put_pid(gh->gh_owner_pid);
860 gh->gh_owner_pid = get_pid(task_pid(current));
858} 861}
859 862
860/** 863/**
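
The three lines added to gfs2_holder_reinit() above stop a struct pid reference leak when a holder is reused: the stale reference is dropped with put_pid() before get_pid() pins the current task's pid. The swap in isolation, as an illustrative kernel-style helper:

#include <linux/pid.h>
#include <linux/sched.h>

/* Re-point a cached pid reference at the current task without leaking
 * the old reference (mirrors the gfs2_holder_reinit() fix above). */
static void holder_set_owner(struct pid **owner)
{
	if (*owner)
		put_pid(*owner);		/* drop the stale reference */
	*owner = get_pid(task_pid(current));	/* take a fresh one */
}
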
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 38e3749d476c..49f97d3bb690 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b8025e51cabf..b5d7363b22da 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -439,9 +439,6 @@ struct gfs2_args {
439struct gfs2_tune { 439struct gfs2_tune {
440 spinlock_t gt_spin; 440 spinlock_t gt_spin;
441 441
442 unsigned int gt_incore_log_blocks;
443 unsigned int gt_log_flush_secs;
444
445 unsigned int gt_logd_secs; 442 unsigned int gt_logd_secs;
446 443
447 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 444 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -462,6 +459,7 @@ enum {
462 SDF_SHUTDOWN = 2, 459 SDF_SHUTDOWN = 2,
463 SDF_NOBARRIERS = 3, 460 SDF_NOBARRIERS = 3,
464 SDF_NORECOVERY = 4, 461 SDF_NORECOVERY = 4,
462 SDF_DEMOTE = 5,
465}; 463};
466 464
467#define GFS2_FSNAME_LEN 256 465#define GFS2_FSNAME_LEN 256
@@ -616,8 +614,9 @@ struct gfs2_sbd {
616 unsigned int sd_log_blks_reserved; 614 unsigned int sd_log_blks_reserved;
617 unsigned int sd_log_commited_buf; 615 unsigned int sd_log_commited_buf;
618 unsigned int sd_log_commited_databuf; 616 unsigned int sd_log_commited_databuf;
619 unsigned int sd_log_commited_revoke; 617 int sd_log_commited_revoke;
620 618
619 atomic_t sd_log_pinned;
621 unsigned int sd_log_num_buf; 620 unsigned int sd_log_num_buf;
622 unsigned int sd_log_num_revoke; 621 unsigned int sd_log_num_revoke;
623 unsigned int sd_log_num_rg; 622 unsigned int sd_log_num_rg;
@@ -629,15 +628,17 @@ struct gfs2_sbd {
629 struct list_head sd_log_le_databuf; 628 struct list_head sd_log_le_databuf;
630 struct list_head sd_log_le_ordered; 629 struct list_head sd_log_le_ordered;
631 630
631 atomic_t sd_log_thresh1;
632 atomic_t sd_log_thresh2;
632 atomic_t sd_log_blks_free; 633 atomic_t sd_log_blks_free;
633 struct mutex sd_log_reserve_mutex; 634 wait_queue_head_t sd_log_waitq;
635 wait_queue_head_t sd_logd_waitq;
634 636
635 u64 sd_log_sequence; 637 u64 sd_log_sequence;
636 unsigned int sd_log_head; 638 unsigned int sd_log_head;
637 unsigned int sd_log_tail; 639 unsigned int sd_log_tail;
638 int sd_log_idle; 640 int sd_log_idle;
639 641
640 unsigned long sd_log_flush_time;
641 struct rw_semaphore sd_log_flush_lock; 642 struct rw_semaphore sd_log_flush_lock;
642 atomic_t sd_log_in_flight; 643 atomic_t sd_log_in_flight;
643 wait_queue_head_t sd_log_flush_wait; 644 wait_queue_head_t sd_log_flush_wait;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b1bf2694fb2b..51d8061fa07a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -158,7 +158,6 @@ void gfs2_set_iop(struct inode *inode)
158 * @sb: The super block 158 * @sb: The super block
159 * @no_addr: The inode number 159 * @no_addr: The inode number
160 * @type: The type of the inode 160 * @type: The type of the inode
161 * @skip_freeing: set this not return an inode if it is currently being freed.
162 * 161 *
163 * Returns: A VFS inode, or an error 162 * Returns: A VFS inode, or an error
164 */ 163 */
@@ -166,17 +165,14 @@ void gfs2_set_iop(struct inode *inode)
166struct inode *gfs2_inode_lookup(struct super_block *sb, 165struct inode *gfs2_inode_lookup(struct super_block *sb,
167 unsigned int type, 166 unsigned int type,
168 u64 no_addr, 167 u64 no_addr,
169 u64 no_formal_ino, int skip_freeing) 168 u64 no_formal_ino)
170{ 169{
171 struct inode *inode; 170 struct inode *inode;
172 struct gfs2_inode *ip; 171 struct gfs2_inode *ip;
173 struct gfs2_glock *io_gl; 172 struct gfs2_glock *io_gl;
174 int error; 173 int error;
175 174
176 if (skip_freeing) 175 inode = gfs2_iget(sb, no_addr);
177 inode = gfs2_iget_skip(sb, no_addr);
178 else
179 inode = gfs2_iget(sb, no_addr);
180 ip = GFS2_I(inode); 176 ip = GFS2_I(inode);
181 177
182 if (!inode) 178 if (!inode)
@@ -234,13 +230,100 @@ fail_glock:
234fail_iopen: 230fail_iopen:
235 gfs2_glock_put(io_gl); 231 gfs2_glock_put(io_gl);
236fail_put: 232fail_put:
237 ip->i_gl->gl_object = NULL; 233 if (inode->i_state & I_NEW)
234 ip->i_gl->gl_object = NULL;
238 gfs2_glock_put(ip->i_gl); 235 gfs2_glock_put(ip->i_gl);
239fail: 236fail:
240 iget_failed(inode); 237 if (inode->i_state & I_NEW)
238 iget_failed(inode);
239 else
240 iput(inode);
241 return ERR_PTR(error); 241 return ERR_PTR(error);
242} 242}
243 243
244/**
245 * gfs2_unlinked_inode_lookup - Lookup an unlinked inode for reclamation
246 * @sb: The super block
247 * @no_addr: The inode number
248 * @inode: A pointer to the inode found, if any
249 *
250 * Returns: 0 and *inode if no errors occurred. If an error occurs,
251 * the resulting *inode may or may not be NULL.
252 */
253
254int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
255 struct inode **inode)
256{
257 struct gfs2_sbd *sdp;
258 struct gfs2_inode *ip;
259 struct gfs2_glock *io_gl;
260 int error;
261 struct gfs2_holder gh;
262
263 *inode = gfs2_iget_skip(sb, no_addr);
264
265 if (!(*inode))
266 return -ENOBUFS;
267
268 if (!((*inode)->i_state & I_NEW))
269 return -ENOBUFS;
270
271 ip = GFS2_I(*inode);
272 sdp = GFS2_SB(*inode);
273 ip->i_no_formal_ino = -1;
274
275 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
276 if (unlikely(error))
277 goto fail;
278 ip->i_gl->gl_object = ip;
279
280 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
281 if (unlikely(error))
282 goto fail_put;
283
284 set_bit(GIF_INVALID, &ip->i_flags);
285 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
286 &ip->i_iopen_gh);
287 if (unlikely(error)) {
288 if (error == GLR_TRYFAILED)
289 error = 0;
290 goto fail_iopen;
291 }
292 ip->i_iopen_gh.gh_gl->gl_object = ip;
293 gfs2_glock_put(io_gl);
294
295 (*inode)->i_mode = DT2IF(DT_UNKNOWN);
296
297 /*
298 * We must read the inode in order to work out its type in
299 * this case. Note that this doesn't happen often as we normally
300 * know the type beforehand. This code path only occurs during
301 * unlinked inode recovery (where it is safe to take this glock,
302 * which is not true in the general case).
303 */
304 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
305 &gh);
306 if (unlikely(error)) {
307 if (error == GLR_TRYFAILED)
308 error = 0;
309 goto fail_glock;
310 }
311 /* Inode is now uptodate */
312 gfs2_glock_dq_uninit(&gh);
313 gfs2_set_iop(*inode);
314
315 return 0;
316fail_glock:
317 gfs2_glock_dq(&ip->i_iopen_gh);
318fail_iopen:
319 gfs2_glock_put(io_gl);
320fail_put:
321 ip->i_gl->gl_object = NULL;
322 gfs2_glock_put(ip->i_gl);
323fail:
324 return error;
325}
326
244static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 327static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
245{ 328{
246 const struct gfs2_dinode *str = buf; 329 const struct gfs2_dinode *str = buf;
@@ -862,7 +945,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
862 goto fail_gunlock2; 945 goto fail_gunlock2;
863 946
864 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, 947 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
865 inum.no_formal_ino, 0); 948 inum.no_formal_ino);
866 if (IS_ERR(inode)) 949 if (IS_ERR(inode))
867 goto fail_gunlock2; 950 goto fail_gunlock2;
868 951
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c341aaf67adb..e161461d4c57 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -83,8 +83,9 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
83 83
84extern void gfs2_set_iop(struct inode *inode); 84extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
86 u64 no_addr, u64 no_formal_ino, 86 u64 no_addr, u64 no_formal_ino);
87 int skip_freeing); 87extern int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
88 struct inode **inode);
88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 89extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
89 90
90extern int gfs2_inode_refresh(struct gfs2_inode *ip); 91extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 569b46240f61..0e0470ed34c2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/dlm.h> 11#include <linux/dlm.h>
12#include <linux/slab.h>
12#include <linux/types.h> 13#include <linux/types.h>
13#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
14 15
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4511b08fc451..b593f0e28f25 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
168 return list_empty(&ai->ai_ail1_list); 168 return list_empty(&ai->ai_ail1_list);
169} 169}
170 170
171static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) 171static void gfs2_ail1_start(struct gfs2_sbd *sdp)
172{ 172{
173 struct list_head *head; 173 struct list_head *head;
174 u64 sync_gen; 174 u64 sync_gen;
175 struct list_head *first; 175 struct gfs2_ail *ai;
176 struct gfs2_ail *first_ai, *ai, *tmp;
177 int done = 0; 176 int done = 0;
178 177
179 gfs2_log_lock(sdp); 178 gfs2_log_lock(sdp);
@@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
184 } 183 }
185 sync_gen = sdp->sd_ail_sync_gen++; 184 sync_gen = sdp->sd_ail_sync_gen++;
186 185
187 first = head->prev;
188 first_ai = list_entry(first, struct gfs2_ail, ai_list);
189 first_ai->ai_sync_gen = sync_gen;
190 gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
191
192 if (flags & DIO_ALL)
193 first = NULL;
194
195 while(!done) { 186 while(!done) {
196 if (first && (head->prev != first ||
197 gfs2_ail1_empty_one(sdp, first_ai, 0)))
198 break;
199
200 done = 1; 187 done = 1;
201 list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) { 188 list_for_each_entry_reverse(ai, head, ai_list) {
202 if (ai->ai_sync_gen >= sync_gen) 189 if (ai->ai_sync_gen >= sync_gen)
203 continue; 190 continue;
204 ai->ai_sync_gen = sync_gen; 191 ai->ai_sync_gen = sync_gen;
@@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
290 * flush time, so we ensure that we have just enough free blocks at all 277 * flush time, so we ensure that we have just enough free blocks at all
291 * times to avoid running out during a log flush. 278 * times to avoid running out during a log flush.
292 * 279 *
280 * We no longer flush the log here, instead we wake up logd to do that
281 * for us. To avoid the thundering herd and to ensure that we deal fairly
282 * with queued waiters, we use an exclusive wait. This means that when we
283 * get woken with enough journal space to get our reservation, we need to
284 * wake the next waiter on the list.
285 *
293 * Returns: errno 286 * Returns: errno
294 */ 287 */
295 288
296int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) 289int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
297{ 290{
298 unsigned int try = 0;
299 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); 291 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
292 unsigned wanted = blks + reserved_blks;
293 DEFINE_WAIT(wait);
294 int did_wait = 0;
295 unsigned int free_blocks;
300 296
301 if (gfs2_assert_warn(sdp, blks) || 297 if (gfs2_assert_warn(sdp, blks) ||
302 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) 298 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
303 return -EINVAL; 299 return -EINVAL;
304 300retry:
305 mutex_lock(&sdp->sd_log_reserve_mutex); 301 free_blocks = atomic_read(&sdp->sd_log_blks_free);
306 gfs2_log_lock(sdp); 302 if (unlikely(free_blocks <= wanted)) {
307 while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) { 303 do {
308 gfs2_log_unlock(sdp); 304 prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
309 gfs2_ail1_empty(sdp, 0); 305 TASK_UNINTERRUPTIBLE);
310 gfs2_log_flush(sdp, NULL); 306 wake_up(&sdp->sd_logd_waitq);
311 307 did_wait = 1;
312 if (try++) 308 if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
313 gfs2_ail1_start(sdp, 0); 309 io_schedule();
314 gfs2_log_lock(sdp); 310 free_blocks = atomic_read(&sdp->sd_log_blks_free);
311 } while(free_blocks <= wanted);
312 finish_wait(&sdp->sd_log_waitq, &wait);
315 } 313 }
316 atomic_sub(blks, &sdp->sd_log_blks_free); 314 if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
315 free_blocks - blks) != free_blocks)
316 goto retry;
317 trace_gfs2_log_blocks(sdp, -blks); 317 trace_gfs2_log_blocks(sdp, -blks);
318 gfs2_log_unlock(sdp); 318
319 mutex_unlock(&sdp->sd_log_reserve_mutex); 319 /*
320 * If we waited, then so might others, wake them up _after_ we get
321 * our share of the log.
322 */
323 if (unlikely(did_wait))
324 wake_up(&sdp->sd_log_waitq);
320 325
321 down_read(&sdp->sd_log_flush_lock); 326 down_read(&sdp->sd_log_flush_lock);
322 327
323 return 0; 328 return 0;
324} 329}
325 330
326/**
327 * gfs2_log_release - Release a given number of log blocks
328 * @sdp: The GFS2 superblock
329 * @blks: The number of blocks
330 *
331 */
332
333void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
334{
335
336 gfs2_log_lock(sdp);
337 atomic_add(blks, &sdp->sd_log_blks_free);
338 trace_gfs2_log_blocks(sdp, blks);
339 gfs2_assert_withdraw(sdp,
340 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
341 gfs2_log_unlock(sdp);
342 up_read(&sdp->sd_log_flush_lock);
343}
344
345static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) 331static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
346{ 332{
347 struct gfs2_journal_extent *je; 333 struct gfs2_journal_extent *je;
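
The rewritten gfs2_log_reserve() above combines two techniques: an exclusive wait on sd_log_waitq (only one waiter wakes per wake_up(), avoiding a thundering herd, and each satisfied waiter wakes the next), and a lock-free claim of the blocks via atomic_cmpxchg(), retried if another CPU changed sd_log_blks_free between the read and the update. The cmpxchg half in isolation, as a userspace C11 sketch (names and numbers illustrative):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint free_blocks = 1000;

/* Carve blks out of free_blocks without a lock: re-read and retry
 * whenever another thread changed the counter under us. */
static int reserve(unsigned int blks, unsigned int reserved_floor)
{
	unsigned int old;

	do {
		old = atomic_load(&free_blocks);
		if (old <= blks + reserved_floor)
			return -1;	/* the caller would sleep on the waitqueue here */
	} while (!atomic_compare_exchange_weak(&free_blocks, &old, old - blks));

	return 0;
}

int main(void)
{
	if (reserve(10, 6) == 0)
		printf("reserved, %u left\n", atomic_load(&free_blocks));
	return 0;
}
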
@@ -417,7 +403,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp)
417 databufhdrs_needed = (sdp->sd_log_commited_databuf + 403 databufhdrs_needed = (sdp->sd_log_commited_databuf +
418 (dbuf_limit - 1)) / dbuf_limit; 404 (dbuf_limit - 1)) / dbuf_limit;
419 405
420 if (sdp->sd_log_commited_revoke) 406 if (sdp->sd_log_commited_revoke > 0)
421 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, 407 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
422 sizeof(u64)); 408 sizeof(u64));
423 409
@@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
559 545
560 ail2_empty(sdp, new_tail); 546 ail2_empty(sdp, new_tail);
561 547
562 gfs2_log_lock(sdp);
563 atomic_add(dist, &sdp->sd_log_blks_free); 548 atomic_add(dist, &sdp->sd_log_blks_free);
564 trace_gfs2_log_blocks(sdp, dist); 549 trace_gfs2_log_blocks(sdp, dist);
565 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); 550 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
566 gfs2_log_unlock(sdp); 551 sdp->sd_jdesc->jd_blocks);
567 552
568 sdp->sd_log_tail = new_tail; 553 sdp->sd_log_tail = new_tail;
569} 554}
@@ -615,6 +600,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
615 if (buffer_eopnotsupp(bh)) { 600 if (buffer_eopnotsupp(bh)) {
616 clear_buffer_eopnotsupp(bh); 601 clear_buffer_eopnotsupp(bh);
617 set_buffer_uptodate(bh); 602 set_buffer_uptodate(bh);
603 fs_info(sdp, "barrier sync failed - disabling barriers\n");
618 set_bit(SDF_NOBARRIERS, &sdp->sd_flags); 604 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
619 lock_buffer(bh); 605 lock_buffer(bh);
620skip_barrier: 606skip_barrier:
@@ -790,7 +776,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
790 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || 776 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
791 (((int)sdp->sd_log_commited_databuf) >= 0)); 777 (((int)sdp->sd_log_commited_databuf) >= 0));
792 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 778 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
793 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
794 reserved = calc_reserved(sdp); 779 reserved = calc_reserved(sdp);
795 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 780 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
796 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 781 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
@@ -823,6 +808,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
823 * @sdp: the filesystem 808 * @sdp: the filesystem
824 * @tr: the transaction 809 * @tr: the transaction
825 * 810 *
811 * We wake up gfs2_logd if the number of pinned blocks exceeds thresh1
812 * or the total number of used blocks (pinned blocks plus AIL blocks)
813 * is greater than thresh2.
814 *
815 * At mount time thresh1 is set to 2/5 of the journal size and thresh2
816 * to 4/5 of it (the values set in init_journal()).
817 *
826 * Returns: errno 818 * Returns: errno
827 */ 819 */
828 820
@@ -833,10 +825,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
833 825
834 up_read(&sdp->sd_log_flush_lock); 826 up_read(&sdp->sd_log_flush_lock);
835 827
836 gfs2_log_lock(sdp); 828 if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
837 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) 829 ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
838 wake_up_process(sdp->sd_logd_process); 830 atomic_read(&sdp->sd_log_thresh2)))
839 gfs2_log_unlock(sdp); 831 wake_up(&sdp->sd_logd_waitq);
840} 832}
841 833
842/** 834/**
@@ -883,13 +875,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
883{ 875{
884 gfs2_log_flush(sdp, NULL); 876 gfs2_log_flush(sdp, NULL);
885 for (;;) { 877 for (;;) {
886 gfs2_ail1_start(sdp, DIO_ALL); 878 gfs2_ail1_start(sdp);
887 if (gfs2_ail1_empty(sdp, DIO_ALL)) 879 if (gfs2_ail1_empty(sdp, DIO_ALL))
888 break; 880 break;
889 msleep(10); 881 msleep(10);
890 } 882 }
891} 883}
892 884
885static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
886{
887 return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
888}
889
890static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
891{
892 unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
893 return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
894}
893 895
894/** 896/**
895 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks 897 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
@@ -902,28 +904,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
902int gfs2_logd(void *data) 904int gfs2_logd(void *data)
903{ 905{
904 struct gfs2_sbd *sdp = data; 906 struct gfs2_sbd *sdp = data;
905 unsigned long t; 907 unsigned long t = 1;
906 int need_flush; 908 DEFINE_WAIT(wait);
909 unsigned preflush;
907 910
908 while (!kthread_should_stop()) { 911 while (!kthread_should_stop()) {
909 /* Advance the log tail */
910 912
911 t = sdp->sd_log_flush_time + 913 preflush = atomic_read(&sdp->sd_log_pinned);
912 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; 914 if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
915 gfs2_ail1_empty(sdp, DIO_ALL);
916 gfs2_log_flush(sdp, NULL);
917 gfs2_ail1_empty(sdp, DIO_ALL);
918 }
913 919
914 gfs2_ail1_empty(sdp, DIO_ALL); 920 if (gfs2_ail_flush_reqd(sdp)) {
915 gfs2_log_lock(sdp); 921 gfs2_ail1_start(sdp);
916 need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks); 922 io_schedule();
917 gfs2_log_unlock(sdp); 923 gfs2_ail1_empty(sdp, 0);
918 if (need_flush || time_after_eq(jiffies, t)) {
919 gfs2_log_flush(sdp, NULL); 924 gfs2_log_flush(sdp, NULL);
920 sdp->sd_log_flush_time = jiffies; 925 gfs2_ail1_empty(sdp, DIO_ALL);
921 } 926 }
922 927
928 wake_up(&sdp->sd_log_waitq);
923 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; 929 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
924 if (freezing(current)) 930 if (freezing(current))
925 refrigerator(); 931 refrigerator();
926 schedule_timeout_interruptible(t); 932
933 do {
934 prepare_to_wait(&sdp->sd_logd_waitq, &wait,
935 TASK_UNINTERRUPTIBLE);
936 if (!gfs2_ail_flush_reqd(sdp) &&
937 !gfs2_jrnl_flush_reqd(sdp) &&
938 !kthread_should_stop())
939 t = schedule_timeout(t);
940 } while(t && !gfs2_ail_flush_reqd(sdp) &&
941 !gfs2_jrnl_flush_reqd(sdp) &&
942 !kthread_should_stop());
943 finish_wait(&sdp->sd_logd_waitq, &wait);
927 } 944 }
928 945
929 return 0; 946 return 0;
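
The new gfs2_logd() main loop above is the canonical kthread sleep pattern: prepare_to_wait() queues the task before the wake conditions are re-checked, so a wake_up() issued between the check and schedule_timeout() cannot be lost, and finish_wait() dequeues the task afterwards. A condensed kernel-style sketch of just that loop (flush_needed() is a stand-in for the gfs2_ail_flush_reqd()/gfs2_jrnl_flush_reqd() tests):

#include <linux/types.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(work_waitq);

static bool flush_needed(void)
{
	return false;	/* stand-in for the *_flush_reqd() tests above */
}

static int worker(void *data)
{
	DEFINE_WAIT(wait);
	unsigned long t;

	while (!kthread_should_stop()) {
		/* ... the flush work would run here when flush_needed() ... */

		t = HZ;
		do {
			prepare_to_wait(&work_waitq, &wait, TASK_UNINTERRUPTIBLE);
			/* re-check AFTER queueing, so a concurrent wake_up()
			 * cannot slip through unseen */
			if (!flush_needed() && !kthread_should_stop())
				t = schedule_timeout(t);
		} while (t && !flush_needed() && !kthread_should_stop());
		finish_wait(&work_waitq, &wait);
	}
	return 0;
}
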
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 7c64510ccfd2..eb570b4ad443 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -51,7 +51,6 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
51 unsigned int ssize); 51 unsigned int ssize);
52 52
53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); 53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
54void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
55void gfs2_log_incr_head(struct gfs2_sbd *sdp); 54void gfs2_log_incr_head(struct gfs2_sbd *sdp);
56 55
57struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); 56struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index adc260fbea90..bf33f822058d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
54 if (bd->bd_ail) 54 if (bd->bd_ail)
55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
56 get_bh(bh); 56 get_bh(bh);
57 atomic_inc(&sdp->sd_log_pinned);
57 trace_gfs2_pin(bd, 1); 58 trace_gfs2_pin(bd, 1);
58} 59}
59 60
@@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
94 trace_gfs2_pin(bd, 0); 95 trace_gfs2_pin(bd, 0);
95 gfs2_log_unlock(sdp); 96 gfs2_log_unlock(sdp);
96 unlock_buffer(bh); 97 unlock_buffer(bh);
98 atomic_dec(&sdp->sd_log_pinned);
97} 99}
98 100
99 101
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a88fadc704bb..fb2a5f93b7c3 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -94,7 +94,7 @@ static int __init init_gfs2_fs(void)
94 if (!gfs2_glock_cachep) 94 if (!gfs2_glock_cachep)
95 goto fail; 95 goto fail;
96 96
97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)", 97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
98 sizeof(struct gfs2_glock) + 98 sizeof(struct gfs2_glock) +
99 sizeof(struct address_space), 99 sizeof(struct address_space),
100 0, 0, gfs2_init_gl_aspace_once); 100 0, 0, gfs2_init_gl_aspace_once);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0bb12c80937a..18176d0b75d7 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -34,7 +34,6 @@
34 34
35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) 35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
36{ 36{
37 int err;
38 struct buffer_head *bh, *head; 37 struct buffer_head *bh, *head;
39 int nr_underway = 0; 38 int nr_underway = 0;
40 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? 39 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
@@ -86,11 +85,10 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
86 } while (bh != head); 85 } while (bh != head);
87 unlock_page(page); 86 unlock_page(page);
88 87
89 err = 0;
90 if (nr_underway == 0) 88 if (nr_underway == 0)
91 end_page_writeback(page); 89 end_page_writeback(page);
92 90
93 return err; 91 return 0;
94} 92}
95 93
96const struct address_space_operations gfs2_meta_aops = { 94const struct address_space_operations gfs2_meta_aops = {
@@ -313,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
313 struct gfs2_bufdata *bd = bh->b_private; 311 struct gfs2_bufdata *bd = bh->b_private;
314 312
315 if (test_clear_buffer_pinned(bh)) { 313 if (test_clear_buffer_pinned(bh)) {
314 atomic_dec(&sdp->sd_log_pinned);
316 list_del_init(&bd->bd_le.le_list); 315 list_del_init(&bd->bd_le.le_list);
317 if (meta) { 316 if (meta) {
318 gfs2_assert_warn(sdp, sdp->sd_log_num_buf); 317 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a054b526dc08..3593b3a7290e 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -57,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
57{ 57{
58 spin_lock_init(&gt->gt_spin); 58 spin_lock_init(&gt->gt_spin);
59 59
60 gt->gt_incore_log_blocks = 1024;
61 gt->gt_logd_secs = 1;
62 gt->gt_quota_simul_sync = 64; 60 gt->gt_quota_simul_sync = 64;
63 gt->gt_quota_warn_period = 10; 61 gt->gt_quota_warn_period = 10;
64 gt->gt_quota_scale_num = 1; 62 gt->gt_quota_scale_num = 1;
@@ -101,14 +99,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
101 spin_lock_init(&sdp->sd_trunc_lock); 99 spin_lock_init(&sdp->sd_trunc_lock);
102 100
103 spin_lock_init(&sdp->sd_log_lock); 101 spin_lock_init(&sdp->sd_log_lock);
104 102 atomic_set(&sdp->sd_log_pinned, 0);
105 INIT_LIST_HEAD(&sdp->sd_log_le_buf); 103 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
106 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 104 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
107 INIT_LIST_HEAD(&sdp->sd_log_le_rg); 105 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
108 INIT_LIST_HEAD(&sdp->sd_log_le_databuf); 106 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
109 INIT_LIST_HEAD(&sdp->sd_log_le_ordered); 107 INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
110 108
111 mutex_init(&sdp->sd_log_reserve_mutex); 109 init_waitqueue_head(&sdp->sd_log_waitq);
110 init_waitqueue_head(&sdp->sd_logd_waitq);
112 INIT_LIST_HEAD(&sdp->sd_ail1_list); 111 INIT_LIST_HEAD(&sdp->sd_ail1_list);
113 INIT_LIST_HEAD(&sdp->sd_ail2_list); 112 INIT_LIST_HEAD(&sdp->sd_ail2_list);
114 113
@@ -487,7 +486,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
487 struct dentry *dentry; 486 struct dentry *dentry;
488 struct inode *inode; 487 struct inode *inode;
489 488
490 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); 489 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
491 if (IS_ERR(inode)) { 490 if (IS_ERR(inode)) {
492 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); 491 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
493 return PTR_ERR(inode); 492 return PTR_ERR(inode);
@@ -733,6 +732,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
733 if (sdp->sd_args.ar_spectator) { 732 if (sdp->sd_args.ar_spectator) {
734 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); 733 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
735 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); 734 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
735 atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
736 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
736 } else { 737 } else {
737 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { 738 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
738 fs_err(sdp, "can't mount journal #%u\n", 739 fs_err(sdp, "can't mount journal #%u\n",
@@ -770,6 +771,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
770 goto fail_jinode_gh; 771 goto fail_jinode_gh;
771 } 772 }
772 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); 773 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
774 atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
775 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
773 776
774 /* Map the extents for this journal's blocks */ 777 /* Map the extents for this journal's blocks */
775 map_journal_extents(sdp); 778 map_journal_extents(sdp);
@@ -951,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
951 if (undo) 954 if (undo)
952 goto fail_quotad; 955 goto fail_quotad;
953 956
954 sdp->sd_log_flush_time = jiffies;
955
956 p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); 957 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
957 error = IS_ERR(p); 958 error = IS_ERR(p);
958 if (error) { 959 if (error) {
@@ -1001,7 +1002,7 @@ static const struct lm_lockops nolock_ops = {
1001/** 1002/**
1002 * gfs2_lm_mount - mount a locking protocol 1003 * gfs2_lm_mount - mount a locking protocol
1003 * @sdp: the filesystem 1004 * @sdp: the filesystem
1004 * @args: mount arguements 1005 * @args: mount arguments
1005 * @silent: if 1, don't complain if the FS isn't a GFS2 fs 1006 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
1006 * 1007 *
1007 * Returns: errno 1008 * Returns: errno
@@ -1160,7 +1161,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1160 GFS2_BASIC_BLOCK_SHIFT; 1161 GFS2_BASIC_BLOCK_SHIFT;
1161 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; 1162 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
1162 1163
1163 sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; 1164 sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
1164 sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; 1165 sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
1165 if (sdp->sd_args.ar_statfs_quantum) { 1166 if (sdp->sd_args.ar_statfs_quantum) {
1166 sdp->sd_tune.gt_statfs_slow = 0; 1167 sdp->sd_tune.gt_statfs_slow = 0;
@@ -1323,7 +1324,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
1323 memset(&args, 0, sizeof(args)); 1324 memset(&args, 0, sizeof(args));
1324 args.ar_quota = GFS2_QUOTA_DEFAULT; 1325 args.ar_quota = GFS2_QUOTA_DEFAULT;
1325 args.ar_data = GFS2_DATA_DEFAULT; 1326 args.ar_data = GFS2_DATA_DEFAULT;
1326 args.ar_commit = 60; 1327 args.ar_commit = 30;
1327 args.ar_statfs_quantum = 30; 1328 args.ar_statfs_quantum = 30;
1328 args.ar_quota_quantum = 60; 1329 args.ar_quota_quantum = 60;
1329 args.ar_errors = GFS2_ERRORS_DEFAULT; 1330 args.ar_errors = GFS2_ERRORS_DEFAULT;
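Note: the two thresholds set in init_journal above are plain integer fractions of the journal size, 2/5 and 4/5 of jd_blocks. Judging from the logd wait queues added in init_sbd, they look like the points at which logd first wakes and at which a flush becomes urgent, though that reading comes from context rather than from this hunk. A quick sketch of the arithmetic with a made-up journal size:

    #include <stdio.h>

    int main(void)
    {
        unsigned int jd_blocks = 32768;            /* hypothetical journal size */
        unsigned int thresh1 = 2 * jd_blocks / 5;  /* 13107: first wakeup point */
        unsigned int thresh2 = 4 * jd_blocks / 5;  /* 26214: second, more urgent */
        printf("thresh1=%u thresh2=%u\n", thresh1, thresh2);
        return 0;
    }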
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6dbcbad6ab17..d5f4661287f9 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
637 unsigned blocksize, iblock, pos; 637 unsigned blocksize, iblock, pos;
638 struct buffer_head *bh, *dibh; 638 struct buffer_head *bh, *dibh;
639 struct page *page; 639 struct page *page;
640 void *kaddr; 640 void *kaddr, *ptr;
641 struct gfs2_quota *qp; 641 struct gfs2_quota q, *qp;
642 s64 value; 642 int err, nbytes;
643 int err = -EIO;
644 u64 size; 643 u64 size;
645 644
646 if (gfs2_is_stuffed(ip)) 645 if (gfs2_is_stuffed(ip))
647 gfs2_unstuff_dinode(ip, NULL); 646 gfs2_unstuff_dinode(ip, NULL);
648 647
648 memset(&q, 0, sizeof(struct gfs2_quota));
649 err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
650 if (err < 0)
651 return err;
652
653 err = -EIO;
654 qp = &q;
655 qp->qu_value = be64_to_cpu(qp->qu_value);
656 qp->qu_value += change;
657 qp->qu_value = cpu_to_be64(qp->qu_value);
658 qd->qd_qb.qb_value = qp->qu_value;
659 if (fdq) {
660 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
661 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
662 qd->qd_qb.qb_warn = qp->qu_warn;
663 }
664 if (fdq->d_fieldmask & FS_DQ_BHARD) {
665 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
666 qd->qd_qb.qb_limit = qp->qu_limit;
667 }
668 }
669
670 /* Write the quota into the quota file on disk */
671 ptr = qp;
672 nbytes = sizeof(struct gfs2_quota);
673get_a_page:
649 page = grab_cache_page(mapping, index); 674 page = grab_cache_page(mapping, index);
650 if (!page) 675 if (!page)
651 return -ENOMEM; 676 return -ENOMEM;
@@ -667,7 +692,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
667 if (!buffer_mapped(bh)) { 692 if (!buffer_mapped(bh)) {
668 gfs2_block_map(inode, iblock, bh, 1); 693 gfs2_block_map(inode, iblock, bh, 1);
669 if (!buffer_mapped(bh)) 694 if (!buffer_mapped(bh))
670 goto unlock; 695 goto unlock_out;
696 /* If it's a newly allocated disk block for quota, zero it */
697 if (buffer_new(bh)) {
698 memset(bh->b_data, 0, bh->b_size);
699 set_buffer_uptodate(bh);
700 }
671 } 701 }
672 702
673 if (PageUptodate(page)) 703 if (PageUptodate(page))
@@ -677,32 +707,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
677 ll_rw_block(READ_META, 1, &bh); 707 ll_rw_block(READ_META, 1, &bh);
678 wait_on_buffer(bh); 708 wait_on_buffer(bh);
679 if (!buffer_uptodate(bh)) 709 if (!buffer_uptodate(bh))
680 goto unlock; 710 goto unlock_out;
681 } 711 }
682 712
683 gfs2_trans_add_bh(ip->i_gl, bh, 0); 713 gfs2_trans_add_bh(ip->i_gl, bh, 0);
684 714
685 kaddr = kmap_atomic(page, KM_USER0); 715 kaddr = kmap_atomic(page, KM_USER0);
686 qp = kaddr + offset; 716 if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
687 value = (s64)be64_to_cpu(qp->qu_value) + change; 717 nbytes = PAGE_CACHE_SIZE - offset;
688 qp->qu_value = cpu_to_be64(value); 718 memcpy(kaddr + offset, ptr, nbytes);
689 qd->qd_qb.qb_value = qp->qu_value;
690 if (fdq) {
691 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
692 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
693 qd->qd_qb.qb_warn = qp->qu_warn;
694 }
695 if (fdq->d_fieldmask & FS_DQ_BHARD) {
696 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
697 qd->qd_qb.qb_limit = qp->qu_limit;
698 }
699 }
700 flush_dcache_page(page); 719 flush_dcache_page(page);
701 kunmap_atomic(kaddr, KM_USER0); 720 kunmap_atomic(kaddr, KM_USER0);
721 unlock_page(page);
722 page_cache_release(page);
702 723
724 /* If quota straddles page boundary, we need to update the rest of the
725 * quota at the beginning of the next page */
726 if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */
727 ptr = ptr + nbytes;
728 nbytes = sizeof(struct gfs2_quota) - nbytes;
729 offset = 0;
730 index++;
731 goto get_a_page;
732 }
733
734 /* Update the disk inode timestamp and size (if extended) */
703 err = gfs2_meta_inode_buffer(ip, &dibh); 735 err = gfs2_meta_inode_buffer(ip, &dibh);
704 if (err) 736 if (err)
705 goto unlock; 737 goto out;
706 738
707 size = loc + sizeof(struct gfs2_quota); 739 size = loc + sizeof(struct gfs2_quota);
708 if (size > inode->i_size) { 740 if (size > inode->i_size) {
@@ -715,7 +747,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
715 brelse(dibh); 747 brelse(dibh);
716 mark_inode_dirty(inode); 748 mark_inode_dirty(inode);
717 749
718unlock: 750out:
751 return err;
752unlock_out:
719 unlock_page(page); 753 unlock_page(page);
720 page_cache_release(page); 754 page_cache_release(page);
721 return err; 755 return err;
@@ -779,8 +813,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
779 * rgrp since it won't be allocated during the transaction 813 * rgrp since it won't be allocated during the transaction
780 */ 814 */
781 al->al_requested = 1; 815 al->al_requested = 1;
782 /* +1 in the end for block requested above for unstuffing */ 816 /* +3 in the end for unstuffing block, inode size update block
783 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1; 817 * and another block in case quota straddles page boundary and
818 * two blocks need to be updated instead of 1 */
819 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
784 820
785 if (nalloc) 821 if (nalloc)
786 al->al_requested += nalloc * (data_blocks + ind_blocks); 822 al->al_requested += nalloc * (data_blocks + ind_blocks);
@@ -1418,10 +1454,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1418 1454
1419 memset(fqs, 0, sizeof(struct fs_quota_stat)); 1455 memset(fqs, 0, sizeof(struct fs_quota_stat));
1420 fqs->qs_version = FS_QSTAT_VERSION; 1456 fqs->qs_version = FS_QSTAT_VERSION;
1421 if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON) 1457
1422 fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); 1458 switch (sdp->sd_args.ar_quota) {
1423 else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT) 1459 case GFS2_QUOTA_ON:
1424 fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); 1460 fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
1461 /*FALLTHRU*/
1462 case GFS2_QUOTA_ACCOUNT:
1463 fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
1464 break;
1465 case GFS2_QUOTA_OFF:
1466 break;
1467 }
1468
1425 if (sdp->sd_quota_inode) { 1469 if (sdp->sd_quota_inode) {
1426 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; 1470 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
1427 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks; 1471 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
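Note: the gfs2_adjust_quota rewrite above switches to a read-modify-write of the whole struct gfs2_quota: the record is read with gfs2_internal_read, adjusted in memory, then copied back page by page, with the get_a_page loop clamping each copy at PAGE_CACHE_SIZE so a record straddling a page boundary is written in two parts. A self-contained sketch of that clamped copy (the tiny page size and the paged_write helper are inventions for illustration):

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SZ 16   /* tiny "page" so the boundary split is visible */

    /* Write nbytes of src into paged storage at (index, offset), clamping
     * each memcpy at the page edge exactly like the get_a_page loop. */
    static void paged_write(char pages[][PAGE_SZ], int index, int offset,
                            const char *src, size_t nbytes)
    {
        while (nbytes) {
            size_t chunk = nbytes;
            if ((size_t)offset + chunk > PAGE_SZ)
                chunk = PAGE_SZ - offset;   /* stop at the boundary ... */
            memcpy(&pages[index][offset], src, chunk);
            src += chunk;
            nbytes -= chunk;
            offset = 0;                     /* ... and resume on the next page */
            index++;
        }
    }

    int main(void)
    {
        char pages[2][PAGE_SZ] = { { 0 } };
        paged_write(pages, 0, 10, "0123456789", 10); /* straddles pages 0 and 1 */
        printf("%.6s|%.4s\n", &pages[0][10], pages[1]); /* 012345|6789 */
        return 0;
    }

This split is also why the do_sync hunk reserves "+3" blocks instead of "+1": the record may dirty two data blocks plus the inode-size update.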
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 503b842f3ba2..8bce73ed4d8e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -948,13 +948,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
948 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes 948 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
949 * @rgd: The rgrp 949 * @rgd: The rgrp
950 * 950 *
951 * Returns: The inode, if one has been found 951 * Returns: 0 if no error
952 * The inode, if one has been found, in inode.
952 */ 953 */
953 954
954static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, 955static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
955 u64 skip) 956 u64 skip)
956{ 957{
957 struct inode *inode;
958 u32 goal = 0, block; 958 u32 goal = 0, block;
959 u64 no_addr; 959 u64 no_addr;
960 struct gfs2_sbd *sdp = rgd->rd_sbd; 960 struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -979,14 +979,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
979 if (no_addr == skip) 979 if (no_addr == skip)
980 continue; 980 continue;
981 *last_unlinked = no_addr; 981 *last_unlinked = no_addr;
982 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, 982 return no_addr;
983 no_addr, -1, 1);
984 if (!IS_ERR(inode))
985 return inode;
986 } 983 }
987 984
988 rgd->rd_flags &= ~GFS2_RDF_CHECK; 985 rgd->rd_flags &= ~GFS2_RDF_CHECK;
989 return NULL; 986 return 0;
990} 987}
991 988
992/** 989/**
@@ -1067,11 +1064,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
1067 * Try to acquire rgrp in way which avoids contending with others. 1064 * Try to acquire rgrp in way which avoids contending with others.
1068 * 1065 *
1069 * Returns: errno 1066 * Returns: errno
1067 * unlinked: the block address of an unlinked block to be reclaimed
1070 */ 1068 */
1071 1069
1072static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) 1070static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1071 u64 *last_unlinked)
1073{ 1072{
1074 struct inode *inode = NULL;
1075 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1073 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1076 struct gfs2_rgrpd *rgd, *begin = NULL; 1074 struct gfs2_rgrpd *rgd, *begin = NULL;
1077 struct gfs2_alloc *al = ip->i_alloc; 1075 struct gfs2_alloc *al = ip->i_alloc;
@@ -1080,6 +1078,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1080 int loops = 0; 1078 int loops = 0;
1081 int error, rg_locked; 1079 int error, rg_locked;
1082 1080
1081 *unlinked = 0;
1083 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); 1082 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1084 1083
1085 while (rgd) { 1084 while (rgd) {
@@ -1096,19 +1095,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1096 case 0: 1095 case 0:
1097 if (try_rgrp_fit(rgd, al)) 1096 if (try_rgrp_fit(rgd, al))
1098 goto out; 1097 goto out;
1099 if (rgd->rd_flags & GFS2_RDF_CHECK) 1098 /* If the rg came in already locked, there's no
1100 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1099 way we can recover from a failed try_rgrp_unlink
1100 because that would require an iput which can only
1101 happen after the rgrp is unlocked. */
1102 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1103 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1104 ip->i_no_addr);
1101 if (!rg_locked) 1105 if (!rg_locked)
1102 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1106 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1103 if (inode) 1107 if (*unlinked)
1104 return inode; 1108 return -EAGAIN;
1105 /* fall through */ 1109 /* fall through */
1106 case GLR_TRYFAILED: 1110 case GLR_TRYFAILED:
1107 rgd = recent_rgrp_next(rgd); 1111 rgd = recent_rgrp_next(rgd);
1108 break; 1112 break;
1109 1113
1110 default: 1114 default:
1111 return ERR_PTR(error); 1115 return error;
1112 } 1116 }
1113 } 1117 }
1114 1118
@@ -1130,12 +1134,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1130 case 0: 1134 case 0:
1131 if (try_rgrp_fit(rgd, al)) 1135 if (try_rgrp_fit(rgd, al))
1132 goto out; 1136 goto out;
1133 if (rgd->rd_flags & GFS2_RDF_CHECK) 1137 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1134 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1138 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1139 ip->i_no_addr);
1135 if (!rg_locked) 1140 if (!rg_locked)
1136 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1141 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1137 if (inode) 1142 if (*unlinked)
1138 return inode; 1143 return -EAGAIN;
1139 break; 1144 break;
1140 1145
1141 case GLR_TRYFAILED: 1146 case GLR_TRYFAILED:
@@ -1143,7 +1148,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1143 break; 1148 break;
1144 1149
1145 default: 1150 default:
1146 return ERR_PTR(error); 1151 return error;
1147 } 1152 }
1148 1153
1149 rgd = gfs2_rgrpd_get_next(rgd); 1154 rgd = gfs2_rgrpd_get_next(rgd);
@@ -1152,7 +1157,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1152 1157
1153 if (rgd == begin) { 1158 if (rgd == begin) {
1154 if (++loops >= 3) 1159 if (++loops >= 3)
1155 return ERR_PTR(-ENOSPC); 1160 return -ENOSPC;
1156 if (!skipped) 1161 if (!skipped)
1157 loops++; 1162 loops++;
1158 flags = 0; 1163 flags = 0;
@@ -1172,7 +1177,7 @@ out:
1172 forward_rgrp_set(sdp, rgd); 1177 forward_rgrp_set(sdp, rgd);
1173 } 1178 }
1174 1179
1175 return NULL; 1180 return 0;
1176} 1181}
1177 1182
1178/** 1183/**
@@ -1188,7 +1193,7 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1188 struct gfs2_alloc *al = ip->i_alloc; 1193 struct gfs2_alloc *al = ip->i_alloc;
1189 struct inode *inode; 1194 struct inode *inode;
1190 int error = 0; 1195 int error = 0;
1191 u64 last_unlinked = NO_BLOCK; 1196 u64 last_unlinked = NO_BLOCK, unlinked;
1192 1197
1193 if (gfs2_assert_warn(sdp, al->al_requested)) 1198 if (gfs2_assert_warn(sdp, al->al_requested))
1194 return -EINVAL; 1199 return -EINVAL;
@@ -1204,14 +1209,19 @@ try_again:
1204 if (error) 1209 if (error)
1205 return error; 1210 return error;
1206 1211
1207 inode = get_local_rgrp(ip, &last_unlinked); 1212 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1208 if (inode) { 1213 if (error) {
1209 if (ip != GFS2_I(sdp->sd_rindex)) 1214 if (ip != GFS2_I(sdp->sd_rindex))
1210 gfs2_glock_dq_uninit(&al->al_ri_gh); 1215 gfs2_glock_dq_uninit(&al->al_ri_gh);
1211 if (IS_ERR(inode)) 1216 if (error != -EAGAIN)
1212 return PTR_ERR(inode); 1217 return error;
1213 iput(inode); 1218 error = gfs2_unlinked_inode_lookup(ip->i_inode.i_sb,
1219 unlinked, &inode);
1220 if (inode)
1221 iput(inode);
1214 gfs2_log_flush(sdp, NULL); 1222 gfs2_log_flush(sdp, NULL);
1223 if (error == GLR_TRYFAILED)
1224 error = 0;
1215 goto try_again; 1225 goto try_again;
1216 } 1226 }
1217 1227
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4106ddaaa98..f07119d89557 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -10,6 +10,8 @@
10#ifndef __RGRP_DOT_H__ 10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__ 11#define __RGRP_DOT_H__
12 12
13#include <linux/slab.h>
14
13struct gfs2_rgrpd; 15struct gfs2_rgrpd;
14struct gfs2_sbd; 16struct gfs2_sbd;
15struct gfs2_holder; 17struct gfs2_holder;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 50aac606b990..4d1aad38f1b1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1113,7 +1113,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1113 int error; 1113 int error;
1114 1114
1115 spin_lock(&gt->gt_spin); 1115 spin_lock(&gt->gt_spin);
1116 args.ar_commit = gt->gt_log_flush_secs; 1116 args.ar_commit = gt->gt_logd_secs;
1117 args.ar_quota_quantum = gt->gt_quota_quantum; 1117 args.ar_quota_quantum = gt->gt_quota_quantum;
1118 if (gt->gt_statfs_slow) 1118 if (gt->gt_statfs_slow)
1119 args.ar_statfs_quantum = 0; 1119 args.ar_statfs_quantum = 0;
@@ -1160,7 +1160,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1160 else 1160 else
1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); 1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
1162 spin_lock(&gt->gt_spin); 1162 spin_lock(&gt->gt_spin);
1163 gt->gt_log_flush_secs = args.ar_commit; 1163 gt->gt_logd_secs = args.ar_commit;
1164 gt->gt_quota_quantum = args.ar_quota_quantum; 1164 gt->gt_quota_quantum = args.ar_quota_quantum;
1165 if (args.ar_statfs_quantum) { 1165 if (args.ar_statfs_quantum) {
1166 gt->gt_statfs_slow = 0; 1166 gt->gt_statfs_slow = 0;
@@ -1305,8 +1305,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1305 } 1305 }
1306 if (args->ar_discard) 1306 if (args->ar_discard)
1307 seq_printf(s, ",discard"); 1307 seq_printf(s, ",discard");
1308 val = sdp->sd_tune.gt_log_flush_secs; 1308 val = sdp->sd_tune.gt_logd_secs;
1309 if (val != 60) 1309 if (val != 30)
1310 seq_printf(s, ",commit=%d", val); 1310 seq_printf(s, ",commit=%d", val);
1311 val = sdp->sd_tune.gt_statfs_quantum; 1311 val = sdp->sd_tune.gt_statfs_quantum;
1312 if (val != 30) 1312 if (val != 30)
@@ -1334,7 +1334,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1334 } 1334 }
1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
1336 seq_printf(s, ",nobarrier"); 1336 seq_printf(s, ",nobarrier");
1337 1337 if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
1338 seq_printf(s, ",demote_interface_used");
1338 return 0; 1339 return 0;
1339} 1340}
1340 1341
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index b5f1a46133c8..37f5393e68e6 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -8,7 +8,6 @@
8 */ 8 */
9 9
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h> 11#include <linux/spinlock.h>
13#include <linux/completion.h> 12#include <linux/completion.h>
14#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
@@ -49,7 +48,7 @@ static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
49 return a->store ? a->store(sdp, buf, len) : len; 48 return a->store ? a->store(sdp, buf, len) : len;
50} 49}
51 50
52static struct sysfs_ops gfs2_attr_ops = { 51static const struct sysfs_ops gfs2_attr_ops = {
53 .show = gfs2_attr_show, 52 .show = gfs2_attr_show,
54 .store = gfs2_attr_store, 53 .store = gfs2_attr_store,
55}; 54};
@@ -233,6 +232,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
233 glops = gfs2_glops_list[gltype]; 232 glops = gfs2_glops_list[gltype];
234 if (glops == NULL) 233 if (glops == NULL)
235 return -EINVAL; 234 return -EINVAL;
235 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
236 fs_info(sdp, "demote interface used\n");
236 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); 237 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
237 if (rv) 238 if (rv)
238 return rv; 239 return rv;
@@ -469,8 +470,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
469} \ 470} \
470TUNE_ATTR_2(name, name##_store) 471TUNE_ATTR_2(name, name##_store)
471 472
472TUNE_ATTR(incore_log_blocks, 0);
473TUNE_ATTR(log_flush_secs, 0);
474TUNE_ATTR(quota_warn_period, 0); 473TUNE_ATTR(quota_warn_period, 0);
475TUNE_ATTR(quota_quantum, 0); 474TUNE_ATTR(quota_quantum, 0);
476TUNE_ATTR(max_readahead, 0); 475TUNE_ATTR(max_readahead, 0);
@@ -482,8 +481,6 @@ TUNE_ATTR(statfs_quantum, 1);
482TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 481TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
483 482
484static struct attribute *tune_attrs[] = { 483static struct attribute *tune_attrs[] = {
485 &tune_attr_incore_log_blocks.attr,
486 &tune_attr_log_flush_secs.attr,
487 &tune_attr_quota_warn_period.attr, 484 &tune_attr_quota_warn_period.attr,
488 &tune_attr_quota_quantum.attr, 485 &tune_attr_quota_quantum.attr,
489 &tune_attr_max_readahead.attr, 486 &tune_attr_max_readahead.attr,
@@ -574,7 +571,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
574 return 0; 571 return 0;
575} 572}
576 573
577static struct kset_uevent_ops gfs2_uevent_ops = { 574static const struct kset_uevent_ops gfs2_uevent_ops = {
578 .uevent = gfs2_uevent, 575 .uevent = gfs2_uevent,
579}; 576};
580 577
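Note: demote_rq_store now latches SDF_DEMOTE with test_and_set_bit, so the "demote interface used" message is emitted exactly once per mount (and, per the super.c hunk above, shows up in show_options afterwards). A sketch of the same latch; the kernel version is atomic, while this single-threaded stand-in is not:

    #include <stdbool.h>
    #include <stdio.h>

    static bool demote_used;   /* stands in for the SDF_DEMOTE flag bit */

    static void demote_rq(void)
    {
        if (!demote_used) {    /* kernel: !test_and_set_bit(SDF_DEMOTE, ...) */
            demote_used = true;
            printf("demote interface used\n");   /* printed only once */
        }
        /* ... issue the demote request itself ... */
    }

    int main(void)
    {
        demote_rq();
        demote_rq();   /* silent the second time */
        return 0;
    }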
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 4ef0e9fa3549..9ec73a854111 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -23,6 +23,7 @@
23#include "meta_io.h" 23#include "meta_io.h"
24#include "trans.h" 24#include "trans.h"
25#include "util.h" 25#include "util.h"
26#include "trace_gfs2.h"
26 27
27int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, 28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
28 unsigned int revokes) 29 unsigned int revokes)
@@ -75,6 +76,23 @@ fail_holder_uninit:
75 return error; 76 return error;
76} 77}
77 78
79/**
80 * gfs2_log_release - Release a given number of log blocks
81 * @sdp: The GFS2 superblock
82 * @blks: The number of blocks
83 *
84 */
85
86static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
87{
88
89 atomic_add(blks, &sdp->sd_log_blks_free);
90 trace_gfs2_log_blocks(sdp, blks);
91 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
92 sdp->sd_jdesc->jd_blocks);
93 up_read(&sdp->sd_log_flush_lock);
94}
95
78void gfs2_trans_end(struct gfs2_sbd *sdp) 96void gfs2_trans_end(struct gfs2_sbd *sdp)
79{ 97{
80 struct gfs2_trans *tr = current->journal_info; 98 struct gfs2_trans *tr = current->journal_info;
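Note: gfs2_log_release, added here as a static helper, gives reserved journal blocks back to the free pool, asserts that the free count never exceeds the journal size, and drops the read side of sd_log_flush_lock (presumably taken when the blocks were reserved). A userspace sketch of just the counter-plus-invariant part; jd_blocks is made up and the lock handling is omitted:

    #include <assert.h>
    #include <stdio.h>

    static const unsigned int jd_blocks = 8192;   /* hypothetical journal size */
    static unsigned int log_blks_free;

    static void log_release(unsigned int blks)
    {
        log_blks_free += blks;
        assert(log_blks_free <= jd_blocks);  /* mirrors gfs2_assert_withdraw() */
    }

    int main(void)
    {
        log_blks_free = jd_blocks - 10;   /* pretend 10 blocks were reserved */
        log_release(10);                  /* fine: journal exactly "empty" again */
        printf("free=%u of %u\n", log_blks_free, jd_blocks);
        return 0;
    }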
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 226f2bfbf16a..53511291fe36 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 0d200068d0af..cdb41a1f6a64 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 052f214ea6f0..38a0a9917d7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/log2.h> 13#include <linux/log2.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8bbe03c3f6d5..86428f5ac991 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -11,6 +11,7 @@
11#include <linux/cdrom.h> 11#include <linux/cdrom.h>
12#include <linux/genhd.h> 12#include <linux/genhd.h>
13#include <linux/nls.h> 13#include <linux/nls.h>
14#include <linux/slab.h>
14 15
15#include "hfs_fs.h" 16#include "hfs_fs.h"
16#include "btree.h" 17#include "btree.h"
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 5ed7252b7b23..0a81eb7111f3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
19#include <linux/nls.h> 19#include <linux/nls.h>
20#include <linux/parser.h> 20#include <linux/parser.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/slab.h>
22#include <linux/smp_lock.h> 23#include <linux/smp_lock.h>
23#include <linux/vfs.h> 24#include <linux/vfs.h>
24 25
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 3fcbb0e1f6fc..572628b4b07d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -15,6 +15,7 @@
15#include <linux/nls.h> 15#include <linux/nls.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/slab.h>
18#include "hfsplus_fs.h" 19#include "hfsplus_fs.h"
19 20
20enum { 21enum {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 032604e5ef2c..3a029d8f4cf1 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -11,6 +11,7 @@
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/slab.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
15#include <linux/mount.h> 16#include <linux/mount.h>
16#include "hostfs.h" 17#include "hostfs.h"
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index b6fca543544c..eac5f96323e3 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -6,6 +6,7 @@
6 * general buffer i/o 6 * general buffer i/o
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
9#include "hpfs_fn.h" 10#include "hpfs_fn.h"
10 11
11void hpfs_lock_creation(struct super_block *s) 12void hpfs_lock_creation(struct super_block *s)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 26e3964a4b8c..2338130cceba 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12static int hpfs_dir_release(struct inode *inode, struct file *filp) 13static int hpfs_dir_release(struct inode *inode, struct file *filp)
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index ff90affb94e1..1042a9bc97f3 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12void hpfs_init_inode(struct inode *i) 13void hpfs_init_inode(struct inode *i)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index cadc4ce48656..aa53842c599c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -15,6 +15,7 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/slab.h>
18 19
19/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ 20/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
20 21
diff --git a/fs/inode.c b/fs/inode.c
index 407bf392e20a..258ec22bb298 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1205,8 +1205,6 @@ void generic_delete_inode(struct inode *inode)
1205 inodes_stat.nr_inodes--; 1205 inodes_stat.nr_inodes--;
1206 spin_unlock(&inode_lock); 1206 spin_unlock(&inode_lock);
1207 1207
1208 security_inode_delete(inode);
1209
1210 if (op->delete_inode) { 1208 if (op->delete_inode) {
1211 void (*delete)(struct inode *) = op->delete_inode; 1209 void (*delete)(struct inode *) = op->delete_inode;
1212 /* Filesystems implementing their own 1210 /* Filesystems implementing their own
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 6c751106c2e5..7faefb4da939 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -228,14 +228,23 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
228 228
229#ifdef CONFIG_BLOCK 229#ifdef CONFIG_BLOCK
230 230
231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) 231static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); 232{
233 return (offset >> inode->i_blkbits);
234}
235
236static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
237{
238 return (blk << inode->i_blkbits);
239}
233 240
234/** 241/**
235 * __generic_block_fiemap - FIEMAP for block based inodes (no locking) 242 * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
236 * @inode - the inode to map 243 * @inode: the inode to map
237 * @arg - the pointer to userspace where we copy everything to 244 * @fieinfo: the fiemap info struct that will be passed back to userspace
238 * @get_block - the fs's get_block function 245 * @start: where to start mapping in the inode
246 * @len: how much space to map
247 * @get_block: the fs's get_block function
239 * 248 *
240 * This does FIEMAP for block based inodes. Basically it will just loop 249 * This does FIEMAP for block based inodes. Basically it will just loop
241 * through get_block until we hit the number of extents we want to map, or we 250 * through get_block until we hit the number of extents we want to map, or we
@@ -250,58 +259,63 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
250 */ 259 */
251 260
252int __generic_block_fiemap(struct inode *inode, 261int __generic_block_fiemap(struct inode *inode,
253 struct fiemap_extent_info *fieinfo, u64 start, 262 struct fiemap_extent_info *fieinfo, loff_t start,
254 u64 len, get_block_t *get_block) 263 loff_t len, get_block_t *get_block)
255{ 264{
256 struct buffer_head tmp; 265 struct buffer_head map_bh;
257 unsigned long long start_blk; 266 sector_t start_blk, last_blk;
258 long long length = 0, map_len = 0; 267 loff_t isize = i_size_read(inode);
259 u64 logical = 0, phys = 0, size = 0; 268 u64 logical = 0, phys = 0, size = 0;
260 u32 flags = FIEMAP_EXTENT_MERGED; 269 u32 flags = FIEMAP_EXTENT_MERGED;
261 int ret = 0, past_eof = 0, whole_file = 0; 270 bool past_eof = false, whole_file = false;
271 int ret = 0;
262 272
263 if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) 273 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
274 if (ret)
264 return ret; 275 return ret;
265 276
266 start_blk = logical_to_blk(inode, start); 277 /*
267 278 * Either the i_mutex or other appropriate locking needs to be held
268 length = (long long)min_t(u64, len, i_size_read(inode)); 279 * since we expect isize to not change at all through the duration of
269 if (length < len) 280 * this call.
270 whole_file = 1; 281 */
282 if (len >= isize) {
283 whole_file = true;
284 len = isize;
285 }
271 286
272 map_len = length; 287 start_blk = logical_to_blk(inode, start);
288 last_blk = logical_to_blk(inode, start + len - 1);
273 289
274 do { 290 do {
275 /* 291 /*
276 * we set b_size to the total size we want so it will map as 292 * we set b_size to the total size we want so it will map as
277 * many contiguous blocks as possible at once 293 * many contiguous blocks as possible at once
278 */ 294 */
279 memset(&tmp, 0, sizeof(struct buffer_head)); 295 memset(&map_bh, 0, sizeof(struct buffer_head));
280 tmp.b_size = map_len; 296 map_bh.b_size = len;
281 297
282 ret = get_block(inode, start_blk, &tmp, 0); 298 ret = get_block(inode, start_blk, &map_bh, 0);
283 if (ret) 299 if (ret)
284 break; 300 break;
285 301
286 /* HOLE */ 302 /* HOLE */
287 if (!buffer_mapped(&tmp)) { 303 if (!buffer_mapped(&map_bh)) {
288 length -= blk_to_logical(inode, 1);
289 start_blk++; 304 start_blk++;
290 305
291 /* 306 /*
292 * we want to handle the case where there is an 307 * We want to handle the case where there is an
293 * allocated block at the front of the file, and then 308 * allocated block at the front of the file, and then
294 * nothing but holes up to the end of the file properly, 309 * nothing but holes up to the end of the file properly,
295 * to make sure that extent at the front gets properly 310 * to make sure that extent at the front gets properly
296 * marked with FIEMAP_EXTENT_LAST 311 * marked with FIEMAP_EXTENT_LAST
297 */ 312 */
298 if (!past_eof && 313 if (!past_eof &&
299 blk_to_logical(inode, start_blk) >= 314 blk_to_logical(inode, start_blk) >= isize)
300 blk_to_logical(inode, 0)+i_size_read(inode))
301 past_eof = 1; 315 past_eof = 1;
302 316
303 /* 317 /*
304 * first hole after going past the EOF, this is our 318 * First hole after going past the EOF, this is our
305 * last extent 319 * last extent
306 */ 320 */
307 if (past_eof && size) { 321 if (past_eof && size) {
@@ -309,15 +323,18 @@ int __generic_block_fiemap(struct inode *inode,
309 ret = fiemap_fill_next_extent(fieinfo, logical, 323 ret = fiemap_fill_next_extent(fieinfo, logical,
310 phys, size, 324 phys, size,
311 flags); 325 flags);
312 break; 326 } else if (size) {
327 ret = fiemap_fill_next_extent(fieinfo, logical,
328 phys, size, flags);
329 size = 0;
313 } 330 }
314 331
315 /* if we have holes up to/past EOF then we're done */ 332 /* if we have holes up to/past EOF then we're done */
316 if (length <= 0 || past_eof) 333 if (start_blk > last_blk || past_eof || ret)
317 break; 334 break;
318 } else { 335 } else {
319 /* 336 /*
320 * we have gone over the length of what we wanted to 337 * We have gone over the length of what we wanted to
321 * map, and it wasn't the entire file, so add the extent 338 * map, and it wasn't the entire file, so add the extent
322 * we got last time and exit. 339 * we got last time and exit.
323 * 340 *
@@ -331,7 +348,7 @@ int __generic_block_fiemap(struct inode *inode,
331 * are good to go, just add the extent to the fieinfo 348 * are good to go, just add the extent to the fieinfo
332 * and break 349 * and break
333 */ 350 */
334 if (length <= 0 && !whole_file) { 351 if (start_blk > last_blk && !whole_file) {
335 ret = fiemap_fill_next_extent(fieinfo, logical, 352 ret = fiemap_fill_next_extent(fieinfo, logical,
336 phys, size, 353 phys, size,
337 flags); 354 flags);
@@ -351,11 +368,10 @@ int __generic_block_fiemap(struct inode *inode,
351 } 368 }
352 369
353 logical = blk_to_logical(inode, start_blk); 370 logical = blk_to_logical(inode, start_blk);
354 phys = blk_to_logical(inode, tmp.b_blocknr); 371 phys = blk_to_logical(inode, map_bh.b_blocknr);
355 size = tmp.b_size; 372 size = map_bh.b_size;
356 flags = FIEMAP_EXTENT_MERGED; 373 flags = FIEMAP_EXTENT_MERGED;
357 374
358 length -= tmp.b_size;
359 start_blk += logical_to_blk(inode, size); 375 start_blk += logical_to_blk(inode, size);
360 376
361 /* 377 /*
@@ -363,15 +379,13 @@ int __generic_block_fiemap(struct inode *inode,
363 * soon as we find a hole that the last extent we found 379 * soon as we find a hole that the last extent we found
364 * is marked with FIEMAP_EXTENT_LAST 380 * is marked with FIEMAP_EXTENT_LAST
365 */ 381 */
366 if (!past_eof && 382 if (!past_eof && logical + size >= isize)
367 logical+size >= 383 past_eof = true;
368 blk_to_logical(inode, 0)+i_size_read(inode))
369 past_eof = 1;
370 } 384 }
371 cond_resched(); 385 cond_resched();
372 } while (1); 386 } while (1);
373 387
374 /* if ret is 1 then we just hit the end of the extent array */ 388 /* If ret is 1 then we just hit the end of the extent array */
375 if (ret == 1) 389 if (ret == 1)
376 ret = 0; 390 ret = 0;
377 391
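Note: two things are worth calling out in the ioctl.c rewrite. First, the old macros become typed static inline helpers; the removed logical_to_blk macro even carried a stray trailing semicolon, which made it hazardous in expressions. Second, the loop now bounds itself with last_blk instead of decrementing a signed length. A sketch of the helper conversion with the block math applied (blkbits is passed directly instead of via struct inode, purely for illustration):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;
    typedef int64_t  loff_t;

    /* Typed replacements for the old text-substitution macros. */
    static inline sector_t logical_to_blk(loff_t offset, unsigned int blkbits)
    {
        return offset >> blkbits;
    }

    static inline loff_t blk_to_logical(sector_t blk, unsigned int blkbits)
    {
        return (loff_t)blk << blkbits;
    }

    int main(void)
    {
        unsigned int blkbits = 12;                    /* 4 KiB blocks */
        loff_t start = 5000, len = 10000;
        sector_t first = logical_to_blk(start, blkbits);
        sector_t last  = logical_to_blk(start + len - 1, blkbits);
        printf("blocks %llu..%llu\n",
               (unsigned long long)first, (unsigned long long)last); /* 1..3 */
        return 0;
    }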
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c7c0b28d7d21..748cfb92dcc6 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -19,6 +19,7 @@
19 * See also Documentation/block/ioprio.txt 19 * See also Documentation/block/ioprio.txt
20 * 20 *
21 */ 21 */
22#include <linux/gfp.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/ioprio.h> 24#include <linux/ioprio.h>
24#include <linux/blkdev.h> 25#include <linux/blkdev.h>
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 8ba5441063be..b9ab69b3a482 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -11,6 +11,7 @@
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
14#include <linux/gfp.h>
14#include "isofs.h" 15#include "isofs.h"
15 16
16int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) 17int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index eaa831311c9c..ab438beb867c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/gfp.h>
10#include "isofs.h" 11#include "isofs.h"
11 12
12/* 13/*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 2c90e3ef625f..ecb44c94ba8d 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -17,7 +17,6 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/jbd.h> 18#include <linux/jbd.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h> 20#include <linux/mm.h>
22#include <linux/pagemap.h> 21#include <linux/pagemap.h>
23#include <linux/bio.h> 22#include <linux/bio.h>
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index cb1a49ae605e..54c9bc9e1b17 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd.h> 21#include <linux/jbd.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h>
24#endif 23#endif
25 24
26/* 25/*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 99e9fea11077..5ae71e75a491 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1398,7 +1398,7 @@ int journal_stop(handle_t *handle)
1398 * the case where our storage is so fast that it is more optimal to go 1398 * the case where our storage is so fast that it is more optimal to go
1399 * ahead and force a flush and wait for the transaction to be committed 1399 * ahead and force a flush and wait for the transaction to be committed
1400 * than it is to wait for an arbitrary amount of time for new writers to 1400 * than it is to wait for an arbitrary amount of time for new writers to
1401 * join the transaction. We acheive this by measuring how long it takes 1401 * join the transaction. We achieve this by measuring how long it takes
1402 * to commit a transaction, and compare it with how long this 1402 * to commit a transaction, and compare it with how long this
1403 * transaction has been running, and if run time < commit time then we 1403 * transaction has been running, and if run time < commit time then we
1404 * sleep for the delta and commit. This greatly helps super fast disks 1404 * sleep for the delta and commit. This greatly helps super fast disks
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c03d4dce4d76..bc2ff5932769 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1889,7 +1889,7 @@ static struct kmem_cache *get_slab(size_t size)
1889 BUG_ON(i >= JBD2_MAX_SLABS); 1889 BUG_ON(i >= JBD2_MAX_SLABS);
1890 if (unlikely(i < 0)) 1890 if (unlikely(i < 0))
1891 i = 0; 1891 i = 0;
1892 BUG_ON(jbd2_slab[i] == 0); 1892 BUG_ON(jbd2_slab[i] == NULL);
1893 return jbd2_slab[i]; 1893 return jbd2_slab[i];
1894} 1894}
1895 1895
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 73063285b13f..049281b7cb89 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h>
24#include <linux/crc32.h> 23#include <linux/crc32.h>
25#endif 24#endif
26 25
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3ff50da94789..55f1dde2fa8b 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -23,10 +23,9 @@ static int jffs2_garbage_collect_thread(void *);
23 23
24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) 24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
25{ 25{
26 spin_lock(&c->erase_completion_lock); 26 assert_spin_locked(&c->erase_completion_lock);
27 if (c->gc_task && jffs2_thread_should_wake(c)) 27 if (c->gc_task && jffs2_thread_should_wake(c))
28 send_sig(SIGHUP, c->gc_task, 1); 28 send_sig(SIGHUP, c->gc_task, 1);
29 spin_unlock(&c->erase_completion_lock);
30} 29}
31 30
32/* This must only ever be called when no GC thread is currently running */ 31/* This must only ever be called when no GC thread is currently running */
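Note: jffs2_garbage_collect_trigger changes its locking contract here. Instead of taking erase_completion_lock itself, it now asserts the caller already holds it, which lets the callers added later in this patch (erase.c, nodemgmt.c, scan.c, wbuf.c) fold the trigger into an existing critical section. A pthread sketch of the same caller-holds-lock convention; pthreads has no assert_spin_locked equivalent, so the contract lives in a comment:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t erase_completion_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Must be called with erase_completion_lock held (the kernel version
     * enforces this with assert_spin_locked()). */
    static void gc_trigger_locked(void)
    {
        printf("wake the GC thread\n");
    }

    int main(void)
    {
        pthread_mutex_lock(&erase_completion_lock);
        /* ... manipulate the erase lists ... */
        gc_trigger_locked();   /* trigger folded into the critical section */
        pthread_mutex_unlock(&erase_completion_lock);
        return 0;
    }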
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 90cb60d09787..cd02acafde8a 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -11,7 +11,6 @@
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/lzo.h> 16#include <linux/lzo.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index cfd301a5edfc..b46661a42758 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -14,7 +14,6 @@
14#endif 14#endif
15 15
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
18#include <linux/zlib.h> 17#include <linux/zlib.h>
19#include <linux/zutil.h> 18#include <linux/zutil.h>
20#include "nodelist.h" 19#include "nodelist.h"
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 5544d31c066b..ec3538413926 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -15,6 +15,7 @@
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/jffs2.h> 16#include <linux/jffs2.h>
17#include <linux/mtd/mtd.h> 17#include <linux/mtd/mtd.h>
18#include <linux/slab.h>
18#include "nodelist.h" 19#include "nodelist.h"
19#include "debug.h" 20#include "debug.h"
20 21
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index b47679be118a..6286ad9b00f7 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -103,9 +103,10 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
103 jffs2_erase_failed(c, jeb, bad_offset); 103 jffs2_erase_failed(c, jeb, bad_offset);
104} 104}
105 105
106void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) 106int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
107{ 107{
108 struct jffs2_eraseblock *jeb; 108 struct jffs2_eraseblock *jeb;
109 int work_done = 0;
109 110
110 mutex_lock(&c->erase_free_sem); 111 mutex_lock(&c->erase_free_sem);
111 112
@@ -121,6 +122,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
121 mutex_unlock(&c->erase_free_sem); 122 mutex_unlock(&c->erase_free_sem);
122 jffs2_mark_erased_block(c, jeb); 123 jffs2_mark_erased_block(c, jeb);
123 124
125 work_done++;
124 if (!--count) { 126 if (!--count) {
125 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n")); 127 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n"));
126 goto done; 128 goto done;
@@ -157,6 +159,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
157 mutex_unlock(&c->erase_free_sem); 159 mutex_unlock(&c->erase_free_sem);
158 done: 160 done:
159 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); 161 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n"));
162 return work_done;
160} 163}
161 164
162static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) 165static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
@@ -165,10 +168,11 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
165 mutex_lock(&c->erase_free_sem); 168 mutex_lock(&c->erase_free_sem);
166 spin_lock(&c->erase_completion_lock); 169 spin_lock(&c->erase_completion_lock);
167 list_move_tail(&jeb->list, &c->erase_complete_list); 170 list_move_tail(&jeb->list, &c->erase_complete_list);
171 /* Wake the GC thread to mark them clean */
172 jffs2_garbage_collect_trigger(c);
168 spin_unlock(&c->erase_completion_lock); 173 spin_unlock(&c->erase_completion_lock);
169 mutex_unlock(&c->erase_free_sem); 174 mutex_unlock(&c->erase_free_sem);
170 /* Ensure that kupdated calls us again to mark them clean */ 175 wake_up(&c->erase_wait);
171 jffs2_erase_pending_trigger(c);
172} 176}
173 177
174static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) 178static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset)
@@ -487,9 +491,9 @@ filebad:
487 491
488refile: 492refile:
489 /* Stick it back on the list from whence it came and come back later */ 493 /* Stick it back on the list from whence it came and come back later */
490 jffs2_erase_pending_trigger(c);
491 mutex_lock(&c->erase_free_sem); 494 mutex_lock(&c->erase_free_sem);
492 spin_lock(&c->erase_completion_lock); 495 spin_lock(&c->erase_completion_lock);
496 jffs2_garbage_collect_trigger(c);
493 list_move(&jeb->list, &c->erase_complete_list); 497 list_move(&jeb->list, &c->erase_complete_list);
494 spin_unlock(&c->erase_completion_lock); 498 spin_unlock(&c->erase_completion_lock);
495 mutex_unlock(&c->erase_free_sem); 499 mutex_unlock(&c->erase_free_sem);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index b7b74e299142..e7291c161a19 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/time.h> 14#include <linux/time.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 3451a81b2142..86e0821fc989 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -313,8 +313,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
313 case S_IFBLK: 313 case S_IFBLK:
314 case S_IFCHR: 314 case S_IFCHR:
315 /* Read the device numbers from the media */ 315 /* Read the device numbers from the media */
316 if (f->metadata->size != sizeof(jdev.old) && 316 if (f->metadata->size != sizeof(jdev.old_id) &&
317 f->metadata->size != sizeof(jdev.new)) { 317 f->metadata->size != sizeof(jdev.new_id)) {
318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); 318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
319 goto error_io; 319 goto error_io;
320 } 320 }
@@ -325,10 +325,10 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); 325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
326 goto error; 326 goto error;
327 } 327 }
328 if (f->metadata->size == sizeof(jdev.old)) 328 if (f->metadata->size == sizeof(jdev.old_id))
329 rdev = old_decode_dev(je16_to_cpu(jdev.old)); 329 rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
330 else 330 else
331 rdev = new_decode_dev(je32_to_cpu(jdev.new)); 331 rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
332 332
333 case S_IFSOCK: 333 case S_IFSOCK:
334 case S_IFIFO: 334 case S_IFIFO:
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3b6f2fa12cff..f5e96bd656e8 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -214,6 +214,19 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
214 return ret; 214 return ret;
215 } 215 }
216 216
217 /* If there are any blocks which need erasing, erase them now */
218 if (!list_empty(&c->erase_complete_list) ||
219 !list_empty(&c->erase_pending_list)) {
220 spin_unlock(&c->erase_completion_lock);
221 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
222 if (jffs2_erase_pending_blocks(c, 1)) {
223 mutex_unlock(&c->alloc_sem);
224 return 0;
225 }
226 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
227 spin_lock(&c->erase_completion_lock);
228 }
229
217 /* First, work out which block we're garbage-collecting */ 230 /* First, work out which block we're garbage-collecting */
218 jeb = c->gcblock; 231 jeb = c->gcblock;
219 232
@@ -222,7 +235,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
222 235
223 if (!jeb) { 236 if (!jeb) {
224 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */ 237 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */
225 if (!list_empty(&c->erase_pending_list)) { 238 if (c->nr_erasing_blocks) {
226 spin_unlock(&c->erase_completion_lock); 239 spin_unlock(&c->erase_completion_lock);
227 mutex_unlock(&c->alloc_sem); 240 mutex_unlock(&c->alloc_sem);
228 return -EAGAIN; 241 return -EAGAIN;
@@ -435,7 +448,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
435 list_add_tail(&c->gcblock->list, &c->erase_pending_list); 448 list_add_tail(&c->gcblock->list, &c->erase_pending_list);
436 c->gcblock = NULL; 449 c->gcblock = NULL;
437 c->nr_erasing_blocks++; 450 c->nr_erasing_blocks++;
438 jffs2_erase_pending_trigger(c); 451 jffs2_garbage_collect_trigger(c);
439 } 452 }
440 spin_unlock(&c->erase_completion_lock); 453 spin_unlock(&c->erase_completion_lock);
441 454
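Note: jffs2_erase_pending_blocks now reports whether it actually erased anything, and jffs2_garbage_collect_pass uses that to count a completed erase as this pass's progress before falling back to real garbage collection. A compressed sketch of that decision; erase_pending is a stand-in that "erases" whatever is queued:

    #include <stdio.h>

    static int pending = 1;   /* pretend one block is queued for erase */

    /* Stand-in for jffs2_erase_pending_blocks(): returns how much work
     * was done so the caller can tell whether it made progress. */
    static int erase_pending(int count)
    {
        int work_done = 0;
        while (pending && count--) {
            pending--;
            work_done++;
        }
        return work_done;
    }

    static void gc_pass(void)
    {
        if (erase_pending(1))
            return;            /* erasing counted as progress; skip GC */
        printf("no erases pending; doing real GC\n");
    }

    int main(void)
    {
        gc_pass();   /* first pass: consumed the queued erase */
        gc_pass();   /* second pass: falls through to GC */
        return 0;
    }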
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 87c6f555e1a0..af02bd138469 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -15,7 +15,6 @@
15#include <linux/mtd/mtd.h> 15#include <linux/mtd/mtd.h>
16#include <linux/rbtree.h> 16#include <linux/rbtree.h>
17#include <linux/crc32.h> 17#include <linux/crc32.h>
18#include <linux/slab.h>
19#include <linux/pagemap.h> 18#include <linux/pagemap.h>
20#include "nodelist.h" 19#include "nodelist.h"
21 20
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 507ed6ec1847..a881a42f19e3 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -312,11 +312,11 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) 312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
313{ 313{
314 if (old_valid_dev(rdev)) { 314 if (old_valid_dev(rdev)) {
315 jdev->old = cpu_to_je16(old_encode_dev(rdev)); 315 jdev->old_id = cpu_to_je16(old_encode_dev(rdev));
316 return sizeof(jdev->old); 316 return sizeof(jdev->old_id);
317 } else { 317 } else {
318 jdev->new = cpu_to_je32(new_encode_dev(rdev)); 318 jdev->new_id = cpu_to_je32(new_encode_dev(rdev));
319 return sizeof(jdev->new); 319 return sizeof(jdev->new_id);
320 } 320 }
321} 321}
322 322
@@ -464,7 +464,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
464int jffs2_do_mount_fs(struct jffs2_sb_info *c); 464int jffs2_do_mount_fs(struct jffs2_sb_info *c);
465 465
466/* erase.c */ 466/* erase.c */
467void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); 467int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); 468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
469 469
470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
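Note: the union members old/new become old_id/new_id; the encoding logic in jffs2_encode_dev is otherwise untouched, picking the legacy 16-bit device encoding when it fits and the 32-bit one otherwise, and it is this size that jffs2_iget checks against in the fs.c hunk above. A simplified stand-alone version of that choice; the validity test and encodings are stand-ins for old_valid_dev/old_encode_dev/new_encode_dev:

    #include <stdint.h>
    #include <stdio.h>

    union device_node {
        uint16_t old_id;   /* legacy 16-bit dev_t encoding */
        uint32_t new_id;   /* 32-bit encoding for larger numbers */
    };

    static int encode_dev(union device_node *jdev, uint32_t rdev)
    {
        if (rdev <= 0xffff) {            /* stand-in for old_valid_dev() */
            jdev->old_id = (uint16_t)rdev;
            return sizeof(jdev->old_id);
        }
        jdev->new_id = rdev;
        return sizeof(jdev->new_id);
    }

    int main(void)
    {
        union device_node jdev;
        printf("small dev: %d bytes\n", encode_dev(&jdev, 0x0801));    /* 2 */
        printf("large dev: %d bytes\n", encode_dev(&jdev, 0x1234567)); /* 4 */
        return 0;
    }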
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 21a052915aa9..694aa5b03505 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/mtd/mtd.h> 13#include <linux/mtd/mtd.h>
15#include <linux/compiler.h> 14#include <linux/compiler.h>
16#include <linux/sched.h> /* For cond_resched() */ 15#include <linux/sched.h> /* For cond_resched() */
@@ -117,9 +116,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
117 116
118 ret = jffs2_garbage_collect_pass(c); 117 ret = jffs2_garbage_collect_pass(c);
119 118
120 if (ret == -EAGAIN) 119 if (ret == -EAGAIN) {
121 jffs2_erase_pending_blocks(c, 1); 120 spin_lock(&c->erase_completion_lock);
122 else if (ret) 121 if (c->nr_erasing_blocks &&
122 list_empty(&c->erase_pending_list) &&
123 list_empty(&c->erase_complete_list)) {
124 DECLARE_WAITQUEUE(wait, current);
125 set_current_state(TASK_UNINTERRUPTIBLE);
126 add_wait_queue(&c->erase_wait, &wait);
127 D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__));
128 spin_unlock(&c->erase_completion_lock);
129
130 schedule();
131 } else
132 spin_unlock(&c->erase_completion_lock);
133 } else if (ret)
123 return ret; 134 return ret;
124 135
125 cond_resched(); 136 cond_resched();
@@ -218,7 +229,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
218 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); 229 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
219 list_move_tail(&ejeb->list, &c->erase_pending_list); 230 list_move_tail(&ejeb->list, &c->erase_pending_list);
220 c->nr_erasing_blocks++; 231 c->nr_erasing_blocks++;
221 jffs2_erase_pending_trigger(c); 232 jffs2_garbage_collect_trigger(c);
222 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", 233 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
223 ejeb->offset)); 234 ejeb->offset));
224 } 235 }
@@ -470,7 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
470void jffs2_complete_reservation(struct jffs2_sb_info *c) 481void jffs2_complete_reservation(struct jffs2_sb_info *c)
471{ 482{
472 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); 483 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n"));
484 spin_lock(&c->erase_completion_lock);
473 jffs2_garbage_collect_trigger(c); 485 jffs2_garbage_collect_trigger(c);
486 spin_unlock(&c->erase_completion_lock);
474 mutex_unlock(&c->alloc_sem); 487 mutex_unlock(&c->alloc_sem);
475} 488}
476 489
@@ -612,7 +625,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
612 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 625 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
613 list_add_tail(&jeb->list, &c->erase_pending_list); 626 list_add_tail(&jeb->list, &c->erase_pending_list);
614 c->nr_erasing_blocks++; 627 c->nr_erasing_blocks++;
615 jffs2_erase_pending_trigger(c); 628 jffs2_garbage_collect_trigger(c);
616 } else { 629 } else {
617 /* Sometimes, however, we leave it elsewhere so it doesn't get 630 /* Sometimes, however, we leave it elsewhere so it doesn't get
618 immediately reused, and we spread the load a bit. */ 631 immediately reused, and we spread the load a bit. */
@@ -733,6 +746,10 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
733 int nr_very_dirty = 0; 746 int nr_very_dirty = 0;
734 struct jffs2_eraseblock *jeb; 747 struct jffs2_eraseblock *jeb;
735 748
749 if (!list_empty(&c->erase_complete_list) ||
750 !list_empty(&c->erase_pending_list))
751 return 1;
752
736 if (c->unchecked_size) { 753 if (c->unchecked_size) {
737 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", 754 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
738 c->unchecked_size, c->checked_ino)); 755 c->unchecked_size, c->checked_ino));
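Note: the -EAGAIN path in jffs2_reserve_space no longer spins on jffs2_erase_pending_blocks; if erases are already in flight and nothing is left to start, it queues itself on the new erase_wait waitqueue, drops the lock, and sleeps until jffs2_erase_succeeded's wake_up. A userspace analogue using a condvar in place of the waitqueue + schedule(); this is simplified (the kernel version also removes itself from the queue and rechecks its lists):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  erase_wait = PTHREAD_COND_INITIALIZER;
    static bool erase_done;

    static void wait_for_erase(void)
    {
        pthread_mutex_lock(&lock);
        while (!erase_done)
            pthread_cond_wait(&erase_wait, &lock); /* drops lock while asleep */
        pthread_mutex_unlock(&lock);
    }

    static void *eraser(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        erase_done = true;                 /* like wake_up(&c->erase_wait) */
        pthread_cond_signal(&erase_wait);
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, eraser, NULL);
        wait_for_erase();
        pthread_join(t, NULL);
        printf("erase completed, retrying reservation\n");
        return 0;
    }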
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index a7f03b7ebcb3..035a767f958b 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -140,8 +140,7 @@ void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
140 140
141#endif /* WRITEBUFFER */ 141#endif /* WRITEBUFFER */
142 142
143/* erase.c */ 143static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
144static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c)
145{ 144{
146 OFNI_BS_2SFFJ(c)->s_dirt = 1; 145 OFNI_BS_2SFFJ(c)->s_dirt = 1;
147} 146}
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index e22de8397b74..d32ee9412cb9 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -567,7 +567,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
567 else BUG(); 567 else BUG();
568 } 568 }
569 } 569 }
570 list->rb_node = NULL; 570 *list = RB_ROOT;
571} 571}
572 572
573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd) 573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd)
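
The readinode.c change swaps a field-level reset for a struct assignment. Writing *list = RB_ROOT reinitialises every member of struct rb_root in one go, so the code stays correct even if rb_root later grows additional fields; clearing list->rb_node by hand only works while rb_node happens to be the sole member. In isolation:

    /* Illustrative: RB_ROOT expands to a zero initialiser for
     * struct rb_root, so plain struct assignment resets the root. */
    struct rb_root tree = RB_ROOT;

    tree = RB_ROOT;             /* robust against layout changes */
    /* tree.rb_node = NULL;        fragile: assumes one member   */
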
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 696686cc206e..46f870d1cc36 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -260,7 +260,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
260 ret = -EIO; 260 ret = -EIO;
261 goto out; 261 goto out;
262 } 262 }
263 jffs2_erase_pending_trigger(c); 263 spin_lock(&c->erase_completion_lock);
264 jffs2_garbage_collect_trigger(c);
265 spin_unlock(&c->erase_completion_lock);
264 } 266 }
265 ret = 0; 267 ret = 0;
266 out: 268 out:
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9a80e8e595d0..511e2d609d12 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,8 +63,6 @@ static void jffs2_write_super(struct super_block *sb)
63 63
64 if (!(sb->s_flags & MS_RDONLY)) { 64 if (!(sb->s_flags & MS_RDONLY)) {
65 D1(printk(KERN_DEBUG "jffs2_write_super()\n")); 65 D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
66 jffs2_garbage_collect_trigger(c);
67 jffs2_erase_pending_blocks(c, 0);
68 jffs2_flush_wbuf_gc(c, 0); 66 jffs2_flush_wbuf_gc(c, 0);
69 } 67 }
70 68
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 4ec11e8bda8c..b955626071c2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
16#include "nodelist.h" 15#include "nodelist.h"
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5ef7bac265e5..07ee1546b2fa 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -84,7 +84,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
84 struct jffs2_inodirty *new; 84 struct jffs2_inodirty *new;
85 85
86 /* Mark the superblock dirty so that kupdated will flush... */ 86 /* Mark the superblock dirty so that kupdated will flush... */
87 jffs2_erase_pending_trigger(c); 87 jffs2_dirty_trigger(c);
88 88
89 if (jffs2_wbuf_pending_for_ino(c, ino)) 89 if (jffs2_wbuf_pending_for_ino(c, ino))
90 return; 90 return;
@@ -121,7 +121,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
122 list_add_tail(&jeb->list, &c->erase_pending_list); 122 list_add_tail(&jeb->list, &c->erase_pending_list);
123 c->nr_erasing_blocks++; 123 c->nr_erasing_blocks++;
124 jffs2_erase_pending_trigger(c); 124 jffs2_garbage_collect_trigger(c);
125 } else { 125 } else {
126 /* Sometimes, however, we leave it elsewhere so it doesn't get 126 /* Sometimes, however, we leave it elsewhere so it doesn't get
127 immediately reused, and we spread the load a bit. */ 127 immediately reused, and we spread the load a bit. */
@@ -152,7 +152,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); 152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset));
153 list_add(&jeb->list, &c->erase_pending_list); 153 list_add(&jeb->list, &c->erase_pending_list);
154 c->nr_erasing_blocks++; 154 c->nr_erasing_blocks++;
155 jffs2_erase_pending_trigger(c); 155 jffs2_garbage_collect_trigger(c);
156 } 156 }
157 157
158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { 158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
@@ -543,7 +543,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); 543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
544 list_move(&jeb->list, &c->erase_pending_list); 544 list_move(&jeb->list, &c->erase_pending_list);
545 c->nr_erasing_blocks++; 545 c->nr_erasing_blocks++;
546 jffs2_erase_pending_trigger(c); 546 jffs2_garbage_collect_trigger(c);
547 } 547 }
548 548
549 jffs2_dbg_acct_sanity_check_nolock(c, jeb); 549 jffs2_dbg_acct_sanity_check_nolock(c, jeb);
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index ca29440e9435..c819eb0e982d 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/crc32.h> 14#include <linux/crc32.h>
15#include <linux/slab.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
17#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
18#include "nodelist.h" 17#include "nodelist.h"
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 213169780b6c..1057a4998e4e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -19,6 +19,7 @@
19 */ 19 */
20 20
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/slab.h>
22#include <linux/fs.h> 23#include <linux/fs.h>
23#include <linux/posix_acl_xattr.h> 24#include <linux/posix_acl_xattr.h>
24#include "jfs_incore.h" 25#include "jfs_incore.h"
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9dd126276c9f..ed9ba6fe04f5 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -61,7 +61,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
61 inode->i_op = &page_symlink_inode_operations; 61 inode->i_op = &page_symlink_inode_operations;
62 inode->i_mapping->a_ops = &jfs_aops; 62 inode->i_mapping->a_ops = &jfs_aops;
63 } else { 63 } else {
64 inode->i_op = &jfs_symlink_inode_operations; 64 inode->i_op = &jfs_fast_symlink_inode_operations;
65 /* 65 /*
66 * The inline data should be null-terminated, but 66 * The inline data should be null-terminated, but
67 * don't let on-disk corruption crash the kernel 67 * don't let on-disk corruption crash the kernel
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d9b031cf69f5..c92ea3b3ea5e 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include "jfs_incore.h" 21#include "jfs_incore.h"
21#include "jfs_superblock.h" 22#include "jfs_superblock.h"
22#include "jfs_dmap.h" 23#include "jfs_dmap.h"
@@ -195,7 +196,7 @@ int dbMount(struct inode *ipbmap)
195 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); 196 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
196 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); 197 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
197 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); 198 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
198 bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); 199 bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
199 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); 200 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
200 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); 201 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
201 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); 202 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
@@ -287,7 +288,7 @@ int dbSync(struct inode *ipbmap)
287 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); 288 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
288 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); 289 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
289 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); 290 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
290 dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); 291 dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight);
291 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); 292 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
292 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); 293 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
293 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); 294 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
@@ -1440,7 +1441,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1440 * tree index of this allocation group within the control page. 1441 * tree index of this allocation group within the control page.
1441 */ 1442 */
1442 agperlev = 1443 agperlev =
1443 (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth; 1444 (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
1444 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); 1445 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
1445 1446
1446 /* dmap control page trees fan-out by 4 and a single allocation 1447 /* dmap control page trees fan-out by 4 and a single allocation
@@ -1459,7 +1460,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1459 * the subtree to find the leftmost leaf that describes this 1460 * the subtree to find the leftmost leaf that describes this
1460 * free space. 1461 * free space.
1461 */ 1462 */
1462 for (k = bmp->db_agheigth; k > 0; k--) { 1463 for (k = bmp->db_agheight; k > 0; k--) {
1463 for (n = 0, m = (ti << 2) + 1; n < 4; n++) { 1464 for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
1464 if (l2nb <= dcp->stree[m + n]) { 1465 if (l2nb <= dcp->stree[m + n]) {
1465 ti = m + n; 1466 ti = m + n;
@@ -2437,7 +2438,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2437 2438
2438 /* check if this is a control page update for an allocation. 2439 /* check if this is a control page update for an allocation.
2439 * if so, update the leaf to reflect the new leaf value using 2440 * if so, update the leaf to reflect the new leaf value using
2440 * dbSplit(); otherwise (deallocation), use dbJoin() to udpate 2441 * dbSplit(); otherwise (deallocation), use dbJoin() to update
2441 * the leaf with the new value. in addition to updating the 2442 * the leaf with the new value. in addition to updating the
2442 * leaf, dbSplit() will also split the binary buddy system of 2443 * leaf, dbSplit() will also split the binary buddy system of
2443 * the leaves, if required, and bubble new values within the 2444 * the leaves, if required, and bubble new values within the
@@ -3606,7 +3607,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3606 } 3607 }
3607 3608
3608 /* 3609 /*
3609 * compute db_aglevel, db_agheigth, db_width, db_agstart: 3610 * compute db_aglevel, db_agheight, db_width, db_agstart:
3610 * an ag is covered in aglevel dmapctl summary tree, 3611 * an ag is covered in aglevel dmapctl summary tree,
3611 * at agheight level height (from leaf) with agwidth number of nodes 3612 * at agheight level height (from leaf) with agwidth number of nodes
3612 * each, which starts at agstart index node of the summary tree node 3613 * each, which starts at agstart index node of the summary tree node
@@ -3615,9 +3616,9 @@ void dbFinalizeBmap(struct inode *ipbmap)
3615 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); 3616 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
3616 l2nl = 3617 l2nl =
3617 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); 3618 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
3618 bmp->db_agheigth = l2nl >> 1; 3619 bmp->db_agheight = l2nl >> 1;
3619 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1)); 3620 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1));
3620 for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0; 3621 for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0;
3621 i--) { 3622 i--) {
3622 bmp->db_agstart += n; 3623 bmp->db_agstart += n;
3623 n <<= 2; 3624 n <<= 2;
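
The agheigth to agheight rename is mechanical, but the arithmetic in dbFinalizeBmap() deserves a gloss: each dmapctl level fans out by four (two bits per level), so an allocation group spanning 2^l2nl leaves sits agheight = l2nl >> 1 full levels above the leaves and occupies agwidth = 1 << (l2nl - 2*agheight) nodes, i.e. one or two. A worked example with an assumed l2nl of 5:

    /* Assumed input, for illustration only: l2nl = 5. */
    int l2nl = 5, agheight, agwidth, agstart, i, n;

    agheight = l2nl >> 1;                     /* 2 full 4-ary levels  */
    agwidth  = 1 << (l2nl - (agheight << 1)); /* 1 << 1 == 2 nodes    */
    for (i = 5 - agheight, agstart = 0, n = 1; i > 0; i--) {
            agstart += n;                     /* rows above: 1, 4, 16 */
            n <<= 2;
    }
    /* agstart == 21: index of the first summary-tree node on the
     * row that describes this allocation group. */
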
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 1a6eb41569bc..6dcb906c55d8 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -210,7 +210,7 @@ struct dbmap_disk {
210 __le32 dn_maxag; /* 4: max active alloc group number */ 210 __le32 dn_maxag; /* 4: max active alloc group number */
211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */ 211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */
212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ 212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */
213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ 213 __le32 dn_agheight; /* 4: height in dmapctl of the AG */
214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ 214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */
215 __le32 dn_agstart; /* 4: start tree index at AG height */ 215 __le32 dn_agstart; /* 4: start tree index at AG height */
216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ 216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */
@@ -229,7 +229,7 @@ struct dbmap {
229 int dn_maxag; /* max active alloc group number */ 229 int dn_maxag; /* max active alloc group number */
230 int dn_agpref; /* preferred alloc group (hint) */ 230 int dn_agpref; /* preferred alloc group (hint) */
231 int dn_aglevel; /* dmapctl level holding the AG */ 231 int dn_aglevel; /* dmapctl level holding the AG */
232 int dn_agheigth; /* height in dmapctl of the AG */ 232 int dn_agheight; /* height in dmapctl of the AG */
233 int dn_agwidth; /* width in dmapctl of the AG */ 233 int dn_agwidth; /* width in dmapctl of the AG */
234 int dn_agstart; /* start tree index at AG height */ 234 int dn_agstart; /* start tree index at AG height */
235 int dn_agl2size; /* l2 num of blks per alloc group */ 235 int dn_agl2size; /* l2 num of blks per alloc group */
@@ -255,7 +255,7 @@ struct bmap {
255#define db_agsize db_bmap.dn_agsize 255#define db_agsize db_bmap.dn_agsize
256#define db_agl2size db_bmap.dn_agl2size 256#define db_agl2size db_bmap.dn_agl2size
257#define db_agwidth db_bmap.dn_agwidth 257#define db_agwidth db_bmap.dn_agwidth
258#define db_agheigth db_bmap.dn_agheigth 258#define db_agheight db_bmap.dn_agheight
259#define db_agstart db_bmap.dn_agstart 259#define db_agstart db_bmap.dn_agstart
260#define db_numag db_bmap.dn_numag 260#define db_numag db_bmap.dn_numag
261#define db_maxlevel db_bmap.dn_maxlevel 261#define db_maxlevel db_bmap.dn_maxlevel
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0e4623be70ce..9197a1b0d02d 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -102,6 +102,7 @@
102 102
103#include <linux/fs.h> 103#include <linux/fs.h>
104#include <linux/quotaops.h> 104#include <linux/quotaops.h>
105#include <linux/slab.h>
105#include "jfs_incore.h" 106#include "jfs_incore.h"
106#include "jfs_superblock.h" 107#include "jfs_superblock.h"
107#include "jfs_filsys.h" 108#include "jfs_filsys.h"
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0fc30407f039..f8332dc8eeb2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -45,6 +45,7 @@
45#include <linux/buffer_head.h> 45#include <linux/buffer_head.h>
46#include <linux/pagemap.h> 46#include <linux/pagemap.h>
47#include <linux/quotaops.h> 47#include <linux/quotaops.h>
48#include <linux/slab.h>
48 49
49#include "jfs_incore.h" 50#include "jfs_incore.h"
50#include "jfs_inode.h" 51#include "jfs_inode.h"
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 79e2c79661df..9e6bda30a6e8 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -48,5 +48,6 @@ extern const struct file_operations jfs_dir_operations;
48extern const struct inode_operations jfs_file_inode_operations; 48extern const struct inode_operations jfs_file_inode_operations;
49extern const struct file_operations jfs_file_operations; 49extern const struct file_operations jfs_file_operations;
50extern const struct inode_operations jfs_symlink_inode_operations; 50extern const struct inode_operations jfs_symlink_inode_operations;
51extern const struct inode_operations jfs_fast_symlink_inode_operations;
51extern const struct dentry_operations jfs_ci_dentry_operations; 52extern const struct dentry_operations jfs_ci_dentry_operations;
52#endif /* _H_JFS_INODE */ 53#endif /* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 335c4de6552d..c51af2a14516 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -70,6 +70,7 @@
70#include <linux/delay.h> 70#include <linux/delay.h>
71#include <linux/mutex.h> 71#include <linux/mutex.h>
72#include <linux/seq_file.h> 72#include <linux/seq_file.h>
73#include <linux/slab.h>
73#include "jfs_incore.h" 74#include "jfs_incore.h"
74#include "jfs_filsys.h" 75#include "jfs_filsys.h"
75#include "jfs_metapage.h" 76#include "jfs_metapage.h"
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 07b6c5dfb4b6..48b44bd8267b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/bio.h> 23#include <linux/bio.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
26#include <linux/mempool.h> 27#include <linux/mempool.h>
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 3fbb3a225590..8f0f02cb6ca6 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -19,6 +19,7 @@
19#ifndef _H_JFS_UNICODE 19#ifndef _H_JFS_UNICODE
20#define _H_JFS_UNICODE 20#define _H_JFS_UNICODE
21 21
22#include <linux/slab.h>
22#include <asm/byteorder.h> 23#include <asm/byteorder.h>
23#include "jfs_types.h" 24#include "jfs_types.h"
24 25
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4a3e9f39c21d..a9cf8e8675be 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -956,7 +956,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
956 */ 956 */
957 957
958 if (ssize <= IDATASIZE) { 958 if (ssize <= IDATASIZE) {
959 ip->i_op = &jfs_symlink_inode_operations; 959 ip->i_op = &jfs_fast_symlink_inode_operations;
960 960
961 i_fastsymlink = JFS_IP(ip)->i_inline; 961 i_fastsymlink = JFS_IP(ip)->i_inline;
962 memcpy(i_fastsymlink, name, ssize); 962 memcpy(i_fastsymlink, name, ssize);
@@ -978,7 +978,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
978 else { 978 else {
979 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); 979 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
980 980
981 ip->i_op = &page_symlink_inode_operations; 981 ip->i_op = &jfs_symlink_inode_operations;
982 ip->i_mapping->a_ops = &jfs_aops; 982 ip->i_mapping->a_ops = &jfs_aops;
983 983
984 /* 984 /*
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 7f24a0bb08ca..1aba0039f1c9 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -81,6 +81,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
81 struct inode *iplist[1]; 81 struct inode *iplist[1];
82 struct jfs_superblock *j_sb, *j_sb2; 82 struct jfs_superblock *j_sb, *j_sb2;
83 uint old_agsize; 83 uint old_agsize;
84 int agsizechanged = 0;
84 struct buffer_head *bh, *bh2; 85 struct buffer_head *bh, *bh2;
85 86
86 /* If the volume hasn't grown, get out now */ 87 /* If the volume hasn't grown, get out now */
@@ -333,6 +334,9 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
333 */ 334 */
334 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) 335 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
335 goto error_out; 336 goto error_out;
337
338 agsizechanged |= (bmp->db_agsize != old_agsize);
339
336 /* 340 /*
337 * the map now has extended to cover additional nblocks: 341 * the map now has extended to cover additional nblocks:
338 * dn_mapsize = oldMapsize + nblocks; 342 * dn_mapsize = oldMapsize + nblocks;
@@ -432,7 +436,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
432 * will correctly identify the new ag); 436 * will correctly identify the new ag);
433 */ 437 */
434 /* if new AG size the same as old AG size, done! */ 438 /* if new AG size the same as old AG size, done! */
435 if (bmp->db_agsize != old_agsize) { 439 if (agsizechanged) {
436 if ((rc = diExtendFS(ipimap, ipbmap))) 440 if ((rc = diExtendFS(ipimap, ipbmap)))
437 goto error_out; 441 goto error_out;
438 442
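
The resize fix latches the result of the size comparison into agsizechanged immediately after dbExtendFS() returns, rather than comparing bmp->db_agsize against old_agsize once at the end; jfs_extendfs() can extend the block map in more than one pass, and by the time of the final check db_agsize may again equal the stale old_agsize even though it changed along the way. The OR-latch idiom, reduced to its core:

    /* Sketch of the latch: once set, later passes cannot clear it. */
    int changed = 0;

    changed |= (bmp->db_agsize != old_agsize);  /* after each extend */
    /* ... further extension passes ... */
    if (changed)
            rc = diExtendFS(ipimap, ipbmap);    /* rebuild imap AGs  */
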
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 266699deb1c6..b66832ac33ac 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -30,6 +30,7 @@
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 31#include <linux/exportfs.h>
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <linux/slab.h>
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <linux/seq_file.h> 35#include <linux/seq_file.h>
35#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
@@ -445,10 +446,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
445 /* initialize the mount flag and determine the default error handler */ 446 /* initialize the mount flag and determine the default error handler */
446 flag = JFS_ERR_REMOUNT_RO; 447 flag = JFS_ERR_REMOUNT_RO;
447 448
448 if (!parse_options((char *) data, sb, &newLVSize, &flag)) { 449 if (!parse_options((char *) data, sb, &newLVSize, &flag))
449 kfree(sbi); 450 goto out_kfree;
450 return -EINVAL;
451 }
452 sbi->flag = flag; 451 sbi->flag = flag;
453 452
454#ifdef CONFIG_JFS_POSIX_ACL 453#ifdef CONFIG_JFS_POSIX_ACL
@@ -457,7 +456,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
457 456
458 if (newLVSize) { 457 if (newLVSize) {
459 printk(KERN_ERR "resize option for remount only\n"); 458 printk(KERN_ERR "resize option for remount only\n");
460 return -EINVAL; 459 goto out_kfree;
461 } 460 }
462 461
463 /* 462 /*
@@ -477,7 +476,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
477 inode = new_inode(sb); 476 inode = new_inode(sb);
478 if (inode == NULL) { 477 if (inode == NULL) {
479 ret = -ENOMEM; 478 ret = -ENOMEM;
480 goto out_kfree; 479 goto out_unload;
481 } 480 }
482 inode->i_ino = 0; 481 inode->i_ino = 0;
483 inode->i_nlink = 1; 482 inode->i_nlink = 1;
@@ -549,9 +548,10 @@ out_mount_failed:
549 make_bad_inode(sbi->direct_inode); 548 make_bad_inode(sbi->direct_inode);
550 iput(sbi->direct_inode); 549 iput(sbi->direct_inode);
551 sbi->direct_inode = NULL; 550 sbi->direct_inode = NULL;
552out_kfree: 551out_unload:
553 if (sbi->nls_tab) 552 if (sbi->nls_tab)
554 unload_nls(sbi->nls_tab); 553 unload_nls(sbi->nls_tab);
554out_kfree:
555 kfree(sbi); 555 kfree(sbi);
556 return ret; 556 return ret;
557} 557}
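
The jfs_fill_super() rework is a textbook goto-ladder repair: the early failures (option parsing and the remount-only resize check) used to return -EINVAL directly and leak sbi, while the old out_kfree label unloaded an NLS table on paths that had never loaded one. After the patch each label undoes exactly one acquisition, in reverse order. The general shape, as a sketch with the surrounding steps elided:

    ret = -EINVAL;
    if (!parse_options((char *)data, sb, &newLVSize, &flag))
            goto out_kfree;         /* nothing else held yet      */
    if (newLVSize)
            goto out_kfree;         /* resize is remount-only     */
    /* ... NLS table loaded here ... */
    inode = new_inode(sb);
    if (inode == NULL) {
            ret = -ENOMEM;
            goto out_unload;        /* NLS table must be dropped  */
    }
    return 0;
    out_unload:
            if (sbi->nls_tab)
                    unload_nls(sbi->nls_tab);
    out_kfree:
            kfree(sbi);
            return ret;
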
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 4af1a05aad0a..205b946d8e0d 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -29,9 +29,21 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
29 return NULL; 29 return NULL;
30} 30}
31 31
32const struct inode_operations jfs_symlink_inode_operations = { 32const struct inode_operations jfs_fast_symlink_inode_operations = {
33 .readlink = generic_readlink, 33 .readlink = generic_readlink,
34 .follow_link = jfs_follow_link, 34 .follow_link = jfs_follow_link,
35 .setattr = jfs_setattr,
36 .setxattr = jfs_setxattr,
37 .getxattr = jfs_getxattr,
38 .listxattr = jfs_listxattr,
39 .removexattr = jfs_removexattr,
40};
41
42const struct inode_operations jfs_symlink_inode_operations = {
43 .readlink = generic_readlink,
44 .follow_link = page_follow_link_light,
45 .put_link = page_put_link,
46 .setattr = jfs_setattr,
35 .setxattr = jfs_setxattr, 47 .setxattr = jfs_setxattr,
36 .getxattr = jfs_getxattr, 48 .getxattr = jfs_getxattr,
37 .listxattr = jfs_listxattr, 49 .listxattr = jfs_listxattr,
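
The symlink split leaves JFS with two op tables: jfs_fast_symlink_inode_operations for targets short enough to live in the inode's inline area (the ssize <= IDATASIZE branch in the namei.c hunk above), and jfs_symlink_inode_operations for extent-backed targets, which now go through page_follow_link_light()/page_put_link() while keeping the JFS setattr and xattr methods that the generic page_symlink_inode_operations table lacked. For the fast case, the follow_link shown truncated above presumably amounts to handing namei the inline buffer; a sketch consistent with the "return NULL" context line:

    static void *jfs_follow_link(struct dentry *dentry,
                                 struct nameidata *nd)
    {
            char *s = JFS_IP(dentry->d_inode)->i_inline;

            nd_set_link(nd, s);  /* target lives in the inode itself */
            return NULL;         /* nothing for put_link to release  */
    }
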
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 1f594ab21895..fa96bbb26343 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
24#include <linux/slab.h>
24#include <linux/quotaops.h> 25#include <linux/quotaops.h>
25#include <linux/security.h> 26#include <linux/security.h>
26#include "jfs_incore.h" 27#include "jfs_incore.h"
diff --git a/fs/libfs.c b/fs/libfs.c
index 9e50bcf55857..232bea425b09 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/slab.h>
8#include <linux/mount.h> 9#include <linux/mount.h>
9#include <linux/vfs.h> 10#include <linux/vfs.h>
10#include <linux/mutex.h> 11#include <linux/mutex.h>
@@ -546,6 +547,40 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
546} 547}
547 548
548/** 549/**
550 * simple_write_to_buffer - copy data from user space to the buffer
551 * @to: the buffer to write to
552 * @available: the size of the buffer
553 * @ppos: the current position in the buffer
554 * @from: the user space buffer to read from
555 * @count: the maximum number of bytes to read
556 *
557 * The simple_write_to_buffer() function reads up to @count bytes from the user
558 * space address starting at @from into the buffer @to at offset @ppos.
559 *
560 * On success, the number of bytes written is returned and the offset @ppos is
 561 * advanced by this number, or a negative value is returned on error.
562 **/
563ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
564 const void __user *from, size_t count)
565{
566 loff_t pos = *ppos;
567 size_t res;
568
569 if (pos < 0)
570 return -EINVAL;
571 if (pos >= available || !count)
572 return 0;
573 if (count > available - pos)
574 count = available - pos;
575 res = copy_from_user(to + pos, from, count);
576 if (res == count)
577 return -EFAULT;
578 count -= res;
579 *ppos = pos + count;
580 return count;
581}
582
583/**
549 * memory_read_from_buffer - copy data from the buffer 584 * memory_read_from_buffer - copy data from the buffer
550 * @to: the kernel space buffer to read to 585 * @to: the kernel space buffer to read to
551 * @count: the maximum number of bytes to read 586 * @count: the maximum number of bytes to read
@@ -863,6 +898,7 @@ EXPORT_SYMBOL(simple_statfs);
863EXPORT_SYMBOL(simple_sync_file); 898EXPORT_SYMBOL(simple_sync_file);
864EXPORT_SYMBOL(simple_unlink); 899EXPORT_SYMBOL(simple_unlink);
865EXPORT_SYMBOL(simple_read_from_buffer); 900EXPORT_SYMBOL(simple_read_from_buffer);
901EXPORT_SYMBOL(simple_write_to_buffer);
866EXPORT_SYMBOL(memory_read_from_buffer); 902EXPORT_SYMBOL(memory_read_from_buffer);
867EXPORT_SYMBOL(simple_transaction_set); 903EXPORT_SYMBOL(simple_transaction_set);
868EXPORT_SYMBOL(simple_transaction_get); 904EXPORT_SYMBOL(simple_transaction_get);
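
simple_read_from_buffer() already covered the read direction; the new helper is its mirror image, folding the bounds check, copy_from_user() and *ppos bookkeeping into one call. A minimal sketch of a write handler built on it; the buffer and its size are assumptions of the example, not part of this patch:

    static char msg[64];

    static ssize_t msg_write(struct file *file, const char __user *from,
                             size_t count, loff_t *ppos)
    {
            /* Consumes at most sizeof(msg) - *ppos bytes, advances
             * *ppos, and returns the bytes written, 0 at end of
             * buffer, or -EFAULT. */
            return simple_write_to_buffer(msg, sizeof(msg), ppos,
                                          from, count);
    }
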
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index fc9032dc8862..64fd427c993c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/slab.h>
11#include <linux/time.h> 12#include <linux/time.h>
12#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
13#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c81249fef11f..7932c399fab4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/smp_lock.h> 10#include <linux/smp_lock.h>
11#include <linux/slab.h>
11#include <linux/types.h> 12#include <linux/types.h>
12#include <linux/errno.h> 13#include <linux/errno.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index fefa4df3f005..e3015464fbab 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/ktime.h> 12#include <linux/ktime.h>
13#include <linux/slab.h>
13 14
14#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/xprtsock.h> 16#include <linux/sunrpc/xprtsock.h>
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 7d150517ddf0..f1bacf1a0391 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -21,7 +21,6 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/uio.h> 23#include <linux/uio.h>
24#include <linux/slab.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
26#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
27#include <linux/mutex.h> 26#include <linux/mutex.h>
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a7966eed3c17..031c6569a134 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d1001790fa9a..84055d31bfc5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/slab.h>
24#include <linux/errno.h> 25#include <linux/errno.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 56c9519d900a..0f2ab741ae7c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index ad478da7ca63..d0ef94cfb3da 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/in.h> 12#include <linux/in.h>
13#include <linux/slab.h>
13#include <linux/mutex.h> 14#include <linux/mutex.h>
14#include <linux/sunrpc/svc.h> 15#include <linux/sunrpc/svc.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
diff --git a/fs/locks.c b/fs/locks.c
index ae9ded026b7c..ab24d49fc048 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1455,7 +1455,7 @@ EXPORT_SYMBOL(generic_setlease);
1455 * leases held by processes on this node. 1455 * leases held by processes on this node.
1456 * 1456 *
1457 * There is also no break_lease method; filesystems that 1457 * There is also no break_lease method; filesystems that
1458 * handle their own leases shoud break leases themselves from the 1458 * handle their own leases should break leases themselves from the
1459 * filesystem's open, create, and (on truncate) setattr methods. 1459 * filesystem's open, create, and (on truncate) setattr methods.
1460 * 1460 *
1461 * Warning: the only current setlease methods exist only to disable 1461 * Warning: the only current setlease methods exist only to disable
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9718c22f186d..9bd2ce2a3040 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -9,6 +9,7 @@
9#include <linux/bio.h> 9#include <linux/bio.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/gfp.h>
12 13
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) 14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14 15
@@ -80,6 +81,7 @@ static void writeseg_end_io(struct bio *bio, int err)
80 prefetchw(&bvec->bv_page->flags); 81 prefetchw(&bvec->bv_page->flags);
81 82
82 end_page_writeback(page); 83 end_page_writeback(page);
84 page_cache_release(page);
83 } while (bvec >= bio->bi_io_vec); 85 } while (bvec >= bio->bi_io_vec);
84 bio_put(bio); 86 bio_put(bio);
85 if (atomic_dec_and_test(&super->s_pending_writes)) 87 if (atomic_dec_and_test(&super->s_pending_writes))
@@ -97,8 +99,10 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
97 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); 99 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
98 int i; 100 int i;
99 101
102 if (max_pages > BIO_MAX_PAGES)
103 max_pages = BIO_MAX_PAGES;
100 bio = bio_alloc(GFP_NOFS, max_pages); 104 bio = bio_alloc(GFP_NOFS, max_pages);
101 BUG_ON(!bio); /* FIXME: handle this */ 105 BUG_ON(!bio);
102 106
103 for (i = 0; i < nr_pages; i++) { 107 for (i = 0; i < nr_pages; i++) {
104 if (i >= max_pages) { 108 if (i >= max_pages) {
@@ -191,8 +195,10 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
191 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); 195 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
192 int i; 196 int i;
193 197
198 if (max_pages > BIO_MAX_PAGES)
199 max_pages = BIO_MAX_PAGES;
194 bio = bio_alloc(GFP_NOFS, max_pages); 200 bio = bio_alloc(GFP_NOFS, max_pages);
195 BUG_ON(!bio); /* FIXME: handle this */ 201 BUG_ON(!bio);
196 202
197 for (i = 0; i < nr_pages; i++) { 203 for (i = 0; i < nr_pages; i++) {
198 if (i >= max_pages) { 204 if (i >= max_pages) {
@@ -297,6 +303,11 @@ static void bdev_put_device(struct super_block *sb)
297 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); 303 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
298} 304}
299 305
306static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
307{
308 return 0;
309}
310
300static const struct logfs_device_ops bd_devops = { 311static const struct logfs_device_ops bd_devops = {
301 .find_first_sb = bdev_find_first_sb, 312 .find_first_sb = bdev_find_first_sb,
302 .find_last_sb = bdev_find_last_sb, 313 .find_last_sb = bdev_find_last_sb,
@@ -304,6 +315,7 @@ static const struct logfs_device_ops bd_devops = {
304 .readpage = bdev_readpage, 315 .readpage = bdev_readpage,
305 .writeseg = bdev_writeseg, 316 .writeseg = bdev_writeseg,
306 .erase = bdev_erase, 317 .erase = bdev_erase,
318 .can_write_buf = bdev_can_write_buf,
307 .sync = bdev_sync, 319 .sync = bdev_sync,
308 .put_device = bdev_put_device, 320 .put_device = bdev_put_device,
309}; 321};
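
Both __bdev_writeseg() and do_erase() size their bio from the queue's max hw sectors, which can exceed what bio_alloc() supports: it returns NULL when asked for more than BIO_MAX_PAGES vector entries, and the old FIXME'd BUG_ON would then fire. With the count clamped, a GFP_NOFS allocation from the bio mempool may block but should not fail, so the remaining BUG_ON is effectively an assertion. The same clamp, written with the kernel's min_t helper:

    max_pages = min_t(unsigned int, max_pages, BIO_MAX_PAGES);
    bio = bio_alloc(GFP_NOFS, max_pages);  /* mempool-backed; may
                                              block, won't overflow */
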
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index cafb6ef2e05b..a85d47d13e4b 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -9,6 +9,7 @@
9#include <linux/completion.h> 9#include <linux/completion.h>
10#include <linux/mount.h> 10#include <linux/mount.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/slab.h>
12 13
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) 14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14 15
@@ -126,7 +127,8 @@ static int mtd_readpage(void *_sb, struct page *page)
126 127
127 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, 128 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
128 page_address(page)); 129 page_address(page));
129 if (err == -EUCLEAN) { 130 if (err == -EUCLEAN || err == -EBADMSG) {
131 /* -EBADMSG happens regularly on power failures */
130 err = 0; 132 err = 0;
131 /* FIXME: force GC this segment */ 133 /* FIXME: force GC this segment */
132 } 134 }
@@ -233,12 +235,32 @@ static void mtd_put_device(struct super_block *sb)
233 put_mtd_device(logfs_super(sb)->s_mtd); 235 put_mtd_device(logfs_super(sb)->s_mtd);
234} 236}
235 237
238static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
239{
240 struct logfs_super *super = logfs_super(sb);
241 void *buf;
242 int err;
243
244 buf = kmalloc(super->s_writesize, GFP_KERNEL);
245 if (!buf)
246 return -ENOMEM;
247 err = mtd_read(sb, ofs, super->s_writesize, buf);
248 if (err)
249 goto out;
250 if (memchr_inv(buf, 0xff, super->s_writesize))
251 err = -EIO;
252 kfree(buf);
253out:
254 return err;
255}
256
236static const struct logfs_device_ops mtd_devops = { 257static const struct logfs_device_ops mtd_devops = {
237 .find_first_sb = mtd_find_first_sb, 258 .find_first_sb = mtd_find_first_sb,
238 .find_last_sb = mtd_find_last_sb, 259 .find_last_sb = mtd_find_last_sb,
239 .readpage = mtd_readpage, 260 .readpage = mtd_readpage,
240 .writeseg = mtd_writeseg, 261 .writeseg = mtd_writeseg,
241 .erase = mtd_erase, 262 .erase = mtd_erase,
263 .can_write_buf = mtd_can_write_buf,
242 .sync = mtd_sync, 264 .sync = mtd_sync,
243 .put_device = mtd_put_device, 265 .put_device = mtd_put_device,
244}; 266};
@@ -250,5 +272,7 @@ int logfs_get_sb_mtd(struct file_system_type *type, int flags,
250 const struct logfs_device_ops *devops = &mtd_devops; 272 const struct logfs_device_ops *devops = &mtd_devops;
251 273
252 mtd = get_mtd_device(NULL, mtdnr); 274 mtd = get_mtd_device(NULL, mtdnr);
275 if (IS_ERR(mtd))
276 return PTR_ERR(mtd);
253 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); 277 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
254} 278}
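
mtd_can_write_buf() decides whether the pending write buffer can still be flushed at ofs: it reads back one write-size unit and requires every byte to read as 0xff, because erased flash reads all-ones and cells can only be programmed (bits cleared) between erases. Condensed to the key test:

    /* memchr_inv(p, c, n) returns the first byte of p differing
     * from c, or NULL if all n bytes match. */
    err = mtd_read(sb, ofs, super->s_writesize, buf);
    if (!err && memchr_inv(buf, 0xff, super->s_writesize))
            err = -EIO;   /* no longer erased: wbuf cannot be flushed */
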
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 56a8bfbb0120..72d1893ddd36 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -6,13 +6,13 @@
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> 6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9 9#include <linux/slab.h>
10 10
11/* 11/*
12 * Atomic dir operations 12 * Atomic dir operations
13 * 13 *
14 * Directory operations are by default not atomic. Dentries and Inodes are 14 * Directory operations are by default not atomic. Dentries and Inodes are
15 * created/removed/altered in seperate operations. Therefore we need to do 15 * created/removed/altered in separate operations. Therefore we need to do
16 * a small amount of journaling. 16 * a small amount of journaling.
17 * 17 *
18 * Create, link, mkdir, mknod and symlink all share the same function to do 18 * Create, link, mkdir, mknod and symlink all share the same function to do
@@ -303,12 +303,12 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
303 (filler_t *)logfs_readpage, NULL); 303 (filler_t *)logfs_readpage, NULL);
304 if (IS_ERR(page)) 304 if (IS_ERR(page))
305 return PTR_ERR(page); 305 return PTR_ERR(page);
306 dd = kmap_atomic(page, KM_USER0); 306 dd = kmap(page);
307 BUG_ON(dd->namelen == 0); 307 BUG_ON(dd->namelen == 0);
308 308
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), 309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
310 pos, be64_to_cpu(dd->ino), dd->type); 310 pos, be64_to_cpu(dd->ino), dd->type);
311 kunmap_atomic(dd, KM_USER0); 311 kunmap(page);
312 page_cache_release(page); 312 page_cache_release(page);
313 if (full) 313 if (full)
314 break; 314 break;
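
The readdir switch from kmap_atomic()/KM_USER0 to plain kmap() is a sleeping-context fix: filldir() copies the name out to a user buffer and can fault, hence sleep, which is forbidden while an atomic kmap is held. kmap() keeps the mapping valid across a sleep, at the cost of a global lock. The corrected shape, with the hazard annotated:

    dd = kmap(page);            /* may sleep; mapping persists       */
    full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
                   pos, be64_to_cpu(dd->ino), dd->type); /* may fault */
    kunmap(page);               /* note: takes the page, not the ptr */
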
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 370f367a933e..0de524071870 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -161,7 +161,17 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
161 161
162static void logfs_invalidatepage(struct page *page, unsigned long offset) 162static void logfs_invalidatepage(struct page *page, unsigned long offset)
163{ 163{
164 move_page_to_btree(page); 164 struct logfs_block *block = logfs_block(page);
165
166 if (block->reserved_bytes) {
167 struct super_block *sb = page->mapping->host->i_sb;
168 struct logfs_super *super = logfs_super(sb);
169
170 super->s_dirty_pages -= block->reserved_bytes;
171 block->ops->free_block(sb, block);
172 BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR));
173 } else
174 move_page_to_btree(page);
165 BUG_ON(PagePrivate(page) || page->private); 175 BUG_ON(PagePrivate(page) || page->private);
166} 176}
167 177
@@ -212,10 +222,8 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
212int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) 222int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
213{ 223{
214 struct super_block *sb = dentry->d_inode->i_sb; 224 struct super_block *sb = dentry->d_inode->i_sb;
215 struct logfs_super *super = logfs_super(sb);
216 225
217 /* FIXME: write anchor */ 226 logfs_write_anchor(sb);
218 super->s_devops->sync(sb);
219 return 0; 227 return 0;
220} 228}
221 229
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index 92949f95a901..caa4419285dc 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/slab.h>
10 11
11/* 12/*
12 * Wear leveling needs to kick in when the difference between low erase 13 * Wear leveling needs to kick in when the difference between low erase
@@ -121,7 +122,7 @@ static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
121 logfs_safe_iput(inode, cookie); 122 logfs_safe_iput(inode, cookie);
122} 123}
123 124
124static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist) 125static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
125{ 126{
126 struct logfs_super *super = logfs_super(sb); 127 struct logfs_super *super = logfs_super(sb);
127 struct logfs_segment_header sh; 128 struct logfs_segment_header sh;
@@ -400,7 +401,7 @@ static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
400 segno, (u64)segno << super->s_segshift, 401 segno, (u64)segno << super->s_segshift,
401 dist, no_free_segments(sb), valid, 402 dist, no_free_segments(sb), valid,
402 super->s_free_bytes); 403 super->s_free_bytes);
403 cleaned = logfs_gc_segment(sb, segno, dist); 404 cleaned = logfs_gc_segment(sb, segno);
404 log_gc("GC segment #%02x complete - now %x valid\n", segno, 405 log_gc("GC segment #%02x complete - now %x valid\n", segno,
405 valid - cleaned); 406 valid - cleaned);
406 BUG_ON(cleaned != valid); 407 BUG_ON(cleaned != valid);
@@ -458,6 +459,14 @@ static void __logfs_gc_pass(struct super_block *sb, int target)
458 struct logfs_block *block; 459 struct logfs_block *block;
459 int round, progress, last_progress = 0; 460 int round, progress, last_progress = 0;
460 461
462 /*
463 * Doing too many changes to the segfile at once would result
464 * in a large number of aliases. Write the journal before
465 * things get out of hand.
466 */
467 if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
468 logfs_write_anchor(sb);
469
461 if (no_free_segments(sb) >= target && 470 if (no_free_segments(sb) >= target &&
462 super->s_no_object_aliases < MAX_OBJ_ALIASES) 471 super->s_no_object_aliases < MAX_OBJ_ALIASES)
463 return; 472 return;
@@ -623,38 +632,31 @@ static int check_area(struct super_block *sb, int i)
623{ 632{
624 struct logfs_super *super = logfs_super(sb); 633 struct logfs_super *super = logfs_super(sb);
625 struct logfs_area *area = super->s_area[i]; 634 struct logfs_area *area = super->s_area[i];
626 struct logfs_object_header oh; 635 gc_level_t gc_level;
636 u32 cleaned, valid, ec;
627 u32 segno = area->a_segno; 637 u32 segno = area->a_segno;
628 u32 ofs = area->a_used_bytes; 638 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
629 __be32 crc;
630 int err;
631 639
632 if (!area->a_is_open) 640 if (!area->a_is_open)
633 return 0; 641 return 0;
634 642
635 for (ofs = area->a_used_bytes; 643 if (super->s_devops->can_write_buf(sb, ofs) == 0)
636 ofs <= super->s_segsize - sizeof(oh); 644 return 0;
637 ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
638 err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
639 if (err)
640 return err;
641
642 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
643 break;
644 645
645 crc = logfs_crc32(&oh, sizeof(oh) - 4, 4); 646 printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
646 if (crc != oh.crc) { 647 /*
647 printk(KERN_INFO "interrupted header at %llx\n", 648 * The device cannot write back the write buffer. Most likely the
648 dev_ofs(sb, segno, ofs)); 649 * wbuf was already written out and the system crashed at some point
649 return 0; 650 * before the journal commit happened. In that case we wouldn't have
650 } 651 * to do anything. But if the crash happened before the wbuf was
651 } 652 * written out correctly, we must GC this segment. So assume the
652 if (ofs != area->a_used_bytes) { 653 * worst and always do the GC run.
653 printk(KERN_INFO "%x bytes unaccounted data found at %llx\n", 654 */
654 ofs - area->a_used_bytes, 655 area->a_is_open = 0;
655 dev_ofs(sb, segno, area->a_used_bytes)); 656 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
656 area->a_used_bytes = ofs; 657 cleaned = logfs_gc_segment(sb, segno);
657 } 658 if (cleaned != valid)
659 return -EIO;
658 return 0; 660 return 0;
659} 661}
660 662
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 33ec1aeaeec4..755a92e8daa7 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -6,6 +6,7 @@
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> 6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9#include <linux/slab.h>
9#include <linux/writeback.h> 10#include <linux/writeback.h>
10#include <linux/backing-dev.h> 11#include <linux/backing-dev.h>
11 12
@@ -192,6 +193,7 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode)
192 inode->i_ctime = CURRENT_TIME; 193 inode->i_ctime = CURRENT_TIME;
193 inode->i_mtime = CURRENT_TIME; 194 inode->i_mtime = CURRENT_TIME;
194 inode->i_nlink = 1; 195 inode->i_nlink = 1;
196 li->li_refcount = 1;
195 INIT_LIST_HEAD(&li->li_freeing_list); 197 INIT_LIST_HEAD(&li->li_freeing_list);
196 198
197 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) 199 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
@@ -325,7 +327,7 @@ static void logfs_set_ino_generation(struct super_block *sb,
325 u64 ino; 327 u64 ino;
326 328
327 mutex_lock(&super->s_journal_mutex); 329 mutex_lock(&super->s_journal_mutex);
328 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino); 330 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1);
329 super->s_last_ino = ino; 331 super->s_last_ino = ino;
330 super->s_inos_till_wrap--; 332 super->s_inos_till_wrap--;
331 if (super->s_inos_till_wrap < 0) { 333 if (super->s_inos_till_wrap < 0) {
@@ -385,8 +387,7 @@ static void logfs_init_once(void *_li)
385 387
386static int logfs_sync_fs(struct super_block *sb, int wait) 388static int logfs_sync_fs(struct super_block *sb, int wait)
387{ 389{
388 /* FIXME: write anchor */ 390 logfs_write_anchor(sb);
389 logfs_super(sb)->s_devops->sync(sb);
390 return 0; 391 return 0;
391} 392}
392 393
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 6ad30a4c9052..4b0e0616b357 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -6,6 +6,7 @@
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> 6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */ 7 */
8#include "logfs.h" 8#include "logfs.h"
9#include <linux/slab.h>
9 10
10static void logfs_calc_free(struct super_block *sb) 11static void logfs_calc_free(struct super_block *sb)
11{ 12{
@@ -131,10 +132,9 @@ static int read_area(struct super_block *sb, struct logfs_je_area *a)
131 132
132 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); 133 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
133 if (super->s_writesize > 1) 134 if (super->s_writesize > 1)
134 logfs_buf_recover(area, ofs, a + 1, super->s_writesize); 135 return logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
135 else 136 else
136 logfs_buf_recover(area, ofs, NULL, 0); 137 return logfs_buf_recover(area, ofs, NULL, 0);
137 return 0;
138} 138}
139 139
140static void *unpack(void *from, void *to) 140static void *unpack(void *from, void *to)
@@ -244,7 +244,7 @@ static int read_je(struct super_block *sb, u64 ofs)
244 read_erasecount(sb, unpack(jh, scratch)); 244 read_erasecount(sb, unpack(jh, scratch));
245 break; 245 break;
246 case JE_AREA: 246 case JE_AREA:
247 read_area(sb, unpack(jh, scratch)); 247 err = read_area(sb, unpack(jh, scratch));
248 break; 248 break;
249 case JE_OBJ_ALIAS: 249 case JE_OBJ_ALIAS:
250 err = logfs_load_object_aliases(sb, unpack(jh, scratch), 250 err = logfs_load_object_aliases(sb, unpack(jh, scratch),
@@ -388,7 +388,10 @@ static void journal_get_erase_count(struct logfs_area *area)
388static int journal_erase_segment(struct logfs_area *area) 388static int journal_erase_segment(struct logfs_area *area)
389{ 389{
390 struct super_block *sb = area->a_sb; 390 struct super_block *sb = area->a_sb;
391 struct logfs_segment_header sh; 391 union {
392 struct logfs_segment_header sh;
393 unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
394 } u;
392 u64 ofs; 395 u64 ofs;
393 int err; 396 int err;
394 397
@@ -396,20 +399,21 @@ static int journal_erase_segment(struct logfs_area *area)
396 if (err) 399 if (err)
397 return err; 400 return err;
398 401
399 sh.pad = 0; 402 memset(&u, 0, sizeof(u));
400 sh.type = SEG_JOURNAL; 403 u.sh.pad = 0;
401 sh.level = 0; 404 u.sh.type = SEG_JOURNAL;
402 sh.segno = cpu_to_be32(area->a_segno); 405 u.sh.level = 0;
403 sh.ec = cpu_to_be32(area->a_erase_count); 406 u.sh.segno = cpu_to_be32(area->a_segno);
404 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); 407 u.sh.ec = cpu_to_be32(area->a_erase_count);
405 sh.crc = logfs_crc32(&sh, sizeof(sh), 4); 408 u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
409 u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);
406 410
407 /* This causes a bug in segment.c. Not yet. */ 411 /* This causes a bug in segment.c. Not yet. */
408 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); 412 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
409 413
410 ofs = dev_ofs(sb, area->a_segno, 0); 414 ofs = dev_ofs(sb, area->a_segno, 0);
411 area->a_used_bytes = ALIGN(sizeof(sh), 16); 415 area->a_used_bytes = sizeof(u);
412 logfs_buf_write(area, ofs, &sh, sizeof(sh)); 416 logfs_buf_write(area, ofs, &u, sizeof(u));
413 return 0; 417 return 0;
414} 418}
415 419
@@ -493,6 +497,8 @@ static void account_shadows(struct super_block *sb)
493 497
494 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); 498 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
495 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); 499 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
500 btree_grim_visitor32(&tree->segment_map, 0, NULL);
501 tree->no_shadowed_segments = 0;
496 502
497 if (li->li_block) { 503 if (li->li_block) {
498 /* 504 /*
@@ -606,9 +612,9 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
606 if (len == 0) 612 if (len == 0)
607 return logfs_write_header(super, header, 0, type); 613 return logfs_write_header(super, header, 0, type);
608 614
615 BUG_ON(len > sb->s_blocksize);
609 compr_len = logfs_compress(buf, data, len, sb->s_blocksize); 616 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
610 if (compr_len < 0 || type == JE_ANCHOR) { 617 if (compr_len < 0 || type == JE_ANCHOR) {
611 BUG_ON(len > sb->s_blocksize);
612 memcpy(data, buf, len); 618 memcpy(data, buf, len);
613 compr_len = len; 619 compr_len = len;
614 compr = COMPR_NONE; 620 compr = COMPR_NONE;
@@ -660,6 +666,7 @@ static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
660 if (ofs < 0) 666 if (ofs < 0)
661 return ofs; 667 return ofs;
662 logfs_buf_write(area, ofs, super->s_compressed_je, len); 668 logfs_buf_write(area, ofs, super->s_compressed_je, len);
669 BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
663 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); 670 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
664 return 0; 671 return 0;
665} 672}
@@ -800,6 +807,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
800{ 807{
801 struct logfs_super *super = logfs_super(sb); 808 struct logfs_super *super = logfs_super(sb);
802 struct logfs_area *area = super->s_journal_area; 809 struct logfs_area *area = super->s_journal_area;
810 struct btree_head32 *head = &super->s_reserved_segments;
803 u32 segno, ec; 811 u32 segno, ec;
804 int i, err; 812 int i, err;
805 813
@@ -807,6 +815,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
807 /* Drop old segments */ 815 /* Drop old segments */
808 journal_for_each(i) 816 journal_for_each(i)
809 if (super->s_journal_seg[i]) { 817 if (super->s_journal_seg[i]) {
818 btree_remove32(head, super->s_journal_seg[i]);
810 logfs_set_segment_unreserved(sb, 819 logfs_set_segment_unreserved(sb,
811 super->s_journal_seg[i], 820 super->s_journal_seg[i],
812 super->s_journal_ec[i]); 821 super->s_journal_ec[i]);
@@ -819,8 +828,13 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
819 super->s_journal_seg[i] = segno; 828 super->s_journal_seg[i] = segno;
820 super->s_journal_ec[i] = ec; 829 super->s_journal_ec[i] = ec;
821 logfs_set_segment_reserved(sb, segno); 830 logfs_set_segment_reserved(sb, segno);
831 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
832 BUG_ON(err); /* mempool should prevent this */
833 err = logfs_erase_segment(sb, segno, 1);
834 BUG_ON(err); /* FIXME: remount-ro would be nicer */
822 } 835 }
823 /* Manually move journal_area */ 836 /* Manually move journal_area */
837 freeseg(sb, area->a_segno);
824 area->a_segno = super->s_journal_seg[0]; 838 area->a_segno = super->s_journal_seg[0];
825 area->a_is_open = 0; 839 area->a_is_open = 0;
826 area->a_used_bytes = 0; 840 area->a_used_bytes = 0;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 129779431373..1a9db84f8d8f 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -144,6 +144,7 @@ struct logfs_area_ops {
  * @erase: erase one segment
  * @read: read from the device
  * @erase: erase part of the device
+ * @can_write_buf: decide whether wbuf can be written to ofs
  */
 struct logfs_device_ops {
 	struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
@@ -153,6 +154,7 @@ struct logfs_device_ops {
 	void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
 	int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
 			int ensure_write);
+	int (*can_write_buf)(struct super_block *sb, u64 ofs);
 	void (*sync)(struct super_block *sb);
 	void (*put_device)(struct super_block *sb);
 };
@@ -257,10 +259,14 @@ struct logfs_shadow {
  * struct shadow_tree
  * @new: shadows where old_ofs==0, indexed by new_ofs
  * @old: shadows where old_ofs!=0, indexed by old_ofs
+ * @segment_map: bitfield of segments containing shadows
+ * @no_shadowed_segment: number of segments containing shadows
  */
 struct shadow_tree {
 	struct btree_head64 new;
 	struct btree_head64 old;
+	struct btree_head32 segment_map;
+	int no_shadowed_segments;
 };

 struct object_alias_item {
@@ -305,13 +311,14 @@ typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
 		level_t level, int child_no, __be64 val);
 struct logfs_block_ops {
 	void (*write_block)(struct logfs_block *block);
-	gc_level_t (*block_level)(struct logfs_block *block);
 	void (*free_block)(struct super_block *sb, struct logfs_block*block);
 	int (*write_alias)(struct super_block *sb,
 			struct logfs_block *block,
 			write_alias_t *write_one_alias);
 };

+#define MAX_JOURNAL_ENTRIES 256
+
 struct logfs_super {
 	struct mtd_info *s_mtd; /* underlying device */
 	struct block_device *s_bdev; /* underlying device */
@@ -378,7 +385,7 @@ struct logfs_super {
 	u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
 	u64 s_last_version;
 	struct logfs_area *s_journal_area; /* open journal segment */
-	__be64 s_je_array[64];
+	__be64 s_je_array[MAX_JOURNAL_ENTRIES];
 	int s_no_je;

 	int s_sum_index; /* for the 12 summaries */
@@ -389,6 +396,7 @@ struct logfs_super {
 	int s_lock_count;
 	mempool_t *s_block_pool; /* struct logfs_block pool */
 	mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
+	struct list_head s_writeback_list; /* writeback pages */
 	/*
 	 * Space accounting:
 	 * - s_used_bytes specifies space used to store valid data objects.
@@ -587,24 +595,25 @@ void move_page_to_btree(struct page *page);
 int logfs_init_mapping(struct super_block *sb);
 void logfs_sync_area(struct logfs_area *area);
 void logfs_sync_segments(struct super_block *sb);
+void freeseg(struct super_block *sb, u32 segno);

 /* area handling */
 int logfs_init_areas(struct super_block *sb);
 void logfs_cleanup_areas(struct super_block *sb);
 int logfs_open_area(struct logfs_area *area, size_t bytes);
-void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		int use_filler);

-static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
+static inline int logfs_buf_write(struct logfs_area *area, u64 ofs,
 		void *buf, size_t len)
 {
-	__logfs_buf_write(area, ofs, buf, len, 0);
+	return __logfs_buf_write(area, ofs, buf, len, 0);
 }

-static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
+static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
 		void *buf, size_t len)
 {
-	__logfs_buf_write(area, ofs, buf, len, 1);
+	return __logfs_buf_write(area, ofs, buf, len, 1);
 }

 /* super.c */
@@ -698,7 +707,7 @@ static inline gc_level_t expand_level(u64 ino, level_t __level)
 	u8 level = (__force u8)__level;

 	if (ino == LOGFS_INO_MASTER) {
-		/* ifile has seperate areas */
+		/* ifile has separate areas */
 		level += LOGFS_MAX_LEVELS;
 	}
 	return (__force gc_level_t)level;
@@ -721,4 +730,10 @@ static inline struct logfs_area *get_area(struct super_block *sb,
 	return logfs_super(sb)->s_area[(__force u8)gc_level];
 }

+static inline void logfs_mempool_destroy(mempool_t *pool)
+{
+	if (pool)
+		mempool_destroy(pool);
+}
+
 #endif
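The logfs_mempool_destroy() wrapper added above exists because several logfs unwind paths can run before every mempool has been created, and mempool_destroy() does not tolerate a NULL pool. A minimal sketch of the intended call pattern (example_init() is a hypothetical caller, not part of the patch):

static int example_init(struct logfs_super *super)
{
	/* Either allocation may fail and leave the other pointer NULL. */
	super->s_block_pool = mempool_create_kmalloc_pool(16,
			sizeof(struct logfs_block));
	super->s_shadow_pool = mempool_create_kmalloc_pool(16,
			sizeof(struct logfs_shadow));
	if (!super->s_block_pool || !super->s_shadow_pool) {
		/* NULL-safe: no need to test each pool before destroying. */
		logfs_mempool_destroy(super->s_block_pool);
		logfs_mempool_destroy(super->s_shadow_pool);
		return -ENOMEM;
	}
	return 0;
}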
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
index f674725663fe..ae960519c54a 100644
--- a/fs/logfs/logfs_abi.h
+++ b/fs/logfs/logfs_abi.h
@@ -50,9 +50,9 @@ static inline void check_##type(void) \
  * 12 - gc recycled blocks, long-lived data
  * 13 - replacement blocks, short-lived data
  *
- * Levels 1-11 are necessary for robust gc operations and help seperate
+ * Levels 1-11 are necessary for robust gc operations and help separate
  * short-lived metadata from longer-lived file data. In the future,
- * file data should get seperated into several segments based on simple
+ * file data should get separated into several segments based on simple
  * heuristics. Old data recycled during gc operation is expected to be
  * long-lived. New data is of uncertain life expectancy. New data
  * used to replace older blocks in existing files is expected to be
@@ -117,7 +117,7 @@ static inline void check_##type(void) \
 #define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)

 /*
- * LogFS needs to seperate data into levels. Each level is defined as the
+ * LogFS needs to separate data into levels. Each level is defined as the
  * maximal possible distance from the master inode (inode of the inode file).
  * Data blocks reside on level 0, 1x indirect block on level 1, etc.
  * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
@@ -204,7 +204,7 @@ SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
  * @ds_crc: crc32 of structure starting with the next field
  * @ds_ifile_levels: maximum number of levels for ifile
  * @ds_iblock_levels: maximum number of levels for regular files
- * @ds_data_levels: number of seperate levels for data
+ * @ds_data_levels: number of separate levels for data
  * @pad0: reserved, must be 0
  * @ds_feature_incompat: incompatible filesystem features
  * @ds_feature_ro_compat: read-only compatible filesystem features
@@ -456,7 +456,7 @@ enum logfs_vim {
  * @vim: life expectancy of data
  *
  * "Areas" are segments currently being used for writing. There is at least
- * one area per GC level. Several may be used to seperate long-living from
+ * one area per GC level. Several may be used to separate long-living from
  * short-living data. If an area with unknown vim is encountered, it can
  * simply be closed.
  * The write buffer immediately follow this header.
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 7a23b3e7c0a7..0718d112a1a5 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -18,6 +18,7 @@
  */
 #include "logfs.h"
 #include <linux/sched.h>
+#include <linux/slab.h>

 static u64 adjust_bix(u64 bix, level_t level)
 {
@@ -429,25 +430,6 @@ static void inode_write_block(struct logfs_block *block)
 	}
 }

-static gc_level_t inode_block_level(struct logfs_block *block)
-{
-	BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
-	return GC_LEVEL(LOGFS_MAX_LEVELS);
-}
-
-static gc_level_t indirect_block_level(struct logfs_block *block)
-{
-	struct page *page;
-	struct inode *inode;
-	u64 bix;
-	level_t level;
-
-	page = block->page;
-	inode = page->mapping->host;
-	logfs_unpack_index(page->index, &bix, &level);
-	return expand_level(inode->i_ino, level);
-}
-
 /*
  * This silences a false, yet annoying gcc warning. I hate it when my editor
  * jumps into bitops.h each time I recompile this file.
@@ -586,14 +568,12 @@ static void indirect_free_block(struct super_block *sb,

 static struct logfs_block_ops inode_block_ops = {
 	.write_block = inode_write_block,
-	.block_level = inode_block_level,
 	.free_block = inode_free_block,
 	.write_alias = inode_write_alias,
 };

 struct logfs_block_ops indirect_block_ops = {
 	.write_block = indirect_write_block,
-	.block_level = indirect_block_level,
 	.free_block = indirect_free_block,
 	.write_alias = indirect_write_alias,
 };
@@ -912,6 +892,8 @@ u64 logfs_seek_hole(struct inode *inode, u64 bix)
 		return bix;
 	else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
 		bix = maxbix(li->li_height);
+	else if (bix >= maxbix(li->li_height))
+		return bix;
 	else {
 		bix = seek_holedata_loop(inode, bix, 0);
 		if (bix < maxbix(li->li_height))
@@ -1113,17 +1095,25 @@ static int logfs_reserve_bytes(struct inode *inode, int bytes)
 int get_page_reserve(struct inode *inode, struct page *page)
 {
 	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_block *block = logfs_block(page);
 	int ret;

-	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
+	if (block && block->reserved_bytes)
 		return 0;

 	logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
-	ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
+	while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) &&
+			!list_empty(&super->s_writeback_list)) {
+		block = list_entry(super->s_writeback_list.next,
+				struct logfs_block, alias_list);
+		block->ops->write_block(block);
+	}
 	if (!ret) {
 		alloc_data_block(inode, page);
-		logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
+		block = logfs_block(page);
+		block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
 		super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
+		list_move_tail(&block->alias_list, &super->s_writeback_list);
 	}
 	logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
 	return ret;
@@ -1240,6 +1230,18 @@ static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
 	mempool_free(shadow, super->s_shadow_pool);
 }

+static void mark_segment(struct shadow_tree *tree, u32 segno)
+{
+	int err;
+
+	if (!btree_lookup32(&tree->segment_map, segno)) {
+		err = btree_insert32(&tree->segment_map, segno, (void *)1,
+				GFP_NOFS);
+		BUG_ON(err);
+		tree->no_shadowed_segments++;
+	}
+}
+
 /**
  * fill_shadow_tree - Propagate shadow tree changes due to a write
  * @inode: Inode owning the page
@@ -1287,6 +1289,8 @@ static void fill_shadow_tree(struct inode *inode, struct page *page,

 		super->s_dirty_used_bytes += shadow->new_len;
 		super->s_dirty_free_bytes += shadow->old_len;
+		mark_segment(tree, shadow->old_ofs >> super->s_segshift);
+		mark_segment(tree, shadow->new_ofs >> super->s_segshift);
 	}
 }

@@ -1594,7 +1598,6 @@ int logfs_delete(struct inode *inode, pgoff_t index,
 	return ret;
 }

-/* Rewrite cannot mark the inode dirty but has to write it immediatly. */
 int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
 		gc_level_t gc_level, long flags)
 {
@@ -1611,6 +1614,18 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
 		if (level != 0)
 			alloc_indirect_block(inode, page, 0);
 		err = logfs_write_buf(inode, page, flags);
+		if (!err && shrink_level(gc_level) == 0) {
+			/* Rewrite cannot mark the inode dirty but has to
+			 * write it immediatly.
+			 * Q: Can't we just create an alias for the inode
+			 * instead? And if not, why not?
+			 */
+			if (inode->i_ino == LOGFS_INO_MASTER)
+				logfs_write_anchor(inode->i_sb);
+			else {
+				err = __logfs_write_inode(inode, flags);
+			}
+		}
 	}
 	logfs_put_write_page(page);
 	return err;
@@ -1833,19 +1848,37 @@ static int __logfs_truncate(struct inode *inode, u64 size)
 		return logfs_truncate_direct(inode, size);
 }

-int logfs_truncate(struct inode *inode, u64 size)
+/*
+ * Truncate, by changing the segment file, can consume a fair amount
+ * of resources. So back off from time to time and do some GC.
+ * 8 or 2048 blocks should be well within safety limits even if
+ * every single block resided in a different segment.
+ */
+#define TRUNCATE_STEP (8 * 1024 * 1024)
+int logfs_truncate(struct inode *inode, u64 target)
 {
 	struct super_block *sb = inode->i_sb;
-	int err;
+	u64 size = i_size_read(inode);
+	int err = 0;

-	logfs_get_wblocks(sb, NULL, 1);
-	err = __logfs_truncate(inode, size);
-	if (!err)
-		err = __logfs_write_inode(inode, 0);
-	logfs_put_wblocks(sb, NULL, 1);
+	size = ALIGN(size, TRUNCATE_STEP);
+	while (size > target) {
+		if (size > TRUNCATE_STEP)
+			size -= TRUNCATE_STEP;
+		else
+			size = 0;
+		if (size < target)
+			size = target;
+
+		logfs_get_wblocks(sb, NULL, 1);
+		err = __logfs_truncate(inode, size);
+		if (!err)
+			err = __logfs_write_inode(inode, 0);
+		logfs_put_wblocks(sb, NULL, 1);
+	}

 	if (!err)
-		err = vmtruncate(inode, size);
+		err = vmtruncate(inode, target);

 	/* I don't trust error recovery yet. */
 	WARN_ON(err);
@@ -2226,6 +2259,7 @@ int logfs_init_rw(struct super_block *sb)
 	int min_fill = 3 * super->s_no_blocks;

 	INIT_LIST_HEAD(&super->s_object_alias);
+	INIT_LIST_HEAD(&super->s_writeback_list);
 	mutex_init(&super->s_write_mutex);
 	super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
 			sizeof(struct logfs_block));
@@ -2239,8 +2273,6 @@ void logfs_cleanup_rw(struct super_block *sb)
 	struct logfs_super *super = logfs_super(sb);

 	destroy_meta_inode(super->s_segfile_inode);
-	if (super->s_block_pool)
-		mempool_destroy(super->s_block_pool);
-	if (super->s_shadow_pool)
-		mempool_destroy(super->s_shadow_pool);
+	logfs_mempool_destroy(super->s_block_pool);
+	logfs_mempool_destroy(super->s_shadow_pool);
 }
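The reworked logfs_truncate() above walks the file size down in TRUNCATE_STEP chunks, releasing and re-acquiring the write locks between steps so garbage collection can run in the gaps. The stepping arithmetic in isolation (step_down() is a hypothetical name used only for illustration):

#define TRUNCATE_STEP (8 * 1024 * 1024)

/* One loop iteration: come down by at most one step, clamped so the
 * size never overshoots the requested target. */
static u64 step_down(u64 size, u64 target)
{
	size = (size > TRUNCATE_STEP) ? size - TRUNCATE_STEP : 0;
	return (size < target) ? target : size;
}

Starting from ALIGN(i_size, TRUNCATE_STEP), repeated calls reach target in at most size/TRUNCATE_STEP iterations, each bracketed by logfs_get_wblocks()/logfs_put_wblocks() as in the hunk above.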
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 1a14f9910d55..a9657afb70ad 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -10,6 +10,7 @@
  * three kinds of objects: inodes, dentries and blocks, both data and indirect.
  */
 #include "logfs.h"
+#include <linux/slab.h>

 static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
 {
@@ -66,7 +67,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
 	return page;
 }

-void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		int use_filler)
 {
 	pgoff_t index = ofs >> PAGE_SHIFT;
@@ -80,8 +81,10 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		copylen = min((ulong)len, PAGE_SIZE - offset);

 		page = get_mapping_page(area->a_sb, index, use_filler);
-		SetPageUptodate(page);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
 		BUG_ON(!page); /* FIXME: reserve a pool */
+		SetPageUptodate(page);
 		memcpy(page_address(page) + offset, buf, copylen);
 		SetPagePrivate(page);
 		page_cache_release(page);
@@ -91,52 +94,61 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		offset = 0;
 		index++;
 	} while (len);
+	return 0;
 }

-/*
- * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
- */
-static void pad_wbuf(struct logfs_area *area, int final)
+static void pad_partial_page(struct logfs_area *area)
 {
 	struct super_block *sb = area->a_sb;
-	struct logfs_super *super = logfs_super(sb);
 	struct page *page;
 	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
 	pgoff_t index = ofs >> PAGE_SHIFT;
 	long offset = ofs & (PAGE_SIZE-1);
 	u32 len = PAGE_SIZE - offset;

-	if (len == PAGE_SIZE) {
-		/* The math in this function can surely use some love */
-		len = 0;
-	}
-	if (len) {
-		BUG_ON(area->a_used_bytes >= super->s_segsize);
-
-		page = get_mapping_page(area->a_sb, index, 0);
+	if (len % PAGE_SIZE) {
+		page = get_mapping_page(sb, index, 0);
 		BUG_ON(!page); /* FIXME: reserve a pool */
 		memset(page_address(page) + offset, 0xff, len);
 		SetPagePrivate(page);
 		page_cache_release(page);
 	}
+}

-	if (!final)
-		return;
+static void pad_full_pages(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+	u32 len = super->s_segsize - area->a_used_bytes;
+	pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
+	pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
+	struct page *page;

-	area->a_used_bytes += len;
-	for ( ; area->a_used_bytes < super->s_segsize;
-			area->a_used_bytes += PAGE_SIZE) {
-		/* Memset another page */
-		index++;
-		page = get_mapping_page(area->a_sb, index, 0);
+	while (no_indizes) {
+		page = get_mapping_page(sb, index, 0);
 		BUG_ON(!page); /* FIXME: reserve a pool */
-		memset(page_address(page), 0xff, PAGE_SIZE);
+		SetPageUptodate(page);
+		memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
 		SetPagePrivate(page);
 		page_cache_release(page);
+		index++;
+		no_indizes--;
 	}
 }

 /*
+ * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
+ * Also make sure we allocate (and memset) all pages for final writeout.
+ */
+static void pad_wbuf(struct logfs_area *area, int final)
+{
+	pad_partial_page(area);
+	if (final)
+		pad_full_pages(area);
+}
+
+/*
  * We have to be careful with the alias tree. Since lookup is done by bix,
  * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
  * indirect blocks. So always use it through accessor functions.
@@ -174,14 +186,8 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
 	return 0;
 }

-static gc_level_t btree_block_level(struct logfs_block *block)
-{
-	return expand_level(block->ino, block->level);
-}
-
 static struct logfs_block_ops btree_block_ops = {
 	.write_block = btree_write_block,
-	.block_level = btree_block_level,
 	.free_block = __free_block,
 	.write_alias = btree_write_alias,
 };
@@ -683,7 +689,7 @@ int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
 	return 0;
 }

-static void freeseg(struct super_block *sb, u32 segno)
+void freeseg(struct super_block *sb, u32 segno)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct address_space *mapping = super->s_mapping_inode->i_mapping;
@@ -910,7 +916,7 @@ err:
 	for (i--; i >= 0; i--)
 		free_area(super->s_area[i]);
 	free_area(super->s_journal_area);
-	mempool_destroy(super->s_alias_pool);
+	logfs_mempool_destroy(super->s_alias_pool);
 	return -ENOMEM;
 }
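After this change pad_wbuf() is a thin wrapper: pad_partial_page() always pads the tail of the current page, and pad_full_pages() runs only for a segment's final writeout, allocating and filling every remaining page. Both fill with 0xff, which matches the state of freshly erased flash. A sketch of the page-count arithmetic pad_full_pages() relies on (pages_left_in_segment() is a hypothetical helper, not in the patch):

static pgoff_t pages_left_in_segment(u64 segsize, u64 used_bytes,
				     unsigned int page_shift)
{
	/* Whole pages between the (page-aligned) write position and
	 * the end of the segment; each one gets memset to 0xff. */
	return (segsize - used_bytes) >> page_shift;
}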
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index c66beab78dee..d651e10a1e9c 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -11,6 +11,8 @@
  */
 #include "logfs.h"
 #include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
 #include <linux/mtd/mtd.h>
 #include <linux/statfs.h>
 #include <linux/buffer_head.h>
@@ -136,6 +138,14 @@ static int logfs_sb_set(struct super_block *sb, void *_super)
 	sb->s_fs_info = super;
 	sb->s_mtd = super->s_mtd;
 	sb->s_bdev = super->s_bdev;
+#ifdef CONFIG_BLOCK
+	if (sb->s_bdev)
+		sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
+#endif
+#ifdef CONFIG_MTD
+	if (sb->s_mtd)
+		sb->s_bdi = sb->s_mtd->backing_dev_info;
+#endif
 	return 0;
 }

@@ -277,7 +287,7 @@ static int logfs_recover_sb(struct super_block *sb)
 	}
 	if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
 		printk(KERN_INFO"Superblocks don't match - fixing.\n");
-		return write_one_sb(sb, super->s_devops->find_last_sb);
+		return logfs_write_sb(sb);
 	}
 	/* If neither is valid now, something's wrong. Didn't we properly
 	 * check them before?!? */
@@ -289,6 +299,10 @@ static int logfs_make_writeable(struct super_block *sb)
 {
 	int err;

+	err = logfs_open_segfile(sb);
+	if (err)
+		return err;
+
 	/* Repair any broken superblock copies */
 	err = logfs_recover_sb(sb);
 	if (err)
@@ -299,10 +313,6 @@
 	if (err)
 		return err;

-	err = logfs_open_segfile(sb);
-	if (err)
-		return err;
-
 	/* Do one GC pass before any data gets dirtied */
 	logfs_gc_pass(sb);

@@ -327,27 +337,27 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
 		goto fail;

 	sb->s_root = d_alloc_root(rootdir);
-	if (!sb->s_root)
+	if (!sb->s_root) {
+		iput(rootdir);
 		goto fail;
+	}

 	super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
 	if (!super->s_erase_page)
-		goto fail2;
+		goto fail;
 	memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);

 	/* FIXME: check for read-only mounts */
 	err = logfs_make_writeable(sb);
 	if (err)
-		goto fail3;
+		goto fail1;

 	log_super("LogFS: Finished mounting\n");
 	simple_set_mnt(mnt, sb);
 	return 0;

-fail3:
+fail1:
 	__free_page(super->s_erase_page);
-fail2:
-	iput(rootdir);
 fail:
 	iput(logfs_super(sb)->s_master_inode);
 	return -EIO;
@@ -376,7 +386,7 @@ static struct page *find_super_block(struct super_block *sb)
 	if (!first || IS_ERR(first))
 		return NULL;
 	last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
-	if (!last || IS_ERR(first)) {
+	if (!last || IS_ERR(last)) {
 		page_cache_release(first);
 		return NULL;
 	}
@@ -407,7 +417,7 @@ static int __logfs_read_sb(struct super_block *sb)

 	page = find_super_block(sb);
 	if (!page)
-		return -EIO;
+		return -EINVAL;

 	ds = page_address(page);
 	super->s_size = be64_to_cpu(ds->ds_filesystem_size);
@@ -451,6 +461,8 @@ static int logfs_read_sb(struct super_block *sb, int read_only)

 	btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
 	btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
+	btree_init_mempool32(&super->s_shadow_tree.segment_map,
+			super->s_btree_pool);

 	ret = logfs_init_mapping(sb);
 	if (ret)
@@ -515,8 +527,8 @@ static void logfs_kill_sb(struct super_block *sb)
 	if (super->s_erase_page)
 		__free_page(super->s_erase_page);
 	super->s_devops->put_device(sb);
-	mempool_destroy(super->s_btree_pool);
-	mempool_destroy(super->s_alias_pool);
+	logfs_mempool_destroy(super->s_btree_pool);
+	logfs_mempool_destroy(super->s_alias_pool);
 	kfree(super);
 	log_super("LogFS: Finished unmounting\n");
 }
@@ -572,8 +584,7 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
 	return 0;

 err1:
-	up_write(&sb->s_umount);
-	deactivate_super(sb);
+	deactivate_locked_super(sb);
 	return err;
 err0:
 	kfree(super);
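Two error-handling fixes above are easy to miss: find_super_block() now tests the right pointer (last, not first) before giving up, and logfs_get_sb_device() uses deactivate_locked_super(), which drops s_umount as part of tearing the superblock down, instead of the old up_write()/deactivate_super() pair. A hedged sketch of the resulting error path (mount_example() and fill_example() are hypothetical stand-ins, not logfs functions):

static int mount_example(struct file_system_type *type, void *data,
			 int (*fill_example)(struct super_block *sb))
{
	struct super_block *sb;
	int err;

	sb = sget(type, NULL, set_anon_super, data); /* returned locked */
	if (IS_ERR(sb))
		return PTR_ERR(sb);
	err = fill_example(sb);
	if (err) {
		/* Releases s_umount and the active reference in one call. */
		deactivate_locked_super(sb);
		return err;
	}
	up_write(&sb->s_umount);
	return 0;
}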
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 82d6554b02fe..282e15ad8cd8 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -1,4 +1,5 @@
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include "minix.h"

 enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */
diff --git a/fs/mpage.c b/fs/mpage.c
index 42381bd6543b..fd56ca2ea556 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/kdev_t.h>
+#include <linux/gfp.h>
 #include <linux/bio.h>
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
@@ -561,7 +562,7 @@ page_is_mapped:
 	if (page->index >= end_index) {
 		/*
 		 * The page straddles i_size. It must be zeroed out on each
-		 * and every writepage invokation because it may be mmapped.
+		 * and every writepage invocation because it may be mmapped.
 		 * "A file is mapped in multiples of the page size. For a file
 		 * that is not a multiple of the page size, the remaining memory
 		 * is zeroed when mapped, and writes to that region are not
diff --git a/fs/namei.c b/fs/namei.c
index 48e60a187325..b86b96fe1dc3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1610,8 +1610,7 @@ exit:

 static struct file *do_last(struct nameidata *nd, struct path *path,
 			int open_flag, int acc_mode,
-			int mode, const char *pathname,
-			int *want_dir)
+			int mode, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
@@ -1642,7 +1641,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (nd->last.name[nd->last.len]) {
 		if (open_flag & O_CREAT)
 			goto exit;
-		*want_dir = 1;
+		nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
 	}

 	/* just plain open? */
@@ -1656,8 +1655,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (path->dentry->d_inode->i_op->follow_link)
 			return NULL;
 		error = -ENOTDIR;
-		if (*want_dir && !path->dentry->d_inode->i_op->lookup)
-			goto exit_dput;
+		if (nd->flags & LOOKUP_DIRECTORY) {
+			if (!path->dentry->d_inode->i_op->lookup)
+				goto exit_dput;
+		}
 		path_to_nameidata(path, nd);
 		audit_inode(pathname, nd->path.dentry);
 		goto ok;
@@ -1766,7 +1767,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	int count = 0;
 	int flag = open_to_namei_flags(open_flag);
 	int force_reval = 0;
-	int want_dir = open_flag & O_DIRECTORY;

 	if (!(open_flag & O_CREAT))
 		mode = 0;
@@ -1828,14 +1828,18 @@ reval:
 		if (open_flag & O_EXCL)
 			nd.flags |= LOOKUP_EXCL;
 	}
-	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
+	if (open_flag & O_DIRECTORY)
+		nd.flags |= LOOKUP_DIRECTORY;
+	if (!(open_flag & O_NOFOLLOW))
+		nd.flags |= LOOKUP_FOLLOW;
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path holder;
 		struct inode *inode = path.dentry->d_inode;
 		void *cookie;
 		error = -ELOOP;
 		/* S_ISDIR part is a temporary automount kludge */
-		if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode))
+		if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
 			goto exit_dput;
 		if (count++ == 32)
 			goto exit_dput;
@@ -1866,7 +1870,7 @@ reval:
 	}
 	holder = path;
 	nd.flags &= ~LOOKUP_PARENT;
-	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	if (inode->i_op->put_link)
 		inode->i_op->put_link(holder.dentry, &nd, cookie);
 	path_put(&holder);
@@ -2172,8 +2176,10 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 		error = security_inode_rmdir(dir, dentry);
 		if (!error) {
 			error = dir->i_op->rmdir(dir, dentry);
-			if (!error)
+			if (!error) {
 				dentry->d_inode->i_flags |= S_DEAD;
+				dont_mount(dentry);
+			}
 		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2257,7 +2263,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 		if (!error) {
 			error = dir->i_op->unlink(dir, dentry);
 			if (!error)
-				dentry->d_inode->i_flags |= S_DEAD;
+				dont_mount(dentry);
 		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2544,7 +2550,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  * e) conversion from fhandle to dentry may come in the wrong moment - when
  *    we are removing the target. Solution: we will have to grab ->i_mutex
  *    in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
- *    ->i_mutex on parents, which works but leads to some truely excessive
+ *    ->i_mutex on parents, which works but leads to some truly excessive
  *    locking].
  */
 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
@@ -2568,17 +2574,20 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 		return error;

 	target = new_dentry->d_inode;
-	if (target) {
+	if (target)
 		mutex_lock(&target->i_mutex);
-		dentry_unhash(new_dentry);
-	}
 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
 		error = -EBUSY;
-	else
+	else {
+		if (target)
+			dentry_unhash(new_dentry);
 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+	}
 	if (target) {
-		if (!error)
+		if (!error) {
 			target->i_flags |= S_DEAD;
+			dont_mount(new_dentry);
+		}
 		mutex_unlock(&target->i_mutex);
 		if (d_unhashed(new_dentry))
 			d_rehash(new_dentry);
@@ -2610,7 +2619,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
 	if (!error) {
 		if (target)
-			target->i_flags |= S_DEAD;
+			dont_mount(new_dentry);
 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
 			d_move(old_dentry, new_dentry);
 	}
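The namei changes above replace the ad-hoc want_dir argument with the existing nd.flags bits: O_DIRECTORY and a trailing slash both set LOOKUP_DIRECTORY, and LOOKUP_FOLLOW now carries the O_NOFOLLOW decision down to the trailing-symlink check. The mapping in isolation (map_open_to_lookup() is a hypothetical helper; the real code sets nd.flags inline, with the constants from linux/fcntl.h and linux/namei.h):

static unsigned int map_open_to_lookup(int open_flag, unsigned int flags)
{
	if (open_flag & O_DIRECTORY)
		flags |= LOOKUP_DIRECTORY; /* must resolve to a directory */
	if (!(open_flag & O_NOFOLLOW))
		flags |= LOOKUP_FOLLOW; /* may follow a trailing symlink */
	return flags;
}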
diff --git a/fs/namespace.c b/fs/namespace.c
index 8174c8ab5c70..88058de59c7c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -628,7 +628,6 @@ repeat:
 		mnt->mnt_pinned = 0;
 		spin_unlock(&vfsmount_lock);
 		acct_auto_close_mnt(mnt);
-		security_sb_umount_close(mnt);
 		goto repeat;
 	}
 }
@@ -1117,8 +1116,6 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		retval = 0;
 	}
 	spin_unlock(&vfsmount_lock);
-	if (retval)
-		security_sb_umount_busy(mnt);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	return retval;
@@ -1432,20 +1429,13 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)

 	err = -ENOENT;
 	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (IS_DEADDIR(path->dentry->d_inode))
-		goto out_unlock;
-
-	err = security_sb_check_sb(mnt, path);
-	if (err)
+	if (cant_mount(path->dentry))
 		goto out_unlock;

-	err = -ENOENT;
 	if (!d_unlinked(path->dentry))
 		err = attach_recursive_mnt(mnt, path, NULL);
 out_unlock:
 	mutex_unlock(&path->dentry->d_inode->i_mutex);
-	if (!err)
-		security_sb_post_addmount(mnt, path);
 	return err;
 }

@@ -1581,8 +1571,6 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	}
 	up_write(&sb->s_umount);
 	if (!err) {
-		security_sb_post_remount(path->mnt, flags, data);
-
 		spin_lock(&vfsmount_lock);
 		touch_mnt_namespace(path->mnt->mnt_ns);
 		spin_unlock(&vfsmount_lock);
@@ -1623,7 +1611,7 @@ static int do_move_mount(struct path *path, char *old_name)

 	err = -ENOENT;
 	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (IS_DEADDIR(path->dentry->d_inode))
+	if (cant_mount(path->dentry))
 		goto out1;

 	if (d_unlinked(path->dentry))
@@ -2234,7 +2222,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	if (!check_mnt(root.mnt))
 		goto out2;
 	error = -ENOENT;
-	if (IS_DEADDIR(new.dentry->d_inode))
+	if (cant_mount(old.dentry))
 		goto out2;
 	if (d_unlinked(new.dentry))
 		goto out2;
@@ -2277,7 +2265,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	spin_unlock(&vfsmount_lock);
 	chroot_fs_refs(&root, &new);
-	security_sb_post_pivotroot(&root, &new);
 	error = 0;
 	path_put(&root_parent);
 	path_put(&parent_path);
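The cant_mount()/dont_mount() helpers used above are introduced elsewhere in this series (in include/linux/dcache.h); they replace the IS_DEADDIR() checks with a dedicated dentry flag so that rmdir, unlink, and rename can veto future mounts on a dying dentry without racing against mount(2). Their approximate shape, paraphrased rather than quoted from that header:

static inline int cant_mount(struct dentry *dentry)
{
	return (dentry->d_flags & DCACHE_CANT_MOUNT);
}

static inline void dont_mount(struct dentry *dentry)
{
	/* d_flags is protected by d_lock, hence the lock here. */
	spin_lock(&dentry->d_lock);
	dentry->d_flags |= DCACHE_CANT_MOUNT;
	spin_unlock(&dentry->d_lock);
}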
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b8b5b30d53f0..7edfcd4d5e52 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -15,7 +15,6 @@
 #include <linux/errno.h>
 #include <linux/stat.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <asm/uaccess.h>
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6a7d901f1936..1daabb90e0a5 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -15,7 +15,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index cf98da1be23e..fa3385154023 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -526,10 +526,15 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	sb->s_blocksize_bits = 10;
 	sb->s_magic = NCP_SUPER_MAGIC;
 	sb->s_op = &ncp_sops;
+	sb->s_bdi = &server->bdi;

 	server = NCP_SBP(sb);
 	memset(server, 0, sizeof(*server));

+	error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
+	if (error)
+		goto out_bdi;
+
 	server->ncp_filp = ncp_filp;
 	server->ncp_sock = sock;

@@ -719,6 +724,8 @@ out_fput2:
 	if (server->info_filp)
 		fput(server->info_filp);
 out_fput:
+	bdi_destroy(&server->bdi);
+out_bdi:
 	/* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
 	 *
 	 * The previously used put_filp(ncp_filp); was bogous, since
@@ -756,6 +763,7 @@ static void ncp_put_super(struct super_block *sb)
 	kill_pid(server->m.wdog_pid, SIGTERM, 1);
 	put_pid(server->m.wdog_pid);

+	bdi_destroy(&server->bdi);
 	kfree(server->priv.data);
 	kfree(server->auth.object_name);
 	vfree(server->rxbuf);
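The bdi hunks above give each ncpfs mount its own backing_dev_info, which per-superblock writeback needs now that network filesystems no longer share a default bdi. The lifecycle is: register during fill_super before any page can be dirtied, destroy on the fill_super error path and again in put_super. A condensed sketch (my_fill_super() and my_put_super() are hypothetical stand-ins for the ncpfs functions):

static int my_fill_super(struct super_block *sb, struct ncp_server *server)
{
	int error;

	error = bdi_setup_and_register(&server->bdi, "ncpfs",
				       BDI_CAP_MAP_COPY);
	if (error)
		return error; /* nothing to unwind yet */
	sb->s_bdi = &server->bdi; /* writeback target for this sb */
	return 0;
}

static void my_put_super(struct ncp_server *server)
{
	bdi_destroy(&server->bdi); /* pairs with bdi_setup_and_register() */
}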
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index ec8f45f12e05..60a5e2864ea8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <linux/highuid.h>
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 15458decdb8a..56f5b3a0e1ee 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -9,12 +9,12 @@
 #include <linux/stat.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/shm.h>
 #include <linux/errno.h>
 #include <linux/mman.h>
 #include <linux/string.h>
-#include <linux/slab.h>
 #include <linux/fcntl.h>
 #include <linux/ncp_fs.h>

diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index e37df8d5fe70..c7ff6c700a6e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/netdevice.h>
 #include <linux/signal.h>
+#include <linux/slab.h>
 #include <net/scm.h>
 #include <net/sock.h>
 #include <linux/ipx.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e3d26c1bd105..c634fd17b337 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/ncp_fs.h>
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include "ncplib_kernel.h"
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index b4ffd0146ea6..84690319e625 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -10,6 +10,7 @@
 #include <linux/moduleparam.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/sunrpc/cache.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 84761b5bb8e2..a08770a7e857 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -7,6 +7,7 @@
  */
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include "nfs4_fs.h"
 #include "callback.h"
 #include "delegation.h"
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index db30c0b398b5..05af212f0edf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include "nfs4_fs.h"
 #include "callback.h"

@@ -782,6 +783,7 @@ struct svc_version nfs4_callback_version1 = {
 	.vs_proc = nfs4_callback_procedures1,
 	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
 	.vs_dispatch = NULL,
+	.vs_hidden = 1,
 };

 struct svc_version nfs4_callback_version4 = {
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2274f1737336..7ec9b34a59f8 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -35,6 +35,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 #include <linux/sunrpc/bc_xprt.h>
@@ -933,7 +934,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
 	}

 	fsinfo.fattr = fattr;
-	nfs_fattr_init(fattr);
 	error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
 	if (error < 0)
 		goto out_error;
@@ -965,6 +965,8 @@ out_error:
 static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
 {
 	target->flags = source->flags;
+	target->rsize = source->rsize;
+	target->wsize = source->wsize;
 	target->acregmin = source->acregmin;
 	target->acregmax = source->acregmax;
 	target->acdirmin = source->acdirmin;
@@ -1044,13 +1046,18 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 		struct nfs_fh *mntfh)
 {
 	struct nfs_server *server;
-	struct nfs_fattr fattr;
+	struct nfs_fattr *fattr;
 	int error;

 	server = nfs_alloc_server();
 	if (!server)
 		return ERR_PTR(-ENOMEM);

+	error = -ENOMEM;
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto error;
+
 	/* Get a client representation */
 	error = nfs_init_server(server, data);
 	if (error < 0)
@@ -1061,7 +1068,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);

 	/* Probe the root fh to retrieve its FSID */
-	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	error = nfs_probe_fsinfo(server, mntfh, fattr);
 	if (error < 0)
 		goto error;
 	if (server->nfs_client->rpc_ops->version == 3) {
@@ -1074,14 +1081,14 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 		server->namelen = NFS2_MAXNAMLEN;
 	}

-	if (!(fattr.valid & NFS_ATTR_FATTR)) {
-		error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr);
+	if (!(fattr->valid & NFS_ATTR_FATTR)) {
+		error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
 		if (error < 0) {
 			dprintk("nfs_create_server: getattr error = %d\n", -error);
 			goto error;
 		}
 	}
-	memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
+	memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));

 	dprintk("Server FSID: %llx:%llx\n",
 		(unsigned long long) server->fsid.major,
@@ -1093,9 +1100,11 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 	spin_unlock(&nfs_client_lock);

 	server->mount_time = jiffies;
+	nfs_free_fattr(fattr);
 	return server;

 error:
+	nfs_free_fattr(fattr);
 	nfs_free_server(server);
 	return ERR_PTR(error);
 }
@@ -1293,7 +1302,8 @@ static int nfs4_init_server(struct nfs_server *server,

 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
-	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
+	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
+		NFS_CAP_POSIX_LOCK;
 	server->options = data->options;

 	/* Get a client record */
@@ -1336,7 +1346,7 @@ error:
 struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 		struct nfs_fh *mntfh)
 {
-	struct nfs_fattr fattr;
+	struct nfs_fattr *fattr;
 	struct nfs_server *server;
 	int error;

@@ -1346,6 +1356,11 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 	if (!server)
 		return ERR_PTR(-ENOMEM);

+	error = -ENOMEM;
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto error;
+
 	/* set up the general RPC client */
 	error = nfs4_init_server(server, data);
 	if (error < 0)
@@ -1360,7 +1375,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 		goto error;

 	/* Probe the root fh to retrieve its FSID */
-	error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
+	error = nfs4_get_rootfh(server, mntfh);
 	if (error < 0)
 		goto error;

@@ -1371,7 +1386,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,

 	nfs4_session_set_rwsize(server);

-	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	error = nfs_probe_fsinfo(server, mntfh, fattr);
 	if (error < 0)
 		goto error;

@@ -1385,9 +1400,11 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,

 	server->mount_time = jiffies;
 	dprintk("<-- nfs4_create_server() = %p\n", server);
+	nfs_free_fattr(fattr);
 	return server;

 error:
+	nfs_free_fattr(fattr);
 	nfs_free_server(server);
 	dprintk("<-- nfs4_create_server() = error %d\n", error);
 	return ERR_PTR(error);
@@ -1401,7 +1418,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 {
 	struct nfs_client *parent_client;
 	struct nfs_server *server, *parent_server;
-	struct nfs_fattr fattr;
+	struct nfs_fattr *fattr;
 	int error;

 	dprintk("--> nfs4_create_referral_server()\n");
@@ -1410,6 +1427,11 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	if (!server)
 		return ERR_PTR(-ENOMEM);

+	error = -ENOMEM;
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto error;
+
 	parent_server = NFS_SB(data->sb);
 	parent_client = parent_server->nfs_client;

@@ -1439,12 +1461,12 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);

 	/* Probe the root fh to retrieve its FSID and filehandle */
-	error = nfs4_path_walk(server, mntfh, data->mnt_path);
+	error = nfs4_get_rootfh(server, mntfh);
 	if (error < 0)
 		goto error;

 	/* probe the filesystem info for this server filesystem */
-	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	error = nfs_probe_fsinfo(server, mntfh, fattr);
 	if (error < 0)
 		goto error;

@@ -1462,10 +1484,12 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,

 	server->mount_time = jiffies;

+	nfs_free_fattr(fattr);
 	dprintk("<-- nfs_create_referral_server() = %p\n", server);
 	return server;

 error:
+	nfs_free_fattr(fattr);
 	nfs_free_server(server);
 	dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
 	return ERR_PTR(error);
@@ -1481,7 +1505,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 		struct nfs_fattr *fattr)
 {
 	struct nfs_server *server;
-	struct nfs_fattr fattr_fsinfo;
+	struct nfs_fattr *fattr_fsinfo;
 	int error;

 	dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
@@ -1492,6 +1516,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 	if (!server)
 		return ERR_PTR(-ENOMEM);

+	error = -ENOMEM;
+	fattr_fsinfo = nfs_alloc_fattr();
+	if (fattr_fsinfo == NULL)
+		goto out_free_server;
+
 	/* Copy data from the source */
 	server->nfs_client = source->nfs_client;
 	atomic_inc(&server->nfs_client->cl_count);
@@ -1508,7 +1537,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 	nfs_init_server_aclclient(server);

 	/* probe the filesystem info for this server filesystem */
-	error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo);
+	error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
 	if (error < 0)
 		goto out_free_server;

@@ -1530,10 +1559,12 @@

 	server->mount_time = jiffies;
1532 1561
1562 nfs_free_fattr(fattr_fsinfo);
1533 dprintk("<-- nfs_clone_server() = %p\n", server); 1563 dprintk("<-- nfs_clone_server() = %p\n", server);
1534 return server; 1564 return server;
1535 1565
1536out_free_server: 1566out_free_server:
1567 nfs_free_fattr(fattr_fsinfo);
1537 nfs_free_server(server); 1568 nfs_free_server(server);
1538 dprintk("<-- nfs_clone_server() = error %d\n", error); 1569 dprintk("<-- nfs_clone_server() = error %d\n", error);
1539 return ERR_PTR(error); 1570 return ERR_PTR(error);
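
The fs/nfs/client.c hunks above all make the same transformation: struct nfs_fattr is too large to keep on the kernel stack, so each function switches to nfs_alloc_fattr() and releases the result on both the success and error paths. Below is a minimal standalone C sketch of that alloc / goto-error / free shape; the types and helper names are illustrative stand-ins, not the kernel API.

    /* Standalone sketch of the alloc-then-goto-error cleanup shape used above.
     * The types and helpers here are stand-ins, not the NFS code. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <errno.h>

    struct fattr { int valid; };            /* stand-in for struct nfs_fattr */

    static struct fattr *fattr_alloc(void)
    {
        return calloc(1, sizeof(struct fattr));
    }

    static int probe_fsinfo(struct fattr *f) /* stand-in for nfs_probe_fsinfo() */
    {
        f->valid = 1;
        return 0;
    }

    static int create_server(void)
    {
        struct fattr *fattr;
        int error = -ENOMEM;

        fattr = fattr_alloc();
        if (fattr == NULL)
            goto error;                     /* nothing else to unwind yet */

        error = probe_fsinfo(fattr);
        if (error < 0)
            goto error;

        free(fattr);                        /* success path frees too */
        return 0;
    error:
        free(fattr);                        /* free(NULL) is a no-op */
        return error;
    }

    int main(void)
    {
        printf("create_server() = %d\n", create_server());
        return 0;
    }

Because free(NULL), like kfree(NULL), is a no-op, a single error label can cover the allocation failure itself as well as every later failure.
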
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 2563bebc4c67..301634543974 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h>
13#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
14#include <linux/spinlock.h> 15#include <linux/spinlock.h>
15 16
@@ -23,6 +24,8 @@
23 24
24static void nfs_do_free_delegation(struct nfs_delegation *delegation) 25static void nfs_do_free_delegation(struct nfs_delegation *delegation)
25{ 26{
27 if (delegation->cred)
28 put_rpccred(delegation->cred);
26 kfree(delegation); 29 kfree(delegation);
27} 30}
28 31
@@ -35,13 +38,7 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
35 38
36static void nfs_free_delegation(struct nfs_delegation *delegation) 39static void nfs_free_delegation(struct nfs_delegation *delegation)
37{ 40{
38 struct rpc_cred *cred;
39
40 cred = rcu_dereference(delegation->cred);
41 rcu_assign_pointer(delegation->cred, NULL);
42 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 41 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
43 if (cred)
44 put_rpccred(cred);
45} 42}
46 43
47void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) 44void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
@@ -128,21 +125,35 @@ again:
128 */ 125 */
129void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 126void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
130{ 127{
131 struct nfs_delegation *delegation = NFS_I(inode)->delegation; 128 struct nfs_delegation *delegation;
132 struct rpc_cred *oldcred; 129 struct rpc_cred *oldcred = NULL;
133 130
134 if (delegation == NULL) 131 rcu_read_lock();
135 return; 132 delegation = rcu_dereference(NFS_I(inode)->delegation);
136 memcpy(delegation->stateid.data, res->delegation.data, 133 if (delegation != NULL) {
137 sizeof(delegation->stateid.data)); 134 spin_lock(&delegation->lock);
138 delegation->type = res->delegation_type; 135 if (delegation->inode != NULL) {
139 delegation->maxsize = res->maxsize; 136 memcpy(delegation->stateid.data, res->delegation.data,
140 oldcred = delegation->cred; 137 sizeof(delegation->stateid.data));
141 delegation->cred = get_rpccred(cred); 138 delegation->type = res->delegation_type;
142 clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); 139 delegation->maxsize = res->maxsize;
143 NFS_I(inode)->delegation_state = delegation->type; 140 oldcred = delegation->cred;
144 smp_wmb(); 141 delegation->cred = get_rpccred(cred);
145 put_rpccred(oldcred); 142 clear_bit(NFS_DELEGATION_NEED_RECLAIM,
143 &delegation->flags);
144 NFS_I(inode)->delegation_state = delegation->type;
145 spin_unlock(&delegation->lock);
146 put_rpccred(oldcred);
147 rcu_read_unlock();
148 } else {
149 /* We appear to have raced with a delegation return. */
150 spin_unlock(&delegation->lock);
151 rcu_read_unlock();
152 nfs_inode_set_delegation(inode, cred, res);
153 }
154 } else {
155 rcu_read_unlock();
156 }
146} 157}
147 158
148static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) 159static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
@@ -165,9 +176,13 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
165 return inode; 176 return inode;
166} 177}
167 178
168static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) 179static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi,
180 const nfs4_stateid *stateid,
181 struct nfs_client *clp)
169{ 182{
170 struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); 183 struct nfs_delegation *delegation =
184 rcu_dereference_protected(nfsi->delegation,
185 lockdep_is_held(&clp->cl_lock));
171 186
172 if (delegation == NULL) 187 if (delegation == NULL)
173 goto nomatch; 188 goto nomatch;
@@ -194,11 +209,11 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
194{ 209{
195 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 210 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
196 struct nfs_inode *nfsi = NFS_I(inode); 211 struct nfs_inode *nfsi = NFS_I(inode);
197 struct nfs_delegation *delegation; 212 struct nfs_delegation *delegation, *old_delegation;
198 struct nfs_delegation *freeme = NULL; 213 struct nfs_delegation *freeme = NULL;
199 int status = 0; 214 int status = 0;
200 215
201 delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); 216 delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
202 if (delegation == NULL) 217 if (delegation == NULL)
203 return -ENOMEM; 218 return -ENOMEM;
204 memcpy(delegation->stateid.data, res->delegation.data, 219 memcpy(delegation->stateid.data, res->delegation.data,
@@ -212,10 +227,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
212 spin_lock_init(&delegation->lock); 227 spin_lock_init(&delegation->lock);
213 228
214 spin_lock(&clp->cl_lock); 229 spin_lock(&clp->cl_lock);
215 if (rcu_dereference(nfsi->delegation) != NULL) { 230 old_delegation = rcu_dereference_protected(nfsi->delegation,
216 if (memcmp(&delegation->stateid, &nfsi->delegation->stateid, 231 lockdep_is_held(&clp->cl_lock));
217 sizeof(delegation->stateid)) == 0 && 232 if (old_delegation != NULL) {
218 delegation->type == nfsi->delegation->type) { 233 if (memcmp(&delegation->stateid, &old_delegation->stateid,
234 sizeof(old_delegation->stateid)) == 0 &&
235 delegation->type == old_delegation->type) {
219 goto out; 236 goto out;
220 } 237 }
221 /* 238 /*
@@ -225,12 +242,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
225 dfprintk(FILE, "%s: server %s handed out " 242 dfprintk(FILE, "%s: server %s handed out "
226 "a duplicate delegation!\n", 243 "a duplicate delegation!\n",
227 __func__, clp->cl_hostname); 244 __func__, clp->cl_hostname);
228 if (delegation->type <= nfsi->delegation->type) { 245 if (delegation->type <= old_delegation->type) {
229 freeme = delegation; 246 freeme = delegation;
230 delegation = NULL; 247 delegation = NULL;
231 goto out; 248 goto out;
232 } 249 }
233 freeme = nfs_detach_delegation_locked(nfsi, NULL); 250 freeme = nfs_detach_delegation_locked(nfsi, NULL, clp);
234 } 251 }
235 list_add_rcu(&delegation->super_list, &clp->cl_delegations); 252 list_add_rcu(&delegation->super_list, &clp->cl_delegations);
236 nfsi->delegation_state = delegation->type; 253 nfsi->delegation_state = delegation->type;
@@ -300,7 +317,7 @@ restart:
300 if (inode == NULL) 317 if (inode == NULL)
301 continue; 318 continue;
302 spin_lock(&clp->cl_lock); 319 spin_lock(&clp->cl_lock);
303 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 320 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
304 spin_unlock(&clp->cl_lock); 321 spin_unlock(&clp->cl_lock);
305 rcu_read_unlock(); 322 rcu_read_unlock();
306 if (delegation != NULL) { 323 if (delegation != NULL) {
@@ -329,9 +346,9 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
329 struct nfs_inode *nfsi = NFS_I(inode); 346 struct nfs_inode *nfsi = NFS_I(inode);
330 struct nfs_delegation *delegation; 347 struct nfs_delegation *delegation;
331 348
332 if (rcu_dereference(nfsi->delegation) != NULL) { 349 if (rcu_access_pointer(nfsi->delegation) != NULL) {
333 spin_lock(&clp->cl_lock); 350 spin_lock(&clp->cl_lock);
334 delegation = nfs_detach_delegation_locked(nfsi, NULL); 351 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
335 spin_unlock(&clp->cl_lock); 352 spin_unlock(&clp->cl_lock);
336 if (delegation != NULL) 353 if (delegation != NULL)
337 nfs_do_return_delegation(inode, delegation, 0); 354 nfs_do_return_delegation(inode, delegation, 0);
@@ -345,9 +362,9 @@ int nfs_inode_return_delegation(struct inode *inode)
345 struct nfs_delegation *delegation; 362 struct nfs_delegation *delegation;
346 int err = 0; 363 int err = 0;
347 364
348 if (rcu_dereference(nfsi->delegation) != NULL) { 365 if (rcu_access_pointer(nfsi->delegation) != NULL) {
349 spin_lock(&clp->cl_lock); 366 spin_lock(&clp->cl_lock);
350 delegation = nfs_detach_delegation_locked(nfsi, NULL); 367 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
351 spin_unlock(&clp->cl_lock); 368 spin_unlock(&clp->cl_lock);
352 if (delegation != NULL) { 369 if (delegation != NULL) {
353 nfs_msync_inode(inode); 370 nfs_msync_inode(inode);
@@ -525,7 +542,7 @@ restart:
525 if (inode == NULL) 542 if (inode == NULL)
526 continue; 543 continue;
527 spin_lock(&clp->cl_lock); 544 spin_lock(&clp->cl_lock);
528 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 545 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
529 spin_unlock(&clp->cl_lock); 546 spin_unlock(&clp->cl_lock);
530 rcu_read_unlock(); 547 rcu_read_unlock();
531 if (delegation != NULL) 548 if (delegation != NULL)
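
The fs/nfs/delegation.c changes above move credential teardown into nfs_do_free_delegation() and, more importantly, make every access to nfsi->delegation use the RCU accessor that matches the calling context: rcu_dereference() under rcu_read_lock(), rcu_dereference_protected() with lockdep_is_held(&clp->cl_lock) when the update-side spinlock is held, and rcu_access_pointer() for a pure NULL test. A kernel-style sketch of the reader/updater split, assuming kernel headers that provide rcu_dereference_protected() as the hunks above do; the struct and function names are invented for illustration:

    /* Kernel-style sketch, not the NFS code. */
    #include <linux/rcupdate.h>
    #include <linux/spinlock.h>

    struct item { int value; };

    static DEFINE_SPINLOCK(item_lock);      /* update-side lock */
    static struct item __rcu *current_item;

    /* Reader: needs only the RCU read-side critical section. */
    static int item_read_value(void)
    {
        struct item *it;
        int val = -1;

        rcu_read_lock();
        it = rcu_dereference(current_item);
        if (it != NULL)
            val = it->value;
        rcu_read_unlock();
        return val;
    }

    /* Updater: holds item_lock, so the pointer cannot change underneath us.
     * rcu_dereference_protected() documents that and lets lockdep check it. */
    static struct item *item_detach_locked(void)
    {
        struct item *it = rcu_dereference_protected(current_item,
                                                    lockdep_is_held(&item_lock));

        rcu_assign_pointer(current_item, NULL);
        return it;      /* caller frees after a grace period */
    }

    static struct item *item_detach(void)
    {
        struct item *it;

        spin_lock(&item_lock);
        it = item_detach_locked();
        spin_unlock(&item_lock);
        return it;
    }

The lockdep expression is not mere documentation: with CONFIG_PROVE_RCU the kernel verifies at runtime that the claimed lock really is held, which is exactly why nfs_detach_delegation_locked() now takes the nfs_client so it can name cl_lock.
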
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 944b627ec6e1..69e7b8140122 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -71,4 +71,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode)
71} 71}
72#endif 72#endif
73 73
74static inline int nfs_have_delegated_attributes(struct inode *inode)
75{
76 return nfs_have_delegation(inode, FMODE_READ) &&
77 !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
78}
79
74#endif 80#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a1f6b4438fb1..ee9a179ebdf3 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -530,9 +530,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
530 nfs_readdir_descriptor_t my_desc, 530 nfs_readdir_descriptor_t my_desc,
531 *desc = &my_desc; 531 *desc = &my_desc;
532 struct nfs_entry my_entry; 532 struct nfs_entry my_entry;
533 struct nfs_fh fh; 533 int res = -ENOMEM;
534 struct nfs_fattr fattr;
535 long res;
536 534
537 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 535 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
538 dentry->d_parent->d_name.name, dentry->d_name.name, 536 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -554,9 +552,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
554 552
555 my_entry.cookie = my_entry.prev_cookie = 0; 553 my_entry.cookie = my_entry.prev_cookie = 0;
556 my_entry.eof = 0; 554 my_entry.eof = 0;
557 my_entry.fh = &fh; 555 my_entry.fh = nfs_alloc_fhandle();
558 my_entry.fattr = &fattr; 556 my_entry.fattr = nfs_alloc_fattr();
559 nfs_fattr_init(&fattr); 557 if (my_entry.fh == NULL || my_entry.fattr == NULL)
558 goto out_alloc_failed;
559
560 desc->entry = &my_entry; 560 desc->entry = &my_entry;
561 561
562 nfs_block_sillyrename(dentry); 562 nfs_block_sillyrename(dentry);
@@ -598,7 +598,10 @@ out:
598 nfs_unblock_sillyrename(dentry); 598 nfs_unblock_sillyrename(dentry);
599 if (res > 0) 599 if (res > 0)
600 res = 0; 600 res = 0;
601 dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", 601out_alloc_failed:
602 nfs_free_fattr(my_entry.fattr);
603 nfs_free_fhandle(my_entry.fh);
604 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
602 dentry->d_parent->d_name.name, dentry->d_name.name, 605 dentry->d_parent->d_name.name, dentry->d_name.name,
603 res); 606 res);
604 return res; 607 return res;
@@ -776,9 +779,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
776 struct inode *dir; 779 struct inode *dir;
777 struct inode *inode; 780 struct inode *inode;
778 struct dentry *parent; 781 struct dentry *parent;
782 struct nfs_fh *fhandle = NULL;
783 struct nfs_fattr *fattr = NULL;
779 int error; 784 int error;
780 struct nfs_fh fhandle;
781 struct nfs_fattr fattr;
782 785
783 parent = dget_parent(dentry); 786 parent = dget_parent(dentry);
784 dir = parent->d_inode; 787 dir = parent->d_inode;
@@ -811,14 +814,22 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
811 if (NFS_STALE(inode)) 814 if (NFS_STALE(inode))
812 goto out_bad; 815 goto out_bad;
813 816
814 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 817 error = -ENOMEM;
818 fhandle = nfs_alloc_fhandle();
819 fattr = nfs_alloc_fattr();
820 if (fhandle == NULL || fattr == NULL)
821 goto out_error;
822
823 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
815 if (error) 824 if (error)
816 goto out_bad; 825 goto out_bad;
817 if (nfs_compare_fh(NFS_FH(inode), &fhandle)) 826 if (nfs_compare_fh(NFS_FH(inode), fhandle))
818 goto out_bad; 827 goto out_bad;
819 if ((error = nfs_refresh_inode(inode, &fattr)) != 0) 828 if ((error = nfs_refresh_inode(inode, fattr)) != 0)
820 goto out_bad; 829 goto out_bad;
821 830
831 nfs_free_fattr(fattr);
832 nfs_free_fhandle(fhandle);
822out_set_verifier: 833out_set_verifier:
823 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 834 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
824 out_valid: 835 out_valid:
@@ -837,14 +848,26 @@ out_zap_parent:
837 /* If we have submounts, don't unhash ! */ 848 /* If we have submounts, don't unhash ! */
838 if (have_submounts(dentry)) 849 if (have_submounts(dentry))
839 goto out_valid; 850 goto out_valid;
851 if (dentry->d_flags & DCACHE_DISCONNECTED)
852 goto out_valid;
840 shrink_dcache_parent(dentry); 853 shrink_dcache_parent(dentry);
841 } 854 }
842 d_drop(dentry); 855 d_drop(dentry);
856 nfs_free_fattr(fattr);
857 nfs_free_fhandle(fhandle);
843 dput(parent); 858 dput(parent);
844 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 859 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
845 __func__, dentry->d_parent->d_name.name, 860 __func__, dentry->d_parent->d_name.name,
846 dentry->d_name.name); 861 dentry->d_name.name);
847 return 0; 862 return 0;
863out_error:
864 nfs_free_fattr(fattr);
865 nfs_free_fhandle(fhandle);
866 dput(parent);
867 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
868 __func__, dentry->d_parent->d_name.name,
869 dentry->d_name.name, error);
870 return error;
848} 871}
849 872
850/* 873/*
@@ -909,9 +932,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
909 struct dentry *res; 932 struct dentry *res;
910 struct dentry *parent; 933 struct dentry *parent;
911 struct inode *inode = NULL; 934 struct inode *inode = NULL;
935 struct nfs_fh *fhandle = NULL;
936 struct nfs_fattr *fattr = NULL;
912 int error; 937 int error;
913 struct nfs_fh fhandle;
914 struct nfs_fattr fattr;
915 938
916 dfprintk(VFS, "NFS: lookup(%s/%s)\n", 939 dfprintk(VFS, "NFS: lookup(%s/%s)\n",
917 dentry->d_parent->d_name.name, dentry->d_name.name); 940 dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -921,7 +944,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
921 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 944 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
922 goto out; 945 goto out;
923 946
924 res = ERR_PTR(-ENOMEM);
925 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 947 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
926 948
927 /* 949 /*
@@ -934,17 +956,23 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
934 goto out; 956 goto out;
935 } 957 }
936 958
959 res = ERR_PTR(-ENOMEM);
960 fhandle = nfs_alloc_fhandle();
961 fattr = nfs_alloc_fattr();
962 if (fhandle == NULL || fattr == NULL)
963 goto out;
964
937 parent = dentry->d_parent; 965 parent = dentry->d_parent;
938 /* Protect against concurrent sillydeletes */ 966 /* Protect against concurrent sillydeletes */
939 nfs_block_sillyrename(parent); 967 nfs_block_sillyrename(parent);
940 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 968 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
941 if (error == -ENOENT) 969 if (error == -ENOENT)
942 goto no_entry; 970 goto no_entry;
943 if (error < 0) { 971 if (error < 0) {
944 res = ERR_PTR(error); 972 res = ERR_PTR(error);
945 goto out_unblock_sillyrename; 973 goto out_unblock_sillyrename;
946 } 974 }
947 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); 975 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
948 res = (struct dentry *)inode; 976 res = (struct dentry *)inode;
949 if (IS_ERR(res)) 977 if (IS_ERR(res))
950 goto out_unblock_sillyrename; 978 goto out_unblock_sillyrename;
@@ -960,6 +988,8 @@ no_entry:
960out_unblock_sillyrename: 988out_unblock_sillyrename:
961 nfs_unblock_sillyrename(parent); 989 nfs_unblock_sillyrename(parent);
962out: 990out:
991 nfs_free_fattr(fattr);
992 nfs_free_fhandle(fhandle);
963 return res; 993 return res;
964} 994}
965 995
@@ -1025,12 +1055,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1025 res = NULL; 1055 res = NULL;
1026 goto out; 1056 goto out;
1027 /* This turned out not to be a regular file */ 1057 /* This turned out not to be a regular file */
1058 case -EISDIR:
1028 case -ENOTDIR: 1059 case -ENOTDIR:
1029 goto no_open; 1060 goto no_open;
1030 case -ELOOP: 1061 case -ELOOP:
1031 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1062 if (!(nd->intent.open.flags & O_NOFOLLOW))
1032 goto no_open; 1063 goto no_open;
1033 /* case -EISDIR: */
1034 /* case -EINVAL: */ 1064 /* case -EINVAL: */
1035 default: 1065 default:
1036 goto out; 1066 goto out;
@@ -1050,7 +1080,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1050 struct inode *dir; 1080 struct inode *dir;
1051 int openflags, ret = 0; 1081 int openflags, ret = 0;
1052 1082
1053 if (!is_atomic_open(nd)) 1083 if (!is_atomic_open(nd) || d_mountpoint(dentry))
1054 goto no_open; 1084 goto no_open;
1055 parent = dget_parent(dentry); 1085 parent = dget_parent(dentry);
1056 dir = parent->d_inode; 1086 dir = parent->d_inode;
@@ -1667,28 +1697,33 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry)
1667 smp_mb__after_atomic_dec(); 1697 smp_mb__after_atomic_dec();
1668} 1698}
1669 1699
1700static void nfs_access_free_list(struct list_head *head)
1701{
1702 struct nfs_access_entry *cache;
1703
1704 while (!list_empty(head)) {
1705 cache = list_entry(head->next, struct nfs_access_entry, lru);
1706 list_del(&cache->lru);
1707 nfs_access_free_entry(cache);
1708 }
1709}
1710
1670int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) 1711int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
1671{ 1712{
1672 LIST_HEAD(head); 1713 LIST_HEAD(head);
1673 struct nfs_inode *nfsi; 1714 struct nfs_inode *nfsi;
1674 struct nfs_access_entry *cache; 1715 struct nfs_access_entry *cache;
1675 1716
1676restart: 1717 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1718 return (nr_to_scan == 0) ? 0 : -1;
1719
1677 spin_lock(&nfs_access_lru_lock); 1720 spin_lock(&nfs_access_lru_lock);
1678 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1721 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
1679 struct rw_semaphore *s_umount;
1680 struct inode *inode; 1722 struct inode *inode;
1681 1723
1682 if (nr_to_scan-- == 0) 1724 if (nr_to_scan-- == 0)
1683 break; 1725 break;
1684 s_umount = &nfsi->vfs_inode.i_sb->s_umount; 1726 inode = &nfsi->vfs_inode;
1685 if (!down_read_trylock(s_umount))
1686 continue;
1687 inode = igrab(&nfsi->vfs_inode);
1688 if (inode == NULL) {
1689 up_read(s_umount);
1690 continue;
1691 }
1692 spin_lock(&inode->i_lock); 1727 spin_lock(&inode->i_lock);
1693 if (list_empty(&nfsi->access_cache_entry_lru)) 1728 if (list_empty(&nfsi->access_cache_entry_lru))
1694 goto remove_lru_entry; 1729 goto remove_lru_entry;
@@ -1702,61 +1737,47 @@ restart:
1702 else { 1737 else {
1703remove_lru_entry: 1738remove_lru_entry:
1704 list_del_init(&nfsi->access_cache_inode_lru); 1739 list_del_init(&nfsi->access_cache_inode_lru);
1740 smp_mb__before_clear_bit();
1705 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1741 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1742 smp_mb__after_clear_bit();
1706 } 1743 }
1707 spin_unlock(&inode->i_lock);
1708 spin_unlock(&nfs_access_lru_lock);
1709 iput(inode);
1710 up_read(s_umount);
1711 goto restart;
1712 } 1744 }
1713 spin_unlock(&nfs_access_lru_lock); 1745 spin_unlock(&nfs_access_lru_lock);
1714 while (!list_empty(&head)) { 1746 nfs_access_free_list(&head);
1715 cache = list_entry(head.next, struct nfs_access_entry, lru);
1716 list_del(&cache->lru);
1717 nfs_access_free_entry(cache);
1718 }
1719 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; 1747 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
1720} 1748}
1721 1749
1722static void __nfs_access_zap_cache(struct inode *inode) 1750static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
1723{ 1751{
1724 struct nfs_inode *nfsi = NFS_I(inode);
1725 struct rb_root *root_node = &nfsi->access_cache; 1752 struct rb_root *root_node = &nfsi->access_cache;
1726 struct rb_node *n, *dispose = NULL; 1753 struct rb_node *n;
1727 struct nfs_access_entry *entry; 1754 struct nfs_access_entry *entry;
1728 1755
1729 /* Unhook entries from the cache */ 1756 /* Unhook entries from the cache */
1730 while ((n = rb_first(root_node)) != NULL) { 1757 while ((n = rb_first(root_node)) != NULL) {
1731 entry = rb_entry(n, struct nfs_access_entry, rb_node); 1758 entry = rb_entry(n, struct nfs_access_entry, rb_node);
1732 rb_erase(n, root_node); 1759 rb_erase(n, root_node);
1733 list_del(&entry->lru); 1760 list_move(&entry->lru, head);
1734 n->rb_left = dispose;
1735 dispose = n;
1736 } 1761 }
1737 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; 1762 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
1738 spin_unlock(&inode->i_lock);
1739
1740 /* Now kill them all! */
1741 while (dispose != NULL) {
1742 n = dispose;
1743 dispose = n->rb_left;
1744 nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node));
1745 }
1746} 1763}
1747 1764
1748void nfs_access_zap_cache(struct inode *inode) 1765void nfs_access_zap_cache(struct inode *inode)
1749{ 1766{
1767 LIST_HEAD(head);
1768
1769 if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
1770 return;
1750 /* Remove from global LRU init */ 1771 /* Remove from global LRU init */
1751 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 1772 spin_lock(&nfs_access_lru_lock);
1752 spin_lock(&nfs_access_lru_lock); 1773 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1753 list_del_init(&NFS_I(inode)->access_cache_inode_lru); 1774 list_del_init(&NFS_I(inode)->access_cache_inode_lru);
1754 spin_unlock(&nfs_access_lru_lock);
1755 }
1756 1775
1757 spin_lock(&inode->i_lock); 1776 spin_lock(&inode->i_lock);
1758 /* This will release the spinlock */ 1777 __nfs_access_zap_cache(NFS_I(inode), &head);
1759 __nfs_access_zap_cache(inode); 1778 spin_unlock(&inode->i_lock);
1779 spin_unlock(&nfs_access_lru_lock);
1780 nfs_access_free_list(&head);
1760} 1781}
1761 1782
1762static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) 1783static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
@@ -1789,7 +1810,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
1789 cache = nfs_access_search_rbtree(inode, cred); 1810 cache = nfs_access_search_rbtree(inode, cred);
1790 if (cache == NULL) 1811 if (cache == NULL)
1791 goto out; 1812 goto out;
1792 if (!nfs_have_delegation(inode, FMODE_READ) && 1813 if (!nfs_have_delegated_attributes(inode) &&
1793 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 1814 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
1794 goto out_stale; 1815 goto out_stale;
1795 res->jiffies = cache->jiffies; 1816 res->jiffies = cache->jiffies;
@@ -1807,8 +1828,8 @@ out_stale:
1807 nfs_access_free_entry(cache); 1828 nfs_access_free_entry(cache);
1808 return -ENOENT; 1829 return -ENOENT;
1809out_zap: 1830out_zap:
1810 /* This will release the spinlock */ 1831 spin_unlock(&inode->i_lock);
1811 __nfs_access_zap_cache(inode); 1832 nfs_access_zap_cache(inode);
1812 return -ENOENT; 1833 return -ENOENT;
1813} 1834}
1814 1835
@@ -1863,9 +1884,11 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
1863 smp_mb__after_atomic_inc(); 1884 smp_mb__after_atomic_inc();
1864 1885
1865 /* Add inode to global LRU list */ 1886 /* Add inode to global LRU list */
1866 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 1887 if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
1867 spin_lock(&nfs_access_lru_lock); 1888 spin_lock(&nfs_access_lru_lock);
1868 list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); 1889 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1890 list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
1891 &nfs_access_lru_list);
1869 spin_unlock(&nfs_access_lru_lock); 1892 spin_unlock(&nfs_access_lru_lock);
1870 } 1893 }
1871} 1894}
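
The nfs_access_cache_shrinker() and nfs_access_zap_cache() rewrite above replaces the old drop-lock-and-restart loop (and its igrab/s_umount juggling) with a simpler discipline: unhook entries onto a private list while the locks are held, then free them with nfs_access_free_list() after the locks are dropped. A standalone sketch of that collect-then-free pattern, with a plain singly linked list and a pthread mutex standing in for the kernel list and spinlocks:

    /* Standalone sketch of "collect under lock, free outside the lock".
     * Names and the list type are stand-ins, not the kernel list API. */
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct entry {
        struct entry *next;
        int id;
    };

    static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct entry *cache_head;

    /* Unhook everything while holding the lock; defer all freeing so the
     * critical section stays short and free() never runs under the lock. */
    static struct entry *cache_detach_all(void)
    {
        struct entry *head;

        pthread_mutex_lock(&cache_lock);
        head = cache_head;
        cache_head = NULL;
        pthread_mutex_unlock(&cache_lock);
        return head;
    }

    static void free_list(struct entry *head)
    {
        while (head != NULL) {
            struct entry *next = head->next;
            printf("freeing entry %d\n", head->id);
            free(head);
            head = next;
        }
    }

    int main(void)
    {
        for (int i = 0; i < 3; i++) {
            struct entry *e = malloc(sizeof(*e));
            e->id = i;
            e->next = cache_head;
            cache_head = e;     /* single-threaded setup, no lock needed */
        }
        free_list(cache_detach_all());
        return 0;
    }
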
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0d289823e856..ad4cd31d6050 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -44,6 +44,7 @@
44#include <linux/file.h> 44#include <linux/file.h>
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h>
47 48
48#include <linux/nfs_fs.h> 49#include <linux/nfs_fs.h>
49#include <linux/nfs_page.h> 50#include <linux/nfs_page.h>
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 3f0cd4dfddaf..76fd235d0024 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -9,6 +9,7 @@
9#include <linux/hash.h> 9#include <linux/hash.h>
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/kmod.h> 11#include <linux/kmod.h>
12#include <linux/slab.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/socket.h> 14#include <linux/socket.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ae8d02294e46..cac96bcc91e4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -24,9 +24,9 @@
24#include <linux/nfs_fs.h> 24#include <linux/nfs_fs.h>
25#include <linux/nfs_mount.h> 25#include <linux/nfs_mount.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/slab.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
29#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/gfp.h>
30 30
31#include <asm/uaccess.h> 31#include <asm/uaccess.h>
32#include <asm/system.h> 32#include <asm/system.h>
@@ -161,14 +161,17 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
161 struct nfs_server *server = NFS_SERVER(inode); 161 struct nfs_server *server = NFS_SERVER(inode);
162 struct nfs_inode *nfsi = NFS_I(inode); 162 struct nfs_inode *nfsi = NFS_I(inode);
163 163
164 if (server->flags & NFS_MOUNT_NOAC) 164 if (nfs_have_delegated_attributes(inode))
165 goto force_reval; 165 goto out_noreval;
166
166 if (filp->f_flags & O_DIRECT) 167 if (filp->f_flags & O_DIRECT)
167 goto force_reval; 168 goto force_reval;
168 if (nfsi->npages != 0) 169 if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
169 return 0; 170 goto force_reval;
170 if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode)) 171 if (nfs_attribute_timeout(inode))
171 return 0; 172 goto force_reval;
173out_noreval:
174 return 0;
172force_reval: 175force_reval:
173 return __nfs_revalidate_inode(server, inode); 176 return __nfs_revalidate_inode(server, inode);
174} 177}
@@ -491,7 +494,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
491{ 494{
492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 495 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
493 496
494 if (gfp & __GFP_WAIT) 497 /* Only do I/O if gfp is a superset of GFP_KERNEL */
498 if ((gfp & GFP_KERNEL) == GFP_KERNEL)
495 nfs_wb_page(page->mapping->host, page); 499 nfs_wb_page(page->mapping->host, page);
496 /* If PagePrivate() is set, then the page is not freeable */ 500 /* If PagePrivate() is set, then the page is not freeable */
497 if (PagePrivate(page)) 501 if (PagePrivate(page))
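
nfs_release_page() above stops testing the single __GFP_WAIT bit and instead requires the caller's mask to be a superset of GFP_KERNEL before issuing writeback: a GFP_NOFS or GFP_NOIO caller may be allowed to sleep, yet must not re-enter the filesystem. A standalone sketch of the bitmask idiom; the flag values here are invented for illustration and do not match the kernel's:

    /* Standalone sketch of the "superset of GFP_KERNEL" test used above. */
    #include <stdio.h>

    #define __GFP_WAIT 0x01u
    #define __GFP_IO   0x02u
    #define __GFP_FS   0x04u
    #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
    #define GFP_NOFS   (__GFP_WAIT | __GFP_IO)

    static int may_do_io(unsigned int gfp)
    {
        /* True only if every bit of GFP_KERNEL is set in gfp: the caller
         * permits sleeping, block I/O, and re-entering the filesystem. */
        return (gfp & GFP_KERNEL) == GFP_KERNEL;
    }

    int main(void)
    {
        printf("GFP_KERNEL: %d\n", may_do_io(GFP_KERNEL)); /* 1 */
        printf("GFP_NOFS:   %d\n", may_do_io(GFP_NOFS));   /* 0: no __GFP_FS */
        printf("__GFP_WAIT: %d\n", may_do_io(__GFP_WAIT)); /* 0 */
        return 0;
    }
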
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 237874f1af23..ce153a6b3aec 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -17,6 +17,7 @@
17#include <linux/nfs_fs_sb.h> 17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h> 18#include <linux/in6.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/slab.h>
20 21
21#include "internal.h" 22#include "internal.h"
22#include "iostat.h" 23#include "iostat.h"
@@ -466,7 +467,8 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
466 struct list_head *pages, 467 struct list_head *pages,
467 unsigned *nr_pages) 468 unsigned *nr_pages)
468{ 469{
469 int ret, npages = *nr_pages; 470 unsigned npages = *nr_pages;
471 int ret;
470 472
471 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", 473 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
472 NFS_I(inode)->fscache, npages, inode); 474 NFS_I(inode)->fscache, npages, inode);
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b35d2a616066..7428f7d6273b 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -78,159 +78,94 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
78{ 78{
79 struct nfs_server *server = NFS_SB(sb); 79 struct nfs_server *server = NFS_SB(sb);
80 struct nfs_fsinfo fsinfo; 80 struct nfs_fsinfo fsinfo;
81 struct nfs_fattr fattr; 81 struct dentry *ret;
82 struct dentry *mntroot;
83 struct inode *inode; 82 struct inode *inode;
84 int error; 83 int error;
85 84
86 /* get the actual root for this mount */ 85 /* get the actual root for this mount */
87 fsinfo.fattr = &fattr; 86 fsinfo.fattr = nfs_alloc_fattr();
87 if (fsinfo.fattr == NULL)
88 return ERR_PTR(-ENOMEM);
88 89
89 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 90 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
90 if (error < 0) { 91 if (error < 0) {
91 dprintk("nfs_get_root: getattr error = %d\n", -error); 92 dprintk("nfs_get_root: getattr error = %d\n", -error);
92 return ERR_PTR(error); 93 ret = ERR_PTR(error);
94 goto out;
93 } 95 }
94 96
95 inode = nfs_fhget(sb, mntfh, fsinfo.fattr); 97 inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
96 if (IS_ERR(inode)) { 98 if (IS_ERR(inode)) {
97 dprintk("nfs_get_root: get root inode failed\n"); 99 dprintk("nfs_get_root: get root inode failed\n");
98 return ERR_CAST(inode); 100 ret = ERR_CAST(inode);
101 goto out;
99 } 102 }
100 103
101 error = nfs_superblock_set_dummy_root(sb, inode); 104 error = nfs_superblock_set_dummy_root(sb, inode);
102 if (error != 0) 105 if (error != 0) {
103 return ERR_PTR(error); 106 ret = ERR_PTR(error);
107 goto out;
108 }
104 109
105 /* root dentries normally start off anonymous and get spliced in later 110 /* root dentries normally start off anonymous and get spliced in later
106 * if the dentry tree reaches them; however if the dentry already 111 * if the dentry tree reaches them; however if the dentry already
107 * exists, we'll pick it up at this point and use it as the root 112 * exists, we'll pick it up at this point and use it as the root
108 */ 113 */
109 mntroot = d_obtain_alias(inode); 114 ret = d_obtain_alias(inode);
110 if (IS_ERR(mntroot)) { 115 if (IS_ERR(ret)) {
111 dprintk("nfs_get_root: get root dentry failed\n"); 116 dprintk("nfs_get_root: get root dentry failed\n");
112 return mntroot; 117 goto out;
113 } 118 }
114 119
115 security_d_instantiate(mntroot, inode); 120 security_d_instantiate(ret, inode);
116
117 if (!mntroot->d_op)
118 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
119 121
120 return mntroot; 122 if (ret->d_op == NULL)
123 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
124out:
125 nfs_free_fattr(fsinfo.fattr);
126 return ret;
121} 127}
122 128
123#ifdef CONFIG_NFS_V4 129#ifdef CONFIG_NFS_V4
124 130
125/* 131int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
126 * Do a simple pathwalk from the root FH of the server to the nominated target
127 * of the mountpoint
128 * - give error on symlinks
129 * - give error on ".." occurring in the path
130 * - follow traversals
131 */
132int nfs4_path_walk(struct nfs_server *server,
133 struct nfs_fh *mntfh,
134 const char *path)
135{ 132{
136 struct nfs_fsinfo fsinfo; 133 struct nfs_fsinfo fsinfo;
137 struct nfs_fattr fattr; 134 int ret = -ENOMEM;
138 struct nfs_fh lastfh;
139 struct qstr name;
140 int ret;
141 135
142 dprintk("--> nfs4_path_walk(,,%s)\n", path); 136 dprintk("--> nfs4_get_rootfh()\n");
143 137
144 fsinfo.fattr = &fattr; 138 fsinfo.fattr = nfs_alloc_fattr();
145 nfs_fattr_init(&fattr); 139 if (fsinfo.fattr == NULL)
146 140 goto out;
147 /* Eat leading slashes */
148 while (*path == '/')
149 path++;
150 141
151 /* Start by getting the root filehandle from the server */ 142 /* Start by getting the root filehandle from the server */
152 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 143 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
153 if (ret < 0) { 144 if (ret < 0) {
154 dprintk("nfs4_get_root: getroot error = %d\n", -ret); 145 dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
155 return ret; 146 goto out;
156 } 147 }
157 148
158 if (!S_ISDIR(fattr.mode)) { 149 if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_MODE)
159 printk(KERN_ERR "nfs4_get_root:" 150 || !S_ISDIR(fsinfo.fattr->mode)) {
151 printk(KERN_ERR "nfs4_get_rootfh:"
160 " getroot encountered non-directory\n"); 152 " getroot encountered non-directory\n");
161 return -ENOTDIR; 153 ret = -ENOTDIR;
154 goto out;
162 } 155 }
163 156
164 /* FIXME: It is quite valid for the server to return a referral here */ 157 if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
165 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { 158 printk(KERN_ERR "nfs4_get_rootfh:"
166 printk(KERN_ERR "nfs4_get_root:"
167 " getroot obtained referral\n"); 159 " getroot obtained referral\n");
168 return -EREMOTE; 160 ret = -EREMOTE;
169 } 161 goto out;
170
171next_component:
172 dprintk("Next: %s\n", path);
173
174 /* extract the next bit of the path */
175 if (!*path)
176 goto path_walk_complete;
177
178 name.name = path;
179 while (*path && *path != '/')
180 path++;
181 name.len = path - (const char *) name.name;
182
183 if (name.len > NFS4_MAXNAMLEN)
184 return -ENAMETOOLONG;
185
186eat_dot_dir:
187 while (*path == '/')
188 path++;
189
190 if (path[0] == '.' && (path[1] == '/' || !path[1])) {
191 path += 2;
192 goto eat_dot_dir;
193 }
194
195 /* FIXME: Why shouldn't the user be able to use ".." in the path? */
196 if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2])
197 ) {
198 printk(KERN_ERR "nfs4_get_root:"
199 " Mount path contains reference to \"..\"\n");
200 return -EINVAL;
201 } 162 }
202 163
203 /* lookup the next FH in the sequence */ 164 memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
204 memcpy(&lastfh, mntfh, sizeof(lastfh)); 165out:
205 166 nfs_free_fattr(fsinfo.fattr);
206 dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path); 167 dprintk("<-- nfs4_get_rootfh() = %d\n", ret);
207 168 return ret;
208 ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name,
209 mntfh, &fattr);
210 if (ret < 0) {
211 dprintk("nfs4_get_root: getroot error = %d\n", -ret);
212 return ret;
213 }
214
215 if (!S_ISDIR(fattr.mode)) {
216 printk(KERN_ERR "nfs4_get_root:"
217 " lookupfh encountered non-directory\n");
218 return -ENOTDIR;
219 }
220
221 /* FIXME: Referrals are quite valid here too */
222 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
223 printk(KERN_ERR "nfs4_get_root:"
224 " lookupfh obtained referral\n");
225 return -EREMOTE;
226 }
227
228 goto next_component;
229
230path_walk_complete:
231 memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
232 dprintk("<-- nfs4_path_walk() = 0\n");
233 return 0;
234} 169}
235 170
236/* 171/*
@@ -239,8 +174,8 @@ path_walk_complete:
239struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) 174struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
240{ 175{
241 struct nfs_server *server = NFS_SB(sb); 176 struct nfs_server *server = NFS_SB(sb);
242 struct nfs_fattr fattr; 177 struct nfs_fattr *fattr = NULL;
243 struct dentry *mntroot; 178 struct dentry *ret;
244 struct inode *inode; 179 struct inode *inode;
245 int error; 180 int error;
246 181
@@ -254,40 +189,50 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
254 return ERR_PTR(error); 189 return ERR_PTR(error);
255 } 190 }
256 191
192 fattr = nfs_alloc_fattr();
193 if (fattr == NULL)
194 return ERR_PTR(-ENOMEM);;
 194 return ERR_PTR(-ENOMEM);
195
257 /* get the actual root for this mount */ 196 /* get the actual root for this mount */
258 error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 197 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
259 if (error < 0) { 198 if (error < 0) {
260 dprintk("nfs_get_root: getattr error = %d\n", -error); 199 dprintk("nfs_get_root: getattr error = %d\n", -error);
261 return ERR_PTR(error); 200 ret = ERR_PTR(error);
201 goto out;
262 } 202 }
263 203
264 inode = nfs_fhget(sb, mntfh, &fattr); 204 inode = nfs_fhget(sb, mntfh, fattr);
265 if (IS_ERR(inode)) { 205 if (IS_ERR(inode)) {
266 dprintk("nfs_get_root: get root inode failed\n"); 206 dprintk("nfs_get_root: get root inode failed\n");
267 return ERR_CAST(inode); 207 ret = ERR_CAST(inode);
208 goto out;
268 } 209 }
269 210
270 error = nfs_superblock_set_dummy_root(sb, inode); 211 error = nfs_superblock_set_dummy_root(sb, inode);
271 if (error != 0) 212 if (error != 0) {
272 return ERR_PTR(error); 213 ret = ERR_PTR(error);
214 goto out;
215 }
273 216
274 /* root dentries normally start off anonymous and get spliced in later 217 /* root dentries normally start off anonymous and get spliced in later
275 * if the dentry tree reaches them; however if the dentry already 218 * if the dentry tree reaches them; however if the dentry already
276 * exists, we'll pick it up at this point and use it as the root 219 * exists, we'll pick it up at this point and use it as the root
277 */ 220 */
278 mntroot = d_obtain_alias(inode); 221 ret = d_obtain_alias(inode);
279 if (IS_ERR(mntroot)) { 222 if (IS_ERR(ret)) {
280 dprintk("nfs_get_root: get root dentry failed\n"); 223 dprintk("nfs_get_root: get root dentry failed\n");
281 return mntroot; 224 goto out;
282 } 225 }
283 226
284 security_d_instantiate(mntroot, inode); 227 security_d_instantiate(ret, inode);
285 228
286 if (!mntroot->d_op) 229 if (ret->d_op == NULL)
287 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; 230 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
288 231
232out:
233 nfs_free_fattr(fattr);
289 dprintk("<-- nfs4_get_root()\n"); 234 dprintk("<-- nfs4_get_root()\n");
290 return mntroot; 235 return ret;
291} 236}
292 237
293#endif /* CONFIG_NFS_V4 */ 238#endif /* CONFIG_NFS_V4 */
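
Besides deleting the manual pathwalk, nfs4_get_rootfh() above adds a check the old code skipped: fattr->mode is only meaningful if the server actually returned it, so the NFS_ATTR_FATTR_MODE validity bit must be tested before the S_ISDIR() check. A standalone sketch of validity-bit gating; the flag values and types are stand-ins:

    /* Standalone sketch of attribute-validity gating, not the NFS code. */
    #include <stdio.h>
    #include <errno.h>

    #define ATTR_MODE     0x1u
    #define ATTR_REFERRAL 0x2u
    #define S_IFDIR_BIT   0x4000u

    struct fattr {
        unsigned int valid;     /* which fields below are meaningful */
        unsigned int mode;
    };

    static int check_root(const struct fattr *f)
    {
        if (!(f->valid & ATTR_MODE) || !(f->mode & S_IFDIR_BIT))
            return -ENOTDIR;    /* mode missing, or not a directory */
        if (f->valid & ATTR_REFERRAL)
            return -EREMOTE;    /* the root is really a referral */
        return 0;
    }

    int main(void)
    {
        struct fattr dir  = { .valid = ATTR_MODE, .mode = S_IFDIR_BIT };
        struct fattr bare = { .valid = 0, .mode = S_IFDIR_BIT };

        printf("dir:  %d\n", check_root(&dir));   /* 0 */
        printf("bare: %d\n", check_root(&bare));  /* -ENOTDIR: unverified */
        return 0;
    }
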
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 657201acda84..099b3518feea 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h>
39 40
40#include <asm/system.h> 41#include <asm/system.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -392,8 +393,8 @@ int
392nfs_setattr(struct dentry *dentry, struct iattr *attr) 393nfs_setattr(struct dentry *dentry, struct iattr *attr)
393{ 394{
394 struct inode *inode = dentry->d_inode; 395 struct inode *inode = dentry->d_inode;
395 struct nfs_fattr fattr; 396 struct nfs_fattr *fattr;
396 int error; 397 int error = -ENOMEM;
397 398
398 nfs_inc_stats(inode, NFSIOS_VFSSETATTR); 399 nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
399 400
@@ -416,14 +417,20 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
416 filemap_write_and_wait(inode->i_mapping); 417 filemap_write_and_wait(inode->i_mapping);
417 nfs_wb_all(inode); 418 nfs_wb_all(inode);
418 } 419 }
420
421 fattr = nfs_alloc_fattr();
422 if (fattr == NULL)
423 goto out;
419 /* 424 /*
420 * Return any delegations if we're going to change ACLs 425 * Return any delegations if we're going to change ACLs
421 */ 426 */
422 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) 427 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
423 nfs_inode_return_delegation(inode); 428 nfs_inode_return_delegation(inode);
424 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); 429 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
425 if (error == 0) 430 if (error == 0)
426 nfs_refresh_inode(inode, &fattr); 431 nfs_refresh_inode(inode, fattr);
432 nfs_free_fattr(fattr);
433out:
427 return error; 434 return error;
428} 435}
429 436
@@ -622,10 +629,10 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
622 list_for_each_entry(pos, &nfsi->open_files, list) { 629 list_for_each_entry(pos, &nfsi->open_files, list) {
623 if (cred != NULL && pos->cred != cred) 630 if (cred != NULL && pos->cred != cred)
624 continue; 631 continue;
625 if ((pos->mode & mode) == mode) { 632 if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
626 ctx = get_nfs_open_context(pos); 633 continue;
627 break; 634 ctx = get_nfs_open_context(pos);
628 } 635 break;
629 } 636 }
630 spin_unlock(&inode->i_lock); 637 spin_unlock(&inode->i_lock);
631 return ctx; 638 return ctx;
@@ -681,7 +688,7 @@ int
681__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 688__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
682{ 689{
683 int status = -ESTALE; 690 int status = -ESTALE;
684 struct nfs_fattr fattr; 691 struct nfs_fattr *fattr = NULL;
685 struct nfs_inode *nfsi = NFS_I(inode); 692 struct nfs_inode *nfsi = NFS_I(inode);
686 693
687 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", 694 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
@@ -692,8 +699,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
692 if (NFS_STALE(inode)) 699 if (NFS_STALE(inode))
693 goto out; 700 goto out;
694 701
702 status = -ENOMEM;
703 fattr = nfs_alloc_fattr();
704 if (fattr == NULL)
705 goto out;
706
695 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 707 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
696 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 708 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
697 if (status != 0) { 709 if (status != 0) {
698 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", 710 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
699 inode->i_sb->s_id, 711 inode->i_sb->s_id,
@@ -706,7 +718,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
706 goto out; 718 goto out;
707 } 719 }
708 720
709 status = nfs_refresh_inode(inode, &fattr); 721 status = nfs_refresh_inode(inode, fattr);
710 if (status) { 722 if (status) {
711 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 723 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
712 inode->i_sb->s_id, 724 inode->i_sb->s_id,
@@ -722,6 +734,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
722 (long long)NFS_FILEID(inode)); 734 (long long)NFS_FILEID(inode));
723 735
724 out: 736 out:
737 nfs_free_fattr(fattr);
725 return status; 738 return status;
726} 739}
727 740
@@ -729,11 +742,16 @@ int nfs_attribute_timeout(struct inode *inode)
729{ 742{
730 struct nfs_inode *nfsi = NFS_I(inode); 743 struct nfs_inode *nfsi = NFS_I(inode);
731 744
732 if (nfs_have_delegation(inode, FMODE_READ))
733 return 0;
734 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 745 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
735} 746}
736 747
748static int nfs_attribute_cache_expired(struct inode *inode)
749{
750 if (nfs_have_delegated_attributes(inode))
751 return 0;
752 return nfs_attribute_timeout(inode);
753}
754
737/** 755/**
738 * nfs_revalidate_inode - Revalidate the inode attributes 756 * nfs_revalidate_inode - Revalidate the inode attributes
739 * @server - pointer to nfs_server struct 757 * @server - pointer to nfs_server struct
@@ -744,7 +762,7 @@ int nfs_attribute_timeout(struct inode *inode)
744int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 762int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
745{ 763{
746 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) 764 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
747 && !nfs_attribute_timeout(inode)) 765 && !nfs_attribute_cache_expired(inode))
748 return NFS_STALE(inode) ? -ESTALE : 0; 766 return NFS_STALE(inode) ? -ESTALE : 0;
749 return __nfs_revalidate_inode(server, inode); 767 return __nfs_revalidate_inode(server, inode);
750} 768}
@@ -781,7 +799,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
781 int ret = 0; 799 int ret = 0;
782 800
783 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) 801 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
784 || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { 802 || nfs_attribute_cache_expired(inode)
803 || NFS_STALE(inode)) {
785 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 804 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
786 if (ret < 0) 805 if (ret < 0)
787 goto out; 806 goto out;
@@ -915,6 +934,26 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
915 fattr->gencount = nfs_inc_attr_generation_counter(); 934 fattr->gencount = nfs_inc_attr_generation_counter();
916} 935}
917 936
937struct nfs_fattr *nfs_alloc_fattr(void)
938{
939 struct nfs_fattr *fattr;
940
941 fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
942 if (fattr != NULL)
943 nfs_fattr_init(fattr);
944 return fattr;
945}
946
947struct nfs_fh *nfs_alloc_fhandle(void)
948{
949 struct nfs_fh *fh;
950
951 fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
952 if (fh != NULL)
953 fh->size = 0;
954 return fh;
955}
956
918/** 957/**
919 * nfs_inode_attrs_need_update - check if the inode attributes need updating 958 * nfs_inode_attrs_need_update - check if the inode attributes need updating
920 * @inode - pointer to inode 959 * @inode - pointer to inode
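
The fs/nfs/inode.c hunks above introduce the nfs_alloc_fattr()/nfs_alloc_fhandle() allocators (GFP_NOFS, pre-initialised) and split revalidation in two: nfs_attribute_timeout() now reports only the raw timer, while nfs_attribute_cache_expired() lets a read delegation suppress the timeout unless NFS_INO_REVAL_FORCED is set. A standalone sketch of that predicate layering; the types and flag are illustrative stand-ins:

    /* Standalone sketch of the layered revalidation predicate above. */
    #include <stdio.h>
    #include <stdbool.h>

    #define REVAL_FORCED 0x1u

    struct inode_state {
        bool has_read_delegation;
        unsigned int cache_validity;
        unsigned long jiffies_now, read_cache_jiffies, attrtimeo;
    };

    static bool attribute_timeout(const struct inode_state *s)
    {
        /* expired if "now" lies outside [read_cache_jiffies, +attrtimeo] */
        return !(s->jiffies_now - s->read_cache_jiffies <= s->attrtimeo);
    }

    static bool have_delegated_attributes(const struct inode_state *s)
    {
        return s->has_read_delegation &&
               !(s->cache_validity & REVAL_FORCED);
    }

    static bool attribute_cache_expired(const struct inode_state *s)
    {
        if (have_delegated_attributes(s))
            return false;       /* delegation keeps the cache authoritative */
        return attribute_timeout(s);
    }

    int main(void)
    {
        struct inode_state s = {
            .has_read_delegation = true,
            .jiffies_now = 1000, .read_cache_jiffies = 0, .attrtimeo = 100,
        };
        printf("delegated: expired=%d\n", attribute_cache_expired(&s)); /* 0 */
        s.cache_validity |= REVAL_FORCED;
        printf("forced:    expired=%d\n", attribute_cache_expired(&s)); /* 1 */
        return 0;
    }
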
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 11f82f03c5de..d8bd619e386c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -244,9 +244,7 @@ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
244#ifdef CONFIG_NFS_V4 244#ifdef CONFIG_NFS_V4
245extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); 245extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *);
246 246
247extern int nfs4_path_walk(struct nfs_server *server, 247extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
248 struct nfs_fh *mntfh,
249 const char *path);
250#endif 248#endif
251 249
252/* read.c */ 250/* read.c */
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 1d8d5c813b01..c5832487c456 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -36,14 +36,14 @@ static inline void nfs_inc_stats(const struct inode *inode,
36 36
37static inline void nfs_add_server_stats(const struct nfs_server *server, 37static inline void nfs_add_server_stats(const struct nfs_server *server,
38 enum nfs_stat_bytecounters stat, 38 enum nfs_stat_bytecounters stat,
39 unsigned long addend) 39 long addend)
40{ 40{
41 this_cpu_add(server->io_stats->bytes[stat], addend); 41 this_cpu_add(server->io_stats->bytes[stat], addend);
42} 42}
43 43
44static inline void nfs_add_stats(const struct inode *inode, 44static inline void nfs_add_stats(const struct inode *inode,
45 enum nfs_stat_bytecounters stat, 45 enum nfs_stat_bytecounters stat,
46 unsigned long addend) 46 long addend)
47{ 47{
48 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 48 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
49} 49}
@@ -51,7 +51,7 @@ static inline void nfs_add_stats(const struct inode *inode,
51#ifdef CONFIG_NFS_FSCACHE 51#ifdef CONFIG_NFS_FSCACHE
52static inline void nfs_add_fscache_stats(struct inode *inode, 52static inline void nfs_add_fscache_stats(struct inode *inode,
53 enum nfs_stat_fscachecounters stat, 53 enum nfs_stat_fscachecounters stat,
54 unsigned long addend) 54 long addend)
55{ 55{
56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); 56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
57} 57}
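
The fs/nfs/iostat.h change above is small but deliberate: the addend parameters become signed long so callers can pass negative deltas (for example, backing bytes out after a failed transfer) through the same per-cpu accounting helpers. A standalone sketch of why the signedness matters:

    /* Standalone sketch of signed stat deltas; illustrative only. */
    #include <stdio.h>

    static long long bytes_read;

    static void add_stats(long addend)
    {
        bytes_read += addend;   /* signed: works for credits and debits */
    }

    int main(void)
    {
        add_stats(4096);
        add_stats(-512);        /* an unsigned long parameter would turn
                                 * this into a huge positive value */
        printf("bytes_read = %lld\n", bytes_read);  /* 3584 */
        return 0;
    }
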
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 40c766782891..db6aa3673cf3 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/dcache.h> 10#include <linux/dcache.h>
11#include <linux/gfp.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/namei.h> 13#include <linux/namei.h>
13#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
@@ -104,8 +105,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
104 struct vfsmount *mnt; 105 struct vfsmount *mnt;
105 struct nfs_server *server = NFS_SERVER(dentry->d_inode); 106 struct nfs_server *server = NFS_SERVER(dentry->d_inode);
106 struct dentry *parent; 107 struct dentry *parent;
107 struct nfs_fh fh; 108 struct nfs_fh *fh = NULL;
108 struct nfs_fattr fattr; 109 struct nfs_fattr *fattr = NULL;
109 int err; 110 int err;
110 111
111 dprintk("--> nfs_follow_mountpoint()\n"); 112 dprintk("--> nfs_follow_mountpoint()\n");
@@ -114,6 +115,12 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
114 if (IS_ROOT(dentry)) 115 if (IS_ROOT(dentry))
115 goto out_err; 116 goto out_err;
116 117
118 err = -ENOMEM;
119 fh = nfs_alloc_fhandle();
120 fattr = nfs_alloc_fattr();
121 if (fh == NULL || fattr == NULL)
122 goto out_err;
123
117 dprintk("%s: enter\n", __func__); 124 dprintk("%s: enter\n", __func__);
118 dput(nd->path.dentry); 125 dput(nd->path.dentry);
119 nd->path.dentry = dget(dentry); 126 nd->path.dentry = dget(dentry);
@@ -122,16 +129,16 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
122 parent = dget_parent(nd->path.dentry); 129 parent = dget_parent(nd->path.dentry);
123 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 130 err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
124 &nd->path.dentry->d_name, 131 &nd->path.dentry->d_name,
125 &fh, &fattr); 132 fh, fattr);
126 dput(parent); 133 dput(parent);
127 if (err != 0) 134 if (err != 0)
128 goto out_err; 135 goto out_err;
129 136
130 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) 137 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
131 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); 138 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
132 else 139 else
133 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh, 140 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
134 &fattr); 141 fattr);
135 err = PTR_ERR(mnt); 142 err = PTR_ERR(mnt);
136 if (IS_ERR(mnt)) 143 if (IS_ERR(mnt))
137 goto out_err; 144 goto out_err;
@@ -150,6 +157,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
150 nd->path.dentry = dget(mnt->mnt_root); 157 nd->path.dentry = dget(mnt->mnt_root);
151 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); 158 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
152out: 159out:
160 nfs_free_fattr(fattr);
161 nfs_free_fhandle(fh);
153 dprintk("%s: done, returned %d\n", __func__, err); 162 dprintk("%s: done, returned %d\n", __func__, err);
154 163
155 dprintk("<-- nfs_follow_mountpoint() = %d\n", err); 164 dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 7bc2da8efd4a..81cf14257916 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -12,7 +12,6 @@
12#include <linux/param.h> 12#include <linux/param.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/errno.h> 15#include <linux/errno.h>
17#include <linux/string.h> 16#include <linux/string.h>
18#include <linux/in.h> 17#include <linux/in.h>
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index bac60515a4b3..9f88c5f4c7e2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -1,4 +1,5 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/gfp.h>
2#include <linux/nfs.h> 3#include <linux/nfs.h>
3#include <linux/nfs3.h> 4#include <linux/nfs3.h>
4#include <linux/nfs_fs.h> 5#include <linux/nfs_fs.h>
@@ -184,7 +185,6 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
184struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) 185struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
185{ 186{
186 struct nfs_server *server = NFS_SERVER(inode); 187 struct nfs_server *server = NFS_SERVER(inode);
187 struct nfs_fattr fattr;
188 struct page *pages[NFSACL_MAXPAGES] = { }; 188 struct page *pages[NFSACL_MAXPAGES] = { };
189 struct nfs3_getaclargs args = { 189 struct nfs3_getaclargs args = {
190 .fh = NFS_FH(inode), 190 .fh = NFS_FH(inode),
@@ -192,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
192 .pages = pages, 192 .pages = pages,
193 }; 193 };
194 struct nfs3_getaclres res = { 194 struct nfs3_getaclres res = {
195 .fattr = &fattr, 195 0
196 }; 196 };
197 struct rpc_message msg = { 197 struct rpc_message msg = {
198 .rpc_argp = &args, 198 .rpc_argp = &args,
@@ -227,7 +227,10 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
227 227
228 dprintk("NFS call getacl\n"); 228 dprintk("NFS call getacl\n");
229 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; 229 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
230 nfs_fattr_init(&fattr); 230 res.fattr = nfs_alloc_fattr();
231 if (res.fattr == NULL)
232 return ERR_PTR(-ENOMEM);
233
231 status = rpc_call_sync(server->client_acl, &msg, 0); 234 status = rpc_call_sync(server->client_acl, &msg, 0);
232 dprintk("NFS reply getacl: %d\n", status); 235 dprintk("NFS reply getacl: %d\n", status);
233 236
@@ -237,7 +240,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
237 240
238 switch (status) { 241 switch (status) {
239 case 0: 242 case 0:
240 status = nfs_refresh_inode(inode, &fattr); 243 status = nfs_refresh_inode(inode, res.fattr);
241 break; 244 break;
242 case -EPFNOSUPPORT: 245 case -EPFNOSUPPORT:
243 case -EPROTONOSUPPORT: 246 case -EPROTONOSUPPORT:
@@ -277,6 +280,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
277getout: 280getout:
278 posix_acl_release(res.acl_access); 281 posix_acl_release(res.acl_access);
279 posix_acl_release(res.acl_default); 282 posix_acl_release(res.acl_default);
283 nfs_free_fattr(res.fattr);
280 284
281 if (status != 0) { 285 if (status != 0) {
282 posix_acl_release(acl); 286 posix_acl_release(acl);
@@ -289,7 +293,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
289 struct posix_acl *dfacl) 293 struct posix_acl *dfacl)
290{ 294{
291 struct nfs_server *server = NFS_SERVER(inode); 295 struct nfs_server *server = NFS_SERVER(inode);
292 struct nfs_fattr fattr; 296 struct nfs_fattr *fattr;
293 struct page *pages[NFSACL_MAXPAGES]; 297 struct page *pages[NFSACL_MAXPAGES];
294 struct nfs3_setaclargs args = { 298 struct nfs3_setaclargs args = {
295 .inode = inode, 299 .inode = inode,
@@ -334,8 +338,13 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
334 } 338 }
335 339
336 dprintk("NFS call setacl\n"); 340 dprintk("NFS call setacl\n");
341 status = -ENOMEM;
342 fattr = nfs_alloc_fattr();
343 if (fattr == NULL)
344 goto out_freepages;
345
337 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; 346 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
338 nfs_fattr_init(&fattr); 347 msg.rpc_resp = fattr;
339 status = rpc_call_sync(server->client_acl, &msg, 0); 348 status = rpc_call_sync(server->client_acl, &msg, 0);
340 nfs_access_zap_cache(inode); 349 nfs_access_zap_cache(inode);
341 nfs_zap_acl_cache(inode); 350 nfs_zap_acl_cache(inode);
@@ -343,7 +352,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
343 352
344 switch (status) { 353 switch (status) {
345 case 0: 354 case 0:
346 status = nfs_refresh_inode(inode, &fattr); 355 status = nfs_refresh_inode(inode, fattr);
347 nfs3_cache_acls(inode, acl, dfacl); 356 nfs3_cache_acls(inode, acl, dfacl);
348 break; 357 break;
349 case -EPFNOSUPPORT: 358 case -EPFNOSUPPORT:
@@ -354,6 +363,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
354 case -ENOTSUPP: 363 case -ENOTSUPP:
355 status = -EOPNOTSUPP; 364 status = -EOPNOTSUPP;
356 } 365 }
366 nfs_free_fattr(fattr);
357out_freepages: 367out_freepages:
358 while (args.npages != 0) { 368 while (args.npages != 0) {
359 args.npages--; 369 args.npages--;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 24992f0a29f2..fabb4f2849a1 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -10,6 +10,7 @@
10#include <linux/errno.h> 10#include <linux/errno.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/sunrpc/clnt.h> 12#include <linux/sunrpc/clnt.h>
13#include <linux/slab.h>
13#include <linux/nfs.h> 14#include <linux/nfs.h>
14#include <linux/nfs3.h> 15#include <linux/nfs3.h>
15#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
@@ -143,14 +144,12 @@ static int
143nfs3_proc_lookup(struct inode *dir, struct qstr *name, 144nfs3_proc_lookup(struct inode *dir, struct qstr *name,
144 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 145 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
145{ 146{
146 struct nfs_fattr dir_attr;
147 struct nfs3_diropargs arg = { 147 struct nfs3_diropargs arg = {
148 .fh = NFS_FH(dir), 148 .fh = NFS_FH(dir),
149 .name = name->name, 149 .name = name->name,
150 .len = name->len 150 .len = name->len
151 }; 151 };
152 struct nfs3_diropres res = { 152 struct nfs3_diropres res = {
153 .dir_attr = &dir_attr,
154 .fh = fhandle, 153 .fh = fhandle,
155 .fattr = fattr 154 .fattr = fattr
156 }; 155 };
@@ -162,29 +161,30 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
162 int status; 161 int status;
163 162
164 dprintk("NFS call lookup %s\n", name->name); 163 dprintk("NFS call lookup %s\n", name->name);
165 nfs_fattr_init(&dir_attr); 164 res.dir_attr = nfs_alloc_fattr();
165 if (res.dir_attr == NULL)
166 return -ENOMEM;
167
166 nfs_fattr_init(fattr); 168 nfs_fattr_init(fattr);
167 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 169 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
168 nfs_refresh_inode(dir, &dir_attr); 170 nfs_refresh_inode(dir, res.dir_attr);
169 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { 171 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
170 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; 172 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
171 msg.rpc_argp = fhandle; 173 msg.rpc_argp = fhandle;
172 msg.rpc_resp = fattr; 174 msg.rpc_resp = fattr;
173 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 175 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
174 } 176 }
177 nfs_free_fattr(res.dir_attr);
175 dprintk("NFS reply lookup: %d\n", status); 178 dprintk("NFS reply lookup: %d\n", status);
176 return status; 179 return status;
177} 180}
178 181
179static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) 182static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
180{ 183{
181 struct nfs_fattr fattr;
182 struct nfs3_accessargs arg = { 184 struct nfs3_accessargs arg = {
183 .fh = NFS_FH(inode), 185 .fh = NFS_FH(inode),
184 }; 186 };
185 struct nfs3_accessres res = { 187 struct nfs3_accessres res;
186 .fattr = &fattr,
187 };
188 struct rpc_message msg = { 188 struct rpc_message msg = {
189 .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], 189 .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS],
190 .rpc_argp = &arg, 190 .rpc_argp = &arg,
@@ -192,7 +192,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
192 .rpc_cred = entry->cred, 192 .rpc_cred = entry->cred,
193 }; 193 };
194 int mode = entry->mask; 194 int mode = entry->mask;
195 int status; 195 int status = -ENOMEM;
196 196
197 dprintk("NFS call access\n"); 197 dprintk("NFS call access\n");
198 198
@@ -209,9 +209,13 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
209 if (mode & MAY_EXEC) 209 if (mode & MAY_EXEC)
210 arg.access |= NFS3_ACCESS_EXECUTE; 210 arg.access |= NFS3_ACCESS_EXECUTE;
211 } 211 }
212 nfs_fattr_init(&fattr); 212
213 res.fattr = nfs_alloc_fattr();
214 if (res.fattr == NULL)
215 goto out;
216
213 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 217 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
214 nfs_refresh_inode(inode, &fattr); 218 nfs_refresh_inode(inode, res.fattr);
215 if (status == 0) { 219 if (status == 0) {
216 entry->mask = 0; 220 entry->mask = 0;
217 if (res.access & NFS3_ACCESS_READ) 221 if (res.access & NFS3_ACCESS_READ)
@@ -221,6 +225,8 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
221 if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) 225 if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
222 entry->mask |= MAY_EXEC; 226 entry->mask |= MAY_EXEC;
223 } 227 }
228 nfs_free_fattr(res.fattr);
229out:
224 dprintk("NFS reply access: %d\n", status); 230 dprintk("NFS reply access: %d\n", status);
225 return status; 231 return status;
226} 232}
@@ -228,7 +234,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
228static int nfs3_proc_readlink(struct inode *inode, struct page *page, 234static int nfs3_proc_readlink(struct inode *inode, struct page *page,
229 unsigned int pgbase, unsigned int pglen) 235 unsigned int pgbase, unsigned int pglen)
230{ 236{
231 struct nfs_fattr fattr; 237 struct nfs_fattr *fattr;
232 struct nfs3_readlinkargs args = { 238 struct nfs3_readlinkargs args = {
233 .fh = NFS_FH(inode), 239 .fh = NFS_FH(inode),
234 .pgbase = pgbase, 240 .pgbase = pgbase,
@@ -238,14 +244,19 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
238 struct rpc_message msg = { 244 struct rpc_message msg = {
239 .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], 245 .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK],
240 .rpc_argp = &args, 246 .rpc_argp = &args,
241 .rpc_resp = &fattr,
242 }; 247 };
243 int status; 248 int status = -ENOMEM;
244 249
245 dprintk("NFS call readlink\n"); 250 dprintk("NFS call readlink\n");
246 nfs_fattr_init(&fattr); 251 fattr = nfs_alloc_fattr();
252 if (fattr == NULL)
253 goto out;
254 msg.rpc_resp = fattr;
255
247 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 256 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
248 nfs_refresh_inode(inode, &fattr); 257 nfs_refresh_inode(inode, fattr);
258 nfs_free_fattr(fattr);
259out:
249 dprintk("NFS reply readlink: %d\n", status); 260 dprintk("NFS reply readlink: %d\n", status);
250 return status; 261 return status;
251} 262}
@@ -395,12 +406,17 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)
395 .rpc_argp = &arg, 406 .rpc_argp = &arg,
396 .rpc_resp = &res, 407 .rpc_resp = &res,
397 }; 408 };
398 int status; 409 int status = -ENOMEM;
399 410
400 dprintk("NFS call remove %s\n", name->name); 411 dprintk("NFS call remove %s\n", name->name);
401 nfs_fattr_init(&res.dir_attr); 412 res.dir_attr = nfs_alloc_fattr();
413 if (res.dir_attr == NULL)
414 goto out;
415
402 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 416 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
403 nfs_post_op_update_inode(dir, &res.dir_attr); 417 nfs_post_op_update_inode(dir, res.dir_attr);
418 nfs_free_fattr(res.dir_attr);
419out:
404 dprintk("NFS reply remove: %d\n", status); 420 dprintk("NFS reply remove: %d\n", status);
405 return status; 421 return status;
406} 422}
@@ -418,7 +434,7 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
418 if (nfs3_async_handle_jukebox(task, dir)) 434 if (nfs3_async_handle_jukebox(task, dir))
419 return 0; 435 return 0;
420 res = task->tk_msg.rpc_resp; 436 res = task->tk_msg.rpc_resp;
421 nfs_post_op_update_inode(dir, &res->dir_attr); 437 nfs_post_op_update_inode(dir, res->dir_attr);
422 return 1; 438 return 1;
423} 439}
424 440
@@ -426,7 +442,6 @@ static int
426nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, 442nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
427 struct inode *new_dir, struct qstr *new_name) 443 struct inode *new_dir, struct qstr *new_name)
428{ 444{
429 struct nfs_fattr old_dir_attr, new_dir_attr;
430 struct nfs3_renameargs arg = { 445 struct nfs3_renameargs arg = {
431 .fromfh = NFS_FH(old_dir), 446 .fromfh = NFS_FH(old_dir),
432 .fromname = old_name->name, 447 .fromname = old_name->name,
@@ -435,23 +450,27 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
435 .toname = new_name->name, 450 .toname = new_name->name,
436 .tolen = new_name->len 451 .tolen = new_name->len
437 }; 452 };
438 struct nfs3_renameres res = { 453 struct nfs3_renameres res;
439 .fromattr = &old_dir_attr,
440 .toattr = &new_dir_attr
441 };
442 struct rpc_message msg = { 454 struct rpc_message msg = {
443 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], 455 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
444 .rpc_argp = &arg, 456 .rpc_argp = &arg,
445 .rpc_resp = &res, 457 .rpc_resp = &res,
446 }; 458 };
447 int status; 459 int status = -ENOMEM;
448 460
449 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 461 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
450 nfs_fattr_init(&old_dir_attr); 462
451 nfs_fattr_init(&new_dir_attr); 463 res.fromattr = nfs_alloc_fattr();
464 res.toattr = nfs_alloc_fattr();
465 if (res.fromattr == NULL || res.toattr == NULL)
466 goto out;
467
452 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); 468 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
453 nfs_post_op_update_inode(old_dir, &old_dir_attr); 469 nfs_post_op_update_inode(old_dir, res.fromattr);
454 nfs_post_op_update_inode(new_dir, &new_dir_attr); 470 nfs_post_op_update_inode(new_dir, res.toattr);
471out:
472 nfs_free_fattr(res.toattr);
473 nfs_free_fattr(res.fromattr);
455 dprintk("NFS reply rename: %d\n", status); 474 dprintk("NFS reply rename: %d\n", status);
456 return status; 475 return status;
457} 476}
@@ -459,30 +478,32 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
459static int 478static int
460nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 479nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
461{ 480{
462 struct nfs_fattr dir_attr, fattr;
463 struct nfs3_linkargs arg = { 481 struct nfs3_linkargs arg = {
464 .fromfh = NFS_FH(inode), 482 .fromfh = NFS_FH(inode),
465 .tofh = NFS_FH(dir), 483 .tofh = NFS_FH(dir),
466 .toname = name->name, 484 .toname = name->name,
467 .tolen = name->len 485 .tolen = name->len
468 }; 486 };
469 struct nfs3_linkres res = { 487 struct nfs3_linkres res;
470 .dir_attr = &dir_attr,
471 .fattr = &fattr
472 };
473 struct rpc_message msg = { 488 struct rpc_message msg = {
474 .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], 489 .rpc_proc = &nfs3_procedures[NFS3PROC_LINK],
475 .rpc_argp = &arg, 490 .rpc_argp = &arg,
476 .rpc_resp = &res, 491 .rpc_resp = &res,
477 }; 492 };
478 int status; 493 int status = -ENOMEM;
479 494
480 dprintk("NFS call link %s\n", name->name); 495 dprintk("NFS call link %s\n", name->name);
481 nfs_fattr_init(&dir_attr); 496 res.fattr = nfs_alloc_fattr();
482 nfs_fattr_init(&fattr); 497 res.dir_attr = nfs_alloc_fattr();
498 if (res.fattr == NULL || res.dir_attr == NULL)
499 goto out;
500
483 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 501 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
484 nfs_post_op_update_inode(dir, &dir_attr); 502 nfs_post_op_update_inode(dir, res.dir_attr);
485 nfs_post_op_update_inode(inode, &fattr); 503 nfs_post_op_update_inode(inode, res.fattr);
504out:
505 nfs_free_fattr(res.dir_attr);
506 nfs_free_fattr(res.fattr);
486 dprintk("NFS reply link: %d\n", status); 507 dprintk("NFS reply link: %d\n", status);
487 return status; 508 return status;
488} 509}
@@ -553,7 +574,7 @@ out:
553static int 574static int
554nfs3_proc_rmdir(struct inode *dir, struct qstr *name) 575nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
555{ 576{
556 struct nfs_fattr dir_attr; 577 struct nfs_fattr *dir_attr;
557 struct nfs3_diropargs arg = { 578 struct nfs3_diropargs arg = {
558 .fh = NFS_FH(dir), 579 .fh = NFS_FH(dir),
559 .name = name->name, 580 .name = name->name,
@@ -562,14 +583,19 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
562 struct rpc_message msg = { 583 struct rpc_message msg = {
563 .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], 584 .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR],
564 .rpc_argp = &arg, 585 .rpc_argp = &arg,
565 .rpc_resp = &dir_attr,
566 }; 586 };
567 int status; 587 int status = -ENOMEM;
568 588
569 dprintk("NFS call rmdir %s\n", name->name); 589 dprintk("NFS call rmdir %s\n", name->name);
570 nfs_fattr_init(&dir_attr); 590 dir_attr = nfs_alloc_fattr();
591 if (dir_attr == NULL)
592 goto out;
593
594 msg.rpc_resp = dir_attr;
571 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 595 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
572 nfs_post_op_update_inode(dir, &dir_attr); 596 nfs_post_op_update_inode(dir, dir_attr);
597 nfs_free_fattr(dir_attr);
598out:
573 dprintk("NFS reply rmdir: %d\n", status); 599 dprintk("NFS reply rmdir: %d\n", status);
574 return status; 600 return status;
575} 601}
@@ -588,7 +614,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
588 u64 cookie, struct page *page, unsigned int count, int plus) 614 u64 cookie, struct page *page, unsigned int count, int plus)
589{ 615{
590 struct inode *dir = dentry->d_inode; 616 struct inode *dir = dentry->d_inode;
591 struct nfs_fattr dir_attr;
592 __be32 *verf = NFS_COOKIEVERF(dir); 617 __be32 *verf = NFS_COOKIEVERF(dir);
593 struct nfs3_readdirargs arg = { 618 struct nfs3_readdirargs arg = {
594 .fh = NFS_FH(dir), 619 .fh = NFS_FH(dir),
@@ -599,7 +624,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
599 .pages = &page 624 .pages = &page
600 }; 625 };
601 struct nfs3_readdirres res = { 626 struct nfs3_readdirres res = {
602 .dir_attr = &dir_attr,
603 .verf = verf, 627 .verf = verf,
604 .plus = plus 628 .plus = plus
605 }; 629 };
@@ -609,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
609 .rpc_resp = &res, 633 .rpc_resp = &res,
610 .rpc_cred = cred 634 .rpc_cred = cred
611 }; 635 };
612 int status; 636 int status = -ENOMEM;
613 637
614 if (plus) 638 if (plus)
615 msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; 639 msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS];
@@ -617,12 +641,17 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
617 dprintk("NFS call readdir%s %d\n", 641 dprintk("NFS call readdir%s %d\n",
618 plus? "plus" : "", (unsigned int) cookie); 642 plus? "plus" : "", (unsigned int) cookie);
619 643
620 nfs_fattr_init(&dir_attr); 644 res.dir_attr = nfs_alloc_fattr();
645 if (res.dir_attr == NULL)
646 goto out;
647
621 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 648 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
622 649
623 nfs_invalidate_atime(dir); 650 nfs_invalidate_atime(dir);
651 nfs_refresh_inode(dir, res.dir_attr);
624 652
625 nfs_refresh_inode(dir, &dir_attr); 653 nfs_free_fattr(res.dir_attr);
654out:
626 dprintk("NFS reply readdir: %d\n", status); 655 dprintk("NFS reply readdir: %d\n", status);
627 return status; 656 return status;
628} 657}
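[Note] Most of the nfs3proc.c conversions above share one more detail: status is initialized to -ENOMEM before the allocation, so every early goto out after a failed nfs_alloc_fattr() returns the right error with no assignment at each failure site. A compilable model of that idiom, with illustrative names rather than the kernel API:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct fattr { long valid; };

static int proc_rmdir(const char *name)
{
	struct fattr *dir_attr;
	int status = -ENOMEM;      /* covers every goto before the call */

	dir_attr = calloc(1, sizeof(*dir_attr));
	if (dir_attr == NULL)
		goto out;          /* returns -ENOMEM without an extra assignment */

	status = 0;                /* stands in for rpc_call_sync() succeeding */
	free(dir_attr);
out:
	printf("rmdir %s: %d\n", name, status);
	return status;
}

int main(void)
{
	return proc_rmdir("testdir") ? 1 : 0;
}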
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 5fe5492fbd29..75dcfc7da365 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -9,7 +9,6 @@
9#include <linux/param.h> 9#include <linux/param.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
14#include <linux/string.h> 13#include <linux/string.h>
15#include <linux/in.h> 14#include <linux/in.h>
@@ -763,7 +762,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
763static int 762static int
764nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) 763nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res)
765{ 764{
766 return nfs3_xdr_wccstat(req, p, &res->dir_attr); 765 return nfs3_xdr_wccstat(req, p, res->dir_attr);
767} 766}
768 767
769/* 768/*
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a187200a7aac..c538c6106e16 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -206,14 +206,14 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
206 206
207 207
208/* nfs4proc.c */ 208/* nfs4proc.c */
209extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); 209extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
210extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); 210extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
211extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); 211extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
212extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); 212extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
213extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 213extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
214extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 214extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
215extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 215extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
216extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); 216extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
217extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 217extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
218extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 218extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
219extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 219extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
@@ -286,7 +286,7 @@ extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
288 288
289extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); 289extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
290extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 290extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
291extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); 291extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
292extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); 292extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index fa3408f20112..3c2a1724fbd2 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
14#include <linux/slab.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
16#include <linux/vfs.h> 17#include <linux/vfs.h>
@@ -114,6 +115,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
114 char *page, char *page2, 115 char *page, char *page2,
115 const struct nfs4_fs_location *location) 116 const struct nfs4_fs_location *location)
116{ 117{
118 const size_t addr_bufsize = sizeof(struct sockaddr_storage);
117 struct vfsmount *mnt = ERR_PTR(-ENOENT); 119 struct vfsmount *mnt = ERR_PTR(-ENOENT);
118 char *mnt_path; 120 char *mnt_path;
119 unsigned int maxbuflen; 121 unsigned int maxbuflen;
@@ -125,9 +127,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
125 mountdata->mnt_path = mnt_path; 127 mountdata->mnt_path = mnt_path;
126 maxbuflen = mnt_path - 1 - page2; 128 maxbuflen = mnt_path - 1 - page2;
127 129
130 mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL);
131 if (mountdata->addr == NULL)
132 return ERR_PTR(-ENOMEM);
133
128 for (s = 0; s < location->nservers; s++) { 134 for (s = 0; s < location->nservers; s++) {
129 const struct nfs4_string *buf = &location->servers[s]; 135 const struct nfs4_string *buf = &location->servers[s];
130 struct sockaddr_storage addr;
131 136
132 if (buf->len <= 0 || buf->len >= maxbuflen) 137 if (buf->len <= 0 || buf->len >= maxbuflen)
133 continue; 138 continue;
@@ -136,11 +141,10 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
136 continue; 141 continue;
137 142
138 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, 143 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
139 (struct sockaddr *)&addr, sizeof(addr)); 144 mountdata->addr, addr_bufsize);
140 if (mountdata->addrlen == 0) 145 if (mountdata->addrlen == 0)
141 continue; 146 continue;
142 147
143 mountdata->addr = (struct sockaddr *)&addr;
144 rpc_set_port(mountdata->addr, NFS_PORT); 148 rpc_set_port(mountdata->addr, NFS_PORT);
145 149
146 memcpy(page2, buf->data, buf->len); 150 memcpy(page2, buf->data, buf->len);
@@ -155,6 +159,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
155 if (!IS_ERR(mnt)) 159 if (!IS_ERR(mnt))
156 break; 160 break;
157 } 161 }
162 kfree(mountdata->addr);
158 return mnt; 163 return mnt;
159} 164}
160 165
@@ -220,8 +225,8 @@ out:
220 225
221/* 226/*
222 * nfs_do_refmount - handle crossing a referral on server 227 * nfs_do_refmount - handle crossing a referral on server
228 * @mnt_parent - mountpoint of referral
223 * @dentry - dentry of referral 229 * @dentry - dentry of referral
224 * @nd - nameidata info
225 * 230 *
226 */ 231 */
227struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 232struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index eda74c42d552..70015dd60a98 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -39,6 +39,7 @@
39#include <linux/delay.h> 39#include <linux/delay.h>
40#include <linux/errno.h> 40#include <linux/errno.h>
41#include <linux/string.h> 41#include <linux/string.h>
42#include <linux/slab.h>
42#include <linux/sunrpc/clnt.h> 43#include <linux/sunrpc/clnt.h>
43#include <linux/nfs.h> 44#include <linux/nfs.h>
44#include <linux/nfs4.h> 45#include <linux/nfs4.h>
@@ -69,6 +70,9 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf
69static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 70static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
70static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 71static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
71static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 72static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
73static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
74 struct nfs_fattr *fattr, struct iattr *sattr,
75 struct nfs4_state *state);
72 76
73/* Prevent leaks of NFSv4 errors into userland */ 77/* Prevent leaks of NFSv4 errors into userland */
74static int nfs4_map_errors(int err) 78static int nfs4_map_errors(int err)
@@ -713,17 +717,18 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
713 717
714static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 718static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
715 struct nfs4_state_owner *sp, fmode_t fmode, int flags, 719 struct nfs4_state_owner *sp, fmode_t fmode, int flags,
716 const struct iattr *attrs) 720 const struct iattr *attrs,
721 gfp_t gfp_mask)
717{ 722{
718 struct dentry *parent = dget_parent(path->dentry); 723 struct dentry *parent = dget_parent(path->dentry);
719 struct inode *dir = parent->d_inode; 724 struct inode *dir = parent->d_inode;
720 struct nfs_server *server = NFS_SERVER(dir); 725 struct nfs_server *server = NFS_SERVER(dir);
721 struct nfs4_opendata *p; 726 struct nfs4_opendata *p;
722 727
723 p = kzalloc(sizeof(*p), GFP_KERNEL); 728 p = kzalloc(sizeof(*p), gfp_mask);
724 if (p == NULL) 729 if (p == NULL)
725 goto err; 730 goto err;
726 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 731 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
727 if (p->o_arg.seqid == NULL) 732 if (p->o_arg.seqid == NULL)
728 goto err_free; 733 goto err_free;
729 path_get(path); 734 path_get(path);
@@ -1059,7 +1064,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
1059{ 1064{
1060 struct nfs4_opendata *opendata; 1065 struct nfs4_opendata *opendata;
1061 1066
1062 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL); 1067 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS);
1063 if (opendata == NULL) 1068 if (opendata == NULL)
1064 return ERR_PTR(-ENOMEM); 1069 return ERR_PTR(-ENOMEM);
1065 opendata->state = state; 1070 opendata->state = state;
@@ -1522,6 +1527,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1522 nfs_post_op_update_inode(dir, o_res->dir_attr); 1527 nfs_post_op_update_inode(dir, o_res->dir_attr);
1523 } else 1528 } else
1524 nfs_refresh_inode(dir, o_res->dir_attr); 1529 nfs_refresh_inode(dir, o_res->dir_attr);
1530 if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
1531 server->caps &= ~NFS_CAP_POSIX_LOCK;
1525 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { 1532 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
1526 status = _nfs4_proc_open_confirm(data); 1533 status = _nfs4_proc_open_confirm(data);
1527 if (status != 0) 1534 if (status != 0)
@@ -1645,7 +1652,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1645 if (path->dentry->d_inode != NULL) 1652 if (path->dentry->d_inode != NULL)
1646 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); 1653 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
1647 status = -ENOMEM; 1654 status = -ENOMEM;
1648 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr); 1655 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL);
1649 if (opendata == NULL) 1656 if (opendata == NULL)
1650 goto err_put_state_owner; 1657 goto err_put_state_owner;
1651 1658
@@ -1656,15 +1663,24 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1656 if (status != 0) 1663 if (status != 0)
1657 goto err_opendata_put; 1664 goto err_opendata_put;
1658 1665
1659 if (opendata->o_arg.open_flags & O_EXCL)
1660 nfs4_exclusive_attrset(opendata, sattr);
1661
1662 state = nfs4_opendata_to_nfs4_state(opendata); 1666 state = nfs4_opendata_to_nfs4_state(opendata);
1663 status = PTR_ERR(state); 1667 status = PTR_ERR(state);
1664 if (IS_ERR(state)) 1668 if (IS_ERR(state))
1665 goto err_opendata_put; 1669 goto err_opendata_put;
1666 if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0) 1670 if (server->caps & NFS_CAP_POSIX_LOCK)
1667 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); 1671 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
1672
1673 if (opendata->o_arg.open_flags & O_EXCL) {
1674 nfs4_exclusive_attrset(opendata, sattr);
1675
1676 nfs_fattr_init(opendata->o_res.f_attr);
1677 status = nfs4_do_setattr(state->inode, cred,
1678 opendata->o_res.f_attr, sattr,
1679 state);
1680 if (status == 0)
1681 nfs_setattr_update_inode(state->inode, sattr);
1682 nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
1683 }
1668 nfs4_opendata_put(opendata); 1684 nfs4_opendata_put(opendata);
1669 nfs4_put_state_owner(sp); 1685 nfs4_put_state_owner(sp);
1670 *res = state; 1686 *res = state;
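[Note] Two related changes land in the hunks above: the LOCKTYPE_POSIX flag from the OPEN reply is now latched into server->caps once (in _nfs4_proc_open) and consulted per-state afterwards, and the O_EXCL attribute fixup moves from nfs4_proc_create() into _nfs4_do_open() so it runs against the freshly obtained open state. A toy model of the capability latching, using made-up flag values rather than the kernel constants:

#include <stdio.h>

#define OPEN_RESULT_LOCKTYPE_POSIX 0x1   /* illustrative values only */
#define CAP_POSIX_LOCK             0x1
#define STATE_POSIX_LOCKS          0x1

struct server { unsigned caps; };
struct state  { unsigned flags; };

/* Clear the capability the first time any OPEN reply omits the flag;
 * later opens consult server->caps instead of their own reply. */
static void record_open_reply(struct server *srv, unsigned rflags)
{
	if ((rflags & OPEN_RESULT_LOCKTYPE_POSIX) == 0)
		srv->caps &= ~CAP_POSIX_LOCK;
}

static void setup_state(const struct server *srv, struct state *st)
{
	if (srv->caps & CAP_POSIX_LOCK)
		st->flags |= STATE_POSIX_LOCKS;
}

int main(void)
{
	struct server srv = { .caps = CAP_POSIX_LOCK };
	struct state st = { 0 };

	record_open_reply(&srv, 0);   /* server did not offer POSIX locking */
	setup_state(&srv, &st);
	printf("posix locks: %s\n", (st.flags & STATE_POSIX_LOCKS) ? "yes" : "no");
	return 0;
}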
@@ -1911,7 +1927,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
1911 * 1927 *
1912 * NOTE: Caller must be holding the sp->so_owner semaphore! 1928 * NOTE: Caller must be holding the sp->so_owner semaphore!
1913 */ 1929 */
1914int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) 1930int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
1915{ 1931{
1916 struct nfs_server *server = NFS_SERVER(state->inode); 1932 struct nfs_server *server = NFS_SERVER(state->inode);
1917 struct nfs4_closedata *calldata; 1933 struct nfs4_closedata *calldata;
@@ -1930,7 +1946,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1930 }; 1946 };
1931 int status = -ENOMEM; 1947 int status = -ENOMEM;
1932 1948
1933 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); 1949 calldata = kzalloc(sizeof(*calldata), gfp_mask);
1934 if (calldata == NULL) 1950 if (calldata == NULL)
1935 goto out; 1951 goto out;
1936 calldata->inode = state->inode; 1952 calldata->inode = state->inode;
@@ -1938,7 +1954,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1938 calldata->arg.fh = NFS_FH(state->inode); 1954 calldata->arg.fh = NFS_FH(state->inode);
1939 calldata->arg.stateid = &state->open_stateid; 1955 calldata->arg.stateid = &state->open_stateid;
1940 /* Serialization for the sequence id */ 1956 /* Serialization for the sequence id */
1941 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); 1957 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask);
1942 if (calldata->arg.seqid == NULL) 1958 if (calldata->arg.seqid == NULL)
1943 goto out_free_calldata; 1959 goto out_free_calldata;
1944 calldata->arg.fmode = 0; 1960 calldata->arg.fmode = 0;
@@ -2067,8 +2083,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
2067 case -EDQUOT: 2083 case -EDQUOT:
2068 case -ENOSPC: 2084 case -ENOSPC:
2069 case -EROFS: 2085 case -EROFS:
2070 lookup_instantiate_filp(nd, (struct dentry *)state, NULL); 2086 return PTR_ERR(state);
2071 return 1;
2072 default: 2087 default:
2073 goto out_drop; 2088 goto out_drop;
2074 } 2089 }
@@ -2402,14 +2417,12 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh
2402static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) 2417static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
2403{ 2418{
2404 struct nfs_server *server = NFS_SERVER(inode); 2419 struct nfs_server *server = NFS_SERVER(inode);
2405 struct nfs_fattr fattr;
2406 struct nfs4_accessargs args = { 2420 struct nfs4_accessargs args = {
2407 .fh = NFS_FH(inode), 2421 .fh = NFS_FH(inode),
2408 .bitmask = server->attr_bitmask, 2422 .bitmask = server->attr_bitmask,
2409 }; 2423 };
2410 struct nfs4_accessres res = { 2424 struct nfs4_accessres res = {
2411 .server = server, 2425 .server = server,
2412 .fattr = &fattr,
2413 }; 2426 };
2414 struct rpc_message msg = { 2427 struct rpc_message msg = {
2415 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], 2428 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
@@ -2436,7 +2449,11 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2436 if (mode & MAY_EXEC) 2449 if (mode & MAY_EXEC)
2437 args.access |= NFS4_ACCESS_EXECUTE; 2450 args.access |= NFS4_ACCESS_EXECUTE;
2438 } 2451 }
2439 nfs_fattr_init(&fattr); 2452
2453 res.fattr = nfs_alloc_fattr();
2454 if (res.fattr == NULL)
2455 return -ENOMEM;
2456
2440 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2457 status = nfs4_call_sync(server, &msg, &args, &res, 0);
2441 if (!status) { 2458 if (!status) {
2442 entry->mask = 0; 2459 entry->mask = 0;
@@ -2446,8 +2463,9 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2446 entry->mask |= MAY_WRITE; 2463 entry->mask |= MAY_WRITE;
2447 if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) 2464 if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
2448 entry->mask |= MAY_EXEC; 2465 entry->mask |= MAY_EXEC;
2449 nfs_refresh_inode(inode, &fattr); 2466 nfs_refresh_inode(inode, res.fattr);
2450 } 2467 }
2468 nfs_free_fattr(res.fattr);
2451 return status; 2469 return status;
2452} 2470}
2453 2471
@@ -2560,13 +2578,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2560 } 2578 }
2561 d_add(dentry, igrab(state->inode)); 2579 d_add(dentry, igrab(state->inode));
2562 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 2580 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
2563 if (flags & O_EXCL) {
2564 struct nfs_fattr fattr;
2565 status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
2566 if (status == 0)
2567 nfs_setattr_update_inode(state->inode, sattr);
2568 nfs_post_op_update_inode(state->inode, &fattr);
2569 }
2570 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) 2581 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
2571 status = nfs4_intent_set_file(nd, &path, state, fmode); 2582 status = nfs4_intent_set_file(nd, &path, state, fmode);
2572 else 2583 else
@@ -2594,14 +2605,19 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
2594 .rpc_argp = &args, 2605 .rpc_argp = &args,
2595 .rpc_resp = &res, 2606 .rpc_resp = &res,
2596 }; 2607 };
2597 int status; 2608 int status = -ENOMEM;
2609
2610 res.dir_attr = nfs_alloc_fattr();
2611 if (res.dir_attr == NULL)
2612 goto out;
2598 2613
2599 nfs_fattr_init(&res.dir_attr);
2600 status = nfs4_call_sync(server, &msg, &args, &res, 1); 2614 status = nfs4_call_sync(server, &msg, &args, &res, 1);
2601 if (status == 0) { 2615 if (status == 0) {
2602 update_changeattr(dir, &res.cinfo); 2616 update_changeattr(dir, &res.cinfo);
2603 nfs_post_op_update_inode(dir, &res.dir_attr); 2617 nfs_post_op_update_inode(dir, res.dir_attr);
2604 } 2618 }
2619 nfs_free_fattr(res.dir_attr);
2620out:
2605 return status; 2621 return status;
2606} 2622}
2607 2623
@@ -2636,7 +2652,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2636 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 2652 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2637 return 0; 2653 return 0;
2638 update_changeattr(dir, &res->cinfo); 2654 update_changeattr(dir, &res->cinfo);
2639 nfs_post_op_update_inode(dir, &res->dir_attr); 2655 nfs_post_op_update_inode(dir, res->dir_attr);
2640 return 1; 2656 return 1;
2641} 2657}
2642 2658
@@ -2651,29 +2667,31 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2651 .new_name = new_name, 2667 .new_name = new_name,
2652 .bitmask = server->attr_bitmask, 2668 .bitmask = server->attr_bitmask,
2653 }; 2669 };
2654 struct nfs_fattr old_fattr, new_fattr;
2655 struct nfs4_rename_res res = { 2670 struct nfs4_rename_res res = {
2656 .server = server, 2671 .server = server,
2657 .old_fattr = &old_fattr,
2658 .new_fattr = &new_fattr,
2659 }; 2672 };
2660 struct rpc_message msg = { 2673 struct rpc_message msg = {
2661 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], 2674 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
2662 .rpc_argp = &arg, 2675 .rpc_argp = &arg,
2663 .rpc_resp = &res, 2676 .rpc_resp = &res,
2664 }; 2677 };
2665 int status; 2678 int status = -ENOMEM;
2666 2679
2667 nfs_fattr_init(res.old_fattr); 2680 res.old_fattr = nfs_alloc_fattr();
2668 nfs_fattr_init(res.new_fattr); 2681 res.new_fattr = nfs_alloc_fattr();
2669 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2682 if (res.old_fattr == NULL || res.new_fattr == NULL)
2683 goto out;
2670 2684
2685 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
2671 if (!status) { 2686 if (!status) {
2672 update_changeattr(old_dir, &res.old_cinfo); 2687 update_changeattr(old_dir, &res.old_cinfo);
2673 nfs_post_op_update_inode(old_dir, res.old_fattr); 2688 nfs_post_op_update_inode(old_dir, res.old_fattr);
2674 update_changeattr(new_dir, &res.new_cinfo); 2689 update_changeattr(new_dir, &res.new_cinfo);
2675 nfs_post_op_update_inode(new_dir, res.new_fattr); 2690 nfs_post_op_update_inode(new_dir, res.new_fattr);
2676 } 2691 }
2692out:
2693 nfs_free_fattr(res.new_fattr);
2694 nfs_free_fattr(res.old_fattr);
2677 return status; 2695 return status;
2678} 2696}
2679 2697
@@ -2700,28 +2718,30 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
2700 .name = name, 2718 .name = name,
2701 .bitmask = server->attr_bitmask, 2719 .bitmask = server->attr_bitmask,
2702 }; 2720 };
2703 struct nfs_fattr fattr, dir_attr;
2704 struct nfs4_link_res res = { 2721 struct nfs4_link_res res = {
2705 .server = server, 2722 .server = server,
2706 .fattr = &fattr,
2707 .dir_attr = &dir_attr,
2708 }; 2723 };
2709 struct rpc_message msg = { 2724 struct rpc_message msg = {
2710 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], 2725 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
2711 .rpc_argp = &arg, 2726 .rpc_argp = &arg,
2712 .rpc_resp = &res, 2727 .rpc_resp = &res,
2713 }; 2728 };
2714 int status; 2729 int status = -ENOMEM;
2730
2731 res.fattr = nfs_alloc_fattr();
2732 res.dir_attr = nfs_alloc_fattr();
2733 if (res.fattr == NULL || res.dir_attr == NULL)
2734 goto out;
2715 2735
2716 nfs_fattr_init(res.fattr);
2717 nfs_fattr_init(res.dir_attr);
2718 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2736 status = nfs4_call_sync(server, &msg, &arg, &res, 1);
2719 if (!status) { 2737 if (!status) {
2720 update_changeattr(dir, &res.cinfo); 2738 update_changeattr(dir, &res.cinfo);
2721 nfs_post_op_update_inode(dir, res.dir_attr); 2739 nfs_post_op_update_inode(dir, res.dir_attr);
2722 nfs_post_op_update_inode(inode, res.fattr); 2740 nfs_post_op_update_inode(inode, res.fattr);
2723 } 2741 }
2724 2742out:
2743 nfs_free_fattr(res.dir_attr);
2744 nfs_free_fattr(res.fattr);
2725 return status; 2745 return status;
2726} 2746}
2727 2747
@@ -3144,23 +3164,31 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
3144 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; 3164 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
3145} 3165}
3146 3166
3167struct nfs4_renewdata {
3168 struct nfs_client *client;
3169 unsigned long timestamp;
3170};
3171
3147/* 3172/*
3148 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special 3173 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
3149 * standalone procedure for queueing an asynchronous RENEW. 3174 * standalone procedure for queueing an asynchronous RENEW.
3150 */ 3175 */
3151static void nfs4_renew_release(void *data) 3176static void nfs4_renew_release(void *calldata)
3152{ 3177{
3153 struct nfs_client *clp = data; 3178 struct nfs4_renewdata *data = calldata;
3179 struct nfs_client *clp = data->client;
3154 3180
3155 if (atomic_read(&clp->cl_count) > 1) 3181 if (atomic_read(&clp->cl_count) > 1)
3156 nfs4_schedule_state_renewal(clp); 3182 nfs4_schedule_state_renewal(clp);
3157 nfs_put_client(clp); 3183 nfs_put_client(clp);
3184 kfree(data);
3158} 3185}
3159 3186
3160static void nfs4_renew_done(struct rpc_task *task, void *data) 3187static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3161{ 3188{
3162 struct nfs_client *clp = data; 3189 struct nfs4_renewdata *data = calldata;
3163 unsigned long timestamp = task->tk_start; 3190 struct nfs_client *clp = data->client;
3191 unsigned long timestamp = data->timestamp;
3164 3192
3165 if (task->tk_status < 0) { 3193 if (task->tk_status < 0) {
3166 /* Unless we're shutting down, schedule state recovery! */ 3194 /* Unless we're shutting down, schedule state recovery! */
@@ -3186,11 +3214,17 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
3186 .rpc_argp = clp, 3214 .rpc_argp = clp,
3187 .rpc_cred = cred, 3215 .rpc_cred = cred,
3188 }; 3216 };
3217 struct nfs4_renewdata *data;
3189 3218
3190 if (!atomic_inc_not_zero(&clp->cl_count)) 3219 if (!atomic_inc_not_zero(&clp->cl_count))
3191 return -EIO; 3220 return -EIO;
3221 data = kmalloc(sizeof(*data), GFP_KERNEL);
3222 if (data == NULL)
3223 return -ENOMEM;
3224 data->client = clp;
3225 data->timestamp = jiffies;
3192 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, 3226 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
3193 &nfs4_renew_ops, clp); 3227 &nfs4_renew_ops, data);
3194} 3228}
3195 3229
3196int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) 3230int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
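[Note] The RENEW rework above replaces the bare nfs_client pointer handed to rpc_call_async() with a small heap struct, so the call can also carry its own submit timestamp (the done callback no longer reads task->tk_start), and the struct is freed in the release callback once the RPC finishes. A self-contained model of that hand-off, with a plain function pointer standing in for the RPC machinery:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct renewdata {
	int client_id;             /* stands in for struct nfs_client * */
	long timestamp;            /* recorded at submit time */
};

static void renew_done(void *calldata)
{
	struct renewdata *data = calldata;
	printf("renew for client %d submitted at %ld\n",
	       data->client_id, data->timestamp);
}

static void renew_release(void *calldata)
{
	free(calldata);            /* the release callback owns the data */
}

/* Stand-in for rpc_call_async(): runs the callbacks immediately. */
static int call_async(void *calldata,
		      void (*done)(void *), void (*release)(void *))
{
	done(calldata);
	release(calldata);
	return 0;
}

int main(void)
{
	struct renewdata *data = malloc(sizeof(*data));
	if (data == NULL)
		return 1;
	data->client_id = 42;
	data->timestamp = (long)time(NULL);
	return call_async(data, renew_done, renew_release);
}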
@@ -3492,7 +3526,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3492 return _nfs4_async_handle_error(task, server, server->nfs_client, state); 3526 return _nfs4_async_handle_error(task, server, server->nfs_client, state);
3493} 3527}
3494 3528
3495int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) 3529int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3530 unsigned short port, struct rpc_cred *cred,
3531 struct nfs4_setclientid_res *res)
3496{ 3532{
3497 nfs4_verifier sc_verifier; 3533 nfs4_verifier sc_verifier;
3498 struct nfs4_setclientid setclientid = { 3534 struct nfs4_setclientid setclientid = {
@@ -3502,7 +3538,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
3502 struct rpc_message msg = { 3538 struct rpc_message msg = {
3503 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], 3539 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
3504 .rpc_argp = &setclientid, 3540 .rpc_argp = &setclientid,
3505 .rpc_resp = clp, 3541 .rpc_resp = res,
3506 .rpc_cred = cred, 3542 .rpc_cred = cred,
3507 }; 3543 };
3508 __be32 *p; 3544 __be32 *p;
@@ -3545,12 +3581,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
3545 return status; 3581 return status;
3546} 3582}
3547 3583
3548static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) 3584static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3585 struct nfs4_setclientid_res *arg,
3586 struct rpc_cred *cred)
3549{ 3587{
3550 struct nfs_fsinfo fsinfo; 3588 struct nfs_fsinfo fsinfo;
3551 struct rpc_message msg = { 3589 struct rpc_message msg = {
3552 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], 3590 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
3553 .rpc_argp = clp, 3591 .rpc_argp = arg,
3554 .rpc_resp = &fsinfo, 3592 .rpc_resp = &fsinfo,
3555 .rpc_cred = cred, 3593 .rpc_cred = cred,
3556 }; 3594 };
@@ -3568,12 +3606,14 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
3568 return status; 3606 return status;
3569} 3607}
3570 3608
3571int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) 3609int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
3610 struct nfs4_setclientid_res *arg,
3611 struct rpc_cred *cred)
3572{ 3612{
3573 long timeout = 0; 3613 long timeout = 0;
3574 int err; 3614 int err;
3575 do { 3615 do {
3576 err = _nfs4_proc_setclientid_confirm(clp, cred); 3616 err = _nfs4_proc_setclientid_confirm(clp, arg, cred);
3577 switch (err) { 3617 switch (err) {
3578 case 0: 3618 case 0:
3579 return err; 3619 return err;
@@ -3665,7 +3705,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
3665 }; 3705 };
3666 int status = 0; 3706 int status = 0;
3667 3707
3668 data = kzalloc(sizeof(*data), GFP_KERNEL); 3708 data = kzalloc(sizeof(*data), GFP_NOFS);
3669 if (data == NULL) 3709 if (data == NULL)
3670 return -ENOMEM; 3710 return -ENOMEM;
3671 data->args.fhandle = &data->fh; 3711 data->args.fhandle = &data->fh;
@@ -3821,7 +3861,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
3821 struct nfs4_unlockdata *p; 3861 struct nfs4_unlockdata *p;
3822 struct inode *inode = lsp->ls_state->inode; 3862 struct inode *inode = lsp->ls_state->inode;
3823 3863
3824 p = kzalloc(sizeof(*p), GFP_KERNEL); 3864 p = kzalloc(sizeof(*p), GFP_NOFS);
3825 if (p == NULL) 3865 if (p == NULL)
3826 return NULL; 3866 return NULL;
3827 p->arg.fh = NFS_FH(inode); 3867 p->arg.fh = NFS_FH(inode);
@@ -3959,7 +3999,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
3959 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) 3999 if (test_bit(NFS_DELEGATED_STATE, &state->flags))
3960 goto out; 4000 goto out;
3961 lsp = request->fl_u.nfs4_fl.owner; 4001 lsp = request->fl_u.nfs4_fl.owner;
3962 seqid = nfs_alloc_seqid(&lsp->ls_seqid); 4002 seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
3963 status = -ENOMEM; 4003 status = -ENOMEM;
3964 if (seqid == NULL) 4004 if (seqid == NULL)
3965 goto out; 4005 goto out;
@@ -3987,22 +4027,23 @@ struct nfs4_lockdata {
3987}; 4027};
3988 4028
3989static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, 4029static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3990 struct nfs_open_context *ctx, struct nfs4_lock_state *lsp) 4030 struct nfs_open_context *ctx, struct nfs4_lock_state *lsp,
4031 gfp_t gfp_mask)
3991{ 4032{
3992 struct nfs4_lockdata *p; 4033 struct nfs4_lockdata *p;
3993 struct inode *inode = lsp->ls_state->inode; 4034 struct inode *inode = lsp->ls_state->inode;
3994 struct nfs_server *server = NFS_SERVER(inode); 4035 struct nfs_server *server = NFS_SERVER(inode);
3995 4036
3996 p = kzalloc(sizeof(*p), GFP_KERNEL); 4037 p = kzalloc(sizeof(*p), gfp_mask);
3997 if (p == NULL) 4038 if (p == NULL)
3998 return NULL; 4039 return NULL;
3999 4040
4000 p->arg.fh = NFS_FH(inode); 4041 p->arg.fh = NFS_FH(inode);
4001 p->arg.fl = &p->fl; 4042 p->arg.fl = &p->fl;
4002 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid); 4043 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
4003 if (p->arg.open_seqid == NULL) 4044 if (p->arg.open_seqid == NULL)
4004 goto out_free; 4045 goto out_free;
4005 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); 4046 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask);
4006 if (p->arg.lock_seqid == NULL) 4047 if (p->arg.lock_seqid == NULL)
4007 goto out_free_seqid; 4048 goto out_free_seqid;
4008 p->arg.lock_stateid = &lsp->ls_stateid; 4049 p->arg.lock_stateid = &lsp->ls_stateid;
@@ -4156,7 +4197,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
4156 4197
4157 dprintk("%s: begin!\n", __func__); 4198 dprintk("%s: begin!\n", __func__);
4158 data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), 4199 data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
4159 fl->fl_u.nfs4_fl.owner); 4200 fl->fl_u.nfs4_fl.owner,
4201 recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
4160 if (data == NULL) 4202 if (data == NULL)
4161 return -ENOMEM; 4203 return -ENOMEM;
4162 if (IS_SETLKW(cmd)) 4204 if (IS_SETLKW(cmd))
@@ -4645,7 +4687,7 @@ static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
4645 if (max_reqs != tbl->max_slots) { 4687 if (max_reqs != tbl->max_slots) {
4646 ret = -ENOMEM; 4688 ret = -ENOMEM;
4647 new = kmalloc(max_reqs * sizeof(struct nfs4_slot), 4689 new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
4648 GFP_KERNEL); 4690 GFP_NOFS);
4649 if (!new) 4691 if (!new)
4650 goto out; 4692 goto out;
4651 ret = 0; 4693 ret = 0;
@@ -4710,7 +4752,7 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
4710 4752
4711 dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); 4753 dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
4712 4754
4713 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); 4755 slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS);
4714 if (!slot) 4756 if (!slot)
4715 goto out; 4757 goto out;
4716 ret = 0; 4758 ret = 0;
@@ -4759,7 +4801,7 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4759 struct nfs4_session *session; 4801 struct nfs4_session *session;
4760 struct nfs4_slot_table *tbl; 4802 struct nfs4_slot_table *tbl;
4761 4803
4762 session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL); 4804 session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
4763 if (!session) 4805 if (!session)
4764 return NULL; 4806 return NULL;
4765 4807
@@ -5103,10 +5145,11 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
5103 5145
5104 if (!atomic_inc_not_zero(&clp->cl_count)) 5146 if (!atomic_inc_not_zero(&clp->cl_count))
5105 return -EIO; 5147 return -EIO;
5106 args = kzalloc(sizeof(*args), GFP_KERNEL); 5148 args = kzalloc(sizeof(*args), GFP_NOFS);
5107 res = kzalloc(sizeof(*res), GFP_KERNEL); 5149 res = kzalloc(sizeof(*res), GFP_NOFS);
5108 if (!args || !res) { 5150 if (!args || !res) {
5109 kfree(args); 5151 kfree(args);
5152 kfree(res);
5110 nfs_put_client(clp); 5153 nfs_put_client(clp);
5111 return -ENOMEM; 5154 return -ENOMEM;
5112 } 5155 }
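[Note] The one-line addition above closes a leak: when args and res are allocated back to back and either fails, both must be freed, since kfree(NULL) is a no-op but kfree of the surviving buffer is not optional. In miniature, with libc calls in place of the kernel allocators:

#include <stdlib.h>

static int submit(void)
{
	void *args = calloc(1, 64);
	void *res  = calloc(1, 64);

	if (!args || !res) {
		free(args);        /* free() tolerates NULL, so free both */
		free(res);         /* previously the res half leaked */
		return -1;
	}
	/* ... queue the request ... */
	free(args);
	free(res);
	return 0;
}

int main(void) { return submit() ? 1 : 0; }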
@@ -5204,7 +5247,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5204 int status = -ENOMEM; 5247 int status = -ENOMEM;
5205 5248
5206 dprintk("--> %s\n", __func__); 5249 dprintk("--> %s\n", __func__);
5207 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); 5250 calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
5208 if (calldata == NULL) 5251 if (calldata == NULL)
5209 goto out; 5252 goto out;
5210 calldata->clp = clp; 5253 calldata->clp = clp;
@@ -5215,9 +5258,12 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5215 msg.rpc_resp = &calldata->res; 5258 msg.rpc_resp = &calldata->res;
5216 task_setup_data.callback_data = calldata; 5259 task_setup_data.callback_data = calldata;
5217 task = rpc_run_task(&task_setup_data); 5260 task = rpc_run_task(&task_setup_data);
5218 if (IS_ERR(task)) 5261 if (IS_ERR(task)) {
5219 status = PTR_ERR(task); 5262 status = PTR_ERR(task);
5263 goto out;
5264 }
5220 rpc_put_task(task); 5265 rpc_put_task(task);
5266 return 0;
5221out: 5267out:
5222 dprintk("<-- %s status=%d\n", __func__, status); 5268 dprintk("<-- %s status=%d\n", __func__, status);
5223 return status; 5269 return status;
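[Note] The reclaim-complete fix above distinguishes the two exits from rpc_run_task(): an ERR_PTR return means there is no task to put, so the code jumps straight to the logging label, while a real task is dropped with rpc_put_task() and the function returns 0. A sketch of that control flow with a stubbed task constructor (NULL here plays the role of the kernel's ERR_PTR):

#include <stdio.h>
#include <stdlib.h>

struct task { int refs; };

/* Stub: returns NULL to model rpc_run_task() failing. */
static struct task *run_task(int fail)
{
	if (fail)
		return NULL;
	struct task *t = malloc(sizeof(*t));
	if (t)
		t->refs = 1;
	return t;
}

static void put_task(struct task *t) { free(t); }

static int reclaim_complete(int fail)
{
	int status = -12;          /* -ENOMEM, mirroring the kernel default */
	struct task *task = run_task(fail);

	if (task == NULL) {        /* IS_ERR(task) in the kernel */
		status = -5;       /* PTR_ERR(task), e.g. -EIO */
		goto out;          /* no task exists, so nothing to put */
	}
	put_task(task);            /* drop our reference */
	return 0;                  /* success bypasses the logging label */
out:
	printf("reclaim_complete status=%d\n", status);
	return status;
}

int main(void)
{
	reclaim_complete(1);
	return reclaim_complete(0) ? 1 : 0;
}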
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6c5ed51f105e..34acf5926fdc 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,6 +62,7 @@ static LIST_HEAD(nfs4_clientid_list);
62 62
63int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 63int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
64{ 64{
65 struct nfs4_setclientid_res clid;
65 unsigned short port; 66 unsigned short port;
66 int status; 67 int status;
67 68
@@ -69,11 +70,15 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
69 if (clp->cl_addr.ss_family == AF_INET6) 70 if (clp->cl_addr.ss_family == AF_INET6)
70 port = nfs_callback_tcpport6; 71 port = nfs_callback_tcpport6;
71 72
72 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred); 73 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
73 if (status == 0) 74 if (status != 0)
74 status = nfs4_proc_setclientid_confirm(clp, cred); 75 goto out;
75 if (status == 0) 76 status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
76 nfs4_schedule_state_renewal(clp); 77 if (status != 0)
78 goto out;
79 clp->cl_clientid = clid.clientid;
80 nfs4_schedule_state_renewal(clp);
81out:
77 return status; 82 return status;
78} 83}
79 84
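[Note] The nfs4_init_clientid() rewrite above makes the SETCLIENTID handshake explicit: the first call fills a caller-owned nfs4_setclientid_res instead of scribbling into struct nfs_client, the confirm step echoes that result back, and only after both succeed is the client id committed and renewal scheduled. A compact model of the two-phase commit, with simplified fields and stubbed RPCs:

#include <stdio.h>

struct setclientid_res { unsigned long long clientid; };
struct client { unsigned long long cl_clientid; };

static int proc_setclientid(struct setclientid_res *res)
{
	res->clientid = 0xabcdefULL;   /* server-assigned id */
	return 0;
}

static int proc_setclientid_confirm(const struct setclientid_res *res)
{
	return res->clientid ? 0 : -1; /* echo the id back to the server */
}

static int init_clientid(struct client *clp)
{
	struct setclientid_res clid;
	int status;

	status = proc_setclientid(&clid);
	if (status != 0)
		goto out;
	status = proc_setclientid_confirm(&clid);
	if (status != 0)
		goto out;
	clp->cl_clientid = clid.clientid;  /* commit only after confirm */
	/* ... schedule state renewal here ... */
out:
	return status;
}

int main(void)
{
	struct client clp = { 0 };
	int err = init_clientid(&clp);
	printf("clientid=%llx err=%d\n", clp.cl_clientid, err);
	return err ? 1 : 0;
}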
@@ -361,7 +366,7 @@ nfs4_alloc_state_owner(void)
361{ 366{
362 struct nfs4_state_owner *sp; 367 struct nfs4_state_owner *sp;
363 368
364 sp = kzalloc(sizeof(*sp),GFP_KERNEL); 369 sp = kzalloc(sizeof(*sp),GFP_NOFS);
365 if (!sp) 370 if (!sp)
366 return NULL; 371 return NULL;
367 spin_lock_init(&sp->so_lock); 372 spin_lock_init(&sp->so_lock);
@@ -435,7 +440,7 @@ nfs4_alloc_open_state(void)
435{ 440{
436 struct nfs4_state *state; 441 struct nfs4_state *state;
437 442
438 state = kzalloc(sizeof(*state), GFP_KERNEL); 443 state = kzalloc(sizeof(*state), GFP_NOFS);
439 if (!state) 444 if (!state)
440 return NULL; 445 return NULL;
441 atomic_set(&state->count, 1); 446 atomic_set(&state->count, 1);
@@ -537,7 +542,8 @@ void nfs4_put_open_state(struct nfs4_state *state)
537/* 542/*
538 * Close the current file. 543 * Close the current file.
539 */ 544 */
540static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait) 545static void __nfs4_close(struct path *path, struct nfs4_state *state,
546 fmode_t fmode, gfp_t gfp_mask, int wait)
541{ 547{
542 struct nfs4_state_owner *owner = state->owner; 548 struct nfs4_state_owner *owner = state->owner;
543 int call_close = 0; 549 int call_close = 0;
@@ -578,17 +584,17 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fm
578 nfs4_put_open_state(state); 584 nfs4_put_open_state(state);
579 nfs4_put_state_owner(owner); 585 nfs4_put_state_owner(owner);
580 } else 586 } else
581 nfs4_do_close(path, state, wait); 587 nfs4_do_close(path, state, gfp_mask, wait);
582} 588}
583 589
584void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) 590void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
585{ 591{
586 __nfs4_close(path, state, fmode, 0); 592 __nfs4_close(path, state, fmode, GFP_NOFS, 0);
587} 593}
588 594
589void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) 595void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
590{ 596{
591 __nfs4_close(path, state, fmode, 1); 597 __nfs4_close(path, state, fmode, GFP_KERNEL, 1);
592} 598}
593 599
594/* 600/*
@@ -618,7 +624,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
618 struct nfs4_lock_state *lsp; 624 struct nfs4_lock_state *lsp;
619 struct nfs_client *clp = state->owner->so_client; 625 struct nfs_client *clp = state->owner->so_client;
620 626
621 lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); 627 lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
622 if (lsp == NULL) 628 if (lsp == NULL)
623 return NULL; 629 return NULL;
624 rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); 630 rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue");
@@ -754,11 +760,11 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
754 nfs4_put_lock_state(lsp); 760 nfs4_put_lock_state(lsp);
755} 761}
756 762
757struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) 763struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
758{ 764{
759 struct nfs_seqid *new; 765 struct nfs_seqid *new;
760 766
761 new = kmalloc(sizeof(*new), GFP_KERNEL); 767 new = kmalloc(sizeof(*new), gfp_mask);
762 if (new != NULL) { 768 if (new != NULL) {
763 new->sequence = counter; 769 new->sequence = counter;
764 INIT_LIST_HEAD(&new->list); 770 INIT_LIST_HEAD(&new->list);
@@ -1347,7 +1353,7 @@ static int nfs4_recall_slot(struct nfs_client *clp)
1347 1353
1348 nfs4_begin_drain_session(clp); 1354 nfs4_begin_drain_session(clp);
1349 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), 1355 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
1350 GFP_KERNEL); 1356 GFP_NOFS);
1351 if (!new) 1357 if (!new)
1352 return -ENOMEM; 1358 return -ENOMEM;
1353 1359
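The allocation changes in nfs4state.c all follow one rule: code that can run while the VM is flushing NFS pages must allocate with GFP_NOFS, so that memory reclaim cannot re-enter the filesystem and deadlock, while purely synchronous callers may keep GFP_KERNEL. That is why __nfs4_close() now threads a gfp_t down to nfs4_do_close() instead of the leaf hard-coding it. Below is a toy userspace sketch of that threading idea only; the gfp modes are reduced to an enum and alloc_seqid() merely mirrors the nfs_alloc_seqid(counter, gfp_mask) signature from the hunk above.

#include <stdio.h>
#include <stdlib.h>

enum gfp { GFP_KERNEL, GFP_NOFS };

static void *alloc_seqid(enum gfp mask)
{
	/* a real GFP_NOFS allocation must not recurse into fs writeback */
	printf("allocating with %s\n",
	       mask == GFP_NOFS ? "GFP_NOFS" : "GFP_KERNEL");
	return malloc(16);
}

static void do_close(enum gfp mask) { free(alloc_seqid(mask)); }

int main(void)
{
	do_close(GFP_NOFS);	/* async close: may run under reclaim */
	do_close(GFP_KERNEL);	/* synchronous close: safe to block */
	return 0;
}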
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4d338be492cb..6bdef28efa33 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -38,7 +38,6 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
@@ -1505,14 +1504,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 	hdr->replen += decode_setclientid_maxsz;
 }
 
-static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr)
+static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
 	p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
-	p = xdr_encode_hyper(p, client_state->cl_clientid);
-	xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+	p = xdr_encode_hyper(p, arg->clientid);
+	xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_confirm_maxsz;
 }
@@ -2325,7 +2324,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
 /*
  * a SETCLIENTID_CONFIRM request
  */
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
+static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -2335,7 +2334,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_compound_hdr(&xdr, req, &hdr);
-	encode_setclientid_confirm(&xdr, clp, &hdr);
+	encode_setclientid_confirm(&xdr, arg, &hdr);
 	encode_putrootfh(&xdr, &hdr);
 	encode_fsinfo(&xdr, lease_bitmap, &hdr);
 	encode_nops(&hdr);
@@ -4398,7 +4397,7 @@ out_overflow:
 	return -EIO;
 }
 
-static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
+static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res)
 {
 	__be32 *p;
 	uint32_t opnum;
@@ -4418,8 +4417,8 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, &clp->cl_clientid);
-		memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
+		p = xdr_decode_hyper(p, &res->clientid);
+		memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
 
@@ -4816,7 +4815,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
 		goto out;
 	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
 		goto out;
-	decode_getfattr(&xdr, &res->dir_attr, res->server,
+	decode_getfattr(&xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5499,7 +5498,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
  * Decode SETCLIENTID response
  */
 static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
-		struct nfs_client *clp)
+		struct nfs4_setclientid_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -5508,7 +5507,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (!status)
-		status = decode_setclientid(&xdr, clp);
+		status = decode_setclientid(&xdr, res);
 	return status;
 }
 
@@ -5552,6 +5551,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	if (status != 0)
 		goto out;
 	status = decode_delegreturn(&xdr);
+	if (status != 0)
+		goto out;
 	decode_getfattr(&xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
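The SETCLIENTID rework above decodes the server's reply into a dedicated struct nfs4_setclientid_res instead of writing straight into the shared struct nfs_client, and the caller in nfs4state.c only publishes the clientid once SETCLIENTID_CONFIRM has succeeded. A compact userspace sketch of that two-step commit pattern follows; the proc_* functions and the struct fields are hypothetical stand-ins for the RPC calls.

#include <stdint.h>
#include <stdio.h>

struct setclientid_res { uint64_t clientid; unsigned char confirm[8]; };
struct client { uint64_t cl_clientid; };

static int proc_setclientid(struct setclientid_res *res)
{ res->clientid = 0x1234; return 0; }
static int proc_setclientid_confirm(const struct setclientid_res *res)
{ (void)res; return 0; }

static int init_clientid(struct client *clp)
{
	struct setclientid_res clid;
	int status;

	status = proc_setclientid(&clid);
	if (status != 0)
		goto out;
	status = proc_setclientid_confirm(&clid);
	if (status != 0)
		goto out;
	clp->cl_clientid = clid.clientid;	/* publish only after confirm */
out:
	return status;
}

int main(void)
{
	struct client c = { 0 };
	printf("status=%d clientid=%llx\n", init_clientid(&c),
	       (unsigned long long)c.cl_clientid);
	return 0;
}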
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 8c55b27c0de4..6bd19d843af7 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -488,7 +488,6 @@ static int __init root_nfs_ports(void)
  */
 static int __init root_nfs_get_handle(void)
 {
-	struct nfs_fh fh;
 	struct sockaddr_in sin;
 	unsigned int auth_flav_len = 0;
 	struct nfs_mount_request request = {
@@ -499,21 +498,24 @@ static int __init root_nfs_get_handle(void)
 				NFS_MNT3_VERSION : NFS_MNT_VERSION,
 		.protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
 				XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
-		.fh = &fh,
 		.auth_flav_len = &auth_flav_len,
 	};
-	int status;
+	int status = -ENOMEM;
 
+	request.fh = nfs_alloc_fhandle();
+	if (!request.fh)
+		goto out;
 	set_sockaddr(&sin, servaddr, htons(mount_port));
 	status = nfs_mount(&request);
 	if (status < 0)
 		printk(KERN_ERR "Root-NFS: Server returned error %d "
 				"while mounting %s\n", status, nfs_export_path);
 	else {
-		nfs_data.root.size = fh.size;
-		memcpy(nfs_data.root.data, fh.data, fh.size);
+		nfs_data.root.size = request.fh->size;
+		memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
 	}
-
+	nfs_free_fhandle(request.fh);
+out:
 	return status;
 }
 
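struct nfs_fh embeds a fixed filehandle buffer (128 bytes for NFSv4), so keeping one on the stack in a mount path is costly on small kernel stacks; the hunk above moves it to the heap via nfs_alloc_fhandle()/nfs_free_fhandle(). A userspace sketch of the same stack-to-heap move follows; the helpers are re-implemented locally for illustration and the error value stands in for -ENOMEM.

#include <stdlib.h>

struct nfs_fh { unsigned short size; unsigned char data[128]; };

static struct nfs_fh *nfs_alloc_fhandle(void)
{ return calloc(1, sizeof(struct nfs_fh)); }
static void nfs_free_fhandle(struct nfs_fh *fh) { free(fh); }

static int get_handle(void)
{
	int status = -1;			/* -ENOMEM in the kernel */
	struct nfs_fh *fh = nfs_alloc_fhandle();

	if (!fh)
		goto out;
	/* ... the mount call would fill *fh here ... */
	status = 0;
	nfs_free_fhandle(fh);
out:
	return status;
}

int main(void) { return get_handle(); }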
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a12c45b65dd4..a3654e57b589 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -60,16 +60,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 {
 	struct nfs_page *req;
 
-	for (;;) {
-		/* try to allocate the request struct */
-		req = nfs_page_alloc();
-		if (req != NULL)
-			break;
-
-		if (fatal_signal_pending(current))
-			return ERR_PTR(-ERESTARTSYS);
-		yield();
-	}
+	/* try to allocate the request struct */
+	req = nfs_page_alloc();
+	if (req == NULL)
+		return ERR_PTR(-ENOMEM);
 
 	/* Initialize the request struct. Initially, we assume a
 	 * long write-back delay. This will be adjusted in
@@ -112,12 +106,10 @@ void nfs_unlock_request(struct nfs_page *req)
  */
 int nfs_set_page_tag_locked(struct nfs_page *req)
 {
-	struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
-
 	if (!nfs_lock_request_dontget(req))
 		return 0;
 	if (req->wb_page != NULL)
-		radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+		radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 	return 1;
 }
 
@@ -126,10 +118,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
  */
 void nfs_clear_page_tag_locked(struct nfs_page *req)
 {
-	struct inode *inode = req->wb_context->path.dentry->d_inode;
-	struct nfs_inode *nfsi = NFS_I(inode);
-
 	if (req->wb_page != NULL) {
+		struct inode *inode = req->wb_context->path.dentry->d_inode;
+		struct nfs_inode *nfsi = NFS_I(inode);
+
 		spin_lock(&inode->i_lock);
 		radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 		nfs_unlock_request(req);
@@ -142,16 +134,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
 * nfs_clear_request - Free up all resources allocated to the request
 * @req:
 *
- * Release page resources associated with a write request after it
- * has completed.
+ * Release page and open context resources associated with a read/write
+ * request after it has completed.
 */
 void nfs_clear_request(struct nfs_page *req)
 {
 	struct page *page = req->wb_page;
+	struct nfs_open_context *ctx = req->wb_context;
+
 	if (page != NULL) {
 		page_cache_release(page);
 		req->wb_page = NULL;
 	}
+	if (ctx != NULL) {
+		put_nfs_open_context(ctx);
+		req->wb_context = NULL;
+	}
 }
 
 
@@ -165,9 +163,8 @@ static void nfs_free_request(struct kref *kref)
 {
 	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
 
-	/* Release struct file or cached credential */
+	/* Release struct file and open context */
 	nfs_clear_request(req);
-	put_nfs_open_context(req->wb_context);
 	nfs_page_free(req);
 }
 
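Two cleanups in pagelist.c above: nfs_create_request() now fails fast with ERR_PTR(-ENOMEM) instead of looping on yield(), and nfs_clear_request() releases both the page and the open context, so it can be called early while the final kref release stays trivial. A userspace sketch of the consolidated, idempotent teardown follows; the types and helpers are stand-ins, not the kernel structures.

#include <stdlib.h>

struct ctx { int users; };
struct req { char *page; struct ctx *ctx; };

static void put_ctx(struct ctx *c) { if (--c->users == 0) free(c); }

static void clear_request(struct req *req)
{
	if (req->page) {		/* page_cache_release() in the kernel */
		free(req->page);
		req->page = NULL;
	}
	if (req->ctx) {			/* put_nfs_open_context() */
		put_ctx(req->ctx);
		req->ctx = NULL;
	}
}

static void free_request(struct req *req)
{
	clear_request(req);		/* idempotent: safe if already cleared */
	free(req);
}

int main(void)
{
	struct ctx *c = calloc(1, sizeof(*c));
	struct req *r = calloc(1, sizeof(*r));

	c->users = 1;
	r->page = malloc(4096);
	r->ctx = c;
	clear_request(r);		/* early clear, e.g. after I/O completes */
	free_request(r);		/* still safe: fields are NULL now */
	return 0;
}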
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c752d944fe9e..611bec22f552 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -29,7 +29,6 @@
 
 #include <linux/types.h>
 #include <linux/param.h>
-#include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
@@ -225,35 +224,60 @@ static int nfs_proc_readlink(struct inode *inode, struct page *page,
 	return status;
 }
 
+struct nfs_createdata {
+	struct nfs_createargs arg;
+	struct nfs_diropok res;
+	struct nfs_fh fhandle;
+	struct nfs_fattr fattr;
+};
+
+static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir,
+		struct dentry *dentry, struct iattr *sattr)
+{
+	struct nfs_createdata *data;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+	if (data != NULL) {
+		data->arg.fh = NFS_FH(dir);
+		data->arg.name = dentry->d_name.name;
+		data->arg.len = dentry->d_name.len;
+		data->arg.sattr = sattr;
+		nfs_fattr_init(&data->fattr);
+		data->fhandle.size = 0;
+		data->res.fh = &data->fhandle;
+		data->res.fattr = &data->fattr;
+	}
+	return data;
+};
+
+static void nfs_free_createdata(const struct nfs_createdata *data)
+{
+	kfree(data);
+}
+
 static int
 nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		int flags, struct nameidata *nd)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
-	struct nfs_createargs arg = {
-		.fh = NFS_FH(dir),
-		.name = dentry->d_name.name,
-		.len = dentry->d_name.len,
-		.sattr = sattr
-	};
-	struct nfs_diropok res = {
-		.fh = &fhandle,
-		.fattr = &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs_procedures[NFSPROC_CREATE],
-		.rpc_argp = &arg,
-		.rpc_resp = &res,
 	};
-	int status;
+	int status = -ENOMEM;
 
-	nfs_fattr_init(&fattr);
 	dprintk("NFS call create %s\n", dentry->d_name.name);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply create: %d\n", status);
 	return status;
 }
@@ -265,24 +289,12 @@ static int
 nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		dev_t rdev)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
-	struct nfs_createargs arg = {
-		.fh = NFS_FH(dir),
-		.name = dentry->d_name.name,
-		.len = dentry->d_name.len,
-		.sattr = sattr
-	};
-	struct nfs_diropok res = {
-		.fh = &fhandle,
-		.fattr = &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs_procedures[NFSPROC_CREATE],
-		.rpc_argp = &arg,
-		.rpc_resp = &res,
 	};
-	int status, mode;
+	umode_t mode;
+	int status = -ENOMEM;
 
 	dprintk("NFS call mknod %s\n", dentry->d_name.name);
 
@@ -295,17 +307,24 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
 	}
 
-	nfs_fattr_init(&fattr);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 
 	if (status == -EINVAL && S_ISFIFO(mode)) {
 		sattr->ia_mode = mode;
-		nfs_fattr_init(&fattr);
+		nfs_fattr_init(data->res.fattr);
 		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	}
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply mknod: %d\n", status);
 	return status;
 }
@@ -399,8 +418,8 @@ static int
 nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 		unsigned int len, struct iattr *sattr)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
+	struct nfs_fh *fh;
+	struct nfs_fattr *fattr;
 	struct nfs_symlinkargs arg = {
 		.fromfh = NFS_FH(dir),
 		.fromname = dentry->d_name.name,
@@ -413,12 +432,18 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 		.rpc_proc = &nfs_procedures[NFSPROC_SYMLINK],
 		.rpc_argp = &arg,
 	};
-	int status;
+	int status = -ENAMETOOLONG;
+
+	dprintk("NFS call symlink %s\n", dentry->d_name.name);
 
 	if (len > NFS2_MAXPATHLEN)
-		return -ENAMETOOLONG;
+		goto out;
 
-	dprintk("NFS call symlink %s\n", dentry->d_name.name);
+	fh = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	status = -ENOMEM;
+	if (fh == NULL || fattr == NULL)
+		goto out;
 
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
@@ -428,12 +453,12 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 	 * filehandle size to zero indicates to nfs_instantiate that it
 	 * should fill in the data with a LOOKUP call on the wire.
 	 */
-	if (status == 0) {
-		nfs_fattr_init(&fattr);
-		fhandle.size = 0;
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
-	}
+	if (status == 0)
+		status = nfs_instantiate(dentry, fh, fattr);
 
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fh);
+out:
 	dprintk("NFS reply symlink: %d\n", status);
 	return status;
 }
@@ -441,31 +466,25 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 static int
 nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
-	struct nfs_createargs arg = {
-		.fh = NFS_FH(dir),
-		.name = dentry->d_name.name,
-		.len = dentry->d_name.len,
-		.sattr = sattr
-	};
-	struct nfs_diropok res = {
-		.fh = &fhandle,
-		.fattr = &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs_procedures[NFSPROC_MKDIR],
-		.rpc_argp = &arg,
-		.rpc_resp = &res,
 	};
-	int status;
+	int status = -ENOMEM;
 
 	dprintk("NFS call mkdir %s\n", dentry->d_name.name);
-	nfs_fattr_init(&fattr);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply mkdir: %d\n", status);
 	return status;
 }
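nfs_proc_create(), nfs_proc_mknod() and nfs_proc_mkdir() previously kept struct nfs_fh, struct nfs_fattr and the RPC argument/result structs on the stack; nfs_alloc_createdata() above packs them into one heap object whose result pointers are wired to its own storage. A reduced userspace sketch of that self-referential bundling follows, with illustrative types rather than the kernel's.

#include <stdlib.h>

struct fh { unsigned char data[128]; };
struct fattr { long size; };
struct args { const char *name; };
struct res { struct fh *fh; struct fattr *fattr; };

struct createdata {
	struct args arg;
	struct res res;
	struct fh fhandle;	/* storage the res pointers refer to */
	struct fattr fattr;
};

static struct createdata *alloc_createdata(const char *name)
{
	struct createdata *d = calloc(1, sizeof(*d));

	if (d) {
		d->arg.name = name;
		d->res.fh = &d->fhandle;	/* wire up once, reuse everywhere */
		d->res.fattr = &d->fattr;
	}
	return d;
}

int main(void)
{
	struct createdata *d = alloc_createdata("file");
	free(d);		/* one kfree() replaces several stack objects */
	return 0;
}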
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index db9b360ae19d..6e2b06e6ca79 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -40,7 +40,7 @@ static mempool_t *nfs_rdata_mempool;
 
 struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 {
-	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
+	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
@@ -50,7 +50,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
-			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
 			if (!p->pagevec) {
 				mempool_free(p, nfs_rdata_mempool);
 				p = NULL;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1afee4eea77..2f8b1157daa2 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -48,6 +48,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <net/ipv6.h>
 #include <linux/netdevice.h>
 #include <linux/nfs_xdr.h>
@@ -140,7 +141,6 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_resvport, "resvport" },
 	{ Opt_noresvport, "noresvport" },
 	{ Opt_fscache, "fsc" },
-	{ Opt_fscache_uniq, "fsc=%s" },
 	{ Opt_nofscache, "nofsc" },
 
 	{ Opt_port, "port=%s" },
@@ -170,6 +170,7 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_mountaddr, "mountaddr=%s" },
 
 	{ Opt_lookupcache, "lookupcache=%s" },
+	{ Opt_fscache_uniq, "fsc=%s" },
 
 	{ Opt_err, NULL }
 };
@@ -422,15 +423,19 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	unsigned char blockbits;
 	unsigned long blockres;
 	struct nfs_fh *fh = NFS_FH(dentry->d_inode);
-	struct nfs_fattr fattr;
-	struct nfs_fsstat res = {
-		.fattr = &fattr,
-	};
-	int error;
+	struct nfs_fsstat res;
+	int error = -ENOMEM;
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		goto out_err;
 
 	error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
+
+	nfs_free_fattr(res.fattr);
 	if (error < 0)
 		goto out_err;
+
 	buf->f_type = NFS_SUPER_MAGIC;
 
 	/*
@@ -1045,14 +1050,6 @@ static int nfs_parse_mount_options(char *raw,
 			kfree(mnt->fscache_uniq);
 			mnt->fscache_uniq = NULL;
 			break;
-		case Opt_fscache_uniq:
-			string = match_strdup(args);
-			if (!string)
-				goto out_nomem;
-			kfree(mnt->fscache_uniq);
-			mnt->fscache_uniq = string;
-			mnt->options |= NFS_OPTION_FSCACHE;
-			break;
 
 		/*
 		 * options that take numeric values
@@ -1383,6 +1380,14 @@ static int nfs_parse_mount_options(char *raw,
 				return 0;
 			};
 			break;
+		case Opt_fscache_uniq:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = string;
+			mnt->options |= NFS_OPTION_FSCACHE;
+			break;
 
 		/*
 		 * Special options
@@ -2171,7 +2176,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	int error = -ENOMEM;
 
 	data = nfs_alloc_parsed_mount_data(3);
-	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
 
@@ -2186,6 +2191,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	if (data->version == 4) {
 		error = nfs4_try_mount(flags, dev_name, data, mnt);
 		kfree(data->client_address);
+		kfree(data->nfs_server.export_path);
 		goto out;
 	}
 #endif /* CONFIG_NFS_V4 */
@@ -2214,7 +2220,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2245,7 +2251,7 @@ out:
 	kfree(data->fscache_uniq);
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
-	kfree(mntfh);
+	nfs_free_fhandle(mntfh);
 	kfree(data);
 	return error;
 
@@ -2256,6 +2262,9 @@ out_err_nosb:
 error_splat_root:
 	dput(mntroot);
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	goto out;
 }
@@ -2326,7 +2335,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2363,6 +2372,9 @@ out_err_noserver:
 	return error;
 
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
 	return error;
@@ -2548,7 +2560,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 	};
 	int error = -ENOMEM;
 
-	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
 
@@ -2578,7 +2590,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2606,7 +2618,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 out:
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
-	kfree(mntfh);
+	nfs_free_fhandle(mntfh);
 	return error;
 
 out_free:
@@ -2616,6 +2628,9 @@ out_free:
 error_splat_root:
 	dput(mntroot);
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	goto out;
 }
@@ -2647,7 +2662,7 @@ static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
 	devname = nfs_path(path->mnt->mnt_devname,
 			path->mnt->mnt_root, path->dentry,
 			page, PAGE_SIZE);
-	if (devname == NULL)
+	if (IS_ERR(devname))
 		goto out_freepage;
 	tmp = kstrdup(devname, GFP_KERNEL);
 	if (tmp == NULL)
@@ -2658,41 +2673,120 @@ out_freepage:
 	free_page((unsigned long)page);
 }
 
+struct nfs_referral_count {
+	struct list_head list;
+	const struct task_struct *task;
+	unsigned int referral_count;
+};
+
+static LIST_HEAD(nfs_referral_count_list);
+static DEFINE_SPINLOCK(nfs_referral_count_list_lock);
+
+static struct nfs_referral_count *nfs_find_referral_count(void)
+{
+	struct nfs_referral_count *p;
+
+	list_for_each_entry(p, &nfs_referral_count_list, list) {
+		if (p->task == current)
+			return p;
+	}
+	return NULL;
+}
+
+#define NFS_MAX_NESTED_REFERRALS 2
+
+static int nfs_referral_loop_protect(void)
+{
+	struct nfs_referral_count *p, *new;
+	int ret = -ENOMEM;
+
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		goto out;
+	new->task = current;
+	new->referral_count = 1;
+
+	ret = 0;
+	spin_lock(&nfs_referral_count_list_lock);
+	p = nfs_find_referral_count();
+	if (p != NULL) {
+		if (p->referral_count >= NFS_MAX_NESTED_REFERRALS)
+			ret = -ELOOP;
+		else
+			p->referral_count++;
+	} else {
+		list_add(&new->list, &nfs_referral_count_list);
+		new = NULL;
+	}
+	spin_unlock(&nfs_referral_count_list_lock);
+	kfree(new);
+out:
+	return ret;
+}
+
+static void nfs_referral_loop_unprotect(void)
+{
+	struct nfs_referral_count *p;
+
+	spin_lock(&nfs_referral_count_list_lock);
+	p = nfs_find_referral_count();
+	p->referral_count--;
+	if (p->referral_count == 0)
+		list_del(&p->list);
+	else
+		p = NULL;
+	spin_unlock(&nfs_referral_count_list_lock);
+	kfree(p);
+}
+
 static int nfs_follow_remote_path(struct vfsmount *root_mnt,
 		const char *export_path, struct vfsmount *mnt_target)
 {
+	struct nameidata *nd = NULL;
 	struct mnt_namespace *ns_private;
-	struct nameidata nd;
 	struct super_block *s;
 	int ret;
 
+	nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+	if (nd == NULL)
+		return -ENOMEM;
+
 	ns_private = create_mnt_ns(root_mnt);
 	ret = PTR_ERR(ns_private);
 	if (IS_ERR(ns_private))
 		goto out_mntput;
 
+	ret = nfs_referral_loop_protect();
+	if (ret != 0)
+		goto out_put_mnt_ns;
+
 	ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
-			export_path, LOOKUP_FOLLOW, &nd);
+			export_path, LOOKUP_FOLLOW, nd);
 
+	nfs_referral_loop_unprotect();
 	put_mnt_ns(ns_private);
 
 	if (ret != 0)
 		goto out_err;
 
-	s = nd.path.mnt->mnt_sb;
+	s = nd->path.mnt->mnt_sb;
 	atomic_inc(&s->s_active);
 	mnt_target->mnt_sb = s;
-	mnt_target->mnt_root = dget(nd.path.dentry);
+	mnt_target->mnt_root = dget(nd->path.dentry);
 
 	/* Correct the device pathname */
-	nfs_fix_devname(&nd.path, mnt_target);
+	nfs_fix_devname(&nd->path, mnt_target);
 
-	path_put(&nd.path);
+	path_put(&nd->path);
+	kfree(nd);
 	down_write(&s->s_umount);
 	return 0;
+out_put_mnt_ns:
+	put_mnt_ns(ns_private);
 out_mntput:
 	mntput(root_mnt);
 out_err:
+	kfree(nd);
 	return ret;
 }
 
@@ -2811,7 +2905,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2847,6 +2941,9 @@ out_err_noserver:
 	return error;
 
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
 	return error;
@@ -2860,17 +2957,21 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	struct super_block *s;
 	struct nfs_server *server;
 	struct dentry *mntroot;
-	struct nfs_fh mntfh;
+	struct nfs_fh *mntfh;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	struct nfs_sb_mountdata sb_mntdata = {
 		.mntflags = flags,
 	};
-	int error;
+	int error = -ENOMEM;
 
 	dprintk("--> nfs4_referral_get_sb()\n");
 
+	mntfh = nfs_alloc_fhandle();
+	if (mntfh == NULL)
+		goto out_err_nofh;
+
 	/* create a new volume representation */
-	server = nfs4_create_referral_server(data, &mntfh);
+	server = nfs4_create_referral_server(data, mntfh);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out_err_noserver;
@@ -2893,7 +2994,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2902,7 +3003,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
-	mntroot = nfs4_get_root(s, &mntfh);
+	mntroot = nfs4_get_root(s, mntfh);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
@@ -2919,17 +3020,24 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 
 	security_sb_clone_mnt_opts(data->sb, s);
 
+	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = 0\n");
 	return 0;
 
 out_err_nosb:
 	nfs_free_server(server);
out_err_noserver:
+	nfs_free_fhandle(mntfh);
+out_err_nofh:
 	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
 	return error;
 
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
+	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
 	return error;
 }
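Among the super.c changes, nfs_referral_loop_protect() caps how many NFSv4 referral lookups a single task may have in flight (NFS_MAX_NESTED_REFERRALS, i.e. 2), turning a runaway referral chain into -ELOOP instead of unbounded recursion on a heap-allocated nameidata. The single-threaded userspace sketch below shows only the shape of the guard; the kernel keys the count off current and protects the list with a spinlock.

#include <errno.h>
#include <stdio.h>

#define MAX_NESTED 2
static int referral_count;	/* per-task in the kernel, found via a list */

static int loop_protect(void)
{
	if (referral_count >= MAX_NESTED)
		return -ELOOP;
	referral_count++;
	return 0;
}

static void loop_unprotect(void) { referral_count--; }

static int follow_referral(int depth)
{
	int ret = loop_protect();

	if (ret)
		return ret;
	if (depth > 0)
		ret = follow_referral(depth - 1);	/* nested referral */
	loop_unprotect();
	return ret;
}

int main(void)
{
	printf("depth 1: %d\n", follow_referral(1));	/* 0 */
	printf("depth 3: %d\n", follow_referral(3));	/* -ELOOP */
	return 0;
}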
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 2ea9e5c27e55..05c9e02f4153 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -19,7 +19,6 @@
 #include <linux/pagemap.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/namei.h>
 
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 6da3d3ff6edd..a2242af6a17d 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -23,6 +23,7 @@ struct nfs_unlinkdata {
 	struct nfs_removeres res;
 	struct inode *dir;
 	struct rpc_cred *cred;
+	struct nfs_fattr dir_attr;
 };
 
 /**
@@ -169,7 +170,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 	}
 	nfs_sb_active(dir->i_sb);
 	data->args.fh = NFS_FH(dir);
-	nfs_fattr_init(&data->res.dir_attr);
+	nfs_fattr_init(data->res.dir_attr);
 
 	NFS_PROTO(dir)->unlink_setup(&msg, dir);
 
@@ -259,6 +260,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 		goto out_free;
 	}
 	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+	data->res.dir_attr = &data->dir_attr;
 
 	status = -EBUSY;
 	spin_lock(&dentry->d_lock);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53ff70e23993..3aea3ca98ab7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -201,6 +201,7 @@ static int nfs_set_page_writeback(struct page *page)
 		struct inode *inode = page->mapping->host;
 		struct nfs_server *nfss = NFS_SERVER(inode);
 
+		page_cache_get(page);
 		if (atomic_long_inc_return(&nfss->writeback) >
 				NFS_CONGESTION_ON_THRESH) {
 			set_bdi_congested(&nfss->backing_dev_info,
@@ -216,6 +217,7 @@ static void nfs_end_page_writeback(struct page *page)
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
+	page_cache_release(page);
 	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
 		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
@@ -421,6 +423,7 @@ static void
 nfs_mark_request_dirty(struct nfs_page *req)
 {
 	__set_page_dirty_nobuffers(req->wb_page);
+	__mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -660,9 +663,11 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	req = nfs_setup_write_request(ctx, page, offset, count);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
+	nfs_mark_request_dirty(req);
 	/* Update file length */
 	nfs_grow_file(page, offset, count);
 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+	nfs_mark_request_dirty(req);
 	nfs_clear_page_tag_locked(req);
 	return 0;
 }
@@ -739,8 +744,6 @@ int nfs_updatepage(struct file *file, struct page *page,
 	status = nfs_writepage_setup(ctx, page, offset, count);
 	if (status < 0)
 		nfs_set_pageerror(page);
-	else
-		__set_page_dirty_nobuffers(page);
 
 	dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
 			status, (long long)i_size_read(inode));
@@ -749,13 +752,12 @@ int nfs_updatepage(struct file *file, struct page *page,
 
 static void nfs_writepage_release(struct nfs_page *req)
 {
+	struct page *page = req->wb_page;
 
-	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
-		nfs_end_page_writeback(req->wb_page);
+	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req))
 		nfs_inode_remove_request(req);
-	} else
-		nfs_end_page_writeback(req->wb_page);
 	nfs_clear_page_tag_locked(req);
+	nfs_end_page_writeback(page);
 }
 
 static int flush_task_priority(int how)
@@ -779,7 +781,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		int how)
 {
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
-	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
@@ -794,9 +795,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = flags,
+		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
+	int ret = 0;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
@@ -835,10 +837,18 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 			(unsigned long long)data->args.offset);
 
 	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		return PTR_ERR(task);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	if (how & FLUSH_SYNC) {
+		ret = rpc_wait_for_completion_task(task);
+		if (ret == 0)
+			ret = task->tk_status;
+	}
 	rpc_put_task(task);
-	return 0;
+out:
+	return ret;
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -847,9 +857,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 */
 static void nfs_redirty_request(struct nfs_page *req)
 {
+	struct page *page = req->wb_page;
+
 	nfs_mark_request_dirty(req);
-	nfs_end_page_writeback(req->wb_page);
 	nfs_clear_page_tag_locked(req);
+	nfs_end_page_writeback(page);
 }
 
 /*
@@ -1084,16 +1096,15 @@ static void nfs_writeback_release_full(void *calldata)
 		if (nfs_write_need_commit(data)) {
 			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
 			nfs_mark_request_commit(req);
-			nfs_end_page_writeback(page);
 			dprintk(" marked for commit\n");
 			goto next;
 		}
 		dprintk(" OK\n");
remove_request:
-		nfs_end_page_writeback(page);
 		nfs_inode_remove_request(req);
 	next:
 		nfs_clear_page_tag_locked(req);
+		nfs_end_page_writeback(page);
 	}
 	nfs_writedata_release(calldata);
 }
@@ -1190,6 +1201,25 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
+{
+	if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
+		return 1;
+	if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags,
+				NFS_INO_COMMIT, nfs_wait_bit_killable,
+				TASK_KILLABLE))
+		return 1;
+	return 0;
+}
+
+static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+{
+	clear_bit(NFS_INO_COMMIT, &nfsi->flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
+}
+
+
 static void nfs_commitdata_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
@@ -1207,7 +1237,6 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 {
 	struct nfs_page *first = nfs_list_entry(head->next);
 	struct inode *inode = first->wb_context->path.dentry->d_inode;
-	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
@@ -1222,7 +1251,7 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 		.callback_ops = &nfs_commit_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = flags,
+		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
 
@@ -1282,6 +1311,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 				BDI_RECLAIMABLE);
 		nfs_clear_page_tag_locked(req);
 	}
+	nfs_commit_clear_lock(NFS_I(inode));
 	return -ENOMEM;
 }
 
@@ -1337,6 +1367,7 @@ static void nfs_commit_release(void *calldata)
 	next:
 		nfs_clear_page_tag_locked(req);
 	}
+	nfs_commit_clear_lock(NFS_I(data->inode));
 	nfs_commitdata_release(calldata);
 }
 
@@ -1351,8 +1382,11 @@ static const struct rpc_call_ops nfs_commit_ops = {
 static int nfs_commit_inode(struct inode *inode, int how)
 {
 	LIST_HEAD(head);
-	int res;
+	int may_wait = how & FLUSH_SYNC;
+	int res = 0;
 
+	if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
+		goto out;
 	spin_lock(&inode->i_lock);
 	res = nfs_scan_commit(inode, &head, 0, 0);
 	spin_unlock(&inode->i_lock);
@@ -1360,7 +1394,13 @@ static int nfs_commit_inode(struct inode *inode, int how)
 		int error = nfs_commit_list(inode, &head, how);
 		if (error < 0)
 			return error;
-	}
+		if (may_wait)
+			wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
+					nfs_wait_bit_killable,
+					TASK_KILLABLE);
+	} else
+		nfs_commit_clear_lock(NFS_I(inode));
+out:
 	return res;
 }
 
@@ -1432,6 +1472,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 
 	BUG_ON(!PageLocked(page));
 	for (;;) {
+		wait_on_page_writeback(page);
 		req = nfs_page_find_request(page);
 		if (req == NULL)
 			break;
@@ -1466,30 +1507,18 @@ int nfs_wb_page(struct inode *inode, struct page *page)
 		.range_start = range_start,
 		.range_end = range_end,
 	};
-	struct nfs_page *req;
-	int need_commit;
 	int ret;
 
 	while(PagePrivate(page)) {
+		wait_on_page_writeback(page);
 		if (clear_page_dirty_for_io(page)) {
 			ret = nfs_writepage_locked(page, &wbc);
 			if (ret < 0)
 				goto out_error;
 		}
-		req = nfs_find_and_lock_request(page);
-		if (!req)
-			break;
-		if (IS_ERR(req)) {
-			ret = PTR_ERR(req);
+		ret = sync_inode(inode, &wbc);
+		if (ret < 0)
 			goto out_error;
-		}
-		need_commit = test_bit(PG_CLEAN, &req->wb_flags);
-		nfs_clear_page_tag_locked(req);
-		if (need_commit) {
-			ret = nfs_commit_inode(inode, FLUSH_SYNC);
-			if (ret < 0)
-				goto out_error;
-		}
 	}
 	return 0;
out_error:
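nfs_commit_set_lock()/nfs_commit_clear_lock() above serialize COMMIT on a single inode flag bit: a non-waiting caller backs off if a commit is already in flight, while a FLUSH_SYNC caller sleeps until the bit clears. The userspace sketch below shows the protocol shape only, using a C11 atomic flag; the kernel's killable, sleeping wait is reduced to a spin for brevity, and the names mirror the patch rather than any userspace API.

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag commit_bit = ATOMIC_FLAG_INIT;

static int commit_set_lock(int may_wait)
{
	if (!atomic_flag_test_and_set(&commit_bit))
		return 1;			/* got the lock */
	if (!may_wait)
		return 0;			/* someone else is committing */
	while (atomic_flag_test_and_set(&commit_bit))
		;				/* kernel: wait_on_bit_lock() */
	return 1;
}

static void commit_clear_lock(void)
{
	/* kernel: clear_bit() + memory barrier + wake_up_bit() */
	atomic_flag_clear(&commit_bit);
}

int main(void)
{
	if (commit_set_lock(0)) {		/* single-threaded demo */
		puts("committing");
		commit_clear_lock();
	}
	return 0;
}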
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 04133aacb1e5..fc1c52571c03 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -22,6 +22,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/nfsacl.h>
 #include <linux/nfs3.h>
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a0c4016413f1..c2a4f71d87dd 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -12,6 +12,7 @@
12 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> 12 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
13 */ 13 */
14 14
15#include <linux/slab.h>
15#include <linux/namei.h> 16#include <linux/namei.h>
16#include <linux/module.h> 17#include <linux/module.h>
17#include <linux/exportfs.h> 18#include <linux/exportfs.h>
@@ -258,10 +259,9 @@ static struct cache_detail svc_expkey_cache = {
258 .alloc = expkey_alloc, 259 .alloc = expkey_alloc,
259}; 260};
260 261
261static struct svc_expkey * 262static int
262svc_expkey_lookup(struct svc_expkey *item) 263svc_expkey_hash(struct svc_expkey *item)
263{ 264{
264 struct cache_head *ch;
265 int hash = item->ek_fsidtype; 265 int hash = item->ek_fsidtype;
266 char * cp = (char*)item->ek_fsid; 266 char * cp = (char*)item->ek_fsid;
267 int len = key_len(item->ek_fsidtype); 267 int len = key_len(item->ek_fsidtype);
@@ -269,6 +269,14 @@ svc_expkey_lookup(struct svc_expkey *item)
269 hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); 269 hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
270 hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); 270 hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS);
271 hash &= EXPKEY_HASHMASK; 271 hash &= EXPKEY_HASHMASK;
272 return hash;
273}
274
275static struct svc_expkey *
276svc_expkey_lookup(struct svc_expkey *item)
277{
278 struct cache_head *ch;
279 int hash = svc_expkey_hash(item);
272 280
273 ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h, 281 ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h,
274 hash); 282 hash);
@@ -282,13 +290,7 @@ static struct svc_expkey *
282svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) 290svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
283{ 291{
284 struct cache_head *ch; 292 struct cache_head *ch;
285 int hash = new->ek_fsidtype; 293 int hash = svc_expkey_hash(new);
286 char * cp = (char*)new->ek_fsid;
287 int len = key_len(new->ek_fsidtype);
288
289 hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
290 hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS);
291 hash &= EXPKEY_HASHMASK;
292 294
293 ch = sunrpc_cache_update(&svc_expkey_cache, &new->h, 295 ch = sunrpc_cache_update(&svc_expkey_cache, &new->h,
294 &old->h, hash); 296 &old->h, hash);
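
Both svc_expkey_lookup() and svc_expkey_update() previously open-coded the same XOR-and-mask hash; the hunks above hoist it into svc_expkey_hash() so the two callers cannot drift apart (the svc_export hunks below apply the same refactor). A sketch of the pattern on a toy key type, with illustrative names:

    #include <stdio.h>

    #define KEY_HASHBITS 8
    #define KEY_HASHMASK ((1 << KEY_HASHBITS) - 1)

    struct key { int type; unsigned long owner; };

    /* One definition of the hash, shared by lookup and update paths. */
    static int key_hash(const struct key *k)
    {
        unsigned int h = (unsigned int)k->type;
        h ^= (unsigned int)(k->owner >> 4);
        return h & KEY_HASHMASK;
    }

    static void lookup(const struct key *k) { printf("lookup bucket %d\n", key_hash(k)); }
    static void update(const struct key *k) { printf("update bucket %d\n", key_hash(k)); }

    int main(void)
    {
        struct key k = { .type = 1, .owner = 0xdeadbeefUL };
        lookup(&k);
        update(&k);  /* guaranteed to land in the same bucket as lookup() */
        return 0;
    }
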
@@ -737,14 +739,22 @@ struct cache_detail svc_export_cache = {
737 .alloc = svc_export_alloc, 739 .alloc = svc_export_alloc,
738}; 740};
739 741
740static struct svc_export * 742static int
741svc_export_lookup(struct svc_export *exp) 743svc_export_hash(struct svc_export *exp)
742{ 744{
743 struct cache_head *ch;
744 int hash; 745 int hash;
746
745 hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); 747 hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS);
746 hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); 748 hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS);
747 hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); 749 hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS);
750 return hash;
751}
752
753static struct svc_export *
754svc_export_lookup(struct svc_export *exp)
755{
756 struct cache_head *ch;
757 int hash = svc_export_hash(exp);
748 758
749 ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h, 759 ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h,
750 hash); 760 hash);
@@ -758,10 +768,7 @@ static struct svc_export *
758svc_export_update(struct svc_export *new, struct svc_export *old) 768svc_export_update(struct svc_export *new, struct svc_export *old)
759{ 769{
760 struct cache_head *ch; 770 struct cache_head *ch;
761 int hash; 771 int hash = svc_export_hash(old);
762 hash = hash_ptr(old->ex_client, EXPORT_HASHBITS);
763 hash ^= hash_ptr(old->ex_path.dentry, EXPORT_HASHBITS);
764 hash ^= hash_ptr(old->ex_path.mnt, EXPORT_HASHBITS);
765 772
766 ch = sunrpc_cache_update(&svc_export_cache, &new->h, 773 ch = sunrpc_cache_update(&svc_export_cache, &new->h,
767 &old->h, 774 &old->h,
@@ -1070,9 +1077,9 @@ exp_export(struct nfsctl_export *nxp)
1070 err = 0; 1077 err = 0;
1071finish: 1078finish:
1072 kfree(new.ex_pathname); 1079 kfree(new.ex_pathname);
1073 if (exp) 1080 if (!IS_ERR_OR_NULL(exp))
1074 exp_put(exp); 1081 exp_put(exp);
1075 if (fsid_key && !IS_ERR(fsid_key)) 1082 if (!IS_ERR_OR_NULL(fsid_key))
1076 cache_put(&fsid_key->h, &svc_expkey_cache); 1083 cache_put(&fsid_key->h, &svc_expkey_cache);
1077 path_put(&path); 1084 path_put(&path);
1078out_put_clp: 1085out_put_clp:
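
The exp_export() cleanup above previously mixed a plain NULL test for exp with a hand-rolled "set and not an error" test for fsid_key; IS_ERR_OR_NULL() covers both outcomes of a lookup that can return NULL, an ERR_PTR-encoded errno, or a valid pointer. A userspace rendering of the err.h encoding (the real macros live in include/linux/err.h):

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095  /* errnos occupy the last page of addresses */

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline bool IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }
    static inline bool IS_ERR_OR_NULL(const void *ptr)
    {
        return !ptr || IS_ERR(ptr);
    }

    int main(void)
    {
        void *ok = &(int){ 0 }, *none = NULL, *err = ERR_PTR(-12 /* ENOMEM */);
        /* Only the valid pointer should reach an exp_put()/cache_put()
         * style release; prints "1 0 0". */
        printf("%d %d %d\n", !IS_ERR_OR_NULL(ok), !IS_ERR_OR_NULL(none),
               !IS_ERR_OR_NULL(err));
        return 0;
    }
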
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index f20589d2ae27..6aa5590c3679 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -7,6 +7,7 @@
7#include "nfsd.h" 7#include "nfsd.h"
8/* FIXME: nfsacl.h is a broken header */ 8/* FIXME: nfsacl.h is a broken header */
9#include <linux/nfsacl.h> 9#include <linux/nfsacl.h>
10#include <linux/gfp.h>
10#include "cache.h" 11#include "cache.h"
11#include "xdr3.h" 12#include "xdr3.h"
12#include "vfs.h" 13#include "vfs.h"
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index e0c4846bad92..a596e9d987e4 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -7,6 +7,7 @@
7#include "nfsd.h" 7#include "nfsd.h"
8/* FIXME: nfsacl.h is a broken header */ 8/* FIXME: nfsacl.h is a broken header */
9#include <linux/nfsacl.h> 9#include <linux/nfsacl.h>
10#include <linux/gfp.h>
10#include "cache.h" 11#include "cache.h"
11#include "xdr3.h" 12#include "xdr3.h"
12#include "vfs.h" 13#include "vfs.h"
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 88150685df34..e48052615159 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -34,6 +34,7 @@
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36 36
37#include <linux/slab.h>
37#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
38#include <linux/nfs4_acl.h> 39#include <linux/nfs4_acl.h>
39 40
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4bc22c763de7..eb78e7e22077 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,8 @@
32 */ 32 */
33 33
34#include <linux/sunrpc/clnt.h> 34#include <linux/sunrpc/clnt.h>
35#include <linux/sunrpc/svc_xprt.h>
36#include <linux/slab.h>
35#include "nfsd.h" 37#include "nfsd.h"
36#include "state.h" 38#include "state.h"
37 39
@@ -78,11 +80,6 @@ enum nfs_cb_opnum4 {
78 cb_sequence_dec_sz + \ 80 cb_sequence_dec_sz + \
79 op_dec_sz) 81 op_dec_sz)
80 82
81struct nfs4_rpc_args {
82 void *args_op;
83 struct nfsd4_cb_sequence args_seq;
84};
85
86/* 83/*
87* Generic encode routines from fs/nfs/nfs4xdr.c 84* Generic encode routines from fs/nfs/nfs4xdr.c
88*/ 85*/
@@ -427,13 +424,19 @@ static struct rpc_procinfo nfs4_cb_procedures[] = {
427}; 424};
428 425
429static struct rpc_version nfs_cb_version4 = { 426static struct rpc_version nfs_cb_version4 = {
427/*
428 * Note on the callback rpc program version number: despite language in rfc
429 * 5661 section 18.36.3 requiring servers to use 4 in this field, the
430 * official xdr descriptions for both 4.0 and 4.1 specify version 1, and
431 * in practice that appears to be what implementations use. The section
432 * 18.36.3 language is expected to be fixed in an erratum.
433 */
430 .number = 1, 434 .number = 1,
431 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), 435 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures),
432 .procs = nfs4_cb_procedures 436 .procs = nfs4_cb_procedures
433}; 437};
434 438
435static struct rpc_version * nfs_cb_version[] = { 439static struct rpc_version * nfs_cb_version[] = {
436 NULL,
437 &nfs_cb_version4, 440 &nfs_cb_version4,
438}; 441};
439 442
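
With the NULL placeholder gone from nfs_cb_version[], slot 0 now holds nfs_cb_version4, so a later hunk can pass .version = 0 as a plain array index while the on-the-wire version number stays 1, as the new comment explains. A sketch of that indexing invariant (struct trimmed to the one field involved):

    #include <assert.h>
    #include <stdio.h>

    struct rpc_version { unsigned int number; };  /* trimmed stand-in */

    static struct rpc_version nfs_cb_version4 = { .number = 1 };

    /* Before: { NULL, &nfs_cb_version4 } indexed via nfs_cb_version[1]->number.
     * After:  a dense array indexed directly by .version = 0. */
    static struct rpc_version *nfs_cb_version[] = { &nfs_cb_version4 };

    int main(void)
    {
        unsigned int slot = 0;                    /* args.version */
        struct rpc_version *v = nfs_cb_version[slot];
        assert(v->number == 1);                   /* wire version is still 1 */
        printf("slot %u -> rpc version %u\n", slot, v->number);
        return 0;
    }
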
@@ -455,15 +458,14 @@ static struct rpc_program cb_program = {
455 458
456static int max_cb_time(void) 459static int max_cb_time(void)
457{ 460{
458 return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ; 461 return max(nfsd4_lease/10, (time_t)1) * HZ;
459} 462}
460 463
461/* Reference counting, callback cleanup, etc., all look racy as heck. 464/* Reference counting, callback cleanup, etc., all look racy as heck.
462 * And why is cb_set an atomic? */ 465 * And why is cl_cb_set an atomic? */
463 466
464int setup_callback_client(struct nfs4_client *clp) 467int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
465{ 468{
466 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
467 struct rpc_timeout timeparms = { 469 struct rpc_timeout timeparms = {
468 .to_initval = max_cb_time(), 470 .to_initval = max_cb_time(),
469 .to_retries = 0, 471 .to_retries = 0,
@@ -475,7 +477,7 @@ int setup_callback_client(struct nfs4_client *clp)
475 .timeout = &timeparms, 477 .timeout = &timeparms,
476 .program = &cb_program, 478 .program = &cb_program,
477 .prognumber = cb->cb_prog, 479 .prognumber = cb->cb_prog,
478 .version = nfs_cb_version[1]->number, 480 .version = 0,
479 .authflavor = clp->cl_flavor, 481 .authflavor = clp->cl_flavor,
480 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 482 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
481 .client_name = clp->cl_principal, 483 .client_name = clp->cl_principal,
@@ -485,7 +487,7 @@ int setup_callback_client(struct nfs4_client *clp)
485 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 487 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
486 return -EINVAL; 488 return -EINVAL;
487 if (cb->cb_minorversion) { 489 if (cb->cb_minorversion) {
488 args.bc_xprt = clp->cl_cb_xprt; 490 args.bc_xprt = cb->cb_xprt;
489 args.protocol = XPRT_TRANSPORT_BC_TCP; 491 args.protocol = XPRT_TRANSPORT_BC_TCP;
490 } 492 }
491 /* Create RPC client */ 493 /* Create RPC client */
@@ -495,7 +497,7 @@ int setup_callback_client(struct nfs4_client *clp)
495 PTR_ERR(client)); 497 PTR_ERR(client));
496 return PTR_ERR(client); 498 return PTR_ERR(client);
497 } 499 }
498 cb->cb_client = client; 500 nfsd4_set_callback_client(clp, client);
499 return 0; 501 return 0;
500 502
501} 503}
@@ -513,8 +515,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
513 if (task->tk_status) 515 if (task->tk_status)
514 warn_no_callback_path(clp, task->tk_status); 516 warn_no_callback_path(clp, task->tk_status);
515 else 517 else
516 atomic_set(&clp->cl_cb_conn.cb_set, 1); 518 atomic_set(&clp->cl_cb_set, 1);
517 put_nfs4_client(clp);
518} 519}
519 520
520static const struct rpc_call_ops nfsd4_cb_probe_ops = { 521static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -536,7 +537,6 @@ int set_callback_cred(void)
536 537
537void do_probe_callback(struct nfs4_client *clp) 538void do_probe_callback(struct nfs4_client *clp)
538{ 539{
539 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
540 struct rpc_message msg = { 540 struct rpc_message msg = {
541 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 541 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
542 .rpc_argp = clp, 542 .rpc_argp = clp,
@@ -544,34 +544,27 @@ void do_probe_callback(struct nfs4_client *clp)
544 }; 544 };
545 int status; 545 int status;
546 546
547 status = rpc_call_async(cb->cb_client, &msg, 547 status = rpc_call_async(clp->cl_cb_client, &msg,
548 RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 548 RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
549 &nfsd4_cb_probe_ops, (void *)clp); 549 &nfsd4_cb_probe_ops, (void *)clp);
550 if (status) { 550 if (status)
551 warn_no_callback_path(clp, status); 551 warn_no_callback_path(clp, status);
552 put_nfs4_client(clp);
553 }
554} 552}
555 553
556/* 554/*
557 * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... 555 * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
558 */ 556 */
559void 557void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
560nfsd4_probe_callback(struct nfs4_client *clp)
561{ 558{
562 int status; 559 int status;
563 560
564 BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set)); 561 BUG_ON(atomic_read(&clp->cl_cb_set));
565 562
566 status = setup_callback_client(clp); 563 status = setup_callback_client(clp, cb);
567 if (status) { 564 if (status) {
568 warn_no_callback_path(clp, status); 565 warn_no_callback_path(clp, status);
569 return; 566 return;
570 } 567 }
571
572 /* the task holds a reference to the nfs4_client struct */
573 atomic_inc(&clp->cl_count);
574
575 do_probe_callback(clp); 568 do_probe_callback(clp);
576} 569}
577 570
@@ -657,18 +650,32 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
657 } 650 }
658} 651}
659 652
653
660static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 654static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
661{ 655{
662 struct nfs4_delegation *dp = calldata; 656 struct nfs4_delegation *dp = calldata;
663 struct nfs4_client *clp = dp->dl_client; 657 struct nfs4_client *clp = dp->dl_client;
658 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
664 659
665 nfsd4_cb_done(task, calldata); 660 nfsd4_cb_done(task, calldata);
666 661
662 if (current_rpc_client == NULL) {
663 /* We're shutting down; give up. */
664 /* XXX: err, or is it ok just to fall through
665 * and rpc_restart_call? */
666 return;
667 }
668
667 switch (task->tk_status) { 669 switch (task->tk_status) {
668 case -EIO: 670 case -EIO:
669 /* Network partition? */ 671 /* Network partition? */
670 atomic_set(&clp->cl_cb_conn.cb_set, 0); 672 atomic_set(&clp->cl_cb_set, 0);
671 warn_no_callback_path(clp, task->tk_status); 673 warn_no_callback_path(clp, task->tk_status);
674 if (current_rpc_client != task->tk_client) {
675 /* queue a callback on the new connection: */
676 nfsd4_cb_recall(dp);
677 return;
678 }
672 case -EBADHANDLE: 679 case -EBADHANDLE:
673 case -NFS4ERR_BAD_STATEID: 680 case -NFS4ERR_BAD_STATEID:
674 /* Race: client probably got cb_recall 681 /* Race: client probably got cb_recall
@@ -676,7 +683,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
676 break; 683 break;
677 default: 684 default:
678 /* success, or error we can't handle */ 685 /* success, or error we can't handle */
679 goto done; 686 return;
680 } 687 }
681 if (dp->dl_retries--) { 688 if (dp->dl_retries--) {
682 rpc_delay(task, 2*HZ); 689 rpc_delay(task, 2*HZ);
@@ -684,20 +691,16 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
684 rpc_restart_call(task); 691 rpc_restart_call(task);
685 return; 692 return;
686 } else { 693 } else {
687 atomic_set(&clp->cl_cb_conn.cb_set, 0); 694 atomic_set(&clp->cl_cb_set, 0);
688 warn_no_callback_path(clp, task->tk_status); 695 warn_no_callback_path(clp, task->tk_status);
689 } 696 }
690done:
691 kfree(task->tk_msg.rpc_argp);
692} 697}
693 698
694static void nfsd4_cb_recall_release(void *calldata) 699static void nfsd4_cb_recall_release(void *calldata)
695{ 700{
696 struct nfs4_delegation *dp = calldata; 701 struct nfs4_delegation *dp = calldata;
697 struct nfs4_client *clp = dp->dl_client;
698 702
699 nfs4_put_delegation(dp); 703 nfs4_put_delegation(dp);
700 put_nfs4_client(clp);
701} 704}
702 705
703static const struct rpc_call_ops nfsd4_cb_recall_ops = { 706static const struct rpc_call_ops nfsd4_cb_recall_ops = {
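
nfsd4_cb_recall_done() above no longer frees the rpc args (they are now embedded in the delegation), requeues the recall if the callback connection changed underneath it, and otherwise retries transient errors with a delay while dl_retries lasts. The bounded-retry shape, as a standalone sketch (recall_once and the status enum are invented placeholders):

    #include <stdbool.h>
    #include <stdio.h>

    enum cb_status { CB_OK, CB_EIO, CB_BADHANDLE };

    static enum cb_status recall_once(int attempt)
    {
        return attempt < 2 ? CB_EIO : CB_OK;  /* pretend the path heals */
    }

    /* Transient failures (-EIO, -EBADHANDLE, ...) are retried while the
     * budget lasts, mirroring "if (dp->dl_retries--) rpc_restart_call()". */
    static bool recall_with_retries(int retries)
    {
        for (int attempt = 0; ; attempt++) {
            switch (recall_once(attempt)) {
            case CB_OK:
                return true;          /* success, or error we can't handle */
            case CB_EIO:              /* network partition? */
            case CB_BADHANDLE:        /* client probably raced with cb_recall */
                if (retries-- <= 0)
                    return false;     /* warn_no_callback_path() analog */
                /* rpc_delay(task, 2*HZ) would sleep here before restarting */
            }
        }
    }

    int main(void)
    {
        printf("recall %s\n", recall_with_retries(2) ? "succeeded" : "abandoned");
        return 0;
    }
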
@@ -706,33 +709,75 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
706 .rpc_release = nfsd4_cb_recall_release, 709 .rpc_release = nfsd4_cb_recall_release,
707}; 710};
708 711
712static struct workqueue_struct *callback_wq;
713
714int nfsd4_create_callback_queue(void)
715{
716 callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
717 if (!callback_wq)
718 return -ENOMEM;
719 return 0;
720}
721
722void nfsd4_destroy_callback_queue(void)
723{
724 destroy_workqueue(callback_wq);
725}
726
727/* must be called under the state lock */
728void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
729{
730 struct rpc_clnt *old = clp->cl_cb_client;
731
732 clp->cl_cb_client = new;
733 /*
734 * After this, any work that saw the old value of cl_cb_client will
735 * be gone:
736 */
737 flush_workqueue(callback_wq);
738 /* So we can safely shut it down: */
739 if (old)
740 rpc_shutdown_client(old);
741}
742
709/* 743/*
710 * called with dp->dl_count inc'ed. 744 * called with dp->dl_count inc'ed.
711 */ 745 */
712void 746static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
713nfsd4_cb_recall(struct nfs4_delegation *dp)
714{ 747{
715 struct nfs4_client *clp = dp->dl_client; 748 struct nfs4_client *clp = dp->dl_client;
716 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; 749 struct rpc_clnt *clnt = clp->cl_cb_client;
717 struct nfs4_rpc_args *args; 750 struct nfs4_rpc_args *args = &dp->dl_recall.cb_args;
718 struct rpc_message msg = { 751 struct rpc_message msg = {
719 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], 752 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
720 .rpc_cred = callback_cred 753 .rpc_cred = callback_cred
721 }; 754 };
722 int status = -ENOMEM; 755 int status;
756
757 if (clnt == NULL)
758 return; /* Client is shutting down; give up. */
723 759
724 args = kzalloc(sizeof(*args), GFP_KERNEL);
725 if (!args)
726 goto out;
727 args->args_op = dp; 760 args->args_op = dp;
728 msg.rpc_argp = args; 761 msg.rpc_argp = args;
729 dp->dl_retries = 1; 762 dp->dl_retries = 1;
730 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, 763 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
731 &nfsd4_cb_recall_ops, dp); 764 &nfsd4_cb_recall_ops, dp);
732out: 765 if (status)
733 if (status) {
734 kfree(args);
735 put_nfs4_client(clp);
736 nfs4_put_delegation(dp); 766 nfs4_put_delegation(dp);
737 } 767}
768
769void nfsd4_do_callback_rpc(struct work_struct *w)
770{
771 /* XXX: for now, just send off delegation recall. */
772 /* In future, generalize to handle any sort of callback. */
773 struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work);
774 struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
775
776 _nfsd4_cb_recall(dp);
777}
778
779
780void nfsd4_cb_recall(struct nfs4_delegation *dp)
781{
782 queue_work(callback_wq, &dp->dl_recall.cb_work);
738} 783}
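
The new nfsd4_set_callback_client() above publishes the replacement rpc client, then flushes the callback workqueue so no queued work can still hold the old pointer, and only then shuts the old client down; _nfsd4_cb_recall() bails out when it observes NULL. A single-threaded model of that ordering (queue_work_toy/flush_workqueue_toy are toy reimplementations, not the kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    struct client { int id; };

    /* Toy work queue: flush() runs everything queued so far. */
    typedef void (*work_fn)(void);
    static work_fn pending[16];
    static int npending;

    static void queue_work_toy(work_fn fn) { pending[npending++] = fn; }
    static void flush_workqueue_toy(void)
    {
        for (int i = 0; i < npending; i++)
            pending[i]();
        npending = 0;
    }

    static struct client *cb_client;  /* cl_cb_client analog */

    static void do_recall(void)
    {
        if (cb_client == NULL)
            return;  /* shutting down; give up, as _nfsd4_cb_recall() does */
        printf("recall over client %d\n", cb_client->id);
    }

    static void set_callback_client(struct client *new)
    {
        struct client *old = cb_client;
        cb_client = new;
        flush_workqueue_toy();  /* all work that saw 'old' has now finished */
        free(old);              /* so the old client can be torn down safely */
    }

    int main(void)
    {
        struct client *c = malloc(sizeof(*c));
        c->id = 1;
        set_callback_client(c);
        queue_work_toy(do_recall);
        /* The queued recall is flushed (and bails out on NULL) before
         * 'c' is freed: */
        set_callback_client(NULL);
        return 0;
    }
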
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6e2983b27f3c..c78dbf493424 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
36#include <linux/nfsd_idmap.h> 36#include <linux/nfsd_idmap.h>
37#include <linux/seq_file.h> 37#include <linux/seq_file.h>
38#include <linux/sched.h> 38#include <linux/sched.h>
39#include <linux/slab.h>
39 40
40/* 41/*
41 * Cache entry 42 * Cache entry
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 37514c469846..59ec449b0c7f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -33,6 +33,7 @@
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */ 34 */
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/slab.h>
36 37
37#include "cache.h" 38#include "cache.h"
38#include "xdr4.h" 39#include "xdr4.h"
@@ -968,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[];
968static const char *nfsd4_op_name(unsigned opnum); 969static const char *nfsd4_op_name(unsigned opnum);
969 970
970/* 971/*
971 * Enforce NFSv4.1 COMPOUND ordering rules. 972 * Enforce NFSv4.1 COMPOUND ordering rules:
972 * 973 *
973 * TODO: 974 * Also note, enforced elsewhere:
974 * - enforce NFS4ERR_NOT_ONLY_OP, 975 * - SEQUENCE other than as first op results in
975 * - DESTROY_SESSION MUST be the final operation in the COMPOUND request. 976 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
977 * - BIND_CONN_TO_SESSION must be the only op in its compound
978 * (Will be enforced in nfsd4_bind_conn_to_session().)
979 * - DESTROY_SESSION must be the final operation in a compound, if
980 * sessionid's in SEQUENCE and DESTROY_SESSION are the same.
981 * (Enforced in nfsd4_destroy_session().)
976 */ 982 */
977static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args) 983static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
978{ 984{
979 if (args->minorversion && args->opcnt > 0) { 985 struct nfsd4_op *op = &args->ops[0];
980 struct nfsd4_op *op = &args->ops[0]; 986
981 return (op->status == nfserr_op_illegal) || 987 /* These ordering requirements don't apply to NFSv4.0: */
982 (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP); 988 if (args->minorversion == 0)
983 } 989 return nfs_ok;
984 return true; 990 /* This is weird, but OK, not our problem: */
991 if (args->opcnt == 0)
992 return nfs_ok;
993 if (op->status == nfserr_op_illegal)
994 return nfs_ok;
995 if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP))
996 return nfserr_op_not_in_session;
997 if (op->opnum == OP_SEQUENCE)
998 return nfs_ok;
999 if (args->opcnt != 1)
1000 return nfserr_not_only_op;
1001 return nfs_ok;
985} 1002}
986 1003
987/* 1004/*
@@ -1011,6 +1028,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1011 resp->rqstp = rqstp; 1028 resp->rqstp = rqstp;
1012 resp->cstate.minorversion = args->minorversion; 1029 resp->cstate.minorversion = args->minorversion;
1013 resp->cstate.replay_owner = NULL; 1030 resp->cstate.replay_owner = NULL;
1031 resp->cstate.session = NULL;
1014 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); 1032 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
1015 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); 1033 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
1016 /* Use the deferral mechanism only for NFSv4.0 compounds */ 1034 /* Use the deferral mechanism only for NFSv4.0 compounds */
@@ -1023,13 +1041,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1023 if (args->minorversion > nfsd_supported_minorversion) 1041 if (args->minorversion > nfsd_supported_minorversion)
1024 goto out; 1042 goto out;
1025 1043
1026 if (!nfs41_op_ordering_ok(args)) { 1044 status = nfs41_check_op_ordering(args);
1045 if (status) {
1027 op = &args->ops[0]; 1046 op = &args->ops[0];
1028 op->status = nfserr_sequence_pos; 1047 op->status = status;
1029 goto encode_op; 1048 goto encode_op;
1030 } 1049 }
1031 1050
1032 status = nfs_ok;
1033 while (!status && resp->opcnt < args->opcnt) { 1051 while (!status && resp->opcnt < args->opcnt) {
1034 op = &args->ops[resp->opcnt++]; 1052 op = &args->ops[resp->opcnt++];
1035 1053
@@ -1294,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
1294 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1312 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1295 .op_name = "OP_SEQUENCE", 1313 .op_name = "OP_SEQUENCE",
1296 }, 1314 },
1315 [OP_RECLAIM_COMPLETE] = {
1316 .op_func = (nfsd4op_func)nfsd4_reclaim_complete,
1317 .op_flags = ALLOWED_WITHOUT_FH,
1318 .op_name = "OP_RECLAIM_COMPLETE",
1319 },
1297}; 1320};
1298 1321
1299static const char *nfsd4_op_name(unsigned opnum) 1322static const char *nfsd4_op_name(unsigned opnum)
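
The predicate nfs41_op_ordering_ok() becomes nfs41_check_op_ordering() above, returning a distinct NFS error per violation: a first op that is not session-legal yields NFS4ERR_OP_NOT_IN_SESSION, while a session-legal non-SEQUENCE first op (e.g. EXCHANGE_ID) must be alone in the compound or the server answers NFS4ERR_NOT_ONLY_OP. The decision ladder, reduced to a standalone sketch with trimmed-down types:

    #include <stdbool.h>
    #include <stdio.h>

    enum nfserr { NFS_OK, NFSERR_OP_NOT_IN_SESSION, NFSERR_NOT_ONLY_OP };

    struct op { int opnum; bool illegal; bool allowed_as_first_op; };

    /* Mirrors nfs41_check_op_ordering(): only meaningful for minorversion 1. */
    static enum nfserr check_op_ordering(int minorversion, struct op *ops, int opcnt)
    {
        if (minorversion == 0 || opcnt == 0)
            return NFS_OK;                 /* v4.0 / empty: nothing to check */
        if (ops[0].illegal)
            return NFS_OK;                 /* reported as ILLEGAL elsewhere */
        if (!ops[0].allowed_as_first_op)
            return NFSERR_OP_NOT_IN_SESSION;
        if (ops[0].opnum == 53 /* OP_SEQUENCE */)
            return NFS_OK;                 /* normal sessioned compound */
        if (opcnt != 1)
            return NFSERR_NOT_ONLY_OP;     /* e.g. EXCHANGE_ID + anything */
        return NFS_OK;
    }

    int main(void)
    {
        struct op seq = { .opnum = 53, .allowed_as_first_op = true };
        struct op getattr = { .opnum = 9 };
        struct op compound[] = { seq, getattr };
        printf("%d\n", check_op_ordering(1, compound, 2));  /* NFS_OK */
        return 0;
    }
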
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 98fb98e330b4..7a9ae3254a4b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,6 +32,7 @@
32*/ 32*/
33 33
34#include <linux/file.h> 34#include <linux/file.h>
35#include <linux/slab.h>
35#include <linux/namei.h> 36#include <linux/namei.h>
36#include <linux/crypto.h> 37#include <linux/crypto.h>
37#include <linux/sched.h> 38#include <linux/sched.h>
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c97fddbd17db..12f7109720c2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -34,6 +34,7 @@
34 34
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/slab.h>
37#include <linux/namei.h> 38#include <linux/namei.h>
38#include <linux/swap.h> 39#include <linux/swap.h>
39#include <linux/sunrpc/svcauth_gss.h> 40#include <linux/sunrpc/svcauth_gss.h>
@@ -44,8 +45,8 @@
44#define NFSDDBG_FACILITY NFSDDBG_PROC 45#define NFSDDBG_FACILITY NFSDDBG_PROC
45 46
46/* Globals */ 47/* Globals */
47static time_t lease_time = 90; /* default lease time */ 48time_t nfsd4_lease = 90; /* default lease time */
48static time_t user_lease_time = 90; 49time_t nfsd4_grace = 90;
49static time_t boot_time; 50static time_t boot_time;
50static u32 current_ownerid = 1; 51static u32 current_ownerid = 1;
51static u32 current_fileid = 1; 52static u32 current_fileid = 1;
@@ -189,7 +190,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
189 dp->dl_vfs_file = stp->st_vfs_file; 190 dp->dl_vfs_file = stp->st_vfs_file;
190 dp->dl_type = type; 191 dp->dl_type = type;
191 dp->dl_ident = cb->cb_ident; 192 dp->dl_ident = cb->cb_ident;
192 dp->dl_stateid.si_boot = get_seconds(); 193 dp->dl_stateid.si_boot = boot_time;
193 dp->dl_stateid.si_stateownerid = current_delegid++; 194 dp->dl_stateid.si_stateownerid = current_delegid++;
194 dp->dl_stateid.si_fileid = 0; 195 dp->dl_stateid.si_fileid = 0;
195 dp->dl_stateid.si_generation = 0; 196 dp->dl_stateid.si_generation = 0;
@@ -198,6 +199,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
198 atomic_set(&dp->dl_count, 1); 199 atomic_set(&dp->dl_count, 1);
199 list_add(&dp->dl_perfile, &fp->fi_delegations); 200 list_add(&dp->dl_perfile, &fp->fi_delegations);
200 list_add(&dp->dl_perclnt, &clp->cl_delegations); 201 list_add(&dp->dl_perclnt, &clp->cl_delegations);
202 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
201 return dp; 203 return dp;
202} 204}
203 205
@@ -248,6 +250,9 @@ unhash_delegation(struct nfs4_delegation *dp)
248 * SETCLIENTID state 250 * SETCLIENTID state
249 */ 251 */
250 252
253/* client_lock protects the client lru list and session hash table */
254static DEFINE_SPINLOCK(client_lock);
255
251/* Hash tables for nfs4_clientid state */ 256/* Hash tables for nfs4_clientid state */
252#define CLIENT_HASH_BITS 4 257#define CLIENT_HASH_BITS 4
253#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) 258#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
@@ -366,7 +371,6 @@ static void release_openowner(struct nfs4_stateowner *sop)
366 nfs4_put_stateowner(sop); 371 nfs4_put_stateowner(sop);
367} 372}
368 373
369static DEFINE_SPINLOCK(sessionid_lock);
370#define SESSION_HASH_SIZE 512 374#define SESSION_HASH_SIZE 512
371static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; 375static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
372 376
@@ -564,10 +568,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
564 568
565 new->se_flags = cses->flags; 569 new->se_flags = cses->flags;
566 kref_init(&new->se_ref); 570 kref_init(&new->se_ref);
567 spin_lock(&sessionid_lock); 571 spin_lock(&client_lock);
568 list_add(&new->se_hash, &sessionid_hashtbl[idx]); 572 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
569 list_add(&new->se_perclnt, &clp->cl_sessions); 573 list_add(&new->se_perclnt, &clp->cl_sessions);
570 spin_unlock(&sessionid_lock); 574 spin_unlock(&client_lock);
571 575
572 status = nfs_ok; 576 status = nfs_ok;
573out: 577out:
@@ -578,7 +582,7 @@ out_free:
578 goto out; 582 goto out;
579} 583}
580 584
581/* caller must hold sessionid_lock */ 585/* caller must hold client_lock */
582static struct nfsd4_session * 586static struct nfsd4_session *
583find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) 587find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
584{ 588{
@@ -601,7 +605,7 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
601 return NULL; 605 return NULL;
602} 606}
603 607
604/* caller must hold sessionid_lock */ 608/* caller must hold client_lock */
605static void 609static void
606unhash_session(struct nfsd4_session *ses) 610unhash_session(struct nfsd4_session *ses)
607{ 611{
@@ -609,15 +613,6 @@ unhash_session(struct nfsd4_session *ses)
609 list_del(&ses->se_perclnt); 613 list_del(&ses->se_perclnt);
610} 614}
611 615
612static void
613release_session(struct nfsd4_session *ses)
614{
615 spin_lock(&sessionid_lock);
616 unhash_session(ses);
617 spin_unlock(&sessionid_lock);
618 nfsd4_put_session(ses);
619}
620
621void 616void
622free_session(struct kref *kref) 617free_session(struct kref *kref)
623{ 618{
@@ -633,9 +628,18 @@ free_session(struct kref *kref)
633 kfree(ses); 628 kfree(ses);
634} 629}
635 630
631/* must be called under the client_lock */
636static inline void 632static inline void
637renew_client(struct nfs4_client *clp) 633renew_client_locked(struct nfs4_client *clp)
638{ 634{
635 if (is_client_expired(clp)) {
636 dprintk("%s: client (clientid %08x/%08x) already expired\n",
637 __func__,
638 clp->cl_clientid.cl_boot,
639 clp->cl_clientid.cl_id);
640 return;
641 }
642
639 /* 643 /*
640 * Move client to the end to the LRU list. 644 * Move client to the end to the LRU list.
641 */ 645 */
@@ -646,6 +650,14 @@ renew_client(struct nfs4_client *clp)
646 clp->cl_time = get_seconds(); 650 clp->cl_time = get_seconds();
647} 651}
648 652
653static inline void
654renew_client(struct nfs4_client *clp)
655{
656 spin_lock(&client_lock);
657 renew_client_locked(clp);
658 spin_unlock(&client_lock);
659}
660
649/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ 661/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
650static int 662static int
651STALE_CLIENTID(clientid_t *clid) 663STALE_CLIENTID(clientid_t *clid)
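
renew_client() is split above into renew_client_locked(), which documents that client_lock must be held and refuses to resurrect an already-expired client, plus a thin wrapper that takes the lock. The _locked/wrapper idiom, modeled with a pthread mutex in place of the spinlock:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

    struct client { bool expired; time_t last_renewed; };

    /* must be called with client_lock held */
    static void renew_client_locked(struct client *clp)
    {
        if (clp->expired)
            return;  /* too late: the laundromat already claimed this client */
        clp->last_renewed = time(NULL);  /* cl_time = get_seconds() analog */
    }

    static void renew_client(struct client *clp)
    {
        pthread_mutex_lock(&client_lock);
        renew_client_locked(clp);
        pthread_mutex_unlock(&client_lock);
    }

    int main(void)
    {
        struct client c = { .expired = false };
        renew_client(&c);
        printf("renewed at %ld\n", (long)c.last_renewed);
        return 0;
    }
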
@@ -679,27 +691,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
679 return clp; 691 return clp;
680} 692}
681 693
682static void
683shutdown_callback_client(struct nfs4_client *clp)
684{
685 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
686
687 if (clnt) {
688 /*
689 * Callback threads take a reference on the client, so there
690 * should be no outstanding callbacks at this point.
691 */
692 clp->cl_cb_conn.cb_client = NULL;
693 rpc_shutdown_client(clnt);
694 }
695}
696
697static inline void 694static inline void
698free_client(struct nfs4_client *clp) 695free_client(struct nfs4_client *clp)
699{ 696{
700 shutdown_callback_client(clp);
701 if (clp->cl_cb_xprt)
702 svc_xprt_put(clp->cl_cb_xprt);
703 if (clp->cl_cred.cr_group_info) 697 if (clp->cl_cred.cr_group_info)
704 put_group_info(clp->cl_cred.cr_group_info); 698 put_group_info(clp->cl_cred.cr_group_info);
705 kfree(clp->cl_principal); 699 kfree(clp->cl_principal);
@@ -708,10 +702,34 @@ free_client(struct nfs4_client *clp)
708} 702}
709 703
710void 704void
711put_nfs4_client(struct nfs4_client *clp) 705release_session_client(struct nfsd4_session *session)
712{ 706{
713 if (atomic_dec_and_test(&clp->cl_count)) 707 struct nfs4_client *clp = session->se_client;
708
709 if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
710 return;
711 if (is_client_expired(clp)) {
714 free_client(clp); 712 free_client(clp);
713 session->se_client = NULL;
714 } else
715 renew_client_locked(clp);
716 spin_unlock(&client_lock);
717 nfsd4_put_session(session);
718}
719
720/* must be called under the client_lock */
721static inline void
722unhash_client_locked(struct nfs4_client *clp)
723{
724 mark_client_expired(clp);
725 list_del(&clp->cl_lru);
726 while (!list_empty(&clp->cl_sessions)) {
727 struct nfsd4_session *ses;
728 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
729 se_perclnt);
730 unhash_session(ses);
731 nfsd4_put_session(ses);
732 }
715} 733}
716 734
717static void 735static void
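
release_session_client() above relies on atomic_dec_and_lock(): the reference count is dropped, and only when it reaches zero is client_lock acquired, so the expired-vs-renew decision is made atomically with the final put. A userspace model of the primitive (the real one is lib/dec_and_lock.c), using C11 atomics and a mutex:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Returns true with the lock held iff this call dropped the count to zero. */
    static bool atomic_dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
    {
        int old = atomic_load(cnt);
        while (old > 1)  /* fast path: not the last reference, skip the lock */
            if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                return false;
        pthread_mutex_lock(lock);
        if (atomic_fetch_sub(cnt, 1) == 1)
            return true;   /* caller must unlock after its final-put work */
        pthread_mutex_unlock(lock);
        return false;
    }

    static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

    int main(void)
    {
        atomic_int refcount = 1;
        if (atomic_dec_and_lock(&refcount, &client_lock)) {
            printf("last reference: free or renew under the lock\n");
            pthread_mutex_unlock(&client_lock);
        }
        return 0;
    }
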
@@ -721,9 +739,6 @@ expire_client(struct nfs4_client *clp)
721 struct nfs4_delegation *dp; 739 struct nfs4_delegation *dp;
722 struct list_head reaplist; 740 struct list_head reaplist;
723 741
724 dprintk("NFSD: expire_client cl_count %d\n",
725 atomic_read(&clp->cl_count));
726
727 INIT_LIST_HEAD(&reaplist); 742 INIT_LIST_HEAD(&reaplist);
728 spin_lock(&recall_lock); 743 spin_lock(&recall_lock);
729 while (!list_empty(&clp->cl_delegations)) { 744 while (!list_empty(&clp->cl_delegations)) {
@@ -739,20 +754,20 @@ expire_client(struct nfs4_client *clp)
739 list_del_init(&dp->dl_recall_lru); 754 list_del_init(&dp->dl_recall_lru);
740 unhash_delegation(dp); 755 unhash_delegation(dp);
741 } 756 }
742 list_del(&clp->cl_idhash);
743 list_del(&clp->cl_strhash);
744 list_del(&clp->cl_lru);
745 while (!list_empty(&clp->cl_openowners)) { 757 while (!list_empty(&clp->cl_openowners)) {
746 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 758 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
747 release_openowner(sop); 759 release_openowner(sop);
748 } 760 }
749 while (!list_empty(&clp->cl_sessions)) { 761 nfsd4_set_callback_client(clp, NULL);
750 struct nfsd4_session *ses; 762 if (clp->cl_cb_conn.cb_xprt)
751 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, 763 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
752 se_perclnt); 764 list_del(&clp->cl_idhash);
753 release_session(ses); 765 list_del(&clp->cl_strhash);
754 } 766 spin_lock(&client_lock);
755 put_nfs4_client(clp); 767 unhash_client_locked(clp);
768 if (atomic_read(&clp->cl_refcount) == 0)
769 free_client(clp);
770 spin_unlock(&client_lock);
756} 771}
757 772
758static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) 773static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
@@ -838,14 +853,15 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
838 } 853 }
839 854
840 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 855 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
841 atomic_set(&clp->cl_count, 1); 856 atomic_set(&clp->cl_refcount, 0);
842 atomic_set(&clp->cl_cb_conn.cb_set, 0); 857 atomic_set(&clp->cl_cb_set, 0);
843 INIT_LIST_HEAD(&clp->cl_idhash); 858 INIT_LIST_HEAD(&clp->cl_idhash);
844 INIT_LIST_HEAD(&clp->cl_strhash); 859 INIT_LIST_HEAD(&clp->cl_strhash);
845 INIT_LIST_HEAD(&clp->cl_openowners); 860 INIT_LIST_HEAD(&clp->cl_openowners);
846 INIT_LIST_HEAD(&clp->cl_delegations); 861 INIT_LIST_HEAD(&clp->cl_delegations);
847 INIT_LIST_HEAD(&clp->cl_sessions); 862 INIT_LIST_HEAD(&clp->cl_sessions);
848 INIT_LIST_HEAD(&clp->cl_lru); 863 INIT_LIST_HEAD(&clp->cl_lru);
864 clp->cl_time = get_seconds();
849 clear_bit(0, &clp->cl_cb_slot_busy); 865 clear_bit(0, &clp->cl_cb_slot_busy);
850 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 866 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
851 copy_verf(clp, verf); 867 copy_verf(clp, verf);
@@ -876,8 +892,7 @@ add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
876 list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]); 892 list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]);
877 idhashval = clientid_hashval(clp->cl_clientid.cl_id); 893 idhashval = clientid_hashval(clp->cl_clientid.cl_id);
878 list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]); 894 list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]);
879 list_add_tail(&clp->cl_lru, &client_lru); 895 renew_client(clp);
880 clp->cl_time = get_seconds();
881} 896}
882 897
883static void 898static void
@@ -887,10 +902,9 @@ move_to_confirmed(struct nfs4_client *clp)
887 unsigned int strhashval; 902 unsigned int strhashval;
888 903
889 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); 904 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
890 list_del_init(&clp->cl_strhash);
891 list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); 905 list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
892 strhashval = clientstr_hashval(clp->cl_recdir); 906 strhashval = clientstr_hashval(clp->cl_recdir);
893 list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); 907 list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
894 renew_client(clp); 908 renew_client(clp);
895} 909}
896 910
@@ -1326,15 +1340,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1326 cs_slot->sl_seqid++; /* from 0 to 1 */ 1340 cs_slot->sl_seqid++; /* from 0 to 1 */
1327 move_to_confirmed(unconf); 1341 move_to_confirmed(unconf);
1328 1342
1329 /*
1330 * We do not support RDMA or persistent sessions
1331 */
1332 cr_ses->flags &= ~SESSION4_PERSIST;
1333 cr_ses->flags &= ~SESSION4_RDMA;
1334
1335 if (cr_ses->flags & SESSION4_BACK_CHAN) { 1343 if (cr_ses->flags & SESSION4_BACK_CHAN) {
1336 unconf->cl_cb_xprt = rqstp->rq_xprt; 1344 unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
1337 svc_xprt_get(unconf->cl_cb_xprt); 1345 svc_xprt_get(rqstp->rq_xprt);
1338 rpc_copy_addr( 1346 rpc_copy_addr(
1339 (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, 1347 (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
1340 sa); 1348 sa);
@@ -1343,7 +1351,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1343 cstate->minorversion; 1351 cstate->minorversion;
1344 unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; 1352 unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
1345 unconf->cl_cb_seq_nr = 1; 1353 unconf->cl_cb_seq_nr = 1;
1346 nfsd4_probe_callback(unconf); 1354 nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
1347 } 1355 }
1348 conf = unconf; 1356 conf = unconf;
1349 } else { 1357 } else {
@@ -1351,6 +1359,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1351 goto out; 1359 goto out;
1352 } 1360 }
1353 1361
1362 /*
1363 * We do not support RDMA or persistent sessions
1364 */
1365 cr_ses->flags &= ~SESSION4_PERSIST;
1366 cr_ses->flags &= ~SESSION4_RDMA;
1367
1354 status = alloc_init_session(rqstp, conf, cr_ses); 1368 status = alloc_init_session(rqstp, conf, cr_ses);
1355 if (status) 1369 if (status)
1356 goto out; 1370 goto out;
@@ -1368,6 +1382,21 @@ out:
1368 return status; 1382 return status;
1369} 1383}
1370 1384
1385static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
1386{
1387 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1388 struct nfsd4_compoundargs *argp = rqstp->rq_argp;
1389
1390 return argp->opcnt == resp->opcnt;
1391}
1392
1393static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
1394{
1395 if (!session)
1396 return 0;
1397 return !memcmp(sid, &session->se_sessionid, sizeof(*sid));
1398}
1399
1371__be32 1400__be32
1372nfsd4_destroy_session(struct svc_rqst *r, 1401nfsd4_destroy_session(struct svc_rqst *r,
1373 struct nfsd4_compound_state *cstate, 1402 struct nfsd4_compound_state *cstate,
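
The two helpers above feed the new DESTROY_SESSION rule enforced just below: if the compound is being executed over the very session it is destroying, DESTROY_SESSION must be the last op, otherwise the server returns NFS4ERR_NOT_ONLY_OP. A sketch of the two predicates, with the structs trimmed to what the checks touch:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    #define SESSIONID_LEN 16

    struct sessionid { unsigned char data[SESSIONID_LEN]; };
    struct session   { struct sessionid se_sessionid; };

    /* All decoded ops have been executed iff this is the last one. */
    static bool last_compound_op(int argp_opcnt, int resp_opcnt)
    {
        return argp_opcnt == resp_opcnt;
    }

    static bool compound_in_session(struct session *s, struct sessionid *sid)
    {
        if (!s)
            return false;  /* compound not running over a session at all */
        return memcmp(sid, &s->se_sessionid, sizeof(*sid)) == 0;
    }

    int main(void)
    {
        struct session s = { .se_sessionid = { .data = { 1, 2, 3 } } };
        struct sessionid target = s.se_sessionid;
        if (compound_in_session(&s, &target) && !last_compound_op(3, 2))
            printf("NFS4ERR_NOT_ONLY_OP: DESTROY_SESSION must come last\n");
        return 0;
    }
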
@@ -1383,19 +1412,25 @@ nfsd4_destroy_session(struct svc_rqst *r,
1383 * - Do we need to clear any callback info from previous session? 1412 * - Do we need to clear any callback info from previous session?
1384 */ 1413 */
1385 1414
1415 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
1416 if (!nfsd4_last_compound_op(r))
1417 return nfserr_not_only_op;
1418 }
1386 dump_sessionid(__func__, &sessionid->sessionid); 1419 dump_sessionid(__func__, &sessionid->sessionid);
1387 spin_lock(&sessionid_lock); 1420 spin_lock(&client_lock);
1388 ses = find_in_sessionid_hashtbl(&sessionid->sessionid); 1421 ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
1389 if (!ses) { 1422 if (!ses) {
1390 spin_unlock(&sessionid_lock); 1423 spin_unlock(&client_lock);
1391 goto out; 1424 goto out;
1392 } 1425 }
1393 1426
1394 unhash_session(ses); 1427 unhash_session(ses);
1395 spin_unlock(&sessionid_lock); 1428 spin_unlock(&client_lock);
1396 1429
1430 nfs4_lock_state();
1397 /* wait for callbacks */ 1431 /* wait for callbacks */
1398 shutdown_callback_client(ses->se_client); 1432 nfsd4_set_callback_client(ses->se_client, NULL);
1433 nfs4_unlock_state();
1399 nfsd4_put_session(ses); 1434 nfsd4_put_session(ses);
1400 status = nfs_ok; 1435 status = nfs_ok;
1401out: 1436out:
@@ -1416,7 +1451,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1416 if (resp->opcnt != 1) 1451 if (resp->opcnt != 1)
1417 return nfserr_sequence_pos; 1452 return nfserr_sequence_pos;
1418 1453
1419 spin_lock(&sessionid_lock); 1454 spin_lock(&client_lock);
1420 status = nfserr_badsession; 1455 status = nfserr_badsession;
1421 session = find_in_sessionid_hashtbl(&seq->sessionid); 1456 session = find_in_sessionid_hashtbl(&seq->sessionid);
1422 if (!session) 1457 if (!session)
@@ -1455,23 +1490,47 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1455 cstate->slot = slot; 1490 cstate->slot = slot;
1456 cstate->session = session; 1491 cstate->session = session;
1457 1492
1458 /* Hold a session reference until done processing the compound:
1459 * nfsd4_put_session called only if the cstate slot is set.
1460 */
1461 nfsd4_get_session(session);
1462out: 1493out:
1463 spin_unlock(&sessionid_lock); 1494 /* Hold a session reference until done processing the compound. */
1464 /* Renew the clientid on success and on replay */
1465 if (cstate->session) { 1495 if (cstate->session) {
1466 nfs4_lock_state(); 1496 nfsd4_get_session(cstate->session);
1467 renew_client(session->se_client); 1497 atomic_inc(&session->se_client->cl_refcount);
1468 nfs4_unlock_state();
1469 } 1498 }
1499 spin_unlock(&client_lock);
1470 dprintk("%s: return %d\n", __func__, ntohl(status)); 1500 dprintk("%s: return %d\n", __func__, ntohl(status));
1471 return status; 1501 return status;
1472} 1502}
1473 1503
1474__be32 1504__be32
1505nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
1506{
1507 if (rc->rca_one_fs) {
1508 if (!cstate->current_fh.fh_dentry)
1509 return nfserr_nofilehandle;
1510 /*
1511 * We don't take advantage of the rca_one_fs case.
1512 * That's OK, it's optional, we can safely ignore it.
1513 */
1514 return nfs_ok;
1515 }
1516 nfs4_lock_state();
1517 if (is_client_expired(cstate->session->se_client)) {
1518 nfs4_unlock_state();
1519 /*
1520 * The following error isn't really legal.
1521 * But we only get here if the client just explicitly
1522 * destroyed the client. Surely it no longer cares what
1523 * error it gets back on an operation for the dead
1524 * client.
1525 */
1526 return nfserr_stale_clientid;
1527 }
1528 nfsd4_create_clid_dir(cstate->session->se_client);
1529 nfs4_unlock_state();
1530 return nfs_ok;
1531}
1532
1533__be32
1475nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1534nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1476 struct nfsd4_setclientid *setclid) 1535 struct nfsd4_setclientid *setclid)
1477{ 1536{
@@ -1630,9 +1689,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1630 if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) 1689 if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
1631 status = nfserr_clid_inuse; 1690 status = nfserr_clid_inuse;
1632 else { 1691 else {
1633 /* XXX: We just turn off callbacks until we can handle 1692 atomic_set(&conf->cl_cb_set, 0);
1634 * change request correctly. */ 1693 nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
1635 atomic_set(&conf->cl_cb_conn.cb_set, 0);
1636 expire_client(unconf); 1694 expire_client(unconf);
1637 status = nfs_ok; 1695 status = nfs_ok;
1638 1696
@@ -1666,7 +1724,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1666 } 1724 }
1667 move_to_confirmed(unconf); 1725 move_to_confirmed(unconf);
1668 conf = unconf; 1726 conf = unconf;
1669 nfsd4_probe_callback(conf); 1727 nfsd4_probe_callback(conf, &conf->cl_cb_conn);
1670 status = nfs_ok; 1728 status = nfs_ok;
1671 } 1729 }
1672 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) 1730 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -1699,12 +1757,12 @@ alloc_init_file(struct inode *ino)
1699 INIT_LIST_HEAD(&fp->fi_hash); 1757 INIT_LIST_HEAD(&fp->fi_hash);
1700 INIT_LIST_HEAD(&fp->fi_stateids); 1758 INIT_LIST_HEAD(&fp->fi_stateids);
1701 INIT_LIST_HEAD(&fp->fi_delegations); 1759 INIT_LIST_HEAD(&fp->fi_delegations);
1702 spin_lock(&recall_lock);
1703 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
1704 spin_unlock(&recall_lock);
1705 fp->fi_inode = igrab(ino); 1760 fp->fi_inode = igrab(ino);
1706 fp->fi_id = current_fileid++; 1761 fp->fi_id = current_fileid++;
1707 fp->fi_had_conflict = false; 1762 fp->fi_had_conflict = false;
1763 spin_lock(&recall_lock);
1764 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
1765 spin_unlock(&recall_lock);
1708 return fp; 1766 return fp;
1709 } 1767 }
1710 return NULL; 1768 return NULL;
@@ -1826,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
1826 stp->st_stateowner = sop; 1884 stp->st_stateowner = sop;
1827 get_nfs4_file(fp); 1885 get_nfs4_file(fp);
1828 stp->st_file = fp; 1886 stp->st_file = fp;
1829 stp->st_stateid.si_boot = get_seconds(); 1887 stp->st_stateid.si_boot = boot_time;
1830 stp->st_stateid.si_stateownerid = sop->so_id; 1888 stp->st_stateid.si_stateownerid = sop->so_id;
1831 stp->st_stateid.si_fileid = fp->fi_id; 1889 stp->st_stateid.si_fileid = fp->fi_id;
1832 stp->st_stateid.si_generation = 0; 1890 stp->st_stateid.si_generation = 0;
@@ -2027,7 +2085,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2027 * lock) we know the server hasn't removed the lease yet, we know 2085 * lock) we know the server hasn't removed the lease yet, we know
2028 * it's safe to take a reference: */ 2086 * it's safe to take a reference: */
2029 atomic_inc(&dp->dl_count); 2087 atomic_inc(&dp->dl_count);
2030 atomic_inc(&dp->dl_client->cl_count);
2031 2088
2032 spin_lock(&recall_lock); 2089 spin_lock(&recall_lock);
2033 list_add_tail(&dp->dl_recall_lru, &del_recall_lru); 2090 list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
@@ -2346,7 +2403,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2346{ 2403{
2347 struct nfs4_delegation *dp; 2404 struct nfs4_delegation *dp;
2348 struct nfs4_stateowner *sop = stp->st_stateowner; 2405 struct nfs4_stateowner *sop = stp->st_stateowner;
2349 struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn; 2406 int cb_up = atomic_read(&sop->so_client->cl_cb_set);
2350 struct file_lock fl, *flp = &fl; 2407 struct file_lock fl, *flp = &fl;
2351 int status, flag = 0; 2408 int status, flag = 0;
2352 2409
@@ -2354,7 +2411,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2354 open->op_recall = 0; 2411 open->op_recall = 0;
2355 switch (open->op_claim_type) { 2412 switch (open->op_claim_type) {
2356 case NFS4_OPEN_CLAIM_PREVIOUS: 2413 case NFS4_OPEN_CLAIM_PREVIOUS:
2357 if (!atomic_read(&cb->cb_set)) 2414 if (!cb_up)
2358 open->op_recall = 1; 2415 open->op_recall = 1;
2359 flag = open->op_delegate_type; 2416 flag = open->op_delegate_type;
2360 if (flag == NFS4_OPEN_DELEGATE_NONE) 2417 if (flag == NFS4_OPEN_DELEGATE_NONE)
@@ -2365,7 +2422,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2365 * had the chance to reclaim theirs.... */ 2422 * had the chance to reclaim theirs.... */
2366 if (locks_in_grace()) 2423 if (locks_in_grace())
2367 goto out; 2424 goto out;
2368 if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) 2425 if (!cb_up || !sop->so_confirmed)
2369 goto out; 2426 goto out;
2370 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) 2427 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
2371 flag = NFS4_OPEN_DELEGATE_WRITE; 2428 flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2482,10 +2539,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
2482 } 2539 }
2483 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2540 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
2484 2541
2485 if (nfsd4_has_session(&resp->cstate)) { 2542 if (nfsd4_has_session(&resp->cstate))
2486 open->op_stateowner->so_confirmed = 1; 2543 open->op_stateowner->so_confirmed = 1;
2487 nfsd4_create_clid_dir(open->op_stateowner->so_client);
2488 }
2489 2544
2490 /* 2545 /*
2491 * Attempt to hand out a delegation. No error return, because the 2546 * Attempt to hand out a delegation. No error return, because the
@@ -2536,7 +2591,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2536 renew_client(clp); 2591 renew_client(clp);
2537 status = nfserr_cb_path_down; 2592 status = nfserr_cb_path_down;
2538 if (!list_empty(&clp->cl_delegations) 2593 if (!list_empty(&clp->cl_delegations)
2539 && !atomic_read(&clp->cl_cb_conn.cb_set)) 2594 && !atomic_read(&clp->cl_cb_set))
2540 goto out; 2595 goto out;
2541 status = nfs_ok; 2596 status = nfs_ok;
2542out: 2597out:
@@ -2553,6 +2608,12 @@ nfsd4_end_grace(void)
2553 dprintk("NFSD: end of grace period\n"); 2608 dprintk("NFSD: end of grace period\n");
2554 nfsd4_recdir_purge_old(); 2609 nfsd4_recdir_purge_old();
2555 locks_end_grace(&nfsd4_manager); 2610 locks_end_grace(&nfsd4_manager);
2611 /*
2612 * Now that every NFSv4 client has had the chance to recover and
2613 * to see the (possibly new, possibly shorter) lease time, we
2614 * can safely set the next grace time to the current lease time:
2615 */
2616 nfsd4_grace = nfsd4_lease;
2556} 2617}
2557 2618
2558static time_t 2619static time_t
@@ -2562,15 +2623,17 @@ nfs4_laundromat(void)
2562 struct nfs4_stateowner *sop; 2623 struct nfs4_stateowner *sop;
2563 struct nfs4_delegation *dp; 2624 struct nfs4_delegation *dp;
2564 struct list_head *pos, *next, reaplist; 2625 struct list_head *pos, *next, reaplist;
2565 time_t cutoff = get_seconds() - NFSD_LEASE_TIME; 2626 time_t cutoff = get_seconds() - nfsd4_lease;
2566 time_t t, clientid_val = NFSD_LEASE_TIME; 2627 time_t t, clientid_val = nfsd4_lease;
2567 time_t u, test_val = NFSD_LEASE_TIME; 2628 time_t u, test_val = nfsd4_lease;
2568 2629
2569 nfs4_lock_state(); 2630 nfs4_lock_state();
2570 2631
2571 dprintk("NFSD: laundromat service - starting\n"); 2632 dprintk("NFSD: laundromat service - starting\n");
2572 if (locks_in_grace()) 2633 if (locks_in_grace())
2573 nfsd4_end_grace(); 2634 nfsd4_end_grace();
2635 INIT_LIST_HEAD(&reaplist);
2636 spin_lock(&client_lock);
2574 list_for_each_safe(pos, next, &client_lru) { 2637 list_for_each_safe(pos, next, &client_lru) {
2575 clp = list_entry(pos, struct nfs4_client, cl_lru); 2638 clp = list_entry(pos, struct nfs4_client, cl_lru);
2576 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { 2639 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -2579,12 +2642,22 @@ nfs4_laundromat(void)
2579 clientid_val = t; 2642 clientid_val = t;
2580 break; 2643 break;
2581 } 2644 }
2645 if (atomic_read(&clp->cl_refcount)) {
2646 dprintk("NFSD: client in use (clientid %08x)\n",
2647 clp->cl_clientid.cl_id);
2648 continue;
2649 }
2650 unhash_client_locked(clp);
2651 list_add(&clp->cl_lru, &reaplist);
2652 }
2653 spin_unlock(&client_lock);
2654 list_for_each_safe(pos, next, &reaplist) {
2655 clp = list_entry(pos, struct nfs4_client, cl_lru);
2582 dprintk("NFSD: purging unused client (clientid %08x)\n", 2656 dprintk("NFSD: purging unused client (clientid %08x)\n",
2583 clp->cl_clientid.cl_id); 2657 clp->cl_clientid.cl_id);
2584 nfsd4_remove_clid_dir(clp); 2658 nfsd4_remove_clid_dir(clp);
2585 expire_client(clp); 2659 expire_client(clp);
2586 } 2660 }
2587 INIT_LIST_HEAD(&reaplist);
2588 spin_lock(&recall_lock); 2661 spin_lock(&recall_lock);
2589 list_for_each_safe(pos, next, &del_recall_lru) { 2662 list_for_each_safe(pos, next, &del_recall_lru) {
2590 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 2663 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
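
The laundromat hunk above now walks client_lru under client_lock, skips clients still pinned by in-flight compounds (cl_refcount), moves the expirable ones onto a private reaplist, and only after dropping the lock runs the heavyweight expire_client() teardown. A sketch of that collect-then-reap pattern over a simple singly linked list:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

    struct client { int id; int refcount; long time; struct client *next; };

    static struct client *lru, *reaplist;

    static void laundromat(long cutoff)
    {
        struct client **pp, *clp;

        pthread_mutex_lock(&client_lock);
        for (pp = &lru; (clp = *pp) != NULL; ) {
            if (clp->time > cutoff || clp->refcount > 0) {
                pp = &clp->next;     /* recently renewed or in use: keep */
                continue;
            }
            *pp = clp->next;         /* unhash_client_locked() analog */
            clp->next = reaplist;    /* park on the private reap list */
            reaplist = clp;
        }
        pthread_mutex_unlock(&client_lock);

        /* Expensive teardown happens outside the lock, one client at a time. */
        while ((clp = reaplist) != NULL) {
            reaplist = clp->next;
            printf("purging unused client %d\n", clp->id);
            /* expire_client(clp) analog would free state here */
        }
    }

    int main(void)
    {
        static struct client a = { .id = 1, .refcount = 0, .time = 10 };
        lru = &a;
        laundromat(100);
        return 0;
    }
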
@@ -2604,7 +2677,7 @@ nfs4_laundromat(void)
2604 list_del_init(&dp->dl_recall_lru); 2677 list_del_init(&dp->dl_recall_lru);
2605 unhash_delegation(dp); 2678 unhash_delegation(dp);
2606 } 2679 }
2607 test_val = NFSD_LEASE_TIME; 2680 test_val = nfsd4_lease;
2608 list_for_each_safe(pos, next, &close_lru) { 2681 list_for_each_safe(pos, next, &close_lru) {
2609 sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); 2682 sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
2610 if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { 2683 if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
@@ -2660,39 +2733,11 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
2660static int 2733static int
2661STALE_STATEID(stateid_t *stateid) 2734STALE_STATEID(stateid_t *stateid)
2662{ 2735{
2663 if (time_after((unsigned long)boot_time, 2736 if (stateid->si_boot == boot_time)
2664 (unsigned long)stateid->si_boot)) { 2737 return 0;
2665 dprintk("NFSD: stale stateid " STATEID_FMT "!\n", 2738 dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
2666 STATEID_VAL(stateid));
2667 return 1;
2668 }
2669 return 0;
2670}
2671
2672static int
2673EXPIRED_STATEID(stateid_t *stateid)
2674{
2675 if (time_before((unsigned long)boot_time,
2676 ((unsigned long)stateid->si_boot)) &&
2677 time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
2678 dprintk("NFSD: expired stateid " STATEID_FMT "!\n",
2679 STATEID_VAL(stateid));
2680 return 1;
2681 }
2682 return 0;
2683}
2684
2685static __be32
2686stateid_error_map(stateid_t *stateid)
2687{
2688 if (STALE_STATEID(stateid))
2689 return nfserr_stale_stateid;
2690 if (EXPIRED_STATEID(stateid))
2691 return nfserr_expired;
2692
2693 dprintk("NFSD: bad stateid " STATEID_FMT "!\n",
2694 STATEID_VAL(stateid)); 2739 STATEID_VAL(stateid));
2695 return nfserr_bad_stateid; 2740 return 1;
2696} 2741}
2697 2742
2698static inline int 2743static inline int
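
Because init_stateid() and alloc_init_deleg() now stamp si_boot with boot_time itself rather than get_seconds(), staleness collapses above to an equality test, and the separate EXPIRED_STATEID()/stateid_error_map() machinery goes away. The check, in miniature:

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static time_t boot_time;  /* fixed once at server start */

    struct stateid { time_t si_boot; unsigned int si_id; };

    static struct stateid make_stateid(unsigned int id)
    {
        /* si_boot = boot_time, not get_seconds(): equality is then exact */
        return (struct stateid){ .si_boot = boot_time, .si_id = id };
    }

    static bool stale_stateid(const struct stateid *s)
    {
        return s->si_boot != boot_time;  /* minted by an earlier server instance */
    }

    int main(void)
    {
        boot_time = time(NULL);
        struct stateid s = make_stateid(7);
        printf("stale? %d\n", stale_stateid(&s));  /* 0 */
        boot_time += 1;                            /* simulate a server reboot */
        printf("stale? %d\n", stale_stateid(&s));  /* 1 */
        return 0;
    }
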
@@ -2816,10 +2861,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2816 status = nfserr_bad_stateid; 2861 status = nfserr_bad_stateid;
2817 if (is_delegation_stateid(stateid)) { 2862 if (is_delegation_stateid(stateid)) {
2818 dp = find_delegation_stateid(ino, stateid); 2863 dp = find_delegation_stateid(ino, stateid);
2819 if (!dp) { 2864 if (!dp)
2820 status = stateid_error_map(stateid);
2821 goto out; 2865 goto out;
2822 }
2823 status = check_stateid_generation(stateid, &dp->dl_stateid, 2866 status = check_stateid_generation(stateid, &dp->dl_stateid,
2824 flags); 2867 flags);
2825 if (status) 2868 if (status)
@@ -2832,10 +2875,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2832 *filpp = dp->dl_vfs_file; 2875 *filpp = dp->dl_vfs_file;
2833 } else { /* open or lock stateid */ 2876 } else { /* open or lock stateid */
2834 stp = find_stateid(stateid, flags); 2877 stp = find_stateid(stateid, flags);
2835 if (!stp) { 2878 if (!stp)
2836 status = stateid_error_map(stateid);
2837 goto out; 2879 goto out;
2838 }
2839 if (nfs4_check_fh(current_fh, stp)) 2880 if (nfs4_check_fh(current_fh, stp))
2840 goto out; 2881 goto out;
2841 if (!stp->st_stateowner->so_confirmed) 2882 if (!stp->st_stateowner->so_confirmed)
@@ -2907,7 +2948,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
2907 */ 2948 */
2908 sop = search_close_lru(stateid->si_stateownerid, flags); 2949 sop = search_close_lru(stateid->si_stateownerid, flags);
2909 if (sop == NULL) 2950 if (sop == NULL)
2910 return stateid_error_map(stateid); 2951 return nfserr_bad_stateid;
2911 *sopp = sop; 2952 *sopp = sop;
2912 goto check_replay; 2953 goto check_replay;
2913 } 2954 }
@@ -3174,10 +3215,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3174 if (!is_delegation_stateid(stateid)) 3215 if (!is_delegation_stateid(stateid))
3175 goto out; 3216 goto out;
3176 dp = find_delegation_stateid(inode, stateid); 3217 dp = find_delegation_stateid(inode, stateid);
3177 if (!dp) { 3218 if (!dp)
3178 status = stateid_error_map(stateid);
3179 goto out; 3219 goto out;
3180 }
3181 status = check_stateid_generation(stateid, &dp->dl_stateid, flags); 3220 status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
3182 if (status) 3221 if (status)
3183 goto out; 3222 goto out;
@@ -3403,7 +3442,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
3403 stp->st_stateowner = sop; 3442 stp->st_stateowner = sop;
3404 get_nfs4_file(fp); 3443 get_nfs4_file(fp);
3405 stp->st_file = fp; 3444 stp->st_file = fp;
3406 stp->st_stateid.si_boot = get_seconds(); 3445 stp->st_stateid.si_boot = boot_time;
3407 stp->st_stateid.si_stateownerid = sop->so_id; 3446 stp->st_stateid.si_stateownerid = sop->so_id;
3408 stp->st_stateid.si_fileid = fp->fi_id; 3447 stp->st_stateid.si_fileid = fp->fi_id;
3409 stp->st_stateid.si_generation = 0; 3448 stp->st_stateid.si_generation = 0;
@@ -3975,12 +4014,6 @@ nfsd4_load_reboot_recovery_data(void)
3975 printk("NFSD: Failure reading reboot recovery data\n"); 4014 printk("NFSD: Failure reading reboot recovery data\n");
3976} 4015}
3977 4016
3978unsigned long
3979get_nfs4_grace_period(void)
3980{
3981 return max(user_lease_time, lease_time) * HZ;
3982}
3983
3984/* 4017/*
3985 * Since the lifetime of a delegation isn't limited to that of an open, a 4018 * Since the lifetime of a delegation isn't limited to that of an open, a
3986 * client may quite reasonably hang on to a delegation as long as it has 4019 * client may quite reasonably hang on to a delegation as long as it has
@@ -4007,20 +4040,27 @@ set_max_delegations(void)
4007static int 4040static int
4008__nfs4_state_start(void) 4041__nfs4_state_start(void)
4009{ 4042{
4010 unsigned long grace_time; 4043 int ret;
4011 4044
4012 boot_time = get_seconds(); 4045 boot_time = get_seconds();
4013 grace_time = get_nfs4_grace_period();
4014 lease_time = user_lease_time;
4015 locks_start_grace(&nfsd4_manager); 4046 locks_start_grace(&nfsd4_manager);
4016 printk(KERN_INFO "NFSD: starting %ld-second grace period\n", 4047 printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
4017 grace_time/HZ); 4048 nfsd4_grace);
4049 ret = set_callback_cred();
4050 if (ret)
4051 return -ENOMEM;
4018 laundry_wq = create_singlethread_workqueue("nfsd4"); 4052 laundry_wq = create_singlethread_workqueue("nfsd4");
4019 if (laundry_wq == NULL) 4053 if (laundry_wq == NULL)
4020 return -ENOMEM; 4054 return -ENOMEM;
4021 queue_delayed_work(laundry_wq, &laundromat_work, grace_time); 4055 ret = nfsd4_create_callback_queue();
4056 if (ret)
4057 goto out_free_laundry;
4058 queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ);
4022 set_max_delegations(); 4059 set_max_delegations();
4023 return set_callback_cred(); 4060 return 0;
4061out_free_laundry:
4062 destroy_workqueue(laundry_wq);
4063 return ret;
4024} 4064}
4025 4065
4026int 4066int
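
__nfs4_state_start() above gains a second allocation (the callback workqueue), so it adopts the usual goto-based unwind: each failure path releases exactly what earlier steps set up. The idiom in a standalone sketch (create_workqueue_toy and friends are invented stand-ins):

    #include <stdio.h>
    #include <stdlib.h>

    static void *create_workqueue_toy(const char *name) { (void)name; return malloc(1); }
    static void destroy_workqueue_toy(void *wq) { free(wq); }

    static int state_start(void)
    {
        void *laundry_wq, *callback_wq;
        int ret;

        laundry_wq = create_workqueue_toy("nfsd4");
        if (!laundry_wq)
            return -12;             /* -ENOMEM */
        callback_wq = create_workqueue_toy("nfsd4_callbacks");
        if (!callback_wq) {
            ret = -12;
            goto out_free_laundry;  /* undo only what already succeeded */
        }
        destroy_workqueue_toy(callback_wq);  /* normal teardown lives elsewhere */
        destroy_workqueue_toy(laundry_wq);
        return 0;

    out_free_laundry:
        destroy_workqueue_toy(laundry_wq);
        return ret;
    }

    int main(void)
    {
        printf("state_start -> %d\n", state_start());
        return 0;
    }
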
@@ -4038,12 +4078,6 @@ nfs4_state_start(void)
4038 return 0; 4078 return 0;
4039} 4079}
4040 4080
4041time_t
4042nfs4_lease_time(void)
4043{
4044 return lease_time;
4045}
4046
4047static void 4081static void
4048__nfs4_state_shutdown(void) 4082__nfs4_state_shutdown(void)
4049{ 4083{
@@ -4088,6 +4122,7 @@ nfs4_state_shutdown(void)
4088 nfs4_lock_state(); 4122 nfs4_lock_state();
4089 nfs4_release_reclaim(); 4123 nfs4_release_reclaim();
4090 __nfs4_state_shutdown(); 4124 __nfs4_state_shutdown();
4125 nfsd4_destroy_callback_queue();
4091 nfs4_unlock_state(); 4126 nfs4_unlock_state();
4092} 4127}
4093 4128
@@ -4127,21 +4162,3 @@ nfs4_recoverydir(void)
4127{ 4162{
4128 return user_recovery_dirname; 4163 return user_recovery_dirname;
4129} 4164}
4130
4131/*
4132 * Called when leasetime is changed.
4133 *
4134 * The only way the protocol gives us to handle on-the-fly lease changes is to
4135 * simulate a reboot. Instead of doing that, we just wait till the next time
4136 * we start to register any changes in lease time. If the administrator
4137 * really wants to change the lease time *now*, they can go ahead and bring
4138 * nfsd down and then back up again after changing the lease time.
4139 *
4140 * user_lease_time is protected by nfsd_mutex since it's only really accessed
4141 * when nfsd is starting
4142 */
4143void
4144nfs4_reset_lease(time_t leasetime)
4145{
4146 user_lease_time = leasetime;
4147}
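
With the getter and setter pair removed, the lease and grace periods are plain time_t globals that the nfsctl.c code further down writes directly while holding nfsd_mutex. Their definitions are not part of this hunk; presumably something like the following near the top of nfs4state.c, with the traditional 90-second default (the initial value here is an assumption):

    time_t nfsd4_lease = 90;    /* seconds */
    time_t nfsd4_grace = 90;    /* seconds */
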
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 78c7e24e5129..ac17a7080239 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -40,6 +40,7 @@
40 * at the end of nfs4svc_decode_compoundargs. 40 * at the end of nfs4svc_decode_compoundargs.
41 */ 41 */
42 42
43#include <linux/slab.h>
43#include <linux/namei.h> 44#include <linux/namei.h>
44#include <linux/statfs.h> 45#include <linux/statfs.h>
45#include <linux/utsname.h> 46#include <linux/utsname.h>
@@ -160,10 +161,10 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
160 argp->p = page_address(argp->pagelist[0]); 161 argp->p = page_address(argp->pagelist[0]);
161 argp->pagelist++; 162 argp->pagelist++;
162 if (argp->pagelen < PAGE_SIZE) { 163 if (argp->pagelen < PAGE_SIZE) {
163 argp->end = p + (argp->pagelen>>2); 164 argp->end = argp->p + (argp->pagelen>>2);
164 argp->pagelen = 0; 165 argp->pagelen = 0;
165 } else { 166 } else {
166 argp->end = p + (PAGE_SIZE>>2); 167 argp->end = argp->p + (PAGE_SIZE>>2);
167 argp->pagelen -= PAGE_SIZE; 168 argp->pagelen -= PAGE_SIZE;
168 } 169 }
169 memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); 170 memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
@@ -1233,6 +1234,16 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
1233 DECODE_TAIL; 1234 DECODE_TAIL;
1234} 1235}
1235 1236
1237static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
1238{
1239 DECODE_HEAD;
1240
1241 READ_BUF(4);
1242 READ32(rc->rca_one_fs);
1243
1244 DECODE_TAIL;
1245}
1246
1236static __be32 1247static __be32
1237nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) 1248nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1238{ 1249{
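
The new decoder is the minimal instance of the XDR pattern used throughout this file: RECLAIM_COMPLETE carries a single 32-bit boolean on the wire. Roughly what the macros do (simplified from the definitions earlier in nfs4xdr.c, not the exact expansions):

    DECODE_HEAD;            /* declares __be32 *p and the status variable   */
    READ_BUF(4);            /* bail out with nfserr_bad_xdr unless 4 bytes  */
                            /* remain in the argument buffer; position p    */
    READ32(rc->rca_one_fs); /* rc->rca_one_fs = ntohl(*p++);                */
    DECODE_TAIL;            /* return nfs_ok on success, the error label    */
                            /* otherwise                                    */
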
@@ -1345,7 +1356,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1345 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, 1356 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
1346 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1357 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1347 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, 1358 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
1348 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_notsupp, 1359 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
1349}; 1360};
1350 1361
1351struct nfsd4_minorversion_ops { 1362struct nfsd4_minorversion_ops {
@@ -1425,10 +1436,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1425 argp->p = page_address(argp->pagelist[0]); 1436 argp->p = page_address(argp->pagelist[0]);
1426 argp->pagelist++; 1437 argp->pagelist++;
1427 if (argp->pagelen < PAGE_SIZE) { 1438 if (argp->pagelen < PAGE_SIZE) {
1428 argp->end = p + (argp->pagelen>>2); 1439 argp->end = argp->p + (argp->pagelen>>2);
1429 argp->pagelen = 0; 1440 argp->pagelen = 0;
1430 } else { 1441 } else {
1431 argp->end = p + (PAGE_SIZE>>2); 1442 argp->end = argp->p + (PAGE_SIZE>>2);
1432 argp->pagelen -= PAGE_SIZE; 1443 argp->pagelen -= PAGE_SIZE;
1433 } 1444 }
1434 } 1445 }
@@ -1528,7 +1539,7 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
1528 } } while (0); 1539 } } while (0);
1529 1540
1530/* Encode as an array of strings the string given with components 1541/* Encode as an array of strings the string given with components
1531 * seperated @sep. 1542 * separated @sep.
1532 */ 1543 */
1533static __be32 nfsd4_encode_components(char sep, char *components, 1544static __be32 nfsd4_encode_components(char sep, char *components,
1534 __be32 **pp, int *buflen) 1545 __be32 **pp, int *buflen)
@@ -1899,7 +1910,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1899 if (bmval0 & FATTR4_WORD0_LEASE_TIME) { 1910 if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
1900 if ((buflen -= 4) < 0) 1911 if ((buflen -= 4) < 0)
1901 goto out_resource; 1912 goto out_resource;
1902 WRITE32(NFSD_LEASE_TIME); 1913 WRITE32(nfsd4_lease);
1903 } 1914 }
1904 if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { 1915 if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
1905 if ((buflen -= 4) < 0) 1916 if ((buflen -= 4) < 0)
@@ -3306,11 +3317,14 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
3306 iov = &rqstp->rq_res.head[0]; 3317 iov = &rqstp->rq_res.head[0];
3307 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3318 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
3308 BUG_ON(iov->iov_len > PAGE_SIZE); 3319 BUG_ON(iov->iov_len > PAGE_SIZE);
3309 if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) { 3320 if (nfsd4_has_session(cs)) {
3310 nfsd4_store_cache_entry(resp); 3321 if (cs->status != nfserr_replay_cache) {
3311 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); 3322 nfsd4_store_cache_entry(resp);
3312 resp->cstate.slot->sl_inuse = false; 3323 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3313 nfsd4_put_session(resp->cstate.session); 3324 cs->slot->sl_inuse = false;
3325 }
3326 /* Renew the clientid on success and on replay */
3327 release_session_client(cs->session);
3314 } 3328 }
3315 return 1; 3329 return 1;
3316} 3330}
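
The restructured epilogue separates two concerns that used to be fused: the reply-cache store and slot release still happen only for freshly executed requests, but the session's client reference is now dropped on replays too, which is what keeps a replaying client renewed. A sketch of what release_session_client() is expected to do (the implementation lives in nfs4state.c, outside this diff; client_lock and renew_client_locked are assumed names):

    void release_session_client(struct nfsd4_session *session)
    {
            struct nfs4_client *clp = session->se_client;

            if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
                    return;                 /* other rpcs still in flight */
            if (is_client_expired(clp))
                    expire_client(clp);     /* laundromat marked it; reap now */
            else
                    renew_client_locked(clp);
            spin_unlock(&client_lock);
    }
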
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index da08560c4818..4666a209678a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -8,6 +8,8 @@
8 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 8 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
9 */ 9 */
10 10
11#include <linux/slab.h>
12
11#include "nfsd.h" 13#include "nfsd.h"
12#include "cache.h" 14#include "cache.h"
13 15
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 0f0e77f2012f..bc3194ea01f5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -4,6 +4,7 @@
4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
5 */ 5 */
6 6
7#include <linux/slab.h>
7#include <linux/namei.h> 8#include <linux/namei.h>
8#include <linux/ctype.h> 9#include <linux/ctype.h>
9 10
@@ -45,6 +46,7 @@ enum {
45 */ 46 */
46#ifdef CONFIG_NFSD_V4 47#ifdef CONFIG_NFSD_V4
47 NFSD_Leasetime, 48 NFSD_Leasetime,
49 NFSD_Gracetime,
48 NFSD_RecoveryDir, 50 NFSD_RecoveryDir,
49#endif 51#endif
50}; 52};
@@ -69,6 +71,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size);
69static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); 71static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
70#ifdef CONFIG_NFSD_V4 72#ifdef CONFIG_NFSD_V4
71static ssize_t write_leasetime(struct file *file, char *buf, size_t size); 73static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
74static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
72static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); 75static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
73#endif 76#endif
74 77
@@ -90,6 +93,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
90 [NFSD_MaxBlkSize] = write_maxblksize, 93 [NFSD_MaxBlkSize] = write_maxblksize,
91#ifdef CONFIG_NFSD_V4 94#ifdef CONFIG_NFSD_V4
92 [NFSD_Leasetime] = write_leasetime, 95 [NFSD_Leasetime] = write_leasetime,
96 [NFSD_Gracetime] = write_gracetime,
93 [NFSD_RecoveryDir] = write_recoverydir, 97 [NFSD_RecoveryDir] = write_recoverydir,
94#endif 98#endif
95}; 99};
@@ -1203,29 +1207,45 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
1203} 1207}
1204 1208
1205#ifdef CONFIG_NFSD_V4 1209#ifdef CONFIG_NFSD_V4
1206extern time_t nfs4_leasetime(void); 1210static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
1207
1208static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1209{ 1211{
1210 /* if size > 10 seconds, call
1211 * nfs4_reset_lease() then write out the new lease (seconds) as reply
1212 */
1213 char *mesg = buf; 1212 char *mesg = buf;
1214 int rv, lease; 1213 int rv, i;
1215 1214
1216 if (size > 0) { 1215 if (size > 0) {
1217 if (nfsd_serv) 1216 if (nfsd_serv)
1218 return -EBUSY; 1217 return -EBUSY;
1219 rv = get_int(&mesg, &lease); 1218 rv = get_int(&mesg, &i);
1220 if (rv) 1219 if (rv)
1221 return rv; 1220 return rv;
1222 if (lease < 10 || lease > 3600) 1221 /*
1222 * Some sanity checking. We don't have a reason for
1223 * these particular numbers, but problems with the
1224 * extremes are:
1225 * - Too short: the briefest network outage may
1226 * cause clients to lose all their locks. Also,
1227 * the frequent polling may be wasteful.
1228 * - Too long: do you really want reboot recovery
1229 * to take more than an hour? Or to make other
1230 * clients wait an hour before being able to
1231 * revoke a dead client's locks?
1232 */
1233 if (i < 10 || i > 3600)
1223 return -EINVAL; 1234 return -EINVAL;
1224 nfs4_reset_lease(lease); 1235 *time = i;
1225 } 1236 }
1226 1237
1227 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", 1238 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
1228 nfs4_lease_time()); 1239}
1240
1241static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
1242{
1243 ssize_t rv;
1244
1245 mutex_lock(&nfsd_mutex);
1246 rv = __nfsd4_write_time(file, buf, size, time);
1247 mutex_unlock(&nfsd_mutex);
1248 return rv;
1229} 1249}
1230 1250
1231/** 1251/**
@@ -1251,12 +1271,22 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1251 */ 1271 */
1252static ssize_t write_leasetime(struct file *file, char *buf, size_t size) 1272static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
1253{ 1273{
1254 ssize_t rv; 1274 return nfsd4_write_time(file, buf, size, &nfsd4_lease);
1275}
1255 1276
1256 mutex_lock(&nfsd_mutex); 1277/**
1257 rv = __write_leasetime(file, buf, size); 1278 * write_gracetime - Set or report current NFSv4 grace period time
1258 mutex_unlock(&nfsd_mutex); 1279 *
1259 return rv; 1280 * As above, but sets the time of the NFSv4 grace period.
1281 *
1282 * Note this should never be set to less than the *previous*
1283 * lease-period time, but we don't try to enforce this. (In the common
1284 * case (a new boot), we don't know what the previous lease time was
1285 * anyway.)
1286 */
1287static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
1288{
1289 return nfsd4_write_time(file, buf, size, &nfsd4_grace);
1260} 1290}
1261 1291
1262extern char *nfs4_recoverydir(void); 1292extern char *nfs4_recoverydir(void);
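
Both writers now funnel through one bounds-checked helper and differ only in which global they hand it. Once the nfsd filesystem is mounted (conventionally at /proc/fs/nfsd), "echo 45 > /proc/fs/nfsd/nfsv4gracetime" would set a 45-second grace period. A self-contained userspace model of the helper's contract (names and errno values inlined for illustration):

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    /* Parse an optional new value, enforce the 10..3600 bounds, and
     * report the current value back, mirroring __nfsd4_write_time(). */
    static int write_time(const char *buf, time_t *t)
    {
            if (buf && *buf) {              /* the "size > 0" case */
                    long v = strtol(buf, NULL, 10);
                    if (v < 10 || v > 3600)
                            return -22;     /* -EINVAL */
                    *t = v;
            }
            printf("%ld\n", (long)*t);      /* the transaction reply */
            return 0;
    }

    int main(void)
    {
            time_t lease = 90, grace = 90;
            write_time("45", &grace);       /* echo 45 > nfsv4gracetime */
            write_time(NULL, &lease);       /* read back the lease time */
            return 0;
    }
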
@@ -1350,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1350 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1380 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
1351#ifdef CONFIG_NFSD_V4 1381#ifdef CONFIG_NFSD_V4
1352 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1382 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1383 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
1353 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, 1384 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
1354#endif 1385#endif
1355 /* last one */ {""} 1386 /* last one */ {""}
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index e942a1aaac92..72377761270e 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -82,7 +82,6 @@ int nfs4_state_init(void);
82void nfsd4_free_slabs(void); 82void nfsd4_free_slabs(void);
83int nfs4_state_start(void); 83int nfs4_state_start(void);
84void nfs4_state_shutdown(void); 84void nfs4_state_shutdown(void);
85time_t nfs4_lease_time(void);
86void nfs4_reset_lease(time_t leasetime); 85void nfs4_reset_lease(time_t leasetime);
87int nfs4_reset_recoverydir(char *recdir); 86int nfs4_reset_recoverydir(char *recdir);
88#else 87#else
@@ -90,7 +89,6 @@ static inline int nfs4_state_init(void) { return 0; }
90static inline void nfsd4_free_slabs(void) { } 89static inline void nfsd4_free_slabs(void) { }
91static inline int nfs4_state_start(void) { return 0; } 90static inline int nfs4_state_start(void) { return 0; }
92static inline void nfs4_state_shutdown(void) { } 91static inline void nfs4_state_shutdown(void) { }
93static inline time_t nfs4_lease_time(void) { return 0; }
94static inline void nfs4_reset_lease(time_t leasetime) { } 92static inline void nfs4_reset_lease(time_t leasetime) { }
95static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } 93static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
96#endif 94#endif
@@ -229,6 +227,9 @@ extern struct timeval nfssvc_boot;
229 227
230#ifdef CONFIG_NFSD_V4 228#ifdef CONFIG_NFSD_V4
231 229
230extern time_t nfsd4_lease;
231extern time_t nfsd4_grace;
232
232/* before processing a COMPOUND operation, we have to check that there 233/* before processing a COMPOUND operation, we have to check that there
233 * is enough space in the buffer for XDR encode to succeed. otherwise, 234 * is enough space in the buffer for XDR encode to succeed. otherwise,
234 * we might process an operation with side effects, and be unable to 235 * we might process an operation with side effects, and be unable to
@@ -247,7 +248,6 @@ extern struct timeval nfssvc_boot;
247#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ 248#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
248#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ 249#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
249 250
250#define NFSD_LEASE_TIME (nfs4_lease_time())
251#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ 251#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */
252 252
253/* 253/*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 171699eb07c8..06b2a26edfe0 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -120,7 +120,7 @@ u32 nfsd_supported_minorversion;
120int nfsd_vers(int vers, enum vers_op change) 120int nfsd_vers(int vers, enum vers_op change)
121{ 121{
122 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) 122 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
123 return -1; 123 return 0;
124 switch(change) { 124 switch(change) {
125 case NFSD_SET: 125 case NFSD_SET:
126 nfsd_versions[vers] = nfsd_version[vers]; 126 nfsd_versions[vers] = nfsd_version[vers];
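
A one-character fix with real consequences: callers use nfsd_vers(vers, NFSD_TEST) as a yes/no answer, and the old -1 return for an out-of-range version is truthy. A minimal demonstration (the bounds 2 and 5 are stand-ins for NFSD_MINVERS and NFSD_NRVERS):

    #include <stdio.h>

    static int vers_old(int v) { return (v < 2 || v >= 5) ? -1 : 1; }
    static int vers_new(int v) { return (v < 2 || v >= 5) ?  0 : 1; }

    int main(void)
    {
            int bogus = 42;     /* no such NFS version */
            printf("old: %s\n", vers_old(bogus) ? "supported" : "unsupported");
            printf("new: %s\n", vers_new(bogus) ? "supported" : "unsupported");
            return 0;
    }
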
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index fefeae27f25e..006c84230c7c 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -70,6 +70,16 @@ struct nfsd4_cb_sequence {
70 struct nfs4_client *cbs_clp; 70 struct nfs4_client *cbs_clp;
71}; 71};
72 72
73struct nfs4_rpc_args {
74 void *args_op;
75 struct nfsd4_cb_sequence args_seq;
76};
77
78struct nfsd4_callback {
79 struct nfs4_rpc_args cb_args;
80 struct work_struct cb_work;
81};
82
73struct nfs4_delegation { 83struct nfs4_delegation {
74 struct list_head dl_perfile; 84 struct list_head dl_perfile;
75 struct list_head dl_perclnt; 85 struct list_head dl_perclnt;
@@ -86,6 +96,7 @@ struct nfs4_delegation {
86 stateid_t dl_stateid; 96 stateid_t dl_stateid;
87 struct knfsd_fh dl_fh; 97 struct knfsd_fh dl_fh;
88 int dl_retries; 98 int dl_retries;
99 struct nfsd4_callback dl_recall;
89}; 100};
90 101
91/* client delegation callback info */ 102/* client delegation callback info */
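
Embedding a struct nfsd4_callback (declared just above) in every delegation gives each recall a pre-allocated work item, so queueing a CB_RECALL cannot fail for lack of memory at recall time. How it is presumably wired up (the real code is in nfs4state.c and nfs4callback.c, outside this diff; the callback_wq name is an assumption):

    static struct workqueue_struct *callback_wq;    /* created by
                                                       nfsd4_create_callback_queue() */

    static void recall_delegation(struct nfs4_delegation *dp)
    {
            INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
            queue_work(callback_wq, &dp->dl_recall.cb_work);
    }
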
@@ -96,9 +107,7 @@ struct nfs4_cb_conn {
96 u32 cb_prog; 107 u32 cb_prog;
97 u32 cb_minorversion; 108 u32 cb_minorversion;
98 u32 cb_ident; /* minorversion 0 only */ 109 u32 cb_ident; /* minorversion 0 only */
99 /* RPC client info */ 110 struct svc_xprt *cb_xprt; /* minorversion 1 only */
100 atomic_t cb_set; /* successful CB_NULL call */
101 struct rpc_clnt * cb_client;
102}; 111};
103 112
104/* Maximum number of slots per session. 160 is useful for long haul TCP */ 113/* Maximum number of slots per session. 160 is useful for long haul TCP */
@@ -157,7 +166,7 @@ struct nfsd4_session {
157 struct list_head se_hash; /* hash by sessionid */ 166 struct list_head se_hash; /* hash by sessionid */
158 struct list_head se_perclnt; 167 struct list_head se_perclnt;
159 u32 se_flags; 168 u32 se_flags;
160 struct nfs4_client *se_client; /* for expire_client */ 169 struct nfs4_client *se_client;
161 struct nfs4_sessionid se_sessionid; 170 struct nfs4_sessionid se_sessionid;
162 struct nfsd4_channel_attrs se_fchannel; 171 struct nfsd4_channel_attrs se_fchannel;
163 struct nfsd4_channel_attrs se_bchannel; 172 struct nfsd4_channel_attrs se_bchannel;
@@ -212,25 +221,41 @@ struct nfs4_client {
212 struct svc_cred cl_cred; /* setclientid principal */ 221 struct svc_cred cl_cred; /* setclientid principal */
213 clientid_t cl_clientid; /* generated by server */ 222 clientid_t cl_clientid; /* generated by server */
214 nfs4_verifier cl_confirm; /* generated by server */ 223 nfs4_verifier cl_confirm; /* generated by server */
215 struct nfs4_cb_conn cl_cb_conn; /* callback info */
216 atomic_t cl_count; /* ref count */
217 u32 cl_firststate; /* recovery dir creation */ 224 u32 cl_firststate; /* recovery dir creation */
218 225
226 /* for v4.0 and v4.1 callbacks: */
227 struct nfs4_cb_conn cl_cb_conn;
228 struct rpc_clnt *cl_cb_client;
229 atomic_t cl_cb_set;
230
219 /* for nfs41 */ 231 /* for nfs41 */
220 struct list_head cl_sessions; 232 struct list_head cl_sessions;
221 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ 233 struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */
222 u32 cl_exchange_flags; 234 u32 cl_exchange_flags;
223 struct nfs4_sessionid cl_sessionid; 235 struct nfs4_sessionid cl_sessionid;
236 /* number of rpc's in progress over an associated session: */
237 atomic_t cl_refcount;
224 238
225 /* for nfs41 callbacks */ 239 /* for nfs41 callbacks */
226 /* We currently support a single back channel with a single slot */ 240 /* We currently support a single back channel with a single slot */
227 unsigned long cl_cb_slot_busy; 241 unsigned long cl_cb_slot_busy;
228 u32 cl_cb_seq_nr; 242 u32 cl_cb_seq_nr;
229 struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */
230 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ 243 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
231 /* wait here for slots */ 244 /* wait here for slots */
232}; 245};
233 246
247static inline void
248mark_client_expired(struct nfs4_client *clp)
249{
250 clp->cl_time = 0;
251}
252
253static inline bool
254is_client_expired(struct nfs4_client *clp)
255{
256 return clp->cl_time == 0;
257}
258
234/* struct nfs4_client_reset 259/* struct nfs4_client_reset
235 * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl 260 * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
236 * upon lease reset, or from upcall to state_daemon (to read in state 261 * upon lease reset, or from upcall to state_daemon (to read in state
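
The two new inlines make cl_time do double duty as an expiry sentinel: cl_refcount counts RPCs in flight over the client's sessions, and a client that expires while still busy is only marked, with the final reference holder completing the teardown. A sketch of the intended protocol (an inference from the helpers above and release_session_client(), not code from this diff):

    static void laundromat_expire(struct nfs4_client *clp)
    {
            if (atomic_read(&clp->cl_refcount) == 0)
                    expire_client(clp);       /* idle: reap immediately      */
            else
                    mark_client_expired(clp); /* busy: cl_time = 0; the last */
                                              /* release_session_client()    */
                                              /* finishes the job            */
    }
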
@@ -377,11 +402,14 @@ extern void nfs4_lock_state(void);
377extern void nfs4_unlock_state(void); 402extern void nfs4_unlock_state(void);
378extern int nfs4_in_grace(void); 403extern int nfs4_in_grace(void);
379extern __be32 nfs4_check_open_reclaim(clientid_t *clid); 404extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
380extern void put_nfs4_client(struct nfs4_client *clp);
381extern void nfs4_free_stateowner(struct kref *kref); 405extern void nfs4_free_stateowner(struct kref *kref);
382extern int set_callback_cred(void); 406extern int set_callback_cred(void);
383extern void nfsd4_probe_callback(struct nfs4_client *clp); 407extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
408extern void nfsd4_do_callback_rpc(struct work_struct *);
384extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 409extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
410extern int nfsd4_create_callback_queue(void);
411extern void nfsd4_destroy_callback_queue(void);
412extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
385extern void nfs4_put_delegation(struct nfs4_delegation *dp); 413extern void nfs4_put_delegation(struct nfs4_delegation *dp);
386extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); 414extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
387extern void nfsd4_init_recdir(char *recdir_name); 415extern void nfsd4_init_recdir(char *recdir_name);
@@ -392,6 +420,7 @@ extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
392extern void nfsd4_recdir_purge_old(void); 420extern void nfsd4_recdir_purge_old(void);
393extern int nfsd4_create_clid_dir(struct nfs4_client *clp); 421extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
394extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); 422extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
423extern void release_session_client(struct nfsd4_session *);
395 424
396static inline void 425static inline void
397nfs4_put_stateowner(struct nfs4_stateowner *so) 426nfs4_put_stateowner(struct nfs4_stateowner *so)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a11b0e8678ee..23c06f77f4ca 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -25,6 +25,7 @@
25#include <linux/xattr.h> 25#include <linux/xattr.h>
26#include <linux/jhash.h> 26#include <linux/jhash.h>
27#include <linux/ima.h> 27#include <linux/ima.h>
28#include <linux/slab.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include <linux/exportfs.h> 30#include <linux/exportfs.h>
30#include <linux/writeback.h> 31#include <linux/writeback.h>
@@ -723,7 +724,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
723 struct inode *inode; 724 struct inode *inode;
724 int flags = O_RDONLY|O_LARGEFILE; 725 int flags = O_RDONLY|O_LARGEFILE;
725 __be32 err; 726 __be32 err;
726 int host_err; 727 int host_err = 0;
727 728
728 validate_process_creds(); 729 validate_process_creds();
729 730
@@ -760,7 +761,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
760 * Check to see if there are any leases on this file. 761 * Check to see if there are any leases on this file.
761 * This may block while leases are broken. 762 * This may block while leases are broken.
762 */ 763 */
763 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); 764 if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
765 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
764 if (host_err == -EWOULDBLOCK) 766 if (host_err == -EWOULDBLOCK)
765 host_err = -ETIMEDOUT; 767 host_err = -ETIMEDOUT;
766 if (host_err) /* NOMEM or WOULDBLOCK */ 768 if (host_err) /* NOMEM or WOULDBLOCK */
@@ -1168,7 +1170,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1168 goto out; 1170 goto out;
1169 } 1171 }
1170 1172
1171 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); 1173 err = nfsd_open(rqstp, fhp, S_IFREG,
1174 NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
1172 if (err) 1175 if (err)
1173 goto out; 1176 goto out;
1174 if (EX_ISSYNC(fhp->fh_export)) { 1177 if (EX_ISSYNC(fhp->fh_export)) {
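
The rationale for NFSD_MAY_NOT_BREAK_LEASE, in comment form (the deadlock point is an inference, not stated in the patch):

    /* A COMMIT only flushes data that an earlier WRITE already handed to
     * the filesystem, and that WRITE's own nfsd_open() broke any
     * conflicting lease:
     *
     *   WRITE  -> nfsd_open(NFSD_MAY_WRITE)                breaks the lease
     *   COMMIT -> nfsd_open(NFSD_MAY_WRITE |
     *                       NFSD_MAY_NOT_BREAK_LEASE)      skips that step
     *
     * Breaking the lease again from COMMIT could block the reply
     * needlessly, or possibly deadlock against nfsd's own delegation
     * recalls.  The "host_err = 0" initialization in the first vfs.c
     * hunk exists for the same reason: break_lease() is now a
     * conditional first assignment of host_err before its first test. */
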
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 4b1de0a9ea75..217a62c2a357 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -20,6 +20,7 @@
20#define NFSD_MAY_OWNER_OVERRIDE 64 20#define NFSD_MAY_OWNER_OVERRIDE 64
21#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ 21#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
22#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 22#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
23#define NFSD_MAY_NOT_BREAK_LEASE 512
23 24
24#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) 25#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
25#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) 26#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index efa337739534..4d476ff08ae6 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -381,6 +381,10 @@ struct nfsd4_destroy_session {
381 struct nfs4_sessionid sessionid; 381 struct nfs4_sessionid sessionid;
382}; 382};
383 383
384struct nfsd4_reclaim_complete {
385 u32 rca_one_fs;
386};
387
384struct nfsd4_op { 388struct nfsd4_op {
385 int opnum; 389 int opnum;
386 __be32 status; 390 __be32 status;
@@ -421,6 +425,7 @@ struct nfsd4_op {
421 struct nfsd4_create_session create_session; 425 struct nfsd4_create_session create_session;
422 struct nfsd4_destroy_session destroy_session; 426 struct nfsd4_destroy_session destroy_session;
423 struct nfsd4_sequence sequence; 427 struct nfsd4_sequence sequence;
428 struct nfsd4_reclaim_complete reclaim_complete;
424 } u; 429 } u;
425 struct nfs4_replay * replay; 430 struct nfs4_replay * replay;
426}; 431};
@@ -513,9 +518,8 @@ extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
513extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, 518extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
514 struct nfsd4_sequence *seq); 519 struct nfsd4_sequence *seq);
515extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 520extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
516 struct nfsd4_compound_state *, 521 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
517struct nfsd4_exchange_id *); 522extern __be32 nfsd4_create_session(struct svc_rqst *,
518 extern __be32 nfsd4_create_session(struct svc_rqst *,
519 struct nfsd4_compound_state *, 523 struct nfsd4_compound_state *,
520 struct nfsd4_create_session *); 524 struct nfsd4_create_session *);
521extern __be32 nfsd4_sequence(struct svc_rqst *, 525extern __be32 nfsd4_sequence(struct svc_rqst *,
@@ -524,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
524extern __be32 nfsd4_destroy_session(struct svc_rqst *, 528extern __be32 nfsd4_destroy_session(struct svc_rqst *,
525 struct nfsd4_compound_state *, 529 struct nfsd4_compound_state *,
526 struct nfsd4_destroy_session *); 530 struct nfsd4_destroy_session *);
531__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
527extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, 532extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
528 struct nfsd4_open *open); 533 struct nfsd4_open *open);
529extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, 534extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 3f959f1879d8..d7fd696e595c 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -26,10 +26,16 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/bitops.h> 28#include <linux/bitops.h>
29#include <linux/slab.h>
29#include "mdt.h" 30#include "mdt.h"
30#include "alloc.h" 31#include "alloc.h"
31 32
32 33
34/**
35 * nilfs_palloc_groups_per_desc_block - get the number of groups that a group
36 * descriptor block can maintain
37 * @inode: inode of metadata file using this allocator
38 */
33static inline unsigned long 39static inline unsigned long
34nilfs_palloc_groups_per_desc_block(const struct inode *inode) 40nilfs_palloc_groups_per_desc_block(const struct inode *inode)
35{ 41{
@@ -37,12 +43,21 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode)
37 sizeof(struct nilfs_palloc_group_desc); 43 sizeof(struct nilfs_palloc_group_desc);
38} 44}
39 45
46/**
47 * nilfs_palloc_groups_count - get maximum number of groups
48 * @inode: inode of metadata file using this allocator
49 */
40static inline unsigned long 50static inline unsigned long
41nilfs_palloc_groups_count(const struct inode *inode) 51nilfs_palloc_groups_count(const struct inode *inode)
42{ 52{
43 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); 53 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
44} 54}
45 55
56/**
57 * nilfs_palloc_init_blockgroup - initialize private variables for allocator
58 * @inode: inode of metadata file using this allocator
59 * @entry_size: size of the persistent object
60 */
46int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) 61int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
47{ 62{
48 struct nilfs_mdt_info *mi = NILFS_MDT(inode); 63 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -68,6 +83,12 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
68 return 0; 83 return 0;
69} 84}
70 85
86/**
87 * nilfs_palloc_group - get group number and offset from an entry number
88 * @inode: inode of metadata file using this allocator
89 * @nr: serial number of the entry (e.g. inode number)
90 * @offset: pointer to store offset number in the group
91 */
71static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, 92static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
72 unsigned long *offset) 93 unsigned long *offset)
73{ 94{
@@ -77,6 +98,14 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
77 return group; 98 return group;
78} 99}
79 100
101/**
102 * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block
103 * @inode: inode of metadata file using this allocator
104 * @group: group number
105 *
106 * nilfs_palloc_desc_blkoff() returns block offset of the descriptor
107 * block which contains a descriptor of the specified group.
108 */
80static unsigned long 109static unsigned long
81nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) 110nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
82{ 111{
@@ -85,6 +114,14 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
85 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; 114 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
86} 115}
87 116
117/**
118 * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block
119 * @inode: inode of metadata file using this allocator
120 * @group: group number
121 *
122 * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap
123 * block used to allocate/deallocate entries in the specified group.
124 */
88static unsigned long 125static unsigned long
89nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) 126nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
90{ 127{
@@ -94,6 +131,12 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
94 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; 131 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
95} 132}
96 133
134/**
135 * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
136 * @inode: inode of metadata file using this allocator
137 * @group: group number
138 * @desc: pointer to descriptor structure for the group
139 */
97static unsigned long 140static unsigned long
98nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, 141nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
99 const struct nilfs_palloc_group_desc *desc) 142 const struct nilfs_palloc_group_desc *desc)
@@ -106,6 +149,13 @@ nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
106 return nfree; 149 return nfree;
107} 150}
108 151
152/**
153 * nilfs_palloc_group_desc_add_entries - adjust count of free entries
154 * @inode: inode of metadata file using this allocator
155 * @group: group number
156 * @desc: pointer to descriptor structure for the group
157 * @n: delta to be added
158 */
109static void 159static void
110nilfs_palloc_group_desc_add_entries(struct inode *inode, 160nilfs_palloc_group_desc_add_entries(struct inode *inode,
111 unsigned long group, 161 unsigned long group,
@@ -117,6 +167,11 @@ nilfs_palloc_group_desc_add_entries(struct inode *inode,
117 spin_unlock(nilfs_mdt_bgl_lock(inode, group)); 167 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
118} 168}
119 169
170/**
171 * nilfs_palloc_entry_blkoff - get block offset of an entry block
172 * @inode: inode of metadata file using this allocator
173 * @nr: serial number of the entry (e.g. inode number)
174 */
120static unsigned long 175static unsigned long
121nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) 176nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
122{ 177{
@@ -128,6 +183,12 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
128 group_offset / NILFS_MDT(inode)->mi_entries_per_block; 183 group_offset / NILFS_MDT(inode)->mi_entries_per_block;
129} 184}
130 185
186/**
187 * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block
188 * @inode: inode of metadata file
189 * @bh: buffer head of the buffer to be initialized
190 * @kaddr: kernel address mapped for the page including the buffer
191 */
131static void nilfs_palloc_desc_block_init(struct inode *inode, 192static void nilfs_palloc_desc_block_init(struct inode *inode,
132 struct buffer_head *bh, void *kaddr) 193 struct buffer_head *bh, void *kaddr)
133{ 194{
@@ -178,6 +239,13 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
178 return ret; 239 return ret;
179} 240}
180 241
242/**
243 * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
244 * @inode: inode of metadata file using this allocator
245 * @group: group number
246 * @create: create flag
247 * @bhp: pointer to store the resultant buffer head
248 */
181static int nilfs_palloc_get_desc_block(struct inode *inode, 249static int nilfs_palloc_get_desc_block(struct inode *inode,
182 unsigned long group, 250 unsigned long group,
183 int create, struct buffer_head **bhp) 251 int create, struct buffer_head **bhp)
@@ -190,6 +258,13 @@ static int nilfs_palloc_get_desc_block(struct inode *inode,
190 bhp, &cache->prev_desc, &cache->lock); 258 bhp, &cache->prev_desc, &cache->lock);
191} 259}
192 260
261/**
262 * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block
263 * @inode: inode of metadata file using this allocator
264 * @group: group number
265 * @create: create flag
266 * @bhp: pointer to store the resultant buffer head
267 */
193static int nilfs_palloc_get_bitmap_block(struct inode *inode, 268static int nilfs_palloc_get_bitmap_block(struct inode *inode,
194 unsigned long group, 269 unsigned long group,
195 int create, struct buffer_head **bhp) 270 int create, struct buffer_head **bhp)
@@ -202,6 +277,13 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
202 &cache->prev_bitmap, &cache->lock); 277 &cache->prev_bitmap, &cache->lock);
203} 278}
204 279
280/**
281 * nilfs_palloc_get_entry_block - get buffer head of an entry block
282 * @inode: inode of metadata file using this allocator
283 * @nr: serial number of the entry (e.g. inode number)
284 * @create: create flag
285 * @bhp: pointer to store the resultant buffer head
286 */
205int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, 287int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
206 int create, struct buffer_head **bhp) 288 int create, struct buffer_head **bhp)
207{ 289{
@@ -213,6 +295,13 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
213 &cache->prev_entry, &cache->lock); 295 &cache->prev_entry, &cache->lock);
214} 296}
215 297
298/**
299 * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
300 * @inode: inode of metadata file using this allocator
301 * @group: group number
302 * @bh: buffer head of the buffer storing the group descriptor block
303 * @kaddr: kernel address mapped for the page including the buffer
304 */
216static struct nilfs_palloc_group_desc * 305static struct nilfs_palloc_group_desc *
217nilfs_palloc_block_get_group_desc(const struct inode *inode, 306nilfs_palloc_block_get_group_desc(const struct inode *inode,
218 unsigned long group, 307 unsigned long group,
@@ -222,6 +311,13 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
222 group % nilfs_palloc_groups_per_desc_block(inode); 311 group % nilfs_palloc_groups_per_desc_block(inode);
223} 312}
224 313
314/**
315 * nilfs_palloc_block_get_entry - get kernel address of an entry
316 * @inode: inode of metadata file using this allocator
317 * @nr: serial number of the entry (e.g. inode number)
318 * @bh: buffer head of the buffer storing the entry block
319 * @kaddr: kernel address mapped for the page including the buffer
320 */
225void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, 321void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
226 const struct buffer_head *bh, void *kaddr) 322 const struct buffer_head *bh, void *kaddr)
227{ 323{
@@ -234,11 +330,19 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
234 entry_offset * NILFS_MDT(inode)->mi_entry_size; 330 entry_offset * NILFS_MDT(inode)->mi_entry_size;
235} 331}
236 332
333/**
334 * nilfs_palloc_find_available_slot - find available slot in a group
335 * @inode: inode of metadata file using this allocator
336 * @group: group number
337 * @target: offset number of an entry in the group (start point)
338 * @bitmap: bitmap of the group
339 * @bsize: size in bits
340 */
237static int nilfs_palloc_find_available_slot(struct inode *inode, 341static int nilfs_palloc_find_available_slot(struct inode *inode,
238 unsigned long group, 342 unsigned long group,
239 unsigned long target, 343 unsigned long target,
240 unsigned char *bitmap, 344 unsigned char *bitmap,
241 int bsize) /* size in bits */ 345 int bsize)
242{ 346{
243 int curr, pos, end, i; 347 int curr, pos, end, i;
244 348
@@ -276,6 +380,13 @@ static int nilfs_palloc_find_available_slot(struct inode *inode,
276 return -ENOSPC; 380 return -ENOSPC;
277} 381}
278 382
383/**
384 * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups
385 * in a group descriptor block
386 * @inode: inode of metadata file using this allocator
387 * @curr: current group number
388 * @max: maximum number of groups
389 */
279static unsigned long 390static unsigned long
280nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, 391nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
281 unsigned long curr, unsigned long max) 392 unsigned long curr, unsigned long max)
@@ -286,6 +397,11 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
286 max - curr + 1); 397 max - curr + 1);
287} 398}
288 399
400/**
401 * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
402 * @inode: inode of metadata file using this allocator
403 * @req: nilfs_palloc_req structure exchanged for the allocation
404 */
289int nilfs_palloc_prepare_alloc_entry(struct inode *inode, 405int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
290 struct nilfs_palloc_req *req) 406 struct nilfs_palloc_req *req)
291{ 407{
@@ -365,6 +481,11 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
365 return ret; 481 return ret;
366} 482}
367 483
484/**
485 * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object
486 * @inode: inode of metadata file using this allocator
487 * @req: nilfs_palloc_req structure exchanged for the allocation
488 */
368void nilfs_palloc_commit_alloc_entry(struct inode *inode, 489void nilfs_palloc_commit_alloc_entry(struct inode *inode,
369 struct nilfs_palloc_req *req) 490 struct nilfs_palloc_req *req)
370{ 491{
@@ -376,6 +497,11 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode,
376 brelse(req->pr_desc_bh); 497 brelse(req->pr_desc_bh);
377} 498}
378 499
500/**
501 * nilfs_palloc_commit_free_entry - finish deallocating a persistent object
502 * @inode: inode of metadata file using this allocator
503 * @req: nilfs_palloc_req structure exchanged for the removal
504 */
379void nilfs_palloc_commit_free_entry(struct inode *inode, 505void nilfs_palloc_commit_free_entry(struct inode *inode,
380 struct nilfs_palloc_req *req) 506 struct nilfs_palloc_req *req)
381{ 507{
@@ -409,6 +535,11 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
409 brelse(req->pr_desc_bh); 535 brelse(req->pr_desc_bh);
410} 536}
411 537
538/**
539 * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object
540 * @inode: inode of metadata file using this allocator
541 * @req: nilfs_palloc_req structure exchanged for the allocation
542 */
412void nilfs_palloc_abort_alloc_entry(struct inode *inode, 543void nilfs_palloc_abort_alloc_entry(struct inode *inode,
413 struct nilfs_palloc_req *req) 544 struct nilfs_palloc_req *req)
414{ 545{
@@ -425,7 +556,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
425 bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); 556 bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
426 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), 557 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
427 group_offset, bitmap)) 558 group_offset, bitmap))
428 printk(KERN_WARNING "%s: entry numer %llu already freed\n", 559 printk(KERN_WARNING "%s: entry number %llu already freed\n",
429 __func__, (unsigned long long)req->pr_entry_nr); 560 __func__, (unsigned long long)req->pr_entry_nr);
430 561
431 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 562 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
@@ -441,6 +572,11 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
441 req->pr_desc_bh = NULL; 572 req->pr_desc_bh = NULL;
442} 573}
443 574
575/**
576 * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object
577 * @inode: inode of metadata file using this allocator
578 * @req: nilfs_palloc_req structure exchanged for the removal
579 */
444int nilfs_palloc_prepare_free_entry(struct inode *inode, 580int nilfs_palloc_prepare_free_entry(struct inode *inode,
445 struct nilfs_palloc_req *req) 581 struct nilfs_palloc_req *req)
446{ 582{
@@ -463,6 +599,11 @@ int nilfs_palloc_prepare_free_entry(struct inode *inode,
463 return 0; 599 return 0;
464} 600}
465 601
602/**
603 * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object
604 * @inode: inode of metadata file using this allocator
605 * @req: nilfs_palloc_req structure exchanged for the removal
606 */
466void nilfs_palloc_abort_free_entry(struct inode *inode, 607void nilfs_palloc_abort_free_entry(struct inode *inode,
467 struct nilfs_palloc_req *req) 608 struct nilfs_palloc_req *req)
468{ 609{
@@ -474,6 +615,12 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
474 req->pr_desc_bh = NULL; 615 req->pr_desc_bh = NULL;
475} 616}
476 617
618/**
619 * nilfs_palloc_group_is_in - judge if an entry is in a group
620 * @inode: inode of metadata file using this allocator
621 * @group: group number
622 * @nr: serial number of the entry (e.g. inode number)
623 */
477static int 624static int
478nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) 625nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
479{ 626{
@@ -484,6 +631,12 @@ nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
484 return (nr >= first) && (nr <= last); 631 return (nr >= first) && (nr <= last);
485} 632}
486 633
634/**
635 * nilfs_palloc_freev - deallocate a set of persistent objects
636 * @inode: inode of metadata file using this allocator
637 * @entry_nrs: array of entry numbers to be deallocated
638 * @nitems: number of entries stored in @entry_nrs
639 */
487int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) 640int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
488{ 641{
489 struct buffer_head *desc_bh, *bitmap_bh; 642 struct buffer_head *desc_bh, *bitmap_bh;
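
Taken together, the new kernel-doc documents the allocator's layout: entries live in fixed-size groups, each group owns one bitmap block (a bit per entry), and every nilfs_palloc_groups_per_desc_block() groups share a descriptor block holding per-group free counts. The arithmetic the helpers implement, simplified (the kernel code uses do_div() on the 64-bit entry number):

    unsigned long epg    = nilfs_palloc_entries_per_group(inode);
    unsigned long group  = nr / epg;    /* which group the entry lives in */
    unsigned long offset = nr % epg;    /* its slot inside that group     */
    /* From there:
     *   descriptor blkoff: (group / groups_per_desc_block)
     *                        * mi_blocks_per_desc_block
     *   bitmap blkoff:     descriptor blkoff plus the group's offset
     *                        within its descriptor stride
     *   entry blkoff:      bitmap blkoff plus
     *                        offset / mi_entries_per_block              */
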
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index f4543ac4f560..9af34a7e6e13 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -29,6 +29,13 @@
29#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
30#include <linux/fs.h> 30#include <linux/fs.h>
31 31
32/**
33 * nilfs_palloc_entries_per_group - get the number of entries per group
34 * @inode: inode of metadata file using this allocator
35 *
36 * The number of entries per group is defined by the number of bits
37 * that a bitmap block can maintain.
38 */
32static inline unsigned long 39static inline unsigned long
33nilfs_palloc_entries_per_group(const struct inode *inode) 40nilfs_palloc_entries_per_group(const struct inode *inode)
34{ 41{
@@ -42,7 +49,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
42 const struct buffer_head *, void *); 49 const struct buffer_head *, void *);
43 50
44/** 51/**
45 * nilfs_palloc_req - persistent alloctor request and reply 52 * nilfs_palloc_req - persistent allocator request and reply
46 * @pr_entry_nr: entry number (vblocknr or inode number) 53 * @pr_entry_nr: entry number (vblocknr or inode number)
47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors 54 * @pr_desc_bh: buffer head of the buffer containing block group descriptors
48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap 55 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 471e269536ae..447ce47a3306 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -27,6 +27,7 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/gfp.h>
30#include "nilfs.h" 31#include "nilfs.h"
31#include "mdt.h" 32#include "mdt.h"
32#include "dat.h" 33#include "dat.h"
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 7cdd98b8d514..b27a342c5af6 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,63 +31,16 @@
31#include "alloc.h" 31#include "alloc.h"
32#include "dat.h" 32#include "dat.h"
33 33
34/** 34static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
35 * struct nilfs_btree_path - A path on which B-tree operations are executed
36 * @bp_bh: buffer head of node block
37 * @bp_sib_bh: buffer head of sibling node block
38 * @bp_index: index of child node
39 * @bp_oldreq: ptr end request for old ptr
40 * @bp_newreq: ptr alloc request for new ptr
41 * @bp_op: rebalance operation
42 */
43struct nilfs_btree_path {
44 struct buffer_head *bp_bh;
45 struct buffer_head *bp_sib_bh;
46 int bp_index;
47 union nilfs_bmap_ptr_req bp_oldreq;
48 union nilfs_bmap_ptr_req bp_newreq;
49 struct nilfs_btnode_chkey_ctxt bp_ctxt;
50 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
51 int, __u64 *, __u64 *);
52};
53
54/*
55 * B-tree path operations
56 */
57
58static struct kmem_cache *nilfs_btree_path_cache;
59
60int __init nilfs_btree_path_cache_init(void)
61{
62 nilfs_btree_path_cache =
63 kmem_cache_create("nilfs2_btree_path_cache",
64 sizeof(struct nilfs_btree_path) *
65 NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
66 return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
67}
68
69void nilfs_btree_path_cache_destroy(void)
70{
71 kmem_cache_destroy(nilfs_btree_path_cache);
72}
73
74static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
75{
76 return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
77}
78
79static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
80{ 35{
81 kmem_cache_free(nilfs_btree_path_cache, path); 36 struct nilfs_btree_path *path;
82} 37 int level = NILFS_BTREE_LEVEL_DATA;
83 38
84static void nilfs_btree_init_path(struct nilfs_btree_path *path) 39 path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
85{ 40 if (path == NULL)
86 int level; 41 goto out;
87 42
88 for (level = NILFS_BTREE_LEVEL_DATA; 43 for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
89 level < NILFS_BTREE_LEVEL_MAX;
90 level++) {
91 path[level].bp_bh = NULL; 44 path[level].bp_bh = NULL;
92 path[level].bp_sib_bh = NULL; 45 path[level].bp_sib_bh = NULL;
93 path[level].bp_index = 0; 46 path[level].bp_index = 0;
@@ -95,15 +48,19 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path)
95 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; 48 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
96 path[level].bp_op = NULL; 49 path[level].bp_op = NULL;
97 } 50 }
51
52out:
53 return path;
98} 54}
99 55
100static void nilfs_btree_release_path(struct nilfs_btree_path *path) 56static void nilfs_btree_free_path(struct nilfs_btree_path *path)
101{ 57{
102 int level; 58 int level = NILFS_BTREE_LEVEL_DATA;
103 59
104 for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; 60 for (; level < NILFS_BTREE_LEVEL_MAX; level++)
105 level++)
106 brelse(path[level].bp_bh); 61 brelse(path[level].bp_bh);
62
63 kmem_cache_free(nilfs_btree_path_cache, path);
107} 64}
108 65
109/* 66/*
@@ -566,14 +523,12 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
566 path = nilfs_btree_alloc_path(); 523 path = nilfs_btree_alloc_path();
567 if (path == NULL) 524 if (path == NULL)
568 return -ENOMEM; 525 return -ENOMEM;
569 nilfs_btree_init_path(path);
570 526
571 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 527 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
572 528
573 if (ptrp != NULL) 529 if (ptrp != NULL)
574 *ptrp = ptr; 530 *ptrp = ptr;
575 531
576 nilfs_btree_release_path(path);
577 nilfs_btree_free_path(path); 532 nilfs_btree_free_path(path);
578 533
579 return ret; 534 return ret;
@@ -594,7 +549,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
594 path = nilfs_btree_alloc_path(); 549 path = nilfs_btree_alloc_path();
595 if (path == NULL) 550 if (path == NULL)
596 return -ENOMEM; 551 return -ENOMEM;
597 nilfs_btree_init_path(path); 552
598 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 553 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
599 if (ret < 0) 554 if (ret < 0)
600 goto out; 555 goto out;
@@ -655,7 +610,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
655 *ptrp = ptr; 610 *ptrp = ptr;
656 ret = cnt; 611 ret = cnt;
657 out: 612 out:
658 nilfs_btree_release_path(path);
659 nilfs_btree_free_path(path); 613 nilfs_btree_free_path(path);
660 return ret; 614 return ret;
661} 615}
@@ -1123,7 +1077,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1123 path = nilfs_btree_alloc_path(); 1077 path = nilfs_btree_alloc_path();
1124 if (path == NULL) 1078 if (path == NULL)
1125 return -ENOMEM; 1079 return -ENOMEM;
1126 nilfs_btree_init_path(path);
1127 1080
1128 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1081 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1129 NILFS_BTREE_LEVEL_NODE_MIN); 1082 NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1140,7 +1093,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1140 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1093 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1141 1094
1142 out: 1095 out:
1143 nilfs_btree_release_path(path);
1144 nilfs_btree_free_path(path); 1096 nilfs_btree_free_path(path);
1145 return ret; 1097 return ret;
1146} 1098}
@@ -1456,7 +1408,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1456 path = nilfs_btree_alloc_path(); 1408 path = nilfs_btree_alloc_path();
1457 if (path == NULL) 1409 if (path == NULL)
1458 return -ENOMEM; 1410 return -ENOMEM;
1459 nilfs_btree_init_path(path); 1411
1460 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1412 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1461 NILFS_BTREE_LEVEL_NODE_MIN); 1413 NILFS_BTREE_LEVEL_NODE_MIN);
1462 if (ret < 0) 1414 if (ret < 0)
@@ -1473,7 +1425,6 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1473 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); 1425 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1474 1426
1475out: 1427out:
1476 nilfs_btree_release_path(path);
1477 nilfs_btree_free_path(path); 1428 nilfs_btree_free_path(path);
1478 return ret; 1429 return ret;
1479} 1430}
@@ -1488,11 +1439,9 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1488 path = nilfs_btree_alloc_path(); 1439 path = nilfs_btree_alloc_path();
1489 if (path == NULL) 1440 if (path == NULL)
1490 return -ENOMEM; 1441 return -ENOMEM;
1491 nilfs_btree_init_path(path);
1492 1442
1493 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); 1443 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1494 1444
1495 nilfs_btree_release_path(path);
1496 nilfs_btree_free_path(path); 1445 nilfs_btree_free_path(path);
1497 1446
1498 return ret; 1447 return ret;
@@ -1879,7 +1828,7 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1879 struct nilfs_btree_path *path, 1828 struct nilfs_btree_path *path,
1880 int level, struct buffer_head *bh) 1829 int level, struct buffer_head *bh)
1881{ 1830{
1882 int maxlevel, ret; 1831 int maxlevel = 0, ret;
1883 struct nilfs_btree_node *parent; 1832 struct nilfs_btree_node *parent;
1884 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1833 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
1885 __u64 ptr; 1834 __u64 ptr;
@@ -1923,7 +1872,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1923 path = nilfs_btree_alloc_path(); 1872 path = nilfs_btree_alloc_path();
1924 if (path == NULL) 1873 if (path == NULL)
1925 return -ENOMEM; 1874 return -ENOMEM;
1926 nilfs_btree_init_path(path);
1927 1875
1928 if (buffer_nilfs_node(bh)) { 1876 if (buffer_nilfs_node(bh)) {
1929 node = (struct nilfs_btree_node *)bh->b_data; 1877 node = (struct nilfs_btree_node *)bh->b_data;
@@ -1947,7 +1895,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1947 nilfs_btree_propagate_p(btree, path, level, bh); 1895 nilfs_btree_propagate_p(btree, path, level, bh);
1948 1896
1949 out: 1897 out:
1950 nilfs_btree_release_path(path);
1951 nilfs_btree_free_path(path); 1898 nilfs_btree_free_path(path);
1952 1899
1953 return ret; 1900 return ret;
@@ -2108,7 +2055,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2108 path = nilfs_btree_alloc_path(); 2055 path = nilfs_btree_alloc_path();
2109 if (path == NULL) 2056 if (path == NULL)
2110 return -ENOMEM; 2057 return -ENOMEM;
2111 nilfs_btree_init_path(path);
2112 2058
2113 if (buffer_nilfs_node(*bh)) { 2059 if (buffer_nilfs_node(*bh)) {
2114 node = (struct nilfs_btree_node *)(*bh)->b_data; 2060 node = (struct nilfs_btree_node *)(*bh)->b_data;
@@ -2130,7 +2076,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2130 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); 2076 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2131 2077
2132 out: 2078 out:
2133 nilfs_btree_release_path(path);
2134 nilfs_btree_free_path(path); 2079 nilfs_btree_free_path(path);
2135 2080
2136 return ret; 2081 return ret;
@@ -2175,7 +2120,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2175 path = nilfs_btree_alloc_path(); 2120 path = nilfs_btree_alloc_path();
2176 if (path == NULL) 2121 if (path == NULL)
2177 return -ENOMEM; 2122 return -ENOMEM;
2178 nilfs_btree_init_path(path);
2179 2123
2180 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); 2124 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2181 if (ret < 0) { 2125 if (ret < 0) {
@@ -2195,7 +2139,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2195 nilfs_bmap_set_dirty(&btree->bt_bmap); 2139 nilfs_bmap_set_dirty(&btree->bt_bmap);
2196 2140
2197 out: 2141 out:
2198 nilfs_btree_release_path(path);
2199 nilfs_btree_free_path(path); 2142 nilfs_btree_free_path(path);
2200 return ret; 2143 return ret;
2201} 2144}
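
The refactoring folds nilfs_btree_init_path() into the allocator and nilfs_btree_release_path() into the destructor, so a path can no longer be allocated but left uninitialized, and every call site above loses two lines of boilerplate. Call sites now read (condensed from nilfs_btree_lookup() above):

    path = nilfs_btree_alloc_path();    /* allocates and initializes   */
    if (path == NULL)
            return -ENOMEM;
    ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
    nilfs_btree_free_path(path);        /* brelse()s every level, then */
                                        /* returns path to the slab    */
    return ret;

The unrelated-looking "int maxlevel = 0" change in nilfs_btree_propagate_v() presumably quiets a may-be-used-uninitialized compiler warning on an error path; the diff does not say so explicitly.
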
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4b82d84ade75..af638d59e3bf 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -30,9 +30,6 @@
30#include "btnode.h" 30#include "btnode.h"
31#include "bmap.h" 31#include "bmap.h"
32 32
33struct nilfs_btree;
34struct nilfs_btree_path;
35
36/** 33/**
37 * struct nilfs_btree - B-tree structure 34 * struct nilfs_btree - B-tree structure
38 * @bt_bmap: bmap base structure 35 * @bt_bmap: bmap base structure
@@ -41,6 +38,25 @@ struct nilfs_btree {
 	struct nilfs_bmap bt_bmap;
 };
 
+/**
+ * struct nilfs_btree_path - A path on which B-tree operations are executed
+ * @bp_bh: buffer head of node block
+ * @bp_sib_bh: buffer head of sibling node block
+ * @bp_index: index of child node
+ * @bp_oldreq: ptr end request for old ptr
+ * @bp_newreq: ptr alloc request for new ptr
+ * @bp_op: rebalance operation
+ */
+struct nilfs_btree_path {
+	struct buffer_head *bp_bh;
+	struct buffer_head *bp_sib_bh;
+	int bp_index;
+	union nilfs_bmap_ptr_req bp_oldreq;
+	union nilfs_bmap_ptr_req bp_newreq;
+	struct nilfs_btnode_chkey_ctxt bp_ctxt;
+	void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
+		      int, __u64 *, __u64 *);
+};
 
 #define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE
 #define NILFS_BTREE_ROOT_NCHILDREN_MAX \
@@ -57,6 +73,7 @@ struct nilfs_btree {
 #define NILFS_BTREE_KEY_MIN ((__u64)0)
 #define NILFS_BTREE_KEY_MAX (~(__u64)0)
 
+extern struct kmem_cache *nilfs_btree_path_cache;
 
 int nilfs_btree_path_cache_init(void);
 void nilfs_btree_path_cache_destroy(void);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 9d1e5de91afb..013146755683 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -288,7 +288,7 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
  * @vblocknrs and @nitems.
  *
  * Return Value: On success, 0 is returned. On error, one of the following
- * nagative error codes is returned.
+ * negative error codes is returned.
  *
  * %-EIO - I/O error.
  *
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 0092840492ee..85c89dfc71f0 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -396,7 +396,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
 		/* next page is past the blocks we've got */
 		if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
 			nilfs_error(dir->i_sb, __func__,
-				    "dir %lu size %lld exceeds block cout %llu",
+				    "dir %lu size %lld exceeds block count %llu",
 				    dir->i_ino, dir->i_size,
 				    (unsigned long long)dir->i_blocks);
 			goto out;
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e16a6664dfa2..145f03cd7d3e 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,10 +28,10 @@
  * gcinodes), and this file provides lookup function of the dummy
  * inodes and their buffer read function.
  *
- * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it
+ * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
  * has to treat blocks that belong to a same file but have different
  * checkpoint numbers. To avoid interference among generations, dummy
- * inodes are managed separatly from actual inodes, and their lookup
+ * inodes are managed separately from actual inodes, and their lookup
  * function (nilfs_gc_iget) is designed to be specified with a
  * checkpoint number argument as well as an inode number.
  *
@@ -45,6 +45,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/hash.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include "nilfs.h"
 #include "page.h"
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7868cc122ac7..5e226d4b41d3 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/writeback.h>
 #include <linux/uio.h>
@@ -450,7 +451,7 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 		inode->i_op = &nilfs_special_inode_operations;
 		init_special_inode(
 			inode, inode->i_mode,
-			new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+			huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
 	}
 	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
 	brelse(bh);
@@ -510,7 +511,7 @@ void nilfs_write_inode_common(struct inode *inode,
 		nilfs_bmap_write(ii->i_bmap, raw_inode);
 	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		raw_inode->i_device_code =
-			cpu_to_le64(new_encode_dev(inode->i_rdev));
+			cpu_to_le64(huge_encode_dev(inode->i_rdev));
 	/* When extending inode, nilfs->ns_inode_size should be checked
 	   for substitutions of appended fields */
 }
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 313d0a21da48..f90a33d9a5b0 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -23,6 +23,7 @@
 #include <linux/fs.h>
 #include <linux/wait.h>
 #include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
+#include <linux/slab.h>
 #include <linux/capability.h> /* capable() */
 #include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
 #include <linux/vmalloc.h>
@@ -648,7 +649,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
 long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
-	void __user *argp = (void * __user *)arg;
+	void __user *argp = (void __user *)arg;
 
 	switch (cmd) {
 	case NILFS_IOCTL_CHANGE_CPMODE:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 06713ffcc7f2..024be8c35bb6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -26,6 +26,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
+#include <linux/slab.h>
 #include "nilfs.h"
 #include "segment.h"
 #include "page.h"
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a2692bbc7b50..8de3e1e48130 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -29,6 +29,7 @@
 #include <linux/list.h>
 #include <linux/highmem.h>
 #include <linux/pagevec.h>
+#include <linux/gfp.h>
 #include "nilfs.h"
 #include "page.h"
 #include "mdt.h"
@@ -292,7 +293,7 @@ void nilfs_free_private_page(struct page *page)
  * @src: source page
  * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
  *
- * This fuction is for both data pages and btnode pages. The dirty flag
+ * This function is for both data pages and btnode pages. The dirty flag
  * should be treated by caller. The page must not be under i/o.
  * Both src and dst page must be locked
  */
@@ -388,7 +389,7 @@ repeat:
 }
 
 /**
- * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache
+ * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
  * @dmap: destination page cache
  * @smap: source page cache
  *
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 017bedc761a0..bae2a516b4ee 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -23,6 +23,7 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/swap.h>
+#include <linux/slab.h>
 #include <linux/crc32.h>
 #include "nilfs.h"
 #include "segment.h"
@@ -104,6 +105,8 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
 
 	ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
 	ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
+
+	/* need to verify ->ss_bytes field if read ->ss_cno */
 }
 
 /**
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index ab56fe44e377..2e6a2723b8fa 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -25,6 +25,7 @@
 #include <linux/writeback.h>
 #include <linux/crc32.h>
 #include <linux/backing-dev.h>
+#include <linux/slab.h>
 #include "page.h"
 #include "segbuf.h"
 
@@ -32,42 +33,17 @@
 struct nilfs_write_info {
 	struct the_nilfs *nilfs;
 	struct bio *bio;
 	int start, end; /* The region to be submitted */
 	int rest_blocks;
 	int max_pages;
 	int nr_vecs;
 	sector_t blocknr;
 };
 
-
 static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 			      struct the_nilfs *nilfs);
 static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
 
-
-static struct kmem_cache *nilfs_segbuf_cachep;
-
-static void nilfs_segbuf_init_once(void *obj)
-{
-	memset(obj, 0, sizeof(struct nilfs_segment_buffer));
-}
-
-int __init nilfs_init_segbuf_cache(void)
-{
-	nilfs_segbuf_cachep =
-		kmem_cache_create("nilfs2_segbuf_cache",
-				  sizeof(struct nilfs_segment_buffer),
-				  0, SLAB_RECLAIM_ACCOUNT,
-				  nilfs_segbuf_init_once);
-
-	return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
-}
-
-void nilfs_destroy_segbuf_cache(void)
-{
-	kmem_cache_destroy(nilfs_segbuf_cachep);
-}
-
 struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 {
 	struct nilfs_segment_buffer *segbuf;
@@ -80,6 +56,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 	INIT_LIST_HEAD(&segbuf->sb_list);
 	INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
 	INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
 
 	init_completion(&segbuf->sb_bio_event);
 	atomic_set(&segbuf->sb_err, 0);
@@ -157,7 +134,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
 }
 
 int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
-		       time_t ctime)
+		       time_t ctime, __u64 cno)
 {
 	int err;
 
@@ -170,11 +147,12 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
 	segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
 	segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
 	segbuf->sb_sum.ctime = ctime;
+	segbuf->sb_sum.cno = cno;
 	return 0;
 }
 
 /*
- * Setup segument summary
+ * Setup segment summary
  */
 void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
 {
@@ -195,13 +173,14 @@ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
 	raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
 	raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
 	raw_sum->ss_pad = 0;
+	raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno);
 }
 
 /*
  * CRC calculation routines
  */
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
-				     u32 seed)
+static void
+nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
@@ -228,8 +207,8 @@ void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_sumsum = cpu_to_le32(crc);
 }
 
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
-				   u32 seed)
+static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
+					  u32 seed)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
@@ -255,6 +234,20 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_datasum = cpu_to_le32(crc);
 }
 
+static void
+nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
+				    u32 seed)
+{
+	struct nilfs_super_root *raw_sr;
+	u32 crc;
+
+	raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
+	crc = crc32_le(seed,
+		       (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
+		       NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
+	raw_sr->sr_sum = cpu_to_le32(crc);
+}
+
 static void nilfs_release_buffers(struct list_head *list)
 {
 	struct buffer_head *bh, *n;
@@ -281,6 +274,7 @@ static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
 {
 	nilfs_release_buffers(&segbuf->sb_segsum_buffers);
 	nilfs_release_buffers(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
 }
 
 /*
@@ -323,14 +317,31 @@ int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
 int nilfs_wait_on_logs(struct list_head *logs)
 {
 	struct nilfs_segment_buffer *segbuf;
-	int err;
+	int err, ret = 0;
 
 	list_for_each_entry(segbuf, logs, sb_list) {
 		err = nilfs_segbuf_wait(segbuf);
-		if (err)
-			return err;
+		if (err && !ret)
+			ret = err;
+	}
+	return ret;
+}
+
+/**
+ * nilfs_add_checksums_on_logs - add checksums on the logs
+ * @logs: list of segment buffers storing target logs
+ * @seed: checksum seed value
+ */
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		if (segbuf->sb_super_root)
+			nilfs_segbuf_fill_in_super_root_crc(segbuf, seed);
+		nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
+		nilfs_segbuf_fill_in_data_crc(segbuf, seed);
 	}
-	return 0;
 }
 
 /*
@@ -470,8 +481,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
  *
  * %-ENOMEM - Insufficient memory available.
  */
-int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
-		       struct the_nilfs *nilfs)
+static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+			      struct the_nilfs *nilfs)
 {
 	struct nilfs_write_info wi;
 	struct buffer_head *bh;
@@ -514,7 +525,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
  *
  * %-EIO - I/O error
  */
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
+static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
 {
 	int err = 0;
 
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 94dfd3517bc0..fdf1c3b6d673 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -37,6 +37,7 @@
  * @sumbytes: Byte count of segment summary
  * @nfileblk: Total number of file blocks
  * @seg_seq: Segment sequence number
+ * @cno: Checkpoint number
  * @ctime: Creation time
  * @next: Block number of the next full segment
  */
@@ -48,6 +49,7 @@ struct nilfs_segsum_info {
 	unsigned long sumbytes;
 	unsigned long nfileblk;
 	u64 seg_seq;
+	__u64 cno;
 	time_t ctime;
 	sector_t next;
 };
@@ -76,6 +78,7 @@ struct nilfs_segsum_info {
  * @sb_rest_blocks: Number of residual blocks in the current segment
  * @sb_segsum_buffers: List of buffers for segment summaries
  * @sb_payload_buffers: List of buffers for segment payload
+ * @sb_super_root: Pointer to buffer storing a super root block (if exists)
  * @sb_nbio: Number of flying bio requests
  * @sb_err: I/O error status
  * @sb_bio_event: Completion event of log writing
@@ -95,6 +98,7 @@ struct nilfs_segment_buffer {
 	/* Buffers */
 	struct list_head sb_segsum_buffers;
 	struct list_head sb_payload_buffers; /* including super root */
+	struct buffer_head *sb_super_root;
 
 	/* io status */
 	int sb_nbio;
@@ -121,6 +125,7 @@ struct nilfs_segment_buffer {
 			    b_assoc_buffers))
 #define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
 
+extern struct kmem_cache *nilfs_segbuf_cachep;
 
 int __init nilfs_init_segbuf_cache(void);
 void nilfs_destroy_segbuf_cache(void);
@@ -132,13 +137,11 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
 		       struct nilfs_segment_buffer *prev);
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
 				  struct the_nilfs *);
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
 int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
 int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
 				struct buffer_head **);
 void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
 
 static inline void
 nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
@@ -171,6 +174,7 @@ void nilfs_truncate_logs(struct list_head *logs,
 			struct nilfs_segment_buffer *last);
 int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
 int nilfs_wait_on_logs(struct list_head *logs);
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed);
 
 static inline void nilfs_destroy_logs(struct list_head *logs)
 {
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index ada2f1b947a3..c9201649cc49 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -32,6 +32,7 @@
 #include <linux/kthread.h>
 #include <linux/crc32.h>
 #include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "nilfs.h"
 #include "btnode.h"
 #include "page.h"
@@ -115,42 +116,6 @@ static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
 #define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
 #define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
 
-/*
- * Transaction
- */
-static struct kmem_cache *nilfs_transaction_cachep;
-
-/**
- * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
- *
- * nilfs_init_transaction_cache() creates a slab cache for the struct
- * nilfs_transaction_info.
- *
- * Return Value: On success, it returns 0. On error, one of the following
- * negative error code is returned.
- *
- * %-ENOMEM - Insufficient memory available.
- */
-int nilfs_init_transaction_cache(void)
-{
-	nilfs_transaction_cachep =
-		kmem_cache_create("nilfs2_transaction_cache",
-				  sizeof(struct nilfs_transaction_info),
-				  0, SLAB_RECLAIM_ACCOUNT, NULL);
-	return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
-}
-
-/**
- * nilfs_detroy_transaction_cache - destroy the cache for transaction info
- *
- * nilfs_destroy_transaction_cache() frees the slab cache for the struct
- * nilfs_transaction_info.
- */
-void nilfs_destroy_transaction_cache(void)
-{
-	kmem_cache_destroy(nilfs_transaction_cachep);
-}
-
 static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
 {
 	struct nilfs_transaction_info *cur_ti = current->journal_info;
@@ -201,7 +166,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
  * This function allocates a nilfs_transaction_info struct to keep context
  * information on it. It is initialized and hooked onto the current task in
  * the outermost call. If a pre-allocated struct is given to @ti, it is used
- * instead; othewise a new struct is assigned from a slab.
+ * instead; otherwise a new struct is assigned from a slab.
  *
  * When @vacancy_check flag is set, this function will check the amount of
  * free space, and will wait for the GC to reclaim disk space if low capacity.
@@ -401,7 +366,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
 
 	if (nilfs_doing_gc())
 		flags = NILFS_SS_GC;
-	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime);
+	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
+				 sci->sc_sbi->s_nilfs->ns_cno);
 	if (unlikely(err))
 		return err;
 
@@ -434,7 +400,7 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
 			return err;
 		segbuf = sci->sc_curseg;
 	}
-	err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root);
+	err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root);
 	if (likely(!err))
 		segbuf->sb_sum.flags |= NILFS_SS_SR;
 	return err;
@@ -598,7 +564,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
 	*vblocknr = binfo->bi_v.bi_vblocknr;
 }
 
-struct nilfs_sc_operations nilfs_sc_file_ops = {
+static struct nilfs_sc_operations nilfs_sc_file_ops = {
 	.collect_data = nilfs_collect_file_data,
 	.collect_node = nilfs_collect_file_node,
 	.collect_bmap = nilfs_collect_file_bmap,
@@ -648,7 +614,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
 	*binfo_dat = binfo->bi_dat;
 }
 
-struct nilfs_sc_operations nilfs_sc_dat_ops = {
+static struct nilfs_sc_operations nilfs_sc_dat_ops = {
 	.collect_data = nilfs_collect_dat_data,
 	.collect_node = nilfs_collect_file_node,
 	.collect_bmap = nilfs_collect_dat_bmap,
@@ -656,7 +622,7 @@ struct nilfs_sc_operations nilfs_sc_dat_ops = {
 	.write_node_binfo = nilfs_write_dat_node_binfo,
 };
 
-struct nilfs_sc_operations nilfs_sc_dsync_ops = {
+static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
 	.collect_data = nilfs_collect_file_data,
 	.collect_node = NULL,
 	.collect_bmap = NULL,
@@ -931,43 +897,16 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
 	}
 }
 
-/*
- * CRC calculation routines
- */
-static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
-{
-	struct nilfs_super_root *raw_sr =
-		(struct nilfs_super_root *)bh_sr->b_data;
-	u32 crc;
-
-	crc = crc32_le(seed,
-		       (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
-		       NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
-	raw_sr->sr_sum = cpu_to_le32(crc);
-}
-
-static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
-					    u32 seed)
-{
-	struct nilfs_segment_buffer *segbuf;
-
-	if (sci->sc_super_root)
-		nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
-
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
-		nilfs_segbuf_fill_in_data_crc(segbuf, seed);
-	}
-}
-
 static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 					     struct the_nilfs *nilfs)
 {
-	struct buffer_head *bh_sr = sci->sc_super_root;
-	struct nilfs_super_root *raw_sr =
-		(struct nilfs_super_root *)bh_sr->b_data;
+	struct buffer_head *bh_sr;
+	struct nilfs_super_root *raw_sr;
 	unsigned isz = nilfs->ns_inode_size;
 
+	bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
+	raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
+
 	raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
 	raw_sr->sr_nongc_ctime
 		= cpu_to_le64(nilfs_doing_gc() ?
@@ -1490,7 +1429,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 
 	/* Collection retry loop */
 	for (;;) {
-		sci->sc_super_root = NULL;
 		sci->sc_nblk_this_inc = 0;
 		sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
 
@@ -1510,6 +1448,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 		if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
 			break;
 
+		nilfs_clear_logs(&sci->sc_segbufs);
+
+		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
 		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
 			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
 							sci->sc_freesegs,
@@ -1517,12 +1461,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 							NULL);
 			WARN_ON(err); /* do not happen */
 		}
-		nilfs_clear_logs(&sci->sc_segbufs);
-
-		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
-		if (unlikely(err))
-			return err;
-
 		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
 		sci->sc_stage = prev_stage;
 	}
@@ -1567,7 +1505,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
 	ssp.offset = sizeof(struct nilfs_segment_summary);
 
 	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
-		if (bh == sci->sc_super_root)
+		if (bh == segbuf->sb_super_root)
 			break;
 		if (!finfo) {
 			finfo = nilfs_segctor_map_segsum_entry(
@@ -1728,7 +1666,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			if (bh == sci->sc_super_root) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					lock_page(bd_page);
 					clear_page_dirty_for_io(bd_page);
@@ -1847,7 +1785,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
 }
 
 static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
-			     struct buffer_head *bh_sr, int err)
+			     int err)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
@@ -1868,7 +1806,7 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			if (bh == bh_sr) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
 					bd_page = bh->b_page;
@@ -1897,8 +1835,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 
 	list_splice_tail_init(&sci->sc_write_logs, &logs);
 	ret = nilfs_wait_on_logs(&logs);
-	if (ret)
-		nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
+	nilfs_abort_logs(&logs, NULL, ret ? : err);
 
 	list_splice_tail_init(&sci->sc_segbufs, &logs);
 	nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -1914,7 +1851,6 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 	}
 
 	nilfs_destroy_logs(&logs);
-	sci->sc_super_root = NULL;
 }
 
 static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1933,7 +1869,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
 	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
-	int update_sr = (sci->sc_super_root != NULL);
+	int update_sr = false;
 
 	list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
 		struct buffer_head *bh;
@@ -1964,11 +1900,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 			set_buffer_uptodate(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
-			if (bh == sci->sc_super_root) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
 					bd_page = bh->b_page;
 				}
+				update_sr = true;
 				break;
 			}
 			if (bh->b_page != fs_page) {
@@ -2115,7 +2052,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct page *failed_page;
-	int err, has_sr = 0;
+	int err;
 
 	sci->sc_stage.scnt = NILFS_ST_INIT;
 
@@ -2143,8 +2080,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		if (unlikely(err))
 			goto failed;
 
-		has_sr = (sci->sc_super_root != NULL);
-
 		/* Avoid empty segment */
 		if (sci->sc_stage.scnt == NILFS_ST_DONE &&
 		    NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
@@ -2159,7 +2094,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
 			nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
 
-		if (has_sr) {
+		if (mode == SC_LSEG_SR &&
+		    sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
 			err = nilfs_segctor_fill_in_checkpoint(sci);
 			if (unlikely(err))
 				goto failed_to_write;
@@ -2171,11 +2107,12 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		/* Write partial segments */
 		err = nilfs_segctor_prepare_write(sci, &failed_page);
 		if (err) {
-			nilfs_abort_logs(&sci->sc_segbufs, failed_page,
-					 sci->sc_super_root, err);
+			nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
 			goto failed_to_write;
 		}
-		nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
+
+		nilfs_add_checksums_on_logs(&sci->sc_segbufs,
+					    nilfs->ns_crc_seed);
 
 		err = nilfs_segctor_write(sci, nilfs);
 		if (unlikely(err))
@@ -2196,8 +2133,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		}
 	} while (sci->sc_stage.scnt != NILFS_ST_DONE);
 
-	sci->sc_super_root = NULL;
-
  out:
 	nilfs_segctor_check_out_files(sci, sbi);
 	return err;
@@ -2214,7 +2149,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 }
 
 /**
- * nilfs_secgtor_start_timer - set timer of background write
+ * nilfs_segctor_start_timer - set timer of background write
  * @sci: nilfs_sc_info
  *
  * If the timer has already been set, it ignores the new request.
@@ -2224,9 +2159,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
 {
 	spin_lock(&sci->sc_state_lock);
-	if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
-		sci->sc_timer->expires = jiffies + sci->sc_interval;
-		add_timer(sci->sc_timer);
+	if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
+		sci->sc_timer.expires = jiffies + sci->sc_interval;
+		add_timer(&sci->sc_timer);
 		sci->sc_state |= NILFS_SEGCTOR_COMMIT;
 	}
 	spin_unlock(&sci->sc_state_lock);
@@ -2431,9 +2366,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
 	spin_lock(&sci->sc_state_lock);
 	sci->sc_seq_accepted = sci->sc_seq_request;
 	spin_unlock(&sci->sc_state_lock);
-
-	if (sci->sc_timer)
-		del_timer_sync(sci->sc_timer);
+	del_timer_sync(&sci->sc_timer);
 }
 
 /**
@@ -2459,9 +2392,9 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
 			sci->sc_flush_request &= ~FLUSH_DAT_BIT;
 
 		/* re-enable timer if checkpoint creation was not done */
-		if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
-		    time_before(jiffies, sci->sc_timer->expires))
-			add_timer(sci->sc_timer);
+		if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+		    time_before(jiffies, sci->sc_timer.expires))
+			add_timer(&sci->sc_timer);
 	}
 	spin_unlock(&sci->sc_state_lock);
 }
@@ -2640,13 +2573,10 @@ static int nilfs_segctor_thread(void *arg)
 {
 	struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
 	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
-	struct timer_list timer;
 	int timeout = 0;
 
-	init_timer(&timer);
-	timer.data = (unsigned long)current;
-	timer.function = nilfs_construction_timeout;
-	sci->sc_timer = &timer;
+	sci->sc_timer.data = (unsigned long)current;
+	sci->sc_timer.function = nilfs_construction_timeout;
 
 	/* start sync. */
 	sci->sc_task = current;
@@ -2695,7 +2625,7 @@ static int nilfs_segctor_thread(void *arg)
 			should_sleep = 0;
 		else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
 			should_sleep = time_before(jiffies,
-					sci->sc_timer->expires);
+					sci->sc_timer.expires);
 
 		if (should_sleep) {
 			spin_unlock(&sci->sc_state_lock);
@@ -2704,7 +2634,7 @@ static int nilfs_segctor_thread(void *arg)
 		}
 		finish_wait(&sci->sc_wait_daemon, &wait);
 		timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
-			   time_after_eq(jiffies, sci->sc_timer->expires));
+			   time_after_eq(jiffies, sci->sc_timer.expires));
 
 		if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
 			set_nilfs_discontinued(nilfs);
@@ -2713,8 +2643,6 @@ static int nilfs_segctor_thread(void *arg)
 
  end_thread:
 	spin_unlock(&sci->sc_state_lock);
-	del_timer_sync(sci->sc_timer);
-	sci->sc_timer = NULL;
 
 	/* end sync. */
 	sci->sc_task = NULL;
@@ -2750,13 +2678,6 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
 	}
 }
 
-static int nilfs_segctor_init(struct nilfs_sc_info *sci)
-{
-	sci->sc_seq_done = sci->sc_seq_request;
-
-	return nilfs_segctor_start_thread(sci);
-}
-
 /*
  * Setup & clean-up functions
  */
@@ -2780,6 +2701,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
 	INIT_LIST_HEAD(&sci->sc_write_logs);
 	INIT_LIST_HEAD(&sci->sc_gc_inodes);
 	INIT_LIST_HEAD(&sci->sc_copied_buffers);
+	init_timer(&sci->sc_timer);
 
 	sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
 	sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
@@ -2846,6 +2768,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 
 	down_write(&sbi->s_nilfs->ns_segctor_sem);
 
+	del_timer_sync(&sci->sc_timer);
 	kfree(sci);
 }
 
@@ -2854,7 +2777,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
  * @sbi: nilfs_sb_info
  *
  * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
- * initilizes it, and starts the segment constructor.
+ * initializes it, and starts the segment constructor.
  *
  * Return Value: On success, 0 is returned. On error, one of the following
  * negative error code is returned.
@@ -2880,7 +2803,7 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
 		return -ENOMEM;
 
 	nilfs_attach_writer(nilfs, sbi);
-	err = nilfs_segctor_init(NILFS_SC(sbi));
+	err = nilfs_segctor_start_thread(NILFS_SC(sbi));
 	if (err) {
 		nilfs_detach_writer(nilfs, sbi);
 		kfree(sbi->s_sc_info);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 3155e0c7f415..dca142361ccf 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -30,7 +30,7 @@
30#include "sb.h" 30#include "sb.h"
31 31
32/** 32/**
33 * struct nilfs_recovery_info - Recovery infomation 33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status 34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root 35 * @ri_super_root: Block number of the last super root
36 * @ri_ri_cno: Number of the last checkpoint 36 * @ri_ri_cno: Number of the last checkpoint
@@ -71,7 +71,7 @@ struct nilfs_recovery_info {
  */
 struct nilfs_cstage {
 	int scnt;
 	unsigned flags;
 	struct nilfs_inode_info *dirty_file_ptr;
 	struct nilfs_inode_info *gc_inode_ptr;
 };
@@ -100,7 +100,6 @@ struct nilfs_segsum_pointer {
  * @sc_write_logs: List of segment buffers to hold logs under writing
  * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
  * @sc_curseg: Current segment buffer
- * @sc_super_root: Pointer to the super root buffer
  * @sc_stage: Collection stage
  * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
  * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
@@ -148,7 +147,6 @@ struct nilfs_sc_info {
 	struct list_head sc_write_logs;
 	unsigned long sc_segbuf_nblocks;
 	struct nilfs_segment_buffer *sc_curseg;
-	struct buffer_head *sc_super_root;
 
 	struct nilfs_cstage sc_stage;
 
@@ -179,7 +177,7 @@ struct nilfs_sc_info {
 	unsigned long sc_lseg_stime; /* in 1/HZ seconds */
 	unsigned long sc_watermark;
 
-	struct timer_list *sc_timer;
+	struct timer_list sc_timer;
 	struct task_struct *sc_task;
 };
 
@@ -219,6 +217,8 @@ enum {
  */
 #define NILFS_SC_DEFAULT_WATERMARK 3600
 
+/* super.c */
+extern struct kmem_cache *nilfs_transaction_cachep;
 
 /* segment.c */
 extern int nilfs_init_transaction_cache(void);
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index b6c36d0cc331..3c6cc6005c2e 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,7 +18,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * Written by Koji Sato <koji@osrg.net>.
- * Rivised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
  */
 
 #include <linux/kernel.h>
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 92579cc4c935..03b34b738993 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -67,6 +67,11 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
67 "(NILFS)"); 67 "(NILFS)");
68MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
69 69
70struct kmem_cache *nilfs_inode_cachep;
71struct kmem_cache *nilfs_transaction_cachep;
72struct kmem_cache *nilfs_segbuf_cachep;
73struct kmem_cache *nilfs_btree_path_cache;
74
70static int nilfs_remount(struct super_block *sb, int *flags, char *data); 75static int nilfs_remount(struct super_block *sb, int *flags, char *data);
71 76
72/** 77/**
@@ -129,7 +134,6 @@ void nilfs_warning(struct super_block *sb, const char *function,
 	va_end(args);
 }
 
-static struct kmem_cache *nilfs_inode_cachep;
 
 struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
 {
@@ -155,34 +159,6 @@ void nilfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
 
-static void init_once(void *obj)
-{
-	struct nilfs_inode_info *ii = obj;
-
-	INIT_LIST_HEAD(&ii->i_dirty);
-#ifdef CONFIG_NILFS_XATTR
-	init_rwsem(&ii->xattr_sem);
-#endif
-	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
-	ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
-	inode_init_once(&ii->vfs_inode);
-}
-
-static int nilfs_init_inode_cache(void)
-{
-	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
-					       sizeof(struct nilfs_inode_info),
-					       0, SLAB_RECLAIM_ACCOUNT,
-					       init_once);
-
-	return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
-}
-
-static inline void nilfs_destroy_inode_cache(void)
-{
-	kmem_cache_destroy(nilfs_inode_cachep);
-}
-
 static void nilfs_clear_inode(struct inode *inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -266,8 +242,8 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
 	int err;
 
 	/* nilfs->sem must be locked by the caller. */
-	if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) {
-		if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC)
+	if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+		if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC))
 			nilfs_swap_super_block(nilfs);
 		else {
 			printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
@@ -436,7 +412,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	/*
 	 * Compute the overhead
 	 *
-	 * When distributing meta data blocks outside semgent structure,
+	 * When distributing meta data blocks outside segment structure,
 	 * We must count them as the overhead.
 	 */
 	overhead = 0;
@@ -470,10 +446,10 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (nilfs_test_opt(sbi, SNAPSHOT))
 		seq_printf(seq, ",cp=%llu",
 			   (unsigned long long int)sbi->s_snapshot_cno);
-	if (nilfs_test_opt(sbi, ERRORS_RO))
-		seq_printf(seq, ",errors=remount-ro");
 	if (nilfs_test_opt(sbi, ERRORS_PANIC))
 		seq_printf(seq, ",errors=panic");
+	if (nilfs_test_opt(sbi, ERRORS_CONT))
+		seq_printf(seq, ",errors=continue");
 	if (nilfs_test_opt(sbi, STRICT_ORDER))
 		seq_printf(seq, ",order=strict");
 	if (nilfs_test_opt(sbi, NORECOVERY))
@@ -631,7 +607,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
 			   struct nilfs_super_block *sbp)
 {
 	sbi->s_mount_opt =
-		NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
+		NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
 }
 
 static int nilfs_setup_super(struct nilfs_sb_info *sbi)
@@ -749,6 +725,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	sb->s_export_op = &nilfs_export_ops;
 	sb->s_root = NULL;
 	sb->s_time_gran = 1;
+	sb->s_bdi = nilfs->ns_bdi;
 
 	err = load_nilfs(nilfs, sbi);
 	if (err)
@@ -777,9 +754,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 			goto failed_sbi;
 		}
 		cno = sbi->s_snapshot_cno;
-	} else
-		/* Read-only mount */
-		sbi->s_snapshot_cno = cno;
+	}
 	}
 
 	err = nilfs_attach_checkpoint(sbi, cno);
@@ -848,7 +823,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	unsigned long old_sb_flags;
 	struct nilfs_mount_options old_opts;
-	int err;
+	int was_snapshot, err;
 
 	lock_kernel();
 
@@ -856,6 +831,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
 	old_opts.snapshot_cno = sbi->s_snapshot_cno;
+	was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
 
 	if (!parse_options(data, sb)) {
 		err = -EINVAL;
@@ -863,20 +839,32 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
863 } 839 }
864 sb->s_flags = (sb->s_flags & ~MS_POSIXACL); 840 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
865 841
866 if ((*flags & MS_RDONLY) && 842 err = -EINVAL;
867 sbi->s_snapshot_cno != old_opts.snapshot_cno) { 843 if (was_snapshot) {
868 printk(KERN_WARNING "NILFS (device %s): couldn't " 844 if (!(*flags & MS_RDONLY)) {
869 "remount to a different snapshot. \n", 845 printk(KERN_ERR "NILFS (device %s): cannot remount "
870 sb->s_id); 846 "snapshot read/write.\n",
871 err = -EINVAL; 847 sb->s_id);
872 goto restore_opts; 848 goto restore_opts;
849 } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) {
850 printk(KERN_ERR "NILFS (device %s): cannot "
851 "remount to a different snapshot.\n",
852 sb->s_id);
853 goto restore_opts;
854 }
855 } else {
856 if (nilfs_test_opt(sbi, SNAPSHOT)) {
857 printk(KERN_ERR "NILFS (device %s): cannot change "
858 "a regular mount to a snapshot.\n",
859 sb->s_id);
860 goto restore_opts;
861 }
873 } 862 }
874 863
875 if (!nilfs_valid_fs(nilfs)) { 864 if (!nilfs_valid_fs(nilfs)) {
876 printk(KERN_WARNING "NILFS (device %s): couldn't " 865 printk(KERN_WARNING "NILFS (device %s): couldn't "
877 "remount because the filesystem is in an " 866 "remount because the filesystem is in an "
878 "incomplete recovery state.\n", sb->s_id); 867 "incomplete recovery state.\n", sb->s_id);
879 err = -EINVAL;
880 goto restore_opts; 868 goto restore_opts;
881 } 869 }
882 870
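
The rewritten checks above encode a simple policy: a snapshot mount must stay read-only and keep pointing at the same checkpoint, and a regular mount cannot be turned into a snapshot by remount. Stripped of the error reporting, the decision reduces to the following sketch (function and parameter names are illustrative only):

    /* Sketch of the remount policy enforced by the hunk above. */
    static int remount_allowed(int was_snapshot, int want_rdonly,
                               int now_snapshot, __u64 old_cno, __u64 new_cno)
    {
            if (was_snapshot)
                    /* Snapshots stay read-only, pinned to one checkpoint. */
                    return want_rdonly && new_cno == old_cno;
            /* A regular mount must not become a snapshot. */
            return !now_snapshot;
    }
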
@@ -887,9 +875,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
887 nilfs_detach_segment_constructor(sbi); 875 nilfs_detach_segment_constructor(sbi);
888 sb->s_flags |= MS_RDONLY; 876 sb->s_flags |= MS_RDONLY;
889 877
890 sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
891 /* nilfs_set_opt(sbi, SNAPSHOT); */
892
893 /* 878 /*
894 * Remounting a valid RW partition RDONLY, so set 879 * Remounting a valid RW partition RDONLY, so set
895 * the RDONLY flag and then mark the partition as valid again. 880 * the RDONLY flag and then mark the partition as valid again.
@@ -908,24 +893,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
908 * store the current valid flag. (It may have been changed 893 * store the current valid flag. (It may have been changed
909 * by fsck since we originally mounted the partition.) 894 * by fsck since we originally mounted the partition.)
910 */ 895 */
911 if (nilfs->ns_current && nilfs->ns_current != sbi) {
912 printk(KERN_WARNING "NILFS (device %s): couldn't "
913 "remount because an RW-mount exists.\n",
914 sb->s_id);
915 err = -EBUSY;
916 goto restore_opts;
917 }
918 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
919 printk(KERN_WARNING "NILFS (device %s): couldn't "
920 "remount because the current RO-mount is not "
921 "the latest one.\n",
922 sb->s_id);
923 err = -EINVAL;
924 goto restore_opts;
925 }
926 sb->s_flags &= ~MS_RDONLY; 896 sb->s_flags &= ~MS_RDONLY;
927 nilfs_clear_opt(sbi, SNAPSHOT);
928 sbi->s_snapshot_cno = 0;
929 897
930 err = nilfs_attach_segment_constructor(sbi); 898 err = nilfs_attach_segment_constructor(sbi);
931 if (err) 899 if (err)
@@ -934,8 +902,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
934 down_write(&nilfs->ns_sem); 902 down_write(&nilfs->ns_sem);
935 nilfs_setup_super(sbi); 903 nilfs_setup_super(sbi);
936 up_write(&nilfs->ns_sem); 904 up_write(&nilfs->ns_sem);
937
938 nilfs->ns_current = sbi;
939 } 905 }
940 out: 906 out:
941 up_write(&nilfs->ns_super_sem); 907 up_write(&nilfs->ns_super_sem);
@@ -1021,10 +987,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1021{ 987{
1022 struct nilfs_super_data sd; 988 struct nilfs_super_data sd;
1023 struct super_block *s; 989 struct super_block *s;
990 fmode_t mode = FMODE_READ;
1024 struct the_nilfs *nilfs; 991 struct the_nilfs *nilfs;
1025 int err, need_to_close = 1; 992 int err, need_to_close = 1;
1026 993
1027 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); 994 if (!(flags & MS_RDONLY))
995 mode |= FMODE_WRITE;
996
997 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
1028 if (IS_ERR(sd.bdev)) 998 if (IS_ERR(sd.bdev))
1029 return PTR_ERR(sd.bdev); 999 return PTR_ERR(sd.bdev);
1030 1000
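
The hunk above also fixes a type confusion: open_bdev_exclusive() expects an fmode_t open mode, but the old code handed it the MS_* mount-flag word, which lives in an unrelated bit space. The mode is now derived explicitly, and the same value must accompany every later close_bdev_exclusive() call, as the following hunks do:

    /* Derive the block-device open mode from the mount flags. */
    fmode_t mode = FMODE_READ;

    if (!(flags & MS_RDONLY))
            mode |= FMODE_WRITE;
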
@@ -1091,10 +1061,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1091 1061
1092 /* New superblock instance created */ 1062 /* New superblock instance created */
1093 s->s_flags = flags; 1063 s->s_flags = flags;
1064 s->s_mode = mode;
1094 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1065 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1095 sb_set_blocksize(s, block_size(sd.bdev)); 1066 sb_set_blocksize(s, block_size(sd.bdev));
1096 1067
1097 err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs); 1068 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
1069 nilfs);
1098 if (err) 1070 if (err)
1099 goto cancel_new; 1071 goto cancel_new;
1100 1072
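
On the nilfs_fill_super() call above: MS_VERBOSE is the deprecated name for the same flag bit as MS_SILENT, and a fill_super routine's third argument is conventionally a 0/1 "silent" value, hence the explicit normalization rather than passing the raw bit through:

    /* Normalize the flag bit to the conventional 0/1 "silent" value. */
    int silent = (flags & MS_SILENT) ? 1 : 0;
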
@@ -1105,7 +1077,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1105 mutex_unlock(&nilfs->ns_mount_mutex); 1077 mutex_unlock(&nilfs->ns_mount_mutex);
1106 put_nilfs(nilfs); 1078 put_nilfs(nilfs);
1107 if (need_to_close) 1079 if (need_to_close)
1108 close_bdev_exclusive(sd.bdev, flags); 1080 close_bdev_exclusive(sd.bdev, mode);
1109 simple_set_mnt(mnt, s); 1081 simple_set_mnt(mnt, s);
1110 return 0; 1082 return 0;
1111 1083
@@ -1113,7 +1085,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1113 mutex_unlock(&nilfs->ns_mount_mutex); 1085 mutex_unlock(&nilfs->ns_mount_mutex);
1114 put_nilfs(nilfs); 1086 put_nilfs(nilfs);
1115 failed: 1087 failed:
1116 close_bdev_exclusive(sd.bdev, flags); 1088 close_bdev_exclusive(sd.bdev, mode);
1117 1089
1118 return err; 1090 return err;
1119 1091
@@ -1123,7 +1095,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1123 put_nilfs(nilfs); 1095 put_nilfs(nilfs);
1124 deactivate_locked_super(s); 1096 deactivate_locked_super(s);
1125 /* 1097 /*
1126 * deactivate_super() invokes close_bdev_exclusive(). 1098 * deactivate_locked_super() invokes close_bdev_exclusive().
1127 * We must finish all post-cleaning before this call; 1099 * We must finish all post-cleaning before this call;
1128 * put_nilfs() needs the block device. 1100 * put_nilfs() needs the block device.
1129 */ 1101 */
@@ -1138,54 +1110,93 @@ struct file_system_type nilfs_fs_type = {
1138 .fs_flags = FS_REQUIRES_DEV, 1110 .fs_flags = FS_REQUIRES_DEV,
1139}; 1111};
1140 1112
1141static int __init init_nilfs_fs(void) 1113static void nilfs_inode_init_once(void *obj)
1142{ 1114{
1143 int err; 1115 struct nilfs_inode_info *ii = obj;
1144
1145 err = nilfs_init_inode_cache();
1146 if (err)
1147 goto failed;
1148 1116
1149 err = nilfs_init_transaction_cache(); 1117 INIT_LIST_HEAD(&ii->i_dirty);
1150 if (err) 1118#ifdef CONFIG_NILFS_XATTR
1151 goto failed_inode_cache; 1119 init_rwsem(&ii->xattr_sem);
1120#endif
1121 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
1122 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
1123 inode_init_once(&ii->vfs_inode);
1124}
1152 1125
1153 err = nilfs_init_segbuf_cache(); 1126static void nilfs_segbuf_init_once(void *obj)
1154 if (err) 1127{
1155 goto failed_transaction_cache; 1128 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
1129}
1156 1130
1157 err = nilfs_btree_path_cache_init(); 1131static void nilfs_destroy_cachep(void)
1158 if (err) 1132{
1159 goto failed_segbuf_cache; 1133 if (nilfs_inode_cachep)
1134 kmem_cache_destroy(nilfs_inode_cachep);
1135 if (nilfs_transaction_cachep)
1136 kmem_cache_destroy(nilfs_transaction_cachep);
1137 if (nilfs_segbuf_cachep)
1138 kmem_cache_destroy(nilfs_segbuf_cachep);
1139 if (nilfs_btree_path_cache)
1140 kmem_cache_destroy(nilfs_btree_path_cache);
1141}
1160 1142
1161 err = register_filesystem(&nilfs_fs_type); 1143static int __init nilfs_init_cachep(void)
1162 if (err) 1144{
1163 goto failed_btree_path_cache; 1145 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
1146 sizeof(struct nilfs_inode_info), 0,
1147 SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
1148 if (!nilfs_inode_cachep)
1149 goto fail;
1150
1151 nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache",
1152 sizeof(struct nilfs_transaction_info), 0,
1153 SLAB_RECLAIM_ACCOUNT, NULL);
1154 if (!nilfs_transaction_cachep)
1155 goto fail;
1156
1157 nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache",
1158 sizeof(struct nilfs_segment_buffer), 0,
1159 SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once);
1160 if (!nilfs_segbuf_cachep)
1161 goto fail;
1162
1163 nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache",
1164 sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
1165 0, 0, NULL);
1166 if (!nilfs_btree_path_cache)
1167 goto fail;
1164 1168
1165 return 0; 1169 return 0;
1166 1170
1167 failed_btree_path_cache: 1171fail:
1168 nilfs_btree_path_cache_destroy(); 1172 nilfs_destroy_cachep();
1173 return -ENOMEM;
1174}
1175
1176static int __init init_nilfs_fs(void)
1177{
1178 int err;
1169 1179
1170 failed_segbuf_cache: 1180 err = nilfs_init_cachep();
1171 nilfs_destroy_segbuf_cache(); 1181 if (err)
1182 goto fail;
1172 1183
1173 failed_transaction_cache: 1184 err = register_filesystem(&nilfs_fs_type);
1174 nilfs_destroy_transaction_cache(); 1185 if (err)
1186 goto free_cachep;
1175 1187
1176 failed_inode_cache: 1188 printk(KERN_INFO "NILFS version 2 loaded\n");
1177 nilfs_destroy_inode_cache(); 1189 return 0;
1178 1190
1179 failed: 1191free_cachep:
1192 nilfs_destroy_cachep();
1193fail:
1180 return err; 1194 return err;
1181} 1195}
1182 1196
1183static void __exit exit_nilfs_fs(void) 1197static void __exit exit_nilfs_fs(void)
1184{ 1198{
1185 nilfs_destroy_segbuf_cache(); 1199 nilfs_destroy_cachep();
1186 nilfs_destroy_transaction_cache();
1187 nilfs_destroy_inode_cache();
1188 nilfs_btree_path_cache_destroy();
1189 unregister_filesystem(&nilfs_fs_type); 1200 unregister_filesystem(&nilfs_fs_type);
1190} 1201}
1191 1202
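
The rework above collapses four hand-rolled cache init/destroy pairs into nilfs_init_cachep()/nilfs_destroy_cachep(). The central API is kmem_cache_create(), whose constructor runs once when a slab object is first created rather than on every allocation, which is why only fields that stay valid across object reuse (list heads, locks, and the like) belong in it. A minimal self-contained sketch of the pattern; every name below is illustrative, not from the patch:

    #include <linux/list.h>
    #include <linux/slab.h>

    struct example_obj {
            struct list_head list;
    };

    static struct kmem_cache *example_cachep;

    /* Constructor: runs once per slab object, not once per allocation. */
    static void example_init_once(void *obj)
    {
            struct example_obj *e = obj;

            INIT_LIST_HEAD(&e->list);
    }

    static int __init example_init_cachep(void)
    {
            example_cachep = kmem_cache_create("example_cache",
                            sizeof(struct example_obj), 0,
                            SLAB_RECLAIM_ACCOUNT, example_init_once);
            return example_cachep ? 0 : -ENOMEM;
    }
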
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 92733d5651d2..a756168a21c2 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -386,7 +386,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
386 386
387 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); 387 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
388 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { 388 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
389 printk(KERN_ERR "NILFS: too short segment. \n"); 389 printk(KERN_ERR "NILFS: too short segment.\n");
390 return -EINVAL; 390 return -EINVAL;
391 } 391 }
392 392
@@ -486,11 +486,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
486 printk(KERN_WARNING 486 printk(KERN_WARNING
487 "NILFS warning: unable to read secondary superblock\n"); 487 "NILFS warning: unable to read secondary superblock\n");
488 488
489 /*
490 * Compare two super blocks and set 1 in swp if the secondary
491 * super block is valid and newer. Otherwise, set 0 in swp.
492 */
489 valid[0] = nilfs_valid_sb(sbp[0]); 493 valid[0] = nilfs_valid_sb(sbp[0]);
490 valid[1] = nilfs_valid_sb(sbp[1]); 494 valid[1] = nilfs_valid_sb(sbp[1]);
491 swp = valid[1] && 495 swp = valid[1] && (!valid[0] ||
492 (!valid[0] || 496 le64_to_cpu(sbp[1]->s_last_cno) >
493 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); 497 le64_to_cpu(sbp[0]->s_last_cno));
494 498
495 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { 499 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
496 brelse(sbh[1]); 500 brelse(sbh[1]);
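
Besides adding the comment, the hunk above changes the tie-breaker between the two on-disk super blocks from write time (s_wtime) to the last checkpoint number (s_last_cno); checkpoint numbers are monotonic, so the comparison remains correct even if the system clock moved backwards between super block writes. In isolation the rule is (helper name is illustrative):

    /* Prefer the secondary super block only when it is valid and records
     * a newer checkpoint than the primary (or the primary is invalid). */
    static int prefer_sb2(struct nilfs_super_block *sbp[2], int valid[2])
    {
            return valid[1] && (!valid[0] ||
                                le64_to_cpu(sbp[1]->s_last_cno) >
                                le64_to_cpu(sbp[0]->s_last_cno));
    }
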
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index e9795f1724d7..1ab974533697 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/slab.h>
32#include "sb.h" 33#include "sb.h"
33 34
34/* the_nilfs struct */ 35/* the_nilfs struct */
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 037e878e03fc..fcc2f064af83 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/dcache.h> 19#include <linux/dcache.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/gfp.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/srcu.h> 24#include <linux/srcu.h>
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3165d85aada2..0399bcbe09c8 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -87,7 +87,6 @@
87#include <linux/kernel.h> 87#include <linux/kernel.h>
88#include <linux/module.h> 88#include <linux/module.h>
89#include <linux/mutex.h> 89#include <linux/mutex.h>
90#include <linux/slab.h>
91#include <linux/spinlock.h> 90#include <linux/spinlock.h>
92#include <linux/writeback.h> /* for inode_lock */ 91#include <linux/writeback.h> /* for inode_lock */
93 92
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 3e56dbffe729..b3a159b21cfd 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,6 +15,7 @@ config INOTIFY
15 15
16config INOTIFY_USER 16config INOTIFY_USER
17 bool "Inotify support for userspace" 17 bool "Inotify support for userspace"
18 select ANON_INODES
18 select FSNOTIFY 19 select FSNOTIFY
19 default y 20 default y
20 ---help--- 21 ---help---
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1afb0a10229f..e27960cd76ab 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -28,6 +28,7 @@
28#include <linux/path.h> /* struct path */ 28#include <linux/path.h> /* struct path */
29#include <linux/slab.h> /* kmem_* */ 29#include <linux/slab.h> /* kmem_* */
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/sched.h>
31 32
32#include "inotify.h" 33#include "inotify.h"
33 34
@@ -146,6 +147,7 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
146 idr_for_each(&group->inotify_data.idr, idr_callback, group); 147 idr_for_each(&group->inotify_data.idr, idr_callback, group);
147 idr_remove_all(&group->inotify_data.idr); 148 idr_remove_all(&group->inotify_data.idr);
148 idr_destroy(&group->inotify_data.idr); 149 idr_destroy(&group->inotify_data.idr);
150 free_uid(group->inotify_data.user);
149} 151}
150 152
151void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) 153void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
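
The free_uid() added above releases the user_struct reference taken when the inotify group was created; without it, tearing down a group leaked the per-user accounting structure. The shape of the pairing, with the create-time half reconstructed as an assumption rather than quoted from this diff:

    /* At group creation: take a counted reference on the user ... */
    group->inotify_data.user = get_uid(user);
    /* ... and release it in the group's private-data destructor. */
    free_uid(group->inotify_data.user);
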
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 472cdf29ef82..e46ca685b9be 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -546,21 +546,24 @@ retry:
546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) 546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
547 goto out_err; 547 goto out_err;
548 548
549 /* we are putting the mark on the idr, take a reference */
550 fsnotify_get_mark(&tmp_ientry->fsn_entry);
551
549 spin_lock(&group->inotify_data.idr_lock); 552 spin_lock(&group->inotify_data.idr_lock);
550 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, 553 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
551 group->inotify_data.last_wd+1, 554 group->inotify_data.last_wd+1,
552 &tmp_ientry->wd); 555 &tmp_ientry->wd);
553 spin_unlock(&group->inotify_data.idr_lock); 556 spin_unlock(&group->inotify_data.idr_lock);
554 if (ret) { 557 if (ret) {
558 /* we didn't get on the idr, drop the idr reference */
559 fsnotify_put_mark(&tmp_ientry->fsn_entry);
560
555 /* idr was out of memory allocate and try again */ 561 /* idr was out of memory allocate and try again */
556 if (ret == -EAGAIN) 562 if (ret == -EAGAIN)
557 goto retry; 563 goto retry;
558 goto out_err; 564 goto out_err;
559 } 565 }
560 566
561 /* we put the mark on the idr, take a reference */
562 fsnotify_get_mark(&tmp_ientry->fsn_entry);
563
564 /* we are on the idr, now get on the inode */ 567 /* we are on the idr, now get on the inode */
565 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); 568 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
566 if (ret) { 569 if (ret) {
@@ -578,16 +581,13 @@ retry:
578 /* return the watch descriptor for this new entry */ 581 /* return the watch descriptor for this new entry */
579 ret = tmp_ientry->wd; 582 ret = tmp_ientry->wd;
580 583
581 /* match the ref from fsnotify_init_markentry() */
582 fsnotify_put_mark(&tmp_ientry->fsn_entry);
583
584 /* if this mark added a new event update the group mask */ 584 /* if this mark added a new event update the group mask */
585 if (mask & ~group->mask) 585 if (mask & ~group->mask)
586 fsnotify_recalc_group_mask(group); 586 fsnotify_recalc_group_mask(group);
587 587
588out_err: 588out_err:
589 if (ret < 0) 589 /* match the ref from fsnotify_init_markentry() */
590 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); 590 fsnotify_put_mark(&tmp_ientry->fsn_entry);
591 591
592 return ret; 592 return ret;
593} 593}
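
Both inotify_user.c hunks above fix the same reference-counting race: the reference the idr holds on a mark must be taken before idr_get_new_above() publishes it, since a concurrent lookup can find and use the entry the moment it lands in the idr; conversely, the initial reference from fsnotify_init_markentry() is now dropped unconditionally at out_err whether or not the watch was added. The publish pattern in general form, with hypothetical names:

    /* Take the table's reference *before* the object becomes findable. */
    get_ref(obj);
    err = publish(table, obj);
    if (err) {
            put_ref(obj);   /* never published: drop the table's ref */
            return err;
    }
    /* From here on, concurrent lookups may already hold a reference. */
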
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
deleted file mode 100644
index 37c11e194372..000000000000
--- a/fs/ntfs/ChangeLog
+++ /dev/null
@@ -1,1702 +0,0 @@
1ToDo/Notes:
2 - Find and fix bugs.
3 - The only places in the kernel where a file is resized are
4 ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is
5 held. Just have to be careful in read-/writepage and other helpers
6 not running under i_mutex that we play nice. Also need to be careful
7 with initialized_size extension in ntfs_file_write*() and writepage.
8 UPDATE: The only things that need to be checked are the compressed
9 write and the other attribute resize/write cases like index
10 attributes, etc. For now none of these are implemented so they are safe.
11 - Implement filling in of holes in aops.c::ntfs_writepage() and its
12 helpers.
13 - Implement mft.c::sync_mft_mirror_umount(). We currently will just
14 leave the volume dirty on umount if the final iput(vol->mft_ino)
15 causes a write of any mirrored mft records due to the mft mirror
16 inode having been discarded already. Whether this can actually ever
17 happen is unclear however so it is worth waiting until someone hits
18 the problem.
19
202.1.29 - Fix a deadlock at mount time.
21
22 - During mount the VFS holds s_umount lock on the superblock. So when
23 we try to empty the journal $LogFile contents by calling
24 ntfs_attr_set() when the machine does not have much memory and the
25 journal is large ntfs_attr_set() results in the VM trying to balance
26 dirty pages which in turn tries to take the s_umount lock and thus we
27 get a deadlock. The solution is to not use ntfs_attr_set() and
28 instead do the zeroing by hand at the block level rather than page
29 cache level.
30 - Fix sparse warnings.
31
322.1.28 - Fix a deadlock.
33
34 - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey
35 Vlasov for the report and detailed analysis of the deadlock. The fix
36 involved getting rid of ntfs_put_inode() altogether and hence NTFS no
37 longer has a ->put_inode super operation.
38
392.1.27 - Various bug fixes and cleanups.
40
41 - Fix two compiler warnings on Alpha. Thanks to Andrew Morton for
42 reporting them.
43 - Fix an (innocent) off-by-one error in the runlist code.
44 - Fix a buggette in an "should be impossible" case handling where we
45 continued the attribute lookup loop instead of aborting it.
46 - Use buffer_migrate_page() for the ->migratepage function of all ntfs
47 address space operations.
48 - Fix comparison of $MFT and $MFTMirr to not bail out when there are
49 unused, invalid mft records which are the same in both $MFT and
50 $MFTMirr.
51 - Add support for sparse files which have a compression unit of 0.
52 - Remove all the make_bad_inode() calls. This should only be called
53 from read inode and new inode code paths.
54 - Limit name length in fs/ntfs/unistr.c::ntfs_nlstoucs() to maximum
55 allowed by NTFS, i.e. 255 Unicode characters, not including the
56 terminating NULL (which is not stored on disk).
57 - Improve comments on file attribute flags in fs/ntfs/layout.h.
58 - Fix a bug in fs/ntfs/inode.c::ntfs_read_locked_index_inode() where we
59 forgot to update a temporary variable so loading index inodes which
60 have an index allocation attribute failed.
61 - Add a missing call to flush_dcache_mft_record_page() in
62 fs/ntfs/inode.c::ntfs_write_inode().
63 - Handle the recently introduced -ENAMETOOLONG return value from
64 fs/ntfs/unistr.c::ntfs_nlstoucs() in fs/ntfs/namei.c::ntfs_lookup().
65 - Semaphore to mutex conversion. (Ingo Molnar)
66
672.1.26 - Minor bug fixes and updates.
68
69 - Fix a potential overflow in file.c where a cast to s64 was missing in
70 a left shift of a page index.
71 - The struct inode has had its i_sem semaphore changed to a mutex named
72 i_mutex.
73 - We have struct kmem_cache now so use it instead of the typedef
74 kmem_cache_t. (Pekka Enberg)
75 - Implement support for sector sizes above 512 bytes (up to the maximum
76 supported by NTFS which is 4096 bytes).
77 - Do more detailed reporting of why we cannot mount read-write by
78 special casing the VOLUME_MODIFIED_BY_CHKDSK flag.
79 - Miscellaneous updates to layout.h.
80 - Cope with attribute list attribute having invalid flags. Windows
81 copes with this and even chkdsk does not detect or fix this so we
82 have to cope with it, too. Thanks to Pawel Kot for reporting the
83 problem.
84
852.1.25 - (Almost) fully implement write(2) and truncate(2).
86
87 - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
88 {__,}ntfs_cluster_free() to also take an optional attribute search
89 context as argument. This allows calling these functions with the
90 mft record mapped. Update all callers.
91 - Fix potential deadlock in ntfs_mft_data_extend_allocation_nolock()
92 error handling by passing in the active search context when calling
93 ntfs_cluster_free().
94 - Change ntfs_cluster_alloc() to take an extra boolean parameter
95 specifying whether the cluster are being allocated to extend an
96 attribute or to fill a hole.
97 - Change ntfs_attr_make_non_resident() to call ntfs_cluster_alloc()
98 with @is_extension set to TRUE and remove the runlist terminator
99 fixup code as this is now done by ntfs_cluster_alloc().
100 - Change ntfs_attr_make_non_resident to take the attribute value size
101 as an extra parameter. This is needed since we need to know the size
102 before we can map the mft record and our callers always know it. The
103 reason we cannot simply read the size from the vfs inode i_size is
104 that this is not necessarily uptodate. This happens when
105 ntfs_attr_make_non_resident() is called in the ->truncate call path.
106 - Fix ntfs_attr_make_non_resident() to update the vfs inode i_blocks
107 which is zero for a resident attribute but should no longer be zero
108 once the attribute is non-resident as it then has real clusters
109 allocated.
110 - Add fs/ntfs/attrib.[hc]::ntfs_attr_extend_allocation(), a function to
111 extend the allocation of an attribute. Optionally, the data size,
112 but not the initialized size can be extended, too.
113 - Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports
114 uncompressed and unencrypted files and it never creates sparse files
115 at least for the moment (making a file sparse requires us to modify
116 its directory entries and we do not support directory operations at
117 the moment). Also, support for highly fragmented files, i.e. ones
118 whose data attribute is split across multiple extents, is severely
119 limited. When such a case is encountered, EOPNOTSUPP is returned.
120 - Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes
121 the initial implementation of file truncation. Now both open(2)ing
122 a file with the O_TRUNC flag and the {,f}truncate(2) system calls
123 will resize a file appropriately. The limitations are that only
124 uncompressed and unencrypted files are supported. Also, there is
125 only very limited support for highly fragmented files (the ones whose
126 $DATA attribute is split into multiple attribute extents).
127 - In attrib.c::ntfs_attr_set() call balance_dirty_pages_ratelimited()
128 and cond_resched() in the main loop as we could be dirtying a lot of
129 pages and this ensures we play nice with the VM and the system as a
130 whole.
131 - Implement file operations ->write, ->aio_write, ->writev for regular
132 files. This replaces the old use of generic_file_write(), et al and
133 the address space operations ->prepare_write and ->commit_write.
134 This means that both sparse and non-sparse (unencrypted and
135 uncompressed) files can now be extended using the normal write(2)
136 code path. There are two limitations at present and these are that
137 we never create sparse files and that we only have limited support
138 for highly fragmented files, i.e. ones whose data attribute is split
139 across multiple extents. When such a case is encountered,
140 EOPNOTSUPP is returned.
141 - $EA attributes can be both resident and non-resident.
142 - Use %z for size_t to fix compilation warnings. (Andrew Morton)
143 - Fix compilation warnings with gcc-4.0.2 on SUSE 10.0.
144 - Document extended attribute ($EA) NEED_EA flag. (Based on libntfs
145 patch by Yura Pakhuchiy.)
146
1472.1.24 - Lots of bug fixes and support more clean journal states.
148
149 - Support journals ($LogFile) which have been modified by chkdsk. This
150 means users can boot into Windows after we marked the volume dirty.
151 The Windows boot will run chkdsk and then reboot. The user can then
152 immediately boot into Linux rather than having to do a full Windows
153 boot first before rebooting into Linux and we will recognize such a
154 journal and empty it as it is clean by definition. Note, this only
155 works if chkdsk left the journal in an obviously clean state.
156 - Support journals ($LogFile) with only one restart page as well as
157 journals with two different restart pages. We sanity check both and
158 either use the only sane one or the more recent one of the two in the
159 case that both are valid.
160 - Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
161 ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
162 hence cannot fail.
163 - Use ntfs_malloc_nofs_nofail() in the two critical regions in
164 fs/ntfs/runlist.c::ntfs_runlists_merge(). This means we no longer
165 need to panic() if the allocation fails as it now cannot fail.
166 - Fix two nasty runlist merging bugs that had gone unnoticed so far.
167 Thanks to Stefano Picerno for the bug report.
168 - Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
169 - Fix handling of valid but empty mapping pairs array in
170 fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().
171 - Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING
172 messages and include the inode number. Thanks to Yura Pakhuchiy for
173 pointing this out.
174 - Change ntfs_rl_truncate_nolock() to throw away the runlist if the new
175 length is zero.
176 - Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller
177 specified hole into a runlist.
178 - Fix a bug in fs/ntfs/index.c::ntfs_index_lookup(). When the returned
179 index entry is in the index root, we forgot to set the @ir pointer in
180 the index context. Thanks to Yura Pakhuchiy for finding this bug.
181 - Remove bogus setting of PageError in ntfs_read_compressed_block().
182 - Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().
183 - Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect
184 access to the allocated size in the ntfs inode with the size lock.
185 - Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to
186 return LCN_ENOENT when there is no runlist and the allocated size is
187 zero.
188 - Fix load_attribute_list() to handle the case of a NULL runlist.
189 - Fix handling of sparse attributes in ntfs_attr_make_non_resident().
190 - Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set()
191 to ensure that these functions are never called for compressed or
192 encrypted attributes.
193 - Fix cluster (de)allocators to work when the runlist is NULL and more
194 importantly to take a locked runlist rather than them locking it
195 which leads to lock reversal.
196 - Truncate {a,c,m}time to the ntfs supported time granularity when
197 updating the times in the inode in ntfs_setattr().
198 - Fixup handling of sparse, compressed, and encrypted attributes in
199 fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode(),
200 fs/ntfs/aops.c::ntfs_{read,write}page().
201 - Make ntfs_write_block() not instantiate sparse blocks if they contain
202 only zeroes.
203 - Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
204 lock protection over the buffer submission for i/o which allows the
205 removal of the get_bh()/put_bh() pairs for each buffer.
206 - Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
207 where a concurrent truncate has truncated the runlist under our feet.
208 - Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
209 - In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock
210 in the first buffer head instead of a driver global spin lock to
211 improve scalability.
212 - Minor fix to error handling and error message display in
213 fs/ntfs/aops.c::ntfs_prepare_nonresident_write().
214 - Change the mount options {u,f,d}mask to always parse the number as
215 an octal number to conform to how chmod(1) works, too. Thanks to
216 Giuseppe Bilotta and Horst von Brand for pointing out the errors of
217 my ways.
218 - Fix various bugs in the runlist merging code. (Based on libntfs
219 changes by Richard Russon.)
220 - Fix sparse warnings that have crept in over time.
221 - Change ntfs_cluster_free() to require a write locked runlist on entry
222 since we otherwise get into a lock reversal deadlock if a read locked
223 runlist is passed in. In the process also change it to take an ntfs
224 inode instead of a vfs inode as parameter.
225 - Fix the definition of the CHKD ntfs record magic. It had an off by
226 two error causing it to be CHKB instead of CHKD.
227 - Fix a stupid bug in __ntfs_bitmap_set_bits_in_run() which caused the
228 count to become negative and hence we had a wild memset() scribbling
229 all over the system's RAM.
230
2312.1.23 - Implement extension of resident files and make writing safe as well as
232 many bug fixes, cleanups, and enhancements...
233
234 - Add printk rate limiting for ntfs_warning() and ntfs_error() when
235 compiled without debug. This avoids a possible denial of service
236 attack. Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
237 out.
238 - Fix compilation warnings on ia64. (Randy Dunlap)
239 - Use i_size_{read,write}() instead of reading i_size by hand and cache
240 the value where appropriate.
241 - Add size_lock to the ntfs_inode structure. This is an rw spinlock
242 and it locks against access to the inode sizes. Note, ->size_lock
243 is also accessed from irq context so you must use the _irqsave and
244 _irqrestore lock and unlock functions, respectively. Protect all
245 accesses to allocated_size, initialized_size, and compressed_size.
246 - Minor optimization to fs/ntfs/super.c::ntfs_statfs() and its helpers.
247 - Implement extension of resident files in the regular file write code
248 paths (fs/ntfs/aops.c::ntfs_{prepare,commit}_write()). At present
249 this only works until the data attribute becomes too big for the mft
250 record after which we abort the write returning -EOPNOTSUPP from
251 ntfs_prepare_write().
252 - Add disable_sparse mount option together with a per volume sparse
253 enable bit which is set appropriately and a per inode sparse disable
254 bit which is preset on some system file inodes as appropriate.
255 - Enforce that sparse support is disabled on NTFS volumes pre 3.0.
256 - Fix a bug in fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress() in
257 the creation of the unmapped runlist element for the base attribute
258 extent.
259 - Split ntfs_map_runlist() into ntfs_map_runlist() and a non-locking
260 helper ntfs_map_runlist_nolock() which is used by ntfs_map_runlist().
261 This allows us to map runlist fragments with the runlist lock already
262 held without having to drop and reacquire it around the call. Adapt
263 all callers.
264 - Change ntfs_find_vcn() to ntfs_find_vcn_nolock() which takes a locked
265 runlist. This allows us to find runlist elements with the runlist
266 lock already held without having to drop and reacquire it around the
267 call. Adapt all callers.
268 - Change time to u64 in time.h::ntfs2utc() as it otherwise generates a
269 warning in the do_div() call on sparc32. Thanks to Meelis Roos for
270 the report and analysis of the warning.
271 - Fix a nasty runlist merge bug when merging two holes.
272 - Set the ntfs_inode->allocated_size to the real allocated size in the
273 mft record for resident attributes (fs/ntfs/inode.c).
274 - Small readability cleanup to use "a" instead of "ctx->attr"
275 everywhere (fs/ntfs/inode.c).
276 - Make fs/ntfs/namei.c::ntfs_get_{parent,dentry} static and move the
277 definition of ntfs_export_ops from fs/ntfs/super.c to namei.c. Also,
278 declare ntfs_export_ops in fs/ntfs/ntfs.h.
279 - Correct sparse file handling. The compressed values need to be
280 checked and set in the ntfs inode as done for compressed files and
281 the compressed size needs to be used for vfs inode->i_blocks instead
282 of the allocated size, again, as done for compressed files.
283 - Add AT_EA in addition to AT_DATA to whitelist for being allowed to be
284 non-resident in fs/ntfs/attrib.c::ntfs_attr_can_be_non_resident().
285 - Add fs/ntfs/attrib.c::ntfs_attr_vcn_to_lcn_nolock() used by the new
286 write code.
287 - Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where after
288 dropping the read lock and taking the write lock we were not checking
289 whether someone else did not already do the work we wanted to do.
290 - Rename fs/ntfs/attrib.c::ntfs_find_vcn_nolock() to
291 ntfs_attr_find_vcn_nolock() and update all callers.
292 - Add fs/ntfs/attrib.[hc]::ntfs_attr_make_non_resident().
293 - Fix sign of various error return values to be negative in
294 fs/ntfs/lcnalloc.c.
295 - Modify ->readpage and ->writepage (fs/ntfs/aops.c) so they detect and
296 handle the case where an attribute is converted from resident to
297 non-resident by a concurrent file write.
298 - Remove checks for NULL before calling kfree() since kfree() does the
299 checking itself. (Jesper Juhl)
300 - Some utilities modify the boot sector but do not update the checksum.
301 Thus, relax the checking in fs/ntfs/super.c::is_boot_sector_ntfs() to
302 only emit a warning when the checksum is incorrect rather than
303 refusing the mount. Thanks to Bernd Casimir for pointing this
304 problem out.
305 - Update attribute definition handling.
306 - Add NTFS_MAX_CLUSTER_SIZE and NTFS_MAX_PAGES_PER_CLUSTER constants.
307 - Use NTFS_MAX_CLUSTER_SIZE in super.c instead of hard coding 0x10000.
308 - Use MAX_BUF_PER_PAGE instead of variable sized array allocation for
309 better code generation and one less sparse warning in fs/ntfs/aops.c.
310 - Remove spurious void pointer casts from fs/ntfs/. (Pekka Enberg)
311 - Use C99 style structure initialization after memory allocation where
312 possible (fs/ntfs/{attrib.c,index.c,super.c}). Thanks to Al Viro and
313 Pekka Enberg.
314 - Stamp the transaction log ($UsnJrnl), aka user space journal, if it
315 is active on the volume and we are mounting read-write or remounting
316 from read-only to read-write.
317 - Fix a bug in address space operations error recovery code paths where
318 if the runlist was not mapped at all and a mapping error occurred we
319 would leave the runlist locked on exit to the function so that the
320 next access to the same file would try to take the lock and deadlock.
321 - Detect the case when Windows has been suspended to disk on the volume
322 to be mounted and if this is the case do not allow (re)mounting
323 read-write. This is done by parsing hiberfil.sys if present.
324 - Fix several occurrences of a bug where we would perform 'var & ~const'
325 with a 64-bit variable and an int, i.e. 32-bit, constant. This causes
326 the higher order 32-bits of the 64-bit variable to be zeroed. To fix
327 this cast the 'const' to the same 64-bit type as 'var'.
328 - Change the runlist terminator of the newly allocated cluster(s) to
329 LCN_ENOENT in ntfs_attr_make_non_resident(). Otherwise the runlist
330 code gets confused.
331 - Add an extra parameter @last_vcn to ntfs_get_size_for_mapping_pairs()
332 and ntfs_mapping_pairs_build() to allow the runlist encoding to be
333 partial which is desirable when filling holes in sparse attributes.
334 Update all callers.
335 - Change ntfs_map_runlist_nolock() to only decompress the mapping pairs
336 if the requested vcn is inside it. Otherwise we get into problems
337 when we try to map an out of bounds vcn because we then try to map
338 the already mapped runlist fragment which causes
339 ntfs_mapping_pairs_decompress() to fail and return error. Update
340 ntfs_attr_find_vcn_nolock() accordingly.
341 - Fix a nasty deadlock that appeared in recent kernels.
342 The situation: VFS inode X on a mounted ntfs volume is dirty. For
343 same inode X, the ntfs_inode is dirty and thus corresponding on-disk
344 inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
345 to the table of inodes, i.e. $MFT, inode 0.
346 What happens:
347 Process 1: sys_sync()/umount()/whatever... calls
348 __sync_single_inode() for $MFT -> do_writepages() -> write_page for
349 the dirty page containing the on-disk inode X, the page is now locked
350 -> ntfs_write_mst_block() which clears PageUptodate() on the page to
351 prevent anyone else getting hold of it whilst it does the write out.
352 This is necessary as the on-disk inode needs "fixups" applied before
353 the write to disk which are removed again after the write and
354 PageUptodate is then set again. It then analyses the page looking
355 for dirty on-disk inodes and when it finds one it calls
356 ntfs_may_write_mft_record() to see if it is safe to write this
357 on-disk inode. This then calls ilookup5() to check if the
358 corresponding VFS inode is in icache(). This in turn calls ifind()
359 which waits on the inode lock via wait_on_inode whilst holding the
360 global inode_lock.
361 Process 2: pdflush results in a call to __sync_single_inode for the
362 same VFS inode X on the ntfs volume. This locks the inode (I_LOCK)
363 then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
364 read_cache_page() for the page (in page cache of table of inodes
365 $MFT, inode 0) containing the on-disk inode. This page has
366 PageUptodate() clear because of Process 1 (see above) so
367 read_cache_page() blocks when it tries to take the page lock for the
368 page so it can call ntfs_read_page().
369 Thus Process 1 is holding the page lock on the page containing the
370 on-disk inode X and it is waiting on the inode X to be unlocked in
371 ifind() so it can write the page out and then unlock the page.
372 And Process 2 is holding the inode lock on inode X and is waiting for
373 the page to be unlocked so it can call ntfs_readpage() or discover
374 that Process 1 set PageUptodate() again and use the page.
375 Thus we have a deadlock due to ifind() waiting on the inode lock.
376 The solution: The fix is to use the newly introduced
377 ilookup5_nowait() which does not wait on the inode's lock and hence
378 avoids the deadlock. This is safe as we do not care about the VFS
379 inode and only use the fact that it is in the VFS inode cache and the
380 fact that the vfs and ntfs inodes are one struct in memory to find
381 the ntfs inode in memory if present. Also, the ntfs inode has its
382 own locking so it does not matter if the vfs inode is locked.
383 - Fix bug in mft record writing where we forgot to set the device in
384 the buffers when mapping them after the VM had discarded them.
385 Thanks to Martin MOKREJŠ for the bug report.
386
3872.1.22 - Many bug and race fixes and error handling improvements.
388
389 - Improve error handling in fs/ntfs/inode.c::ntfs_truncate().
390 - Change fs/ntfs/inode.c::ntfs_truncate() to return an error code
391 instead of void and provide a helper ntfs_truncate_vfs() for the
392 vfs ->truncate method.
393 - Add a new ntfs inode flag NInoTruncateFailed() and modify
394 fs/ntfs/inode.c::ntfs_truncate() to set and clear it appropriately.
395 - Fix min_size and max_size definitions in ATTR_DEF structure in
396 fs/ntfs/layout.h to be signed.
397 - Add attribute definition handling helpers to fs/ntfs/attrib.[hc]:
398 ntfs_attr_size_bounds_check(), ntfs_attr_can_be_non_resident(), and
399 ntfs_attr_can_be_resident(), which in turn use the new private helper
400 ntfs_attr_find_in_attrdef().
401 - In fs/ntfs/aops.c::mark_ntfs_record_dirty(), take the
402 mapping->private_lock around the dirtying of the buffer heads
403 analogous to the way it is done in __set_page_dirty_buffers().
404 - Ensure the mft record size does not exceed the PAGE_CACHE_SIZE at
405 mount time as this cannot work with the current implementation.
406 - Check for location of attribute name and improve error handling in
407 general in fs/ntfs/inode.c::ntfs_read_locked_inode() and friends.
408 - In fs/ntfs/aops.c::ntfs_writepage(), if the page is fully outside
409 i_size, i.e. race with truncate, invalidate the buffers on the page
410 so that they become freeable and hence the page does not leak.
411 - Remove unused function fs/ntfs/runlist.c::ntfs_rl_merge(). (Adrian
412 Bunk)
413 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_find() that resulted in
414 a NULL pointer dereference in the error code path when a corrupt
415 attribute was found. (Thanks to Domen Puncer for the bug report.)
416 - Add MODULE_VERSION() to fs/ntfs/super.c.
417 - Make several functions and variables static. (Adrian Bunk)
418 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() so it allocates
419 buffers for the page if they are not present and then marks the
420 buffers belonging to the ntfs record dirty. This causes the buffers
421 to become busy and hence they are safe from removal until the page
422 has been written out.
423 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() in the
424 error handling code path that resulted in a BUG() due to trying to
425 unmap an extent mft record when the mapping of it had failed and it
426 thus was not mapped. (Thanks to Ken MacFerrin for the bug report.)
427 - Drop the runlist lock after the vcn has been read in
428 fs/ntfs/lcnalloc.c::__ntfs_cluster_free().
429 - Rewrite handling of multi sector transfer errors. We now do not set
430 PageError() when such errors are detected in the async i/o handler
431 fs/ntfs/aops.c::ntfs_end_buffer_async_read(). All users of mst
432 protected attributes now check the magic of each ntfs record as they
433 use it and act appropriately. This has the effect of making errors
434 granular per ntfs record rather than per page which solves the case
435 where we cannot access any of the ntfs records in a page when a
436 single one of them had an mst error. (Thanks to Ken MacFerrin for
437 the bug report.)
438 - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
439 where we failed to release i_mutex on the $Quota/$Q attribute inode.
440 - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
441 - Add mapping of unmapped buffers to all remaining code paths, i.e.
442 fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
443 and write_mft_record_nolock(). From now on we require that the
444 complete runlist for the mft mirror is always mapped into memory.
445 - Add creation of buffers to fs/ntfs/mft.c::ntfs_sync_mft_mirror().
446 - Improve error handling in fs/ntfs/aops.c::ntfs_{read,write}_block().
447 - Cleanup fs/ntfs/aops.c::ntfs_{read,write}page() since we know that a
448 resident attribute will be smaller than a page which makes the code
449 simpler. Also make the code more tolerant to concurrent ->truncate.
450
4512.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator.
452
453 - Implement extent mft record deallocation
454 fs/ntfs/mft.c::ntfs_extent_mft_record_free().
455 - Split runlist-related functions off from attrib.[hc] to runlist.[hc].
456 - Add vol->mft_data_pos and initialize it at mount time.
457 - Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to
458 ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to
459 ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to
460 ntfs_runlists_merge() and adapt all callers.
461 - Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(),
462 ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(),
463 and ntfs_mapping_pairs_build(), adapted from libntfs.
464 - Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not
465 static and add a declaration for it to lcnalloc.h.
466 - Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static
467 inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the
468 cluster bitmap lock for the duration of the call.
469 - Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize().
470 - Implement the equivalent of memset() for an ntfs attribute in
471 fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch
472 fs/ntfs/logfile.c::ntfs_empty_logfile() to using it.
473 - Remove unnecessary casts from LCN_* constants.
474 - Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock().
475 - Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and
476 change MFT_RECORD to contain the NTFS 3.1+ specific fields.
477 - Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which
478 marks all buffers belonging to an ntfs record dirty, followed by
479 marking the page the ntfs record is in dirty and also marking the vfs
480 inode containing the ntfs record dirty (I_DIRTY_PAGES).
481 - Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the
482 new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no
483 longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty().
484 - Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting
485 include errors.
486 - Move the typedefs for runlist_element and runlist from types.h to
487 runlist.h and fix resulting include errors.
488 - Remove unused {__,}format_mft_record() from fs/ntfs/mft.c.
489 - Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper
490 mark_ntfs_record_dirty() which also changes the behaviour in that we
491 now set the buffers belonging to the mft record dirty as well as the
492 page itself.
493 - Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror()
494 to cope with the fact that there now are dirty buffers in mft pages.
495 - Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper
496 mark_ntfs_record_dirty() and thus to set the buffers belonging to the
497 mft record dirty as well as the page itself.
498 - Fix compiler warnings on x86-64 in fs/ntfs/dir.c. (Randy Dunlap,
499 slightly modified by me)
500 - Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if
501 the mft record is already locked and otherwise behaves the same way
502 as fs/ntfs/mft.c::map_mft_record().
503 - Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only
504 writes the mft record if the buffers belonging to it are dirty.
505 Otherwise we assume that it was written out by other means already.
506 - Attempting to write outside initialized size is _not_ a bug so remove
507 the bug check from fs/ntfs/aops.c::ntfs_write_mst_block(). It is in
508 fact required to write outside initialized size when preparing to
509 extend the initialized size.
510 - Map the page instead of using page_address() before writing to it in
511 fs/ntfs/aops.c::ntfs_mft_writepage().
512 - Provide exclusion between opening an inode / mapping an mft record
513 and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage()
514 by setting the page not uptodate throughout ntfs_mft_writepage().
515 - Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block()
516 to ensure no one can see the page whilst the mst fixups are applied.
517 - Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which
518 checks if an mft record may be written out safely obtaining any
519 necessary locks in the process. This is used by
520 fs/ntfs/aops.c::ntfs_write_mst_block().
521 - Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for
522 writing mft records and improve its error handling in the process.
523 Now if any of the records in the page fail to be written out, all
524 other records will be written out instead of aborting completely.
525 - Remove ntfs_mft_aops and update all users to use ntfs_mst_aops.
526 - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the
527 ntfs_mst_aops for all inodes which are NInoMstProtected() and
528 ntfs_aops for all other inodes.
529 - Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to
530 ntfs_sync_mft_mirror{,_umount}() and change their parameters so they
531 no longer require an ntfs inode to be present. Update all callers.
532 - Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror().
533 - Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror()
534 to ensure no one can see the page whilst the mst fixups are applied.
535 - Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and
536 fs/ntfs/mft.c::try_map_mft_record().
537 - Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it
538 with the ntfs inode which contains the page rather than the ntfs
539 inode the mft record of which is in the page.
540 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the
541 index inode bitmap inode release code from there to
542 fs/ntfs/inode.c::ntfs_clear_big_inode(). (Thanks to Christoph
543 Hellwig for spotting this.)
544 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the
545 inode semaphore around the code that sets ni->itype.index.bmp_ino to
546 NULL and reorganize the code to optimize it a bit. (Thanks to
547 Christoph Hellwig for spotting this.)
548 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the
549 ntfs inode as a parameter as this is confusing and misleading and the
550 needed ntfs inode is available via NTFS_I(page->mapping->host).
551 Adapt all callers to this change.
552 - Modify fs/ntfs/mft.c::write_mft_record_nolock() and
553 fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state
554 of the first buffer in a record and to take this as the ntfs record
555 dirty state. We cannot look at the dirty state for subsequent
556 buffers because we might be racing with
557 fs/ntfs/aops.c::mark_ntfs_record_dirty().
558 - Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to
559 inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and
560 add a declaration for it to inode.h. Fix some compilation issues
561 that resulted due to #includes and header file interdependencies.
562 - Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode().
563 - Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF().
564 - Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft
565 record sequence number if it is specified (i.e. not zero).
566 - Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper
567 functions used by it.
568 - Update Documentation/filesystems/ntfs.txt with instructions on how to
569 use the Device-Mapper driver with NTFS ftdisk/LDM raid. This removes
570 the linear raid problem with the Software RAID / MD driver when one
571 or more of the devices has an odd number of sectors.
572
5732.1.20 - Fix two stupid bugs introduced in 2.1.18 release.
574
575 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx()
576 where we did not clear ctx->al_entry but it was still set due to
577 changes in ntfs_attr_lookup() and ntfs_external_attr_find() in
578 particular.
579 - Fix another stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find()
580 where we forgot to unmap the extent mft record when we had finished
581 enumerating an attribute which caused a bug check to trigger when the
582 VFS calls ->clear_inode.
583
5842.1.19 - Many cleanups, improvements, and a minor bug fix.
585
586 - Update ->setattr (fs/ntfs/inode.c::ntfs_setattr()) to refuse to
587 change the uid, gid, and mode of an inode as we do not support NTFS
588 ACLs yet.
589 - Remove BKL use from ntfs_setattr() syncing up with the rest of the
590 kernel.
591 - Get rid of the ugly transparent union in fs/ntfs/dir.c::ntfs_readdir()
592 and ntfs_filldir() as per suggestion from Al Viro.
593 - Change '\0' and L'\0' to simply 0 as per advice from Linus Torvalds.
594 - Update ->truncate (fs/ntfs/inode.c::ntfs_truncate()) to check if the
595 inode size has changed and to only output an error if so.
596 - Rename fs/ntfs/attrib.h::attribute_value_length() to ntfs_attr_size().
597 - Add le{16,32,64} as well as sle{16,32,64} data types to
598 fs/ntfs/types.h.
599 - Change ntfschar to be le16 instead of u16 in fs/ntfs/types.h.
600 - Add le versions of VCN, LCN, and LSN called leVCN, leLCN, and leLSN,
601 respectively, to fs/ntfs/types.h.
602 - Update endianness conversion macros in fs/ntfs/endian.h to use the
603 new types as appropriate.
604 - Do proper type casting when using sle64_to_cpup() in fs/ntfs/dir.c
605 and index.c.
606 - Add leMFT_REF data type to fs/ntfs/layout.h.
607 - Update all NTFS header files with the new little endian data types.
608 Affected files are fs/ntfs/layout.h, logfile.h, and time.h.
609 - Do proper type casting when using ntfs_is_*_recordp() in
610 fs/ntfs/logfile.c, mft.c, and super.c.
611 - Fix all the sparse bitwise warnings. Had to change all the typedef
612 enums storing little endian values to simple enums plus a typedef for
613 the datatype to make sparse happy.
614 - Fix a bug found by the new sparse bitwise warnings where the default
615 upcase table was defined as a pointer to wchar_t rather than ntfschar
616 in fs/ntfs/ntfs.h and super.c.
617 - Change {const_,}cpu_to_le{16,32}(0) to just 0 as suggested by Al Viro.
618
6192.1.18 - Fix scheduling latencies at mount time as well as an endianness bug.
620
621 - Remove vol->nr_mft_records as it was pretty meaningless and optimize
622 the calculation of total/free inodes as used by statfs().
623 - Fix scheduling latencies in ntfs_fill_super() by dropping the BKL
624 because the code itself is using the ntfs_lock semaphore which
625 provides safe locking. (Ingo Molnar)
626 - Fix a potential bug in fs/ntfs/mft.c::map_extent_mft_record() that
627 could occur in the future for when we start closing/freeing extent
628 inodes if we don't set base_ni->ext.extent_ntfs_inos to NULL after
629 we free it.
630 - Rename {find,lookup}_attr() to ntfs_attr_{find,lookup}() as well as
631 find_external_attr() to ntfs_external_attr_find() to cleanup the
632 namespace a bit and to be more consistent with libntfs.
633 - Rename {{re,}init,get,put}_attr_search_ctx() to
634 ntfs_attr_{{re,}init,get,put}_search_ctx() as well as the type
635 attr_search_context to ntfs_attr_search_ctx.
636 - Force use of ntfs_attr_find() in ntfs_attr_lookup() when searching
637 for the attribute list attribute itself.
638 - Fix endianness bug in ntfs_external_attr_find().
639 - Change ntfs_{external_,}attr_find() to return 0 on success, -ENOENT
640 if the attribute is not found, and -EIO on real error. In the case
641 of -ENOENT, the search context is updated to describe the attribute
642 before which the attribute being searched for would need to be
643 inserted if such an action were to be desired and in the case of
644 ntfs_external_attr_find() the search context is also updated to
645 indicate the attribute list entry before which the attribute list
646 entry of the attribute being searched for would need to be inserted
647 if such an action were to be desired. Also make ntfs_find_attr()
648 static and remove its prototype from attrib.h as it is not used
649 anywhere other than attrib.c. Update ntfs_attr_lookup() and all
650 callers of ntfs_{external,}attr_{find,lookup}() for the new return
651 values.
652 - Minor cleanup of fs/ntfs/inode.c::ntfs_init_locked_inode().
653
6542.1.17 - Fix bugs in mount time error code paths and other updates.
655
656 - Implement bitmap modification code (fs/ntfs/bitmap.[hc]). This
657 includes functions to set/clear a single bit or a run of bits.
658 - Add fs/ntfs/attrib.[hc]::ntfs_find_vcn() which returns the locked
659 runlist element containing a particular vcn. It also takes care of
660 mapping any needed runlist fragments.
661 - Implement cluster (de-)allocation code (fs/ntfs/lcnalloc.[hc]).
662 - Load attribute definition table from $AttrDef at mount time.
663 - Fix bugs in mount time error code paths involving (de)allocation of
664 the default and volume upcase tables.
665 - Remove ntfs_nr_mounts as it is no longer used.
666
6672.1.16 - Implement access time updates, file sync, async io, and read/writev.
668
669 - Add support for readv/writev and aio_read/aio_write (fs/ntfs/file.c).
670 This is done by setting the appropriate file operations pointers to
671 the generic helper functions provided by mm/filemap.c.
672 - Implement fsync, fdatasync, and msync both for files (fs/ntfs/file.c)
673 and directories (fs/ntfs/dir.c).
674 - Add support for {a,m,c}time updates to inode.c::ntfs_write_inode().
675 Note, except for the root directory and any other system files opened
676 by the user, the system files will not have their access times
677 updated as they are only accessed at the inode level and hence the
678 file level functions which cause the times to be updated are never
679 invoked.
680
6812.1.15 - Invalidate quotas when (re)mounting read-write.
682
683 - Add new element itype.index.collation_rule to the ntfs inode
684 structure and set it appropriately in ntfs_read_locked_inode().
685 - Implement a new inode type "index" to allow efficient access to the
686 indices found in various system files and adapt inode handling
687 accordingly (fs/ntfs/inode.[hc]). An index inode is essentially an
688 attribute inode (NInoAttr() is true) with an attribute type of
689 AT_INDEX_ALLOCATION. As such, it is no longer allowed to call
690 ntfs_attr_iget() with an attribute type of AT_INDEX_ALLOCATION as
691 there would be no way to distinguish between normal attribute inodes
692 and index inodes. The function to obtain an index inode is
693 ntfs_index_iget() and it uses the helper function
694 ntfs_read_locked_index_inode(). Note, we do not overload
695 ntfs_attr_iget() as indices consist of multiple attributes so using
696 ntfs_attr_iget() to obtain an index inode would be confusing.
697 - Ensure that there is no overflow when doing page->index <<
698 PAGE_CACHE_SHIFT by casting page->index to s64 in fs/ntfs/aops.c.
699 - Use atomic kmap instead of kmap() in fs/ntfs/aops.c::ntfs_read_page()
700 and ntfs_read_block().
701 - Use case sensitive attribute lookups instead of case insensitive ones.
702 - Lock all page cache pages belonging to mst protected attributes while
703 accessing them to ensure we never see corrupt data while the page is
704 under writeout.
705 - Add framework for generic ntfs collation (fs/ntfs/collation.[hc]).
706 We have ntfs_is_collation_rule_supported() to check if the collation
707 rule you want to use is supported and ntfs_collation() which actually
708 collates two data items. We currently only support COLLATION_BINARY
709 and COLLATION_NTOFS_ULONG but support for other collation rules will
710 be added as the need arises.
711 - Add a new type, ntfs_index_context, to allow retrieval of an index
712 entry using the corresponding index key. To get an index context,
713 use ntfs_index_ctx_get() and to release it, use ntfs_index_ctx_put().
714 This also adds a new slab cache for the index contexts. To lookup a
715 key in an index inode, use ntfs_index_lookup(). After modifying an
716 index entry, call ntfs_index_entry_flush_dcache_page() followed by
717 ntfs_index_entry_mark_dirty() to ensure the changes are written out
718 to disk. For details see fs/ntfs/index.[hc]. Note, at present, if
719 an index entry is in the index allocation attribute rather than the
720 index root attribute it will not be written out (you will get a
721 warning message about discarded changes instead). A usage sketch of this API follows below, after this list.
722 - Load the quota file ($Quota) and check if quota tracking is enabled
723 and if so, mark the quotas out of date. This causes Windows to
724 rescan the volume on boot and update all quota entries.
725 - Add a set_page_dirty address space operation for ntfs_m[fs]t_aops.
726 It is simply set to __set_page_dirty_nobuffers() to make sure that
727 running set_page_dirty() on a page containing mft/ntfs records will
728 not affect the dirty state of the page buffers.
729 - Add fs/ntfs/index.c::__ntfs_index_entry_mark_dirty() which sets all
730 buffers that are inside the ntfs record in the page dirty after which
731 it sets the page dirty. This allows ->writepage to only write the
732 dirty index records rather than having to write all the records in
733 the page. Modify fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to
734 use this rather than __set_page_dirty_nobuffers().
735 - Implement fs/ntfs/aops.c::ntfs_write_mst_block() which enables the
736 writing of page cache pages belonging to mst protected attributes
737 like the index allocation attribute in directory indices and other
738 indices like $Quota/$Q, etc. This means that the quota is now marked
739 out of date on all volumes rather than only on ones where the quota
740 defaults entry is in the index root attribute of the $Quota/$Q index.
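
        The intended call sequence for the index context API described
        above, as a sketch (the real prototypes are in fs/ntfs/index.[hc];
        the parameter lists and the key variable here are assumptions):

                ntfs_index_context *ictx;
                int err;

                ictx = ntfs_index_ctx_get(idx_ni);
                if (!ictx)
                        return -ENOMEM;
                err = ntfs_index_lookup(&key, sizeof(key), ictx);
                if (!err) {
                        /* ... modify the found index entry in place ... */
                        ntfs_index_entry_flush_dcache_page(ictx);
                        ntfs_index_entry_mark_dirty(ictx);
                }
                ntfs_index_ctx_put(ictx);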
741
7422.1.14 - Fix an NFSd caused deadlock reported by several users.
743
744 - Modify fs/ntfs/dir.c::ntfs_readdir() to copy the index root attribute
745 value to a buffer so that we can put the search context and unmap the
746 mft record before calling the filldir() callback. We need to do this
747 because of NFSd, which calls ->lookup() from its filldir() callback,
748 and this causes NTFS to deadlock: ntfs_lookup() maps the mft record
749 of the directory, but ntfs_readdir() already has it mapped, so
750 ntfs_lookup() deadlocks.
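
        Schematically, the fix reorders things as follows (a sketch only;
        ir_copy, ctx and ndir are stand-ins, and the filldir() argument
        list is an era-specific approximation):

                memcpy(ir_copy, ir, ir_len);    /* copy index root value  */
                ntfs_attr_put_search_ctx(ctx);  /* put the search context */
                unmap_mft_record(ndir);         /* unmap the mft record   */
                /*
                 * Only now is it safe to call filldir(), which under NFSd
                 * may re-enter the filesystem via ->lookup().
                 */
                err = filldir(dirent, name, name_len, fpos, mref, dt_type);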
751
7522.1.13 - Enable overwriting of resident files and housekeeping of system files.
753
754 - Implement writing of mft records (fs/ntfs/mft.[hc]), which includes
755 keeping the mft mirror in sync with the mft when mirrored mft records
756 are written. The functions are write_mft_record{,_nolock}(). The
757 implementation is quite rudimentary for now with lots of things not
758 implemented yet but I am not sure any of them can actually occur so
759 I will wait for people to hit each one and only then implement it.
760 - Commit open system inodes at umount time. This should make it
761 virtually impossible for sync_mft_mirror_umount() to ever be needed.
762 - Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the
763 ntfs super operations. This gives us inode writing via the VFS inode
764 dirty code paths. Note: Access time updates are not implemented yet.
765 - Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make
766 fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus
767 finally enabling resident file overwrite! (-8 This also includes a
768 placeholder for ->writepage (ntfs_mft_writepage()), which for now
769 just redirties the page and returns. Also, at umount time, we for
770 now throw away all mft data page cache pages after the last call to
771 ntfs_commit_inode() in the hope that all inodes will have been
772 written out by then and hence no dirty (meta)data will be lost. We
773 also check for this case and emit an error message telling the user
774 to run chkdsk.
775 - Use set_page_writeback() and end_page_writeback() in the resident
776 attribute code path of fs/ntfs/aops.c::ntfs_writepage() otherwise
777 the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the
778 page is clean.
779 - Implement ntfs_mft_writepage() so it now checks if any of the mft
780 records in the page are dirty and if so redirties the page and
781 returns. Otherwise it just returns (after doing set_page_writeback(),
782 unlock_page(), end_page_writeback(), without which the radix-tree tag
783 PAGECACHE_TAG_DIRTY would remain set even though the page is clean),
784 thus allowing the VM to do with the page as it pleases. Also, at umount
785 time, now only throw away dirty mft (meta)data pages if dirty inodes
786 are present and ask the user to email us if they see this happening.
787 - Add functions ntfs_{clear,set}_volume_flags(), to modify the volume
788 information flags (fs/ntfs/super.c).
789 - Mark the volume dirty when (re)mounting read-write and mark it clean
790 when unmounting or remounting read-only. If any volume errors are
791 found, the volume is left marked dirty to force chkdsk to run.
792 - Add code to set the NT4 compatibility flag when (re)mounting
793 read-write for newer NTFS versions but leave it commented out for now
794 since we do not make any modifications that are NTFS 1.2 specific yet
795 and since setting this flag breaks Captive-NTFS which is not nice.
796 This code must be enabled once we start writing NTFS 1.2 specific
797 changes, otherwise the Windows NTFS driver might crash / cause corruption.
798
7992.1.12 - Fix the second fix to the decompression engine and some cleanups.
800
801 - Add a new address space operations struct, ntfs_mst_aops, for mst
802 protected attributes. This is because the default ntfs_aops do not
803 make sense with mst protected data and were they to write anything to
804 such an attribute they would cause data corruption so we provide
805 ntfs_mst_aops which does not have any write related operations set.
806 - Cleanup dirty ntfs inode handling (fs/ntfs/inode.[hc]) which also
807 includes an adapted ntfs_commit_inode() and an implementation of
808 ntfs_write_inode() which for now just cleans dirty inodes without
809 writing them (it does emit a warning that this is happening).
810 - Undo the second decompression engine fix (see 2.1.9 release ChangeLog
811 entry) as it was only fixing a theoretical bug but at the same time
812 it badly broke the handling of sparse and uncompressed compression
813 blocks.
814
8152.1.11 - Driver internal cleanups.
816
817 - Only build logfile.o if building the driver with read-write support.
818 - Really final white space cleanups.
819 - Use generic_ffs() instead of ffs() in logfile.c which allows the
820 log_page_size variable to be optimized by gcc into a constant.
821 - Rename uchar_t to ntfschar everywhere as uchar_t is unsigned 1-byte
822 char as defined by POSIX and as found on some systems.
823
8242.1.10 - Force read-only (re)mounting of volumes with unsupported volume flags.
825
826 - Finish off the white space cleanups (remove trailing spaces, etc).
827 - Clean up ntfs_fill_super() and ntfs_read_inode_mount() by removing
828 the kludges around the first iget(). Instead of (re)setting ->s_op
829 we have the $MFT inode set up by explicit new_inode() / set ->i_ino /
830 insert_inode_hash() / call ntfs_read_inode_mount() directly. This
831 kills the need for a second super_operations structure and allows
832 returning an error from ntfs_read_inode_mount() without resorting to
833 ugly "poisoning" tricks. (Al Viro)
834 - Force read-only (re)mounting if any of the following bits are set in
835 the volume information flags:
836 VOLUME_IS_DIRTY, VOLUME_RESIZE_LOG_FILE,
837 VOLUME_UPGRADE_ON_MOUNT, VOLUME_DELETE_USN_UNDERWAY,
838 VOLUME_REPAIR_OBJECT_ID, VOLUME_MODIFIED_BY_CHKDSK
839 To make this easier we define VOLUME_MUST_MOUNT_RO_MASK with all the
840 above bits set so the test is made easy.
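
        Schematically (a sketch; the real definitions live in the driver's
        headers, the individual flag values are omitted here, and
        vol->vol_flags is an assumed field name):

                #define VOLUME_MUST_MOUNT_RO_MASK (VOLUME_IS_DIRTY            | \
                                                   VOLUME_RESIZE_LOG_FILE     | \
                                                   VOLUME_UPGRADE_ON_MOUNT    | \
                                                   VOLUME_DELETE_USN_UNDERWAY | \
                                                   VOLUME_REPAIR_OBJECT_ID    | \
                                                   VOLUME_MODIFIED_BY_CHKDSK)

                if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
                        /* Refuse read-write: force a read-only mount. */
                        sb->s_flags |= MS_RDONLY;
                }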
841
8422.1.9 - Fix two bugs in decompression engine.
843
844 - Fix a bug where we would not always detect that we have reached the
845 end of a compression block because we were ending at minus one byte
846 which is effectively the same as being at the end. The fix is to
847 check whether the uncompressed buffer has been fully filled and if so
848 we assume we have reached the end of the compression block. A big
849 thank you to Marcin Gibuła for the bug report, the assistance in
850 tracking down the bug and testing the fix.
851 - Fix a possible bug where when a compressed read is truncated to the
852 end of the file, the offset inside the last page was not truncated.
853
8542.1.8 - Handle $MFT mirror and $LogFile, improve time handling, and cleanups.
855
856 - Use get_bh() instead of manual atomic_inc() in fs/ntfs/compress.c.
857 - Modify fs/ntfs/time.c::ntfs2utc(), get_current_ntfs_time(), and
858 utc2ntfs() to work with struct timespec instead of time_t on the
859 Linux UTC time side thus preserving the full precision of the NTFS
860 time and only losing up to 99 nanoseconds in the Linux UTC time. (A conversion sketch follows below, after this list.)
861 - Move fs/ntfs/time.c to fs/ntfs/time.h and make the time functions
862 static inline.
863 - Remove unused ntfs_dirty_inode().
864 - Cleanup super operations declaration in fs/ntfs/super.c.
865 - Wrap flush_dcache_mft_record_page() in #ifdef NTFS_RW.
866 - Add NInoTestSetFoo() and NInoTestClearFoo() macro magic to
867 fs/ntfs/inode.h and use it to declare NInoTest{Set,Clear}Dirty.
868 - Move typedefs for ntfs_attr and test_t from fs/ntfs/inode.c to
869 fs/ntfs/inode.h so they can be used elsewhere.
870 - Determine the mft mirror size as the number of mirrored mft records
871 and store it in ntfs_volume->mftmirr_size (fs/ntfs/super.c).
872 - Load the mft mirror at mount time and compare the mft records stored
873 in it to the ones in the mft. Force a read-only mount if the two do
874 not match (fs/ntfs/super.c).
875 - Fix type casting related warnings on 64-bit architectures. Thanks
876 to Meelis Roos for reporting them.
877 - Move %L to %ll as %L is floating point and %ll is integer which is
878 what we want.
879 - Read the journal ($LogFile) and determine if the volume has been
880 shutdown cleanly and force a read-only mount if not (fs/ntfs/super.c
881 and fs/ntfs/logfile.c). This is a little bit of a crude check in
882 that we only look at the restart areas and not at the actual log
883 records so that there will be a very small number of cases where we
884 think that a volume is dirty when in fact it is clean. This should
885 only affect volumes that have not been shutdown cleanly and did not
886 have any pending, non-check-pointed i/o.
887 - If the $LogFile indicates a clean shutdown and a read-write (re)mount
888 is requested, empty $LogFile by overwriting it with 0xff bytes to
889 ensure that Windows cannot cause data corruption by replaying a stale
890 journal after Linux has written to the volume.
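
        For reference, NTFS stores times as 100-nanosecond intervals since
        1601-01-01, so a conversion in the spirit of ntfs2utc() looks
        roughly like this (a self-contained sketch, not the driver code;
        11644473600 is the well-known 1601-to-1970 offset in seconds):

                #include <stdint.h>
                #include <time.h>

                #define NTFS_TIME_OFFSET 11644473600LL  /* 1601 -> 1970 */

                static struct timespec toy_ntfs2utc(int64_t ntfs_time)
                {
                        struct timespec ts;

                        ts.tv_sec  = ntfs_time / 10000000 - NTFS_TIME_OFFSET;
                        ts.tv_nsec = (ntfs_time % 10000000) * 100;
                        return ts;
                }

        Going the other way, utc2ntfs() necessarily rounds tv_nsec to a
        multiple of 100ns, which is where the up-to-99ns loss mentioned
        above comes from.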
891
8922.1.7 - Enable NFS exporting of mounted NTFS volumes.
893
894 - Set i_generation in the VFS inode from the seq_no of the NTFS inode.
895 - Make ntfs_lookup() NFS export safe, i.e. use d_splice_alias(), etc.
896 - Implement ->get_dentry() in fs/ntfs/namei.c::ntfs_get_dentry() as the
897 default doesn't allow inode number 0, which is a valid inode on NTFS,
898 and even if it did allow that, it uses iget() instead of ntfs_iget(),
899 which makes it useless for us.
900 - Implement ->get_parent() in fs/ntfs/namei.c::ntfs_get_parent() as the
901 default just returns -EACCES which is not very useful.
902 - Define export operations (->s_export_op) for NTFS (ntfs_export_ops)
903 and set them up in the super block at mount time (super.c). This
904 allows mounted NTFS volumes to be exported via NFS.
905 - Add missing return -EOPNOTSUPP; in
906 fs/ntfs/aops.c::ntfs_commit_nonresident_write().
907 - Enforce no atime and no dir atime updates at mount/remount time as
908 they are not implemented yet anyway.
909 - Move a few assignments in fs/ntfs/attrib.c::load_attribute_list() to
910 after a NULL check. Thanks to Dave Jones for pointing this out.
911
9122.1.6 - Fix minor bug in handling of compressed directories.
913
914 - Fix bug in handling of compressed directories. A compressed
915 directory is not really compressed so when we set the ->i_blocks
916 field of a compressed directory inode we were setting it from the
917 non-existing field ni->itype.compressed.size which gave random
918 results... For directories we now always use ni->allocated_size.
919
9202.1.5 - Fix minor bug in attribute list attribute handling.
921
922 - Fix bug in attribute list handling. It is not so much a bug as too
923 much protection: we were rejecting attribute lists which waste space
924 on disk, while Windows XP clearly allows them and in fact creates
925 such attribute lists, so our driver was failing.
926 - Update NTFS documentation ready for 2.6 kernel release.
927
9282.1.4 - Reduce compiler requirements.
929
930 - Remove all uses of unnamed structs and unions in the driver to make
931 old and newer gcc versions happy. Makes it a bit uglier IMO but at
932 least people will stop hassling me about it.
933
9342.1.3 - Important bug fixes in corner cases.
935
936 - super.c::parse_ntfs_boot_sector(): Correct the check for 64-bit
937 clusters. (Philipp Thomas)
938 - attrib.c::load_attribute_list(): Fix bug when initialized_size is a
939 multiple of the block_size but not the cluster size. (Szabolcs
940 Szakacsits)
941
9422.1.2 - Important bug fixes alleviating the hangs in statfs.
943
944 - Fix buggy free cluster and free inode determination logic.
945
9462.1.1 - Minor updates.
947
948 - Add handling for initialized_size != data_size in compressed files.
949 - Reduce function local stack usage from 0x3d4 bytes to just noise in
950 fs/ntfs/upcase.c. (Randy Dunlap)
951 - Remove compiler warnings for newer gcc.
952 - Pages are no longer kmapped by mm/filemap.c::generic_file_write()
953 around calls to ->{prepare,commit}_write. Adapt NTFS appropriately
954 in fs/ntfs/aops.c::ntfs_prepare_nonresident_write() by using
955 kmap_atomic(KM_USER0).
956
9572.1.0 - First steps towards write support: implement file overwrite.
958
959 - Add configuration option for developmental write support with an
960 appropriately scary configuration help text.
961 - Initial implementation of fs/ntfs/aops.c::ntfs_writepage() and its
962 helper fs/ntfs/aops.c::ntfs_write_block(). This enables mmap(2) based
963 overwriting of existing files on ntfs. Note: Resident files are
964 only written into memory, and not written out to disk at present, so
965 avoid writing to files smaller than about 1kiB.
966 - Initial implementation of fs/ntfs/aops.c::ntfs_prepare_write(), its
967 helper fs/ntfs/aops.c::ntfs_prepare_nonresident_write() and their
968 counterparts, fs/ntfs/aops.c::ntfs_commit_write(), and
969 fs/ntfs/aops.c::ntfs_commit_nonresident_write(), respectively. Also,
970 add generic_file_write() to the ntfs file operations (fs/ntfs/file.c).
971 This enables write(2) based overwriting of existing files on ntfs.
972 Note: As with mmap(2) based overwriting, resident files are only
973 written into memory, and not written out to disk at present, so avoid
974 writing to files smaller than about 1kiB.
975 - Implement ->truncate (fs/ntfs/inode.c::ntfs_truncate()) and
976 ->setattr() (fs/ntfs/inode.c::ntfs_setattr()) inode operations for
977 files with the purpose of intercepting and aborting all i_size
978 changes which we do not support yet. ntfs_truncate() actually only
979 emits a warning message but AFAICS our interception of i_size changes
980 elsewhere means ntfs_truncate() never gets called for i_size changes.
981 It is only called from generic_file_write() when we fail in
982 ntfs_prepare_{,nonresident_}write() in order to discard any
983 instantiated buffers beyond i_size. Thus i_size is not actually
984 changed so our warning message is enough. Unfortunately it is not
985 possible to easily determine if i_size is being changed or not hence
986 we just emit an appropriately worded error message.
987
9882.0.25 - Small bug fixes and cleanups.
989
990 - Unlock the page in an out of memory error code path in
991 fs/ntfs/aops.c::ntfs_read_block().
992 - If fs/ntfs/aops.c::ntfs_read_page() is called on an uptodate page,
993 just unlock the page and return. (This can happen due to ->writepage
994 clearing PageUptodate() during write out of MstProtected()
995 attributes.)
996 - Remove leaked write code again.
997
9982.0.24 - Cleanups.
999
1000 - Treat BUG_ON() as ASSERT() not VERIFY(), i.e. do not use side effects
1001 inside BUG_ON(). (Adam J. Richter)
1002 - Split logical OR expressions inside BUG_ON() into individual BUG_ON()
1003 calls for improved debugging. (Adam J. Richter) Both rules are sketched below, after this list.
1004 - Add errors flag to the ntfs volume state, accessed via
1005 NVol{,Set,Clear}Errors(vol).
1006 - Do not allow read-write remounts of read-only volumes with errors.
1007 - Clarify comment for ntfs file operation sendfile which was added by
1008 Christoph Hellwig a while ago (just using generic_file_sendfile())
1009 to say that ntfs ->sendfile is only used for the case where the
1010 source data is on the ntfs partition and the destination is
1011 somewhere else, i.e. nothing we need to concern ourselves with.
1012 - Add generic_file_write() as our ntfs file write operation.
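
        The two BUG_ON() rules above in miniature (do_something() is a
        hypothetical helper with a side effect):

                /* Bad: the side effect disappears if BUG_ON() is ever
                 * compiled out. */
                BUG_ON(do_something() < 0);

                /* Good: keep the side effect, assert on the result. */
                err = do_something();
                BUG_ON(err < 0);

                /* Bad: one test, no clue which condition fired. */
                BUG_ON(!a || !b);

                /* Good: individual assertions pinpoint the failure. */
                BUG_ON(!a);
                BUG_ON(!b);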
1013
10142.0.23 - Major bug fixes (races, deadlocks, non-i386 architectures).
1015
1016 - Massive internal locking changes to mft record locking. Fixes lock
1017 recursion and replaces the mrec_lock read/write semaphore with a
1018 mutex. Also removes the now superfluous mft_count. This fixes several
1019 race conditions and deadlocks, especially in the future write code.
1020 - Fix ntfs over loopback for compressed files by adding an
1021 optimization barrier. (gcc was screwing up otherwise ?)
1022 - Miscellaneous cleanups all over the code and a fix or two in error
1023 handling code paths.
1024 Thanks go to Christoph Hellwig for pointing out the following two:
1025 - Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
1026 - Fix ntfs_free() for ia64 and parisc by checking for VMALLOC_END, too.
1027
10282.0.22 - Cleanups, mainly to ntfs_readdir(), and use C99 initializers.
1029
1030 - Change fs/ntfs/dir.c::ntfs_readdir() to only read/write ->f_pos once
1031 at entry/exit respectively.
1032 - Use C99 initializers for structures (sketched below, after this list).
1033 - Remove unused variable blocks from fs/ntfs/aops.c::ntfs_read_block().
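
        The initializer change in miniature (ntfs_dir_ops and the members
        shown are illustrative, not the driver's actual tables):

                #if 0   /* Old positional style: fragile whenever the
                           struct layout changes. */
                static struct file_operations ntfs_dir_ops = {
                        NULL,                   /* llseek  */
                        generic_read_dir,       /* read    */
                        ntfs_readdir,           /* readdir */
                };
                #endif

                /* C99 designated initializers: robust, self-documenting. */
                static struct file_operations ntfs_dir_ops = {
                        .read    = generic_read_dir,
                        .readdir = ntfs_readdir,
                };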
1034
10352.0.21 - Check for, and refuse to work with too large files/directories/volumes.
1036
1037 - Limit volume size at mount time to 2TiB on architectures where
1038 unsigned long is 32-bits (fs/ntfs/super.c::parse_ntfs_boot_sector()).
1039 This is the most we can do without overflowing the 32-bit limit of
1040 the block device size imposed on us by sb_bread() and sb_getblk()
1041 for the time being.
1042 - Limit file/directory size at open() time to 16TiB on architectures
1043 where unsigned long is 32-bits (fs/ntfs/file.c::ntfs_file_open() and
1044 fs/ntfs/dir.c::ntfs_dir_open()). This is the most we can do without
1045 overflowing the page cache page index.
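
        The arithmetic behind the two limits, sketched (assuming 512-byte
        blocks and 4KiB pages; these are not the literal driver checks):

                /* sb_bread()/sb_getblk() take the block number as an
                 * unsigned long, so on 32-bit the device may have at most
                 * 2^32 blocks: 2^32 * 2^9 bytes = 2TiB. */
                if (sizeof(unsigned long) == 4 && volume_size > 2ULL << 40)
                        return -EFBIG;  /* refuse the mount (sketch) */

                /* A page cache index is also an unsigned long, so with
                 * 4KiB pages a file may span at most 2^32 pages:
                 * 2^32 * 2^12 bytes = 16TiB. */
                if (sizeof(unsigned long) == 4 && i_size > 16ULL << 40)
                        return -EFBIG;  /* refuse the open (sketch) */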
1046
10472.0.20 - Support non-resident directory index bitmaps, fix page leak in readdir.
1048
1049 - Move the directory index bitmap to use an attribute inode instead of
1050 having special fields for it inside the ntfs inode structure. This
1051 means that the index bitmaps now use the page cache for i/o, too,
1052 and also as a side effect we get support for non-resident index
1053 bitmaps for free.
1054 - Simplify/cleanup error handling in fs/ntfs/dir.c::ntfs_readdir() and
1055 fix a page leak that manifested itself in some cases.
1056 - Add fs/ntfs/inode.c::ntfs_put_inode(), which we need to release the
1057 index bitmap inode on the final iput().
1058
10592.0.19 - Fix race condition, improvements, and optimizations in i/o interface.
1060
1061 - Apply block optimization added to fs/ntfs/aops.c::ntfs_read_block()
1062 to fs/ntfs/compress.c::ntfs_file_read_compressed_block() as well.
1063 - Drop the "file" from ntfs_file_read_compressed_block().
1064 - Rename fs/ntfs/aops.c::ntfs_enb_buffer_read_async() to
1065 ntfs_end_buffer_async_read() (more like the fs/buffer.c counterpart).
1066 - Update ntfs_end_buffer_async_read() with the improved logic from
1067 its updated counterpart fs/buffer.c::end_buffer_async_read(). Apply
1068 further logic improvements to better determine when we set PageError.
1069 - Update submission of buffers in fs/ntfs/aops.c::ntfs_read_block() to
1070 check for the buffers being uptodate first in line with the updated
1071 fs/buffer.c::block_read_full_page(). This plugs a small race
1072 condition.
1073
10742.0.18 - Fix race condition in reading of compressed files.
1075
1076 - There was a narrow window between checking a buffer head for being
1077 uptodate and locking it in ntfs_file_read_compressed_block(). We now
1078 lock the buffer and then check whether it is uptodate or not.
1079
10802.0.17 - Cleanups and optimizations - shrinking the ToDo list.
1081
1082 - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to return an error
1083 code and update callers, i.e. ntfs_iget(), to pass that error code
1084 up instead of just using -EIO.
1085 - Modifications to super.c to ensure that both mount and remount
1086 cannot set any write related options when the driver is compiled
1087 read-only.
1088 - Optimize block resolution in fs/ntfs/aops.c::ntfs_read_block() to
1089 cache the current runlist element. This should improve performance
1090 when reading very large and/or very fragmented data.
1091
10922.0.16 - Convert access to $MFT/$BITMAP to attribute inode API.
1093
1094 - Fix a stupid bug introduced in 2.0.15 where we were unmapping the
1095 wrong inode in fs/ntfs/inode.c::ntfs_attr_iget().
1096 - Fix debugging check in fs/ntfs/aops.c::ntfs_read_block().
1097 - Convert $MFT/$BITMAP access to attribute inode API and remove all
1098 remnants of the ugly mftbmp address space and operations hack. This
1099 means we finally have only one readpage function as well as only one
1100 async io completion handler. Yey! The mft bitmap is now just an
1101 attribute inode and is accessed from vol->mftbmp_ino just as if it
1102 were a normal file. Fake inodes rule. (-:
1103
11042.0.15 - Fake inodes based attribute i/o via the pagecache, fixes and cleanups.
1105
1106 - Fix silly bug in fs/ntfs/super.c::parse_options() which was causing
1107 remounts to fail when the partition had an entry in /etc/fstab and
1108 the entry specified the nls= option.
1109 - Apply same macro magic used in fs/ntfs/inode.h to fs/ntfs/volume.h to
1110 expand all the helper functions NVolFoo(), NVolSetFoo(), and
1111 NVolClearFoo().
1112 - Move copyright statement from driver initialisation message to
1113 module description (fs/ntfs/super.c). This makes the initialisation
1114 message fit on one line and fits in better with rest of kernel.
1115 - Update fs/ntfs/attrib.c::map_run_list() to work on both real and
1116 attribute inodes, and both for files and directories.
1117 - Implement fake attribute inodes allowing all attribute i/o to go via
1118 the page cache and to use all the normal vfs/mm functionality:
1119 - Add ntfs_attr_iget() and its helper ntfs_read_locked_attr_inode()
1120 to fs/ntfs/inode.c.
1121 - Add needed cleanup code to ntfs_clear_big_inode().
1122 - Merge address space operations for files and directories (aops.c),
1123 now just have ntfs_aops:
1124 - Rename:
1125 end_buffer_read_attr_async() -> ntfs_end_buffer_read_async(),
1126 ntfs_attr_read_block() -> ntfs_read_block(),
1127 ntfs_file_read_page() -> ntfs_readpage().
1128 - Rewrite fs/ntfs/aops.c::ntfs_readpage() to work on both real and
1129 attribute inodes, and both for files and directories.
1130 - Remove obsolete fs/ntfs/aops.c::ntfs_mst_readpage().
1131
11322.0.14 - Run list merging code cleanup, minor locking changes, typo fixes.
1133
1134 - Change fs/ntfs/super.c::ntfs_statfs() to not rely on BKL by moving
1135 the locking out of super.c::get_nr_free_mft_records() and taking and
1136 dropping the mftbmp_lock rw_semaphore in ntfs_statfs() itself.
1137 - Bring attribute runlist merging code (fs/ntfs/attrib.c) in sync with
1138 current userspace ntfs library code. This means that if a merge
1139 fails the original runlists are always left unmodified instead of
1140 being silently corrupted.
1141 - Misc typo fixes.
1142
11432.0.13 - Use iget5_locked() in preparation for fake inodes and small cleanups.
1144
1145 - Remove nr_mft_bits and the now superfluous union with nr_mft_records
1146 from ntfs_volume structure.
1147 - Remove nr_lcn_bits and the now superfluous union with nr_clusters
1148 from ntfs_volume structure.
1149 - Use iget5_locked() and friends instead of conventional iget(). Wrap
1150 the call in fs/ntfs/inode.c::ntfs_iget() and update callers of iget()
1151 to use ntfs_iget(). Leave only one iget() call at mount time so we
1152 don't need an ntfs_iget_mount(). (A wrapper sketch follows below.)
1153 - Change fs/ntfs/inode.c::ntfs_new_extent_inode() to take mft_no as an
1154 additional argument.
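
        A sketch of what such an iget5_locked() wrapper typically looks
        like (hedged: the callbacks shown and the NTFS_I() accessor are
        assumptions here; see fs/ntfs/inode.c for the real ntfs_iget()):

                static int ntfs_test_inode(struct inode *vi, void *data)
                {
                        return NTFS_I(vi)->mft_no == *(unsigned long *)data;
                }

                static int ntfs_init_locked_inode(struct inode *vi, void *data)
                {
                        vi->i_ino = *(unsigned long *)data;
                        NTFS_I(vi)->mft_no = vi->i_ino;
                        return 0;
                }

                struct inode *ntfs_iget(struct super_block *sb,
                                unsigned long mft_no)
                {
                        struct inode *vi = iget5_locked(sb, mft_no,
                                        ntfs_test_inode,
                                        ntfs_init_locked_inode, &mft_no);

                        if (vi && (vi->i_state & I_NEW)) {
                                /* ... read the inode in from disk ..., then */
                                unlock_new_inode(vi);
                        }
                        return vi;
                }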
1155
11562.0.12 - Initial cleanup of address space operations following 2.0.11 changes.
1157
1158 - Merge fs/ntfs/aops.c::end_buffer_read_mst_async() and
1159 fs/ntfs/aops.c::end_buffer_read_file_async() into one function
1160 fs/ntfs/aops.c::end_buffer_read_attr_async() using NInoMstProtected()
1161 to determine whether to apply mst fixups or not.
1162 - Above change allows merging fs/ntfs/aops.c::ntfs_file_read_block()
1163 and fs/ntfs/aops.c::ntfs_mst_readpage() into one function
1164 fs/ntfs/aops.c::ntfs_attr_read_block(). Also, create a tiny wrapper
1165 fs/ntfs/aops.c::ntfs_mst_readpage() to transform the parameters from
1166 the VFS readpage function prototype to the ntfs_attr_read_block()
1167 function prototype.
1168
11692.0.11 - Initial preparations for fake inode based attribute i/o.
1170
1171 - Move definition of ntfs_inode_state_bits to fs/ntfs/inode.h and
1172 do some macro magic (adapted from include/linux/buffer_head.h) to
1173 expand all the helper functions NInoFoo(), NInoSetFoo(), and
1174 NInoClearFoo(). (The macro pattern is sketched below, after this list.)
1175 - Add new flag to ntfs_inode_state_bits: NI_Sparse.
1176 - Add new fields to ntfs_inode structure to allow use of fake inodes
1177 for attribute i/o: type, name, name_len. Also add new state bits:
1178 NI_Attr, which, if set, indicates the inode is a fake inode, and
1179 NI_MstProtected, which, if set, indicates the attribute uses multi
1180 sector transfer protection, i.e. fixups need to be applied after
1181 reads and before/after writes.
1182 - Rename fs/ntfs/inode.c::ntfs_{new,clear,destroy}_inode() to
1183 ntfs_{new,clear,destroy}_extent_inode() and update callers.
1184 - Use ntfs_clear_extent_inode() in fs/ntfs/inode.c::__ntfs_clear_inode()
1185 instead of ntfs_destroy_extent_inode().
1186 - Cleanup memory deallocations in {__,}ntfs_clear_{,big_}inode().
1187 - Make all operations on ntfs inode state bits use the NIno* functions.
1188 - Set up the new ntfs inode fields and state bits in
1189 fs/ntfs/inode.c::ntfs_read_inode() and add appropriate cleanup of
1190 allocated memory to __ntfs_clear_inode().
1191 - Cleanup ntfs_inode structure a bit for better ordering of elements
1192 w.r.t. their size to allow better packing of the structure in memory.
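
        The macro magic, schematically (in the spirit of
        include/linux/buffer_head.h; a sketch, not the exact
        fs/ntfs/inode.h text, and the state field name is an assumption):

                #define NINO_FNS(flag)                                  \
                static inline int NIno##flag(ntfs_inode *ni)            \
                {                                                       \
                        return test_bit(NI_##flag, &(ni)->state);       \
                }                                                       \
                static inline void NInoSet##flag(ntfs_inode *ni)        \
                {                                                       \
                        set_bit(NI_##flag, &(ni)->state);               \
                }                                                       \
                static inline void NInoClear##flag(ntfs_inode *ni)      \
                {                                                       \
                        clear_bit(NI_##flag, &(ni)->state);             \
                }

                NINO_FNS(Attr)          /* NInoAttr(), NInoSetAttr(), ... */
                NINO_FNS(MstProtected)
                NINO_FNS(Sparse)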
1193
11942.0.10 - There can only be 2^32 - 1 inodes on an NTFS volume.
1195
1196 - Add check at mount time to verify that the number of inodes on the
1197 volume does not exceed 2^32 - 1, which is the maximum allowed for
1198 NTFS according to Microsoft.
1199 - Change mft_no member of ntfs_inode structure to be unsigned long.
1200 Update all users. This makes ntfs_inode->mft_no just a copy of struct
1201 inode->i_ino. But we can't just always use struct inode->i_ino and
1202 remove mft_no because extent inodes do not have an attached struct
1203 inode.
1204
12052.0.9 - Decompression engine now uses a single buffer and other cleanups.
1206
1207 - Change decompression engine to use a single buffer protected by a
1208 spin lock instead of per-CPU buffers. (Rusty Russell)
1209 - Do not update cb_pos when handling a partial final page during
1210 decompression of a sparse compression block, as the value is later
1211 reset without being read/used. (Rusty Russell)
1212 - Switch to using the new KM_BIO_SRC_IRQ for atomic kmap()s. (Andrew
1213 Morton)
1214 - Change buffer size in ntfs_readdir()/ntfs_filldir() to use
1215 NLS_MAX_CHARSET_SIZE which makes the buffers almost 1kiB each but
1216 it also makes everything safer so it is a good thing.
1217 - Miscellaneous minor cleanups to comments.
1218
12192.0.8 - Major updates for handling of case sensitivity and dcache aliasing.
1220
1221 Big thanks go to Al Viro and other inhabitants of #kernel for investing
1222 their time to discuss the case sensitivity and dcache aliasing issues.
1223
1224 - Remove unused source file fs/ntfs/attraops.c.
1225 - Remove show_inodes mount option(s), thus dropping support for
1226 displaying of short file names.
1227 - Remove deprecated mount option posix.
1228 - Restore show_sys_files mount option.
1229 - Add new mount option case_sensitive, to determine if the driver
1230 treats file names as case sensitive or not. If case sensitive, create
1231 file names in the POSIX namespace. Otherwise create file names in the
1232 LONG/WIN32 namespace. Note, files remain accessible via their short
1233 file name, if it exists.
1234 - Remove really dumb logic bug in boot sector recovery code.
1235 - Fix dcache aliasing issues wrt short/long file names via changes
1236 to fs/ntfs/dir.c::ntfs_lookup_inode_by_name() and
1237 fs/ntfs/namei.c::ntfs_lookup():
1238 - Add additional argument to ntfs_lookup_inode_by_name() in which we
1239 return information about the matching file name if the case is not
1240 matching or the match is a short file name. See comments above the
1241 function definition for details.
1242 - Change ntfs_lookup() to only create dcache entries for the correctly
1243 cased file name and only for the WIN32 namespace counterpart of DOS
1244 namespace file names. This ensures we have only one dentry per
1245 directory and also removes all dcache aliasing issues between short
1246 and long file names once we add write support. See comments above
1247 function for details.
1248 - Fix potential 1 byte overflow in fs/ntfs/unistr.c::ntfs_ucstonls().
1249
12502.0.7 - Minor cleanups and updates for changes in core kernel code.
1251
1252 - Remove much of the NULL struct element initializers.
1253 - Various updates to make compatible with recent kernels.
1254 - Remove defines of MAX_BUF_PER_PAGE and include linux/buffer_head.h
1255 in fs/ntfs/ntfs.h instead.
1256 - Remove no longer needed KERNEL_VERSION checks. We are now in the
1257 kernel proper so they are no longer needed.
1258
12592.0.6 - Major bugfix to make compatible with other kernel changes.
1260
1261 - Initialize the mftbmp address space properly now that there are more
1262 fields in the struct address_space. This was leading to hangs and
1263 oopses on umount since 2.5.12 because of changes to other parts of
1264 the kernel. We probably want a kernel generic init_address_space()
1265 function...
1266 - Drop BKL from ntfs_readdir() after consultation with Al Viro. The
1267 only caller of ->readdir() is vfs_readdir() which holds i_mutex
1268 during the call, and i_mutex is sufficient protection against changes
1269 in the directory inode (including ->i_size).
1270 - Use generic_file_llseek() for directories (as opposed to
1271 default_llseek()) as this downs i_mutex instead of the BKL which is
1272 what we now need for exclusion against ->f_pos changes considering we
1273 no longer take the BKL in ntfs_readdir().
1274
12752.0.5 - Major bugfix. Buffer overflow in extent inode handling.
1276
1277 - No need to set old blocksize in super.c::ntfs_fill_super() as the
1278 VFS does so via invocation of deactivate_super() calling
1279 fs->fill_super() calling block_kill_super() which does it.
1280 - BKL moved from VFS into dir.c::ntfs_readdir(). (Linus Torvalds)
1281 -> Do we really need it? I don't think so as we have exclusion on
1282 the directory ntfs_inode rw_semaphore mrec_lock. We might have to
1283 move the ->f_pos accesses under the mrec_lock though. Check this...
1284 - Fix really, really, really stupid buffer overflow in extent inode
1285 handling in mft.c::map_extent_mft_record().
1286
12872.0.4 - Cleanups and updates for kernel 2.5.11.
1288
1289 - Add documentation on how to use the MD driver to be able to use NTFS
1290 stripe and volume sets in Linux and generally cleanup documentation
1291 a bit.
1292 Remove all uses of kdev_t in favour of struct block_device *:
1293 - Change compress.c::ntfs_file_read_compressed_block() to use
1294 sb_getblk() instead of getblk().
1295 - Change super.c::ntfs_fill_super() to use bdev_hardsect_size() instead
1296 of get_hardsect_size().
1297 - No need to get old blocksize in super.c::ntfs_fill_super() as
1298 fs/super.c::get_sb_bdev() already does this.
1299 - Set bh->b_bdev instead of bh->b_dev throughout aops.c.
1300
13012.0.3 - Small bug fixes, cleanups, and performance improvements.
1302
1303 - Remove some dead code from mft.c.
1304 - Optimize readpage and read_block functions throughout aops.c so that
1305 only initialized blocks are read. Non-initialized ones have their
1306 buffer head mapped, zeroed, and set up to date, without scheduling
1307 any i/o. Thanks to Al Viro for advice on how to avoid the device i/o.
1308 Thanks go to Andrew Morton for spotting the below:
1309 - Fix buglet in allocate_compression_buffers() error code path.
1310 - Call flush_dcache_page() after modifying page cache page contents in
1311 ntfs_file_readpage().
1312 - Check for existence of page buffers throughout aops.c before calling
1313 create_empty_buffers(). This happens when an I/O error occurs and the
1314 read is retried. (It also happens once writing is implemented so it
1315 needed doing anyway but I had left it for later...)
1316 - Don't BUG_ON() uptodate and/or mapped buffers throughout aops.c in
1317 readpage and read_block functions. Reasoning same as above (i.e. I/O
1318 error retries and future write code paths.)
1319
13202.0.2 - Minor updates and cleanups.
1321
1322 - Cleanup: rename mst.c::__post_read_mst_fixup to post_write_mst_fixup
1323 and cleanup the code a bit, removing the unused size parameter.
1324 - Change default fmask to 0177 and update documentation.
1325 - Change attrib.c::get_attr_search_ctx() to return the search context
1326 directly instead of taking the address of a pointer. A return value
1327 of NULL means the allocation failed. Updated all callers
1328 appropriately.
1329 - Update to 2.5.9 kernel (preserving backwards compatibility) by
1330 replacing all occurrences of page->buffers with page_buffers(page).
1331 - Fix minor bugs in runlist merging, also minor cleanup.
1332 - Updates to bootsector layout and mft mirror contents descriptions.
1333 - Small bug fix in error detection in unistr.c and some cleanups.
1334 - Grow name buffer allocations in unistr.c in aligned multiples of 64
1335 bytes.
1336
13372.0.1 - Minor updates.
1338
1339 - Make default umask correspond to documentation.
1340 - Improve documentation.
1341 - Set default mode to include execute bit. The {u,f,d}mask can be used
1342 to take it away if desired. This allows binaries to be executed from
1343 a mounted ntfs partition.
1344
13452.0.0 - New version number. Remove TNG from the name. Now in the kernel.
1346
1347 - Add kill_super, just keeping up with the vfs changes in the kernel.
1348 - Repeat some changes from tng-0.0.8 that somehow got lost on the way
1349 from the CVS import into BitKeeper.
1350 - Begin to implement proper handling of allocated_size vs
1351 initialized_size vs data_size (i.e. i_size). Done are
1352 mft.c::ntfs_mft_readpage(), aops.c::end_buffer_read_index_async(),
1353 and attrib.c::load_attribute_list().
1354 - Lock the runlist in attrib.c::load_attribute_list() while using it.
1355 - Fix memory leak in ntfs_file_read_compressed_block() and generally
1356 clean up compress.c a little, removing some uncommented/unused debug
1357 code.
1358 - Tidy up dir.c a little bit.
1359 - Don't bother getting the runlist in inode.c::ntfs_read_inode().
1360 - Merge mft.c::ntfs_mft_readpage() and aops.c::ntfs_index_readpage()
1361 creating aops.c::ntfs_mst_readpage(), improving the handling of
1362 holes and overflow in the process and implementing the correct
1363 equivalent of ntfs_file_get_block() in ntfs_mst_readpage() itself.
1364 I am aiming for correctness at the moment. Modularisation can come
1365 later.
1366 - Rename aops.c::end_buffer_read_index_async() to
1367 end_buffer_read_mst_async() and optimize the overflow checking and
1368 handling.
1369 - Use the host of the mftbmp address space mapping to hold the ntfs
1370 volume. This is needed so the async i/o completion handler can
1371 retrieve a pointer to the volume. Hopefully this will not cause
1372 problems elsewhere in the kernel... Otherwise will need to use a
1373 fake inode.
1374 - Complete implementation of proper handling of allocated_size vs
1375 initialized_size vs data_size (i.e. i_size) in whole driver.
1376 Basically aops.c is now completely rewritten.
1377 - Change NTFS driver name to just NTFS and set version number to 2.0.0
1378 to make a clear distinction from the old driver which is still on
1379 version 1.1.22.
1380
1381tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/
1382
1383 - Replace bdevname(sb->s_dev) with sb->s_id.
1384 - Remove now superfluous new-line characters in all callers of
1385 ntfs_debug().
1386 - Apply kludge in ntfs_read_inode(), setting i_nlink to 1 for
1387 directories. Without this the "find" utility gets very upset which is
1388 fair enough as Linux/Unix do not support directory hard links.
1389 - Further runlist merging work. (Richard Russon)
1390 - Backwards compatibility for gcc-2.95. (Richard Russon)
1391 - Update to kernel 2.5.5-pre1 and rediff the now tiny patch.
1392 - Convert to new filesystem declaration using ->ntfs_get_sb() and
1393 replacing ntfs_read_super() with ntfs_fill_super().
1394 - Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index
1395 overflow on 32-bit architectures.
1396 - Cleanup upcase loading code to use ntfs_(un)map_page().
1397 - Disable/reenable preemption in critical sections of compression engine.
1398 - Replace device size determination in ntfs_fill_super() with
1399 sb->s_bdev->bd_inode->i_size (in bytes) and remove now superfluous
1400 function super.c::get_nr_blocks().
1401 - Implement a mount time option (show_inodes) allowing choice of which
1402 types of inode names readdir() returns and modify ntfs_filldir()
1403 accordingly. There are several parameters to show_inodes:
1404 system: system files
1405 win32: long file names (including POSIX file names) [DEFAULT]
1406 long: same as win32
1407 dos: short file names only (excluding POSIX file names)
1408 short: same as dos
1409 posix: same as both win32 and dos
1410 all: all file names
1411 Note that the options are additive, i.e. specifying:
1412 -o show_inodes=system,show_inodes=win32,show_inodes=dos
1413 is the same as specifying:
1414 -o show_inodes=all
1415 Note that the "posix" and "all" options will show all directory
1416 names, BUT the link count on each directory inode entry is set to 1,
1417 due to Linux not supporting directory hard links. This may well
1418 confuse some userspace applications, since the directory names will
1419 have the same inode numbers. Thus it is NOT advisable to use the
1420 "posix" or "all" options. We provide them only for completeness sake.
1421 - Add copies of allocated_size, initialized_size, and compressed_size to
1422 the ntfs inode structure and set them up in
1423 inode.c::ntfs_read_inode(). These reflect the unnamed data attribute
1424 for files and the index allocation attribute for directories.
1425 - Add copies of allocated_size and initialized_size to ntfs inode for
1426 $BITMAP attribute of large directories and set them up in
1427 inode.c::ntfs_read_inode().
1428 - Add copies of allocated_size and initialized_size to ntfs volume for
1429 $BITMAP attribute of $MFT and set them up in
1430 super.c::load_system_files().
1431 - Parse deprecated ntfs driver options (iocharset, show_sys_files,
1432 posix, and utf8) and tell user what the new options to use are. Note
1433 we still do support them but they will be removed with kernel 2.7.x.
1434 - Change all occurrences of integer long long printf formatting to hex
1435 as printk() will not support long long integer format if/when the
1436 div64 patch goes into the kernel.
1437 - Make slab caches have stable names and change the names to what they
1438 were intended to be. These changes are required/made possible by the
1439 new slab cache name handling which removes the length limitation by
1440 requiring the caller of kmem_cache_create() to supply a stable name
1441 which is then referenced but not copied.
1442 - Rename run_list structure to run_list_element and create a new
1443 run_list structure containing a pointer to a run_list_element
1444 structure and a read/write semaphore. Adapt all users of runlists
1445 to new scheme and take and release the lock as needed. This fixes a
1446 nasty race as the run_list changes even when inodes are locked for
1447 reading and even when the inode isn't locked at all, so we really
1448 needed the serialization. We use a semaphore rather than a spinlock
1449 as memory allocations can sleep and doing everything GFP_ATOMIC
1450 would be silly.
1451 - Cleanup read_inode() removing all code checking for lowest_vcn != 0.
1452 This can never happen due to the nature of lookup_attr() and how we
1453 support attribute lists. If it did happen it would imply the inode
1454 being corrupt.
1455 - Check for lowest_vcn != 0 in ntfs_read_inode() and mark the inode as
1456 bad if found.
1457 - Update to 2.5.6-pre2 changes in struct address_space.
1458 - Use parent_ino() when accessing d_parent inode number in dir.c.
1459 - Import Sourceforge CVS repository into BitKeeper repository:
1460 http://linux-ntfs.bkbits.net/ntfs-tng-2.5
1461 - Update fs/Makefile, fs/Config.help, fs/Config.in, and
1462 Documentation/filesystems/ntfs.txt for NTFS TNG.
1463 - Create kernel configuration option controlling whether debugging
1464 is enabled or not.
1465 - Add the required export of end_buffer_io_sync() from the patches
1466 directory to the kernel code.
1467 - Update inode.c::ntfs_show_options() with show_inodes mount option.
1468 - Update errors mount option.
1469
1470tng-0.0.7 - 13/02/2002 - The driver is now feature complete for read-only!
1471
1472 - Cleanup mft.c and its debug/error output in particular. Fix a minor
1473 bug in mapping of extent inodes. Update all the comments to fit all
1474 the recent code changes.
1475 - Modify vcn_to_lcn() to cope with entirely unmapped runlists.
1476 - Cleanups in compress.c, mostly comments and folding help.
1477 - Implement attrib.c::map_run_list() as a generic helper.
1478 - Make compress.c::ntfs_file_read_compressed_block() use map_run_list()
1479 thus making code shorter and enabling attribute list support.
1480 - Cleanup incorrect use of [su]64 with %L printf format specifier in
1481 all source files. Type casts to [unsigned] long long added to correct
1482 the mismatches (important for architectures which have long long not
1483 being 64 bits).
1484 - Merge async io completion handlers for directory indexes and $MFT
1485 data into one by setting the index_block_size{_bits} of the ntfs
1486 inode for $MFT to the mft_record_size{_bits} of the ntfs_volume.
1487 - Cleanup aops.c, update comments.
1488 - Make ntfs_file_get_block() use map_run_list() so all files now
1489 support attribute lists.
1490 - Make ntfs_dir_readpage() almost verbatim copy of
1491 block_read_full_page() by using ntfs_file_get_block() with only real
1492 difference being the use of our own async io completion handler
1493 rather than the default one, thus reducing the amount of code and
1494 automatically enabling attribute list support for directory indices.
1495 - Fix bug in load_attribute_list() - forgot to call brelse in error
1496 code path.
1497 - Change parameters to find_attr() and lookup_attr(). We no longer
1498 pass in the upcase table and its length. These can be gotten from
1499 ctx->ntfs_ino->vol->upcase{_len}. Update all callers.
1500 - Cleanups in attrib.c.
1501 - Implement merging of runlists, attrib.c::merge_run_lists() and its
1502 helpers. (Richard Russon)
1503 - Attribute lists part 2, attribute extents and multi part runlists:
1504 enable proper support for LCN_RL_NOT_MAPPED and automatic mapping of
1505 further runlist parts via attrib.c::map_run_list().
1506 - Tiny endianness bug fix in decompress_mapping_pairs().
1507
1508tng-0.0.6 - Encrypted directories, bug fixes, cleanups, debugging enhancements.
1509
1510 - Enable encrypted directories. (Their index root is marked encrypted
1511 to indicate that new files in that directory should be created
1512 encrypted.)
1513 - Fix bug in NInoBmpNonResident() macro. (Cut and paste error.)
1514 - Enable $Extend system directory. Most (if not all) extended system
1515 files do not have unnamed data attributes so ntfs_read_inode() had to
1516 special case them but that is ok, as the special casing recovery
1517 happens inside an error code path so there is zero slow down in the
1518 normal fast path. The special casing is done by introducing a new
1519 function inode.c::ntfs_is_extended_system_file() which checks if any
1520 of the hard links in the inode point to $Extend as being their parent
1521 directory and if they do we assume this is an extended system file.
1522 - Create a sysctl/proc interface to allow {dis,en}abling of debug output
1523 when compiled with -DDEBUG. Default is debug messages to be disabled.
1524 To enable them, one writes a non-zero value to /proc/sys/fs/ntfs-debug
1525 (if /proc is enabled) or uses sysctl(2) to effect the same (if sysctl
1526 interface is enabled). Inspired by old ntfs driver.
1527 - Add debug_msgs insmod/kernel boot parameter to set whether debug
1528 messages are {dis,en}abled. This is useful to enable debug messages
1529 during ntfs initialization and is the only way to activate debugging
1530 when the sysctl interface is not enabled.
1531 - Cleanup debug output in various places.
1532 - Remove all dollar signs ($) from the source (except comments) to
1533 enable compilation on architectures whose gcc compiler does not
1534 support dollar signs in the names of variables/constants. Attribute
1535 types now start with AT_ instead of $ and $I30 is now just I30.
1536 - Cleanup ntfs_lookup() and add consistency check of sequence numbers.
1537 - Load complete runlist for $MFT/$BITMAP during mount and cleanup
1538 access functions. This means we now cope with $MFT/$BITMAP being
1539 spread across several mft records.
1540 - Disable modification of mft_zone_multiplier on remount. We can always
1541 reenable this later on if we really want to, but we will need to make
1542 sure we readjust the mft_zone size / layout accordingly.
1543
1544tng-0.0.5 - Modernize for 2.5.x and further in line-ing with Al Viro's comments.
1545
1546 - Use sb_set_blocksize() instead of set_blocksize() and verify the
1547 return value.
1548 - Use sb_bread() instead of bread() throughout.
1549 - Add index_vcn_size{_bits} to ntfs_inode structure to store the size
1550 of a directory index block vcn. Apply resulting simplifications in
1551 dir.c everywhere.
1552 - Fix a small bug somewhere (but forgot what it was).
1553 - Change ntfs_{debug,error,warning} to enable gcc to do type checking
1554 on the printf-format parameter list and fix bugs reported by gcc
1555 as a result. (Richard Russon)
1556 - Move inode allocation strategy to Al's new stuff but maintain the
1557 divorce of ntfs_inode from struct inode. To achieve this we have two
1558 separate slab caches, one for big ntfs inodes containing a struct
1559 inode and one for pure ntfs inodes, and at the same time fix some
1560 faulty error code paths in ntfs_read_inode().
1561 - Show mount options in proc (inode.c::ntfs_show_options()).
1562
1563tng-0.0.4 - Big changes, getting in line with Al Viro's comments.
1564
1565 - Modified (un)map_mft_record functions to be common for read and write
1566 case. To specify which is which, added extra parameter at front of
1567 parameter list. Pass either READ or WRITE to this, each has the
1568 obvious meaning.
1569 - General cleanups to allow for easier folding in vi.
1570 - attrib.c::decompress_mapping_pairs() now accepts the old runlist
1571 argument, and invokes attrib.c::merge_run_lists() to merge the old
1572 and the new runlists.
1573 - Removed attrib.c::find_first_attr().
1574 - Implemented loading of attribute list and complete runlist for $MFT.
1575 This means we now cope with $MFT being spread across several mft
1576 records.
1577 - Adapt to 2.5.2-pre9 and the changed create_empty_buffers() syntax.
1578 - Adapt major/minor/kdev_t/[bk]devname stuff to new 2.5.x kernels.
1579 - Make ntfs_volume be allocated via kmalloc() instead of using a slab
1580 cache. There are too few ntfs_volume structures at any one time
1581 to justify a private slab cache.
1582 - Fix bogus kmap() use in async io completion. Now use kmap_atomic().
1583 Use KM_BIO_IRQ on advice from IRC/kernel...
1584 - Use ntfs_map_page() in map_mft_record() and create ->readpage method
1585 for reading $MFT (ntfs_mft_readpage). In the process create dedicated
1586 address space operations (ntfs_mft_aops) for $MFT inode mapping. Also
1587 removed the now superfluous exports from the kernel core patch.
1588 - Fix a bug where kfree() was used instead of ntfs_free().
1589 - Change map_mft_record() to take ntfs_inode as argument instead of
1590 vfs inode. Ditto for unmap_mft_record(). Adapt all callers.
1591 - Add pointer to ntfs_volume to ntfs_inode.
1592 - Add mft record number and sequence number to ntfs_inode. Stop using
1593 i_ino and i_generation for in-driver purposes.
1594 - Implement attrib.c::merge_run_lists(). (Richard Russon)
1595 - Remove use of proper inodes by extent inodes. Move i_ino and
1596 i_generation to ntfs_inode to do this. Apply simplifications that
1597 result and remove iget_no_wait(), etc.
1598 - Pass ntfs_inode everywhere in the driver (used to be struct inode).
1599 - Add reference counting in ntfs_inode for the ntfs inode itself and
1600 for the mapped mft record.
1601 - Extend mft record mapping so we can (un)map extent mft records (new
1602 functions (un)map_extent_mft_record), and so mappings are reference
1603 counted and don't have to happen twice if already mapped - just ref
1604 count increases.
1605 - Add -o iocharset as alias to -o nls for backwards compatibility.
1606 - The latest core patch is now tiny. In fact just a single additional
1607 export is necessary over the base kernel.
1608
1609tng-0.0.3 - Cleanups, enhancements, bug fixes.
1610
1611 - Work on attrib.c::decompress_mapping_pairs() to detect base extents
1612 and setup the runlist appropriately using knowledge provided by the
1613 sizes in the base attribute record.
1614 - Balance the get_/put_attr_search_ctx() calls so we don't leak memory
1615 any more.
1616 - Introduce ntfs_malloc_nofs() and ntfs_free() to allocate/free a single
1617 page or use vmalloc depending on the amount of memory requested (sketched below, after this list).
1618 - Cleanup error output. The __FUNCTION__ "(): " is now added
1619 automatically. Introduced a new header file debug.h to support this
1620 and also moved ntfs_debug() function into it.
1621 - Make reading of compressed files more intelligent and especially get
1622 rid of the vmalloc_nofs() from readpage(). This now uses per CPU
1623 buffers (allocated at first mount with cluster size <= 4kiB and
1624 deallocated on last umount with cluster size <= 4kiB), and
1625 asynchronous io for the compressed data using a list of buffer heads.
1626 Er, we use synchronous io as async io only works on whole pages
1627 covered by buffers and not on individual buffer heads...
1628 - Bug fix for reading compressed files with sparse compression blocks.
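
        The allocation strategy sketched (an approximation of the helper
        in fs/ntfs/malloc.h, using the three-argument __vmalloc() of that
        era; not the exact driver code):

                static inline void *ntfs_malloc_nofs(unsigned long size)
                {
                        if (likely(size <= PAGE_SIZE)) {
                                /* Up to one page: the slab allocator is
                                 * cheaper. */
                                return kmalloc(PAGE_SIZE, GFP_NOFS);
                        }
                        /* Larger requests: virtually contiguous memory,
                         * still GFP_NOFS to avoid recursing into the fs. */
                        return __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
                }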
1629
1630tng-0.0.2 - Now handles larger/fragmented/compressed volumes/files/dirs.
1631
1632 - Fixed handling of directories when cluster size exceeds index block
1633 size.
1634 - Hide DOS only name space directory entries from readdir() but allow
1635 them in lookup(). This should fix the problem that Linux doesn't
1636 support directory hard links, while still allowing access to entries
1637 via their short file name. This also has the benefit of mimicking
1638 what Windows users are used to, so it is the ideal solution.
1639 - Implemented sync_page everywhere so no more hangs in D state when
1640 waiting for a page.
1641 - Stop using bforget() in favour of brelse().
1642 - Stop locking buffers unnecessarily.
1643 - Implemented compressed files (inode->mapping contains uncompressed
1644 data, raw compressed data is currently bread() into a vmalloc()ed
1645 memory buffer).
1646 - Enable compressed directories. (Their index root is marked compressed
1647 to indicate that new files in that directory should be created
1648 compressed.)
1649 - Use vsnprintf rather than vsprintf in the ntfs_error and ntfs_warning
1650 functions. (Thanks to Will Dyson for pointing this out.)
1651 - Moved the ntfs_inode and ntfs_volume (the former ntfs_inode_info and
1652 ntfs_sb_info) out of the common inode and super_block structures and
1653 started using the generic_ip and generic_sbp pointers instead. This
1654 makes ntfs entirely private with respect to the kernel tree.
1655 - Detect compiler version and abort with error message if gcc less than
1656 2.96 is used.
1657 - Fix bug in name comparison function in unistr.c.
1658 - Implement attribute lists part 1, the infrastructure: search contexts
1659 and operations, find_external_attr() and lookup_attr(), and make the
1660 code use the infrastructure.
1661 - Fix stupid buffer overflow bug that became apparent on larger run
1662 list containing attributes.
1663 - Fix bugs in readdir() that became apparent on larger directories.
1664
1665 The driver is now really useful and survives the test
1666 find . -type f -exec md5sum "{}" \;
1667 without any error messages on an over 1GiB sized partition with >16k
1668 files on it, including compressed files and directories and many files
1669 and directories with attribute lists.
1670
1671tng-0.0.1 - The first useful version.
1672
1673 - Added ntfs_lookup().
1674 - Added default upcase generation and handling.
1675 - Added compile options to be shown on module init.
1676 - Many bug fixes that were "hidden" before.
1677 - Update to latest kernel.
1678 - Added ntfs_readdir().
1679 - Added file operations for mmap(), read(), open() and llseek(). We just
1680 use the generic ones. The whole point of going through implementing
1681    readpage() methods and where possible get_block() callbacks is that
1682 this allows us to make use of the generic high level methods provided
1683 by the kernel.
1684
1685 The driver is now actually useful! Yey. (-: It undoubtedly has got bugs
1686    though and it doesn't implement accessing compressed files yet. Also,
1687 accessing files with attribute list attributes is not implemented yet
1688 either. But for small or simple filesystems it should work and allow
1689 you to list directories, use stat on directory entries and the file
1690    system, open, read, mmap and llseek around in files. A big milestone
1691 has been reached!
1692
1693tng-0.0.0 - Initial version tag.
1694
1695 Initial driver implementation. The driver can mount and umount simple
1696 NTFS filesystems (i.e. ones without attribute lists in the system
1697 files). If the mount fails there might be problems in the error handling
1698 code paths, so be warned. Otherwise it seems to be loading the system
1699 files nicely and the mft record read mapping/unmapping seems to be
1700    working nicely, too. Proof of concept for keeping inode metadata and
1701    non-resident unnamed file stream data in the page cache is thus
1702    complete.
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfce53cb65d7..c3c2c7ac9020 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/gfp.h>
26#include <linux/mm.h> 27#include <linux/mm.h>
27#include <linux/pagemap.h> 28#include <linux/pagemap.h>
28#include <linux/swap.h> 29#include <linux/swap.h>
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 50d3b0c258e3..f5094ee224c1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
27 28
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 08f7530e9341..6551c7cbad92 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,7 @@
25#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/slab.h>
28 29
29#include "attrib.h" 30#include "attrib.h"
30#include "inode.h" 31#include "inode.h"
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 9173e82a45d1..fe44d3feee4a 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24 25
25#include "dir.h" 26#include "dir.h"
26#include "aops.h" 27#include "aops.h"
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index b681c71d7069..8804f093ba75 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/gfp.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/pagevec.h> 25#include <linux/pagevec.h>
25#include <linux/sched.h> 26#include <linux/sched.h>
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 2194eff49743..096c135691ae 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -19,6 +19,8 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/slab.h>
23
22#include "aops.h" 24#include "aops.h"
23#include "collate.h" 25#include "collate.h"
24#include "debug.h" 26#include "debug.h"
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 1caa0ef0b2bb..b572b6727181 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24#include <linux/swap.h> 25#include <linux/swap.h>
25 26
26#include "attrib.h" 27#include "attrib.h"
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2ca00153b6ec..358273e59ade 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -23,6 +23,7 @@
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/exportfs.h> 24#include <linux/exportfs.h>
25#include <linux/security.h> 25#include <linux/security.h>
26#include <linux/slab.h>
26 27
27#include "attrib.h" 28#include "attrib.h"
28#include "debug.h" 29#include "debug.h"
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 1cf39dfaee7a..0de1db6cddbf 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -31,6 +31,7 @@
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
34#include <linux/bitmap.h>
34 35
35#include "sysctl.h" 36#include "sysctl.h"
36#include "logfile.h" 37#include "logfile.h"
@@ -2458,7 +2459,6 @@ static void ntfs_put_super(struct super_block *sb)
2458static s64 get_nr_free_clusters(ntfs_volume *vol) 2459static s64 get_nr_free_clusters(ntfs_volume *vol)
2459{ 2460{
2460 s64 nr_free = vol->nr_clusters; 2461 s64 nr_free = vol->nr_clusters;
2461 u32 *kaddr;
2462 struct address_space *mapping = vol->lcnbmp_ino->i_mapping; 2462 struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
2463 struct page *page; 2463 struct page *page;
2464 pgoff_t index, max_index; 2464 pgoff_t index, max_index;
@@ -2477,7 +2477,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2477 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", 2477 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
2478 max_index, PAGE_CACHE_SIZE / 4); 2478 max_index, PAGE_CACHE_SIZE / 4);
2479 for (index = 0; index < max_index; index++) { 2479 for (index = 0; index < max_index; index++) {
2480 unsigned int i; 2480 unsigned long *kaddr;
2481
2481 /* 2482 /*
2482 * Read the page from page cache, getting it from backing store 2483 * Read the page from page cache, getting it from backing store
2483 * if necessary, and increment the use count. 2484 * if necessary, and increment the use count.
@@ -2490,16 +2491,16 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2490 nr_free -= PAGE_CACHE_SIZE * 8; 2491 nr_free -= PAGE_CACHE_SIZE * 8;
2491 continue; 2492 continue;
2492 } 2493 }
2493 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2494 kaddr = kmap_atomic(page, KM_USER0);
2494 /* 2495 /*
2495 * For each 4 bytes, subtract the number of set bits. If this 2496 * Subtract the number of set bits. If this
2496 * is the last page and it is partial we don't really care as 2497 * is the last page and it is partial we don't really care as
2497 * it just means we do a little extra work but it won't affect 2498 * it just means we do a little extra work but it won't affect
2498 * the result as all out of range bytes are set to zero by 2499 * the result as all out of range bytes are set to zero by
2499 * ntfs_readpage(). 2500 * ntfs_readpage().
2500 */ 2501 */
2501 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2502 nr_free -= bitmap_weight(kaddr,
2502 nr_free -= (s64)hweight32(kaddr[i]); 2503 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2503 kunmap_atomic(kaddr, KM_USER0); 2504 kunmap_atomic(kaddr, KM_USER0);
2504 page_cache_release(page); 2505 page_cache_release(page);
2505 } 2506 }
@@ -2538,7 +2539,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2538static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, 2539static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2539 s64 nr_free, const pgoff_t max_index) 2540 s64 nr_free, const pgoff_t max_index)
2540{ 2541{
2541 u32 *kaddr;
2542 struct address_space *mapping = vol->mftbmp_ino->i_mapping; 2542 struct address_space *mapping = vol->mftbmp_ino->i_mapping;
2543 struct page *page; 2543 struct page *page;
2544 pgoff_t index; 2544 pgoff_t index;
@@ -2548,7 +2548,8 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2548 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " 2548 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
2549 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); 2549 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
2550 for (index = 0; index < max_index; index++) { 2550 for (index = 0; index < max_index; index++) {
2551 unsigned int i; 2551 unsigned long *kaddr;
2552
2552 /* 2553 /*
2553 * Read the page from page cache, getting it from backing store 2554 * Read the page from page cache, getting it from backing store
2554 * if necessary, and increment the use count. 2555 * if necessary, and increment the use count.
@@ -2561,16 +2562,16 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2561 nr_free -= PAGE_CACHE_SIZE * 8; 2562 nr_free -= PAGE_CACHE_SIZE * 8;
2562 continue; 2563 continue;
2563 } 2564 }
2564 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2565 kaddr = kmap_atomic(page, KM_USER0);
2565 /* 2566 /*
2566 * For each 4 bytes, subtract the number of set bits. If this 2567 * Subtract the number of set bits. If this
2567 * is the last page and it is partial we don't really care as 2568 * is the last page and it is partial we don't really care as
2568 * it just means we do a little extra work but it won't affect 2569 * it just means we do a little extra work but it won't affect
2569 * the result as all out of range bytes are set to zero by 2570 * the result as all out of range bytes are set to zero by
2570 * ntfs_readpage(). 2571 * ntfs_readpage().
2571 */ 2572 */
2572 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2573 nr_free -= bitmap_weight(kaddr,
2573 nr_free -= (s64)hweight32(kaddr[i]); 2574 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2574 kunmap_atomic(kaddr, KM_USER0); 2575 kunmap_atomic(kaddr, KM_USER0);
2575 page_cache_release(page); 2576 page_cache_release(page);
2576 } 2577 }
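
The two hunks above replace a hand-rolled loop (hweight32() over each
u32 in the page) with a single bitmap_weight() call across the whole
mapped page. A standalone sketch of the pattern with illustrative
names; only bitmap_weight() and the constants come from the diff:

	#include <linux/bitmap.h>
	#include <linux/bitops.h>
	#include <linux/pagemap.h>
	#include <linux/types.h>

	/* Subtract every set bit in one kmapped bitmap page from nr_free. */
	static s64 subtract_page_weight(const unsigned long *kaddr, s64 nr_free)
	{
		/* one call counts all PAGE_CACHE_SIZE * 8 bits in the page */
		return nr_free - bitmap_weight(kaddr,
					       PAGE_CACHE_SIZE * BITS_PER_BYTE);
	}
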
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 791c0886c060..07d9fd854350 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -29,6 +29,7 @@ ocfs2-objs := \
29 mmap.o \ 29 mmap.o \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \
32 resize.o \ 33 resize.o \
33 slot_map.o \ 34 slot_map.o \
34 suballoc.o \ 35 suballoc.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0501974bedd0..e13fc9e8fcdc 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h>
24#include <linux/string.h> 25#include <linux/string.h>
25 26
26#define MLOG_MASK_PREFIX ML_INODE 27#define MLOG_MASK_PREFIX ML_INODE
@@ -30,6 +31,8 @@
30#include "alloc.h" 31#include "alloc.h"
31#include "dlmglue.h" 32#include "dlmglue.h"
32#include "file.h" 33#include "file.h"
34#include "inode.h"
35#include "journal.h"
33#include "ocfs2_fs.h" 36#include "ocfs2_fs.h"
34 37
35#include "xattr.h" 38#include "xattr.h"
@@ -166,6 +169,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
166} 169}
167 170
168/* 171/*
172 * Helper function to set i_mode in memory and disk. Some call paths
173 * will not have di_bh or a journal handle to pass, in which case it
174 * will create its own.
175 */
176static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
177 handle_t *handle, umode_t new_mode)
178{
179 int ret, commit_handle = 0;
180 struct ocfs2_dinode *di;
181
182 if (di_bh == NULL) {
183 ret = ocfs2_read_inode_block(inode, &di_bh);
184 if (ret) {
185 mlog_errno(ret);
186 goto out;
187 }
188 } else
189 get_bh(di_bh);
190
191 if (handle == NULL) {
192 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
193 OCFS2_INODE_UPDATE_CREDITS);
194 if (IS_ERR(handle)) {
195 ret = PTR_ERR(handle);
196 mlog_errno(ret);
197 goto out_brelse;
198 }
199
200 commit_handle = 1;
201 }
202
203 di = (struct ocfs2_dinode *)di_bh->b_data;
204 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
205 OCFS2_JOURNAL_ACCESS_WRITE);
206 if (ret) {
207 mlog_errno(ret);
208 goto out_commit;
209 }
210
211 inode->i_mode = new_mode;
212 di->i_mode = cpu_to_le16(inode->i_mode);
213
214 ocfs2_journal_dirty(handle, di_bh);
215
216out_commit:
217 if (commit_handle)
218 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
219out_brelse:
220 brelse(di_bh);
221out:
222 return ret;
223}
224
225/*
169 * Set the access or default ACL of an inode. 226 * Set the access or default ACL of an inode.
170 */ 227 */
171static int ocfs2_set_acl(handle_t *handle, 228static int ocfs2_set_acl(handle_t *handle,
@@ -193,9 +250,14 @@ static int ocfs2_set_acl(handle_t *handle,
193 if (ret < 0) 250 if (ret < 0)
194 return ret; 251 return ret;
195 else { 252 else {
196 inode->i_mode = mode;
197 if (ret == 0) 253 if (ret == 0)
198 acl = NULL; 254 acl = NULL;
255
256 ret = ocfs2_acl_set_mode(inode, di_bh,
257 handle, mode);
258 if (ret)
259 return ret;
260
199 } 261 }
200 } 262 }
201 break; 263 break;
@@ -283,6 +345,7 @@ int ocfs2_init_acl(handle_t *handle,
283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
284 struct posix_acl *acl = NULL; 346 struct posix_acl *acl = NULL;
285 int ret = 0; 347 int ret = 0;
348 mode_t mode;
286 349
287 if (!S_ISLNK(inode->i_mode)) { 350 if (!S_ISLNK(inode->i_mode)) {
288 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 351 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -291,12 +354,17 @@ int ocfs2_init_acl(handle_t *handle,
291 if (IS_ERR(acl)) 354 if (IS_ERR(acl))
292 return PTR_ERR(acl); 355 return PTR_ERR(acl);
293 } 356 }
294 if (!acl) 357 if (!acl) {
295 inode->i_mode &= ~current_umask(); 358 mode = inode->i_mode & ~current_umask();
359 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
360 if (ret) {
361 mlog_errno(ret);
362 goto cleanup;
363 }
364 }
296 } 365 }
297 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 366 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
298 struct posix_acl *clone; 367 struct posix_acl *clone;
299 mode_t mode;
300 368
301 if (S_ISDIR(inode->i_mode)) { 369 if (S_ISDIR(inode->i_mode)) {
302 ret = ocfs2_set_acl(handle, inode, di_bh, 370 ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -313,7 +381,7 @@ int ocfs2_init_acl(handle_t *handle,
313 mode = inode->i_mode; 381 mode = inode->i_mode;
314 ret = posix_acl_create_masq(clone, &mode); 382 ret = posix_acl_create_masq(clone, &mode);
315 if (ret >= 0) { 383 if (ret >= 0) {
316 inode->i_mode = mode; 384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
317 if (ret > 0) { 385 if (ret > 0) {
318 ret = ocfs2_set_acl(handle, inode, 386 ret = ocfs2_set_acl(handle, inode,
319 di_bh, ACL_TYPE_ACCESS, 387 di_bh, ACL_TYPE_ACCESS,
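
The new ocfs2_acl_set_mode() helper above journals every i_mode update.
A hypothetical caller sketch, not from the patch; the helper is static
in acl.c, so this is purely illustrative of the two ways in:

	/*
	 * With NULL di_bh and handle, the helper reads the inode block and
	 * runs its own OCFS2_INODE_UPDATE_CREDITS transaction; with both
	 * supplied, it piggybacks on the caller's handle and buffer_head.
	 */
	static int example_set_mode(struct inode *inode, umode_t new_mode)
	{
		return ocfs2_acl_set_mode(inode, NULL, NULL, new_mode);
	}
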
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9f8bd913c51e..215e12ce1d85 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1006 int count, status, i; 1006 int count, status, i;
1007 u16 suballoc_bit_start; 1007 u16 suballoc_bit_start;
1008 u32 num_got; 1008 u32 num_got;
1009 u64 first_blkno; 1009 u64 suballoc_loc, first_blkno;
1010 struct ocfs2_super *osb = 1010 struct ocfs2_super *osb =
1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1012 struct ocfs2_extent_block *eb; 1012 struct ocfs2_extent_block *eb;
@@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1015 1015
1016 count = 0; 1016 count = 0;
1017 while (count < wanted) { 1017 while (count < wanted) {
1018 status = ocfs2_claim_metadata(osb, 1018 status = ocfs2_claim_metadata(handle,
1019 handle,
1020 meta_ac, 1019 meta_ac,
1021 wanted - count, 1020 wanted - count,
1021 &suballoc_loc,
1022 &suballoc_bit_start, 1022 &suballoc_bit_start,
1023 &num_got, 1023 &num_got,
1024 &first_blkno); 1024 &first_blkno);
@@ -1052,6 +1052,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot); 1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1055 eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1056 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1056 eb->h_list.l_count = 1057 eb->h_list.l_count =
1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1058 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1061,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1061 1062
1062 /* We'll also be dirtied by the caller, so 1063 /* We'll also be dirtied by the caller, so
1063 * this isn't absolutely necessary. */ 1064 * this isn't absolutely necessary. */
1064 status = ocfs2_journal_dirty(handle, bhs[i]); 1065 ocfs2_journal_dirty(handle, bhs[i]);
1065 if (status < 0) {
1066 mlog_errno(status);
1067 goto bail;
1068 }
1069 } 1066 }
1070 1067
1071 count += num_got; 1068 count += num_got;
@@ -1129,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1129 goto out; 1126 goto out;
1130 } 1127 }
1131 1128
1132 status = ocfs2_extend_trans(handle, path_num_items(path) + 1129 status = ocfs2_extend_trans(handle, path_num_items(path));
1133 handle->h_buffer_credits);
1134 if (status < 0) { 1130 if (status < 0) {
1135 mlog_errno(status); 1131 mlog_errno(status);
1136 goto out; 1132 goto out;
@@ -1270,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle,
1270 if (!eb_el->l_tree_depth) 1266 if (!eb_el->l_tree_depth)
1271 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 1267 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1272 1268
1273 status = ocfs2_journal_dirty(handle, bh); 1269 ocfs2_journal_dirty(handle, bh);
1274 if (status < 0) {
1275 mlog_errno(status);
1276 goto bail;
1277 }
1278
1279 next_blkno = le64_to_cpu(eb->h_blkno); 1270 next_blkno = le64_to_cpu(eb->h_blkno);
1280 } 1271 }
1281 1272
@@ -1321,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle,
1321 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 1312 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1322 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 1313 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1323 1314
1324 status = ocfs2_journal_dirty(handle, *last_eb_bh); 1315 ocfs2_journal_dirty(handle, *last_eb_bh);
1325 if (status < 0) 1316 ocfs2_journal_dirty(handle, et->et_root_bh);
1326 mlog_errno(status); 1317 if (eb_bh)
1327 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1318 ocfs2_journal_dirty(handle, eb_bh);
1328 if (status < 0)
1329 mlog_errno(status);
1330 if (eb_bh) {
1331 status = ocfs2_journal_dirty(handle, eb_bh);
1332 if (status < 0)
1333 mlog_errno(status);
1334 }
1335 1319
1336 /* 1320 /*
1337 * Some callers want to track the rightmost leaf so pass it 1321 * Some callers want to track the rightmost leaf so pass it
@@ -1399,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1399 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) 1383 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1400 eb_el->l_recs[i] = root_el->l_recs[i]; 1384 eb_el->l_recs[i] = root_el->l_recs[i];
1401 1385
1402 status = ocfs2_journal_dirty(handle, new_eb_bh); 1386 ocfs2_journal_dirty(handle, new_eb_bh);
1403 if (status < 0) {
1404 mlog_errno(status);
1405 goto bail;
1406 }
1407 1387
1408 status = ocfs2_et_root_journal_access(handle, et, 1388 status = ocfs2_et_root_journal_access(handle, et,
1409 OCFS2_JOURNAL_ACCESS_WRITE); 1389 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1428,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1428 if (root_el->l_tree_depth == cpu_to_le16(1)) 1408 if (root_el->l_tree_depth == cpu_to_le16(1))
1429 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 1409 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1430 1410
1431 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1411 ocfs2_journal_dirty(handle, et->et_root_bh);
1432 if (status < 0) {
1433 mlog_errno(status);
1434 goto bail;
1435 }
1436 1412
1437 *ret_new_eb_bh = new_eb_bh; 1413 *ret_new_eb_bh = new_eb_bh;
1438 new_eb_bh = NULL; 1414 new_eb_bh = NULL;
@@ -2064,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2064 struct ocfs2_path *right_path, 2040 struct ocfs2_path *right_path,
2065 int subtree_index) 2041 int subtree_index)
2066{ 2042{
2067 int ret, i, idx; 2043 int i, idx;
2068 struct ocfs2_extent_list *el, *left_el, *right_el; 2044 struct ocfs2_extent_list *el, *left_el, *right_el;
2069 struct ocfs2_extent_rec *left_rec, *right_rec; 2045 struct ocfs2_extent_rec *left_rec, *right_rec;
2070 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 2046 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -2102,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2102 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 2078 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2103 right_el); 2079 right_el);
2104 2080
2105 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 2081 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2106 if (ret) 2082 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2107 mlog_errno(ret);
2108
2109 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2110 if (ret)
2111 mlog_errno(ret);
2112 2083
2113 /* 2084 /*
2114 * Setup our list pointers now so that the current 2085 * Setup our list pointers now so that the current
@@ -2132,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2132 2103
2133 root_bh = left_path->p_node[subtree_index].bh; 2104 root_bh = left_path->p_node[subtree_index].bh;
2134 2105
2135 ret = ocfs2_journal_dirty(handle, root_bh); 2106 ocfs2_journal_dirty(handle, root_bh);
2136 if (ret)
2137 mlog_errno(ret);
2138} 2107}
2139 2108
2140static int ocfs2_rotate_subtree_right(handle_t *handle, 2109static int ocfs2_rotate_subtree_right(handle_t *handle,
@@ -2207,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2207 2176
2208 ocfs2_create_empty_extent(right_el); 2177 ocfs2_create_empty_extent(right_el);
2209 2178
2210 ret = ocfs2_journal_dirty(handle, right_leaf_bh); 2179 ocfs2_journal_dirty(handle, right_leaf_bh);
2211 if (ret) {
2212 mlog_errno(ret);
2213 goto out;
2214 }
2215 2180
2216 /* Do the copy now. */ 2181 /* Do the copy now. */
2217 i = le16_to_cpu(left_el->l_next_free_rec) - 1; 2182 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2230,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2230 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2195 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2231 le16_add_cpu(&left_el->l_next_free_rec, 1); 2196 le16_add_cpu(&left_el->l_next_free_rec, 1);
2232 2197
2233 ret = ocfs2_journal_dirty(handle, left_leaf_bh); 2198 ocfs2_journal_dirty(handle, left_leaf_bh);
2234 if (ret) {
2235 mlog_errno(ret);
2236 goto out;
2237 }
2238 2199
2239 ocfs2_complete_edge_insert(handle, left_path, right_path, 2200 ocfs2_complete_edge_insert(handle, left_path, right_path,
2240 subtree_index); 2201 subtree_index);
@@ -2249,8 +2210,8 @@ out:
2249 * 2210 *
2250 * Will return zero if the path passed in is already the leftmost path. 2211 * Will return zero if the path passed in is already the leftmost path.
2251 */ 2212 */
2252static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 2213int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2253 struct ocfs2_path *path, u32 *cpos) 2214 struct ocfs2_path *path, u32 *cpos)
2254{ 2215{
2255 int i, j, ret = 0; 2216 int i, j, ret = 0;
2256 u64 blkno; 2217 u64 blkno;
@@ -2327,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2327 int op_credits, 2288 int op_credits,
2328 struct ocfs2_path *path) 2289 struct ocfs2_path *path)
2329{ 2290{
2330 int ret; 2291 int ret = 0;
2331 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2292 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2332 2293
2333 if (handle->h_buffer_credits < credits) { 2294 if (handle->h_buffer_credits < credits)
2334 ret = ocfs2_extend_trans(handle, 2295 ret = ocfs2_extend_trans(handle,
2335 credits - handle->h_buffer_credits); 2296 credits - handle->h_buffer_credits);
2336 if (ret)
2337 return ret;
2338 2297
2339 if (unlikely(handle->h_buffer_credits < credits)) 2298 return ret;
2340 return ocfs2_extend_trans(handle, credits);
2341 }
2342
2343 return 0;
2344} 2299}
2345 2300
2346/* 2301/*
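
The credit-handling hunks in this file stop adding handle->h_buffer_credits
to the amount passed to ocfs2_extend_trans(); after this merge the callers
treat it as taking the additional credits wanted, not a new total. A hedged
sketch of the resulting caller pattern (names illustrative):

	/* Make sure the handle has at least 'want' credits available. */
	static int example_ensure_credits(handle_t *handle, int want)
	{
		if (handle->h_buffer_credits >= want)
			return 0;	/* already enough reserved */
		/* pass only the shortfall, never want + h_buffer_credits */
		return ocfs2_extend_trans(handle,
					  want - handle->h_buffer_credits);
	}
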
@@ -2584,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
2584 * records for all the bh in the path. 2539 * records for all the bh in the path.
2585 * So we have to allocate extra credits and access them. 2540 * So we have to allocate extra credits and access them.
2586 */ 2541 */
2587 ret = ocfs2_extend_trans(handle, 2542 ret = ocfs2_extend_trans(handle, subtree_index);
2588 handle->h_buffer_credits + subtree_index);
2589 if (ret) { 2543 if (ret) {
2590 mlog_errno(ret); 2544 mlog_errno(ret);
2591 goto out; 2545 goto out;
@@ -2823,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2823 ocfs2_remove_empty_extent(right_leaf_el); 2777 ocfs2_remove_empty_extent(right_leaf_el);
2824 } 2778 }
2825 2779
2826 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2780 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2827 if (ret) 2781 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2828 mlog_errno(ret);
2829 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2830 if (ret)
2831 mlog_errno(ret);
2832 2782
2833 if (del_right_subtree) { 2783 if (del_right_subtree) {
2834 ocfs2_unlink_subtree(handle, et, left_path, right_path, 2784 ocfs2_unlink_subtree(handle, et, left_path, right_path,
@@ -2851,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2851 if (right_has_empty) 2801 if (right_has_empty)
2852 ocfs2_remove_empty_extent(left_leaf_el); 2802 ocfs2_remove_empty_extent(left_leaf_el);
2853 2803
2854 ret = ocfs2_journal_dirty(handle, et_root_bh); 2804 ocfs2_journal_dirty(handle, et_root_bh);
2855 if (ret)
2856 mlog_errno(ret);
2857 2805
2858 *deleted = 1; 2806 *deleted = 1;
2859 } else 2807 } else
@@ -2962,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2962 } 2910 }
2963 2911
2964 ocfs2_remove_empty_extent(el); 2912 ocfs2_remove_empty_extent(el);
2965 2913 ocfs2_journal_dirty(handle, bh);
2966 ret = ocfs2_journal_dirty(handle, bh);
2967 if (ret)
2968 mlog_errno(ret);
2969 2914
2970out: 2915out:
2971 return ret; 2916 return ret;
@@ -3506,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3506 3451
3507 ocfs2_cleanup_merge(el, index); 3452 ocfs2_cleanup_merge(el, index);
3508 3453
3509 ret = ocfs2_journal_dirty(handle, bh); 3454 ocfs2_journal_dirty(handle, bh);
3510 if (ret)
3511 mlog_errno(ret);
3512
3513 if (right_path) { 3455 if (right_path) {
3514 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 3456 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3515 if (ret)
3516 mlog_errno(ret);
3517
3518 ocfs2_complete_edge_insert(handle, left_path, right_path, 3457 ocfs2_complete_edge_insert(handle, left_path, right_path,
3519 subtree_index); 3458 subtree_index);
3520 } 3459 }
@@ -3683,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3683 3622
3684 ocfs2_cleanup_merge(el, index); 3623 ocfs2_cleanup_merge(el, index);
3685 3624
3686 ret = ocfs2_journal_dirty(handle, bh); 3625 ocfs2_journal_dirty(handle, bh);
3687 if (ret)
3688 mlog_errno(ret);
3689
3690 if (left_path) { 3626 if (left_path) {
3691 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 3627 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3692 if (ret)
3693 mlog_errno(ret);
3694 3628
3695 /* 3629 /*
3696 * In the situation that the right_rec is empty and the extent 3630 * In the situation that the right_rec is empty and the extent
@@ -4016,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
4016 le32_add_cpu(&rec->e_int_clusters, 3950 le32_add_cpu(&rec->e_int_clusters,
4017 -le32_to_cpu(rec->e_cpos)); 3951 -le32_to_cpu(rec->e_cpos));
4018 3952
4019 ret = ocfs2_journal_dirty(handle, bh); 3953 ocfs2_journal_dirty(handle, bh);
4020 if (ret)
4021 mlog_errno(ret);
4022
4023 } 3954 }
4024} 3955}
4025 3956
@@ -4203,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle,
4203 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 4134 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4204 4135
4205 if (left_path) { 4136 if (left_path) {
4206 int credits = handle->h_buffer_credits;
4207
4208 /* 4137 /*
4209 * There's a chance that left_path got passed back to 4138 * There's a chance that left_path got passed back to
4210 * us without being accounted for in the 4139 * us without being accounted for in the
4211 * journal. Extend our transaction here to be sure we 4140 * journal. Extend our transaction here to be sure we
4212 * can change those blocks. 4141 * can change those blocks.
4213 */ 4142 */
4214 credits += left_path->p_tree_depth; 4143 ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4215
4216 ret = ocfs2_extend_trans(handle, credits);
4217 if (ret < 0) { 4144 if (ret < 0) {
4218 mlog_errno(ret); 4145 mlog_errno(ret);
4219 goto out; 4146 goto out;
@@ -4251,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle,
4251 * dirty this for us. 4178 * dirty this for us.
4252 */ 4179 */
4253 if (left_path) 4180 if (left_path)
4254 ret = ocfs2_journal_dirty(handle, 4181 ocfs2_journal_dirty(handle,
4255 path_leaf_bh(left_path)); 4182 path_leaf_bh(left_path));
4256 if (ret)
4257 mlog_errno(ret);
4258 } else 4183 } else
4259 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), 4184 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4260 insert); 4185 insert);
4261 4186
4262 ret = ocfs2_journal_dirty(handle, leaf_bh); 4187 ocfs2_journal_dirty(handle, leaf_bh);
4263 if (ret)
4264 mlog_errno(ret);
4265 4188
4266 if (left_path) { 4189 if (left_path) {
4267 /* 4190 /*
@@ -4384,9 +4307,7 @@ out_update_clusters:
4384 ocfs2_et_update_clusters(et, 4307 ocfs2_et_update_clusters(et,
4385 le16_to_cpu(insert_rec->e_leaf_clusters)); 4308 le16_to_cpu(insert_rec->e_leaf_clusters));
4386 4309
4387 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4310 ocfs2_journal_dirty(handle, et->et_root_bh);
4388 if (ret)
4389 mlog_errno(ret);
4390 4311
4391out: 4312out:
4392 ocfs2_free_path(left_path); 4313 ocfs2_free_path(left_path);
@@ -4866,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4866 goto leave; 4787 goto leave;
4867 } 4788 }
4868 4789
4869 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4790 status = __ocfs2_claim_clusters(handle, data_ac, 1,
4870 clusters_to_add, &bit_off, &num_bits); 4791 clusters_to_add, &bit_off, &num_bits);
4871 if (status < 0) { 4792 if (status < 0) {
4872 if (status != -ENOSPC) 4793 if (status != -ENOSPC)
@@ -4895,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4895 goto leave; 4816 goto leave;
4896 } 4817 }
4897 4818
4898 status = ocfs2_journal_dirty(handle, et->et_root_bh); 4819 ocfs2_journal_dirty(handle, et->et_root_bh);
4899 if (status < 0) {
4900 mlog_errno(status);
4901 goto leave;
4902 }
4903 4820
4904 clusters_to_add -= num_bits; 4821 clusters_to_add -= num_bits;
4905 *logical_offset += num_bits; 4822 *logical_offset += num_bits;
@@ -5309,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5309 int index, u32 new_range, 5226 int index, u32 new_range,
5310 struct ocfs2_alloc_context *meta_ac) 5227 struct ocfs2_alloc_context *meta_ac)
5311{ 5228{
5312 int ret, depth, credits = handle->h_buffer_credits; 5229 int ret, depth, credits;
5313 struct buffer_head *last_eb_bh = NULL; 5230 struct buffer_head *last_eb_bh = NULL;
5314 struct ocfs2_extent_block *eb; 5231 struct ocfs2_extent_block *eb;
5315 struct ocfs2_extent_list *rightmost_el, *el; 5232 struct ocfs2_extent_list *rightmost_el, *el;
@@ -5340,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5340 } else 5257 } else
5341 rightmost_el = path_leaf_el(path); 5258 rightmost_el = path_leaf_el(path);
5342 5259
5343 credits += path->p_tree_depth + 5260 credits = path->p_tree_depth +
5344 ocfs2_extend_meta_needed(et->et_root_el); 5261 ocfs2_extend_meta_needed(et->et_root_el);
5345 ret = ocfs2_extend_trans(handle, credits); 5262 ret = ocfs2_extend_trans(handle, credits);
5346 if (ret) { 5263 if (ret) {
5347 mlog_errno(ret); 5264 mlog_errno(ret);
@@ -5671,19 +5588,97 @@ out:
5671 return ret; 5588 return ret;
5672} 5589}
5673 5590
5591/*
5592 * ocfs2_reserve_blocks_for_rec_trunc() looks basically the
5593 * same as ocfs2_lock_allocators(), except that it accepts a block
5594 * count so some extra blocks can be reserved, and it only handles
5595 * metadata allocations.
5596 *
5597 * Currently, only ocfs2_remove_btree_range() uses it for truncating
5598 * and punching holes.
5599 */
5600static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5601 struct ocfs2_extent_tree *et,
5602 u32 extents_to_split,
5603 struct ocfs2_alloc_context **ac,
5604 int extra_blocks)
5605{
5606 int ret = 0, num_free_extents;
5607 unsigned int max_recs_needed = 2 * extents_to_split;
5608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5609
5610 *ac = NULL;
5611
5612 num_free_extents = ocfs2_num_free_extents(osb, et);
5613 if (num_free_extents < 0) {
5614 ret = num_free_extents;
5615 mlog_errno(ret);
5616 goto out;
5617 }
5618
5619 if (!num_free_extents ||
5620 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
5621 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5622
5623 if (extra_blocks) {
5624 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5625 if (ret < 0) {
5626 if (ret != -ENOSPC)
5627 mlog_errno(ret);
5628 goto out;
5629 }
5630 }
5631
5632out:
5633 if (ret) {
5634 if (*ac) {
5635 ocfs2_free_alloc_context(*ac);
5636 *ac = NULL;
5637 }
5638 }
5639
5640 return ret;
5641}
5642
5674int ocfs2_remove_btree_range(struct inode *inode, 5643int ocfs2_remove_btree_range(struct inode *inode,
5675 struct ocfs2_extent_tree *et, 5644 struct ocfs2_extent_tree *et,
5676 u32 cpos, u32 phys_cpos, u32 len, 5645 u32 cpos, u32 phys_cpos, u32 len, int flags,
5677 struct ocfs2_cached_dealloc_ctxt *dealloc) 5646 struct ocfs2_cached_dealloc_ctxt *dealloc,
5647 u64 refcount_loc)
5678{ 5648{
5679 int ret; 5649 int ret, credits = 0, extra_blocks = 0;
5680 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 5650 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5681 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5651 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5682 struct inode *tl_inode = osb->osb_tl_inode; 5652 struct inode *tl_inode = osb->osb_tl_inode;
5683 handle_t *handle; 5653 handle_t *handle;
5684 struct ocfs2_alloc_context *meta_ac = NULL; 5654 struct ocfs2_alloc_context *meta_ac = NULL;
5655 struct ocfs2_refcount_tree *ref_tree = NULL;
5656
5657 if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5658 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
5659 OCFS2_HAS_REFCOUNT_FL));
5660
5661 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5662 &ref_tree, NULL);
5663 if (ret) {
5664 mlog_errno(ret);
5665 goto out;
5666 }
5685 5667
5686 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); 5668 ret = ocfs2_prepare_refcount_change_for_del(inode,
5669 refcount_loc,
5670 phys_blkno,
5671 len,
5672 &credits,
5673 &extra_blocks);
5674 if (ret < 0) {
5675 mlog_errno(ret);
5676 goto out;
5677 }
5678 }
5679
5680 ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5681 extra_blocks);
5687 if (ret) { 5682 if (ret) {
5688 mlog_errno(ret); 5683 mlog_errno(ret);
5689 return ret; 5684 return ret;
@@ -5699,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
5699 } 5694 }
5700 } 5695 }
5701 5696
5702 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); 5697 handle = ocfs2_start_trans(osb,
5698 ocfs2_remove_extent_credits(osb->sb) + credits);
5703 if (IS_ERR(handle)) { 5699 if (IS_ERR(handle)) {
5704 ret = PTR_ERR(handle); 5700 ret = PTR_ERR(handle);
5705 mlog_errno(ret); 5701 mlog_errno(ret);
@@ -5724,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode,
5724 5720
5725 ocfs2_et_update_clusters(et, -len); 5721 ocfs2_et_update_clusters(et, -len);
5726 5722
5727 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5723 ocfs2_journal_dirty(handle, et->et_root_bh);
5728 if (ret) {
5729 mlog_errno(ret);
5730 goto out_commit;
5731 }
5732 5724
5733 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 5725 if (phys_blkno) {
5734 if (ret) 5726 if (flags & OCFS2_EXT_REFCOUNTED)
5735 mlog_errno(ret); 5727 ret = ocfs2_decrease_refcount(inode, handle,
5728 ocfs2_blocks_to_clusters(osb->sb,
5729 phys_blkno),
5730 len, meta_ac,
5731 dealloc, 1);
5732 else
5733 ret = ocfs2_truncate_log_append(osb, handle,
5734 phys_blkno, len);
5735 if (ret)
5736 mlog_errno(ret);
5737
5738 }
5736 5739
5737out_commit: 5740out_commit:
5738 ocfs2_commit_trans(osb, handle); 5741 ocfs2_commit_trans(osb, handle);
@@ -5742,6 +5745,9 @@ out:
5742 if (meta_ac) 5745 if (meta_ac)
5743 ocfs2_free_alloc_context(meta_ac); 5746 ocfs2_free_alloc_context(meta_ac);
5744 5747
5748 if (ref_tree)
5749 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5750
5745 return ret; 5751 return ret;
5746} 5752}
5747 5753
@@ -5850,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5850 } 5856 }
5851 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 5857 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5852 5858
5853 status = ocfs2_journal_dirty(handle, tl_bh); 5859 ocfs2_journal_dirty(handle, tl_bh);
5854 if (status < 0) {
5855 mlog_errno(status);
5856 goto bail;
5857 }
5858 5860
5859bail: 5861bail:
5860 mlog_exit(status); 5862 mlog_exit(status);
@@ -5893,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5893 5895
5894 tl->tl_used = cpu_to_le16(i); 5896 tl->tl_used = cpu_to_le16(i);
5895 5897
5896 status = ocfs2_journal_dirty(handle, tl_bh); 5898 ocfs2_journal_dirty(handle, tl_bh);
5897 if (status < 0) {
5898 mlog_errno(status);
5899 goto bail;
5900 }
5901 5899
5902 /* TODO: Perhaps we can calculate the bulk of the 5900 /* TODO: Perhaps we can calculate the bulk of the
5903 * credits up front rather than extending like 5901 * credits up front rather than extending like
@@ -6298,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6298 */ 6296 */
6299struct ocfs2_cached_block_free { 6297struct ocfs2_cached_block_free {
6300 struct ocfs2_cached_block_free *free_next; 6298 struct ocfs2_cached_block_free *free_next;
6299 u64 free_bg;
6301 u64 free_blk; 6300 u64 free_blk;
6302 unsigned int free_bit; 6301 unsigned int free_bit;
6303}; 6302};
@@ -6344,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6344 } 6343 }
6345 6344
6346 while (head) { 6345 while (head) {
6347 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 6346 if (head->free_bg)
6348 head->free_bit); 6347 bg_blkno = head->free_bg;
6348 else
6349 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6350 head->free_bit);
6349 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 6351 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6350 head->free_bit, (unsigned long long)head->free_blk); 6352 head->free_bit, (unsigned long long)head->free_blk);
6351 6353
@@ -6393,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6393 int ret = 0; 6395 int ret = 0;
6394 struct ocfs2_cached_block_free *item; 6396 struct ocfs2_cached_block_free *item;
6395 6397
6396 item = kmalloc(sizeof(*item), GFP_NOFS); 6398 item = kzalloc(sizeof(*item), GFP_NOFS);
6397 if (item == NULL) { 6399 if (item == NULL) {
6398 ret = -ENOMEM; 6400 ret = -ENOMEM;
6399 mlog_errno(ret); 6401 mlog_errno(ret);
@@ -6533,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type,
6533} 6535}
6534 6536
6535int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6537int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6536 int type, int slot, u64 blkno, 6538 int type, int slot, u64 suballoc,
6537 unsigned int bit) 6539 u64 blkno, unsigned int bit)
6538{ 6540{
6539 int ret; 6541 int ret;
6540 struct ocfs2_per_slot_free_list *fl; 6542 struct ocfs2_per_slot_free_list *fl;
@@ -6547,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6547 goto out; 6549 goto out;
6548 } 6550 }
6549 6551
6550 item = kmalloc(sizeof(*item), GFP_NOFS); 6552 item = kzalloc(sizeof(*item), GFP_NOFS);
6551 if (item == NULL) { 6553 if (item == NULL) {
6552 ret = -ENOMEM; 6554 ret = -ENOMEM;
6553 mlog_errno(ret); 6555 mlog_errno(ret);
@@ -6557,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6557 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 6559 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6558 type, slot, bit, (unsigned long long)blkno); 6560 type, slot, bit, (unsigned long long)blkno);
6559 6561
6562 item->free_bg = suballoc;
6560 item->free_blk = blkno; 6563 item->free_blk = blkno;
6561 item->free_bit = bit; 6564 item->free_bit = bit;
6562 item->free_next = fl->f_first; 6565 item->free_next = fl->f_first;
@@ -6573,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6573{ 6576{
6574 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 6577 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6575 le16_to_cpu(eb->h_suballoc_slot), 6578 le16_to_cpu(eb->h_suballoc_slot),
6579 le64_to_cpu(eb->h_suballoc_loc),
6576 le64_to_cpu(eb->h_blkno), 6580 le64_to_cpu(eb->h_blkno),
6577 le16_to_cpu(eb->h_suballoc_bit)); 6581 le16_to_cpu(eb->h_suballoc_bit));
6578} 6582}
6579 6583
6580/* This function will figure out whether the currently last extent
6581 * block will be deleted, and if it will, what the new last extent
6582 * block will be so we can update his h_next_leaf_blk field, as well
6583 * as the dinodes i_last_eb_blk */
6584static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6585 unsigned int clusters_to_del,
6586 struct ocfs2_path *path,
6587 struct buffer_head **new_last_eb)
6588{
6589 int next_free, ret = 0;
6590 u32 cpos;
6591 struct ocfs2_extent_rec *rec;
6592 struct ocfs2_extent_block *eb;
6593 struct ocfs2_extent_list *el;
6594 struct buffer_head *bh = NULL;
6595
6596 *new_last_eb = NULL;
6597
6598 /* we have no tree, so of course, no last_eb. */
6599 if (!path->p_tree_depth)
6600 goto out;
6601
6602 /* trunc to zero special case - this makes tree_depth = 0
6603 * regardless of what it is. */
6604 if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6605 goto out;
6606
6607 el = path_leaf_el(path);
6608 BUG_ON(!el->l_next_free_rec);
6609
6610 /*
6611 * Make sure that this extent list will actually be empty
6612 * after we clear away the data. We can shortcut out if
6613 * there's more than one non-empty extent in the
6614 * list. Otherwise, a check of the remaining extent is
6615 * necessary.
6616 */
6617 next_free = le16_to_cpu(el->l_next_free_rec);
6618 rec = NULL;
6619 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6620 if (next_free > 2)
6621 goto out;
6622
6623 /* We may have a valid extent in index 1, check it. */
6624 if (next_free == 2)
6625 rec = &el->l_recs[1];
6626
6627 /*
6628 * Fall through - no more nonempty extents, so we want
6629 * to delete this leaf.
6630 */
6631 } else {
6632 if (next_free > 1)
6633 goto out;
6634
6635 rec = &el->l_recs[0];
6636 }
6637
6638 if (rec) {
6639 /*
6640 * Check it we'll only be trimming off the end of this
6641 * cluster.
6642 */
6643 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6644 goto out;
6645 }
6646
6647 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6648 if (ret) {
6649 mlog_errno(ret);
6650 goto out;
6651 }
6652
6653 ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6654 if (ret) {
6655 mlog_errno(ret);
6656 goto out;
6657 }
6658
6659 eb = (struct ocfs2_extent_block *) bh->b_data;
6660 el = &eb->h_list;
6661
6662 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6663 * Any corruption is a code bug. */
6664 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6665
6666 *new_last_eb = bh;
6667 get_bh(*new_last_eb);
6668 mlog(0, "returning block %llu, (cpos: %u)\n",
6669 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6670out:
6671 brelse(bh);
6672
6673 return ret;
6674}
6675
6676/*
6677 * Trim some clusters off the rightmost edge of a tree. Only called
6678 * during truncate.
6679 *
6680 * The caller needs to:
6681 * - start journaling of each path component.
6682 * - compute and fully set up any new last ext block
6683 */
6684static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6685 handle_t *handle, struct ocfs2_truncate_context *tc,
6686 u32 clusters_to_del, u64 *delete_start, u8 *flags)
6687{
6688 int ret, i, index = path->p_tree_depth;
6689 u32 new_edge = 0;
6690 u64 deleted_eb = 0;
6691 struct buffer_head *bh;
6692 struct ocfs2_extent_list *el;
6693 struct ocfs2_extent_rec *rec;
6694
6695 *delete_start = 0;
6696 *flags = 0;
6697
6698 while (index >= 0) {
6699 bh = path->p_node[index].bh;
6700 el = path->p_node[index].el;
6701
6702 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6703 index, (unsigned long long)bh->b_blocknr);
6704
6705 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6706
6707 if (index !=
6708 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6709 ocfs2_error(inode->i_sb,
6710 "Inode %lu has invalid ext. block %llu",
6711 inode->i_ino,
6712 (unsigned long long)bh->b_blocknr);
6713 ret = -EROFS;
6714 goto out;
6715 }
6716
6717find_tail_record:
6718 i = le16_to_cpu(el->l_next_free_rec) - 1;
6719 rec = &el->l_recs[i];
6720
6721 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6722 "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6723 ocfs2_rec_clusters(el, rec),
6724 (unsigned long long)le64_to_cpu(rec->e_blkno),
6725 le16_to_cpu(el->l_next_free_rec));
6726
6727 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6728
6729 if (le16_to_cpu(el->l_tree_depth) == 0) {
6730 /*
6731 * If the leaf block contains a single empty
6732 * extent and no records, we can just remove
6733 * the block.
6734 */
6735 if (i == 0 && ocfs2_is_empty_extent(rec)) {
6736 memset(rec, 0,
6737 sizeof(struct ocfs2_extent_rec));
6738 el->l_next_free_rec = cpu_to_le16(0);
6739
6740 goto delete;
6741 }
6742
6743 /*
6744 * Remove any empty extents by shifting things
6745 * left. That should make life much easier on
6746 * the code below. This condition is rare
6747 * enough that we shouldn't see a performance
6748 * hit.
6749 */
6750 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6751 le16_add_cpu(&el->l_next_free_rec, -1);
6752
6753 for(i = 0;
6754 i < le16_to_cpu(el->l_next_free_rec); i++)
6755 el->l_recs[i] = el->l_recs[i + 1];
6756
6757 memset(&el->l_recs[i], 0,
6758 sizeof(struct ocfs2_extent_rec));
6759
6760 /*
6761 * We've modified our extent list. The
6762 * simplest way to handle this change
6763 * is to being the search from the
6764 * start again.
6765 */
6766 goto find_tail_record;
6767 }
6768
6769 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6770
6771 /*
6772 * We'll use "new_edge" on our way back up the
6773 * tree to know what our rightmost cpos is.
6774 */
6775 new_edge = le16_to_cpu(rec->e_leaf_clusters);
6776 new_edge += le32_to_cpu(rec->e_cpos);
6777
6778 /*
6779 * The caller will use this to delete data blocks.
6780 */
6781 *delete_start = le64_to_cpu(rec->e_blkno)
6782 + ocfs2_clusters_to_blocks(inode->i_sb,
6783 le16_to_cpu(rec->e_leaf_clusters));
6784 *flags = rec->e_flags;
6785
6786 /*
6787 * If it's now empty, remove this record.
6788 */
6789 if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6790 memset(rec, 0,
6791 sizeof(struct ocfs2_extent_rec));
6792 le16_add_cpu(&el->l_next_free_rec, -1);
6793 }
6794 } else {
6795 if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6796 memset(rec, 0,
6797 sizeof(struct ocfs2_extent_rec));
6798 le16_add_cpu(&el->l_next_free_rec, -1);
6799
6800 goto delete;
6801 }
6802
6803 /* Can this actually happen? */
6804 if (le16_to_cpu(el->l_next_free_rec) == 0)
6805 goto delete;
6806
6807 /*
6808 * We never actually deleted any clusters
6809 * because our leaf was empty. There's no
6810 * reason to adjust the rightmost edge then.
6811 */
6812 if (new_edge == 0)
6813 goto delete;
6814
6815 rec->e_int_clusters = cpu_to_le32(new_edge);
6816 le32_add_cpu(&rec->e_int_clusters,
6817 -le32_to_cpu(rec->e_cpos));
6818
6819 /*
6820 * A deleted child record should have been
6821 * caught above.
6822 */
6823 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6824 }
6825
6826delete:
6827 ret = ocfs2_journal_dirty(handle, bh);
6828 if (ret) {
6829 mlog_errno(ret);
6830 goto out;
6831 }
6832
6833 mlog(0, "extent list container %llu, after: record %d: "
6834 "(%u, %u, %llu), next = %u.\n",
6835 (unsigned long long)bh->b_blocknr, i,
6836 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6837 (unsigned long long)le64_to_cpu(rec->e_blkno),
6838 le16_to_cpu(el->l_next_free_rec));
6839
6840 /*
6841 * We must be careful to only attempt delete of an
6842 * extent block (and not the root inode block).
6843 */
6844 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6845 struct ocfs2_extent_block *eb =
6846 (struct ocfs2_extent_block *)bh->b_data;
6847
6848 /*
6849 * Save this for use when processing the
6850 * parent block.
6851 */
6852 deleted_eb = le64_to_cpu(eb->h_blkno);
6853
6854 mlog(0, "deleting this extent block.\n");
6855
6856 ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6857
6858 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6859 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6860 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6861
6862 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6863 /* An error here is not fatal. */
6864 if (ret < 0)
6865 mlog_errno(ret);
6866 } else {
6867 deleted_eb = 0;
6868 }
6869
6870 index--;
6871 }
6872
6873 ret = 0;
6874out:
6875 return ret;
6876}
6877
6878static int ocfs2_do_truncate(struct ocfs2_super *osb,
6879 unsigned int clusters_to_del,
6880 struct inode *inode,
6881 struct buffer_head *fe_bh,
6882 handle_t *handle,
6883 struct ocfs2_truncate_context *tc,
6884 struct ocfs2_path *path,
6885 struct ocfs2_alloc_context *meta_ac)
6886{
6887 int status;
6888 struct ocfs2_dinode *fe;
6889 struct ocfs2_extent_block *last_eb = NULL;
6890 struct ocfs2_extent_list *el;
6891 struct buffer_head *last_eb_bh = NULL;
6892 u64 delete_blk = 0;
6893 u8 rec_flags;
6894
6895 fe = (struct ocfs2_dinode *) fe_bh->b_data;
6896
6897 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6898 path, &last_eb_bh);
6899 if (status < 0) {
6900 mlog_errno(status);
6901 goto bail;
6902 }
6903
6904 /*
6905 * Each component will be touched, so we might as well journal
6906 * here to avoid having to handle errors later.
6907 */
6908 status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6909 if (status < 0) {
6910 mlog_errno(status);
6911 goto bail;
6912 }
6913
6914 if (last_eb_bh) {
6915 status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6916 OCFS2_JOURNAL_ACCESS_WRITE);
6917 if (status < 0) {
6918 mlog_errno(status);
6919 goto bail;
6920 }
6921
6922 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6923 }
6924
6925 el = &(fe->id2.i_list);
6926
6927 /*
6928 * Lower levels depend on this never happening, but it's best
6929 * to check it up here before changing the tree.
6930 */
6931 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6932 ocfs2_error(inode->i_sb,
6933 "Inode %lu has an empty extent record, depth %u\n",
6934 inode->i_ino, le16_to_cpu(el->l_tree_depth));
6935 status = -EROFS;
6936 goto bail;
6937 }
6938
6939 dquot_free_space_nodirty(inode,
6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6943 clusters_to_del;
6944 spin_unlock(&OCFS2_I(inode)->ip_lock);
6945 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6946 inode->i_blocks = ocfs2_inode_sector_count(inode);
6947
6948 status = ocfs2_trim_tree(inode, path, handle, tc,
6949 clusters_to_del, &delete_blk, &rec_flags);
6950 if (status) {
6951 mlog_errno(status);
6952 goto bail;
6953 }
6954
6955 if (le32_to_cpu(fe->i_clusters) == 0) {
6956 /* trunc to zero is a special case. */
6957 el->l_tree_depth = 0;
6958 fe->i_last_eb_blk = 0;
6959 } else if (last_eb)
6960 fe->i_last_eb_blk = last_eb->h_blkno;
6961
6962 status = ocfs2_journal_dirty(handle, fe_bh);
6963 if (status < 0) {
6964 mlog_errno(status);
6965 goto bail;
6966 }
6967
6968 if (last_eb) {
6969 /* If there will be a new last extent block, then by
-			 * definition, there cannot be any leaves to the right of
-			 * him. */
-			last_eb->h_next_leaf_blk = 0;
-			status = ocfs2_journal_dirty(handle, last_eb_bh);
-			if (status < 0) {
-				mlog_errno(status);
-				goto bail;
-			}
-		}
-
-	if (delete_blk) {
-		if (rec_flags & OCFS2_EXT_REFCOUNTED)
-			status = ocfs2_decrease_refcount(inode, handle,
-					ocfs2_blocks_to_clusters(osb->sb,
-								 delete_blk),
-					clusters_to_del, meta_ac,
-					&tc->tc_dealloc, 1);
-		else
-			status = ocfs2_truncate_log_append(osb, handle,
-							   delete_blk,
-							   clusters_to_del);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-	}
-	status = 0;
-bail:
-	brelse(last_eb_bh);
-	mlog_exit(status);
-	return status;
-}
-
 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
 {
 	set_buffer_uptodate(bh);
@@ -7307,7 +6888,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		goto out_commit;
 	did_quota = 1;

-	ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+	data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
+	ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
 				   &num);
 	if (ret) {
 		mlog_errno(ret);
@@ -7406,26 +6989,29 @@ out:
  */
 int ocfs2_commit_truncate(struct ocfs2_super *osb,
 			  struct inode *inode,
-			  struct buffer_head *fe_bh,
-			  struct ocfs2_truncate_context *tc)
+			  struct buffer_head *di_bh)
 {
-	int status, i, credits, tl_sem = 0;
-	u32 clusters_to_del, new_highest_cpos, range;
+	int status = 0, i, flags = 0;
+	u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
 	u64 blkno = 0;
 	struct ocfs2_extent_list *el;
-	handle_t *handle = NULL;
-	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_extent_rec *rec;
 	struct ocfs2_path *path = NULL;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
-	struct ocfs2_alloc_context *meta_ac = NULL;
-	struct ocfs2_refcount_tree *ref_tree = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_extent_list *root_el = &(di->id2.i_list);
+	u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
+	struct ocfs2_extent_tree et;
+	struct ocfs2_cached_dealloc_ctxt dealloc;

 	mlog_entry_void();

+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
 						    i_size_read(inode));

-	path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+	path = ocfs2_new_path(di_bh, &di->id2.i_list,
 			      ocfs2_journal_access_di);
 	if (!path) {
 		status = -ENOMEM;
@@ -7444,8 +7030,6 @@ start:
 		goto bail;
 	}

-	credits = 0;
-
 	/*
 	 * Truncate always works against the rightmost tree branch.
 	 */
@@ -7480,101 +7064,62 @@ start:
 	}

 	i = le16_to_cpu(el->l_next_free_rec) - 1;
-	range = le32_to_cpu(el->l_recs[i].e_cpos) +
-		ocfs2_rec_clusters(el, &el->l_recs[i]);
-	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
-		clusters_to_del = 0;
-	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
-		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
-		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+	rec = &el->l_recs[i];
+	flags = rec->e_flags;
+	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+	if (i == 0 && ocfs2_is_empty_extent(rec)) {
+		/*
+		 * Lower levels depend on this never happening, but it's best
+		 * to check it up here before changing the tree.
+		 */
+		if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
+			ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+				    "extent record, depth %u\n", inode->i_ino,
+				    le16_to_cpu(root_el->l_tree_depth));
+			status = -EROFS;
+			goto bail;
+		}
+		trunc_cpos = le32_to_cpu(rec->e_cpos);
+		trunc_len = 0;
+		blkno = 0;
+	} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
+		/*
+		 * Truncate entire record.
+		 */
+		trunc_cpos = le32_to_cpu(rec->e_cpos);
+		trunc_len = ocfs2_rec_clusters(el, rec);
+		blkno = le64_to_cpu(rec->e_blkno);
 	} else if (range > new_highest_cpos) {
-		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
-				   le32_to_cpu(el->l_recs[i].e_cpos)) -
-				  new_highest_cpos;
-		blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
-			ocfs2_clusters_to_blocks(inode->i_sb,
-				ocfs2_rec_clusters(el, &el->l_recs[i]) -
-				clusters_to_del);
+		/*
+		 * Partial truncate. it also should be
+		 * the last truncate we're doing.
+		 */
+		trunc_cpos = new_highest_cpos;
+		trunc_len = range - new_highest_cpos;
+		coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
+		blkno = le64_to_cpu(rec->e_blkno) +
+				ocfs2_clusters_to_blocks(inode->i_sb, coff);
 	} else {
+		/*
+		 * Truncate completed, leave happily.
+		 */
 		status = 0;
 		goto bail;
 	}

-	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
-	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
-
-	if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
-		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
-			 OCFS2_HAS_REFCOUNT_FL));
-
-		status = ocfs2_lock_refcount_tree(osb,
-						le64_to_cpu(di->i_refcount_loc),
-						1, &ref_tree, NULL);
-		if (status) {
-			mlog_errno(status);
-			goto bail;
-		}
-
-		status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
-							       blkno,
-							       clusters_to_del,
-							       &credits,
-							       &meta_ac);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-	}
-
-	mutex_lock(&tl_inode->i_mutex);
-	tl_sem = 1;
-	/* ocfs2_truncate_log_needs_flush guarantees us at least one
-	 * record is free for use. If there isn't any, we flush to get
-	 * an empty truncate log. */
-	if (ocfs2_truncate_log_needs_flush(osb)) {
-		status = __ocfs2_flush_truncate_log(osb);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-	}
+	phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);

-	credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
-						 (struct ocfs2_dinode *)fe_bh->b_data,
-						 el);
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		handle = NULL;
-		mlog_errno(status);
-		goto bail;
-	}
-
-	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
-				   tc, path, meta_ac);
+	status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+					  phys_cpos, trunc_len, flags, &dealloc,
+					  refcount_loc);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}

-	mutex_unlock(&tl_inode->i_mutex);
-	tl_sem = 0;
-
-	ocfs2_commit_trans(osb, handle);
-	handle = NULL;
-
 	ocfs2_reinit_path(path, 1);

-	if (meta_ac) {
-		ocfs2_free_alloc_context(meta_ac);
-		meta_ac = NULL;
-	}
-
-	if (ref_tree) {
-		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
-		ref_tree = NULL;
-	}
-
 	/*
 	 * The check above will catch the case where we've truncated
 	 * away all allocation.
@@ -7585,25 +7130,10 @@ bail:

 	ocfs2_schedule_truncate_log_flush(osb, 1);

-	if (tl_sem)
-		mutex_unlock(&tl_inode->i_mutex);
-
-	if (handle)
-		ocfs2_commit_trans(osb, handle);
-
-	if (meta_ac)
-		ocfs2_free_alloc_context(meta_ac);
-
-	if (ref_tree)
-		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
-
-	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
+	ocfs2_run_deallocs(osb, &dealloc);

 	ocfs2_free_path(path);

-	/* This will drop the ext_alloc cluster lock for us */
-	ocfs2_free_truncate_context(tc);
-
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 1db4359ccb90..55762b554b99 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
 			struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_remove_btree_range(struct inode *inode,
 			     struct ocfs2_extent_tree *et,
-			     u32 cpos, u32 phys_cpos, u32 len,
-			     struct ocfs2_cached_dealloc_ctxt *dealloc);
+			     u32 cpos, u32 phys_cpos, u32 len, int flags,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     u64 refcount_loc);

 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct ocfs2_extent_tree *et);
@@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
 				u64 blkno, unsigned int bit);
 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
-			      int type, int slot, u64 blkno,
+			      int type, int slot, u64 suballoc, u64 blkno,
 			      unsigned int bit);
 static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
 {
@@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			   struct ocfs2_truncate_context **tc);
 int ocfs2_commit_truncate(struct ocfs2_super *osb,
 			  struct inode *inode,
-			  struct buffer_head *fe_bh,
-			  struct ocfs2_truncate_context *tc);
+			  struct buffer_head *di_bh);
 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
 			  unsigned int start, unsigned int end, int trunc);

@@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
 			      struct ocfs2_path *path);
 int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
 				   struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+				  struct ocfs2_path *path, u32 *cpos);
 int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
 			    struct ocfs2_path *left,
 			    struct ocfs2_path *right);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 21441ddb5506..3623ca20cc18 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1735,6 +1735,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		goto out;
 	}

+	if (data_ac)
+		data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
 	credits = ocfs2_calc_extend_credits(inode->i_sb,
 					    &di->id2.i_list,
 					    clusters_to_alloc);
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 21c808f752d8..f9d5d3ffc75a 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@

 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>

 #include <cluster/masklog.h>
@@ -407,6 +406,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 				struct buffer_head *bh)
 {
 	int ret = 0;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;

 	mlog_entry_void();

@@ -426,6 +426,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,

 	get_bh(bh); /* for end_buffer_write_sync() */
 	bh->b_end_io = end_buffer_write_sync;
+	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
 	submit_bh(WRITE, bh);

 	wait_on_buffer(bh);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 5c9890006708..41d5f1f92d56 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,7 @@
 #include <linux/crc32.h>
 #include <linux/time.h>
 #include <linux/debugfs.h>
+#include <linux/slab.h>

 #include "heartbeat.h"
 #include "tcp.h"
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index b39da877b12f..c7fba396392d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(ERROR),
 	define_mask(NOTICE),
 	define_mask(KTHREAD),
+	define_mask(RESERVATIONS),
 };

 static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
@@ -136,7 +137,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
 	return mlog_mask_store(mlog_attr->mask, buf, count);
 }

-static struct sysfs_ops mlog_attr_ops = {
+static const struct sysfs_ops mlog_attr_ops = {
 	.show = mlog_show,
 	.store = mlog_store,
 };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 3dfddbec32f2..fd96e2a2fa56 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,6 +119,7 @@
 #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
 #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
+#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */

 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
 #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index c81142e3ef84..ed0c9f367fed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
  * Boston, MA 021110-1307, USA.
  */

+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 639024033fce..cf3e16696216 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
  * and if they're the last, they fire off the decision.
  */
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/reboot.h>

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d8d0c65ac03c..aa75ca3f78da 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -72,9 +72,9 @@

 #include "tcp_internal.h"

-#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
+#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
 #define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
-			  NIPQUAD(sc->sc_node->nd_ipv4_address), \
+			  &sc->sc_node->nd_ipv4_address, \
 			  ntohs(sc->sc_node->nd_ipv4_port)

 /*
@@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk)
 			o2net_sc_queue_work(sc, &sc->sc_connect_work);
 		break;
 	default:
+		printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
+		       " shutdown, state %d\n",
+		       SC_NODEF_ARGS(sc), sk->sk_state);
 		o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 		break;
 	}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index efd77d071c80..f04ebcfffc4a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 			else
 				de->inode = 0;
 			dir->i_version++;
-			status = ocfs2_journal_dirty(handle, bh);
+			ocfs2_journal_dirty(handle, bh);
 			goto bail;
 		}
 		i += le16_to_cpu(de->rec_len);
@@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle,
 				ocfs2_recalc_free_list(dir, handle, lookup);

 			dir->i_version++;
-			status = ocfs2_journal_dirty(handle, insert_bh);
+			ocfs2_journal_dirty(handle, insert_bh);
 			retval = 0;
 			goto bail;
 		}
@@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
 	}

 	ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
-
 	ocfs2_journal_dirty(handle, di_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}

 	i_size_write(inode, size);
 	inode->i_nlink = 2;
@@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 		ocfs2_init_dir_trailer(inode, new_bh, size);
 	}

-	status = ocfs2_journal_dirty(handle, new_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, new_bh);

 	i_size_write(inode, inode->i_sb->s_blocksize);
 	inode->i_nlink = 2;
@@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 	int ret;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 	u16 dr_suballoc_bit;
-	u64 dr_blkno;
+	u64 suballoc_loc, dr_blkno;
 	unsigned int num_bits;
 	struct buffer_head *dx_root_bh = NULL;
 	struct ocfs2_dx_root_block *dx_root;
 	struct ocfs2_dir_block_trailer *trailer =
 		ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);

-	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
-				   &num_bits, &dr_blkno);
+	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+				   &dr_suballoc_bit, &num_bits, &dr_blkno);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2440,6 +2431,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 	memset(dx_root, 0, osb->sb->s_blocksize);
 	strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
 	dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+	dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
 	dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
 	dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
 	dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 		dx_root->dr_list.l_count =
 			cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
 	}
-
-	ret = ocfs2_journal_dirty(handle, dx_root_bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, dx_root_bh);

 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
@@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 	OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
 	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);

-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, di_bh);

 	*ret_dx_root_bh = dx_root_bh;
 	dx_root_bh = NULL;
@@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
 	 * chance of contiguousness as the directory grows in number
 	 * of entries.
 	 */
-	ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
+	ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2991,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * if we only get one now, that's enough to continue. The rest
 	 * will be claimed after the conversion to extents.
 	 */
-	ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+	if (ocfs2_dir_resv_allowed(osb))
+		data_ac->ac_resv = &oi->ip_la_data_resv;
+	ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -3034,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		ocfs2_init_dir_trailer(dir, dirdata_bh, i);
 	}

-	ret = ocfs2_journal_dirty(handle, dirdata_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, dirdata_bh);

 	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
 		/*
@@ -3104,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 */
 	dir->i_blocks = ocfs2_inode_sector_count(dir);

-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, di_bh);

 	if (ocfs2_supports_indexed_dirs(osb)) {
 		ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3138,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * pass. Claim the 2nd cluster as a separate extent.
 	 */
 	if (alloc > len) {
-		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+		ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
 					   &len);
 		if (ret) {
 			mlog_errno(ret);
@@ -3369,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			goto bail;
 		}

+		if (ocfs2_dir_resv_allowed(osb))
+			data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
+
 		credits = ocfs2_calc_extend_credits(sb, el, 1);
 	} else {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
@@ -3423,11 +3407,7 @@ do_extend:
 	} else {
 		de->rec_len = cpu_to_le16(sb->s_blocksize);
 	}
-	status = ocfs2_journal_dirty(handle, new_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, new_bh);

 	dir_i_size += dir->i_sb->s_blocksize;
 	i_size_write(dir, dir_i_size);
@@ -3906,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 	     sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
 	     dx_leaf_sort_swap);

-	ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, dx_leaf_bh);

 	ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
 					   &split_hash);
@@ -4490,7 +4466,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,

 	blk = le64_to_cpu(dx_root->dr_blkno);
 	bit = le16_to_cpu(dx_root->dr_suballoc_bit);
-	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+	if (dx_root->dr_suballoc_loc)
+		bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
+	else
+		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
 	ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
 				       bit, bg_blkno, 1);
 	if (ret)
@@ -4551,8 +4530,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)

 		p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);

-		ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
-					       &dealloc);
+		ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
+					       &dealloc, 0);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index dccc439fa087..f44999156839 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
@@ -89,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 	return 0;
 }

-static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
 	mlog_entry_void();

@@ -146,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 }


-static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
 	mlog_entry_void();

@@ -185,9 +184,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 	BUG_ON(!lksb);

 	/* only updates if this node masters the lockres */
+	spin_lock(&res->spinlock);
 	if (res->owner == dlm->node_num) {
-
-		spin_lock(&res->spinlock);
 		/* check the lksb flags for the direction */
 		if (lksb->flags & DLM_LKSB_GET_LVB) {
 			mlog(0, "getting lvb from lockres for %s node\n",
@@ -202,8 +200,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 		 * here. In the future we might want to clear it at the time
 		 * the put is actually done.
 		 */
-		spin_unlock(&res->spinlock);
 	}
+	spin_unlock(&res->spinlock);

 	/* reset any lvb flags on the lksb */
 	lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
@@ -453,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 	ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
 				     lock->ml.node, &status);
 	if (ret < 0)
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
+		     lock->ml.node);
 	else {
 		if (status == DLM_RECOVERING) {
 			mlog(ML_ERROR, "sent AST to node %u, it thinks this "
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0102be35980c..4b6ae2c13b47 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
 #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
 #define DLM_THREAD_MS 200 // flush at least every 200 ms

-#define DLM_HASH_SIZE_DEFAULT (1 << 14)
+#define DLM_HASH_SIZE_DEFAULT (1 << 17)
 #if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
 # define DLM_HASH_PAGES 1
 #else
@@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,

 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_do_local_ast(struct dlm_ctxt *dlm,
 		      struct dlm_lock_resource *res,
 		      struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index f283bce776b4..9f30491e5e88 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
@@ -391,7 +390,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
 		} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
 			dlm_error(ret);
 	} else {
-		mlog_errno(tmpret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
+		     res->owner);
 		if (dlm_is_host_down(tmpret)) {
 			/* instead of logging the same network error over
 			 * and over, sleep here and wait for the heartbeat
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 988c9055fd4e..6b5a492e1749 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)

 	assert_spin_locked(&dlm->spinlock);

-	printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name);
+	printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);

 	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
 				     node + 1)) < O2NM_MAX_NODES) {
@@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,

 	node = exit_msg->node_idx;

-	printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name);
+	printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);

 	spin_lock(&dlm->spinlock);
 	clear_bit(node, dlm->domain_map);
@@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
 	status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
 				    &leave_msg, sizeof(leave_msg), node,
 				    NULL);
-
+	if (status < 0)
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
 	mlog(0, "status return %d from o2net_send_message\n", status);

 	return status;
@@ -904,7 +906,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
 	set_bit(assert->node_idx, dlm->domain_map);
 	__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);

-	printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n",
+	printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
 	       assert->node_idx, dlm->name);
 	__dlm_print_nodes(dlm);

@@ -962,7 +964,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
 				    &cancel_msg, sizeof(cancel_msg), node,
 				    NULL);
 	if (status < 0) {
-		mlog_errno(status);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+		     node);
 		goto bail;
 	}

@@ -1029,10 +1033,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
 	byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);

 	status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
-				    sizeof(join_msg), node,
-				    &join_resp);
+				    sizeof(join_msg), node, &join_resp);
 	if (status < 0 && status != -ENOPROTOOPT) {
-		mlog_errno(status);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+		     node);
 		goto bail;
 	}
 	dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1103,7 +1108,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
 				    &assert_msg, sizeof(assert_msg), node,
 				    NULL);
 	if (status < 0)
-		mlog_errno(status);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+		     node);

 	return status;
 }
@@ -1516,7 +1523,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 		goto leave;
 	}

-	dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+	dlm->name = kstrdup(domain, GFP_KERNEL);
 	if (dlm->name == NULL) {
 		mlog_errno(-ENOMEM);
 		kfree(dlm);
@@ -1550,7 +1557,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	for (i = 0; i < DLM_HASH_BUCKETS; i++)
 		INIT_HLIST_HEAD(dlm_master_hash(dlm, i));

-	strcpy(dlm->name, domain);
 	dlm->key = key;
 	dlm->node_num = o2nm_this_node();

diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 733337772671..69cf369961c4 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
 			BUG();
 		}
 	} else {
-		mlog_errno(tmpret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
+		     res->owner);
 		if (dlm_is_host_down(tmpret)) {
 			ret = DLM_RECOVERING;
 			mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
 	struct dlm_lock *lock;
 	int kernel_allocated = 0;

-	lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
+	lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
 	if (!lock)
 		return NULL;

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a659606dcb95..4a7506a4e314 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -617,13 +617,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
 {
 	struct dlm_lock_resource *res = NULL;

-	res = (struct dlm_lock_resource *)
-				kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
+	res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
 	if (!res)
 		goto error;

-	res->lockname.name = (char *)
-			kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
+	res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
 	if (!res->lockname.name)
 		goto error;

@@ -757,8 +755,7 @@ lookup:
 	spin_unlock(&dlm->spinlock);
 	mlog(0, "allocating a new resource\n");
 	/* nothing found and we need to allocate one. */
-	alloc_mle = (struct dlm_master_list_entry *)
-		kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+	alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 	if (!alloc_mle)
 		goto leave;
 	res = dlm_new_lockres(dlm, lockid, namelen);
@@ -1542,8 +1539,7 @@ way_up_top:
 		spin_unlock(&dlm->master_lock);
 		spin_unlock(&dlm->spinlock);

-		mle = (struct dlm_master_list_entry *)
-			kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+		mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 		if (!mle) {
 			response = DLM_MASTER_RESP_ERROR;
 			mlog_errno(-ENOMEM);
@@ -1666,7 +1662,9 @@ again:
 	tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
 				    &assert, sizeof(assert), to, &r);
 	if (tmpret < 0) {
-		mlog(0, "assert_master returned %d!\n", tmpret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", tmpret,
+		     DLM_ASSERT_MASTER_MSG, dlm->key, to);
 		if (!dlm_is_host_down(tmpret)) {
 			mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
 			BUG();
@@ -1875,7 +1873,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
 ok:
 		spin_unlock(&res->spinlock);
 	}
-	spin_unlock(&dlm->spinlock);

 	// mlog(0, "woo! got an assert_master from node %u!\n",
 	// assert->node_idx);
@@ -1926,7 +1923,6 @@ ok:
 		/* master is known, detach if not already detached.
 		 * ensures that only one assert_master call will happen
 		 * on this mle. */
-		spin_lock(&dlm->spinlock);
 		spin_lock(&dlm->master_lock);

 		rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1955,6 @@ ok:
 			__dlm_put_mle(mle);
 		}
 		spin_unlock(&dlm->master_lock);
-		spin_unlock(&dlm->spinlock);
 	} else if (res) {
 		if (res->owner != assert->node_idx) {
 			mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1962,7 @@ ok:
 			     res->owner, namelen, name);
 		}
 	}
+	spin_unlock(&dlm->spinlock);

 done:
 	ret = 0;
@@ -2207,7 +2203,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 	ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
 				 &deref, sizeof(deref), res->owner, &r);
 	if (ret < 0)
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
+		     res->owner);
 	else if (r < 0) {
 		/* BAD. other node says I did not have a ref. */
 		mlog(ML_ERROR,"while dropping ref on %s:%.*s "
@@ -2454,8 +2452,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
 		goto leave;
 	}

-	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-								GFP_NOFS);
+	mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 	if (!mle) {
 		mlog_errno(ret);
 		goto leave;
@@ -2977,7 +2974,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
 					 &migrate, sizeof(migrate), nodenum,
 					 &status);
 		if (ret < 0) {
-			mlog(0, "migrate_request returned %d!\n", ret);
+			mlog(ML_ERROR, "Error %d when sending message %u (key "
+			     "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
+			     dlm->key, nodenum);
 			if (!dlm_is_host_down(ret)) {
 				mlog(ML_ERROR, "unhandled error=%d!\n", ret);
 				BUG();
@@ -3035,8 +3034,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
 	hash = dlm_lockid_hash(name, namelen);

 	/* preallocate.. if this fails, abort */
-	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-								GFP_NOFS);
+	mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);

 	if (!mle) {
 		ret = -ENOMEM;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b4f99de2caf3..f8b75ce4be70 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,

 	/* negative status is handled by caller */
 	if (ret < 0)
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
+		     dlm->key, request_from);

 	// return from here, then
 	// sleep until all received or error
@@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
 	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
 				 sizeof(done_msg), send_to, &tmpret);
 	if (ret < 0) {
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
+		     dlm->key, send_to);
 		if (!dlm_is_host_down(ret)) {
-			mlog_errno(ret);
-			mlog(ML_ERROR, "%s: unknown error sending data-done "
-			     "to %u\n", dlm->name, send_to);
 			BUG();
 		}
 	} else
@@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
 	if (ret < 0) {
 		/* XXX: negative status is not handled.
 		 * this will end up killing this node. */
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
+		     dlm->key, send_to);
 	} else {
 		/* might get an -ENOMEM back here */
 		ret = status;
@@ -1642,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 				 &req, sizeof(req), nodenum, &status);
 	/* XXX: negative status not handled properly here. */
 	if (ret < 0)
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
+		     dlm->key, nodenum);
 	else {
 		BUG_ON(status < 0);
 		BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -2640,7 +2646,7 @@ retry:
 		if (dlm_is_host_down(ret)) {
 			/* node is down. not involved in recovery
 			 * so just keep going */
-			mlog(0, "%s: node %u was down when sending "
+			mlog(ML_NOTICE, "%s: node %u was down when sending "
 			     "begin reco msg (%d)\n", dlm->name, nodenum, ret);
 			ret = 0;
 		}
@@ -2660,11 +2666,12 @@ retry:
 		}
 		if (ret < 0) {
 			struct dlm_lock_resource *res;
+
 			/* this is now a serious problem, possibly ENOMEM
 			 * in the network stack. must retry */
 			mlog_errno(ret);
 			mlog(ML_ERROR, "begin reco of dlm %s to node %u "
-			     " returned %d\n", dlm->name, nodenum, ret);
+			     "returned %d\n", dlm->name, nodenum, ret);
 			res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
 						 DLM_RECOVERY_LOCK_NAME_LEN);
 			if (res) {
@@ -2789,7 +2796,9 @@ stage2:
 	if (ret >= 0)
 		ret = status;
 	if (ret < 0) {
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
+		     dlm->key, nodenum);
 		if (dlm_is_host_down(ret)) {
 			/* this has no effect on this recovery
 			 * session, so set the status to zero to
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..d4f73ca68fe5 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
@@ -310,6 +309,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
 	 * spinlock, and because we know that it is not migrating/
 	 * recovering/in-progress, it is fine to reserve asts and
 	 * basts right before queueing them all throughout */
+	assert_spin_locked(&dlm->ast_lock);
 	assert_spin_locked(&res->spinlock);
 	BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
 			      DLM_LOCK_RES_RECOVERING|
@@ -338,7 +338,7 @@ converting:
 			/* queue the BAST if not already */
 			if (lock->ml.highest_blocked == LKM_IVMODE) {
 				__dlm_lockres_reserve_ast(res);
-				dlm_queue_bast(dlm, lock);
+				__dlm_queue_bast(dlm, lock);
 			}
 			/* update the highest_blocked if needed */
 			if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -356,7 +356,7 @@ converting:
 				can_grant = 0;
 			if (lock->ml.highest_blocked == LKM_IVMODE) {
 				__dlm_lockres_reserve_ast(res);
-				dlm_queue_bast(dlm, lock);
+				__dlm_queue_bast(dlm, lock);
 			}
 			if (lock->ml.highest_blocked < target->ml.convert_type)
 				lock->ml.highest_blocked =
@@ -384,7 +384,7 @@ converting:
 		spin_unlock(&target->spinlock);

 		__dlm_lockres_reserve_ast(res);
-		dlm_queue_ast(dlm, target);
+		__dlm_queue_ast(dlm, target);
 		/* go back and check for more */
 		goto converting;
 	}
@@ -403,7 +403,7 @@ blocked:
 				can_grant = 0;
 			if (lock->ml.highest_blocked == LKM_IVMODE) {
 				__dlm_lockres_reserve_ast(res);
-				dlm_queue_bast(dlm, lock);
+				__dlm_queue_bast(dlm, lock);
 			}
 			if (lock->ml.highest_blocked < target->ml.type)
 				lock->ml.highest_blocked = target->ml.type;
@@ -419,7 +419,7 @@ blocked:
 				can_grant = 0;
 			if (lock->ml.highest_blocked == LKM_IVMODE) {
 				__dlm_lockres_reserve_ast(res);
-				dlm_queue_bast(dlm, lock);
+				__dlm_queue_bast(dlm, lock);
 			}
 			if (lock->ml.highest_blocked < target->ml.type)
 				lock->ml.highest_blocked = target->ml.type;
@@ -445,7 +445,7 @@ blocked:
 		spin_unlock(&target->spinlock);

 		__dlm_lockres_reserve_ast(res);
-		dlm_queue_ast(dlm, target);
+		__dlm_queue_ast(dlm, target);
 		/* go back and check for more */
 		goto converting;
 	}
@@ -675,6 +675,7 @@ static int dlm_thread(void *data)
 			/* lockres can be re-dirtied/re-added to the
 			 * dirty_list in this gap, but that is ok */

+			spin_lock(&dlm->ast_lock);
 			spin_lock(&res->spinlock);
 			if (res->owner != dlm->node_num) {
 				__dlm_print_one_lock_resource(res);
@@ -695,6 +696,7 @@ static int dlm_thread(void *data)
 				/* move it to the tail and keep going */
 				res->state &= ~DLM_LOCK_RES_DIRTY;
 				spin_unlock(&res->spinlock);
+				spin_unlock(&dlm->ast_lock);
 				mlog(0, "delaying list shuffling for in-"
 				     "progress lockres %.*s, state=%d\n",
 				     res->lockname.len, res->lockname.name,
@@ -716,6 +718,7 @@ static int dlm_thread(void *data)
 			dlm_shuffle_lists(dlm, res);
 			res->state &= ~DLM_LOCK_RES_DIRTY;
 			spin_unlock(&res->spinlock);
+			spin_unlock(&dlm->ast_lock);

 			dlm_lockres_calc_usage(dlm, res);

diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 49e29ecd0201..817287c6a6db 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
@@ -355,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
 			mlog(0, "master was in-progress. retry\n");
 		ret = status;
 	} else {
-		mlog_errno(tmpret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
 		if (dlm_is_host_down(tmpret)) {
 			/* NOTE: this seems strange, but it is what we want.
 			 * when the master goes down during a cancel or
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 1b0de157a08c..b83d6107a1f5 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -112,20 +112,20 @@ MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
  * O_RDONLY -> PRMODE level
  * O_WRONLY -> EXMODE level
  *
- * O_NONBLOCK -> LKM_NOQUEUE
+ * O_NONBLOCK -> NOQUEUE
  */
 static int dlmfs_decode_open_flags(int open_flags,
 				   int *level,
 				   int *flags)
 {
 	if (open_flags & (O_WRONLY|O_RDWR))
-		*level = LKM_EXMODE;
+		*level = DLM_LOCK_EX;
 	else
-		*level = LKM_PRMODE;
+		*level = DLM_LOCK_PR;
 
 	*flags = 0;
 	if (open_flags & O_NONBLOCK)
-		*flags |= LKM_NOQUEUE;
+		*flags |= DLM_LKF_NOQUEUE;
 
 	return 0;
 }
@@ -166,7 +166,7 @@ static int dlmfs_file_open(struct inode *inode,
 	 * to be able userspace to be able to distinguish a
 	 * valid lock request from one that simply couldn't be
 	 * granted. */
-	if (flags & LKM_NOQUEUE && status == -EAGAIN)
+	if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
 		status = -ETXTBSY;
 	kfree(fp);
 	goto bail;
@@ -193,7 +193,7 @@ static int dlmfs_file_release(struct inode *inode,
 	status = 0;
 	if (fp) {
 		level = fp->fp_lock_level;
-		if (level != LKM_IVMODE)
+		if (level != DLM_LOCK_IV)
 			user_dlm_cluster_unlock(&ip->ip_lockres, level);
 
 		kfree(fp);
@@ -262,7 +262,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
 	if ((count + *ppos) > i_size_read(inode))
 		readlen = i_size_read(inode) - *ppos;
 	else
-		readlen = count - *ppos;
+		readlen = count;
 
 	lvb_buf = kmalloc(readlen, GFP_NOFS);
 	if (!lvb_buf)
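The dlmfs hunks above are a mechanical rename from o2dlm's LKM_* names to the fs/dlm-style DLM_LOCK_*/DLM_LKF_* constants; the open-flag decoding itself is unchanged. As a hedged illustration of what that decoding means from userspace (the mount point and lock-file path here are assumptions, not from this patch):

    /* Sketch: dlmfs maps open(2) flags onto cluster lock requests.
     * O_RDONLY -> DLM_LOCK_PR, O_WRONLY -> DLM_LOCK_EX,
     * O_NONBLOCK -> DLM_LKF_NOQUEUE; a refused NOQUEUE request comes
     * back as -ETXTBSY, per dlmfs_file_open() above. */
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/dlm/mydomain/mylock", O_WRONLY | O_NONBLOCK);

            if (fd < 0) {
                    if (errno == ETXTBSY)
                            fprintf(stderr, "lock held elsewhere\n");
                    return 1;
            }
            /* exclusive lock is held while the file stays open */
            close(fd);
            return 0;
    }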
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 8298608d4165..50c4ee805da4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1881,7 +1881,7 @@ out:
 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
 * flock() calls. The locking approach this requires is sufficiently
 * different from all other cluster lock types that we implement a
- * seperate path to the "low-level" dlm calls. In particular:
+ * separate path to the "low-level" dlm calls. In particular:
 *
 * - No optimization of lock levels is done - we take at exactly
 *   what's been requested.
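For reference, the comment above describes the handling behind ordinary flock(2) calls; a trylock from userspace looks like this (standard API; only the mount path is an assumption):

    #include <fcntl.h>
    #include <sys/file.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/mnt/ocfs2/shared.dat", O_RDWR);

            if (fd < 0)
                    return 1;
            /* LOCK_NB is what ends up as a no-queue cluster request */
            if (flock(fd, LOCK_EX | LOCK_NB) == 0) {
                    /* ... exclusive access ... */
                    flock(fd, LOCK_UN);
            }
            close(fd);
            return 0;
    }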
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 5328529e7fd2..09e3fdfa6d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,6 +24,7 @@
 
 #include <linux/fs.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/fiemap.h>
 
@@ -453,7 +454,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
 	if (i == -1) {
 		/*
 		 * Holes can be larger than the maximum size of an
-		 * extent, so we return their lengths in a seperate
+		 * extent, so we return their lengths in a separate
 		 * field.
 		 */
 		if (hole_len) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 17947dc8341e..f74f1400eccd 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -278,10 +278,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
 	inode->i_atime = CURRENT_TIME;
 	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
 	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
-
-	ret = ocfs2_journal_dirty(handle, bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, bh);
 
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -430,9 +427,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0)
-		mlog_errno(status);
+	ocfs2_journal_dirty(handle, fe_bh);
 
 out_commit:
 	ocfs2_commit_trans(osb, handle);
@@ -449,7 +444,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 	int status = 0;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_truncate_context *tc = NULL;
 
 	mlog_entry("(inode = %llu, new_i_size = %llu\n",
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -488,6 +482,9 @@ static int ocfs2_truncate_file(struct inode *inode,
 
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
+	ocfs2_resv_discard(&osb->osb_la_resmap,
+			   &OCFS2_I(inode)->ip_la_data_resv);
+
 	/*
 	 * The inode lock forced other nodes to sync and drop their
 	 * pages, which (correctly) happens even if we have a truncate
@@ -517,13 +514,7 @@ static int ocfs2_truncate_file(struct inode *inode,
 		goto bail_unlock_sem;
 	}
 
-	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_unlock_sem;
-	}
-
-	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
+	status = ocfs2_commit_truncate(osb, inode, di_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail_unlock_sem;
@@ -666,11 +657,7 @@ restarted_transaction:
 		goto leave;
 	}
 
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
+	ocfs2_journal_dirty(handle, bh);
 
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
@@ -684,6 +671,7 @@ restarted_transaction:
 	if (why == RESTART_META) {
 		mlog(0, "restarting function.\n");
 		restart_func = 1;
+		status = 0;
 	} else {
 		BUG_ON(why != RESTART_TRANS);
 
@@ -1194,9 +1182,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 	di = (struct ocfs2_dinode *) bh->b_data;
 	di->i_mode = cpu_to_le16(inode->i_mode);
 
-	ret = ocfs2_journal_dirty(handle, bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, bh);
 
 out_trans:
 	ocfs2_commit_trans(osb, handle);
@@ -1433,16 +1419,90 @@ out:
 	return ret;
 }
 
+static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
+{
+	int i;
+	struct ocfs2_extent_rec *rec = NULL;
+
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) < pos)
+			break;
+	}
+
+	return i;
+}
+
+/*
+ * Helper to calculate the punching pos and length in one run, we handle the
+ * following three cases in order:
+ *
+ * - remove the entire record
+ * - remove a partial record
+ * - no record needs to be removed (hole-punching completed)
+ */
+static void ocfs2_calc_trunc_pos(struct inode *inode,
+				 struct ocfs2_extent_list *el,
+				 struct ocfs2_extent_rec *rec,
+				 u32 trunc_start, u32 *trunc_cpos,
+				 u32 *trunc_len, u32 *trunc_end,
+				 u64 *blkno, int *done)
+{
+	int ret = 0;
+	u32 coff, range;
+
+	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+	if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
+		*trunc_cpos = le32_to_cpu(rec->e_cpos);
+		/*
+		 * Skip holes if any.
+		 */
+		if (range < *trunc_end)
+			*trunc_end = range;
+		*trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
+		*blkno = le64_to_cpu(rec->e_blkno);
+		*trunc_end = le32_to_cpu(rec->e_cpos);
+	} else if (range > trunc_start) {
+		*trunc_cpos = trunc_start;
+		*trunc_len = *trunc_end - trunc_start;
+		coff = trunc_start - le32_to_cpu(rec->e_cpos);
+		*blkno = le64_to_cpu(rec->e_blkno) +
+			 ocfs2_clusters_to_blocks(inode->i_sb, coff);
+		*trunc_end = trunc_start;
+	} else {
+		/*
+		 * It may have two following possibilities:
+		 *
+		 * - last record has been removed
+		 * - trunc_start was within a hole
+		 *
+		 * both two cases mean the completion of hole punching.
+		 */
+		ret = 1;
+	}
+
+	*done = ret;
+}
+
 static int ocfs2_remove_inode_range(struct inode *inode,
 				    struct buffer_head *di_bh, u64 byte_start,
 				    u64 byte_len)
 {
-	int ret = 0;
-	u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
+	int ret = 0, flags = 0, done = 0, i;
+	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
+	u32 cluster_in_el;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 	struct address_space *mapping = inode->i_mapping;
 	struct ocfs2_extent_tree et;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el = NULL;
+	struct ocfs2_extent_rec *rec = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
 
 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
 	ocfs2_init_dealloc_ctxt(&dealloc);
@@ -1468,17 +1528,35 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 		goto out;
 	}
 
+	/*
+	 * For reflinks, we may need to CoW 2 clusters which might be
+	 * partially zero'd later, if hole's start and end offset were
+	 * within one cluster(means is not exactly aligned to clustersize).
+	 */
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+
+		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
 	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
-	trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
-	if (trunc_len >= trunc_start)
-		trunc_len -= trunc_start;
-	else
-		trunc_len = 0;
+	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
+	cluster_in_el = trunc_end;
 
-	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
+	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     (unsigned long long)byte_start,
-	     (unsigned long long)byte_len, trunc_start, trunc_len);
+	     (unsigned long long)byte_len, trunc_start, trunc_end);
 
 	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
 	if (ret) {
@@ -1486,31 +1564,79 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 		goto out;
 	}
 
-	cpos = trunc_start;
-	while (trunc_len) {
-		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
-					 &alloc_size, NULL);
+	path = ocfs2_new_path_from_et(&et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (trunc_end > trunc_start) {
+
+		ret = ocfs2_find_path(INODE_CACHE(inode), path,
+				      cluster_in_el);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		if (alloc_size > trunc_len)
-			alloc_size = trunc_len;
+		el = path_leaf_el(path);
 
-		/* Only do work for non-holes */
-		if (phys_cpos != 0) {
-			ret = ocfs2_remove_btree_range(inode, &et, cpos,
-						       phys_cpos, alloc_size,
-						       &dealloc);
+		i = ocfs2_find_rec(el, trunc_end);
+		/*
+		 * Need to go to previous extent block.
+		 */
+		if (i < 0) {
+			if (path->p_tree_depth == 0)
+				break;
+
+			ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+							    path,
+							    &cluster_in_el);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
+
+			/*
+			 * We've reached the leftmost extent block,
+			 * it's safe to leave.
+			 */
+			if (cluster_in_el == 0)
+				break;
+
+			/*
+			 * The 'pos' searched for previous extent block is
+			 * always one cluster less than actual trunc_end.
+			 */
+			trunc_end = cluster_in_el + 1;
+
+			ocfs2_reinit_path(path, 1);
+
+			continue;
+
+		} else
+			rec = &el->l_recs[i];
+
+		ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
+				     &trunc_len, &trunc_end, &blkno, &done);
+		if (done)
+			break;
+
+		flags = rec->e_flags;
+		phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+		ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
					       phys_cpos, trunc_len, flags,
+					       &dealloc, refcount_loc);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
 		}
 
-		cpos += alloc_size;
-		trunc_len -= alloc_size;
+		cluster_in_el = trunc_end;
+
+		ocfs2_reinit_path(path, 1);
 	}
 
 	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
@@ -1981,18 +2107,18 @@ relock:
 	/* communicate with ocfs2_dio_end_io */
 	ocfs2_iocb_set_rw_locked(iocb, rw_level);
 
-	if (direct_io) {
-		ret = generic_segment_checks(iov, &nr_segs, &ocount,
-					     VERIFY_READ);
-		if (ret)
-			goto out_dio;
+	ret = generic_segment_checks(iov, &nr_segs, &ocount,
+				     VERIFY_READ);
+	if (ret)
+		goto out_dio;
 
 	count = ocount;
 	ret = generic_write_checks(file, ppos, &count,
 				   S_ISBLK(inode->i_mode));
 	if (ret)
 		goto out_dio;
 
+	if (direct_io) {
 		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
 						    ppos, count, ocount);
 		if (written < 0) {
@@ -2007,7 +2133,10 @@ relock:
 			goto out_dio;
 		}
 	} else {
-		written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
+		current->backing_dev_info = file->f_mapping->backing_dev_info;
+		written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
+						      ppos, count, 0);
+		current->backing_dev_info = NULL;
 	}
 
 out_dio:
@@ -2021,9 +2150,9 @@ out_dio:
 	if (ret < 0)
 		written = ret;
 
-	if (!ret && (old_size != i_size_read(inode) ||
-		     old_clusters != OCFS2_I(inode)->ip_clusters ||
+	if (!ret && ((old_size != i_size_read(inode)) ||
+		     (old_clusters != OCFS2_I(inode)->ip_clusters) ||
 		     has_refcount)) {
 		ret = jbd2_journal_force_commit(osb->journal->j_journal);
 		if (ret < 0)
 			written = ret;
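The rewritten ocfs2_remove_inode_range() above no longer walks the hole left-to-right with ocfs2_get_clusters(); it walks extent records right-to-left from trunc_end, letting ocfs2_calc_trunc_pos() classify each step (whole record, partial record, or done). A standalone sketch of that walk, with simplified stand-in types rather than the kernel structures:

    #include <stdio.h>

    struct rec { unsigned int cpos, clusters; };

    /* stand-in for ocfs2_remove_btree_range() */
    static void remove_range(unsigned int cpos, unsigned int len)
    {
            printf("punch clusters [%u, %u)\n", cpos, cpos + len);
    }

    static unsigned int min_u32(unsigned int a, unsigned int b)
    {
            return a < b ? a : b;
    }

    static void punch(const struct rec *recs, int nrecs,
                      unsigned int trunc_start, unsigned int trunc_end)
    {
            while (trunc_end > trunc_start) {
                    int i;

                    /* rightmost record starting below trunc_end
                     * (what ocfs2_find_rec() does per leaf) */
                    for (i = nrecs - 1; i >= 0; i--)
                            if (recs[i].cpos < trunc_end)
                                    break;
                    if (i < 0)
                            break;          /* nothing left of the hole */

                    if (recs[i].cpos >= trunc_start) {
                            /* whole record; clamp to skip trailing holes */
                            unsigned int end = min_u32(recs[i].cpos +
                                                       recs[i].clusters,
                                                       trunc_end);
                            remove_range(recs[i].cpos, end - recs[i].cpos);
                            trunc_end = recs[i].cpos;
                    } else if (recs[i].cpos + recs[i].clusters > trunc_start) {
                            /* partial record: punch only its tail */
                            remove_range(trunc_start, trunc_end - trunc_start);
                            trunc_end = trunc_start;
                    } else {
                            break;          /* trunc_start fell in a hole */
                    }
            }
    }

    int main(void)
    {
            const struct rec recs[] = { { 0, 4 }, { 8, 4 }, { 12, 8 } };

            punch(recs, 3, 2, 14);  /* punch clusters [2, 14) */
            return 0;
    }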
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db868..1aa863dd901f 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,7 +26,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 278a223aae14..abb0a95cc717 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
@@ -377,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	OCFS2_I(inode)->ip_last_used_slot = 0;
 	OCFS2_I(inode)->ip_last_used_group = 0;
+
+	if (S_ISDIR(inode->i_mode))
+		ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
+				    OCFS2_RESV_FLAG_DIR);
 	mlog_exit_void();
 }
 
@@ -540,7 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 				     struct buffer_head *fe_bh)
 {
 	int status = 0;
-	struct ocfs2_truncate_context *tc = NULL;
 	struct ocfs2_dinode *fe;
 	handle_t *handle = NULL;
 
@@ -559,6 +561,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 		if (IS_ERR(handle)) {
 			status = PTR_ERR(handle);
+			handle = NULL;
 			mlog_errno(status);
 			goto out;
 		}
@@ -582,13 +585,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
 
-		status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
-		if (status < 0) {
-			mlog_errno(status);
-			goto out;
-		}
-
-		status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+		status = ocfs2_commit_truncate(osb, inode, fe_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto out;
@@ -640,11 +637,13 @@ static int ocfs2_remove_inode(struct inode *inode,
 		goto bail_unlock;
 	}
 
-	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
-				  orphan_dir_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_commit;
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+					  orphan_dir_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_commit;
+		}
 	}
 
 	/* set the inodes dtime */
@@ -657,12 +656,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 
 	di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
 	di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
-
-	status = ocfs2_journal_dirty(handle, di_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_commit;
-	}
+	ocfs2_journal_dirty(handle, di_bh);
 
 	ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
 	dquot_free_inode(inode);
@@ -723,38 +717,39 @@ static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
 static int ocfs2_wipe_inode(struct inode *inode,
 			    struct buffer_head *di_bh)
 {
-	int status, orphaned_slot;
+	int status, orphaned_slot = -1;
 	struct inode *orphan_dir_inode = NULL;
 	struct buffer_head *orphan_dir_bh = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_dinode *di;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 
-	di = (struct ocfs2_dinode *) di_bh->b_data;
-	orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
 
 		status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
 		if (status)
 			return status;
 
 		orphan_dir_inode = ocfs2_get_system_file_inode(osb,
							       ORPHAN_DIR_SYSTEM_INODE,
							       orphaned_slot);
		if (!orphan_dir_inode) {
			status = -EEXIST;
			mlog_errno(status);
			goto bail;
		}

		/* Lock the orphan dir. The lock will be held for the entire
		 * delete_inode operation. We do this now to avoid races with
		 * recovery completion on other nodes. */
		mutex_lock(&orphan_dir_inode->i_mutex);
		status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
		if (status < 0) {
			mutex_unlock(&orphan_dir_inode->i_mutex);

			mlog_errno(status);
			goto bail;
+		}
 	}
 
 	/* we do this while holding the orphan dir lock because we
@@ -795,6 +790,9 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		mlog_errno(status);
 
 bail_unlock_dir:
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)
+		return status;
+
 	ocfs2_inode_unlock(orphan_dir_inode, 1);
 	mutex_unlock(&orphan_dir_inode->i_mutex);
 	brelse(orphan_dir_bh);
@@ -890,7 +888,23 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 
 	/* Do some basic inode verification... */
 	di = (struct ocfs2_dinode *) di_bh->b_data;
-	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) &&
+	    !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		/*
+		 * Inodes in the orphan dir must have ORPHANED_FL. The only
+		 * inodes that come back out of the orphan dir are reflink
+		 * targets. A reflink target may be moved out of the orphan
+		 * dir between the time we scan the directory and the time we
+		 * process it. This would lead to HAS_REFCOUNT_FL being set but
+		 * ORPHANED_FL not.
+		 */
+		if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
+			mlog(0, "Reflinked inode %llu is no longer orphaned. "
+			     "it shouldn't be deleted\n",
+			     (unsigned long long)oi->ip_blkno);
+			goto bail;
+		}
+
 		/* for lack of a better error? */
 		status = -EEXIST;
 		mlog(ML_ERROR,
@@ -958,7 +972,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
 void ocfs2_delete_inode(struct inode *inode)
 {
 	int wipe, status;
-	sigset_t blocked, oldset;
+	sigset_t oldset;
 	struct buffer_head *di_bh = NULL;
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
@@ -985,13 +999,7 @@ void ocfs2_delete_inode(struct inode *inode)
 	 * messaging paths may return us -ERESTARTSYS. Which would
 	 * cause us to exit early, resulting in inodes being orphaned
 	 * forever. */
-	sigfillset(&blocked);
-	status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
-	if (status < 0) {
-		mlog_errno(status);
-		ocfs2_cleanup_delete_inode(inode, 1);
-		goto bail;
-	}
+	ocfs2_block_signals(&oldset);
 
 	/*
 	 * Synchronize us against ocfs2_get_dentry. We take this in
@@ -1065,9 +1073,7 @@ bail_unlock_nfs_sync:
 	ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
 
 bail_unblock:
-	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
-	if (status < 0)
-		mlog_errno(status);
+	ocfs2_unblock_signals(&oldset);
bail:
 	clear_inode(inode);
 	mlog_exit_void();
@@ -1101,6 +1107,10 @@ void ocfs2_clear_inode(struct inode *inode)
 	ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
 
+	ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
+			   &oi->ip_la_data_resv);
+	ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
 	/* We very well may get a clear_inode before all an inodes
 	 * metadata has hit disk. Of course, we can't drop any cluster
 	 * locks until the journal has finished with it. The only
@@ -1276,13 +1286,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
 	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0)
-		mlog_errno(status);
-
-	status = 0;
+	ocfs2_journal_dirty(handle, bh);
leave:
-
 	mlog_exit(status);
 	return status;
 }
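ocfs2_block_signals()/ocfs2_unblock_signals() replace the open-coded sigfillset()/sigprocmask() pairs that this series deletes from inode.c, mmap.c and namei.c. A sketch of what such helpers amount to (illustrative; the in-kernel versions live in ocfs2's super code and need not match this exactly):

    /* Assumed shape of the helpers; mirrors the pattern being removed.
     * In-kernel, sigprocmask() on current cannot fail for these calls,
     * which is why the callers no longer check a return value. */
    void ocfs2_block_signals(sigset_t *oldset)
    {
            int rc;
            sigset_t blocked;

            sigfillset(&blocked);
            rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
            BUG_ON(rc);
    }

    void ocfs2_unblock_signals(sigset_t *oldset)
    {
            int rc = sigprocmask(SIG_SETMASK, oldset, NULL);

            BUG_ON(rc);
    }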
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ba4fe07b293c..9f5f5fcadc45 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -70,6 +70,8 @@ struct ocfs2_inode_info
 	/* Only valid if the inode is the dir. */
 	u32				ip_last_used_slot;
 	u64				ip_last_used_group;
+
+	struct ocfs2_alloc_reservation	ip_la_data_resv;
 };
 
 /*
@@ -100,6 +102,8 @@ struct ocfs2_inode_info
 #define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
 /* Does someone have the file open O_DIRECT */
 #define OCFS2_INODE_OPEN_DIRECT		0x00000040
+/* Tell the inode wipe code it's not in orphan dir */
+#define OCFS2_INODE_SKIP_ORPHAN_DIR	0x00000080
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9336c60e3a36..47878cf16418 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
 }
 
 /*
- * 'nblocks' is what you want to add to the current
- * transaction. extend_trans will either extend the current handle by
- * nblocks, or commit it and start a new one with nblocks credits.
+ * 'nblocks' is what you want to add to the current transaction.
 *
 * This might call jbd2_journal_restart() which will commit dirty buffers
 * and then restart the transaction. Before calling
@@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
 */
 int ocfs2_extend_trans(handle_t *handle, int nblocks)
 {
-	int status;
+	int status, old_nblocks;
 
 	BUG_ON(!handle);
-	BUG_ON(!nblocks);
+	BUG_ON(nblocks < 0);
+
+	if (!nblocks)
+		return 0;
 
+	old_nblocks = handle->h_buffer_credits;
 	mlog_entry_void();
 
 	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
@@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 		mlog(0,
 		     "jbd2_journal_extend failed, trying "
 		     "jbd2_journal_restart\n");
-		status = jbd2_journal_restart(handle, nblocks);
+		status = jbd2_journal_restart(handle,
+					      old_nblocks + nblocks);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
 	return __ocfs2_journal_access(handle, ci, bh, NULL, type);
 }
 
-int ocfs2_journal_dirty(handle_t *handle,
-			struct buffer_head *bh)
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
 {
 	int status;
 
@@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle,
 	     (unsigned long long)bh->b_blocknr);
 
 	status = jbd2_journal_dirty_metadata(handle, bh);
-	if (status < 0)
-		mlog(ML_ERROR, "Could not dirty metadata buffer. "
-		     "(bh->b_blocknr=%llu)\n",
-		     (unsigned long long)bh->b_blocknr);
+	BUG_ON(status);
 
-	mlog_exit(status);
-	return status;
+	mlog_exit_void();
 }
 
 #define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
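The ocfs2_extend_trans() fix above matters because jbd2_journal_restart() resets the handle to exactly the credit count passed in; restarting with only the new nblocks would silently shrink the reservation the caller already held. A hedged kernel-context fragment of the resulting extend-or-restart pattern (simplified from the function above, not verbatim):

    /* Sketch: extend in place if jbd2 can, otherwise commit and
     * restart with the old credits plus the new ones. */
    static int sketch_extend_trans(handle_t *handle, int nblocks)
    {
            int old_nblocks = handle->h_buffer_credits;

            if (jbd2_journal_extend(handle, nblocks) == 0)
                    return 0;               /* extended in place */

            return jbd2_journal_restart(handle, old_nblocks + nblocks);
    }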
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3f74e09b0d80..b5baaa8e710f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
 *	<modify the bh>
 *	ocfs2_journal_dirty(handle, bh);
 */
-int ocfs2_journal_dirty(handle_t *handle,
-			struct buffer_head *bh);
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
 
 /*
 *  Credit Macros:
@@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
 	return blocks;
 }
 
+/*
+ * Allocating a discontiguous block group requires the credits from
+ * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
+ * the group descriptor's extent list. The caller already has started
+ * the transaction with ocfs2_calc_group_alloc_credits(). They extend
+ * it with these credits.
+ */
+static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
+{
+	return ocfs2_extent_recs_per_gd(sb);
+}
+
 static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 						unsigned int clusters_to_del,
 						struct ocfs2_dinode *fe,
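Per the comment above, a caller allocating a possibly discontiguous block group starts the transaction with ocfs2_calc_group_alloc_credits() and then tops it up. The fragment below is an assumed usage sketch only; the variable names and surrounding context are illustrative, not from this patch:

    /* Hedged sketch of the intended call pattern. */
    handle = ocfs2_start_trans(osb, ocfs2_calc_group_alloc_credits(sb, cpg));
    if (IS_ERR(handle))
            return PTR_ERR(handle);

    /* discontiguous group: extend by one credit per extent record */
    status = ocfs2_extend_trans(handle, ocfs2_calc_bg_discontig_credits(sb));
    if (status < 0)
            goto out_commit;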
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ca992d91f511..3d7419682dc0 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 					     struct ocfs2_dinode *alloc,
-					     u32 numbits);
+					     u32 *numbits,
+					     struct ocfs2_alloc_reservation *resv);
 
 static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
 
@@ -74,6 +75,144 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
+/*
+ * ocfs2_la_default_mb() - determine a default size, in megabytes of
+ * the local alloc.
+ *
+ * Generally, we'd like to pick as large a local alloc as
+ * possible. Performance on large workloads tends to scale
+ * proportionally to la size. In addition to that, the reservations
+ * code functions more efficiently as it can reserve more windows for
+ * write.
+ *
+ * Some things work against us when trying to choose a large local alloc:
+ *
+ * - We need to ensure our sizing is picked to leave enough space in
+ *   group descriptors for other allocations (such as block groups,
+ *   etc). Picking default sizes which are a multiple of 4 could help
+ *   - block groups are allocated in 2mb and 4mb chunks.
+ *
+ * - Likewise, we don't want to starve other nodes of bits on small
+ *   file systems. This can easily be taken care of by limiting our
+ *   default to a reasonable size (256M) on larger cluster sizes.
+ *
+ * - Some file systems can't support very large sizes - 4k and 8k in
+ *   particular are limited to less than 128 and 256 megabytes respectively.
+ *
+ * The following reference table shows group descriptor and local
+ * alloc maximums at various cluster sizes (4k blocksize)
+ *
+ * csize: 4K	group: 126M	la: 121M
+ * csize: 8K	group: 252M	la: 243M
+ * csize: 16K	group: 504M	la: 486M
+ * csize: 32K	group: 1008M	la: 972M
+ * csize: 64K	group: 2016M	la: 1944M
+ * csize: 128K	group: 4032M	la: 3888M
+ * csize: 256K	group: 8064M	la: 7776M
+ * csize: 512K	group: 16128M	la: 15552M
+ * csize: 1024K	group: 32256M	la: 31104M
+ */
+#define OCFS2_LA_MAX_DEFAULT_MB	256
+#define OCFS2_LA_OLD_DEFAULT	8
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
+{
+	unsigned int la_mb;
+	unsigned int gd_mb;
+	unsigned int megs_per_slot;
+	struct super_block *sb = osb->sb;
+
+	gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
+		8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
+
+	/*
+	 * This takes care of files systems with very small group
+	 * descriptors - 512 byte blocksize at cluster sizes lower
+	 * than 16K and also 1k blocksize with 4k cluster size.
+	 */
+	if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
+	    || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
+		return OCFS2_LA_OLD_DEFAULT;
+
+	/*
+	 * Leave enough room for some block groups and make the final
+	 * value we work from a multiple of 4.
+	 */
+	gd_mb -= 16;
+	gd_mb &= 0xFFFFFFFB;
+
+	la_mb = gd_mb;
+
+	/*
+	 * Keep window sizes down to a reasonable default
+	 */
+	if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
+		/*
+		 * Some clustersize / blocksize combinations will have
+		 * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
+		 * default size, but get poor distribution when
+		 * limited to exactly 256 megabytes.
+		 *
+		 * As an example, 16K clustersize at 4K blocksize
+		 * gives us a cluster group size of 504M. Paring the
+		 * local alloc size down to 256 however, would give us
+		 * only one window and around 200MB left in the
+		 * cluster group. Instead, find the first size below
+		 * 256 which would give us an even distribution.
+		 *
+		 * Larger cluster group sizes actually work out pretty
+		 * well when pared to 256, so we don't have to do this
+		 * for any group that fits more than two
+		 * OCFS2_LA_MAX_DEFAULT_MB windows.
+		 */
+		if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
+			la_mb = 256;
+		else {
+			unsigned int gd_mult = gd_mb;
+
+			while (gd_mult > 256)
+				gd_mult = gd_mult >> 1;
+
+			la_mb = gd_mult;
+		}
+	}
+
+	megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
+	megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
+	/* Too many nodes, too few disk clusters. */
+	if (megs_per_slot < la_mb)
+		la_mb = megs_per_slot;
+
+	return la_mb;
+}
+
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
+{
+	struct super_block *sb = osb->sb;
+	unsigned int la_default_mb = ocfs2_la_default_mb(osb);
+	unsigned int la_max_mb;
+
+	la_max_mb = ocfs2_clusters_to_megabytes(sb,
+						ocfs2_local_alloc_size(sb) * 8);
+
+	mlog(0, "requested: %dM, max: %uM, default: %uM\n",
+	     requested_mb, la_max_mb, la_default_mb);
+
+	if (requested_mb == -1) {
+		/* No user request - use defaults */
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, la_default_mb);
+	} else if (requested_mb > la_max_mb) {
+		/* Request is too big, we give the maximum available */
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, la_max_mb);
+	} else {
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, requested_mb);
+	}
+
+	osb->local_alloc_bits = osb->local_alloc_default_bits;
+}
+
 static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
 	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -156,7 +295,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 		     osb->local_alloc_bits, (osb->bitmap_cpg - 1));
 		osb->local_alloc_bits =
 			ocfs2_megabytes_to_clusters(osb->sb,
-						    OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
+						    ocfs2_la_default_mb(osb));
 	}
 
 	/* read the alloc off disk */
@@ -262,6 +401,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 
 	osb->local_alloc_state = OCFS2_LA_DISABLED;
 
+	ocfs2_resmap_uninit(&osb->osb_la_resmap);
+
 	main_bm_inode = ocfs2_get_system_file_inode(osb,
 						    GLOBAL_BITMAP_SYSTEM_INODE,
 						    OCFS2_INVALID_SLOT);
@@ -305,12 +446,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	}
 
 	ocfs2_clear_local_alloc(alloc);
-
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, bh);
 
 	brelse(bh);
 	osb->local_alloc_bh = NULL;
@@ -481,46 +617,6 @@ out:
 	return status;
 }
 
-/* Check to see if the local alloc window is within ac->ac_max_block */
-static int ocfs2_local_alloc_in_range(struct inode *inode,
-				      struct ocfs2_alloc_context *ac,
-				      u32 bits_wanted)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_dinode *alloc;
-	struct ocfs2_local_alloc *la;
-	int start;
-	u64 block_off;
-
-	if (!ac->ac_max_block)
-		return 1;
-
-	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
-	la = OCFS2_LOCAL_ALLOC(alloc);
-
-	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
-	if (start == -1) {
-		mlog_errno(-ENOSPC);
-		return 0;
-	}
-
-	/*
-	 * Converting (bm_off + start + bits_wanted) to blocks gives us
-	 * the blkno just past our actual allocation. This is perfect
-	 * to compare with ac_max_block.
-	 */
-	block_off = ocfs2_clusters_to_blocks(inode->i_sb,
-					     le32_to_cpu(la->la_bm_off) +
-					     start + bits_wanted);
-	mlog(0, "Checking %llu against %llu\n",
-	     (unsigned long long)block_off,
-	     (unsigned long long)ac->ac_max_block);
-	if (block_off > ac->ac_max_block)
-		return 0;
-
-	return 1;
-}
-
 /*
 * make sure we've got at least bits_wanted contiguous bits in the
@@ -613,17 +709,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 		mlog(0, "Calling in_range for max block %llu\n",
 		     (unsigned long long)ac->ac_max_block);
 
-		if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
-						bits_wanted)) {
-			/*
-			 * The window is outside ac->ac_max_block.
-			 * This errno tells the caller to keep localalloc enabled
-			 * but to get the allocation from the main bitmap.
-			 */
-			status = -EFBIG;
-			goto bail;
-		}
-
 		ac->ac_inode = local_alloc_inode;
 		/* We should never use localalloc from another slot */
 		ac->ac_alloc_slot = osb->slot_num;
@@ -664,7 +749,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 	la = OCFS2_LOCAL_ALLOC(alloc);
 
-	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
+						  ac->ac_resv);
 	if (start == -1) {
 		/* TODO: Shouldn't we just BUG here? */
 		status = -ENOSPC;
@@ -674,8 +760,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 
 	bitmap = la->la_bitmap;
 	*bit_off = le32_to_cpu(la->la_bm_off) + start;
-	/* local alloc is always contiguous by nature -- we never
-	 * delete bits from it! */
 	*num_bits = bits_wanted;
 
 	status = ocfs2_journal_access_di(handle,
@@ -687,18 +771,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
+				  bits_wanted);
+
 	while(bits_wanted--)
 		ocfs2_set_bit(start++, bitmap);
 
 	le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
+	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
 
-	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	status = 0;
bail:
 	mlog_exit(status);
 	return status;
@@ -722,13 +803,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
 }
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 					     struct ocfs2_dinode *alloc,
-					     u32 numbits)
+					     u32 *numbits,
+					     struct ocfs2_alloc_reservation *resv)
 {
 	int numfound, bitoff, left, startoff, lastzero;
+	int local_resv = 0;
+	struct ocfs2_alloc_reservation r;
 	void *bitmap = NULL;
+	struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
 
-	mlog_entry("(numbits wanted = %u)\n", numbits);
+	mlog_entry("(numbits wanted = %u)\n", *numbits);
 
 	if (!alloc->id1.bitmap1.i_total) {
 		mlog(0, "No bits in my window!\n");
@@ -736,6 +821,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	if (!resv) {
+		local_resv = 1;
+		ocfs2_resv_init_once(&r);
+		ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
+		resv = &r;
+	}
+
+	numfound = *numbits;
+	if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
+		if (numfound < *numbits)
+			*numbits = numfound;
+		goto bail;
+	}
+
+	/*
+	 * Code error. While reservations are enabled, local
+	 * allocation should _always_ go through them.
+	 */
+	BUG_ON(osb->osb_resv_level != 0);
+
+	/*
+	 * Reservations are disabled. Handle this the old way.
+	 */
+
 	bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
 
 	numfound = bitoff = startoff = 0;
@@ -761,7 +870,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 			startoff = bitoff+1;
 		}
 		/* we got everything we needed */
-		if (numfound == numbits) {
+		if (numfound == *numbits) {
 			/* mlog(0, "Found it all!\n"); */
 			break;
 		}
@@ -770,12 +879,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 	mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
 	     numfound);
 
-	if (numfound == numbits)
+	if (numfound == *numbits)
 		bitoff = startoff - numfound;
 	else
 		bitoff = -1;
 
bail:
+	if (local_resv)
+		ocfs2_resv_discard(resmap, resv);
+
 	mlog_exit(bitoff);
 	return bitoff;
 }
@@ -872,8 +984,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 		     (unsigned long long)la_start_blk,
 		     (unsigned long long)blkno);
 
-		status = ocfs2_free_clusters(handle, main_bm_inode,
-					     main_bm_bh, blkno, count);
+		status = ocfs2_release_clusters(handle,
+						main_bm_inode,
+						main_bm_bh, blkno,
+						count);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -984,8 +1098,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 	}
 
retry_enospc:
-	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
-
+	(*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
 	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
 	if (status == -ENOSPC) {
 		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1048,7 +1161,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 	/* we used the generic suballoc reserve function, but we set
 	 * everything up nicely, so there's no reason why we can't use
 	 * the more specific cluster api to claim bits. */
-	status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
+	status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
 				      &cluster_off, &cluster_count);
 	if (status == -ENOSPC) {
retry_enospc:
@@ -1061,7 +1174,8 @@ retry_enospc:
 		    OCFS2_LA_DISABLED)
 			goto bail;
 
-		status = ocfs2_claim_clusters(osb, handle, ac,
+		ac->ac_bits_wanted = osb->local_alloc_default_bits;
+		status = ocfs2_claim_clusters(handle, ac,
 					      osb->local_alloc_bits,
 					      &cluster_off,
 					      &cluster_count);
@@ -1096,6 +1210,9 @@ retry_enospc:
 	memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
 	       le16_to_cpu(la->la_size));
 
+	ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
+			     OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
+
 	mlog(0, "New window allocated:\n");
 	mlog(0, "window la_bm_off = %u\n",
 	     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
@@ -1167,12 +1284,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 	}
 
 	ocfs2_clear_local_alloc(alloc);
-
-	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
 
 	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
 					  main_bm_inode, main_bm_bh);
@@ -1190,7 +1302,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 
 	atomic_inc(&osb->alloc_stats.moves);
 
-	status = 0;
bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
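To make the sizing table and the "even distribution" comment above concrete, here is a userspace re-implementation of just the paring step of ocfs2_la_default_mb() (gd_mb is taken as an input; the small-blocksize special cases and the per-slot clamp are omitted):

    #include <stdio.h>

    static unsigned int la_default_mb(unsigned int gd_mb)
    {
            unsigned int la_mb;

            gd_mb -= 16;            /* leave room for block groups */
            gd_mb &= 0xFFFFFFFB;    /* mask low bits as the kernel code does */
            la_mb = gd_mb;

            if (la_mb > 256) {
                    if (gd_mb > 2 * 256) {
                            la_mb = 256;    /* big groups pare well to 256M */
                    } else {
                            unsigned int gd_mult = gd_mb;

                            while (gd_mult > 256)  /* halve until it fits */
                                    gd_mult >>= 1;
                            la_mb = gd_mult;
                    }
            }
            return la_mb;
    }

    int main(void)
    {
            /* 16K clusters @ 4K blocks: ~504M group -> 244M, not 256M */
            printf("%u\n", la_default_mb(504));
            return 0;
    }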
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f86653..1be9b5864460 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
 
 void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
 
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
+
 int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 				     int node_num,
 				     struct ocfs2_dinode **alloc_copy);
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
-	if (__mandatory_lock(inode))
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424a..af2b8fe1f139 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/uio.h>
@@ -42,44 +41,20 @@
 #include "file.h"
 #include "inode.h"
 #include "mmap.h"
+#include "super.h"
 
-static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
-{
-	/* The best way to deal with signals in the vm path is
-	 * to block them upfront, rather than allowing the
-	 * locking paths to return -ERESTARTSYS. */
-	sigfillset(blocked);
-
-	/* We should technically never get a bad return value
-	 * from sigprocmask */
-	return sigprocmask(SIG_BLOCK, blocked, oldset);
-}
-
-static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
-{
-	return sigprocmask(SIG_SETMASK, oldset, NULL);
-}
 
 static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 {
-	sigset_t blocked, oldset;
-	int error, ret;
+	sigset_t oldset;
+	int ret;
 
 	mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
 
-	error = ocfs2_vm_op_block_sigs(&blocked, &oldset);
-	if (error < 0) {
-		mlog_errno(error);
-		ret = VM_FAULT_SIGBUS;
-		goto out;
-	}
-
+	ocfs2_block_signals(&oldset);
 	ret = filemap_fault(area, vmf);
+	ocfs2_unblock_signals(&oldset);
 
-	error = ocfs2_vm_op_unblock_sigs(&oldset);
-	if (error < 0)
-		mlog_errno(error);
-out:
 	mlog_exit_ptr(vmf->page);
 	return ret;
 }
@@ -159,14 +134,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct buffer_head *di_bh = NULL;
-	sigset_t blocked, oldset;
-	int ret, ret2;
+	sigset_t oldset;
+	int ret;
 
-	ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
-	}
+	ocfs2_block_signals(&oldset);
 
 	/*
 	 * The cluster locks taken will block a truncate from another
@@ -194,9 +165,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	ocfs2_inode_unlock(inode, 1);
 
out:
-	ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
-	if (ret2 < 0)
-		mlog_errno(ret2);
+	ocfs2_unblock_signals(&oldset);
 	if (ret)
 		ret = VM_FAULT_SIGBUS;
 	return ret;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d9cd4e373a53..db5dd3ed4df4 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
84static int ocfs2_orphan_add(struct ocfs2_super *osb, 84static int ocfs2_orphan_add(struct ocfs2_super *osb,
85 handle_t *handle, 85 handle_t *handle,
86 struct inode *inode, 86 struct inode *inode,
87 struct ocfs2_dinode *fe, 87 struct buffer_head *fe_bh,
88 char *name, 88 char *name,
89 struct ocfs2_dir_lookup_result *lookup, 89 struct ocfs2_dir_lookup_result *lookup,
90 struct inode *orphan_dir_inode); 90 struct inode *orphan_dir_inode);
@@ -239,6 +239,8 @@ static int ocfs2_mknod(struct inode *dir,
239 }; 239 };
240 int did_quota_inode = 0; 240 int did_quota_inode = 0;
241 struct ocfs2_dir_lookup_result lookup = { NULL, }; 241 struct ocfs2_dir_lookup_result lookup = { NULL, };
242 sigset_t oldset;
243 int did_block_signals = 0;
242 244
243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 245 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
244 (unsigned long)dev, dentry->d_name.len, 246 (unsigned long)dev, dentry->d_name.len,
@@ -350,6 +352,10 @@ static int ocfs2_mknod(struct inode *dir,
350 goto leave; 352 goto leave;
351 } 353 }
352 354
355 /* Starting to change things; restart is no longer possible. */
356 ocfs2_block_signals(&oldset);
357 did_block_signals = 1;
358
353 status = dquot_alloc_inode(inode); 359 status = dquot_alloc_inode(inode);
354 if (status) 360 if (status)
355 goto leave; 361 goto leave;
@@ -384,11 +390,7 @@ static int ocfs2_mknod(struct inode *dir,
384 goto leave; 390 goto leave;
385 } 391 }
386 ocfs2_add_links_count(dirfe, 1); 392 ocfs2_add_links_count(dirfe, 1);
387 status = ocfs2_journal_dirty(handle, parent_fe_bh); 393 ocfs2_journal_dirty(handle, parent_fe_bh);
388 if (status < 0) {
389 mlog_errno(status);
390 goto leave;
391 }
392 inc_nlink(dir); 394 inc_nlink(dir);
393 } 395 }
394 396
@@ -408,23 +410,28 @@ static int ocfs2_mknod(struct inode *dir,
408 } 410 }
409 } 411 }
410 412
411 status = ocfs2_add_entry(handle, dentry, inode, 413 /*
412 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 414 * Do this before adding the entry to the directory. We
413 &lookup); 415 * also set d_op after success so that ->d_iput() will clean up
414 if (status < 0) { 416 * the dentry lock even if ocfs2_add_entry() fails below.
417 */
418 status = ocfs2_dentry_attach_lock(dentry, inode,
419 OCFS2_I(dir)->ip_blkno);
420 if (status) {
415 mlog_errno(status); 421 mlog_errno(status);
416 goto leave; 422 goto leave;
417 } 423 }
424 dentry->d_op = &ocfs2_dentry_ops;
418 425
419 status = ocfs2_dentry_attach_lock(dentry, inode, 426 status = ocfs2_add_entry(handle, dentry, inode,
420 OCFS2_I(dir)->ip_blkno); 427 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
421 if (status) { 428 &lookup);
429 if (status < 0) {
422 mlog_errno(status); 430 mlog_errno(status);
423 goto leave; 431 goto leave;
424 } 432 }
425 433
426 insert_inode_hash(inode); 434 insert_inode_hash(inode);
427 dentry->d_op = &ocfs2_dentry_ops;
428 d_instantiate(dentry, inode); 435 d_instantiate(dentry, inode);
429 status = 0; 436 status = 0;
430leave: 437leave:
@@ -434,6 +441,8 @@ leave:
434 ocfs2_commit_trans(osb, handle); 441 ocfs2_commit_trans(osb, handle);
435 442
436 ocfs2_inode_unlock(dir, 1); 443 ocfs2_inode_unlock(dir, 1);
444 if (did_block_signals)
445 ocfs2_unblock_signals(&oldset);
437 446
438 if (status == -ENOSPC) 447 if (status == -ENOSPC)
439 mlog(0, "Disk is full\n"); 448 mlog(0, "Disk is full\n");
@@ -445,11 +454,6 @@ leave:
445 454
446 ocfs2_free_dir_lookup_result(&lookup); 455 ocfs2_free_dir_lookup_result(&lookup);
447 456
448 if ((status < 0) && inode) {
449 clear_nlink(inode);
450 iput(inode);
451 }
452
453 if (inode_ac) 457 if (inode_ac)
454 ocfs2_free_alloc_context(inode_ac); 458 ocfs2_free_alloc_context(inode_ac);
455 459
@@ -459,6 +463,17 @@ leave:
459 if (meta_ac) 463 if (meta_ac)
460 ocfs2_free_alloc_context(meta_ac); 464 ocfs2_free_alloc_context(meta_ac);
461 465
466 /*
467 * We should call iput after the i_mutex of the bitmap has been
468 * unlocked in ocfs2_free_alloc_context, or
469 * ocfs2_delete_inode will mutex_lock it again.
470 */
471 if ((status < 0) && inode) {
472 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
473 clear_nlink(inode);
474 iput(inode);
475 }
476
462 mlog_exit(status); 477 mlog_exit(status);
463 478
464 return status; 479 return status;
@@ -476,14 +491,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
476 int status = 0; 491 int status = 0;
477 struct ocfs2_dinode *fe = NULL; 492 struct ocfs2_dinode *fe = NULL;
478 struct ocfs2_extent_list *fel; 493 struct ocfs2_extent_list *fel;
479 u64 fe_blkno = 0; 494 u64 suballoc_loc, fe_blkno = 0;
480 u16 suballoc_bit; 495 u16 suballoc_bit;
481 u16 feat; 496 u16 feat;
482 497
483 *new_fe_bh = NULL; 498 *new_fe_bh = NULL;
484 499
485 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, 500 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
486 inode_ac, &suballoc_bit, &fe_blkno); 501 inode_ac, &suballoc_loc,
502 &suballoc_bit, &fe_blkno);
487 if (status < 0) { 503 if (status < 0) {
488 mlog_errno(status); 504 mlog_errno(status);
489 goto leave; 505 goto leave;
@@ -520,6 +536,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
520 fe->i_generation = cpu_to_le32(inode->i_generation); 536 fe->i_generation = cpu_to_le32(inode->i_generation);
521 fe->i_fs_generation = cpu_to_le32(osb->fs_generation); 537 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
522 fe->i_blkno = cpu_to_le64(fe_blkno); 538 fe->i_blkno = cpu_to_le64(fe_blkno);
539 fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
523 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 540 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
524 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 541 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
525 fe->i_uid = cpu_to_le32(inode->i_uid); 542 fe->i_uid = cpu_to_le32(inode->i_uid);
@@ -556,11 +573,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
556 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); 573 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
557 } 574 }
558 575
559 status = ocfs2_journal_dirty(handle, *new_fe_bh); 576 ocfs2_journal_dirty(handle, *new_fe_bh);
560 if (status < 0) {
561 mlog_errno(status);
562 goto leave;
563 }
564 577
565 ocfs2_populate_inode(inode, fe, 1); 578 ocfs2_populate_inode(inode, fe, 1);
566 ocfs2_ci_set_new(osb, INODE_CACHE(inode)); 579 ocfs2_ci_set_new(osb, INODE_CACHE(inode));
@@ -626,6 +639,7 @@ static int ocfs2_link(struct dentry *old_dentry,
626 struct ocfs2_dinode *fe = NULL; 639 struct ocfs2_dinode *fe = NULL;
627 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 640 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
628 struct ocfs2_dir_lookup_result lookup = { NULL, }; 641 struct ocfs2_dir_lookup_result lookup = { NULL, };
642 sigset_t oldset;
629 643
630 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 644 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
631 old_dentry->d_name.len, old_dentry->d_name.name, 645 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -682,6 +696,9 @@ static int ocfs2_link(struct dentry *old_dentry,
682 goto out_unlock_inode; 696 goto out_unlock_inode;
683 } 697 }
684 698
699 /* Starting to change things; restart is no longer possible. */
700 ocfs2_block_signals(&oldset);
701
685 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 702 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
686 OCFS2_JOURNAL_ACCESS_WRITE); 703 OCFS2_JOURNAL_ACCESS_WRITE);
687 if (err < 0) { 704 if (err < 0) {
@@ -694,14 +711,7 @@ static int ocfs2_link(struct dentry *old_dentry,
694 ocfs2_set_links_count(fe, inode->i_nlink); 711 ocfs2_set_links_count(fe, inode->i_nlink);
695 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 712 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
696 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 713 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
697 714 ocfs2_journal_dirty(handle, fe_bh);
698 err = ocfs2_journal_dirty(handle, fe_bh);
699 if (err < 0) {
700 ocfs2_add_links_count(fe, -1);
701 drop_nlink(inode);
702 mlog_errno(err);
703 goto out_commit;
704 }
705 715
706 err = ocfs2_add_entry(handle, dentry, inode, 716 err = ocfs2_add_entry(handle, dentry, inode,
707 OCFS2_I(inode)->ip_blkno, 717 OCFS2_I(inode)->ip_blkno,
@@ -725,6 +735,7 @@ static int ocfs2_link(struct dentry *old_dentry,
725 735
726out_commit: 736out_commit:
727 ocfs2_commit_trans(osb, handle); 737 ocfs2_commit_trans(osb, handle);
738 ocfs2_unblock_signals(&oldset);
728out_unlock_inode: 739out_unlock_inode:
729 ocfs2_inode_unlock(inode, 1); 740 ocfs2_inode_unlock(inode, 1);
730 741
@@ -879,7 +890,7 @@ static int ocfs2_unlink(struct inode *dir,
879 fe = (struct ocfs2_dinode *) fe_bh->b_data; 890 fe = (struct ocfs2_dinode *) fe_bh->b_data;
880 891
881 if (inode_is_unlinkable(inode)) { 892 if (inode_is_unlinkable(inode)) {
882 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 893 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
883 &orphan_insert, orphan_dir); 894 &orphan_insert, orphan_dir);
884 if (status < 0) { 895 if (status < 0) {
885 mlog_errno(status); 896 mlog_errno(status);
@@ -898,12 +909,7 @@ static int ocfs2_unlink(struct inode *dir,
898 drop_nlink(inode); 909 drop_nlink(inode);
899 drop_nlink(inode); 910 drop_nlink(inode);
900 ocfs2_set_links_count(fe, inode->i_nlink); 911 ocfs2_set_links_count(fe, inode->i_nlink);
901 912 ocfs2_journal_dirty(handle, fe_bh);
902 status = ocfs2_journal_dirty(handle, fe_bh);
903 if (status < 0) {
904 mlog_errno(status);
905 goto leave;
906 }
907 913
908 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 914 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
909 if (S_ISDIR(inode->i_mode)) 915 if (S_ISDIR(inode->i_mode))
@@ -1300,7 +1306,7 @@ static int ocfs2_rename(struct inode *old_dir,
1300 if (S_ISDIR(new_inode->i_mode) || 1306 if (S_ISDIR(new_inode->i_mode) ||
1301 (ocfs2_read_links_count(newfe) == 1)) { 1307 (ocfs2_read_links_count(newfe) == 1)) {
1302 status = ocfs2_orphan_add(osb, handle, new_inode, 1308 status = ocfs2_orphan_add(osb, handle, new_inode,
1303 newfe, orphan_name, 1309 newfe_bh, orphan_name,
1304 &orphan_insert, orphan_dir); 1310 &orphan_insert, orphan_dir);
1305 if (status < 0) { 1311 if (status < 0) {
1306 mlog_errno(status); 1312 mlog_errno(status);
@@ -1321,12 +1327,7 @@ static int ocfs2_rename(struct inode *old_dir,
1321 ocfs2_set_links_count(newfe, 0); 1327 ocfs2_set_links_count(newfe, 0);
1322 else 1328 else
1323 ocfs2_add_links_count(newfe, -1); 1329 ocfs2_add_links_count(newfe, -1);
1324 1330 ocfs2_journal_dirty(handle, newfe_bh);
1325 status = ocfs2_journal_dirty(handle, newfe_bh);
1326 if (status < 0) {
1327 mlog_errno(status);
1328 goto bail;
1329 }
1330 } else { 1331 } else {
1331 /* if the name was not found in new_dir, add it now */ 1332 /* if the name was not found in new_dir, add it now */
1332 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1333 status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1345,10 +1346,7 @@ static int ocfs2_rename(struct inode *old_dir,
1345 1346
1346 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); 1347 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
1347 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); 1348 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
1348 1349 ocfs2_journal_dirty(handle, old_inode_bh);
1349 status = ocfs2_journal_dirty(handle, old_inode_bh);
1350 if (status < 0)
1351 mlog_errno(status);
1352 } else 1350 } else
1353 mlog_errno(status); 1351 mlog_errno(status);
1354 1352
@@ -1420,7 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir,
1420 OCFS2_JOURNAL_ACCESS_WRITE); 1418 OCFS2_JOURNAL_ACCESS_WRITE);
1421 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1419 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1422 ocfs2_set_links_count(fe, old_dir->i_nlink); 1420 ocfs2_set_links_count(fe, old_dir->i_nlink);
1423 status = ocfs2_journal_dirty(handle, old_dir_bh); 1421 ocfs2_journal_dirty(handle, old_dir_bh);
1424 } 1422 }
1425 } 1423 }
1426 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1424 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1552,11 +1550,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1552 (bytes_left > sb->s_blocksize) ? sb->s_blocksize : 1550 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1553 bytes_left); 1551 bytes_left);
1554 1552
1555 status = ocfs2_journal_dirty(handle, bhs[virtual]); 1553 ocfs2_journal_dirty(handle, bhs[virtual]);
1556 if (status < 0) {
1557 mlog_errno(status);
1558 goto bail;
1559 }
1560 1554
1561 virtual++; 1555 virtual++;
1562 p_blkno++; 1556 p_blkno++;
@@ -1600,6 +1594,8 @@ static int ocfs2_symlink(struct inode *dir,
1600 }; 1594 };
1601 int did_quota = 0, did_quota_inode = 0; 1595 int did_quota = 0, did_quota_inode = 0;
1602 struct ocfs2_dir_lookup_result lookup = { NULL, }; 1596 struct ocfs2_dir_lookup_result lookup = { NULL, };
1597 sigset_t oldset;
1598 int did_block_signals = 0;
1603 1599
1604 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1600 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1605 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1601 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1695,6 +1691,10 @@ static int ocfs2_symlink(struct inode *dir,
1695 goto bail; 1691 goto bail;
1696 } 1692 }
1697 1693
1694 /* Starting to change things; restart is no longer possible. */
1695 ocfs2_block_signals(&oldset);
1696 did_block_signals = 1;
1697
1698 status = dquot_alloc_inode(inode); 1698 status = dquot_alloc_inode(inode);
1699 if (status) 1699 if (status)
1700 goto bail; 1700 goto bail;
@@ -1771,22 +1771,27 @@ static int ocfs2_symlink(struct inode *dir,
1771 } 1771 }
1772 } 1772 }
1773 1773
1774 status = ocfs2_add_entry(handle, dentry, inode, 1774 /*
1775 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1775 * Do this before adding the entry to the directory. We
1776 &lookup); 1776 * also set d_op after success so that ->d_iput() will clean up
1777 if (status < 0) { 1777 * the dentry lock even if ocfs2_add_entry() fails below.
1778 */
1779 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
1780 if (status) {
1778 mlog_errno(status); 1781 mlog_errno(status);
1779 goto bail; 1782 goto bail;
1780 } 1783 }
1784 dentry->d_op = &ocfs2_dentry_ops;
1781 1785
1782 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); 1786 status = ocfs2_add_entry(handle, dentry, inode,
1783 if (status) { 1787 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1788 &lookup);
1789 if (status < 0) {
1784 mlog_errno(status); 1790 mlog_errno(status);
1785 goto bail; 1791 goto bail;
1786 } 1792 }
1787 1793
1788 insert_inode_hash(inode); 1794 insert_inode_hash(inode);
1789 dentry->d_op = &ocfs2_dentry_ops;
1790 d_instantiate(dentry, inode); 1795 d_instantiate(dentry, inode);
1791bail: 1796bail:
1792 if (status < 0 && did_quota) 1797 if (status < 0 && did_quota)
@@ -1798,6 +1803,8 @@ bail:
1798 ocfs2_commit_trans(osb, handle); 1803 ocfs2_commit_trans(osb, handle);
1799 1804
1800 ocfs2_inode_unlock(dir, 1); 1805 ocfs2_inode_unlock(dir, 1);
1806 if (did_block_signals)
1807 ocfs2_unblock_signals(&oldset);
1801 1808
1802 brelse(new_fe_bh); 1809 brelse(new_fe_bh);
1803 brelse(parent_fe_bh); 1810 brelse(parent_fe_bh);
@@ -1811,6 +1818,7 @@ bail:
1811 if (xattr_ac) 1818 if (xattr_ac)
1812 ocfs2_free_alloc_context(xattr_ac); 1819 ocfs2_free_alloc_context(xattr_ac);
1813 if ((status < 0) && inode) { 1820 if ((status < 0) && inode) {
1821 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
1814 clear_nlink(inode); 1822 clear_nlink(inode);
1815 iput(inode); 1823 iput(inode);
1816 } 1824 }
@@ -1911,7 +1919,7 @@ leave:
1911static int ocfs2_orphan_add(struct ocfs2_super *osb, 1919static int ocfs2_orphan_add(struct ocfs2_super *osb,
1912 handle_t *handle, 1920 handle_t *handle,
1913 struct inode *inode, 1921 struct inode *inode,
1914 struct ocfs2_dinode *fe, 1922 struct buffer_head *fe_bh,
1915 char *name, 1923 char *name,
1916 struct ocfs2_dir_lookup_result *lookup, 1924 struct ocfs2_dir_lookup_result *lookup,
1917 struct inode *orphan_dir_inode) 1925 struct inode *orphan_dir_inode)
@@ -1919,6 +1927,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1919 struct buffer_head *orphan_dir_bh = NULL; 1927 struct buffer_head *orphan_dir_bh = NULL;
1920 int status = 0; 1928 int status = 0;
1921 struct ocfs2_dinode *orphan_fe; 1929 struct ocfs2_dinode *orphan_fe;
1930 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1922 1931
1923 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1932 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1924 1933
@@ -1943,29 +1952,42 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1943 if (S_ISDIR(inode->i_mode)) 1952 if (S_ISDIR(inode->i_mode))
1944 ocfs2_add_links_count(orphan_fe, 1); 1953 ocfs2_add_links_count(orphan_fe, 1);
1945 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 1954 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1955 ocfs2_journal_dirty(handle, orphan_dir_bh);
1946 1956
1947 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 1957 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1958 OCFS2_ORPHAN_NAMELEN, inode,
1959 OCFS2_I(inode)->ip_blkno,
1960 orphan_dir_bh, lookup);
1948 if (status < 0) { 1961 if (status < 0) {
1949 mlog_errno(status); 1962 mlog_errno(status);
1950 goto leave; 1963 goto leave;
1951 } 1964 }
1952 1965
1953 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1966 /*
1954 OCFS2_ORPHAN_NAMELEN, inode, 1967 * We're going to journal the change of i_flags and i_orphaned_slot.
1955 OCFS2_I(inode)->ip_blkno, 1968 * It's safe anyway, though some callers may duplicate the journaling.
1956 orphan_dir_bh, lookup); 1969 * Journaling within the function just makes the logic
1970 * look more straightforward.
1971 */
1972 status = ocfs2_journal_access_di(handle,
1973 INODE_CACHE(inode),
1974 fe_bh,
1975 OCFS2_JOURNAL_ACCESS_WRITE);
1957 if (status < 0) { 1976 if (status < 0) {
1958 mlog_errno(status); 1977 mlog_errno(status);
1959 goto leave; 1978 goto leave;
1960 } 1979 }
1961 1980
1962 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 1981 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
1982 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
1963 1983
1964 /* Record which orphan dir our inode now resides 1984 /* Record which orphan dir our inode now resides
1965 * in. delete_inode will use this to determine which orphan 1985 * in. delete_inode will use this to determine which orphan
1966 * dir to lock. */ 1986 * dir to lock. */
1967 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 1987 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
1968 1988
1989 ocfs2_journal_dirty(handle, fe_bh);
1990
1969 mlog(0, "Inode %llu orphaned in slot %d\n", 1991 mlog(0, "Inode %llu orphaned in slot %d\n",
1970 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 1992 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
1971 1993
@@ -2029,12 +2051,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2029 if (S_ISDIR(inode->i_mode)) 2051 if (S_ISDIR(inode->i_mode))
2030 ocfs2_add_links_count(orphan_fe, -1); 2052 ocfs2_add_links_count(orphan_fe, -1);
2031 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2053 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2032 2054 ocfs2_journal_dirty(handle, orphan_dir_bh);
2033 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2034 if (status < 0) {
2035 mlog_errno(status);
2036 goto leave;
2037 }
2038 2055
2039leave: 2056leave:
2040 ocfs2_free_dir_lookup_result(&lookup); 2057 ocfs2_free_dir_lookup_result(&lookup);
@@ -2123,7 +2140,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2123 } 2140 }
2124 2141
2125 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2142 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2126 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, 2143 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2127 &orphan_insert, orphan_dir); 2144 &orphan_insert, orphan_dir);
2128 if (status < 0) { 2145 if (status < 0) {
2129 mlog_errno(status); 2146 mlog_errno(status);
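Three recurring error-path patterns in the namei.c hunks above: signals are blocked only once the transaction starts modifying state and restored through a did_block_signals flag on the shared exit path; the dentry lock and d_op are attached before ocfs2_add_entry() so ->d_iput() can clean up even on failure; and the final iput() is deferred until after the allocation contexts are freed, with OCFS2_INODE_SKIP_ORPHAN_DIR marking inodes that never made it to the orphan directory. A compilable sketch of the first pattern, with illustrative stand-ins for the ocfs2 phases:

#include <signal.h>

static void block_signals(sigset_t *oldset)
{
        sigset_t all;

        sigfillset(&all);
        sigprocmask(SIG_BLOCK, &all, oldset);
}

static void unblock_signals(sigset_t *oldset)
{
        sigprocmask(SIG_SETMASK, oldset, NULL);
}

static int do_reservations(void) { return 0; }  /* restartable phase */
static int do_mutations(void)    { return 0; }  /* destructive phase */

static int create_object(void)
{
        sigset_t oldset;
        int did_block_signals = 0;
        int status;

        status = do_reservations();
        if (status < 0)
                goto leave;

        /* Starting to change things; restart is no longer possible. */
        block_signals(&oldset);
        did_block_signals = 1;

        status = do_mutations();

leave:
        /* Shared exit path: only restore a mask we actually saved. */
        if (did_block_signals)
                unblock_signals(&oldset);
        return status;
}

int main(void)
{
        return create_object();
}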
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1238b491db90..c67003b6b5a2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -47,6 +47,7 @@
47/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
48#include "blockcheck.h" 48#include "blockcheck.h"
49 49
50#include "reservations.h"
50 51
51/* Caching of metadata buffers */ 52/* Caching of metadata buffers */
52 53
@@ -341,6 +342,9 @@ struct ocfs2_super
341 */ 342 */
342 unsigned int local_alloc_bits; 343 unsigned int local_alloc_bits;
343 unsigned int local_alloc_default_bits; 344 unsigned int local_alloc_default_bits;
345 /* osb_clusters_at_boot can become stale! Do not trust it to
346 * be up to date. */
347 unsigned int osb_clusters_at_boot;
344 348
345 enum ocfs2_local_alloc_state local_alloc_state; /* protected 349 enum ocfs2_local_alloc_state local_alloc_state; /* protected
346 * by osb_lock */ 350 * by osb_lock */
@@ -349,6 +353,11 @@ struct ocfs2_super
349 353
350 u64 la_last_gd; 354 u64 la_last_gd;
351 355
356 struct ocfs2_reservation_map osb_la_resmap;
357
358 unsigned int osb_resv_level;
359 unsigned int osb_dir_resv_level;
360
352 /* Next three fields are for local node slot recovery during 361 /* Next three fields are for local node slot recovery during
353 * mount. */ 362 * mount. */
354 int dirty; 363 int dirty;
@@ -482,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
482 return 0; 491 return 0;
483} 492}
484 493
494static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
495{
496 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
497 return 1;
498 return 0;
499}
500
485static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) 501static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
486{ 502{
487 if (ocfs2_supports_indexed_dirs(osb)) 503 if (ocfs2_supports_indexed_dirs(osb))
@@ -763,8 +779,24 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 779 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
764} 780}
765 781
766#define ocfs2_set_bit ext2_set_bit 782static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
767#define ocfs2_clear_bit ext2_clear_bit 783 unsigned int clusters)
784{
785 return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
786}
787
788static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
789{
790 ext2_set_bit(bit, bitmap);
791}
792#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
793
794static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
795{
796 ext2_clear_bit(bit, bitmap);
797}
798#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
799
768#define ocfs2_test_bit ext2_test_bit 800#define ocfs2_test_bit ext2_test_bit
769#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 801#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
770#define ocfs2_find_next_bit ext2_find_next_bit 802#define ocfs2_find_next_bit ext2_find_next_bit
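Turning ocfs2_set_bit()/ocfs2_clear_bit() from bare macro aliases into inline functions behind a casting macro gives the compiler one fixed unsigned long * prototype, while the existing call sites — which pass assorted bitmap pointer types — keep compiling. A sketch of the same wrapper idiom; raw_set_bit() below is a plain-C stand-in for ext2_set_bit() (the real helper works on little-endian bitmaps), and the output noted in the comment assumes a little-endian host:

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* Plain-C stand-in for ext2_set_bit(): set bit 'nr' in a bitmap of
 * unsigned longs. Only the typing pattern mirrors the kernel. */
static inline void raw_set_bit(unsigned int nr, unsigned long *bitmap)
{
        bitmap[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

/* The inline function pins the pointer type; the macro keeps old
 * call sites compiling and makes the cast explicit in one place. */
#define my_set_bit(bit, addr) raw_set_bit((bit), (unsigned long *)(addr))

int main(void)
{
        unsigned long word = 0;
        unsigned char *bytes = (unsigned char *)&word;

        my_set_bit(10, bytes);              /* callers pass byte bitmaps */
        printf("byte 1 = %#x\n", bytes[1]); /* 0x4 on little-endian */
        return 0;
}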
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bb37218a7978..33f1c9a8258d 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -100,7 +100,8 @@
100 | OCFS2_FEATURE_INCOMPAT_XATTR \ 100 | OCFS2_FEATURE_INCOMPAT_XATTR \
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
104#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
105 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
106 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -165,6 +166,9 @@
165/* Refcount tree support */ 166/* Refcount tree support */
166#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 167#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
167 168
169/* Discontiguous block groups */
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171
168/* 172/*
169 * backup superblock flag is used to indicate that this volume 173 * backup superblock flag is used to indicate that this volume
170 * has backup superblocks. 174 * has backup superblocks.
@@ -283,14 +287,6 @@
283#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 287#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
284 288
285/* 289/*
286 * Default local alloc size (in megabytes)
287 *
288 * The value chosen should be such that most allocations, including new
289 * block groups, use local alloc.
290 */
291#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
292
293/*
294 * Inline extended attribute size (in bytes) 290 * Inline extended attribute size (in bytes)
295 * The value chosen should be aligned to 16 byte boundaries. 291 * The value chosen should be aligned to 16 byte boundaries.
296 */ 292 */
@@ -512,7 +508,10 @@ struct ocfs2_extent_block
512 block group */ 508 block group */
513 __le32 h_fs_generation; /* Must match super block */ 509 __le32 h_fs_generation; /* Must match super block */
514 __le64 h_blkno; /* Offset on disk, in blocks */ 510 __le64 h_blkno; /* Offset on disk, in blocks */
515/*20*/ __le64 h_reserved3; 511/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this
512 eb belongs to. Only valid
513 if allocated from a
514 discontiguous block group */
516 __le64 h_next_leaf_blk; /* Offset on disk, in blocks, 515 __le64 h_next_leaf_blk; /* Offset on disk, in blocks,
517 of next leaf header pointing 516 of next leaf header pointing
518 to data */ 517 to data */
@@ -679,7 +678,11 @@ struct ocfs2_dinode {
679/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 678/*80*/ struct ocfs2_block_check i_check; /* Error checking */
680/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ 679/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
681/*90*/ __le64 i_refcount_loc; 680/*90*/ __le64 i_refcount_loc;
682 __le64 i_reserved2[4]; 681 __le64 i_suballoc_loc; /* Suballocator block group this
682 inode belongs to. Only valid
683 if allocated from a
684 discontiguous block group */
685/*A0*/ __le64 i_reserved2[3];
683/*B8*/ union { 686/*B8*/ union {
684 __le64 i_pad1; /* Generic way to refer to this 687 __le64 i_pad1; /* Generic way to refer to this
685 64bit union */ 688 64bit union */
@@ -814,7 +817,12 @@ struct ocfs2_dx_root_block {
814 __le32 dr_reserved2; 817 __le32 dr_reserved2;
815 __le64 dr_free_blk; /* Pointer to head of free 818 __le64 dr_free_blk; /* Pointer to head of free
816 * unindexed block list. */ 819 * unindexed block list. */
817 __le64 dr_reserved3[15]; 820 __le64 dr_suballoc_loc; /* Suballocator block group
821 this root belongs to.
822 Only valid if allocated
823 from a discontiguous
824 block group */
825 __le64 dr_reserved3[14];
818 union { 826 union {
819 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 827 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
820 * bits for maximum space 828 * bits for maximum space
@@ -840,6 +848,13 @@ struct ocfs2_dx_leaf {
840}; 848};
841 849
842/* 850/*
851 * Largest bitmap for a block (suballocator) group in bytes. This limit
852 * does not affect cluster groups (global allocator). Cluster group
853 * bitmaps run to the end of the block.
854 */
855#define OCFS2_MAX_BG_BITMAP_SIZE 256
856
857/*
843 * On disk allocator group structure for OCFS2 858 * On disk allocator group structure for OCFS2
844 */ 859 */
845struct ocfs2_group_desc 860struct ocfs2_group_desc
@@ -860,7 +875,29 @@ struct ocfs2_group_desc
860 __le64 bg_blkno; /* Offset on disk, in blocks */ 875 __le64 bg_blkno; /* Offset on disk, in blocks */
861/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ 876/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
862 __le64 bg_reserved2; 877 __le64 bg_reserved2;
863/*40*/ __u8 bg_bitmap[0]; 878/*40*/ union {
879 __u8 bg_bitmap[0];
880 struct {
881 /*
882 * Block groups may be discontiguous when
883 * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
884 * The extents of a discontiguous block group are
885 * stored in bg_list. It is a flat list.
886 * l_tree_depth must always be zero. A
887 * discontiguous group is signified by a non-zero
888 * bg_list->l_next_free_rec. Only block groups
889 * can be discontiguous; cluster groups cannot.
890 * We've never made a block group with more than
891 * 2048 blocks (256 bytes of bg_bitmap). This
892 * codifies that limit so that we can fit bg_list.
893 * bg_size of a discontiguous block group will
894 * be 256 to match bg_bitmap_filler.
895 */
896 __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
897/*140*/ struct ocfs2_extent_list bg_list;
898 };
899 };
900/* Actual on-disk size is one block */
864}; 901};
865 902
866struct ocfs2_refcount_rec { 903struct ocfs2_refcount_rec {
@@ -905,7 +942,11 @@ struct ocfs2_refcount_block {
905/*40*/ __le32 rf_generation; /* generation number. all be the same 942/*40*/ __le32 rf_generation; /* generation number. all be the same
906 * for the same refcount tree. */ 943 * for the same refcount tree. */
907 __le32 rf_reserved0; 944 __le32 rf_reserved0;
908 __le64 rf_reserved1[7]; 945 __le64 rf_suballoc_loc; /* Suballocator block group this
946 refcount block belongs to. Only
947 valid if allocated from a
948 discontiguous block group */
949/*50*/ __le64 rf_reserved1[6];
909/*80*/ union { 950/*80*/ union {
910 struct ocfs2_refcount_list rf_records; /* List of refcount 951 struct ocfs2_refcount_list rf_records; /* List of refcount
911 records */ 952 records */
@@ -1017,7 +1058,10 @@ struct ocfs2_xattr_block {
1017 real xattr or a xattr tree. */ 1058 real xattr or a xattr tree. */
1018 __le16 xb_reserved0; 1059 __le16 xb_reserved0;
1019 __le32 xb_reserved1; 1060 __le32 xb_reserved1;
1020 __le64 xb_reserved2; 1061 __le64 xb_suballoc_loc; /* Suballocator block group this
1062 xattr block belongs to. Only
1063 valid if allocated from a
1064 discontiguous block group */
1021/*30*/ union { 1065/*30*/ union {
1022 struct ocfs2_xattr_header xb_header; /* xattr header if this 1066 struct ocfs2_xattr_header xb_header; /* xattr header if this
1023 block contains xattr */ 1067 block contains xattr */
@@ -1254,6 +1298,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1254 return size / sizeof(struct ocfs2_extent_rec); 1298 return size / sizeof(struct ocfs2_extent_rec);
1255} 1299}
1256 1300
1301static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
1302{
1303 int size;
1304
1305 size = sb->s_blocksize -
1306 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1307
1308 return size / sizeof(struct ocfs2_extent_rec);
1309}
1310
1257static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) 1311static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1258{ 1312{
1259 int size; 1313 int size;
@@ -1284,13 +1338,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1284 return size; 1338 return size;
1285} 1339}
1286 1340
1287static inline int ocfs2_group_bitmap_size(struct super_block *sb) 1341static inline int ocfs2_group_bitmap_size(struct super_block *sb,
1342 int suballocator,
1343 u32 feature_incompat)
1288{ 1344{
1289 int size; 1345 int size = sb->s_blocksize -
1290
1291 size = sb->s_blocksize -
1292 offsetof(struct ocfs2_group_desc, bg_bitmap); 1346 offsetof(struct ocfs2_group_desc, bg_bitmap);
1293 1347
1348 /*
1349 * The cluster allocator uses the entire block. Suballocators have
1350 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1351 * code expects bg_size set to the maximum. Thus we must keep
1352 * bg_size as-is unless discontig_bg is enabled.
1353 */
1354 if (suballocator &&
1355 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1356 size = OCFS2_MAX_BG_BITMAP_SIZE;
1357
1294 return size; 1358 return size;
1295} 1359}
1296 1360
@@ -1402,23 +1466,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
1402 return size / sizeof(struct ocfs2_extent_rec); 1466 return size / sizeof(struct ocfs2_extent_rec);
1403} 1467}
1404 1468
1405static inline int ocfs2_local_alloc_size(int blocksize) 1469static inline int ocfs2_extent_recs_per_gd(int blocksize)
1406{ 1470{
1407 int size; 1471 int size;
1408 1472
1409 size = blocksize - 1473 size = blocksize -
1410 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); 1474 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1411 1475
1412 return size; 1476 return size / sizeof(struct ocfs2_extent_rec);
1413} 1477}
1414 1478
1415static inline int ocfs2_group_bitmap_size(int blocksize) 1479static inline int ocfs2_local_alloc_size(int blocksize)
1416{ 1480{
1417 int size; 1481 int size;
1418 1482
1419 size = blocksize - 1483 size = blocksize -
1484 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
1485
1486 return size;
1487}
1488
1489static inline int ocfs2_group_bitmap_size(int blocksize,
1490 int suballocator,
1491 uint32_t feature_incompat)
1492{
1493 int size = blocksize -
1420 offsetof(struct ocfs2_group_desc, bg_bitmap); 1494 offsetof(struct ocfs2_group_desc, bg_bitmap);
1421 1495
1496 /*
1497 * The cluster allocator uses the entire block. Suballocators have
1498 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1499 * code expects bg_size set to the maximum. Thus we must keep
1500 * bg_size as-is unless discontig_bg is enabled.
1501 */
1502 if (suballocator &&
1503 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1504 size = OCFS2_MAX_BG_BITMAP_SIZE;
1505
1422 return size; 1506 return size;
1423} 1507}
1424 1508
@@ -1491,5 +1575,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
1491 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 1575 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1492} 1576}
1493 1577
1578static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
1579{
1580 if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
1581 le16_to_cpu(gd->bg_size)) !=
1582 offsetof(struct ocfs2_group_desc, bg_list))
1583 return 0;
1584 /*
1585 * Only valid to check l_next_free_rec if
1586 * bg_bitmap + bg_size == bg_list.
1587 */
1588 if (!gd->bg_list.l_next_free_rec)
1589 return 0;
1590 return 1;
1591}
1494#endif /* _OCFS2_FS_H */ 1592#endif /* _OCFS2_FS_H */
1495 1593
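Two numbers anchor the discontiguous block group format above: OCFS2_MAX_BG_BITMAP_SIZE is 256 bytes, i.e. 2048 bits, matching the largest block group ever made (2048 blocks), and a discontiguous group always uses exactly that bitmap size so bg_list starts immediately behind it. ocfs2_gd_is_discontig() exploits this: the layout check (bg_bitmap offset + bg_size == bg_list offset) must hold before l_next_free_rec may be trusted. A sketch of that offsetof-based detection against a deliberately simplified descriptor (not the on-disk layout):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_BG_BITMAP_SIZE 256          /* 256 bytes = 2048 bits/blocks */

/* Deliberately simplified descriptor: just enough layout for the
 * check, not the real on-disk struct ocfs2_group_desc. */
struct group_desc {
        uint16_t bg_size;               /* bytes of bg_bitmap in use */
        uint8_t  bg_bitmap[MAX_BG_BITMAP_SIZE];
        struct {
                uint16_t l_next_free_rec;
        } bg_list;
};

static int gd_is_discontig(const struct group_desc *gd)
{
        /* A discontiguous group uses the full bitmap, so bg_bitmap +
         * bg_size must land exactly on bg_list... */
        if (offsetof(struct group_desc, bg_bitmap) + gd->bg_size !=
            offsetof(struct group_desc, bg_list))
                return 0;
        /* ...and only then may l_next_free_rec be trusted. */
        return gd->bg_list.l_next_free_rec != 0;
}

int main(void)
{
        struct group_desc gd = {
                .bg_size = MAX_BG_BITMAP_SIZE,
                .bg_list = { .l_next_free_rec = 1 },
        };

        printf("discontig: %d\n", gd_is_discontig(&gd));
        return 0;
}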
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 355f41d1d520..04ae76d8c6ab 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h> 9#include <linux/dqblk_qtree.h>
@@ -260,10 +261,8 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
260 brelse(bh); 261 brelse(bh);
261 goto out; 262 goto out;
262 } 263 }
263 err = ocfs2_journal_dirty(handle, bh); 264 ocfs2_journal_dirty(handle, bh);
264 brelse(bh); 265 brelse(bh);
265 if (err < 0)
266 goto out;
267out: 266out:
268 if (err) { 267 if (err) {
269 mutex_unlock(&gqinode->i_mutex); 268 mutex_unlock(&gqinode->i_mutex);
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index a6467f3d262e..884b641f199e 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,6 +3,7 @@
3 */ 3 */
4 4
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/module.h> 9#include <linux/module.h>
@@ -118,12 +119,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
118 lock_buffer(bh); 119 lock_buffer(bh);
119 modify(bh, private); 120 modify(bh, private);
120 unlock_buffer(bh); 121 unlock_buffer(bh);
121 status = ocfs2_journal_dirty(handle, bh); 122 ocfs2_journal_dirty(handle, bh);
122 if (status < 0) { 123
123 mlog_errno(status);
124 ocfs2_commit_trans(OCFS2_SB(sb), handle);
125 return status;
126 }
127 status = ocfs2_commit_trans(OCFS2_SB(sb), handle); 124 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
128 if (status < 0) { 125 if (status < 0) {
129 mlog_errno(status); 126 mlog_errno(status);
@@ -522,9 +519,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
522 ocfs2_clear_bit(bit, dchunk->dqc_bitmap); 519 ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
523 le32_add_cpu(&dchunk->dqc_free, 1); 520 le32_add_cpu(&dchunk->dqc_free, 1);
524 unlock_buffer(qbh); 521 unlock_buffer(qbh);
525 status = ocfs2_journal_dirty(handle, qbh); 522 ocfs2_journal_dirty(handle, qbh);
526 if (status < 0)
527 mlog_errno(status);
528out_commit: 523out_commit:
529 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 524 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
530 ocfs2_commit_trans(OCFS2_SB(sb), handle); 525 ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -630,9 +625,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
630 lock_buffer(bh); 625 lock_buffer(bh);
631 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); 626 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
632 unlock_buffer(bh); 627 unlock_buffer(bh);
633 status = ocfs2_journal_dirty(handle, bh); 628 ocfs2_journal_dirty(handle, bh);
634 if (status < 0)
635 mlog_errno(status);
636out_trans: 629out_trans:
637 ocfs2_commit_trans(osb, handle); 630 ocfs2_commit_trans(osb, handle);
638out_bh: 631out_bh:
@@ -1008,11 +1001,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1008 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - 1001 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
1009 OCFS2_QBLK_RESERVED_SPACE); 1002 OCFS2_QBLK_RESERVED_SPACE);
1010 unlock_buffer(bh); 1003 unlock_buffer(bh);
1011 status = ocfs2_journal_dirty(handle, bh); 1004 ocfs2_journal_dirty(handle, bh);
1012 if (status < 0) {
1013 mlog_errno(status);
1014 goto out_trans;
1015 }
1016 1005
1017 /* Initialize new block with structures */ 1006 /* Initialize new block with structures */
1018 down_read(&OCFS2_I(lqinode)->ip_alloc_sem); 1007 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
@@ -1039,11 +1028,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1039 lock_buffer(dbh); 1028 lock_buffer(dbh);
1040 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); 1029 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
1041 unlock_buffer(dbh); 1030 unlock_buffer(dbh);
1042 status = ocfs2_journal_dirty(handle, dbh); 1031 ocfs2_journal_dirty(handle, dbh);
1043 if (status < 0) {
1044 mlog_errno(status);
1045 goto out_trans;
1046 }
1047 1032
1048 /* Update local quotafile info */ 1033 /* Update local quotafile info */
1049 oinfo->dqi_blocks += 2; 1034 oinfo->dqi_blocks += 2;
@@ -1154,11 +1139,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1154 lock_buffer(bh); 1139 lock_buffer(bh);
1155 memset(bh->b_data, 0, sb->s_blocksize); 1140 memset(bh->b_data, 0, sb->s_blocksize);
1156 unlock_buffer(bh); 1141 unlock_buffer(bh);
1157 status = ocfs2_journal_dirty(handle, bh); 1142 ocfs2_journal_dirty(handle, bh);
1158 if (status < 0) { 1143
1159 mlog_errno(status);
1160 goto out_trans;
1161 }
1162 /* Update chunk header */ 1144 /* Update chunk header */
1163 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), 1145 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
1164 chunk->qc_headerbh, 1146 chunk->qc_headerbh,
@@ -1172,11 +1154,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1172 lock_buffer(chunk->qc_headerbh); 1154 lock_buffer(chunk->qc_headerbh);
1173 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); 1155 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
1174 unlock_buffer(chunk->qc_headerbh); 1156 unlock_buffer(chunk->qc_headerbh);
1175 status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); 1157 ocfs2_journal_dirty(handle, chunk->qc_headerbh);
1176 if (status < 0) { 1158
1177 mlog_errno(status);
1178 goto out_trans;
1179 }
1180 /* Update file header */ 1159 /* Update file header */
1181 oinfo->dqi_blocks++; 1160 oinfo->dqi_blocks++;
1182 status = ocfs2_local_write_info(sb, type); 1161 status = ocfs2_local_write_info(sb, type);
@@ -1311,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1311 ocfs2_clear_bit(offset, dchunk->dqc_bitmap); 1290 ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
1312 le32_add_cpu(&dchunk->dqc_free, 1); 1291 le32_add_cpu(&dchunk->dqc_free, 1);
1313 unlock_buffer(od->dq_chunk->qc_headerbh); 1292 unlock_buffer(od->dq_chunk->qc_headerbh);
1314 status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); 1293 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
1315 if (status < 0) { 1294
1316 mlog_errno(status);
1317 goto out;
1318 }
1319 status = 0;
1320out: 1295out:
1321 /* Clear the read bit so that next time someone uses this 1296 /* Clear the read bit so that next time someone uses this
1322 * dquot he reads fresh info from disk and allocates local 1297 * dquot he reads fresh info from disk and allocates local
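Every quota hunk above performs the same mechanical conversion: ocfs2_journal_dirty() evidently no longer returns an error, so the status checks and goto scaffolding around each call disappear. That only works if the helper treats a dirty failure as fatal internally — on a handle with properly reserved credits, failing to dirty a journaled buffer indicates a programming error rather than a recoverable condition. A sketch of the shape of such a conversion; all names are hypothetical, not the ocfs2 implementation:

#include <assert.h>

struct handle { int credits; };
struct buffer { int dirty; };

/* Before: returned int and every caller carried status/goto plumbing.
 * After: a failure can only mean the caller broke the journaling
 * rules (e.g. never reserved access to the buffer), so it is treated
 * as a bug, not a condition to propagate. Hypothetical names. */
static void journal_dirty(struct handle *h, struct buffer *bh)
{
        assert(h->credits > 0);         /* kernel code would BUG_ON() */
        h->credits--;
        bh->dirty = 1;
}

int main(void)
{
        struct handle h = { .credits = 1 };
        struct buffer bh = { .dirty = 0 };

        journal_dirty(&h, &bh);         /* no status check, no goto */
        return bh.dirty ? 0 : 1;
}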
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9e96921dffda..4793f36f6518 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/gfp.h>
41#include <linux/slab.h> 40#include <linux/slab.h>
42#include <linux/writeback.h> 41#include <linux/writeback.h>
43#include <linux/pagevec.h> 42#include <linux/pagevec.h>
@@ -571,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
571 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; 570 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
572 u16 suballoc_bit_start; 571 u16 suballoc_bit_start;
573 u32 num_got; 572 u32 num_got;
574 u64 first_blkno; 573 u64 suballoc_loc, first_blkno;
575 574
576 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
577 576
@@ -597,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
597 goto out_commit; 596 goto out_commit;
598 } 597 }
599 598
600 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 599 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
601 &suballoc_bit_start, &num_got, 600 &suballoc_bit_start, &num_got,
602 &first_blkno); 601 &first_blkno);
603 if (ret) { 602 if (ret) {
@@ -627,6 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
627 memset(rb, 0, inode->i_sb->s_blocksize); 626 memset(rb, 0, inode->i_sb->s_blocksize);
628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
629 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
629 rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
632 rb->rf_blkno = cpu_to_le64(first_blkno); 632 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -791,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
791 if (le32_to_cpu(rb->rf_count) == 1) { 791 if (le32_to_cpu(rb->rf_count) == 1) {
792 blk = le64_to_cpu(rb->rf_blkno); 792 blk = le64_to_cpu(rb->rf_blkno);
793 bit = le16_to_cpu(rb->rf_suballoc_bit); 793 bit = le16_to_cpu(rb->rf_suballoc_bit);
794 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 794 if (rb->rf_suballoc_loc)
795 bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
796 else
797 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
795 798
796 alloc_inode = ocfs2_get_system_file_inode(osb, 799 alloc_inode = ocfs2_get_system_file_inode(osb,
797 EXTENT_ALLOC_SYSTEM_INODE, 800 EXTENT_ALLOC_SYSTEM_INODE,
@@ -1269,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
1269 } else if (merge) 1272 } else if (merge)
1270 ocfs2_refcount_rec_merge(rb, index); 1273 ocfs2_refcount_rec_merge(rb, index);
1271 1274
1272 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1275 ocfs2_journal_dirty(handle, ref_leaf_bh);
1273 if (ret)
1274 mlog_errno(ret);
1275out: 1276out:
1276 return ret; 1277 return ret;
1277} 1278}
@@ -1285,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1285 int ret; 1286 int ret;
1286 u16 suballoc_bit_start; 1287 u16 suballoc_bit_start;
1287 u32 num_got; 1288 u32 num_got;
1288 u64 blkno; 1289 u64 suballoc_loc, blkno;
1289 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1290 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1290 struct buffer_head *new_bh = NULL; 1291 struct buffer_head *new_bh = NULL;
1291 struct ocfs2_refcount_block *new_rb; 1292 struct ocfs2_refcount_block *new_rb;
@@ -1299,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1299 goto out; 1300 goto out;
1300 } 1301 }
1301 1302
1302 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1303 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1303 &suballoc_bit_start, &num_got, 1304 &suballoc_bit_start, &num_got,
1304 &blkno); 1305 &blkno);
1305 if (ret) { 1306 if (ret) {
@@ -1331,6 +1332,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1331 1332
1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1333 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1333 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1334 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1335 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1336 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1335 new_rb->rf_blkno = cpu_to_le64(blkno); 1337 new_rb->rf_blkno = cpu_to_le64(blkno);
1336 new_rb->rf_cpos = cpu_to_le32(0); 1338 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1525,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1525 int ret; 1527 int ret;
1526 u16 suballoc_bit_start; 1528 u16 suballoc_bit_start;
1527 u32 num_got, new_cpos; 1529 u32 num_got, new_cpos;
1528 u64 blkno; 1530 u64 suballoc_loc, blkno;
1529 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1531 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1530 struct ocfs2_refcount_block *root_rb = 1532 struct ocfs2_refcount_block *root_rb =
1531 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1533 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
@@ -1549,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1549 goto out; 1551 goto out;
1550 } 1552 }
1551 1553
1552 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1554 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1553 &suballoc_bit_start, &num_got, 1555 &suballoc_bit_start, &num_got,
1554 &blkno); 1556 &blkno);
1555 if (ret) { 1557 if (ret) {
@@ -1577,6 +1579,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1577 memset(new_rb, 0, sb->s_blocksize); 1579 memset(new_rb, 0, sb->s_blocksize);
1578 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1580 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1579 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1581 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1582 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1580 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1583 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1581 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1584 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1582 new_rb->rf_blkno = cpu_to_le64(blkno); 1585 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1695,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle,
1695 * 2 more credits, one for the leaf refcount block, one for 1698 * 2 more credits, one for the leaf refcount block, one for
1696 * the extent block contains the extent rec. 1699 * the extent block contains the extent rec.
1697 */ 1700 */
1698 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); 1701 ret = ocfs2_extend_trans(handle, 2);
1699 if (ret < 0) { 1702 if (ret < 0) {
1700 mlog_errno(ret); 1703 mlog_errno(ret);
1701 goto out; 1704 goto out;
@@ -1803,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
1803 if (merge) 1806 if (merge)
1804 ocfs2_refcount_rec_merge(rb, index); 1807 ocfs2_refcount_rec_merge(rb, index);
1805 1808
1806 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1809 ocfs2_journal_dirty(handle, ref_leaf_bh);
1807 if (ret) {
1808 mlog_errno(ret);
1809 goto out;
1810 }
1811 1810
1812 if (index == 0) { 1811 if (index == 0) {
1813 ret = ocfs2_adjust_refcount_rec(handle, ci, 1812 ret = ocfs2_adjust_refcount_rec(handle, ci,
@@ -1978,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1978 ocfs2_refcount_rec_merge(rb, index); 1977 ocfs2_refcount_rec_merge(rb, index);
1979 } 1978 }
1980 1979
1981 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1980 ocfs2_journal_dirty(handle, ref_leaf_bh);
1982 if (ret)
1983 mlog_errno(ret);
1984 1981
1985out: 1982out:
1986 brelse(new_bh); 1983 brelse(new_bh);
@@ -2113,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
2113 */ 2110 */
2114 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, 2111 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2115 le16_to_cpu(rb->rf_suballoc_slot), 2112 le16_to_cpu(rb->rf_suballoc_slot),
2113 le64_to_cpu(rb->rf_suballoc_loc),
2116 le64_to_cpu(rb->rf_blkno), 2114 le64_to_cpu(rb->rf_blkno),
2117 le16_to_cpu(rb->rf_suballoc_bit)); 2115 le16_to_cpu(rb->rf_suballoc_bit));
2118 if (ret) { 2116 if (ret) {
@@ -2517,20 +2515,19 @@ out:
2517 * 2515 *
2518 * Normally the refcount blocks storing these refcounts are 2516 * Normally the refcount blocks storing these refcounts are
2519 * also contiguous, so we can get their number easily. 2517 * also contiguous, so we can get their number easily.
2520 * As for meta_ac, we will at most add split 2 refcount record and 2518 * We will at most split 2 refcount records and add 2 more
2521 * 2 more refcount block, so just check it in a rough way. 2519 * refcount blocks, so just check it in a rough way.
2522 * 2520 *
2523 * Caller must hold refcount tree lock. 2521 * Caller must hold refcount tree lock.
2524 */ 2522 */
2525int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 2523int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2526 struct buffer_head *di_bh, 2524 u64 refcount_loc,
2527 u64 phys_blkno, 2525 u64 phys_blkno,
2528 u32 clusters, 2526 u32 clusters,
2529 int *credits, 2527 int *credits,
2530 struct ocfs2_alloc_context **meta_ac) 2528 int *ref_blocks)
2531{ 2529{
2532 int ret, ref_blocks = 0; 2530 int ret;
2533 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2534 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2531 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2535 struct buffer_head *ref_root_bh = NULL; 2532 struct buffer_head *ref_root_bh = NULL;
2536 struct ocfs2_refcount_tree *tree; 2533 struct ocfs2_refcount_tree *tree;
@@ -2547,14 +2544,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2547 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2544 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2548 2545
2549 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), 2546 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2550 le64_to_cpu(di->i_refcount_loc), &tree); 2547 refcount_loc, &tree);
2551 if (ret) { 2548 if (ret) {
2552 mlog_errno(ret); 2549 mlog_errno(ret);
2553 goto out; 2550 goto out;
2554 } 2551 }
2555 2552
2556 ret = ocfs2_read_refcount_block(&tree->rf_ci, 2553 ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
2557 le64_to_cpu(di->i_refcount_loc),
2558 &ref_root_bh); 2554 &ref_root_bh);
2559 if (ret) { 2555 if (ret) {
2560 mlog_errno(ret); 2556 mlog_errno(ret);
@@ -2565,21 +2561,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2565 &tree->rf_ci, 2561 &tree->rf_ci,
2566 ref_root_bh, 2562 ref_root_bh,
2567 start_cpos, clusters, 2563 start_cpos, clusters,
2568 &ref_blocks, credits); 2564 ref_blocks, credits);
2569 if (ret) { 2565 if (ret) {
2570 mlog_errno(ret); 2566 mlog_errno(ret);
2571 goto out; 2567 goto out;
2572 } 2568 }
2573 2569
2574 mlog(0, "reserve new metadata %d, credits = %d\n", 2570 mlog(0, "reserve new metadata %d blocks, credits = %d\n",
2575 ref_blocks, *credits); 2571 *ref_blocks, *credits);
2576
2577 if (ref_blocks) {
2578 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2579 ref_blocks, meta_ac);
2580 if (ret)
2581 mlog_errno(ret);
2582 }
2583 2572
2584out: 2573out:
2585 brelse(ref_root_bh); 2574 brelse(ref_root_bh);
@@ -3041,11 +3030,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3041 } 3030 }
3042 3031
3043 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); 3032 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3044 ret = ocfs2_journal_dirty(handle, new_bh); 3033 ocfs2_journal_dirty(handle, new_bh);
3045 if (ret) {
3046 mlog_errno(ret);
3047 break;
3048 }
3049 3034
3050 brelse(new_bh); 3035 brelse(new_bh);
3051 brelse(old_bh); 3036 brelse(old_bh);
@@ -3283,7 +3268,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3283 } else { 3268 } else {
3284 delete = 1; 3269 delete = 1;
3285 3270
3286 ret = __ocfs2_claim_clusters(osb, handle, 3271 ret = __ocfs2_claim_clusters(handle,
3287 context->data_ac, 3272 context->data_ac,
3288 1, set_len, 3273 1, set_len,
3289 &new_bit, &new_len); 3274 &new_bit, &new_len);
@@ -4075,6 +4060,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
4075 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; 4060 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4076 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 4061 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4077 i_size_write(t_inode, size); 4062 i_size_write(t_inode, size);
4063 t_inode->i_blocks = s_inode->i_blocks;
4078 4064
4079 di->i_xattr_inline_size = s_di->i_xattr_inline_size; 4065 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4080 di->i_clusters = s_di->i_clusters; 4066 di->i_clusters = s_di->i_clusters;
@@ -4083,6 +4069,9 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
4083 di->i_attr = s_di->i_attr; 4069 di->i_attr = s_di->i_attr;
4084 4070
4085 if (preserve) { 4071 if (preserve) {
4072 t_inode->i_uid = s_inode->i_uid;
4073 t_inode->i_gid = s_inode->i_gid;
4074 t_inode->i_mode = s_inode->i_mode;
4086 di->i_uid = s_di->i_uid; 4075 di->i_uid = s_di->i_uid;
4087 di->i_gid = s_di->i_gid; 4076 di->i_gid = s_di->i_gid;
4088 di->i_mode = s_di->i_mode; 4077 di->i_mode = s_di->i_mode;
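The rf_suballoc_loc read path above shows how the new on-disk fields stay backward compatible: metadata allocated before the discontig-bg feature has the field zeroed, so a zero value selects the old derivation of the group block from the item's own block number. A sketch of the zero-means-legacy dispatch; which_suballoc_group() mimics the contiguous-group arithmetic (descriptor at blkno - bit), but both names are stand-ins:

#include <stdint.h>
#include <stdio.h>

/* Contiguous groups keep the descriptor at blkno - bit; stand-in
 * for ocfs2_which_suballoc_group(). */
static uint64_t which_suballoc_group(uint64_t blkno, uint16_t bit)
{
        return blkno - bit;
}

/* Zero-means-legacy: the on-disk field reads as 0 for metadata
 * written before the feature existed, so 0 selects the old math. */
static uint64_t group_of(uint64_t suballoc_loc, uint64_t blkno,
                         uint16_t bit)
{
        if (suballoc_loc)       /* discontiguous group, recorded explicitly */
                return suballoc_loc;
        return which_suballoc_group(blkno, bit);
}

int main(void)
{
        printf("%llu\n", (unsigned long long)group_of(0, 5000, 8));    /* 4992 */
        printf("%llu\n", (unsigned long long)group_of(4096, 5000, 8)); /* 4096 */
        return 0;
}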
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c1d19b1d3ecc..9983ba1570e2 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode,
47 struct ocfs2_cached_dealloc_ctxt *dealloc, 47 struct ocfs2_cached_dealloc_ctxt *dealloc,
48 int delete); 48 int delete);
49int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 49int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
50 struct buffer_head *di_bh, 50 u64 refcount_loc,
51 u64 phys_blkno, 51 u64 phys_blkno,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 struct ocfs2_alloc_context **meta_ac); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 56 u32 cpos, u32 write_len, u32 max_cpos);
57 57
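The new ocfs2_prepare_refcount_change_for_del() signature moves work to the caller: instead of taking the inode's di_bh and reserving a metadata allocation context itself, it takes the refcount tree location directly and only reports ref_blocks and credits, so a caller can aggregate the requirements of several operations and reserve once. A compilable sketch of that compute-then-reserve split, with hypothetical names and a made-up worst case:

#include <stdio.h>

/* The "prepare" step only computes requirements... */
static int prepare_change(int clusters, int *credits, int *ref_blocks)
{
        /* Made-up worst case: at most 2 new refcount blocks. */
        *ref_blocks = clusters ? 2 : 0;
        *credits = 2 + *ref_blocks;
        return 0;
}

/* ...and the caller decides when and how much to reserve. */
static int reserve_metadata(int blocks)
{
        printf("reserving %d metadata blocks\n", blocks);
        return 0;
}

int main(void)
{
        int credits = 0, ref_blocks = 0;

        if (prepare_change(16, &credits, &ref_blocks))
                return 1;
        if (ref_blocks && reserve_metadata(ref_blocks))
                return 1;
        printf("credits = %d\n", credits);
        return 0;
}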
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 000000000000..40650021fc24
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,847 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.c
5 *
6 * Allocation reservations implementation
7 *
8 * Some code borrowed from fs/ext3/balloc.c and is:
9 *
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 *
15 * The rest is copyright (C) 2010 Novell. All rights reserved.
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public
19 * License version 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 * General Public License for more details.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/bitops.h>
32#include <linux/list.h>
33
34#define MLOG_MASK_PREFIX ML_RESERVATIONS
35#include <cluster/masklog.h>
36
37#include "ocfs2.h"
38
39#ifdef CONFIG_OCFS2_DEBUG_FS
40#define OCFS2_CHECK_RESERVATIONS
41#endif
42
43DEFINE_SPINLOCK(resv_lock);
44
45#define OCFS2_MIN_RESV_WINDOW_BITS 8
46#define OCFS2_MAX_RESV_WINDOW_BITS 1024
47
48int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
49{
50 return (osb->osb_resv_level && osb->osb_dir_resv_level);
51}
52
53static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
54 struct ocfs2_alloc_reservation *resv)
55{
56 struct ocfs2_super *osb = resmap->m_osb;
57 unsigned int bits;
58
59 if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
60 /* 8, 16, 32, 64, 128, 256, 512, 1024 */
61 bits = 4 << osb->osb_resv_level;
62 } else {
63 bits = 4 << osb->osb_dir_resv_level;
64 }
65 return bits;
66}
67
68static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
69{
70 if (resv->r_len)
71 return resv->r_start + resv->r_len - 1;
72 return resv->r_start;
73}
74
75static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
76{
77 return !!(resv->r_len == 0);
78}
79
80static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
81{
82 if (resmap->m_osb->osb_resv_level == 0)
83 return 1;
84 return 0;
85}
86
87static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
88{
89 struct ocfs2_super *osb = resmap->m_osb;
90 struct rb_node *node;
91 struct ocfs2_alloc_reservation *resv;
92 int i = 0;
93
94 mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
95 osb->dev_str, resmap->m_bitmap_len);
96
97 node = rb_first(&resmap->m_reservations);
98 while (node) {
99 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
100
101 mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
102 "\tlast_len: %u\n", resv->r_start,
103 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
104 resv->r_last_len);
105
106 node = rb_next(node);
107 i++;
108 }
109
110 mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
111
112 i = 0;
113 list_for_each_entry(resv, &resmap->m_lru, r_lru) {
114 mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
115 "last_start: %u\tlast_len: %u\n", i, resv->r_start,
116 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
117 resv->r_last_len);
118
119 i++;
120 }
121}
122
123#ifdef OCFS2_CHECK_RESERVATIONS
124static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
125 int i,
126 struct ocfs2_alloc_reservation *resv)
127{
128 char *disk_bitmap = resmap->m_disk_bitmap;
129 unsigned int start = resv->r_start;
130 unsigned int end = ocfs2_resv_end(resv);
131
132 while (start <= end) {
133 if (ocfs2_test_bit(start, disk_bitmap)) {
134 mlog(ML_ERROR,
135 "reservation %d covers an allocated area "
136 "starting at bit %u!\n", i, start);
137 return 1;
138 }
139
140 start++;
141 }
142 return 0;
143}
144
145static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
146{
147 unsigned int off = 0;
148 int i = 0;
149 struct rb_node *node;
150 struct ocfs2_alloc_reservation *resv;
151
152 node = rb_first(&resmap->m_reservations);
153 while (node) {
154 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
155
156 if (i > 0 && resv->r_start <= off) {
157 mlog(ML_ERROR, "reservation %d has bad start off!\n",
158 i);
159 goto bad;
160 }
161
162 if (resv->r_len == 0) {
163 mlog(ML_ERROR, "reservation %d has no length!\n",
164 i);
165 goto bad;
166 }
167
168 if (resv->r_start > ocfs2_resv_end(resv)) {
169 mlog(ML_ERROR, "reservation %d has invalid range!\n",
170 i);
171 goto bad;
172 }
173
174 if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
175 mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
176 i);
177 goto bad;
178 }
179
180 if (ocfs2_validate_resmap_bits(resmap, i, resv))
181 goto bad;
182
183 off = ocfs2_resv_end(resv);
184 node = rb_next(node);
185
186 i++;
187 }
188 return;
189
190bad:
191 ocfs2_dump_resv(resmap);
192 BUG();
193}
194#else
195static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
196{
197
198}
199#endif
200
201void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
202{
203 memset(resv, 0, sizeof(*resv));
204 INIT_LIST_HEAD(&resv->r_lru);
205}
206
207void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
208 unsigned int flags)
209{
210 BUG_ON(flags & ~OCFS2_RESV_TYPES);
211
212 resv->r_flags |= flags;
213}
214
215int ocfs2_resmap_init(struct ocfs2_super *osb,
216 struct ocfs2_reservation_map *resmap)
217{
218 memset(resmap, 0, sizeof(*resmap));
219
220 resmap->m_osb = osb;
221 resmap->m_reservations = RB_ROOT;
222 /* m_bitmap_len is initialized to zero by the above memset. */
223 INIT_LIST_HEAD(&resmap->m_lru);
224
225 return 0;
226}
227
228static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
229 struct ocfs2_alloc_reservation *resv)
230{
231 assert_spin_locked(&resv_lock);
232
233 if (!list_empty(&resv->r_lru))
234 list_del_init(&resv->r_lru);
235
236 list_add_tail(&resv->r_lru, &resmap->m_lru);
237}
238
239static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
240{
241 resv->r_len = 0;
242 resv->r_start = 0;
243}
244
245static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
246 struct ocfs2_alloc_reservation *resv)
247{
248 if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
249 list_del_init(&resv->r_lru);
250 rb_erase(&resv->r_node, &resmap->m_reservations);
251 resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
252 }
253}
254
255static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
256 struct ocfs2_alloc_reservation *resv)
257{
258 assert_spin_locked(&resv_lock);
259
260 __ocfs2_resv_trunc(resv);
261 /*
262 * last_len and last_start no longer make sense if
263 * we're changing the range of our allocations.
264 */
265 resv->r_last_len = resv->r_last_start = 0;
266
267 ocfs2_resv_remove(resmap, resv);
268}
269
270/* does nothing if 'resv' is null */
271void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
272 struct ocfs2_alloc_reservation *resv)
273{
274 if (resv) {
275 spin_lock(&resv_lock);
276 __ocfs2_resv_discard(resmap, resv);
277 spin_unlock(&resv_lock);
278 }
279}
280
281static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
282{
283 struct rb_node *node;
284 struct ocfs2_alloc_reservation *resv;
285
286 assert_spin_locked(&resv_lock);
287
288 while ((node = rb_last(&resmap->m_reservations)) != NULL) {
289 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
290
291 __ocfs2_resv_discard(resmap, resv);
292 }
293}
294
295void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
296 unsigned int clen, char *disk_bitmap)
297{
298 if (ocfs2_resmap_disabled(resmap))
299 return;
300
301 spin_lock(&resv_lock);
302
303 ocfs2_resmap_clear_all_resv(resmap);
304 resmap->m_bitmap_len = clen;
305 resmap->m_disk_bitmap = disk_bitmap;
306
307 spin_unlock(&resv_lock);
308}
309
310void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
311{
312 /* Does nothing for now. Keep this around for API symmetry */
313}
314
315static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
316 struct ocfs2_alloc_reservation *new)
317{
318 struct rb_root *root = &resmap->m_reservations;
319 struct rb_node *parent = NULL;
320 struct rb_node **p = &root->rb_node;
321 struct ocfs2_alloc_reservation *tmp;
322
323 assert_spin_locked(&resv_lock);
324
325 mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
326 new->r_len);
327
328 while (*p) {
329 parent = *p;
330
331 tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
332
333 if (new->r_start < tmp->r_start) {
334 p = &(*p)->rb_left;
335
336 /*
337 * This is a good place to check for
338 * overlapping reservations.
339 */
340 BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
341 } else if (new->r_start > ocfs2_resv_end(tmp)) {
342 p = &(*p)->rb_right;
343 } else {
344 /* This should never happen! */
345 mlog(ML_ERROR, "Duplicate reservation window!\n");
346 BUG();
347 }
348 }
349
350 rb_link_node(&new->r_node, parent, p);
351 rb_insert_color(&new->r_node, root);
352 new->r_flags |= OCFS2_RESV_FLAG_INUSE;
353
354 ocfs2_resv_mark_lru(resmap, new);
355
356 ocfs2_check_resmap(resmap);
357}
358
359/**
360 * ocfs2_find_resv_lhs() - find the window which contains goal
361 * @resmap: reservation map to search
362 * @goal: which bit to search for
363 *
364 * If a window containing that goal is not found, we return the window
365 * which comes before goal. Returns NULL on empty rbtree or no window
366 * before goal.
367 */
368static struct ocfs2_alloc_reservation *
369ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
370{
371 struct ocfs2_alloc_reservation *resv = NULL;
372 struct ocfs2_alloc_reservation *prev_resv = NULL;
373 struct rb_node *node = resmap->m_reservations.rb_node;
374
375 assert_spin_locked(&resv_lock);
376
377 if (!node)
378 return NULL;
379
380 node = rb_first(&resmap->m_reservations);
381 while (node) {
382 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
383
384 if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
385 break;
386
 387		/* Did we overshoot the reservation just before goal? */
388 if (resv->r_start > goal) {
389 resv = prev_resv;
390 break;
391 }
392
393 prev_resv = resv;
394 node = rb_next(node);
395 }
396
397 return resv;
398}
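/*
 * For illustration (hypothetical windows, not from this patch): with
 * reservations [10, 19] and [40, 49] in the tree, a goal of 15 returns
 * [10, 19] (goal falls inside it), a goal of 30 also returns [10, 19]
 * (the nearest window before goal), and a goal of 5 returns NULL
 * because no window starts at or before it.
 */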
399
400/*
401 * We are given a range within the bitmap, which corresponds to a gap
402 * inside the reservations tree (search_start, search_len). The range
403 * can be anything from the whole bitmap, to a gap between
404 * reservations.
405 *
406 * The start value of *rstart is insignificant.
407 *
408 * This function searches the bitmap range starting at search_start
409 * with length search_len for a set of contiguous free bits. We try
410 * to find up to 'wanted' bits, but can sometimes return less.
411 *
412 * Returns the length of allocation, 0 if no free bits are found.
413 *
414 * *cstart and *clen will also be populated with the result.
415 */
416static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
417 unsigned int wanted,
418 unsigned int search_start,
419 unsigned int search_len,
420 unsigned int *rstart,
421 unsigned int *rlen)
422{
423 void *bitmap = resmap->m_disk_bitmap;
424 unsigned int best_start, best_len = 0;
425 int offset, start, found;
426
427 mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
428 wanted, search_start, search_len, resmap->m_bitmap_len);
429
430 found = best_start = best_len = 0;
431
432 start = search_start;
433 while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
434 start)) != -1) {
435 /* Search reached end of the region */
436 if (offset >= (search_start + search_len))
437 break;
438
439 if (offset == start) {
440 /* we found a zero */
441 found++;
442 /* move start to the next bit to test */
443 start++;
444 } else {
445 /* got a zero after some ones */
446 found = 1;
447 start = offset + 1;
448 }
449 if (found > best_len) {
450 best_len = found;
451 best_start = start - found;
452 }
453
454 if (found >= wanted)
455 break;
456 }
457
458 if (best_len == 0)
459 return 0;
460
461 if (best_len >= wanted)
462 best_len = wanted;
463
464 *rlen = best_len;
465 *rstart = best_start;
466
467 mlog(0, "Found start: %u len: %u\n", best_start, best_len);
468
469 return *rlen;
470}
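/*
 * Worked example (hypothetical bitmap, 0 = free): for bits 0,0,1,0,0,0
 * with search_start = 0, search_len = 6 and wanted = 3, the scan counts
 * the run at bits 0-1 (found = 2), restarts after the set bit at 2,
 * then counts bits 3-5 until found reaches wanted and the loop breaks,
 * giving *rstart = 3 and *rlen = 3.
 */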
471
472static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
473 struct ocfs2_alloc_reservation *resv,
474 unsigned int goal, unsigned int wanted)
475{
476 struct rb_root *root = &resmap->m_reservations;
477 unsigned int gap_start, gap_end, gap_len;
478 struct ocfs2_alloc_reservation *prev_resv, *next_resv;
479 struct rb_node *prev, *next;
480 unsigned int cstart, clen;
481 unsigned int best_start = 0, best_len = 0;
482
483 /*
484 * Nasty cases to consider:
485 *
486 * - rbtree is empty
487 * - our window should be first in all reservations
488 * - our window should be last in all reservations
489 * - need to make sure we don't go past end of bitmap
490 */
491
492 mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
493 resv->r_start, ocfs2_resv_end(resv), goal, wanted);
494
495 assert_spin_locked(&resv_lock);
496
497 if (RB_EMPTY_ROOT(root)) {
498 /*
499 * Easiest case - empty tree. We can just take
500 * whatever window of free bits we want.
501 */
502
503 mlog(0, "Empty root\n");
504
505 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
506 resmap->m_bitmap_len - goal,
507 &cstart, &clen);
508
509 /*
510 * This should never happen - the local alloc window
511 * will always have free bits when we're called.
512 */
513 BUG_ON(goal == 0 && clen == 0);
514
515 if (clen == 0)
516 return;
517
518 resv->r_start = cstart;
519 resv->r_len = clen;
520
521 ocfs2_resv_insert(resmap, resv);
522 return;
523 }
524
525 prev_resv = ocfs2_find_resv_lhs(resmap, goal);
526
527 if (prev_resv == NULL) {
528 mlog(0, "Goal on LHS of leftmost window\n");
529
530 /*
531 * A NULL here means that the search code couldn't
532 * find a window that starts before goal.
533 *
534 * However, we can take the first window after goal,
 535		 * which is also, by definition, the leftmost window in
536 * the entire tree. If we can find free bits in the
537 * gap between goal and the LHS window, then the
538 * reservation can safely be placed there.
539 *
540 * Otherwise we fall back to a linear search, checking
541 * the gaps in between windows for a place to
542 * allocate.
543 */
544
545 next = rb_first(root);
546 next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
547 r_node);
548
549 /*
 550		 * The search should never return such a window (see the
 551		 * comment above).
552 */
553 if (next_resv->r_start <= goal) {
554 mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
555 goal, next_resv->r_start, next_resv->r_len);
556 ocfs2_dump_resv(resmap);
557 BUG();
558 }
559
560 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
561 next_resv->r_start - goal,
562 &cstart, &clen);
563 if (clen) {
564 best_len = clen;
565 best_start = cstart;
566 if (best_len == wanted)
567 goto out_insert;
568 }
569
570 prev_resv = next_resv;
571 next_resv = NULL;
572 }
573
574 prev = &prev_resv->r_node;
575
 576	/* Now we do a linear search for a window, starting at 'prev_resv' */
577 while (1) {
578 next = rb_next(prev);
579 if (next) {
580 mlog(0, "One more resv found in linear search\n");
581 next_resv = rb_entry(next,
582 struct ocfs2_alloc_reservation,
583 r_node);
584
585 gap_start = ocfs2_resv_end(prev_resv) + 1;
586 gap_end = next_resv->r_start - 1;
587 gap_len = gap_end - gap_start + 1;
588 } else {
589 mlog(0, "No next node\n");
590 /*
591 * We're at the rightmost edge of the
592 * tree. See if a reservation between this
593 * window and the end of the bitmap will work.
594 */
595 gap_start = ocfs2_resv_end(prev_resv) + 1;
596 gap_len = resmap->m_bitmap_len - gap_start;
597 gap_end = resmap->m_bitmap_len - 1;
598 }
599
600 /*
601 * No need to check this gap if we have already found
602 * a larger region of free bits.
603 */
604 if (gap_len <= best_len)
605 goto next_resv;
606
607 clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
608 gap_len, &cstart, &clen);
609 if (clen == wanted) {
610 best_len = clen;
611 best_start = cstart;
612 goto out_insert;
613 } else if (clen > best_len) {
614 best_len = clen;
615 best_start = cstart;
616 }
617
618next_resv:
619 if (!next)
620 break;
621
622 prev = next;
623 prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
624 r_node);
625 }
626
627out_insert:
628 if (best_len) {
629 resv->r_start = best_start;
630 resv->r_len = best_len;
631 ocfs2_resv_insert(resmap, resv);
632 }
633}
634
635static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
636 struct ocfs2_alloc_reservation *resv,
637 unsigned int wanted)
638{
639 struct ocfs2_alloc_reservation *lru_resv;
640 int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
641 unsigned int min_bits;
642
643 if (!tmpwindow)
644 min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
645 else
 646		min_bits = wanted; /* We know the temp window will use all
647 * of these bits */
648
649 /*
650 * Take the first reservation off the LRU as our 'target'. We
651 * don't try to be smart about it. There might be a case for
652 * searching based on size but I don't have enough data to be
653 * sure. --Mark (3/16/2010)
654 */
655 lru_resv = list_first_entry(&resmap->m_lru,
656 struct ocfs2_alloc_reservation, r_lru);
657
658 mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
659 lru_resv->r_len, ocfs2_resv_end(lru_resv));
660
661 /*
662 * Cannibalize (some or all) of the target reservation and
663 * feed it to the current window.
664 */
665 if (lru_resv->r_len <= min_bits) {
666 /*
667 * Discard completely if size is less than or equal to a
 668		 * reasonable threshold - 50% of window bits for non-temporary
669 * windows.
670 */
671 resv->r_start = lru_resv->r_start;
672 resv->r_len = lru_resv->r_len;
673
674 __ocfs2_resv_discard(resmap, lru_resv);
675 } else {
676 unsigned int shrink;
677 if (tmpwindow)
678 shrink = min_bits;
679 else
680 shrink = lru_resv->r_len / 2;
681
682 lru_resv->r_len -= shrink;
683
684 resv->r_start = ocfs2_resv_end(lru_resv) + 1;
685 resv->r_len = shrink;
686 }
687
688 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
689 "r_len: %u r_last_start: %u r_last_len: %u\n",
690 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
691 resv->r_last_start, resv->r_last_len);
692
693 ocfs2_resv_insert(resmap, resv);
694}
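/*
 * For illustration (hypothetical numbers, assuming the default resv
 * level of 2, i.e. a 16-bit window and min_bits = 8): an LRU victim
 * covering [100, 131] (32 bits) is larger than min_bits, so it is
 * halved - the victim keeps [100, 115] and the new reservation takes
 * [116, 131]. A victim of 8 bits or fewer would instead be discarded
 * and its range handed over wholesale.
 */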
695
696static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
697 struct ocfs2_alloc_reservation *resv,
698 unsigned int wanted)
699{
700 unsigned int goal = 0;
701
702 BUG_ON(!ocfs2_resv_empty(resv));
703
704 /*
705 * Begin by trying to get a window as close to the previous
706 * one as possible. Using the most recent allocation as a
707 * start goal makes sense.
708 */
709 if (resv->r_last_len) {
710 goal = resv->r_last_start + resv->r_last_len;
711 if (goal >= resmap->m_bitmap_len)
712 goal = 0;
713 }
714
715 __ocfs2_resv_find_window(resmap, resv, goal, wanted);
716
717 /* Search from last alloc didn't work, try once more from beginning. */
718 if (ocfs2_resv_empty(resv) && goal != 0)
719 __ocfs2_resv_find_window(resmap, resv, 0, wanted);
720
721 if (ocfs2_resv_empty(resv)) {
722 /*
723 * Still empty? Pull oldest one off the LRU, remove it from
 724		 * the tree, and put this one in its place.
725 */
726 ocfs2_cannibalize_resv(resmap, resv, wanted);
727 }
728
729 BUG_ON(ocfs2_resv_empty(resv));
730}
731
732int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
733 struct ocfs2_alloc_reservation *resv,
734 int *cstart, int *clen)
735{
736 unsigned int wanted = *clen;
737
738 if (resv == NULL || ocfs2_resmap_disabled(resmap))
739 return -ENOSPC;
740
741 spin_lock(&resv_lock);
742
743 /*
744 * We don't want to over-allocate for temporary
745 * windows. Otherwise, we run the risk of fragmenting the
746 * allocation space.
747 */
748 wanted = ocfs2_resv_window_bits(resmap, resv);
749 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
750 wanted = *clen;
751
752 if (ocfs2_resv_empty(resv)) {
753 mlog(0, "empty reservation, find new window\n");
754
755 /*
756 * Try to get a window here. If it works, we must fall
 757		 * through and test the bitmap. This avoids some
 758		 * ping-ponging of windows due to non-reserved space
 759		 * being allocated before we initialize a window for
760 * that inode.
761 */
762 ocfs2_resv_find_window(resmap, resv, wanted);
763 }
764
765 BUG_ON(ocfs2_resv_empty(resv));
766
767 *cstart = resv->r_start;
768 *clen = resv->r_len;
769
770 spin_unlock(&resv_lock);
771 return 0;
772}
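/*
 * Window sizing, by example (hypothetical numbers): at the default resv
 * level of 2 the window is 16 bits, so a 4-cluster request from a
 * regular file is rounded up to a full 16-bit window, while a
 * 100-cluster request grows wanted to 100. A temporary
 * (OCFS2_RESV_FLAG_TMP) reservation never over-allocates: a 4-cluster
 * request stays at 4.
 */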
773
774static void
775 ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
776 struct ocfs2_alloc_reservation *resv,
777 unsigned int start, unsigned int end)
778{
779 unsigned int rhs = 0;
780 unsigned int old_end = ocfs2_resv_end(resv);
781
782 BUG_ON(start != resv->r_start || old_end < end);
783
784 /*
785 * Completely used? We can remove it then.
786 */
787 if (old_end == end) {
788 __ocfs2_resv_discard(resmap, resv);
789 return;
790 }
791
792 rhs = old_end - end;
793
794 /*
795 * This should have been trapped above.
796 */
797 BUG_ON(rhs == 0);
798
799 resv->r_start = end + 1;
800 resv->r_len = old_end - resv->r_start + 1;
801}
802
803void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
804 struct ocfs2_alloc_reservation *resv,
805 u32 cstart, u32 clen)
806{
807 unsigned int cend = cstart + clen - 1;
808
809 if (resmap == NULL || ocfs2_resmap_disabled(resmap))
810 return;
811
812 if (resv == NULL)
813 return;
814
815 BUG_ON(cstart != resv->r_start);
816
817 spin_lock(&resv_lock);
818
819 mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
820 "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
821 cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
822 resv->r_len, resv->r_last_start, resv->r_last_len);
823
824 BUG_ON(cstart < resv->r_start);
825 BUG_ON(cstart > ocfs2_resv_end(resv));
826 BUG_ON(cend > ocfs2_resv_end(resv));
827
828 ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
829 resv->r_last_start = cstart;
830 resv->r_last_len = clen;
831
832 /*
 833	 * May have been discarded above in
834 * ocfs2_adjust_resv_from_alloc().
835 */
836 if (!ocfs2_resv_empty(resv))
837 ocfs2_resv_mark_lru(resmap, resv);
838
839 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
840 "r_len: %u r_last_start: %u r_last_len: %u\n",
841 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
842 resv->r_last_start, resv->r_last_len);
843
844 ocfs2_check_resmap(resmap);
845
846 spin_unlock(&resv_lock);
847}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 000000000000..1e49cc29d06c
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.h
5 *
6 * Allocation reservations function prototypes and structures.
7 *
8 * Copyright (C) 2010 Novell. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_RESERVATIONS_H
21#define OCFS2_RESERVATIONS_H
22
23#include <linux/rbtree.h>
24
25#define OCFS2_DEFAULT_RESV_LEVEL 2
26#define OCFS2_MAX_RESV_LEVEL 9
27#define OCFS2_MIN_RESV_LEVEL 0
28
29struct ocfs2_alloc_reservation {
30 struct rb_node r_node;
31
 32	unsigned int	r_start;	/* Beginning of current window */
33 unsigned int r_len; /* Length of the window */
34
35 unsigned int r_last_len; /* Length of most recent alloc */
36 unsigned int r_last_start; /* Start of most recent alloc */
37 struct list_head r_lru; /* LRU list head */
38
39 unsigned int r_flags;
40};
41
 42#define	OCFS2_RESV_FLAG_INUSE	0x01	/* Set when r_node is part of the rbtree */
43#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
 44					 * destroyed immediately after use */
45#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
46 * directory btree */
47
48struct ocfs2_reservation_map {
49 struct rb_root m_reservations;
50 char *m_disk_bitmap;
51
52 struct ocfs2_super *m_osb;
53
54 /* The following are not initialized to meaningful values until a disk
55 * bitmap is provided. */
56 u32 m_bitmap_len; /* Number of valid
57 * bits available */
58
59 struct list_head m_lru; /* LRU of reservations
60 * structures. */
61
62};
63
64void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
65
66#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
67void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
68 unsigned int flags);
69
70int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
71
72/**
73 * ocfs2_resv_discard() - truncate a reservation
 74 * @resmap: reservations bitmap containing @resv
75 * @resv: the reservation to truncate.
76 *
77 * After this function is called, the reservation will be empty, and
78 * unlinked from the rbtree.
79 */
80void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
81 struct ocfs2_alloc_reservation *resv);
82
83
84/**
85 * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
 86 * @osb: ocfs2 super block owning this map
 87 * @resmap: struct ocfs2_reservation_map to initialize
90 *
 91 * The only possible return value other than '0' is -ENOMEM, for failure
 92 * to allocate the mirror bitmap.
93 */
94int ocfs2_resmap_init(struct ocfs2_super *osb,
95 struct ocfs2_reservation_map *resmap);
96
97/**
98 * ocfs2_resmap_restart() - "restart" a reservation bitmap
99 * @resmap: reservations bitmap
100 * @clen: Number of valid bits in the bitmap
101 * @disk_bitmap: the disk bitmap this resmap should refer to.
102 *
103 * Re-initialize the parameters of a reservation bitmap. This is
104 * useful for local alloc window slides.
105 *
 106 * This function discards (truncates and unlinks) all existing
 107 * reservations. A future version will recalculate existing
108 * reservations based on the new bitmap.
109 */
110void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
111 unsigned int clen, char *disk_bitmap);
112
113/**
114 * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
115 * @resmap: the struct ocfs2_reservation_map to uninitialize
116 */
117void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
118
119/**
120 * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
121 * @resmap: reservations bitmap
122 * @resv: reservation to base search from
123 * @cstart: start of proposed allocation
124 * @clen: length (in clusters) of proposed allocation
125 *
126 * Using the reservation data from resv, this function will compare
127 * resmap and resmap->m_disk_bitmap to determine what part (if any) of
128 * the reservation window is still clear to use. If resv is empty,
129 * this function will try to allocate a window for it.
130 *
131 * On success, zero is returned and the valid allocation area is set in cstart
132 * and clen.
133 *
134 * Returns -ENOSPC if reservations are disabled.
135 */
136int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
137 struct ocfs2_alloc_reservation *resv,
138 int *cstart, int *clen);
139
140/**
141 * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
142 * @resmap: reservations bitmap
 143 * @resv: optional reservation to recalculate based on new bitmap
144 * @cstart: start of allocation in clusters
 145 * @clen: length of allocation in clusters.
146 *
147 * Tell the reservation code that bits were used to fulfill allocation in
148 * resmap. The bits don't have to have been part of any existing
149 * reservation. But we must always call this function when bits are claimed.
150 * Internally, the reservations code will use this information to mark the
 151 * reservations bitmap. If resv is passed, its next allocation window will be
 152 * calculated. It also expects that 'cstart' is the same value we passed back
153 * from ocfs2_resmap_resv_bits().
154 */
155void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
156 struct ocfs2_alloc_reservation *resv,
157 u32 cstart, u32 clen);
158
159#endif /* OCFS2_RESERVATIONS_H */
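Taken together, ocfs2_resmap_resv_bits() and ocfs2_resmap_claimed_bits() form a reserve-then-claim protocol. The sketch below is illustrative only: example_alloc() is a hypothetical caller, and the bitmap search it elides is done by the real allocators in suballoc.c; locking is handled inside the helpers, as reservations.c above shows.

static int example_alloc(struct ocfs2_reservation_map *resmap,
			 struct ocfs2_alloc_reservation *resv,
			 int wanted)
{
	int cstart, clen = wanted;

	/* Ask the reservation code which window to search first. */
	if (ocfs2_resmap_resv_bits(resmap, resv, &cstart, &clen))
		return -ENOSPC;	/* reservations disabled */

	/*
	 * A real caller searches the disk bitmap within
	 * [cstart, cstart + clen) here; we pretend it claimed all
	 * clen bits at cstart and report that back.
	 */
	ocfs2_resmap_claimed_bits(resmap, resv, cstart, clen);
	return 0;
}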
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 3c3d673a4d20..dacd553d8617 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups); 134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 } 135 }
136 136
137 ret = ocfs2_journal_dirty(handle, group_bh); 137 ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142 138
143 /* update the inode accordingly. */ 139 /* update the inode accordingly. */
144 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, 140 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
@@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
319 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 315 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
320 316
321 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
322 ocfs2_group_bitmap_size(osb->sb) * 8) { 318 ocfs2_group_bitmap_size(osb->sb, 0,
319 osb->s_feature_incompat) * 8) {
323 mlog(ML_ERROR, "The disk is too old and small. " 320 mlog(ML_ERROR, "The disk is too old and small. "
324 "Force to do offline resize."); 321 "Force to do offline resize.");
325 ret = -EINVAL; 322 ret = -EINVAL;
@@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
500 fe = (struct ocfs2_dinode *)main_bm_bh->b_data; 497 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
501 498
502 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 499 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
503 ocfs2_group_bitmap_size(osb->sb) * 8) { 500 ocfs2_group_bitmap_size(osb->sb, 0,
501 osb->s_feature_incompat) * 8) {
504 mlog(ML_ERROR, "The disk is too old and small." 502 mlog(ML_ERROR, "The disk is too old and small."
505 " Force to do offline resize."); 503 " Force to do offline resize.");
506 ret = -EINVAL; 504 ret = -EINVAL;
@@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
545 543
546 group = (struct ocfs2_group_desc *)group_bh->b_data; 544 group = (struct ocfs2_group_desc *)group_bh->b_data;
547 group->bg_next_group = cr->c_blkno; 545 group->bg_next_group = cr->c_blkno;
548 546 ocfs2_journal_dirty(handle, group_bh);
549 ret = ocfs2_journal_dirty(handle, group_bh);
550 if (ret < 0) {
551 mlog_errno(ret);
552 goto out_commit;
553 }
554 547
555 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), 548 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
556 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); 549 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 7020e1253ffa..0d3049f696c5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/slab.h>
22#include <linux/module.h> 23#include <linux/module.h>
23 24
24/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ 25/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 5ae8812b2864..2dc57bca0688 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
25#include <linux/reboot.h> 26#include <linux/reboot.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c3c60bc3e072..f4c2a9eb8c4d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -53,6 +53,15 @@
53 53
54#define OCFS2_MAX_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56struct ocfs2_suballoc_result {
57 u64 sr_bg_blkno; /* The bg we allocated from. Set
58 to 0 when a block group is
59 contiguous. */
60 u64 sr_blkno; /* The first allocated block */
61 unsigned int sr_bit_offset; /* The bit in the bg */
62 unsigned int sr_bits; /* How many bits we claimed */
63};
64
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 65static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 66static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
58static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 67static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -60,6 +69,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
60 struct inode *alloc_inode, 69 struct inode *alloc_inode,
61 struct buffer_head *bg_bh, 70 struct buffer_head *bg_bh,
62 u64 group_blkno, 71 u64 group_blkno,
72 unsigned int group_clusters,
63 u16 my_chain, 73 u16 my_chain,
64 struct ocfs2_chain_list *cl); 74 struct ocfs2_chain_list *cl);
65static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 75static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,20 +83,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
73 struct buffer_head *group_bh, 83 struct buffer_head *group_bh,
74 u32 bits_wanted, u32 min_bits, 84 u32 bits_wanted, u32 min_bits,
75 u64 max_block, 85 u64 max_block,
76 u16 *bit_off, u16 *bits_found); 86 struct ocfs2_suballoc_result *res);
77static int ocfs2_block_group_search(struct inode *inode, 87static int ocfs2_block_group_search(struct inode *inode,
78 struct buffer_head *group_bh, 88 struct buffer_head *group_bh,
79 u32 bits_wanted, u32 min_bits, 89 u32 bits_wanted, u32 min_bits,
80 u64 max_block, 90 u64 max_block,
81 u16 *bit_off, u16 *bits_found); 91 struct ocfs2_suballoc_result *res);
82static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 92static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
83 struct ocfs2_alloc_context *ac,
84 handle_t *handle, 93 handle_t *handle,
85 u32 bits_wanted, 94 u32 bits_wanted,
86 u32 min_bits, 95 u32 min_bits,
87 u16 *bit_off, 96 struct ocfs2_suballoc_result *res);
88 unsigned int *num_bits,
89 u64 *bg_blkno);
90static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 97static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
91 int nr); 98 int nr);
92static inline int ocfs2_block_group_set_bits(handle_t *handle, 99static inline int ocfs2_block_group_set_bits(handle_t *handle,
@@ -95,13 +102,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
95 struct buffer_head *group_bh, 102 struct buffer_head *group_bh,
96 unsigned int bit_off, 103 unsigned int bit_off,
97 unsigned int num_bits); 104 unsigned int num_bits);
98static inline int ocfs2_block_group_clear_bits(handle_t *handle,
99 struct inode *alloc_inode,
100 struct ocfs2_group_desc *bg,
101 struct buffer_head *group_bh,
102 unsigned int bit_off,
103 unsigned int num_bits);
104
105static int ocfs2_relink_block_group(handle_t *handle, 105static int ocfs2_relink_block_group(handle_t *handle,
106 struct inode *alloc_inode, 106 struct inode *alloc_inode,
107 struct buffer_head *fe_bh, 107 struct buffer_head *fe_bh,
@@ -137,6 +137,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
137 } 137 }
138 brelse(ac->ac_bh); 138 brelse(ac->ac_bh);
139 ac->ac_bh = NULL; 139 ac->ac_bh = NULL;
140 ac->ac_resv = NULL;
140} 141}
141 142
142void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 143void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -152,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
152 153
153#define do_error(fmt, ...) \ 154#define do_error(fmt, ...) \
154 do{ \ 155 do{ \
155 if (clean_error) \ 156 if (resize) \
156 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 157 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
157 else \ 158 else \
158 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 159 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +161,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
160 161
161static int ocfs2_validate_gd_self(struct super_block *sb, 162static int ocfs2_validate_gd_self(struct super_block *sb,
162 struct buffer_head *bh, 163 struct buffer_head *bh,
163 int clean_error) 164 int resize)
164{ 165{
165 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 166 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
166 167
@@ -211,7 +212,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
211static int ocfs2_validate_gd_parent(struct super_block *sb, 212static int ocfs2_validate_gd_parent(struct super_block *sb,
212 struct ocfs2_dinode *di, 213 struct ocfs2_dinode *di,
213 struct buffer_head *bh, 214 struct buffer_head *bh,
214 int clean_error) 215 int resize)
215{ 216{
216 unsigned int max_bits; 217 unsigned int max_bits;
217 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 218 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +234,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
233 return -EINVAL; 234 return -EINVAL;
234 } 235 }
235 236
236 if (le16_to_cpu(gd->bg_chain) >= 237 /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
237 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 238 if ((le16_to_cpu(gd->bg_chain) >
239 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
240 ((le16_to_cpu(gd->bg_chain) ==
241 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
238 do_error("Group descriptor #%llu has bad chain %u", 242 do_error("Group descriptor #%llu has bad chain %u",
239 (unsigned long long)bh->b_blocknr, 243 (unsigned long long)bh->b_blocknr,
240 le16_to_cpu(gd->bg_chain)); 244 le16_to_cpu(gd->bg_chain));
@@ -329,14 +333,38 @@ out:
329 return rc; 333 return rc;
330} 334}
331 335
336static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
337 struct ocfs2_group_desc *bg,
338 struct ocfs2_chain_list *cl,
339 u64 p_blkno, u32 clusters)
340{
341 struct ocfs2_extent_list *el = &bg->bg_list;
342 struct ocfs2_extent_rec *rec;
343
344 BUG_ON(!ocfs2_supports_discontig_bg(osb));
345 if (!el->l_next_free_rec)
346 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
347 rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
348 rec->e_blkno = cpu_to_le64(p_blkno);
349 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
350 le16_to_cpu(cl->cl_bpc));
351 rec->e_leaf_clusters = cpu_to_le32(clusters);
352 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
353 le16_add_cpu(&bg->bg_free_bits_count,
354 clusters * le16_to_cpu(cl->cl_bpc));
355 le16_add_cpu(&el->l_next_free_rec, 1);
356}
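/*
 * For illustration (hypothetical geometry): with cl_bpc = 2 bits per
 * cluster, adding a 4-cluster extent at p_blkno appends a record whose
 * e_cpos is the group's current bit count divided by cl_bpc, then
 * grows both bg_bits and bg_free_bits_count by 4 * 2 = 8 bits.
 */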
357
332static int ocfs2_block_group_fill(handle_t *handle, 358static int ocfs2_block_group_fill(handle_t *handle,
333 struct inode *alloc_inode, 359 struct inode *alloc_inode,
334 struct buffer_head *bg_bh, 360 struct buffer_head *bg_bh,
335 u64 group_blkno, 361 u64 group_blkno,
362 unsigned int group_clusters,
336 u16 my_chain, 363 u16 my_chain,
337 struct ocfs2_chain_list *cl) 364 struct ocfs2_chain_list *cl)
338{ 365{
339 int status = 0; 366 int status = 0;
367 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
340 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 368 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
341 struct super_block * sb = alloc_inode->i_sb; 369 struct super_block * sb = alloc_inode->i_sb;
342 370
@@ -363,19 +391,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
363 memset(bg, 0, sb->s_blocksize); 391 memset(bg, 0, sb->s_blocksize);
364 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); 392 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
365 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 393 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
366 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); 394 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
367 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); 395 osb->s_feature_incompat));
368 bg->bg_chain = cpu_to_le16(my_chain); 396 bg->bg_chain = cpu_to_le16(my_chain);
369 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; 397 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
370 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); 398 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
371 bg->bg_blkno = cpu_to_le64(group_blkno); 399 bg->bg_blkno = cpu_to_le64(group_blkno);
400 if (group_clusters == le16_to_cpu(cl->cl_cpg))
401 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
402 else
403 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
404 group_clusters);
405
372 /* set the 1st bit in the bitmap to account for the descriptor block */ 406 /* set the 1st bit in the bitmap to account for the descriptor block */
373 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); 407 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
374 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); 408 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
375 409
376 status = ocfs2_journal_dirty(handle, bg_bh); 410 ocfs2_journal_dirty(handle, bg_bh);
377 if (status < 0)
378 mlog_errno(status);
379 411
380 /* There is no need to zero out or otherwise initialize the 412 /* There is no need to zero out or otherwise initialize the
381 * other blocks in a group - All valid FS metadata in a block 413 * other blocks in a group - All valid FS metadata in a block
@@ -401,6 +433,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
401 return best; 433 return best;
402} 434}
403 435
436static struct buffer_head *
437ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
438 struct inode *alloc_inode,
439 struct ocfs2_alloc_context *ac,
440 struct ocfs2_chain_list *cl)
441{
442 int status;
443 u32 bit_off, num_bits;
444 u64 bg_blkno;
445 struct buffer_head *bg_bh;
446 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
447
448 status = ocfs2_claim_clusters(handle, ac,
449 le16_to_cpu(cl->cl_cpg), &bit_off,
450 &num_bits);
451 if (status < 0) {
452 if (status != -ENOSPC)
453 mlog_errno(status);
454 goto bail;
455 }
456
457 /* setup the group */
458 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
459 mlog(0, "new descriptor, record %u, at block %llu\n",
460 alloc_rec, (unsigned long long)bg_blkno);
461
462 bg_bh = sb_getblk(osb->sb, bg_blkno);
463 if (!bg_bh) {
464 status = -EIO;
465 mlog_errno(status);
466 goto bail;
467 }
468 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
469
470 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
471 bg_blkno, num_bits, alloc_rec, cl);
472 if (status < 0) {
473 brelse(bg_bh);
474 mlog_errno(status);
475 }
476
477bail:
478 return status ? ERR_PTR(status) : bg_bh;
479}
480
481static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
482 handle_t *handle,
483 struct ocfs2_alloc_context *ac,
484 unsigned int min_bits,
485 u32 *bit_off, u32 *num_bits)
486{
487 int status = 0;
488
489 while (min_bits) {
490 status = ocfs2_claim_clusters(handle, ac, min_bits,
491 bit_off, num_bits);
492 if (status != -ENOSPC)
493 break;
494
495 min_bits >>= 1;
496 }
497
498 return status;
499}
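/*
 * For illustration: with min_bits = 16 on a fragmented bitmap, the
 * loop above asks for 16, then 8, 4, 2 and finally 1 cluster(s),
 * returning the first successful claim; only if every size fails with
 * -ENOSPC does that error reach the caller.
 */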
500
501static int ocfs2_block_group_grow_discontig(handle_t *handle,
502 struct inode *alloc_inode,
503 struct buffer_head *bg_bh,
504 struct ocfs2_alloc_context *ac,
505 struct ocfs2_chain_list *cl,
506 unsigned int min_bits)
507{
508 int status;
509 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
510 struct ocfs2_group_desc *bg =
511 (struct ocfs2_group_desc *)bg_bh->b_data;
512 unsigned int needed = le16_to_cpu(cl->cl_cpg) -
513 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
514 u32 p_cpos, clusters;
515 u64 p_blkno;
516 struct ocfs2_extent_list *el = &bg->bg_list;
517
518 status = ocfs2_journal_access_gd(handle,
519 INODE_CACHE(alloc_inode),
520 bg_bh,
521 OCFS2_JOURNAL_ACCESS_CREATE);
522 if (status < 0) {
523 mlog_errno(status);
524 goto bail;
525 }
526
527 while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
528 le16_to_cpu(el->l_count))) {
529 if (min_bits > needed)
530 min_bits = needed;
531 status = ocfs2_block_group_claim_bits(osb, handle, ac,
532 min_bits, &p_cpos,
533 &clusters);
534 if (status < 0) {
535 if (status != -ENOSPC)
536 mlog_errno(status);
537 goto bail;
538 }
539 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
540 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
541 clusters);
542
543 min_bits = clusters;
544 needed = le16_to_cpu(cl->cl_cpg) -
545 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
546 }
547
548 if (needed > 0) {
549 /*
 550		 * We have used up all the extent recs but still can't fill
 551		 * up the cluster group (cl_cpg), so bail out.
552 */
553 status = -ENOSPC;
554 goto bail;
555 }
556
557 ocfs2_journal_dirty(handle, bg_bh);
558
559bail:
560 return status;
561}
562
563static void ocfs2_bg_alloc_cleanup(handle_t *handle,
564 struct ocfs2_alloc_context *cluster_ac,
565 struct inode *alloc_inode,
566 struct buffer_head *bg_bh)
567{
568 int i, ret;
569 struct ocfs2_group_desc *bg;
570 struct ocfs2_extent_list *el;
571 struct ocfs2_extent_rec *rec;
572
573 if (!bg_bh)
574 return;
575
576 bg = (struct ocfs2_group_desc *)bg_bh->b_data;
577 el = &bg->bg_list;
578 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
579 rec = &el->l_recs[i];
580 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
581 cluster_ac->ac_bh,
582 le64_to_cpu(rec->e_blkno),
583 le32_to_cpu(rec->e_leaf_clusters));
584 if (ret)
585 mlog_errno(ret);
 586		/* Keep going and try to free the remaining clusters */
587 }
588
589 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
590 brelse(bg_bh);
591}
592
593static struct buffer_head *
594ocfs2_block_group_alloc_discontig(handle_t *handle,
595 struct inode *alloc_inode,
596 struct ocfs2_alloc_context *ac,
597 struct ocfs2_chain_list *cl)
598{
599 int status;
600 u32 bit_off, num_bits;
601 u64 bg_blkno;
602 unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
603 struct buffer_head *bg_bh = NULL;
604 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
605 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
606
607 if (!ocfs2_supports_discontig_bg(osb)) {
608 status = -ENOSPC;
609 goto bail;
610 }
611
612 status = ocfs2_extend_trans(handle,
613 ocfs2_calc_bg_discontig_credits(osb->sb));
614 if (status) {
615 mlog_errno(status);
616 goto bail;
617 }
618
619 /*
620 * We're going to be grabbing from multiple cluster groups.
621 * We don't have enough credits to relink them all, and the
622 * cluster groups will be staying in cache for the duration of
623 * this operation.
624 */
625 ac->ac_allow_chain_relink = 0;
626
627 /* Claim the first region */
628 status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
629 &bit_off, &num_bits);
630 if (status < 0) {
631 if (status != -ENOSPC)
632 mlog_errno(status);
633 goto bail;
634 }
635 min_bits = num_bits;
636
637 /* setup the group */
638 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
639 mlog(0, "new descriptor, record %u, at block %llu\n",
640 alloc_rec, (unsigned long long)bg_blkno);
641
642 bg_bh = sb_getblk(osb->sb, bg_blkno);
643 if (!bg_bh) {
644 status = -EIO;
645 mlog_errno(status);
646 goto bail;
647 }
648 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
649
650 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
651 bg_blkno, num_bits, alloc_rec, cl);
652 if (status < 0) {
653 mlog_errno(status);
654 goto bail;
655 }
656
657 status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
658 bg_bh, ac, cl, min_bits);
659 if (status)
660 mlog_errno(status);
661
662bail:
663 if (status)
664 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
665 return status ? ERR_PTR(status) : bg_bh;
666}
667
404/* 668/*
405 * We expect the block group allocator to already be locked. 669 * We expect the block group allocator to already be locked.
406 */ 670 */
@@ -416,9 +680,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
416 struct ocfs2_chain_list *cl; 680 struct ocfs2_chain_list *cl;
417 struct ocfs2_alloc_context *ac = NULL; 681 struct ocfs2_alloc_context *ac = NULL;
418 handle_t *handle = NULL; 682 handle_t *handle = NULL;
419 u32 bit_off, num_bits;
420 u16 alloc_rec; 683 u16 alloc_rec;
421 u64 bg_blkno;
422 struct buffer_head *bg_bh = NULL; 684 struct buffer_head *bg_bh = NULL;
423 struct ocfs2_group_desc *bg; 685 struct ocfs2_group_desc *bg;
424 686
@@ -451,44 +713,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
451 (unsigned long long)*last_alloc_group); 713 (unsigned long long)*last_alloc_group);
452 ac->ac_last_group = *last_alloc_group; 714 ac->ac_last_group = *last_alloc_group;
453 } 715 }
454 status = ocfs2_claim_clusters(osb, 716
455 handle, 717 bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
456 ac, 718 ac, cl);
457 le16_to_cpu(cl->cl_cpg), 719 if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
458 &bit_off, 720 bg_bh = ocfs2_block_group_alloc_discontig(handle,
459 &num_bits); 721 alloc_inode,
460 if (status < 0) { 722 ac, cl);
723 if (IS_ERR(bg_bh)) {
724 status = PTR_ERR(bg_bh);
725 bg_bh = NULL;
461 if (status != -ENOSPC) 726 if (status != -ENOSPC)
462 mlog_errno(status); 727 mlog_errno(status);
463 goto bail; 728 goto bail;
464 } 729 }
465
466 alloc_rec = ocfs2_find_smallest_chain(cl);
467
468 /* setup the group */
469 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
470 mlog(0, "new descriptor, record %u, at block %llu\n",
471 alloc_rec, (unsigned long long)bg_blkno);
472
473 bg_bh = sb_getblk(osb->sb, bg_blkno);
474 if (!bg_bh) {
475 status = -EIO;
476 mlog_errno(status);
477 goto bail;
478 }
479 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
480
481 status = ocfs2_block_group_fill(handle,
482 alloc_inode,
483 bg_bh,
484 bg_blkno,
485 alloc_rec,
486 cl);
487 if (status < 0) {
488 mlog_errno(status);
489 goto bail;
490 }
491
492 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 730 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
493 731
494 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 732 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
@@ -498,10 +736,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
498 goto bail; 736 goto bail;
499 } 737 }
500 738
739 alloc_rec = le16_to_cpu(bg->bg_chain);
501 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, 740 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
502 le16_to_cpu(bg->bg_free_bits_count)); 741 le16_to_cpu(bg->bg_free_bits_count));
503 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); 742 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
504 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); 743 le16_to_cpu(bg->bg_bits));
744 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno);
505 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) 745 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
506 le16_add_cpu(&cl->cl_next_free_rec, 1); 746 le16_add_cpu(&cl->cl_next_free_rec, 1);
507 747
@@ -510,11 +750,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
510 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); 750 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
511 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); 751 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
512 752
513 status = ocfs2_journal_dirty(handle, bh); 753 ocfs2_journal_dirty(handle, bh);
514 if (status < 0) {
515 mlog_errno(status);
516 goto bail;
517 }
518 754
519 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 755 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
520 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 756 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -764,7 +1000,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
764 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 1000 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
765 EXTENT_ALLOC_SYSTEM_INODE, 1001 EXTENT_ALLOC_SYSTEM_INODE,
766 (u32)osb->slot_num, NULL, 1002 (u32)osb->slot_num, NULL,
767 ALLOC_NEW_GROUP); 1003 ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
768 1004
769 1005
770 if (status >= 0) { 1006 if (status >= 0) {
@@ -950,11 +1186,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
950 status = ocfs2_reserve_local_alloc_bits(osb, 1186 status = ocfs2_reserve_local_alloc_bits(osb,
951 bits_wanted, 1187 bits_wanted,
952 *ac); 1188 *ac);
953 if (status == -EFBIG) { 1189 if ((status < 0) && (status != -ENOSPC)) {
954 /* The local alloc window is outside ac_max_block.
955 * use the main bitmap. */
956 status = -ENOSPC;
957 } else if ((status < 0) && (status != -ENOSPC)) {
958 mlog_errno(status); 1190 mlog_errno(status);
959 goto bail; 1191 goto bail;
960 } 1192 }
@@ -1037,8 +1269,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1037 struct buffer_head *bg_bh, 1269 struct buffer_head *bg_bh,
1038 unsigned int bits_wanted, 1270 unsigned int bits_wanted,
1039 unsigned int total_bits, 1271 unsigned int total_bits,
1040 u16 *bit_off, 1272 struct ocfs2_suballoc_result *res)
1041 u16 *bits_found)
1042{ 1273{
1043 void *bitmap; 1274 void *bitmap;
1044 u16 best_offset, best_size; 1275 u16 best_offset, best_size;
@@ -1082,14 +1313,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1082 } 1313 }
1083 } 1314 }
1084 1315
1085 /* XXX: I think the first clause is equivalent to the second 1316 if (best_size) {
1086 * - jlbec */ 1317 res->sr_bit_offset = best_offset;
1087 if (found == bits_wanted) { 1318 res->sr_bits = best_size;
1088 *bit_off = start - found;
1089 *bits_found = found;
1090 } else if (best_size) {
1091 *bit_off = best_offset;
1092 *bits_found = best_size;
1093 } else { 1319 } else {
1094 status = -ENOSPC; 1320 status = -ENOSPC;
1095 /* No error log here -- see the comment above 1321 /* No error log here -- see the comment above
@@ -1133,16 +1359,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1133 } 1359 }
1134 1360
1135 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1361 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1136
1137 while(num_bits--) 1362 while(num_bits--)
1138 ocfs2_set_bit(bit_off++, bitmap); 1363 ocfs2_set_bit(bit_off++, bitmap);
1139 1364
1140 status = ocfs2_journal_dirty(handle, 1365 ocfs2_journal_dirty(handle, group_bh);
1141 group_bh);
1142 if (status < 0) {
1143 mlog_errno(status);
1144 goto bail;
1145 }
1146 1366
1147bail: 1367bail:
1148 mlog_exit(status); 1368 mlog_exit(status);
@@ -1206,12 +1426,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1206 } 1426 }
1207 1427
1208 prev_bg->bg_next_group = bg->bg_next_group; 1428 prev_bg->bg_next_group = bg->bg_next_group;
1209 1429 ocfs2_journal_dirty(handle, prev_bg_bh);
1210 status = ocfs2_journal_dirty(handle, prev_bg_bh);
1211 if (status < 0) {
1212 mlog_errno(status);
1213 goto out_rollback;
1214 }
1215 1430
1216 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1431 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1217 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1432 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1221,12 +1436,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1221 } 1436 }
1222 1437
1223 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1438 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1224 1439 ocfs2_journal_dirty(handle, bg_bh);
1225 status = ocfs2_journal_dirty(handle, bg_bh);
1226 if (status < 0) {
1227 mlog_errno(status);
1228 goto out_rollback;
1229 }
1230 1440
1231 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1441 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1232 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1442 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1236,14 +1446,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1236 } 1446 }
1237 1447
1238 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1448 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1449 ocfs2_journal_dirty(handle, fe_bh);
1239 1450
1240 status = ocfs2_journal_dirty(handle, fe_bh);
1241 if (status < 0) {
1242 mlog_errno(status);
1243 goto out_rollback;
1244 }
1245
1246 status = 0;
1247out_rollback: 1451out_rollback:
1248 if (status < 0) { 1452 if (status < 0) {
1249 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); 1453 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
@@ -1267,14 +1471,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1267 struct buffer_head *group_bh, 1471 struct buffer_head *group_bh,
1268 u32 bits_wanted, u32 min_bits, 1472 u32 bits_wanted, u32 min_bits,
1269 u64 max_block, 1473 u64 max_block,
1270 u16 *bit_off, u16 *bits_found) 1474 struct ocfs2_suballoc_result *res)
1271{ 1475{
1272 int search = -ENOSPC; 1476 int search = -ENOSPC;
1273 int ret; 1477 int ret;
1274 u64 blkoff; 1478 u64 blkoff;
1275 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1479 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1276 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1480 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1277 u16 tmp_off, tmp_found;
1278 unsigned int max_bits, gd_cluster_off; 1481 unsigned int max_bits, gd_cluster_off;
1279 1482
1280 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1483 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1301,15 +1504,15 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1301 1504
1302 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1505 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1303 group_bh, bits_wanted, 1506 group_bh, bits_wanted,
1304 -						max_bits,
1305 -						&tmp_off, &tmp_found);
1507 +						max_bits, res);
1306 if (ret) 1508 if (ret)
1307 return ret; 1509 return ret;
1308 1510
1309 if (max_block) { 1511 if (max_block) {
1310 blkoff = ocfs2_clusters_to_blocks(inode->i_sb, 1512 blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1311 gd_cluster_off + 1513 gd_cluster_off +
1312 tmp_off + tmp_found); 1514 res->sr_bit_offset +
1515 res->sr_bits);
1313 mlog(0, "Checking %llu against %llu\n", 1516 mlog(0, "Checking %llu against %llu\n",
1314 (unsigned long long)blkoff, 1517 (unsigned long long)blkoff,
1315 (unsigned long long)max_block); 1518 (unsigned long long)max_block);
@@ -1321,16 +1524,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1321 * return success, but we still want to return 1524 * return success, but we still want to return
1322 * -ENOSPC unless it found the minimum number 1525 * -ENOSPC unless it found the minimum number
1323 * of bits. */ 1526 * of bits. */
1324 -			if (min_bits <= tmp_found) {
1325 -				*bit_off = tmp_off;
1326 -				*bits_found = tmp_found;
1527 +			if (min_bits <= res->sr_bits)
1327 search = 0; /* success */ 1528 search = 0; /* success */
1328 -			} else if (tmp_found) {
1529 +			else if (res->sr_bits) {
1329 /* 1530 /*
1330 * Don't show bits which we'll be returning 1531 * Don't show bits which we'll be returning
1331 * for allocation to the local alloc bitmap. 1532 * for allocation to the local alloc bitmap.
1332 */ 1533 */
1333 -				ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1534 +				ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1334 } 1535 }
1335 } 1536 }
1336 1537
@@ -1341,7 +1542,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1341 struct buffer_head *group_bh, 1542 struct buffer_head *group_bh,
1342 u32 bits_wanted, u32 min_bits, 1543 u32 bits_wanted, u32 min_bits,
1343 u64 max_block, 1544 u64 max_block,
1344 -				    u16 *bit_off, u16 *bits_found)
1545 +				    struct ocfs2_suballoc_result *res)
1345{ 1546{
1346 int ret = -ENOSPC; 1547 int ret = -ENOSPC;
1347 u64 blkoff; 1548 u64 blkoff;
@@ -1354,10 +1555,10 @@ static int ocfs2_block_group_search(struct inode *inode,
1354 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1555 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1355 group_bh, bits_wanted, 1556 group_bh, bits_wanted,
1356 le16_to_cpu(bg->bg_bits), 1557 le16_to_cpu(bg->bg_bits),
1357 -						bit_off, bits_found);
1558 +						res);
1358 if (!ret && max_block) { 1559 if (!ret && max_block) {
1359 -		blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1360 -			*bits_found;
1560 +		blkoff = le64_to_cpu(bg->bg_blkno) +
1561 +			res->sr_bit_offset + res->sr_bits;
1361 mlog(0, "Checking %llu against %llu\n", 1562 mlog(0, "Checking %llu against %llu\n",
1362 (unsigned long long)blkoff, 1563 (unsigned long long)blkoff,
1363 (unsigned long long)max_block); 1564 (unsigned long long)max_block);
@@ -1390,33 +1591,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1390 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1591 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1391 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 1592 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1392 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 1593 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1393 -
1394 -	ret = ocfs2_journal_dirty(handle, di_bh);
1395 -	if (ret < 0)
1396 -		mlog_errno(ret);
1594 +	ocfs2_journal_dirty(handle, di_bh);
1397 1595
1398out: 1596out:
1399 return ret; 1597 return ret;
1400} 1598}
1401 1599
1600static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1601 struct ocfs2_extent_rec *rec,
1602 struct ocfs2_chain_list *cl)
1603{
1604 unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1605 unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1606 unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
1607
1608 if (res->sr_bit_offset < bitoff)
1609 return 0;
1610 if (res->sr_bit_offset >= (bitoff + bitcount))
1611 return 0;
1612 res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1613 (res->sr_bit_offset - bitoff);
1614 if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1615 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1616 return 1;
1617}
1618
1619static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1620 struct ocfs2_group_desc *bg,
1621 struct ocfs2_suballoc_result *res)
1622{
1623 int i;
1624 u64 bg_blkno = res->sr_bg_blkno; /* Save off */
1625 struct ocfs2_extent_rec *rec;
1626 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1627 struct ocfs2_chain_list *cl = &di->id2.i_chain;
1628
1629 if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1630 res->sr_blkno = 0;
1631 return;
1632 }
1633
1634 res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1635 res->sr_bg_blkno = 0; /* Clear it for contig block groups */
1636 if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1637 !bg->bg_list.l_next_free_rec)
1638 return;
1639
1640 for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1641 rec = &bg->bg_list.l_recs[i];
1642 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1643 res->sr_bg_blkno = bg_blkno; /* Restore */
1644 break;
1645 }
1646 }
1647}
1648
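
The two helpers above are the heart of the discontiguous block group support: a search result is expressed as a bit offset inside the group, and ocfs2_bg_discontig_fix_by_rec() translates that offset into a disk block through whichever extent record covers it, clamping sr_bits so the allocation never crosses a record boundary. A standalone sketch of the arithmetic with hypothetical numbers:

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical extent record covering bits [32, 64) of a group */
		unsigned int bpc = 8;             /* cl_bpc: bits per cluster */
		unsigned int e_cpos = 4;          /* record starts at cluster 4 */
		unsigned int e_leaf_clusters = 4; /* record spans 4 clusters */
		unsigned long long e_blkno = 5000;

		unsigned int bitoff = e_cpos * bpc;            /* 32 */
		unsigned int bitcount = e_leaf_clusters * bpc; /* 32 */

		unsigned int sr_bit_offset = 40;  /* result lands in the record */
		unsigned int sr_bits = 30;        /* ...but runs past its end */

		unsigned long long sr_blkno = e_blkno + (sr_bit_offset - bitoff);
		if (sr_bit_offset + sr_bits > bitoff + bitcount)
			sr_bits = (bitoff + bitcount) - sr_bit_offset; /* clamp */

		printf("blkno=%llu bits=%u\n", sr_blkno, sr_bits); /* 5008, 24 */
		return 0;
	}
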
1402static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, 1649static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1403 handle_t *handle, 1650 handle_t *handle,
1404 u32 bits_wanted, 1651 u32 bits_wanted,
1405 u32 min_bits, 1652 u32 min_bits,
1406 -				  u16 *bit_off,
1407 -				  unsigned int *num_bits,
1408 -				  u64 gd_blkno,
1653 +				  struct ocfs2_suballoc_result *res,
1409 u16 *bits_left) 1654 u16 *bits_left)
1410{ 1655{
1411 int ret; 1656 int ret;
1412 u16 found;
1413 struct buffer_head *group_bh = NULL; 1657 struct buffer_head *group_bh = NULL;
1414 struct ocfs2_group_desc *gd; 1658 struct ocfs2_group_desc *gd;
1415 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1659 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1416 struct inode *alloc_inode = ac->ac_inode; 1660 struct inode *alloc_inode = ac->ac_inode;
1417 1661
1418 -	ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1419 -					  &group_bh);
1662 +	ret = ocfs2_read_group_descriptor(alloc_inode, di,
1663 +					  res->sr_bg_blkno, &group_bh);
1420 if (ret < 0) { 1664 if (ret < 0) {
1421 mlog_errno(ret); 1665 mlog_errno(ret);
1422 return ret; 1666 return ret;
@@ -1424,17 +1668,18 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1424 1668
1425 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1669 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1426 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1670 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1427 -				  ac->ac_max_block, bit_off, &found);
1671 +				  ac->ac_max_block, res);
1428 if (ret < 0) { 1672 if (ret < 0) {
1429 if (ret != -ENOSPC) 1673 if (ret != -ENOSPC)
1430 mlog_errno(ret); 1674 mlog_errno(ret);
1431 goto out; 1675 goto out;
1432 } 1676 }
1433 1677
1434 -	*num_bits = found;
1678 +	if (!ret)
1679 +		ocfs2_bg_discontig_fix_result(ac, gd, res);
1435 1680
1436 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1681 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1437 -					       *num_bits,
1682 +					       res->sr_bits,
1438 le16_to_cpu(gd->bg_chain)); 1683 le16_to_cpu(gd->bg_chain));
1439 if (ret < 0) { 1684 if (ret < 0) {
1440 mlog_errno(ret); 1685 mlog_errno(ret);
@@ -1442,7 +1687,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1442 } 1687 }
1443 1688
1444 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1689 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1445 -					 *bit_off, *num_bits);
1690 +					 res->sr_bit_offset, res->sr_bits);
1446 if (ret < 0) 1691 if (ret < 0)
1447 mlog_errno(ret); 1692 mlog_errno(ret);
1448 1693
@@ -1458,13 +1703,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1458 handle_t *handle, 1703 handle_t *handle,
1459 u32 bits_wanted, 1704 u32 bits_wanted,
1460 u32 min_bits, 1705 u32 min_bits,
1461 -			      u16 *bit_off,
1462 -			      unsigned int *num_bits,
1463 -			      u64 *bg_blkno,
1706 +			      struct ocfs2_suballoc_result *res,
1464 u16 *bits_left) 1707 u16 *bits_left)
1465{ 1708{
1466 int status; 1709 int status;
1467 -	u16 chain, tmp_bits;
1710 +	u16 chain;
1468 u32 tmp_used; 1711 u32 tmp_used;
1469 u64 next_group; 1712 u64 next_group;
1470 struct inode *alloc_inode = ac->ac_inode; 1713 struct inode *alloc_inode = ac->ac_inode;
@@ -1493,8 +1736,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1493 * the 1st group with any empty bits. */ 1736 * the 1st group with any empty bits. */
1494 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1737 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1495 bits_wanted, min_bits, 1738 bits_wanted, min_bits,
1496 -					     ac->ac_max_block, bit_off,
1497 -					     &tmp_bits)) == -ENOSPC) {
1739 +					     ac->ac_max_block,
1740 +					     res)) == -ENOSPC) {
1498 if (!bg->bg_next_group) 1741 if (!bg->bg_next_group)
1499 break; 1742 break;
1500 1743
@@ -1519,11 +1762,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1519 } 1762 }
1520 1763
1521 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1764 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1522 -	     tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1765 +	     res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1523 1766
1524 -	*num_bits = tmp_bits;
1767 +	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1768 +
1769 +	BUG_ON(res->sr_bits == 0);
1770 +	if (!status)
1771 +		ocfs2_bg_discontig_fix_result(ac, bg, res);
1525  1772
1526 -	BUG_ON(*num_bits == 0);
1527 1773
1528 /* 1774 /*
1529 * Keep track of previous block descriptor read. When 1775 * Keep track of previous block descriptor read. When
@@ -1540,7 +1786,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1540 */ 1786 */
1541 if (ac->ac_allow_chain_relink && 1787 if (ac->ac_allow_chain_relink &&
1542 (prev_group_bh) && 1788 (prev_group_bh) &&
1543 -	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1789 +	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1544 status = ocfs2_relink_block_group(handle, alloc_inode, 1790 status = ocfs2_relink_block_group(handle, alloc_inode,
1545 ac->ac_bh, group_bh, 1791 ac->ac_bh, group_bh,
1546 prev_group_bh, chain); 1792 prev_group_bh, chain);
@@ -1562,31 +1808,24 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1562 } 1808 }
1563 1809
1564 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 1810 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1565 -	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1566 -	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1567 -
1568 -	status = ocfs2_journal_dirty(handle,
1569 -				     ac->ac_bh);
1570 -	if (status < 0) {
1571 -		mlog_errno(status);
1572 -		goto bail;
1573 -	}
1811 +	fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
1812 +	le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
1813 +	ocfs2_journal_dirty(handle, ac->ac_bh);
1574 1814
1575 status = ocfs2_block_group_set_bits(handle, 1815 status = ocfs2_block_group_set_bits(handle,
1576 alloc_inode, 1816 alloc_inode,
1577 bg, 1817 bg,
1578 group_bh, 1818 group_bh,
1579 -					    *bit_off,
1580 -					    *num_bits);
1819 +					    res->sr_bit_offset,
1820 +					    res->sr_bits);
1581 if (status < 0) { 1821 if (status < 0) {
1582 mlog_errno(status); 1822 mlog_errno(status);
1583 goto bail; 1823 goto bail;
1584 } 1824 }
1585 1825
1586 -	mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1826 +	mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1587 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1827 (unsigned long long)le64_to_cpu(fe->i_blkno));
1588 1828
1589 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1590 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1829 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1591bail: 1830bail:
1592 brelse(group_bh); 1831 brelse(group_bh);
@@ -1597,19 +1836,15 @@ bail:
1597} 1836}
1598 1837
1599/* will give out up to bits_wanted contiguous bits. */ 1838/* will give out up to bits_wanted contiguous bits. */
1600 -static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1601 -				     struct ocfs2_alloc_context *ac,
1839 +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1602 handle_t *handle, 1840 handle_t *handle,
1603 u32 bits_wanted, 1841 u32 bits_wanted,
1604 u32 min_bits, 1842 u32 min_bits,
1605 -				     u16 *bit_off,
1606 -				     unsigned int *num_bits,
1607 -				     u64 *bg_blkno)
1843 +				     struct ocfs2_suballoc_result *res)
1608{ 1844{
1609 int status; 1845 int status;
1610 u16 victim, i; 1846 u16 victim, i;
1611 u16 bits_left = 0; 1847 u16 bits_left = 0;
1612 u64 hint_blkno = ac->ac_last_group;
1613 struct ocfs2_chain_list *cl; 1848 struct ocfs2_chain_list *cl;
1614 struct ocfs2_dinode *fe; 1849 struct ocfs2_dinode *fe;
1615 1850
@@ -1627,7 +1862,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1627 1862
1628 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1863 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1629 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1864 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1630 -		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1865 +		ocfs2_error(ac->ac_inode->i_sb,
1866 +			    "Chain allocator dinode %llu has %u used "
1631 "bits but only %u total.", 1867 "bits but only %u total.",
1632 (unsigned long long)le64_to_cpu(fe->i_blkno), 1868 (unsigned long long)le64_to_cpu(fe->i_blkno),
1633 le32_to_cpu(fe->id1.bitmap1.i_used), 1869 le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1636,22 +1872,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1636 goto bail; 1872 goto bail;
1637 } 1873 }
1638 1874
1639 -	if (hint_blkno) {
1875 +	res->sr_bg_blkno = ac->ac_last_group;
1876 +	if (res->sr_bg_blkno) {
1640 /* Attempt to short-circuit the usual search mechanism 1877 /* Attempt to short-circuit the usual search mechanism
1641 * by jumping straight to the most recently used 1878 * by jumping straight to the most recently used
1642 * allocation group. This helps us maintain some 1879 * allocation group. This helps us maintain some
1643 * contiguousness across allocations. */ 1880 * contiguousness across allocations. */
1644 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1881 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1645 min_bits, bit_off, num_bits, 1882 min_bits, res, &bits_left);
1646 hint_blkno, &bits_left); 1883 if (!status)
1647 if (!status) {
1648 /* Be careful to update *bg_blkno here as the
1649 * caller is expecting it to be filled in, and
1650 * ocfs2_search_one_group() won't do that for
1651 * us. */
1652 *bg_blkno = hint_blkno;
1653 goto set_hint; 1884 goto set_hint;
1654 }
1655 if (status < 0 && status != -ENOSPC) { 1885 if (status < 0 && status != -ENOSPC) {
1656 mlog_errno(status); 1886 mlog_errno(status);
1657 goto bail; 1887 goto bail;
@@ -1664,8 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1664 ac->ac_chain = victim; 1894 ac->ac_chain = victim;
1665 ac->ac_allow_chain_relink = 1; 1895 ac->ac_allow_chain_relink = 1;
1666 1896
1667 -	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1668 -				    num_bits, bg_blkno, &bits_left);
1897 +	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1898 +				    res, &bits_left);
1669 if (!status) 1899 if (!status)
1670 goto set_hint; 1900 goto set_hint;
1671 if (status < 0 && status != -ENOSPC) { 1901 if (status < 0 && status != -ENOSPC) {
@@ -1689,8 +1919,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1689 1919
1690 ac->ac_chain = i; 1920 ac->ac_chain = i;
1691 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1921 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1692 -					    bit_off, num_bits, bg_blkno,
1693 -					    &bits_left);
1922 +					    res, &bits_left);
1694 if (!status) 1923 if (!status)
1695 break; 1924 break;
1696 if (status < 0 && status != -ENOSPC) { 1925 if (status < 0 && status != -ENOSPC) {
@@ -1707,7 +1936,7 @@ set_hint:
1707 if (bits_left < min_bits) 1936 if (bits_left < min_bits)
1708 ac->ac_last_group = 0; 1937 ac->ac_last_group = 0;
1709 else 1938 else
1710 -			ac->ac_last_group = *bg_blkno;
1939 +			ac->ac_last_group = res->sr_bg_blkno;
1711 } 1940 }
1712 1941
1713bail: 1942bail:
@@ -1715,37 +1944,37 @@ bail:
1715 return status; 1944 return status;
1716} 1945}
1717 1946
1718 -int ocfs2_claim_metadata(struct ocfs2_super *osb,
1719 -			 handle_t *handle,
1947 +int ocfs2_claim_metadata(handle_t *handle,
1720 struct ocfs2_alloc_context *ac, 1948 struct ocfs2_alloc_context *ac,
1721 u32 bits_wanted, 1949 u32 bits_wanted,
1950 u64 *suballoc_loc,
1722 u16 *suballoc_bit_start, 1951 u16 *suballoc_bit_start,
1723 unsigned int *num_bits, 1952 unsigned int *num_bits,
1724 u64 *blkno_start) 1953 u64 *blkno_start)
1725{ 1954{
1726 int status; 1955 int status;
1727 -	u64 bg_blkno;
1956 +	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1728 1957
1729 BUG_ON(!ac); 1958 BUG_ON(!ac);
1730 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); 1959 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1731 BUG_ON(ac->ac_which != OCFS2_AC_USE_META); 1960 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1732 1961
1733 -	status = ocfs2_claim_suballoc_bits(osb,
1734 -					   ac,
1962 +	status = ocfs2_claim_suballoc_bits(ac,
1735 handle, 1963 handle,
1736 bits_wanted, 1964 bits_wanted,
1737 1, 1965 1,
1738 -					   suballoc_bit_start,
1739 -					   num_bits,
1740 -					   &bg_blkno);
1966 +					   &res);
1741 if (status < 0) { 1967 if (status < 0) {
1742 mlog_errno(status); 1968 mlog_errno(status);
1743 goto bail; 1969 goto bail;
1744 } 1970 }
1745 -	atomic_inc(&osb->alloc_stats.bg_allocs);
1971 +	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1746 1972
1747 -	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1748 -	ac->ac_bits_given += (*num_bits);
1973 +	*suballoc_loc = res.sr_bg_blkno;
1974 +	*suballoc_bit_start = res.sr_bit_offset;
1975 +	*blkno_start = res.sr_blkno;
1976 +	ac->ac_bits_given += res.sr_bits;
1977 +	*num_bits = res.sr_bits;
1749 status = 0; 1978 status = 0;
1750bail: 1979bail:
1751 mlog_exit(status); 1980 mlog_exit(status);
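
Callers of the reworked ocfs2_claim_metadata() now receive the group location alongside the bit and block; the ocfs2_create_xattr_block() hunk later in this diff is a real example, condensed here:

	u64 suballoc_loc, first_blkno;
	u16 suballoc_bit_start;
	u32 num_got;

	ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
				   &suballoc_loc, &suballoc_bit_start,
				   &num_got, &first_blkno);
	/* suballoc_loc is then recorded in the new object's
	 * xb_suballoc_loc / i_suballoc_loc field so the owning group
	 * can be found again at free time, even when discontiguous. */
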
@@ -1753,10 +1982,10 @@ bail:
1753} 1982}
1754 1983
1755static void ocfs2_init_inode_ac_group(struct inode *dir, 1984static void ocfs2_init_inode_ac_group(struct inode *dir,
1756 -				      struct buffer_head *parent_fe_bh,
1985 +				      struct buffer_head *parent_di_bh,
1757 struct ocfs2_alloc_context *ac) 1986 struct ocfs2_alloc_context *ac)
1758{ 1987{
1759 -	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1988 +	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
1760 /* 1989 /*
1761 * Try to allocate inodes from some specific group. 1990 * Try to allocate inodes from some specific group.
1762 * 1991 *
@@ -1770,10 +1999,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
1770 if (OCFS2_I(dir)->ip_last_used_group && 1999 if (OCFS2_I(dir)->ip_last_used_group &&
1771 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 2000 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1772 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 2001 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1773 -	else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1774 -		ac->ac_last_group = ocfs2_which_suballoc_group(
1775 -					le64_to_cpu(fe->i_blkno),
1776 -					le16_to_cpu(fe->i_suballoc_bit));
2002 +	else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2003 +		if (di->i_suballoc_loc)
2004 +			ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2005 +		else
2006 +			ac->ac_last_group = ocfs2_which_suballoc_group(
2007 +					le64_to_cpu(di->i_blkno),
2008 +					le16_to_cpu(di->i_suballoc_bit));
2009 +	}
1777} 2010}
1778 2011
1779static inline void ocfs2_save_inode_ac_group(struct inode *dir, 2012static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1783,17 +2016,16 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1783 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2016 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1784} 2017}
1785 2018
1786 -int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1787 -			  handle_t *handle,
2019 +int ocfs2_claim_new_inode(handle_t *handle,
1788 struct inode *dir, 2020 struct inode *dir,
1789 struct buffer_head *parent_fe_bh, 2021 struct buffer_head *parent_fe_bh,
1790 struct ocfs2_alloc_context *ac, 2022 struct ocfs2_alloc_context *ac,
2023 u64 *suballoc_loc,
1791 u16 *suballoc_bit, 2024 u16 *suballoc_bit,
1792 u64 *fe_blkno) 2025 u64 *fe_blkno)
1793{ 2026{
1794 int status; 2027 int status;
1795 -	unsigned int num_bits;
1796 -	u64 bg_blkno;
2028 +	struct ocfs2_suballoc_result res;
1797 2029
1798 mlog_entry_void(); 2030 mlog_entry_void();
1799 2031
@@ -1804,23 +2036,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1804 2036
1805 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2037 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1806 2038
1807 -	status = ocfs2_claim_suballoc_bits(osb,
1808 -					   ac,
2039 +	status = ocfs2_claim_suballoc_bits(ac,
1809 handle, 2040 handle,
1810 1, 2041 1,
1811 1, 2042 1,
1812 -					   suballoc_bit,
1813 -					   &num_bits,
1814 -					   &bg_blkno);
2043 +					   &res);
1815 if (status < 0) { 2044 if (status < 0) {
1816 mlog_errno(status); 2045 mlog_errno(status);
1817 goto bail; 2046 goto bail;
1818 } 2047 }
1819 -	atomic_inc(&osb->alloc_stats.bg_allocs);
2048 +	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1820 2049
1821 -	BUG_ON(num_bits != 1);
2050 +	BUG_ON(res.sr_bits != 1);
1822 2051
1823 -	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
2052 +	*suballoc_loc = res.sr_bg_blkno;
2053 +	*suballoc_bit = res.sr_bit_offset;
2054 +	*fe_blkno = res.sr_blkno;
1824 ac->ac_bits_given++; 2055 ac->ac_bits_given++;
1825 ocfs2_save_inode_ac_group(dir, ac); 2056 ocfs2_save_inode_ac_group(dir, ac);
1826 status = 0; 2057 status = 0;
@@ -1890,8 +2121,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1890 * contig. allocation, set to '1' to indicate we can deal with extents 2121 * contig. allocation, set to '1' to indicate we can deal with extents
1891 * of any size. 2122 * of any size.
1892 */ 2123 */
1893 -int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1894 -			   handle_t *handle,
2124 +int __ocfs2_claim_clusters(handle_t *handle,
1895 struct ocfs2_alloc_context *ac, 2125 struct ocfs2_alloc_context *ac,
1896 u32 min_clusters, 2126 u32 min_clusters,
1897 u32 max_clusters, 2127 u32 max_clusters,
@@ -1900,8 +2130,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1900{ 2130{
1901 int status; 2131 int status;
1902 unsigned int bits_wanted = max_clusters; 2132 unsigned int bits_wanted = max_clusters;
1903 -	u64 bg_blkno = 0;
1904 -	u16 bg_bit_off;
2133 +	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2134 +	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
1905 2135
1906 mlog_entry_void(); 2136 mlog_entry_void();
1907 2137
@@ -1911,6 +2141,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1911 && ac->ac_which != OCFS2_AC_USE_MAIN); 2141 && ac->ac_which != OCFS2_AC_USE_MAIN);
1912 2142
1913 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 2143 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2144 WARN_ON(min_clusters > 1);
2145
1914 status = ocfs2_claim_local_alloc_bits(osb, 2146 status = ocfs2_claim_local_alloc_bits(osb,
1915 handle, 2147 handle,
1916 ac, 2148 ac,
@@ -1933,20 +2165,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1933 if (bits_wanted > (osb->bitmap_cpg - 1)) 2165 if (bits_wanted > (osb->bitmap_cpg - 1))
1934 bits_wanted = osb->bitmap_cpg - 1; 2166 bits_wanted = osb->bitmap_cpg - 1;
1935 2167
1936 -		status = ocfs2_claim_suballoc_bits(osb,
1937 -						   ac,
2168 +		status = ocfs2_claim_suballoc_bits(ac,
1938 handle, 2169 handle,
1939 bits_wanted, 2170 bits_wanted,
1940 min_clusters, 2171 min_clusters,
1941 -						   &bg_bit_off,
1942 -						   num_clusters,
1943 -						   &bg_blkno);
2172 +						   &res);
1944 if (!status) { 2173 if (!status) {
2174 BUG_ON(res.sr_blkno); /* cluster alloc can't set */
1945 *cluster_start = 2175 *cluster_start =
1946 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 2176 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1947 -						 bg_blkno,
1948 -						 bg_bit_off);
2177 +						 res.sr_bg_blkno,
2178 +						 res.sr_bit_offset);
1949 atomic_inc(&osb->alloc_stats.bitmap_data); 2179 atomic_inc(&osb->alloc_stats.bitmap_data);
2180 *num_clusters = res.sr_bits;
1950 } 2181 }
1951 } 2182 }
1952 if (status < 0) { 2183 if (status < 0) {
@@ -1962,8 +2193,7 @@ bail:
1962 return status; 2193 return status;
1963} 2194}
1964 2195
1965 -int ocfs2_claim_clusters(struct ocfs2_super *osb,
1966 -			 handle_t *handle,
2196 +int ocfs2_claim_clusters(handle_t *handle,
1967 struct ocfs2_alloc_context *ac, 2197 struct ocfs2_alloc_context *ac,
1968 u32 min_clusters, 2198 u32 min_clusters,
1969 u32 *cluster_start, 2199 u32 *cluster_start,
@@ -1971,22 +2201,22 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1971{ 2201{
1972 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 2202 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1973 2203
1974 -	return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
2204 +	return __ocfs2_claim_clusters(handle, ac, min_clusters,
1975 bits_wanted, cluster_start, num_clusters); 2205 bits_wanted, cluster_start, num_clusters);
1976} 2206}
1977 2207
1978 -static inline int ocfs2_block_group_clear_bits(handle_t *handle,
2208 +static int ocfs2_block_group_clear_bits(handle_t *handle,
1979 struct inode *alloc_inode, 2209 struct inode *alloc_inode,
1980 struct ocfs2_group_desc *bg, 2210 struct ocfs2_group_desc *bg,
1981 struct buffer_head *group_bh, 2211 struct buffer_head *group_bh,
1982 unsigned int bit_off, 2212 unsigned int bit_off,
1983 -					 unsigned int num_bits)
2213 +					 unsigned int num_bits,
2214 +					 void (*undo_fn)(unsigned int bit,
2215 +							 unsigned long *bmap))
1984{ 2216{
1985 int status; 2217 int status;
1986 unsigned int tmp; 2218 unsigned int tmp;
1987 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1988 struct ocfs2_group_desc *undo_bg = NULL; 2219 struct ocfs2_group_desc *undo_bg = NULL;
1989 int cluster_bitmap = 0;
1990 2220
1991 mlog_entry_void(); 2221 mlog_entry_void();
1992 2222
@@ -1996,20 +2226,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1996 2226
1997 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 2227 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1998 2228
1999 -	if (ocfs2_is_cluster_bitmap(alloc_inode))
2000 -		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
2001 -
2229 +	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2002 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 2230 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2003 -					 group_bh, journal_type);
2231 +					 group_bh,
2232 +					 undo_fn ?
2233 +					 OCFS2_JOURNAL_ACCESS_UNDO :
2234 +					 OCFS2_JOURNAL_ACCESS_WRITE);
2004 if (status < 0) { 2235 if (status < 0) {
2005 mlog_errno(status); 2236 mlog_errno(status);
2006 goto bail; 2237 goto bail;
2007 } 2238 }
2008 2239
2009 -	if (ocfs2_is_cluster_bitmap(alloc_inode))
2010 -		cluster_bitmap = 1;
2011 -
2012 -	if (cluster_bitmap) {
2240 +	if (undo_fn) {
2013 jbd_lock_bh_state(group_bh); 2241 jbd_lock_bh_state(group_bh);
2014 undo_bg = (struct ocfs2_group_desc *) 2242 undo_bg = (struct ocfs2_group_desc *)
2015 bh2jh(group_bh)->b_committed_data; 2243 bh2jh(group_bh)->b_committed_data;
@@ -2020,18 +2248,16 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
2020 while(tmp--) { 2248 while(tmp--) {
2021 ocfs2_clear_bit((bit_off + tmp), 2249 ocfs2_clear_bit((bit_off + tmp),
2022 (unsigned long *) bg->bg_bitmap); 2250 (unsigned long *) bg->bg_bitmap);
2023 -		if (cluster_bitmap)
2024 -			ocfs2_set_bit(bit_off + tmp,
2025 -				      (unsigned long *) undo_bg->bg_bitmap);
2251 +		if (undo_fn)
2252 +			undo_fn(bit_off + tmp,
2253 +				(unsigned long *) undo_bg->bg_bitmap);
2026 } 2254 }
2027 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2255 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2028 2256
2029 -	if (cluster_bitmap)
2257 +	if (undo_fn)
2030 jbd_unlock_bh_state(group_bh); 2258 jbd_unlock_bh_state(group_bh);
2031 2259
2032 -	status = ocfs2_journal_dirty(handle, group_bh);
2033 -	if (status < 0)
2034 -		mlog_errno(status);
2260 +	ocfs2_journal_dirty(handle, group_bh);
2035bail: 2261bail:
2036 return status; 2262 return status;
2037} 2263}
@@ -2039,12 +2265,14 @@ bail:
2039/* 2265/*
2040 * expects the suballoc inode to already be locked. 2266 * expects the suballoc inode to already be locked.
2041 */ 2267 */
2042 -int ocfs2_free_suballoc_bits(handle_t *handle,
2268 +static int _ocfs2_free_suballoc_bits(handle_t *handle,
2043 struct inode *alloc_inode, 2269 struct inode *alloc_inode,
2044 struct buffer_head *alloc_bh, 2270 struct buffer_head *alloc_bh,
2045 unsigned int start_bit, 2271 unsigned int start_bit,
2046 u64 bg_blkno, 2272 u64 bg_blkno,
2047 -			     unsigned int count)
2273 +			     unsigned int count,
2274 +			     void (*undo_fn)(unsigned int bit,
2275 +					     unsigned long *bitmap))
2048{ 2276{
2049 int status = 0; 2277 int status = 0;
2050 u32 tmp_used; 2278 u32 tmp_used;
@@ -2079,7 +2307,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
2079 2307
2080 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2308 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2081 group, group_bh, 2309 group, group_bh,
2082 -					      start_bit, count);
2310 +					      start_bit, count, undo_fn);
2083 if (status < 0) { 2311 if (status < 0) {
2084 mlog_errno(status); 2312 mlog_errno(status);
2085 goto bail; 2313 goto bail;
@@ -2096,12 +2324,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
2096 count); 2324 count);
2097 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2325 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2098 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2326 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2099 -
2100 -	status = ocfs2_journal_dirty(handle, alloc_bh);
2101 -	if (status < 0) {
2102 -		mlog_errno(status);
2103 -		goto bail;
2104 -	}
2327 +	ocfs2_journal_dirty(handle, alloc_bh);
2105 2328
2106bail: 2329bail:
2107 brelse(group_bh); 2330 brelse(group_bh);
@@ -2110,6 +2333,17 @@ bail:
2110 return status; 2333 return status;
2111} 2334}
2112 2335
2336int ocfs2_free_suballoc_bits(handle_t *handle,
2337 struct inode *alloc_inode,
2338 struct buffer_head *alloc_bh,
2339 unsigned int start_bit,
2340 u64 bg_blkno,
2341 unsigned int count)
2342{
2343 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2344 start_bit, bg_blkno, count, NULL);
2345}
2346
2113int ocfs2_free_dinode(handle_t *handle, 2347int ocfs2_free_dinode(handle_t *handle,
2114 struct inode *inode_alloc_inode, 2348 struct inode *inode_alloc_inode,
2115 struct buffer_head *inode_alloc_bh, 2349 struct buffer_head *inode_alloc_bh,
@@ -2119,15 +2353,19 @@ int ocfs2_free_dinode(handle_t *handle,
2119 u16 bit = le16_to_cpu(di->i_suballoc_bit); 2353 u16 bit = le16_to_cpu(di->i_suballoc_bit);
2120 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2354 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2121 2355
2356 if (di->i_suballoc_loc)
2357 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2122 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 2358 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2123 inode_alloc_bh, bit, bg_blkno, 1); 2359 inode_alloc_bh, bit, bg_blkno, 1);
2124} 2360}
2125 2361
2126 -int ocfs2_free_clusters(handle_t *handle,
2362 +static int _ocfs2_free_clusters(handle_t *handle,
2127 struct inode *bitmap_inode, 2363 struct inode *bitmap_inode,
2128 struct buffer_head *bitmap_bh, 2364 struct buffer_head *bitmap_bh,
2129 u64 start_blk, 2365 u64 start_blk,
2130 -			unsigned int num_clusters)
2366 +			unsigned int num_clusters,
2367 +			void (*undo_fn)(unsigned int bit,
2368 +					unsigned long *bitmap))
2131{ 2369{
2132 int status; 2370 int status;
2133 u16 bg_start_bit; 2371 u16 bg_start_bit;
@@ -2154,9 +2392,9 @@ int ocfs2_free_clusters(handle_t *handle,
2154 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2392 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2155 (unsigned long long)bg_blkno, bg_start_bit); 2393 (unsigned long long)bg_blkno, bg_start_bit);
2156 2394
2157 -	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2395 +	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2158 bg_start_bit, bg_blkno, 2396 bg_start_bit, bg_blkno,
2159 -					  num_clusters);
2397 +					   num_clusters, undo_fn);
2160 if (status < 0) { 2398 if (status < 0) {
2161 mlog_errno(status); 2399 mlog_errno(status);
2162 goto out; 2400 goto out;
@@ -2170,6 +2408,32 @@ out:
2170 return status; 2408 return status;
2171} 2409}
2172 2410
2411int ocfs2_free_clusters(handle_t *handle,
2412 struct inode *bitmap_inode,
2413 struct buffer_head *bitmap_bh,
2414 u64 start_blk,
2415 unsigned int num_clusters)
2416{
2417 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2418 start_blk, num_clusters,
2419 _ocfs2_set_bit);
2420}
2421
2422/*
2423 * Give never-used clusters back to the global bitmap. We don't need
2424 * to protect these bits in the undo buffer.
2425 */
2426int ocfs2_release_clusters(handle_t *handle,
2427 struct inode *bitmap_inode,
2428 struct buffer_head *bitmap_bh,
2429 u64 start_blk,
2430 unsigned int num_clusters)
2431{
2432 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2433 start_blk, num_clusters,
2434 _ocfs2_clear_bit);
2435}
2436
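
The undo_fn parameter lets one helper serve both cases: bits freed from a live allocation must stay set in the journal's committed copy of the bitmap until the transaction commits, while never-used bits need no such protection, as the comment above says. Condensed from the two wrappers (n is a placeholder count):

	/* clusters that were really in use: keep them allocated in the
	 * committed bitmap until commit (undo_fn = _ocfs2_set_bit) */
	ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, start_blk, n);

	/* clusters claimed but never used: clear them in the committed
	 * copy as well (undo_fn = _ocfs2_clear_bit) */
	ocfs2_release_clusters(handle, bitmap_inode, bitmap_bh, start_blk, n);
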
2173static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 2437static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2174{ 2438{
2175 printk("Block Group:\n"); 2439 printk("Block Group:\n");
@@ -2360,7 +2624,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2360 struct buffer_head *alloc_bh, u64 blkno, 2624 struct buffer_head *alloc_bh, u64 blkno,
2361 u16 bit, int *res) 2625 u16 bit, int *res)
2362{ 2626{
2363 struct ocfs2_dinode *alloc_fe; 2627 struct ocfs2_dinode *alloc_di;
2364 struct ocfs2_group_desc *group; 2628 struct ocfs2_group_desc *group;
2365 struct buffer_head *group_bh = NULL; 2629 struct buffer_head *group_bh = NULL;
2366 u64 bg_blkno; 2630 u64 bg_blkno;
@@ -2369,17 +2633,20 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2369 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, 2633 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2370 (unsigned int)bit); 2634 (unsigned int)bit);
2371 2635
2372 -	alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2373 -	if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2636 +	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2637 +	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2374 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", 2638 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2375 (unsigned int)bit, 2639 (unsigned int)bit,
2376 -		     ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2640 +		     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2377 status = -EINVAL; 2641 status = -EINVAL;
2378 goto bail; 2642 goto bail;
2379 } 2643 }
2380 2644
2381 -	bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2382 -	status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2645 +	if (alloc_di->i_suballoc_loc)
2646 +		bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
2647 +	else
2648 +		bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2649 +	status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2383 &group_bh); 2650 &group_bh);
2384 if (status < 0) { 2651 if (status < 0) {
2385 mlog(ML_ERROR, "read group %llu failed %d\n", 2652 mlog(ML_ERROR, "read group %llu failed %d\n",
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index fa60723c43e8..a017dd3ee7d9 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,13 +26,14 @@
26#ifndef _CHAINALLOC_H_ 26#ifndef _CHAINALLOC_H_
27#define _CHAINALLOC_H_ 27#define _CHAINALLOC_H_
28 28
29struct ocfs2_suballoc_result;
29typedef int (group_search_t)(struct inode *, 30typedef int (group_search_t)(struct inode *,
30 struct buffer_head *, 31 struct buffer_head *,
31 u32, /* bits_wanted */ 32 u32, /* bits_wanted */
32 u32, /* min_bits */ 33 u32, /* min_bits */
33 u64, /* max_block */ 34 u64, /* max_block */
34 -			     u16 *,	/* *bit_off */
35 -			     u16 *);	/* *bits_found */
35 +			     struct ocfs2_suballoc_result *);
36 +					/* found bits */
36 37
37struct ocfs2_alloc_context { 38struct ocfs2_alloc_context {
38 struct inode *ac_inode; /* which bitmap are we allocating from? */ 39 struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -54,6 +55,8 @@ struct ocfs2_alloc_context {
54 u64 ac_last_group; 55 u64 ac_last_group;
55 u64 ac_max_block; /* Highest block number to allocate. 0 is 56 u64 ac_max_block; /* Highest block number to allocate. 0 is
56 is the same as ~0 - unlimited */ 57 is the same as ~0 - unlimited */
58
59 struct ocfs2_alloc_reservation *ac_resv;
57}; 60};
58 61
59void ocfs2_init_steal_slots(struct ocfs2_super *osb); 62void ocfs2_init_steal_slots(struct ocfs2_super *osb);
@@ -80,22 +83,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
80 u32 bits_wanted, 83 u32 bits_wanted,
81 struct ocfs2_alloc_context **ac); 84 struct ocfs2_alloc_context **ac);
82 85
83 -int ocfs2_claim_metadata(struct ocfs2_super *osb,
84 -			 handle_t *handle,
86 +int ocfs2_claim_metadata(handle_t *handle,
85 struct ocfs2_alloc_context *ac, 87 struct ocfs2_alloc_context *ac,
86 u32 bits_wanted, 88 u32 bits_wanted,
89 u64 *suballoc_loc,
87 u16 *suballoc_bit_start, 90 u16 *suballoc_bit_start,
88 u32 *num_bits, 91 u32 *num_bits,
89 u64 *blkno_start); 92 u64 *blkno_start);
90 -int ocfs2_claim_new_inode(struct ocfs2_super *osb,
91 -			  handle_t *handle,
93 +int ocfs2_claim_new_inode(handle_t *handle,
92 struct inode *dir, 94 struct inode *dir,
93 struct buffer_head *parent_fe_bh, 95 struct buffer_head *parent_fe_bh,
94 struct ocfs2_alloc_context *ac, 96 struct ocfs2_alloc_context *ac,
97 u64 *suballoc_loc,
95 u16 *suballoc_bit, 98 u16 *suballoc_bit,
96 u64 *fe_blkno); 99 u64 *fe_blkno);
97 -int ocfs2_claim_clusters(struct ocfs2_super *osb,
98 -			 handle_t *handle,
100 +int ocfs2_claim_clusters(handle_t *handle,
99 struct ocfs2_alloc_context *ac, 101 struct ocfs2_alloc_context *ac,
100 u32 min_clusters, 102 u32 min_clusters,
101 u32 *cluster_start, 103 u32 *cluster_start,
@@ -104,8 +106,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
104 * Use this variant of ocfs2_claim_clusters to specify a maximum 106
105 * number of clusters smaller than the allocation reserved. 107 * number of clusters smaller than the allocation reserved.
106 */ 108 */
107 -int __ocfs2_claim_clusters(struct ocfs2_super *osb,
108 -			   handle_t *handle,
109 +int __ocfs2_claim_clusters(handle_t *handle,
109 struct ocfs2_alloc_context *ac, 110 struct ocfs2_alloc_context *ac,
110 u32 min_clusters, 111 u32 min_clusters,
111 u32 max_clusters, 112 u32 max_clusters,
@@ -127,6 +128,11 @@ int ocfs2_free_clusters(handle_t *handle,
127 struct buffer_head *bitmap_bh, 128 struct buffer_head *bitmap_bh,
128 u64 start_blk, 129 u64 start_blk,
129 unsigned int num_clusters); 130 unsigned int num_clusters);
131int ocfs2_release_clusters(handle_t *handle,
132 struct inode *bitmap_inode,
133 struct buffer_head *bitmap_bh,
134 u64 start_blk,
135 unsigned int num_clusters);
130 136
131static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) 137static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
132{ 138{
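
After this header change, any group-search callback takes the result structure in place of the two u16 out-parameters; both implementations in suballoc.c (ocfs2_cluster_group_search and ocfs2_block_group_search, converted above) match the new typedef. A conforming prototype (the name is hypothetical):

	static int example_group_search(struct inode *inode,
					struct buffer_head *group_bh,
					u32 bits_wanted, u32 min_bits,
					u64 max_block,
					struct ocfs2_suballoc_result *res);
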
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index dee03197a494..1c2c39f6f0b6 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -94,7 +94,9 @@ struct mount_options
94 unsigned long mount_opt; 94 unsigned long mount_opt;
95 unsigned int atime_quantum; 95 unsigned int atime_quantum;
96 signed short slot; 96 signed short slot;
97 -	unsigned int localalloc_opt;
97 +	int localalloc_opt;
98 unsigned int resv_level;
99 int dir_resv_level;
98 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 100 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
99}; 101};
100 102
@@ -176,6 +178,8 @@ enum {
176 Opt_noacl, 178 Opt_noacl,
177 Opt_usrquota, 179 Opt_usrquota,
178 Opt_grpquota, 180 Opt_grpquota,
181 Opt_resv_level,
182 Opt_dir_resv_level,
179 Opt_err, 183 Opt_err,
180}; 184};
181 185
@@ -202,6 +206,8 @@ static const match_table_t tokens = {
202 {Opt_noacl, "noacl"}, 206 {Opt_noacl, "noacl"},
203 {Opt_usrquota, "usrquota"}, 207 {Opt_usrquota, "usrquota"},
204 {Opt_grpquota, "grpquota"}, 208 {Opt_grpquota, "grpquota"},
209 {Opt_resv_level, "resv_level=%u"},
210 {Opt_dir_resv_level, "dir_resv_level=%u"},
205 {Opt_err, NULL} 211 {Opt_err, NULL}
206}; 212};
207 213
@@ -1028,8 +1034,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1028 osb->s_atime_quantum = parsed_options.atime_quantum; 1034 osb->s_atime_quantum = parsed_options.atime_quantum;
1029 osb->preferred_slot = parsed_options.slot; 1035 osb->preferred_slot = parsed_options.slot;
1030 osb->osb_commit_interval = parsed_options.commit_interval; 1036 osb->osb_commit_interval = parsed_options.commit_interval;
1031 -	osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
1032 -	osb->local_alloc_bits = osb->local_alloc_default_bits;
1037 +
1038 +	ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
1039 osb->osb_resv_level = parsed_options.resv_level;
1040 osb->osb_dir_resv_level = parsed_options.resv_level;
1041 if (parsed_options.dir_resv_level == -1)
1042 osb->osb_dir_resv_level = parsed_options.resv_level;
1043 else
1044 osb->osb_dir_resv_level = parsed_options.dir_resv_level;
1033 1045
1034 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1046 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1035 if (status) 1047 if (status)
@@ -1285,11 +1297,13 @@ static int ocfs2_parse_options(struct super_block *sb,
1285 options ? options : "(none)"); 1297 options ? options : "(none)");
1286 1298
1287 mopt->commit_interval = 0; 1299 mopt->commit_interval = 0;
1288 -	mopt->mount_opt = 0;
1300 +	mopt->mount_opt = OCFS2_MOUNT_NOINTR;
1289 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1301 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1290 mopt->slot = OCFS2_INVALID_SLOT; 1302 mopt->slot = OCFS2_INVALID_SLOT;
1291 -	mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
1303 +	mopt->localalloc_opt = -1;
1292 mopt->cluster_stack[0] = '\0'; 1304 mopt->cluster_stack[0] = '\0';
1305 mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
1306 mopt->dir_resv_level = -1;
1293 1307
1294 if (!options) { 1308 if (!options) {
1295 status = 1; 1309 status = 1;
@@ -1380,7 +1394,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1380 status = 0; 1394 status = 0;
1381 goto bail; 1395 goto bail;
1382 } 1396 }
1383 -			if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
1397 +			if (option >= 0)
1384 mopt->localalloc_opt = option; 1398 mopt->localalloc_opt = option;
1385 break; 1399 break;
1386 case Opt_localflocks: 1400 case Opt_localflocks:
@@ -1433,6 +1447,28 @@ static int ocfs2_parse_options(struct super_block *sb,
1433 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; 1447 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1434 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1448 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1435 break; 1449 break;
1450 case Opt_resv_level:
1451 if (is_remount)
1452 break;
1453 if (match_int(&args[0], &option)) {
1454 status = 0;
1455 goto bail;
1456 }
1457 if (option >= OCFS2_MIN_RESV_LEVEL &&
1458 option < OCFS2_MAX_RESV_LEVEL)
1459 mopt->resv_level = option;
1460 break;
1461 case Opt_dir_resv_level:
1462 if (is_remount)
1463 break;
1464 if (match_int(&args[0], &option)) {
1465 status = 0;
1466 goto bail;
1467 }
1468 if (option >= OCFS2_MIN_RESV_LEVEL &&
1469 option < OCFS2_MAX_RESV_LEVEL)
1470 mopt->dir_resv_level = option;
1471 break;
1436 default: 1472 default:
1437 mlog(ML_ERROR, 1473 mlog(ML_ERROR,
1438 "Unrecognized mount option \"%s\" " 1474 "Unrecognized mount option \"%s\" "
@@ -1487,7 +1523,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1487 (unsigned) (osb->osb_commit_interval / HZ)); 1523 (unsigned) (osb->osb_commit_interval / HZ));
1488 1524
1489 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); 1525 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
1490 -	if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
1526 +	if (local_alloc_megs != ocfs2_la_default_mb(osb))
1491 seq_printf(s, ",localalloc=%d", local_alloc_megs); 1527 seq_printf(s, ",localalloc=%d", local_alloc_megs);
1492 1528
1493 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 1529 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1514,6 +1550,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1514 else 1550 else
1515 seq_printf(s, ",noacl"); 1551 seq_printf(s, ",noacl");
1516 1552
1553 if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
1554 seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
1555
1556 if (osb->osb_dir_resv_level != osb->osb_resv_level)
1557 		seq_printf(s, ",dir_resv_level=%d", osb->osb_dir_resv_level);
1558
1517 return 0; 1559 return 0;
1518} 1560}
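
The two new options round-trip through parsing and ocfs2_show_options(): resv_level=%u and dir_resv_level=%u are accepted within [OCFS2_MIN_RESV_LEVEL, OCFS2_MAX_RESV_LEVEL) and ignored on remount (e.g. mount -o resv_level=4,dir_resv_level=2 with illustrative values; the bounds themselves are defined outside these hunks). The effective-level selection from the fill_super hunk, condensed:

	/* dir_resv_level falls back to resv_level when left at its
	 * -1 sentinel (sketch of the logic in ocfs2_fill_super above) */
	osb->osb_resv_level = parsed_options.resv_level;
	osb->osb_dir_resv_level = (parsed_options.dir_resv_level == -1) ?
		parsed_options.resv_level : parsed_options.dir_resv_level;
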
1519 1561
@@ -1688,6 +1730,8 @@ static void ocfs2_inode_init_once(void *data)
1688 oi->ip_blkno = 0ULL; 1730 oi->ip_blkno = 0ULL;
1689 oi->ip_clusters = 0; 1731 oi->ip_clusters = 0;
1690 1732
1733 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1734
1691 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1735 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1692 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1736 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1693 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1737 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
@@ -2042,6 +2086,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
2042 2086
2043 init_waitqueue_head(&osb->osb_mount_event); 2087 init_waitqueue_head(&osb->osb_mount_event);
2044 2088
2089 status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
2090 if (status) {
2091 mlog_errno(status);
2092 goto bail;
2093 }
2094
2045 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); 2095 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
2046 if (!osb->vol_label) { 2096 if (!osb->vol_label) {
2047 mlog(ML_ERROR, "unable to alloc vol label\n"); 2097 mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -2224,9 +2274,11 @@ static int ocfs2_initialize_super(struct super_block *sb,
2224 } 2274 }
2225 2275
2226 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 2276 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
2277 osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
2227 iput(inode); 2278 iput(inode);
2228 2279
2229 -	osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
2280 +	osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
2281 +						  osb->s_feature_incompat) * 8;
2230 2282
2231 status = ocfs2_init_slot_info(osb); 2283 status = ocfs2_init_slot_info(osb);
2232 if (status < 0) { 2284 if (status < 0) {
@@ -2509,5 +2561,25 @@ void __ocfs2_abort(struct super_block* sb,
2509 ocfs2_handle_error(sb); 2561 ocfs2_handle_error(sb);
2510} 2562}
2511 2563
2564/*
2565 * Void signal blockers, because in-kernel sigprocmask() only fails
2566 * when SIG_* is wrong.
2567 */
2568void ocfs2_block_signals(sigset_t *oldset)
2569{
2570 int rc;
2571 sigset_t blocked;
2572
2573 sigfillset(&blocked);
2574 rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
2575 BUG_ON(rc);
2576}
2577
2578void ocfs2_unblock_signals(sigset_t *oldset)
2579{
2580 int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
2581 BUG_ON(rc);
2582}
2583
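
A minimal usage sketch of the new pair (the critical section is illustrative; the real callers are elsewhere in this series):

	sigset_t oldset;

	ocfs2_block_signals(&oldset);
	/* ... work that must not be interrupted or restarted by a
	 *     signal, e.g. a cluster-locked, journaled update ... */
	ocfs2_unblock_signals(&oldset);
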
2512module_init(ocfs2_init); 2584module_init(ocfs2_init);
2513module_exit(ocfs2_exit); 2585module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..40c7de084c10 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb,
45 45
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 47
48/*
49 * Void signal blockers, because in-kernel sigprocmask() only fails
50 * when SIG_* is wrong.
51 */
52void ocfs2_block_signals(sigset_t *oldset);
53void ocfs2_unblock_signals(sigset_t *oldset);
54
48#endif /* OCFS2_SUPER_H */ 55#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948c..bfe7190cdbf1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#define MLOG_MASK_PREFIX ML_INODE 30#define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1b0d386f6d1..98ee6c44102d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
79 struct ocfs2_alloc_context *meta_ac; 79 struct ocfs2_alloc_context *meta_ac;
80 struct ocfs2_alloc_context *data_ac; 80 struct ocfs2_alloc_context *data_ac;
81 struct ocfs2_cached_dealloc_ctxt dealloc; 81 struct ocfs2_cached_dealloc_ctxt dealloc;
82 int set_abort;
82}; 83};
83 84
84#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) 85#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
@@ -739,11 +740,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
739 goto leave; 740 goto leave;
740 } 741 }
741 742
742 -	status = ocfs2_journal_dirty(handle, vb->vb_bh);
743 -	if (status < 0) {
744 -		mlog_errno(status);
745 -		goto leave;
746 -	}
743 +	ocfs2_journal_dirty(handle, vb->vb_bh);
747 744
748 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; 745 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
749 746
@@ -786,12 +783,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
786 } 783 }
787 784
788 le32_add_cpu(&vb->vb_xv->xr_clusters, -len); 785 le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
789 -
790 -	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
791 -	if (ret) {
792 -		mlog_errno(ret);
793 -		goto out;
794 -	}
786 +	ocfs2_journal_dirty(handle, vb->vb_bh);
795 787
796 if (ext_flags & OCFS2_EXT_REFCOUNTED) 788 if (ext_flags & OCFS2_EXT_REFCOUNTED)
797 ret = ocfs2_decrease_refcount(inode, handle, 789 ret = ocfs2_decrease_refcount(inode, handle,
@@ -1374,11 +1366,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1374 memset(bh->b_data + cp_len, 0, 1366 memset(bh->b_data + cp_len, 0,
1375 blocksize - cp_len); 1367 blocksize - cp_len);
1376 1368
1377 -			ret = ocfs2_journal_dirty(handle, bh);
1378 -			if (ret < 0) {
1379 -				mlog_errno(ret);
1380 -				goto out;
1381 -			}
1369 +			ocfs2_journal_dirty(handle, bh);
1382 brelse(bh); 1370 brelse(bh);
1383 bh = NULL; 1371 bh = NULL;
1384 1372
@@ -1622,7 +1610,7 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
1622 /* Now tell xh->xh_entries about it */ 1610 /* Now tell xh->xh_entries about it */
1623 for (i = 0; i < count; i++) { 1611 for (i = 0; i < count; i++) {
1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); 1612 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1625 -		if (offset < namevalue_offset)
1613 +		if (offset <= namevalue_offset)
1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset, 1614 le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
1627 namevalue_size); 1615 namevalue_size);
1628 } 1616 }
@@ -2148,15 +2136,19 @@ alloc_value:
2148 orig_clusters = ocfs2_xa_value_clusters(loc); 2136 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); 2137 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) { 2138 if (rc < 0) {
2151 -			/*
2152 -			 * If we tried to grow an existing external value,
2153 -			 * ocfs2_xa_cleanup_value_truncate() is going to
2154 -			 * let it stand. We have to restore its original
2155 -			 * value size.
2156 -			 */
2157 -			loc->xl_entry->xe_value_size = orig_value_size;
2139 +			ctxt->set_abort = 1;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing", 2140 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters); 2141 orig_clusters);
2142 /*
2143 * If we were growing an existing value,
2144 * ocfs2_xa_cleanup_value_truncate() won't remove
2145 * the entry. We need to restore the original value
2146 * size.
2147 */
2148 if (loc->xl_entry) {
2149 BUG_ON(!orig_value_size);
2150 loc->xl_entry->xe_value_size = orig_value_size;
2151 }
2160 mlog_errno(rc); 2152 mlog_errno(rc);
2161 } 2153 }
2162 } 2154 }
@@ -2479,7 +2471,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
2479 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 2471 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
2480 blk = le64_to_cpu(xb->xb_blkno); 2472 blk = le64_to_cpu(xb->xb_blkno);
2481 bit = le16_to_cpu(xb->xb_suballoc_bit); 2473 bit = le16_to_cpu(xb->xb_suballoc_bit);
2482 -	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2474 +	if (xb->xb_suballoc_loc)
2475 bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
2476 else
2477 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2483 2478
2484 xb_alloc_inode = ocfs2_get_system_file_inode(osb, 2479 xb_alloc_inode = ocfs2_get_system_file_inode(osb,
2485 EXTENT_ALLOC_SYSTEM_INODE, 2480 EXTENT_ALLOC_SYSTEM_INODE,
@@ -2594,9 +2589,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
2594 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2589 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2595 spin_unlock(&oi->ip_lock); 2590 spin_unlock(&oi->ip_lock);
2596 2591
2597 -	ret = ocfs2_journal_dirty(handle, di_bh);
2598 -	if (ret < 0)
2599 -		mlog_errno(ret);
2592 +	ocfs2_journal_dirty(handle, di_bh);
2600out_commit: 2593out_commit:
2601 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 2594 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
2602out: 2595out:
@@ -2724,9 +2717,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode,
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2717 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock); 2718 spin_unlock(&oi->ip_lock);
2726 2719
2727 -	ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 -	if (ret < 0)
2729 -		mlog_errno(ret);
2720 +	ocfs2_journal_dirty(ctxt->handle, di_bh);
2730 2721
2731out: 2722out:
2732 return ret; 2723 return ret;
@@ -2846,9 +2837,8 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2846 int ret; 2837 int ret;
2847 u16 suballoc_bit_start; 2838 u16 suballoc_bit_start;
2848 u32 num_got; 2839 u32 num_got;
2849 -	u64 first_blkno;
2840 +	u64 suballoc_loc, first_blkno;
2850 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; 2841 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
2851 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2852 struct buffer_head *new_bh = NULL; 2842 struct buffer_head *new_bh = NULL;
2853 struct ocfs2_xattr_block *xblk; 2843 struct ocfs2_xattr_block *xblk;
2854 2844
@@ -2859,9 +2849,9 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2859 goto end; 2849 goto end;
2860 } 2850 }
2861 2851
2862 -	ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
2863 -				   &suballoc_bit_start, &num_got,
2864 -				   &first_blkno);
2852 +	ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
2853 +				   &suballoc_loc, &suballoc_bit_start,
2854 +				   &num_got, &first_blkno);
2865 if (ret < 0) { 2855 if (ret < 0) {
2866 mlog_errno(ret); 2856 mlog_errno(ret);
2867 goto end; 2857 goto end;
@@ -2883,8 +2873,10 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2883 memset(xblk, 0, inode->i_sb->s_blocksize); 2873 memset(xblk, 0, inode->i_sb->s_blocksize);
2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2874 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); 2875 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2876 xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2877 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2887 -	xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2878 +	xblk->xb_fs_generation =
2879 +		cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
2888 xblk->xb_blkno = cpu_to_le64(first_blkno); 2880 xblk->xb_blkno = cpu_to_le64(first_blkno);
2889 if (indexed) { 2881 if (indexed) {
2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2882 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
@@ -2956,7 +2948,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2956 ret = ocfs2_xa_set(&loc, xi, ctxt); 2948 ret = ocfs2_xa_set(&loc, xi, ctxt);
2957 if (!ret) 2949 if (!ret)
2958 xs->here = loc.xl_entry; 2950 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC) 2951 else if ((ret != -ENOSPC) || ctxt->set_abort)
2960 goto end; 2952 goto end;
2961 else { 2953 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2954 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
@@ -3312,14 +3304,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3312 goto out; 3304 goto out;
3313 } 3305 }
3314 3306
3315 ret = ocfs2_extend_trans(ctxt->handle, credits + 3307 ret = ocfs2_extend_trans(ctxt->handle, credits);
3316 ctxt->handle->h_buffer_credits);
3317 if (ret) { 3308 if (ret) {
3318 mlog_errno(ret); 3309 mlog_errno(ret);
3319 goto out; 3310 goto out;
3320 } 3311 }
3321 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); 3312 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
3322 } else if (ret == -ENOSPC) { 3313 } else if ((ret == -ENOSPC) && !ctxt->set_abort) {
3323 if (di->i_xattr_loc && !xbs->xattr_bh) { 3314 if (di->i_xattr_loc && !xbs->xattr_bh) {
3324 ret = ocfs2_xattr_block_find(inode, 3315 ret = ocfs2_xattr_block_find(inode,
3325 xi->xi_name_index, 3316 xi->xi_name_index,
@@ -3343,8 +3334,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3343 goto out; 3334 goto out;
3344 } 3335 }
3345 3336
3346 ret = ocfs2_extend_trans(ctxt->handle, credits + 3337 ret = ocfs2_extend_trans(ctxt->handle, credits);
3347 ctxt->handle->h_buffer_credits);
3348 if (ret) { 3338 if (ret) {
3349 mlog_errno(ret); 3339 mlog_errno(ret);
3350 goto out; 3340 goto out;
@@ -3378,8 +3368,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3378 goto out; 3368 goto out;
3379 } 3369 }
3380 3370
3381 ret = ocfs2_extend_trans(ctxt->handle, credits + 3371 ret = ocfs2_extend_trans(ctxt->handle, credits);
3382 ctxt->handle->h_buffer_credits);
3383 if (ret) { 3372 if (ret) {
3384 mlog_errno(ret); 3373 mlog_errno(ret);
3385 goto out; 3374 goto out;
@@ -4249,7 +4238,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4249 u32 bit_off, len; 4238 u32 bit_off, len;
4250 u64 blkno; 4239 u64 blkno;
4251 handle_t *handle = ctxt->handle; 4240 handle_t *handle = ctxt->handle;
4252 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4253 struct ocfs2_inode_info *oi = OCFS2_I(inode); 4241 struct ocfs2_inode_info *oi = OCFS2_I(inode);
4254 struct buffer_head *xb_bh = xs->xattr_bh; 4242 struct buffer_head *xb_bh = xs->xattr_bh;
4255 struct ocfs2_xattr_block *xb = 4243 struct ocfs2_xattr_block *xb =
@@ -4277,7 +4265,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4277 goto out; 4265 goto out;
4278 } 4266 }
4279 4267
4280 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 4268 ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
4281 1, 1, &bit_off, &len); 4269 1, 1, &bit_off, &len);
4282 if (ret) { 4270 if (ret) {
4283 mlog_errno(ret); 4271 mlog_errno(ret);
@@ -4887,8 +4875,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
4887 * We need to update the first bucket of the old extent and all 4875 * We need to update the first bucket of the old extent and all
4888 * the buckets going to the new extent. 4876 * the buckets going to the new extent.
4889 */ 4877 */
4890 credits = ((num_buckets + 1) * blks_per_bucket) + 4878 credits = ((num_buckets + 1) * blks_per_bucket);
4891 handle->h_buffer_credits;
4892 ret = ocfs2_extend_trans(handle, credits); 4879 ret = ocfs2_extend_trans(handle, credits);
4893 if (ret) { 4880 if (ret) {
4894 mlog_errno(ret); 4881 mlog_errno(ret);
@@ -4958,7 +4945,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
4958 u32 *first_hash) 4945 u32 *first_hash)
4959{ 4946{
4960 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 4947 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4961 int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits; 4948 int ret, credits = 2 * blk_per_bucket;
4962 4949
4963 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); 4950 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
4964 4951
@@ -5099,7 +5086,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5099 goto leave; 5086 goto leave;
5100 } 5087 }
5101 5088
5102 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1, 5089 ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
5103 clusters_to_add, &bit_off, &num_bits); 5090 clusters_to_add, &bit_off, &num_bits);
5104 if (ret < 0) { 5091 if (ret < 0) {
5105 if (ret != -ENOSPC) 5092 if (ret != -ENOSPC)
@@ -5153,9 +5140,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5153 goto leave; 5140 goto leave;
5154 } 5141 }
5155 5142
5156 ret = ocfs2_journal_dirty(handle, root_bh); 5143 ocfs2_journal_dirty(handle, root_bh);
5157 if (ret < 0)
5158 mlog_errno(ret);
5159 5144
5160leave: 5145leave:
5161 return ret; 5146 return ret;
@@ -5200,8 +5185,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
5200 * existing bucket. Then we add the last existing bucket, the 5185 * existing bucket. Then we add the last existing bucket, the
5201 * new bucket, and the first bucket (3 * blk_per_bucket). 5186 * new bucket, and the first bucket (3 * blk_per_bucket).
5202 */ 5187 */
5203 credits = (end_blk - target_blk) + (3 * blk_per_bucket) + 5188 credits = (end_blk - target_blk) + (3 * blk_per_bucket);
5204 handle->h_buffer_credits;
5205 ret = ocfs2_extend_trans(handle, credits); 5189 ret = ocfs2_extend_trans(handle, credits);
5206 if (ret) { 5190 if (ret) {
5207 mlog_errno(ret); 5191 mlog_errno(ret);
@@ -5477,12 +5461,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
5477 } 5461 }
5478 5462
5479 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); 5463 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
5480 5464 ocfs2_journal_dirty(handle, root_bh);
5481 ret = ocfs2_journal_dirty(handle, root_bh);
5482 if (ret) {
5483 mlog_errno(ret);
5484 goto out_commit;
5485 }
5486 5465
5487 ret = ocfs2_truncate_log_append(osb, handle, blkno, len); 5466 ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
5488 if (ret) 5467 if (ret)
@@ -6528,13 +6507,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6528 int indexed) 6507 int indexed)
6529{ 6508{
6530 int ret; 6509 int ret;
6531 struct ocfs2_alloc_context *meta_ac;
6532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6510 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6533 struct ocfs2_xattr_set_ctxt ctxt = { 6511 struct ocfs2_xattr_set_ctxt ctxt;
6534 .meta_ac = meta_ac,
6535 };
6536 6512
6537 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 6513 memset(&ctxt, 0, sizeof(ctxt));
6514 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
6538 if (ret < 0) { 6515 if (ret < 0) {
6539 mlog_errno(ret); 6516 mlog_errno(ret);
6540 return ret; 6517 return ret;
@@ -6556,7 +6533,7 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6556 6533
6557 ocfs2_commit_trans(osb, ctxt.handle); 6534 ocfs2_commit_trans(osb, ctxt.handle);
6558out: 6535out:
6559 ocfs2_free_alloc_context(meta_ac); 6536 ocfs2_free_alloc_context(ctxt.meta_ac);
6560 return ret; 6537 return ret;
6561} 6538}
6562 6539
@@ -6937,7 +6914,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
6937 goto out; 6914 goto out;
6938 } 6915 }
6939 6916
6940 ret = ocfs2_claim_clusters(osb, handle, data_ac, 6917 ret = ocfs2_claim_clusters(handle, data_ac,
6941 len, &p_cluster, &num_clusters); 6918 len, &p_cluster, &num_clusters);
6942 if (ret) { 6919 if (ret) {
6943 mlog_errno(ret); 6920 mlog_errno(ret);
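
Three API shifts account for most of the xattr.c churn above: ocfs2_journal_dirty() now returns void and handles journal failure internally, ocfs2_extend_trans() now adds the handle's existing h_buffer_credits itself so callers pass only the extra credits they need, and the ocfs2_claim_*() allocators take the transaction handle directly (the superblock is derived from it) while also reporting which suballocator group (suballoc_loc) new metadata came from. A minimal sketch of the resulting calling convention; do_xattr_update() is a hypothetical caller, not a function from this commit:

	static int do_xattr_update(handle_t *handle, struct buffer_head *bh,
				   int extra_credits)
	{
		int ret;

		/* callers now pass only the additional credits ... */
		ret = ocfs2_extend_trans(handle, extra_credits);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}
		/* ... and dirtying a journalled buffer can no longer fail
		 * from the caller's point of view */
		ocfs2_journal_dirty(handle, bh);
		return 0;
	}
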
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 75d9b5ba1d45..b44bb835e8ea 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -3,9 +3,9 @@
  * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com>
  * Released under GPL v2.
  */
-#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/parser.h>
diff --git a/fs/open.c b/fs/open.c
index e17f54454b50..74e5cd9f718e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -10,7 +10,6 @@
 #include <linux/fdtable.h>
 #include <linux/fsnotify.h>
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/tty.h>
 #include <linux/namei.h>
 #include <linux/backing-dev.h>
@@ -20,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/vfs.h>
 #include <linux/fcntl.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/fs.h>
 #include <linux/personality.h>
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e8865c11777f..e238ab23a9e7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 49cfd5f54238..91babdae7587 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -95,6 +95,7 @@
  ************************************************************/
 #include <linux/crc32.h>
 #include <linux/math64.h>
+#include <linux/slab.h>
 #include "check.h"
 #include "efi.h"
 
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..90be97f1f5a8 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
  */
 #include <asm/unaligned.h>
 
-#define SYS_IND(p)	(get_unaligned(&p->sys_ind))
-#define NR_SECTS(p)	({ __le32 __a = get_unaligned(&p->nr_sects);	\
-				le32_to_cpu(__a); \
-			})
+#define SYS_IND(p)	get_unaligned(&p->sys_ind)
 
-#define START_SECT(p)	({ __le32 __a = get_unaligned(&p->start_sect);	\
-				le32_to_cpu(__a); \
-			})
+static inline sector_t nr_sects(struct partition *p)
+{
+	return (sector_t)get_unaligned_le32(&p->nr_sects);
+}
+
+static inline sector_t start_sect(struct partition *p)
+{
+	return (sector_t)get_unaligned_le32(&p->start_sect);
+}
 
 static inline int is_extended_partition(struct partition *p)
 {
@@ -104,13 +107,13 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
 
 static void
 parse_extended(struct parsed_partitions *state, struct block_device *bdev,
-	       u32 first_sector, u32 first_size)
+	       sector_t first_sector, sector_t first_size)
 {
 	struct partition *p;
 	Sector sect;
 	unsigned char *data;
-	u32 this_sector, this_size;
-	int sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t this_sector, this_size;
+	sector_t sector_size = bdev_logical_block_size(bdev) / 512;
 	int loopct = 0;		/* number of links followed
 				   without finding a data partition */
 	int i;
@@ -145,14 +148,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 		 * First process the data partition(s)
 		 */
 		for (i=0; i<4; i++, p++) {
-			u32 offs, size, next;
-			if (!NR_SECTS(p) || is_extended_partition(p))
+			sector_t offs, size, next;
+			if (!nr_sects(p) || is_extended_partition(p))
 				continue;
 
 			/* Check the 3rd and 4th entries -
			   these sometimes contain random garbage */
-			offs = START_SECT(p)*sector_size;
-			size = NR_SECTS(p)*sector_size;
+			offs = start_sect(p)*sector_size;
+			size = nr_sects(p)*sector_size;
 			next = this_sector + offs;
 			if (i >= 2) {
 				if (offs + size > this_size)
@@ -179,13 +182,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 		 */
 		p -= 4;
 		for (i=0; i<4; i++, p++)
-			if (NR_SECTS(p) && is_extended_partition(p))
+			if (nr_sects(p) && is_extended_partition(p))
 				break;
 		if (i == 4)
 			goto done;	/* nothing left to do */
 
-		this_sector = first_sector + START_SECT(p) * sector_size;
-		this_size = NR_SECTS(p) * sector_size;
+		this_sector = first_sector + start_sect(p) * sector_size;
+		this_size = nr_sects(p) * sector_size;
 		put_dev_sector(sect);
 	}
 done:
@@ -197,7 +200,7 @@ done:
 
 static void
 parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
-		  u32 offset, u32 size, int origin)
+		  sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_SOLARIS_X86_PARTITION
 	Sector sect;
@@ -244,7 +247,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
  */
 static void
 parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
-	  u32 offset, u32 size, int origin, char *flavour,
+	  sector_t offset, sector_t size, int origin, char *flavour,
 	  int max_partitions)
 {
 	Sector sect;
@@ -263,7 +266,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 	if (le16_to_cpu(l->d_npartitions) < max_partitions)
 		max_partitions = le16_to_cpu(l->d_npartitions);
 	for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
-		u32 bsd_start, bsd_size;
+		sector_t bsd_start, bsd_size;
 
 		if (state->next == state->limit)
 			break;
@@ -290,7 +293,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
-	      u32 offset, u32 size, int origin)
+	      sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -300,7 +303,7 @@ parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
-	     u32 offset, u32 size, int origin)
+	     sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -310,7 +313,7 @@ parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
-	      u32 offset, u32 size, int origin)
+	      sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -324,7 +327,7 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
  */
 static void
 parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
-	       u32 offset, u32 size, int origin)
+	       sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_UNIXWARE_DISKLABEL
 	Sector sect;
@@ -348,7 +351,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
 
 		if (p->s_label != UNIXWARE_FS_UNUSED)
 			put_partition(state, state->next++,
-				      START_SECT(p), NR_SECTS(p));
+				      le32_to_cpu(p->start_sect),
+				      le32_to_cpu(p->nr_sects));
 		p++;
 	}
 	put_dev_sector(sect);
@@ -363,7 +367,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
  */
 static void
 parse_minix(struct parsed_partitions *state, struct block_device *bdev,
-	    u32 offset, u32 size, int origin)
+	    sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_MINIX_SUBPARTITION
 	Sector sect;
@@ -390,7 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 			/* add each partition in use */
 			if (SYS_IND(p) == MINIX_PARTITION)
 				put_partition(state, state->next++,
-					      START_SECT(p), NR_SECTS(p));
+					      start_sect(p), nr_sects(p));
 		}
 		printk(" >\n");
 	}
@@ -401,7 +405,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 static struct {
 	unsigned char id;
 	void (*parse)(struct parsed_partitions *, struct block_device *,
-			u32, u32, int);
+			sector_t, sector_t, int);
 } subtypes[] = {
 	{FREEBSD_PARTITION, parse_freebsd},
 	{NETBSD_PARTITION, parse_netbsd},
@@ -415,7 +419,7 @@ static struct {
 
 int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
-	int sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t sector_size = bdev_logical_block_size(bdev) / 512;
 	Sector sect;
 	unsigned char *data;
 	struct partition *p;
@@ -483,14 +487,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 	state->next = 5;
 	for (slot = 1 ; slot <= 4 ; slot++, p++) {
-		u32 start = START_SECT(p)*sector_size;
-		u32 size = NR_SECTS(p)*sector_size;
+		sector_t start = start_sect(p)*sector_size;
+		sector_t size = nr_sects(p)*sector_size;
 		if (!size)
 			continue;
 		if (is_extended_partition(p)) {
-			/* prevent someone doing mkfs or mkswap on an
-			   extended partition, but leave room for LILO */
-			put_partition(state, slot, start, size == 1 ? 1 : 2);
+			/*
+			 * prevent someone doing mkfs or mkswap on an
+			 * extended partition, but leave room for LILO
+			 * FIXME: this uses one logical sector for > 512b
+			 * sector, although it may not be enough/proper.
+			 */
+			sector_t n = 2;
+			n = min(size, max(sector_size, n));
+			put_partition(state, slot, start, n);
+
 			printk(" <");
 			parse_extended(state, bdev, start, size);
 			printk(" >");
@@ -513,7 +524,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 		unsigned char id = SYS_IND(p);
 		int n;
 
-		if (!NR_SECTS(p))
+		if (!nr_sects(p))
 			continue;
 
 		for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +532,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 		if (!subtypes[n].parse)
 			continue;
-		subtypes[n].parse(state, bdev, START_SECT(p)*sector_size,
-				  NR_SECTS(p)*sector_size, slot);
+		subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
+				  nr_sects(p)*sector_size, slot);
 	}
 	put_dev_sector(sect);
 	return 1;
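
The msdos.c conversion above has one theme: all partition offset and size arithmetic moves from u32 to sector_t (64-bit when CONFIG_LBDAF is set), with the statement-expression macros replaced by typed inline helpers built on get_unaligned_le32(). The point is that products such as start_sect(p) * sector_size now widen before multiplying instead of wrapping at 2^32 sectors. A standalone illustration of the widening, in plain C and assuming a 64-bit sector_t:

	#include <stdint.h>

	typedef uint64_t sector_t;	/* mirrors sector_t with CONFIG_LBDAF=y */

	/* u32 * u32 silently wraps; widening one operand first keeps
	 * the full 64-bit result */
	static sector_t scaled_start(uint32_t start_sect, uint32_t sector_size)
	{
		return (sector_t)start_sect * sector_size;
	}
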
diff --git a/fs/proc/array.c b/fs/proc/array.c
index aa8637b81028..885ab5513ac5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -68,7 +68,6 @@
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/signal.h>
 #include <linux/highmem.h>
@@ -82,7 +81,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
-#include <linux/swapops.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -496,7 +494,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		rsslim,
 		mm ? mm->start_code : 0,
 		mm ? mm->end_code : 0,
-		(permitted && mm) ? task->stack_start : 0,
+		(permitted && mm) ? mm->start_stack : 0,
 		esp,
 		eip,
 		/* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a7310841c831..c7f9f23449dc 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,6 +81,7 @@
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 /* NOTE:
@@ -442,12 +443,13 @@ static const struct file_operations proc_lstats_operations = {
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
-	unsigned long points;
+	unsigned long points = 0;
 	struct timespec uptime;
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
-	points = badness(task->group_leader, uptime.tv_sec);
+	if (pid_alive(task))
+		points = badness(task, uptime.tv_sec);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
@@ -728,6 +730,7 @@ out_no_task:
 
 static const struct file_operations proc_info_file_operations = {
 	.read		= proc_info_read,
+	.llseek		= generic_file_llseek,
 };
 
 static int proc_single_show(struct seq_file *m, void *v)
@@ -985,6 +988,7 @@ out_no_task:
 
 static const struct file_operations proc_environ_operations = {
 	.read		= environ_read,
+	.llseek		= generic_file_llseek,
 };
 
 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
@@ -1058,6 +1062,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 static const struct file_operations proc_oom_adjust_operations = {
 	.read		= oom_adjust_read,
 	.write		= oom_adjust_write,
+	.llseek		= generic_file_llseek,
 };
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -1129,6 +1134,7 @@ out_free_page:
 static const struct file_operations proc_loginuid_operations = {
 	.read		= proc_loginuid_read,
 	.write		= proc_loginuid_write,
+	.llseek		= generic_file_llseek,
 };
 
 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
@@ -1149,6 +1155,7 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
 
 static const struct file_operations proc_sessionid_operations = {
 	.read		= proc_sessionid_read,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -1200,6 +1207,7 @@ static ssize_t proc_fault_inject_write(struct file * file,
 static const struct file_operations proc_fault_inject_operations = {
 	.read		= proc_fault_inject_read,
 	.write		= proc_fault_inject_write,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -1941,7 +1949,7 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
 }
 
 static const struct file_operations proc_fdinfo_file_operations = {
-	.open           = nonseekable_open,
+	.open		= nonseekable_open,
 	.read		= proc_fdinfo_read,
 };
 
@@ -2225,6 +2233,7 @@ out_no_task:
 static const struct file_operations proc_pid_attr_operations = {
 	.read		= proc_pid_attr_read,
 	.write		= proc_pid_attr_write,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct pid_entry attr_dir_stuff[] = {
@@ -2345,6 +2354,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
 static const struct file_operations proc_coredump_filter_operations = {
 	.read		= proc_coredump_filter_read,
 	.write		= proc_coredump_filter_write,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -2907,7 +2917,7 @@ out_no_task:
  */
 static const struct pid_entry tid_base_stuff[] = {
 	DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations),
+	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	REG("environ",   S_IRUSR, proc_environ_operations),
 	INF("auxv",      S_IRUSR, proc_pid_auxv),
 	ONE("status",    S_IRUGO, proc_pid_status),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 08f4d71dacd7..43c127490606 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -13,6 +13,7 @@
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/idr.h>
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 445a02bcaab3..aea8502e58a3 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/smp_lock.h>
 #include <linux/sysctl.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -231,9 +232,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
 		if (rv == -ENOIOCTLCMD)
 			rv = -EINVAL;
 	} else if (ioctl) {
-		lock_kernel();
+		WARN_ONCE(1, "Procfs ioctl handlers must use unlocked_ioctl, "
+			  "%pf will be called without the Bkl held\n", ioctl);
 		rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
-		unlock_kernel();
 	}
 
 	pde_users_dec(pde);
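
The proc_reg_unlocked_ioctl() change stops wrapping legacy ->ioctl handlers in lock_kernel()/unlock_kernel() and instead emits a one-time warning. Note the %pf printk format, which resolves a function pointer to its symbol name so the log names the offending handler. The same idiom in isolation (the handler name below is made up):

	/* sketch: warn once, identifying the function pointer via %pf */
	WARN_ONCE(1, "legacy handler %pf called without the BKL\n",
		  some_legacy_ioctl);
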
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..c837a77351be 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -19,6 +19,7 @@
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <linux/list.h>
@@ -490,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		}
 		read_unlock(&kclist_lock);
 
-		if (m == NULL) {
+		if (&m->list == &kclist_head) {
 			if (clear_user(buffer, tsz))
 				return -EFAULT;
 		} else if (is_vmalloc_or_module_addr((void *)start)) {
@@ -557,6 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
 static const struct file_operations proc_kcore_operations = {
 	.read		= read_kcore,
 	.open		= open_kcore,
+	.llseek		= generic_file_llseek,
 };
 
 #ifdef CONFIG_MEMORY_HOTPLUG
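
The read_kcore() fix is the classic list_for_each_entry() termination pitfall: when the loop ends without a break, the cursor is not NULL — it is container_of() applied to the list head itself, a bogus pointer — so end-of-list must be detected by comparing the cursor's list node against the head. The idiom, as a loose sketch reusing the kclist names (matches() is a stand-in predicate, not a kernel function):

	struct kcore_list *m;

	list_for_each_entry(m, &kclist_head, list) {
		if (matches(m, start, tsz))
			break;
	}
	if (&m->list == &kclist_head) {
		/* fell off the end: nothing matched, and m is NOT a
		 * valid kcore_list here -- never dereference it */
	}
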
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index cfe90a48a6e8..bd4b5a740ff1 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -53,6 +53,7 @@ static const struct file_operations proc_kmsg_operations = {
 	.poll		= kmsg_poll,
 	.open		= kmsg_open,
 	.release	= kmsg_release,
+	.llseek		= generic_file_llseek,
 };
 
 static int __init proc_kmsg_init(void)
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 9fe7d7ebe115..b1822dde55c2 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
 #include <linux/mmzone.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index f8650dce74fb..ce94801f48ca 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -12,6 +12,7 @@
 #include <linux/string.h>
 #include <linux/of.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <asm/prom.h>
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270f1c38..9020ac15baaa 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -14,6 +14,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/module.h>
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b9b7aad2003d..bf31b03fc275 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,6 +1,5 @@
 #include <linux/cpumask.h>
 #include <linux/fs.h>
-#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 183f8ff5f400..47f5b145f56e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/swap.h>
@@ -246,25 +247,6 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		} else if (vma->vm_start <= mm->start_stack &&
 			   vma->vm_end >= mm->start_stack) {
 			name = "[stack]";
-		} else {
-			unsigned long stack_start;
-			struct proc_maps_private *pmp;
-
-			pmp = m->private;
-			stack_start = pmp->task->stack_start;
-
-			if (vma->vm_start <= stack_start &&
-			    vma->vm_end >= stack_start) {
-				pad_len_spaces(m, len);
-				seq_printf(m,
-					 "[threadstack:%08lx]",
-#ifdef CONFIG_STACK_GROWSUP
-					 vma->vm_end - stack_start
-#else
-					 stack_start - vma->vm_start
-#endif
-					 );
-			}
 		}
 	} else {
 		name = "[vdso]";
@@ -406,6 +388,7 @@ static int show_smap(struct seq_file *m, void *v)
 
 	memset(&mss, 0, sizeof mss);
 	mss.vma = vma;
+	/* mmap_sem is held in m_start */
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 
@@ -552,7 +535,8 @@ const struct file_operations proc_clear_refs_operations = {
 };
 
 struct pagemapread {
-	u64 __user *out, *end;
+	int pos, len;
+	u64 *buffer;
 };
 
 #define PM_ENTRY_BYTES      sizeof(u64)
@@ -575,10 +559,8 @@ struct pagemapread {
 static int add_to_pagemap(unsigned long addr, u64 pfn,
 			  struct pagemapread *pm)
 {
-	if (put_user(pfn, pm->out))
-		return -EFAULT;
-	pm->out++;
-	if (pm->out >= pm->end)
+	pm->buffer[pm->pos++] = pfn;
+	if (pm->pos >= pm->len)
 		return PM_END_OF_BUFFER;
 	return 0;
 }
@@ -661,31 +643,18 @@ static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
 	return pme;
 }
 
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
-				 unsigned long end, struct mm_walk *walk)
+/* This function walks within one hugetlb entry in the single call */
+static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+				 unsigned long addr, unsigned long end,
+				 struct mm_walk *walk)
 {
-	struct vm_area_struct *vma;
 	struct pagemapread *pm = walk->private;
-	struct hstate *hs = NULL;
 	int err = 0;
+	u64 pfn;
 
-	vma = find_vma(walk->mm, addr);
-	if (vma)
-		hs = hstate_vma(vma);
 	for (; addr != end; addr += PAGE_SIZE) {
-		u64 pfn = PM_NOT_PRESENT;
-
-		if (vma && (addr >= vma->vm_end)) {
-			vma = find_vma(walk->mm, addr);
-			if (vma)
-				hs = hstate_vma(vma);
-		}
-
-		if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
-			/* calculate pfn of the "raw" page in the hugepage. */
-			int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
-			pfn = huge_pte_to_pagemap_entry(*pte, offset);
-		}
+		int offset = (addr & ~hmask) >> PAGE_SHIFT;
+		pfn = huge_pte_to_pagemap_entry(*pte, offset);
 		err = add_to_pagemap(addr, pfn, pm);
 		if (err)
 			return err;
@@ -720,21 +689,20 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
  * determine which areas of memory are actually mapped and llseek to
 * skip over unmapped regions.
 */
+#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
 static ssize_t pagemap_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-	struct page **pages, *page;
-	unsigned long uaddr, uend;
 	struct mm_struct *mm;
 	struct pagemapread pm;
-	int pagecount;
 	int ret = -ESRCH;
 	struct mm_walk pagemap_walk = {};
 	unsigned long src;
 	unsigned long svpfn;
 	unsigned long start_vaddr;
 	unsigned long end_vaddr;
+	int copied = 0;
 
 	if (!task)
 		goto out;
@@ -757,35 +725,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!mm)
 		goto out_task;
 
-
-	uaddr = (unsigned long)buf & PAGE_MASK;
-	uend = (unsigned long)(buf + count);
-	pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
-	ret = 0;
-	if (pagecount == 0)
-		goto out_mm;
-	pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+	pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
 	ret = -ENOMEM;
-	if (!pages)
+	if (!pm.buffer)
 		goto out_mm;
 
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(current, current->mm, uaddr, pagecount,
-			     1, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-
-	if (ret < 0)
-		goto out_free;
-
-	if (ret != pagecount) {
-		pagecount = ret;
-		ret = -EFAULT;
-		goto out_pages;
-	}
-
-	pm.out = (u64 __user *)buf;
-	pm.end = (u64 __user *)(buf + count);
-
 	pagemap_walk.pmd_entry = pagemap_pte_range;
 	pagemap_walk.pte_hole = pagemap_pte_hole;
 	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -807,23 +752,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	 * user buffer is tracked in "pm", and the walk
 	 * will stop when we hit the end of the buffer.
 	 */
-	ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk);
-	if (ret == PM_END_OF_BUFFER)
-		ret = 0;
-	/* don't need mmap_sem for these, but this looks cleaner */
-	*ppos += (char __user *)pm.out - buf;
-	if (!ret)
-		ret = (char __user *)pm.out - buf;
-
-out_pages:
-	for (; pagecount; pagecount--) {
-		page = pages[pagecount-1];
-		if (!PageReserved(page))
-			SetPageDirty(page);
-		page_cache_release(page);
+	ret = 0;
+	while (count && (start_vaddr < end_vaddr)) {
+		int len;
+		unsigned long end;
+
+		pm.pos = 0;
+		end = start_vaddr + PAGEMAP_WALK_SIZE;
+		/* overflow ? */
+		if (end < start_vaddr || end > end_vaddr)
+			end = end_vaddr;
+		down_read(&mm->mmap_sem);
+		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
+		up_read(&mm->mmap_sem);
+		start_vaddr = end;
+
+		len = min(count, PM_ENTRY_BYTES * pm.pos);
+		if (copy_to_user(buf, pm.buffer, len)) {
+			ret = -EFAULT;
+			goto out_free;
+		}
+		copied += len;
+		buf += len;
+		count -= len;
 	}
+	*ppos += copied;
+	if (!ret || ret == PM_END_OF_BUFFER)
+		ret = copied;
+
 out_free:
-	kfree(pages);
+	kfree(pm.buffer);
 out_mm:
 	mmput(mm);
 out_task:
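
The pagemap_read() rewrite replaces pinning the caller's buffer with get_user_pages() by a small kernel bounce buffer that is filled one PAGEMAP_WALK_SIZE chunk at a time, taking mmap_sem only around each walk and doing copy_to_user() outside the lock. The general shape of that pattern, reduced to a sketch — fill_chunk() and CHUNK_BYTES are placeholders, not symbols from this commit:

	static ssize_t chunked_read(char __user *buf, size_t count)
	{
		u64 *kbuf = kmalloc(CHUNK_BYTES, GFP_KERNEL);
		ssize_t copied = 0;
		size_t len;

		if (!kbuf)
			return -ENOMEM;
		while (count) {
			/* produce at most one buffer-full per iteration */
			len = fill_chunk(kbuf, min_t(size_t, count, CHUNK_BYTES));
			if (!len)
				break;
			/* copy to userspace with no locks held */
			if (copy_to_user(buf, kbuf, len)) {
				copied = -EFAULT;
				break;
			}
			buf += len;
			count -= len;
			copied += len;
		}
		kfree(kbuf);
		return copied;
	}
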
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d9fd64ef81a..46d4b5d72bd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -5,6 +5,7 @@
 #include <linux/fs_struct.h>
 #include <linux/mount.h>
 #include <linux/ptrace.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include "internal.h"
 
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0872afa58d39..91c817ff02c3 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
 #include <linux/user.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
@@ -162,6 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 
 static const struct file_operations proc_vmcore_operations = {
 	.read		= read_vmcore,
+	.llseek		= generic_file_llseek,
 };
 
 static struct vmcore* __init get_new_element(void)
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index ebf3440d28ca..277575ddc05c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -201,7 +201,8 @@ static const char *qnx4_checkroot(struct super_block *sb)
 			rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
 			if (rootdir->di_fname != NULL) {
 				QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
-				if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) {
+				if (!strcmp(rootdir->di_fname,
+					    QNX4_BMNAME)) {
 					found = 1;
 					qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
 					if (!qnx4_sb(sb)->BitMap) {
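
The qnx4 change is mostly cosmetic: QNX4_BMNAME expands to a string literal, and strncmp(s, lit, sizeof lit) compares through the literal's NUL terminator, which is exact-match semantics spelled confusingly; strcmp() says the same thing directly. An illustration in plain C, with a generic literal standing in for QNX4_BMNAME:

	#include <string.h>

	/* sizeof "Bitmap" is 7 and includes '\0', so the strncmp form
	 * already rejects both shorter and longer names ... */
	static int is_bitmap_old(const char *name)
	{
		return !strncmp(name, "Bitmap", sizeof "Bitmap");
	}

	/* ... making it equivalent to, but less clear than, strcmp */
	static int is_bitmap_new(const char *name)
	{
		return !strcmp(name, "Bitmap");
	}
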
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index dad7fb247ddc..3e21b1e2ad3a 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -33,6 +33,14 @@ config PRINT_QUOTA_WARNING
 	  Note that this behavior is currently deprecated and may go away in
 	  future. Please use notification via netlink socket instead.
 
+config QUOTA_DEBUG
+	bool "Additional quota sanity checks"
+	depends on QUOTA
+	default n
+	help
+	  If you say Y here, quota subsystem will perform some additional
+	  sanity checks of quota internal structures. If unsure, say N.
+
 # Generic support for tree structured quota files. Selected when needed.
 config QUOTA_TREE
 	tristate
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index e0b870f4749f..788b5802a7ce 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -80,8 +80,6 @@
 
 #include <asm/uaccess.h>
 
-#define __DQUOT_PARANOIA
-
 /*
 * There are three quota SMP locks. dq_list_lock protects all lists with quotas
 * and quota formats, dqstats structure containing statistics about the lists
@@ -695,7 +693,7 @@ void dqput(struct dquot *dquot)
 
 	if (!dquot)
 		return;
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	if (!atomic_read(&dquot->dq_count)) {
 		printk("VFS: dqput: trying to free free dquot\n");
 		printk("VFS: device %s, dquot of %s %d\n",
@@ -748,7 +746,7 @@ we_slept:
 		goto we_slept;
 	}
 	atomic_dec(&dquot->dq_count);
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	/* sanity check */
 	BUG_ON(!list_empty(&dquot->dq_free));
 #endif
@@ -845,7 +843,7 @@ we_slept:
 		dquot = NULL;
 		goto out;
 	}
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	BUG_ON(!dquot->dq_sb);	/* Has somebody invalidated entry under us? */
 #endif
 out:
@@ -874,14 +872,18 @@ static int dqinit_needed(struct inode *inode, int type)
 static void add_dquot_ref(struct super_block *sb, int type)
 {
 	struct inode *inode, *old_inode = NULL;
+#ifdef CONFIG_QUOTA_DEBUG
 	int reserved = 0;
+#endif
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
 			continue;
+#ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
 			reserved = 1;
+#endif
 		if (!atomic_read(&inode->i_writecount))
 			continue;
 		if (!dqinit_needed(inode, type))
@@ -903,11 +905,13 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	spin_unlock(&inode_lock);
 	iput(old_inode);
 
+#ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
 		printk(KERN_WARNING "VFS (%s): Writes happened before quota"
 			" was turned on thus quota information is probably "
 			"inconsistent. Please run quotacheck(8).\n", sb->s_id);
 	}
+#endif
 }
 
 /*
@@ -934,7 +938,7 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
 		inode->i_dquot[type] = NULL;
 		if (dquot) {
 			if (dqput_blocks(dquot)) {
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 				if (atomic_read(&dquot->dq_count) != 1)
 					printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
 #endif
@@ -2322,34 +2326,34 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	if (di->dqb_valid & QIF_SPACE) {
 		dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace;
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_BLIMITS) {
 		dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
 		dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_INODES) {
 		dm->dqb_curinodes = di->dqb_curinodes;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_ILIMITS) {
 		dm->dqb_isoftlimit = di->dqb_isoftlimit;
 		dm->dqb_ihardlimit = di->dqb_ihardlimit;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_BTIME) {
 		dm->dqb_btime = di->dqb_btime;
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_ITIME) {
 		dm->dqb_itime = di->dqb_itime;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
 	}
 
 	if (check_blim) {
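
Two independent cleanups run through dquot.c: the private __DQUOT_PARANOIA define becomes a proper CONFIG_QUOTA_DEBUG Kconfig option (with the reserved-space warning compiled out entirely when it is off), and do_set_dqblk() switches from __set_bit() to set_bit(). The second is a correctness point: dq_flags is a shared word, __set_bit() is the non-atomic variant and can lose a racing update to a neighbouring bit, while set_bit() is an atomic read-modify-write. The distinction, in a sketch:

	unsigned long flags = 0;

	__set_bit(0, &flags);	/* non-atomic: only safe while 'flags' is
				 * unshared or externally locked */
	set_bit(1, &flags);	/* atomic RMW: safe against concurrent
				 * bitops on the same word */
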
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 2663ed90fb03..d67908b407d9 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/quotaops.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/netlink.h>
 #include <net/genetlink.h>
 
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 1739a4aba25f..5ea4ad81a429 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -21,6 +21,7 @@
 #include <linux/pagevec.h>
 #include <linux/mman.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a6090aa1a7c1..f47cd212dee1 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -35,6 +35,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -213,7 +214,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
 	return 0;
 }
 
-static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
+int ramfs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct ramfs_fs_info *fsi;
 	struct inode *inode = NULL;
diff --git a/fs/read_write.c b/fs/read_write.c
index b7f4a1f94d48..113386d6fd2d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -258,6 +258,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
 
 	for (;;) {
 		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +314,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
 
 	for (;;) {
 		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
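
The two do_sync_* hunks are defensive: struct kiocb of this era carries both ki_left (bytes remaining) and ki_nbytes (total request size), and some ->aio_read()/->aio_write() implementations consult ki_nbytes, so the synchronous wrappers now initialize both fields rather than leaving ki_nbytes zero. The setup, condensed:

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_left = len;	/* bytes still to transfer */
	kiocb.ki_nbytes = len;	/* total request size: keep the two in sync */
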
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index dc014f7def05..483442e66ed6 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -169,7 +169,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
 			return 0;	// No free blocks in this bitmap
 		}
 
-		/* search for a first zero bit -- beggining of a window */
+		/* search for a first zero bit -- beginning of a window */
 		*beg = reiserfs_find_next_zero_le_bit
 		    ((unsigned long *)(bh->b_data), boundary, *beg);
 
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index c094f58c7448..07930449a958 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -8,6 +8,7 @@
 #include <linux/reiserfs_fs.h>
 #include <linux/stat.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 extern const struct reiserfs_key MIN_KEY;
@@ -45,8 +46,6 @@ static inline bool is_privroot_deh(struct dentry *dir,
 				   struct reiserfs_de_head *deh)
 {
 	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-	if (reiserfs_expose_privroot(dir->d_sb))
-		return 0;
 	return (dir == dir->d_parent && privroot->d_inode &&
 		deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6591cb21edf6..1e4250bc3a6f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -35,6 +35,7 @@
  **/
 
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d1da94b82d8f..dc2c65e04853 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,6 +11,7 @@
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ba98546fabbd..19fbc810e8e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -50,6 +50,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 
@@ -2217,6 +2218,15 @@ static int journal_read_transaction(struct super_block *sb,
 		brelse(d_bh);
 		return 1;
 	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
+		brelse(c_bh);
+		brelse(d_bh);
+		return -EROFS;
+	}
+
 	trans_id = get_desc_trans_id(desc);
 	/* now we know we've got a good transaction, and it was inside the valid time ranges */
 	log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2459,12 +2469,6 @@ static int journal_read(struct super_block *sb)
 		goto start_log_replay;
 	}
 
-	if (continue_replay && bdev_read_only(sb->s_bdev)) {
-		reiserfs_warning(sb, "clm-2076",
-				 "device is readonly, unable to replay log");
-		return -1;
-	}
-
 	/* ok, there are transactions that need to be replayed. start with the first log block, find
 	** all the valid transactions, and pick out the oldest.
 	*/
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 96e4cbbfaa18..d0c43cb99ffc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -13,6 +13,7 @@
 
 #include <linux/time.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 04bf5d791bda..59125fb36d42 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
 #include <asm/uaccess.h>
@@ -1618,10 +1619,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	save_mount_options(s, data);
 
 	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		errval = -ENOMEM;
-		goto error_alloc;
-	}
+	if (!sbi)
+		return -ENOMEM;
 	s->s_fs_info = sbi;
 	/* Set default values for options: non-aggressive tails, RO on errors */
 	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
@@ -1878,12 +1877,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	return (0);
 
 error:
-	reiserfs_write_unlock(s);
-error_alloc:
 	if (jinit_done) {	/* kill the commit thread, free journal ram */
 		journal_release_error(NULL, s);
 	}
 
+	reiserfs_write_unlock(s);
+
 	reiserfs_free_bitmap_cache(s);
 	if (SB_BUFFER_WITH_SB(s))
 		brelse(SB_BUFFER_WITH_SB(s));
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 37d034ca7d99..e7cc00e636dc 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -38,6 +38,7 @@
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/errno.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
@@ -553,7 +554,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 	if (!err && new_size < i_size_read(dentry->d_inode)) {
 		struct iattr newattrs = {
 			.ia_ctime = current_fs_time(inode->i_sb),
-			.ia_size = buffer_size,
+			.ia_size = new_size,
 			.ia_valid = ATTR_SIZE | ATTR_CTIME,
 		};
 
@@ -972,21 +973,13 @@ int reiserfs_permission(struct inode *inode, int mask)
 	return generic_permission(inode, mask, NULL);
 }
 
-/* This will catch lookups from the fs root to .reiserfs_priv */
-static int
-xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
+static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
-	if (container_of(q1, struct dentry, d_name) == priv_root)
-		return -ENOENT;
-	if (q1->len == name->len &&
-	    !memcmp(q1->name, name->name, name->len))
-		return 0;
-	return 1;
+	return -EPERM;
 }
 
 static const struct dentry_operations xattr_lookup_poison_ops = {
-	.d_compare = xattr_lookup_poison,
+	.d_revalidate = xattr_hide_revalidate,
 };
 
 int reiserfs_lookup_privroot(struct super_block *s)
@@ -1000,8 +993,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 			   strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
 		REISERFS_SB(s)->priv_root = dentry;
-		if (!reiserfs_expose_privroot(s))
-			s->s_root->d_op = &xattr_lookup_poison_ops;
+		dentry->d_op = &xattr_lookup_poison_ops;
 		if (dentry->d_inode)
 			dentry->d_inode->i_flags |= S_PRIVATE;
 	} else
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index dd20a7883f0f..9cdb759645a9 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -5,6 +5,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/reiserfs_acl.h>
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index d8b5bfcbdd30..7271a477c041 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/security.h>
 #include <asm/uaccess.h>
@@ -76,7 +77,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 		return error;
 	}
 
-	if (sec->length) {
+	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
 		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
 			 reiserfs_xattr_nblocks(inode, sec->length);
 		/* We don't want to count the directories twice if we have
diff --git a/fs/select.c b/fs/select.c
index 73715e90030f..500a669f7790 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -691,6 +691,23 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
+#ifdef __ARCH_WANT_SYS_OLD_SELECT
+struct sel_arg_struct {
+	unsigned long n;
+	fd_set __user *inp, *outp, *exp;
+	struct timeval __user *tvp;
+};
+
+SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
+{
+	struct sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
+}
+#endif
+
 struct poll_list {
 	struct poll_list *next;
 	int len;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1dabe4ee02fe..f329849ce3c0 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/list.h>
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 92d5e8ffb639..dbf6548bbf06 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -13,7 +13,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/net.h>
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1c4c8f089970..dfa1d67f8fca 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -479,6 +479,7 @@ smb_put_super(struct super_block *sb)
 	if (server->conn_pid)
 		kill_pid(server->conn_pid, SIGTERM, 1);
 
+	bdi_destroy(&server->bdi);
 	kfree(server->ops);
 	smb_unload_nls(server);
 	sb->s_fs_info = NULL;
@@ -525,6 +526,11 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (!server)
 		goto out_no_server;
 	sb->s_fs_info = server;
+
+	if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
+		goto out_bdi;
+
+	sb->s_bdi = &server->bdi;
 
 	server->super_block = sb;
 	server->mnt = NULL;
@@ -624,6 +630,8 @@ out_no_smbiod:
 out_bad_option:
 	kfree(mem);
 out_no_mem:
+	bdi_destroy(&server->bdi);
+out_bdi:
 	if (!server->mnt)
 		printk(KERN_ERR "smb_fill_super: allocation failure\n");
 	sb->s_fs_info = NULL;
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 6bd9b691a463..0e39a924f10a 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -12,7 +12,6 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 00b2909bd469..54350b59046b 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,6 +15,7 @@
 #include <linux/pagemap.h>
 #include <linux/net.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 39208663aaf1..9313b6124a2e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -30,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 #include <linux/security.h>
+#include <linux/gfp.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 1cb0d81b164b..653c030eb840 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -87,9 +87,8 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	u64 cur_index = index >> msblk->devblksize_log2;
 	int bytes, compressed, b = 0, k = 0, page = 0, avail;
 
-
-	bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
-		sizeof(*bh), GFP_KERNEL);
+	bh = kcalloc(((srclength + msblk->devblksize - 1)
+		>> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
 	if (bh == NULL)
 		return -ENOMEM;
 
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 3550aec2f655..48b6f4a385a6 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -275,7 +275,8 @@ allocate_root:
 
 	err = squashfs_read_inode(root, root_inode);
 	if (err) {
-		iget_failed(root);
+		make_bad_inode(root);
+		iput(root);
 		goto failed_mount;
 	}
 	insert_inode_hash(root);
@@ -353,6 +354,7 @@ static void squashfs_put_super(struct super_block *sb)
 		kfree(sbi->id_table);
 		kfree(sbi->fragment_index);
 		kfree(sbi->meta_index);
+		kfree(sbi->inode_lookup_table);
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index e80be2022a7f..32b911f4ee39 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -33,7 +33,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
 
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 4dd70e04333b..7a603874e483 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -24,6 +24,7 @@
 
 #include <linux/mutex.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <linux/zlib.h>
 
 #include "squashfs_fs.h"
@@ -127,8 +128,9 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 		goto release_mutex;
 	}
 
+	length = stream->total_out;
 	mutex_unlock(&msblk->read_data_mutex);
-	return stream->total_out;
+	return length;
 
 release_mutex:
 	mutex_unlock(&msblk->read_data_mutex);
diff --git a/fs/super.c b/fs/super.c
index f35ac6022109..1527e6a0ee35 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,6 +37,7 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
+#include <linux/backing-dev.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -693,6 +694,7 @@ int set_anon_super(struct super_block *s, void *data)
 		return -EMFILE;
 	}
 	s->s_dev = MKDEV(0, dev & MINORMASK);
+	s->s_bdi = &noop_backing_dev_info;
 	return 0;
 }
 
@@ -954,10 +956,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	if (error < 0)
 		goto out_free_secdata;
 	BUG_ON(!mnt->mnt_sb);
+	WARN_ON(!mnt->mnt_sb->s_bdi);
 
 	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
 	if (error)
 		goto out_sb;
 
 	/*
 	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
diff --git a/fs/sync.c b/fs/sync.c
index f557d71cb097..92b228176f7c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
@@ -13,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
 #include "internal.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
@@ -31,7 +33,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	 * This should be safe, as we require bdi backing to actually
 	 * write out data in the first place
 	 */
-	if (!sb->s_bdi)
+	if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
 		return 0;
 
 	if (sb->s_qcop && sb->s_qcop->quota_sync)
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index a0a500af24a1..806b277453f9 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -54,14 +54,14 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
 	int rc;
 
 	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = -EIO;
 	if (attr->read)
 		rc = attr->read(kobj, attr, buffer, off, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -125,14 +125,14 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
 	int rc;
 
 	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = -EIO;
 	if (attr->write)
 		rc = attr->write(kobj, attr, buffer, offset, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -184,12 +184,12 @@ static void bin_vma_open(struct vm_area_struct *vma)
 	if (!bb->vm_ops || !bb->vm_ops->open)
 		return;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return;
 
 	bb->vm_ops->open(vma);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 }
 
 static void bin_vma_close(struct vm_area_struct *vma)
@@ -201,12 +201,12 @@ static void bin_vma_close(struct vm_area_struct *vma)
 	if (!bb->vm_ops || !bb->vm_ops->close)
 		return;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return;
 
 	bb->vm_ops->close(vma);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 }
 
 static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -219,12 +219,12 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bb->vm_ops || !bb->vm_ops->fault)
 		return VM_FAULT_SIGBUS;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
 	ret = bb->vm_ops->fault(vma, vmf);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -241,12 +241,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bb->vm_ops->page_mkwrite)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
 	ret = bb->vm_ops->page_mkwrite(vma, vmf);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -261,12 +261,12 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
 	if (!bb->vm_ops || !bb->vm_ops->access)
 		return -EINVAL;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
 	ret = bb->vm_ops->access(vma, addr, buf, len, write);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -281,12 +281,12 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 	if (!bb->vm_ops || !bb->vm_ops->set_policy)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
 	ret = bb->vm_ops->set_policy(vma, new);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -301,12 +301,12 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
 	if (!bb->vm_ops || !bb->vm_ops->get_policy)
 		return vma->vm_policy;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return vma->vm_policy;
 
 	pol = bb->vm_ops->get_policy(vma, addr);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return pol;
 }
 
@@ -321,12 +321,12 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
 	if (!bb->vm_ops || !bb->vm_ops->migrate)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return 0;
 
 	ret = bb->vm_ops->migrate(vma, from, to, flags);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 #endif
@@ -356,7 +356,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 
 	/* need attr_sd for attr, its parent for kobj */
 	rc = -ENODEV;
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		goto out_unlock;
 
 	rc = -EINVAL;
@@ -384,7 +384,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 	bb->vm_ops = vma->vm_ops;
 	vma->vm_ops = &bin_vm_ops;
 out_put:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 out_unlock:
 	mutex_unlock(&bb->mutex);
 
@@ -399,7 +399,7 @@ static int open(struct inode * inode, struct file * file)
 	int error;
 
 	/* binary file operations requires both @sd and its parent */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	error = -EACCES;
@@ -426,11 +426,11 @@ static int open(struct inode * inode, struct file * file)
 	mutex_unlock(&sysfs_bin_lock);
 
 	/* open succeeded, put active references */
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return 0;
 
  err_out:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	kfree(bb);
 	return error;
 }
@@ -501,7 +501,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
 void sysfs_remove_bin_file(struct kobject *kobj,
 			   const struct bin_attribute *attr)
 {
-	sysfs_hash_and_remove(kobj->sd, attr->attr.name);
+	sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
 }
 
 EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 699f371b9f12..b2b83067ccc8 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -93,7 +93,7 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
  * RETURNS:
  * Pointer to @sd on success, NULL on failure.
  */
-static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
+struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 {
 	if (unlikely(!sd))
 		return NULL;
@@ -124,7 +124,7 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
  * Put an active reference to @sd.  This function is noop if @sd
  * is NULL.
  */
-static void sysfs_put_active(struct sysfs_dirent *sd)
+void sysfs_put_active(struct sysfs_dirent *sd)
 {
 	struct completion *cmpl;
 	int v;
@@ -145,45 +145,6 @@ static void sysfs_put_active(struct sysfs_dirent *sd)
 }
 
 /**
- * sysfs_get_active_two - get active references to sysfs_dirent and parent
- * @sd: sysfs_dirent of interest
- *
- * Get active reference to @sd and its parent.  Parent's active
- * reference is grabbed first.  This function is noop if @sd is
- * NULL.
- *
- * RETURNS:
- * Pointer to @sd on success, NULL on failure.
- */
-struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd)
-{
-	if (sd) {
-		if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent)))
-			return NULL;
-		if (unlikely(!sysfs_get_active(sd))) {
-			sysfs_put_active(sd->s_parent);
-			return NULL;
-		}
-	}
-	return sd;
-}
-
-/**
- * sysfs_put_active_two - put active references to sysfs_dirent and parent
- * @sd: sysfs_dirent of interest
- *
- * Put active references to @sd and its parent.  This function is
- * noop if @sd is NULL.
- */
-void sysfs_put_active_two(struct sysfs_dirent *sd)
-{
-	if (sd) {
-		sysfs_put_active(sd);
-		sysfs_put_active(sd->s_parent);
-	}
-}
-
-/**
  * sysfs_deactivate - deactivate sysfs_dirent
  * @sd: sysfs_dirent to deactivate
  *
@@ -195,6 +156,10 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
 	int v;
 
 	BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
+
+	if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
+		return;
+
 	sd->s_sibling = (void *)&wait;
 
 	rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
@@ -354,7 +319,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
 
 	atomic_set(&sd->s_count, 1);
 	atomic_set(&sd->s_active, 0);
-	sysfs_dirent_init_lockdep(sd);
 
 	sd->s_name = name;
 	sd->s_mode = mode;
@@ -416,9 +380,15 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 
-	if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
+	if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
 		return -EEXIST;
 
+	if (sysfs_ns_type(acxt->parent_sd) && !sd->s_ns) {
+		WARN(1, KERN_WARNING "sysfs: ns required in '%s' for '%s'\n",
+			acxt->parent_sd->s_name, sd->s_name);
+		return -EINVAL;
+	}
+
 	sd->s_parent = sysfs_get(acxt->parent_sd);
 
 	sysfs_link_sibling(sd);
@@ -569,13 +539,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
  * Pointer to sysfs_dirent if found, NULL if not.
  */
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
+				       const void *ns,
 				       const unsigned char *name)
 {
 	struct sysfs_dirent *sd;
 
-	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
+	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
+		if (sd->s_ns != ns)
+			continue;
 		if (!strcmp(sd->s_name, name))
 			return sd;
+	}
 	return NULL;
 }
@@ -594,12 +568,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 * Pointer to sysfs_dirent if found, NULL if not.
 */
 struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
+				      const void *ns,
 				      const unsigned char *name)
 {
 	struct sysfs_dirent *sd;
 
 	mutex_lock(&sysfs_mutex);
-	sd = sysfs_find_dirent(parent_sd, name);
+	sd = sysfs_find_dirent(parent_sd, ns, name);
 	sysfs_get(sd);
 	mutex_unlock(&sysfs_mutex);
 
@@ -608,7 +583,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
 EXPORT_SYMBOL_GPL(sysfs_get_dirent);
 
 static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
-		      const char *name, struct sysfs_dirent **p_sd)
+		      enum kobj_ns_type type, const void *ns, const char *name,
+		      struct sysfs_dirent **p_sd)
 {
 	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
 	struct sysfs_addrm_cxt acxt;
@@ -619,6 +595,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
 	if (!sd)
 		return -ENOMEM;
+
+	sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
+	sd->s_ns = ns;
 	sd->s_dir.kobj = kobj;
 
 	/* link in */
@@ -637,7 +616,25 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd)
 {
-	return create_dir(kobj, kobj->sd, name, p_sd);
+	return create_dir(kobj, kobj->sd,
+			  KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
+}
+
+static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
+{
+	const struct kobj_ns_type_operations *ops;
+	enum kobj_ns_type type;
+
+	ops = kobj_child_ns_ops(kobj);
+	if (!ops)
+		return KOBJ_NS_TYPE_NONE;
+
+	type = ops->type;
+	BUG_ON(type <= KOBJ_NS_TYPE_NONE);
+	BUG_ON(type >= KOBJ_NS_TYPES);
+	BUG_ON(!kobj_ns_type_registered(type));
+
+	return type;
 }
 
 /**
@@ -646,7 +643,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
  */
 int sysfs_create_dir(struct kobject * kobj)
 {
+	enum kobj_ns_type type;
 	struct sysfs_dirent *parent_sd, *sd;
+	const void *ns = NULL;
 	int error = 0;
 
 	BUG_ON(!kobj);
@@ -656,7 +655,11 @@ int sysfs_create_dir(struct kobject * kobj)
 	else
 		parent_sd = &sysfs_root;
 
-	error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
+	if (sysfs_ns_type(parent_sd))
+		ns = kobj->ktype->namespace(kobj);
+	type = sysfs_read_ns_type(kobj);
+
+	error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
 	if (!error)
 		kobj->sd = sd;
 	return error;
@@ -666,13 +669,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 				    struct nameidata *nd)
 {
 	struct dentry *ret = NULL;
-	struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
+	struct dentry *parent = dentry->d_parent;
+	struct sysfs_dirent *parent_sd = parent->d_fsdata;
 	struct sysfs_dirent *sd;
 	struct inode *inode;
+	enum kobj_ns_type type;
+	const void *ns;
 
 	mutex_lock(&sysfs_mutex);
 
-	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
+	type = sysfs_ns_type(parent_sd);
+	ns = sysfs_info(dir->i_sb)->ns[type];
+
+	sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
 
 	/* no such entry */
 	if (!sd) {
@@ -681,7 +690,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 	/* attach dentry and inode */
-	inode = sysfs_get_inode(sd);
+	inode = sysfs_get_inode(dir->i_sb, sd);
 	if (!inode) {
 		ret = ERR_PTR(-ENOMEM);
 		goto out_unlock;
@@ -771,7 +780,8 @@ void sysfs_remove_dir(struct kobject * kobj)
 }
 
 int sysfs_rename(struct sysfs_dirent *sd,
-		 struct sysfs_dirent *new_parent_sd, const char *new_name)
+		 struct sysfs_dirent *new_parent_sd, const void *new_ns,
+		 const char *new_name)
 {
 	const char *dup_name = NULL;
 	int error;
@@ -779,12 +789,12 @@ int sysfs_rename(struct sysfs_dirent *sd,
 	mutex_lock(&sysfs_mutex);
 
 	error = 0;
-	if ((sd->s_parent == new_parent_sd) &&
+	if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
 	    (strcmp(sd->s_name, new_name) == 0))
 		goto out;	/* nothing to rename */
 
 	error = -EEXIST;
-	if (sysfs_find_dirent(new_parent_sd, new_name))
+	if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
 		goto out;
 
 	/* rename sysfs_dirent */
@@ -806,6 +816,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
 		sd->s_parent = new_parent_sd;
 		sysfs_link_sibling(sd);
 	}
+	sd->s_ns = new_ns;
 
 	error = 0;
  out:
@@ -816,19 +827,28 @@ int sysfs_rename(struct sysfs_dirent *sd,
 
 int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
 {
-	return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name);
+	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
+	const void *new_ns = NULL;
+
+	if (sysfs_ns_type(parent_sd))
+		new_ns = kobj->ktype->namespace(kobj);
+
+	return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
 }
 
 int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 {
 	struct sysfs_dirent *sd = kobj->sd;
 	struct sysfs_dirent *new_parent_sd;
+	const void *new_ns = NULL;
 
 	BUG_ON(!sd->s_parent);
+	if (sysfs_ns_type(sd->s_parent))
+		new_ns = kobj->ktype->namespace(kobj);
 	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
 		new_parent_kobj->sd : &sysfs_root;
 
-	return sysfs_rename(sd, new_parent_sd, sd->s_name);
+	return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
 }
 
 /* Relationship between s_mode and the DT_xxx types */
@@ -837,13 +857,56 @@ static inline unsigned char dt_type(struct sysfs_dirent *sd)
 	return (sd->s_mode >> 12) & 15;
 }
 
+static int sysfs_dir_release(struct inode *inode, struct file *filp)
+{
+	sysfs_put(filp->private_data);
+	return 0;
+}
+
+static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
+	struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
+{
+	if (pos) {
+		int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
+			pos->s_parent == parent_sd &&
+			ino == pos->s_ino;
+		sysfs_put(pos);
+		if (!valid)
+			pos = NULL;
+	}
+	if (!pos && (ino > 1) && (ino < INT_MAX)) {
+		pos = parent_sd->s_dir.children;
+		while (pos && (ino > pos->s_ino))
+			pos = pos->s_sibling;
+	}
+	while (pos && pos->s_ns != ns)
+		pos = pos->s_sibling;
+	return pos;
+}
+
+static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
+	struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
+{
+	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
+	if (pos)
+		pos = pos->s_sibling;
+	while (pos && pos->s_ns != ns)
+		pos = pos->s_sibling;
+	return pos;
+}
+
 static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
 	struct dentry *dentry = filp->f_path.dentry;
 	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
-	struct sysfs_dirent *pos;
+	struct sysfs_dirent *pos = filp->private_data;
+	enum kobj_ns_type type;
+	const void *ns;
 	ino_t ino;
 
+	type = sysfs_ns_type(parent_sd);
+	ns = sysfs_info(dentry->d_sb)->ns[type];
+
 	if (filp->f_pos == 0) {
 		ino = parent_sd->s_ino;
 		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
@@ -857,29 +920,31 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
 			filp->f_pos++;
 	}
-	if ((filp->f_pos > 1) && (filp->f_pos < INT_MAX)) {
-		mutex_lock(&sysfs_mutex);
-
-		/* Skip the dentries we have already reported */
-		pos = parent_sd->s_dir.children;
-		while (pos && (filp->f_pos > pos->s_ino))
-			pos = pos->s_sibling;
-
-		for ( ; pos; pos = pos->s_sibling) {
-			const char * name;
-			int len;
-
-			name = pos->s_name;
-			len = strlen(name);
-			filp->f_pos = ino = pos->s_ino;
-
-			if (filldir(dirent, name, len, filp->f_pos, ino,
-				    dt_type(pos)) < 0)
-				break;
-		}
-		if (!pos)
-			filp->f_pos = INT_MAX;
+	mutex_lock(&sysfs_mutex);
+	for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
+	     pos;
+	     pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
+		const char * name;
+		unsigned int type;
+		int len, ret;
+
+		name = pos->s_name;
+		len = strlen(name);
+		ino = pos->s_ino;
+		type = dt_type(pos);
+		filp->f_pos = ino;
+		filp->private_data = sysfs_get(pos);
+
 		mutex_unlock(&sysfs_mutex);
+		ret = filldir(dirent, name, len, filp->f_pos, ino, type);
+		mutex_lock(&sysfs_mutex);
+		if (ret < 0)
+			break;
+	}
+	mutex_unlock(&sysfs_mutex);
+	if ((filp->f_pos > 1) && !pos) { /* EOF */
+		filp->f_pos = INT_MAX;
+		filp->private_data = NULL;
 	}
 	return 0;
 }
@@ -888,5 +953,6 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 const struct file_operations sysfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= sysfs_readdir,
+	.release	= sysfs_dir_release,
 	.llseek		= generic_file_llseek,
 };
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index dc30d9e31683..1beaa739d0a6 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -53,7 +53,7 @@ struct sysfs_buffer {
 	size_t			count;
 	loff_t			pos;
 	char			* page;
-	struct sysfs_ops	* ops;
+	const struct sysfs_ops	* ops;
 	struct mutex		mutex;
 	int			needs_read_fill;
 	int			event;
@@ -75,7 +75,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct sysfs_ops * ops = buffer->ops;
+	const struct sysfs_ops * ops = buffer->ops;
 	int ret = 0;
 	ssize_t count;
 
@@ -85,13 +85,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 		return -ENOMEM;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	buffer->event = atomic_read(&attr_sd->s_attr.open->event);
 	count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	/*
 	 * The code works fine with PAGE_SIZE return but it's likely to
@@ -199,16 +199,16 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct sysfs_ops * ops = buffer->ops;
+	const struct sysfs_ops * ops = buffer->ops;
 	int rc;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -335,7 +335,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	struct sysfs_buffer *buffer;
-	struct sysfs_ops *ops;
+	const struct sysfs_ops *ops;
 	int error = -EACCES;
 	char *p;
 
@@ -344,7 +344,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 		memmove(last_sysfs_file, p, strlen(p) + 1);
 
 	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	/* every kobject with an attribute needs a ktype assigned */
@@ -393,13 +393,13 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 		goto err_free;
 
 	/* open succeeded, put active references */
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return 0;
 
  err_free:
 	kfree(buffer);
 err_out:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return error;
 }
 
@@ -437,12 +437,12 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 	struct sysfs_open_dirent *od = attr_sd->s_attr.open;
 
 	/* need parent for the kobj, grab both */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		goto trigger;
 
 	poll_wait(filp, &od->poll, wait);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	if (buffer->event != atomic_read(&od->event))
 		goto trigger;
@@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
 	mutex_lock(&sysfs_mutex);
 
 	if (sd && dir)
-		sd = sysfs_find_dirent(sd, dir);
+		/* Only directories are tagged, so no need to pass
+		 * a tag explicitly.
+		 */
+		sd = sysfs_find_dirent(sd, NULL, dir);
 	if (sd && attr)
-		sd = sysfs_find_dirent(sd, attr);
+		sd = sysfs_find_dirent(sd, NULL, attr);
 	if (sd)
 		sysfs_notify_dirent(sd);
 
@@ -509,6 +512,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
 	if (!sd)
 		return -ENOMEM;
 	sd->s_attr.attr = (void *)attr;
+	sysfs_dirent_init_lockdep(sd);
 
 	sysfs_addrm_start(&acxt, dir_sd);
 	rc = sysfs_add_one(&acxt, sd);
@@ -542,6 +546,18 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
 
 }
 
+int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
+{
+	int err = 0;
+	int i;
+
+	for (i = 0; ptr[i] && !err; i++)
+		err = sysfs_create_file(kobj, ptr[i]);
+	if (err)
+		while (--i >= 0)
+			sysfs_remove_file(kobj, ptr[i]);
+	return err;
+}
 
 /**
  * sysfs_add_file_to_group - add an attribute file to a pre-existing group.
@@ -556,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
 	int error;
 
 	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, group);
+		dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
 	else
 		dir_sd = sysfs_get(kobj->sd);
 
@@ -586,7 +602,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
 	mutex_lock(&sysfs_mutex);
 
 	rc = -ENOENT;
-	sd = sysfs_find_dirent(kobj->sd, attr->name);
+	sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
 	if (!sd)
 		goto out;
 
@@ -611,9 +627,15 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 
 void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
 {
-	sysfs_hash_and_remove(kobj->sd, attr->name);
+	sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
 }
 
+void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
+{
+	int i;
+	for (i = 0; ptr[i]; i++)
+		sysfs_remove_file(kobj, ptr[i]);
+}
 
 /**
  * sysfs_remove_file_from_group - remove an attribute file from a group.
@@ -627,11 +649,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 	struct sysfs_dirent *dir_sd;
 
 	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, group);
+		dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
 	else
 		dir_sd = sysfs_get(kobj->sd);
 	if (dir_sd) {
-		sysfs_hash_and_remove(dir_sd, attr->name);
+		sysfs_hash_and_remove(dir_sd, NULL, attr->name);
 		sysfs_put(dir_sd);
 	}
 }
@@ -732,3 +754,5 @@ EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
 
 EXPORT_SYMBOL_GPL(sysfs_create_file);
 EXPORT_SYMBOL_GPL(sysfs_remove_file);
+EXPORT_SYMBOL_GPL(sysfs_remove_files);
+EXPORT_SYMBOL_GPL(sysfs_create_files);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index fe611949a7f7..23c1e598792a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 	int i;
 
 	for (i = 0, attr = grp->attrs; *attr; i++, attr++)
-		sysfs_hash_and_remove(dir_sd, (*attr)->name);
+		sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
 }
 
 static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			 * visibility.  Do this by first removing then
 			 * re-adding (if required) the file */
 			if (update)
-				sysfs_hash_and_remove(dir_sd, (*attr)->name);
+				sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
 			if (grp->is_visible) {
 				mode = grp->is_visible(kobj, *attr, i);
 				if (!mode)
@@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj,
 	struct sysfs_dirent *sd;
 
 	if (grp->name) {
-		sd = sysfs_get_dirent(dir_sd, grp->name);
+		sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
 		if (!sd) {
 			WARN(!sd, KERN_WARNING "sysfs group %p not found for "
 			     "kobject '%s'\n", grp, kobject_name(kobj));
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 6a06a1d1ea7b..cf2bad1462ea 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/xattr.h> 22#include <linux/xattr.h>
22#include <linux/security.h> 23#include <linux/security.h>
23#include "sysfs.h" 24#include "sysfs.h"
@@ -111,20 +112,20 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
111 if (!sd) 112 if (!sd)
112 return -EINVAL; 113 return -EINVAL;
113 114
115 mutex_lock(&sysfs_mutex);
114 error = inode_change_ok(inode, iattr); 116 error = inode_change_ok(inode, iattr);
115 if (error) 117 if (error)
116 return error; 118 goto out;
117 119
118 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
119 121
120 error = inode_setattr(inode, iattr); 122 error = inode_setattr(inode, iattr);
121 if (error) 123 if (error)
122 return error; 124 goto out;
123 125
124 mutex_lock(&sysfs_mutex);
125 error = sysfs_sd_setattr(sd, iattr); 126 error = sysfs_sd_setattr(sd, iattr);
127out:
126 mutex_unlock(&sysfs_mutex); 128 mutex_unlock(&sysfs_mutex);
127
128 return error; 129 return error;
129} 130}
130 131
@@ -283,6 +284,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
283 284
284/** 285/**
285 * sysfs_get_inode - get inode for sysfs_dirent 286 * sysfs_get_inode - get inode for sysfs_dirent
287 * @sb: super block
286 * @sd: sysfs_dirent to allocate inode for 288 * @sd: sysfs_dirent to allocate inode for
287 * 289 *
288 * Get inode for @sd. If such inode doesn't exist, a new inode 290 * Get inode for @sd. If such inode doesn't exist, a new inode
@@ -295,11 +297,11 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
295 * RETURNS: 297 * RETURNS:
296 * Pointer to allocated inode on success, NULL on failure. 298 * Pointer to allocated inode on success, NULL on failure.
297 */ 299 */
298struct inode * sysfs_get_inode(struct sysfs_dirent *sd) 300struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
299{ 301{
300 struct inode *inode; 302 struct inode *inode;
301 303
302 inode = iget_locked(sysfs_sb, sd->s_ino); 304 inode = iget_locked(sb, sd->s_ino);
303 if (inode && (inode->i_state & I_NEW)) 305 if (inode && (inode->i_state & I_NEW))
304 sysfs_init_inode(sd, inode); 306 sysfs_init_inode(sd, inode);
305 307
@@ -322,7 +324,7 @@ void sysfs_delete_inode(struct inode *inode)
322 sysfs_put(sd); 324 sysfs_put(sd);
323} 325}
324 326
325int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) 327int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
326{ 328{
327 struct sysfs_addrm_cxt acxt; 329 struct sysfs_addrm_cxt acxt;
328 struct sysfs_dirent *sd; 330 struct sysfs_dirent *sd;
@@ -332,7 +334,7 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
332 334
333 sysfs_addrm_start(&acxt, dir_sd); 335 sysfs_addrm_start(&acxt, dir_sd);
334 336
335 sd = sysfs_find_dirent(dir_sd, name); 337 sd = sysfs_find_dirent(dir_sd, ns, name);
336 if (sd) 338 if (sd)
337 sysfs_remove_one(&acxt, sd); 339 sysfs_remove_one(&acxt, sd);
338 340
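sysfs_find_dirent() itself is not part of this hunk, but the new ns argument threaded through here implies the lookup now matches on the (namespace tag, name) pair rather than the name alone. A hedged sketch of that predicate (simplified; the real lookup also has to cope with untagged entries):

	static bool sysfs_dirent_matches(struct sysfs_dirent *sd,
					 const void *ns, const char *name)
	{
		return sd->s_ns == ns && !strcmp(sd->s_name, name);
	}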
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 49749955ccaf..1afa32ba242c 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -18,12 +18,12 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/magic.h> 20#include <linux/magic.h>
21#include <linux/slab.h>
21 22
22#include "sysfs.h" 23#include "sysfs.h"
23 24
24 25
25static struct vfsmount *sysfs_mount; 26static struct vfsmount *sysfs_mount;
26struct super_block * sysfs_sb = NULL;
27struct kmem_cache *sysfs_dir_cachep; 27struct kmem_cache *sysfs_dir_cachep;
28 28
29static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
@@ -35,7 +35,7 @@ static const struct super_operations sysfs_ops = {
35struct sysfs_dirent sysfs_root = { 35struct sysfs_dirent sysfs_root = {
36 .s_name = "", 36 .s_name = "",
37 .s_count = ATOMIC_INIT(1), 37 .s_count = ATOMIC_INIT(1),
38 .s_flags = SYSFS_DIR, 38 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
40 .s_ino = 1, 40 .s_ino = 1,
41}; 41};
@@ -50,11 +50,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
50 sb->s_magic = SYSFS_MAGIC; 50 sb->s_magic = SYSFS_MAGIC;
51 sb->s_op = &sysfs_ops; 51 sb->s_op = &sysfs_ops;
52 sb->s_time_gran = 1; 52 sb->s_time_gran = 1;
53 sysfs_sb = sb;
54 53
55 /* get root inode, initialize and unlock it */ 54 /* get root inode, initialize and unlock it */
56 mutex_lock(&sysfs_mutex); 55 mutex_lock(&sysfs_mutex);
57 inode = sysfs_get_inode(&sysfs_root); 56 inode = sysfs_get_inode(sb, &sysfs_root);
58 mutex_unlock(&sysfs_mutex); 57 mutex_unlock(&sysfs_mutex);
59 if (!inode) { 58 if (!inode) {
60 pr_debug("sysfs: could not get root inode\n"); 59 pr_debug("sysfs: could not get root inode\n");
@@ -73,18 +72,102 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
73 return 0; 72 return 0;
74} 73}
75 74
75static int sysfs_test_super(struct super_block *sb, void *data)
76{
77 struct sysfs_super_info *sb_info = sysfs_info(sb);
78 struct sysfs_super_info *info = data;
79 enum kobj_ns_type type;
80 int found = 1;
81
82 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
83 if (sb_info->ns[type] != info->ns[type])
84 found = 0;
85 }
86 return found;
87}
88
89static int sysfs_set_super(struct super_block *sb, void *data)
90{
91 int error;
92 error = set_anon_super(sb, data);
93 if (!error)
94 sb->s_fs_info = data;
95 return error;
96}
97
76static int sysfs_get_sb(struct file_system_type *fs_type, 98static int sysfs_get_sb(struct file_system_type *fs_type,
77 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
78{ 100{
79 return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type;
103 struct super_block *sb;
104 int error;
105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info)
109 goto out;
110
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type);
113
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info);
117 if (IS_ERR(sb)) {
118 error = PTR_ERR(sb);
119 goto out;
120 }
121 if (!sb->s_root) {
122 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) {
125 deactivate_locked_super(sb);
126 goto out;
127 }
128 sb->s_flags |= MS_ACTIVE;
129 }
130
131 simple_set_mnt(mnt, sb);
132 error = 0;
133out:
134 return error;
135}
136
137static void sysfs_kill_sb(struct super_block *sb)
138{
139 struct sysfs_super_info *info = sysfs_info(sb);
140
141 kill_anon_super(sb);
142 kfree(info);
80} 143}
81 144
82static struct file_system_type sysfs_fs_type = { 145static struct file_system_type sysfs_fs_type = {
83 .name = "sysfs", 146 .name = "sysfs",
84 .get_sb = sysfs_get_sb, 147 .get_sb = sysfs_get_sb,
85 .kill_sb = kill_anon_super, 148 .kill_sb = sysfs_kill_sb,
86}; 149};
87 150
151void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
152{
153 struct super_block *sb;
154
155 mutex_lock(&sysfs_mutex);
156 spin_lock(&sb_lock);
157 list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
158 struct sysfs_super_info *info = sysfs_info(sb);
159 /* Ignore superblocks that are in the process of unmounting */
160 if (sb->s_count <= S_BIAS)
161 continue;
162 /* Ignore superblocks with the wrong ns */
163 if (info->ns[type] != ns)
164 continue;
165 info->ns[type] = NULL;
166 }
167 spin_unlock(&sb_lock);
168 mutex_unlock(&sysfs_mutex);
169}
170
88int __init sysfs_init(void) 171int __init sysfs_init(void)
89{ 172{
90 int err = -ENOMEM; 173 int err = -ENOMEM;
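The sget() test/set pair above is the standard idiom for keyed (tagged) superblocks: test decides whether an existing superblock can be reused for this mount, set stamps the tag onto a freshly allocated one. Condensed here to a single-pointer tag for illustration (sysfs compares a per-type array of namespace tags instead):

	static int demo_test_super(struct super_block *sb, void *tag)
	{
		return sb->s_fs_info == tag;		/* same tag => share sb */
	}

	static int demo_set_super(struct super_block *sb, void *tag)
	{
		int error = set_anon_super(sb, tag);	/* allocate anon dev */

		if (!error)
			sb->s_fs_info = tag;		/* adopt the tag */
		return error;
	}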
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c5eff49fa41b..b6ebdaa00f37 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/gfp.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/kobject.h> 17#include <linux/kobject.h>
@@ -57,6 +58,8 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
57 if (!sd) 58 if (!sd)
58 goto out_put; 59 goto out_put;
59 60
61 if (sysfs_ns_type(parent_sd))
62 sd->s_ns = target->ktype->namespace(target);
60 sd->s_symlink.target_sd = target_sd; 63 sd->s_symlink.target_sd = target_sd;
61 target_sd = NULL; /* reference is now owned by the symlink */ 64 target_sd = NULL; /* reference is now owned by the symlink */
62 65
@@ -120,7 +123,52 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
120 else 123 else
121 parent_sd = kobj->sd; 124 parent_sd = kobj->sd;
122 125
123 sysfs_hash_and_remove(parent_sd, name); 126 sysfs_hash_and_remove(parent_sd, NULL, name);
127}
128
129/**
130 * sysfs_rename_link - rename symlink in object's directory.
131 * @kobj: object we're acting for.
132 * @targ: object we're pointing to.
133 * @old: previous name of the symlink.
134 * @new: new name of the symlink.
135 *
136 * A helper function for the common rename symlink idiom.
137 */
138int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
139 const char *old, const char *new)
140{
141 struct sysfs_dirent *parent_sd, *sd = NULL;
142 const void *old_ns = NULL, *new_ns = NULL;
143 int result;
144
145 if (!kobj)
146 parent_sd = &sysfs_root;
147 else
148 parent_sd = kobj->sd;
149
150 if (targ->sd)
151 old_ns = targ->sd->s_ns;
152
153 result = -ENOENT;
154 sd = sysfs_get_dirent(parent_sd, old_ns, old);
155 if (!sd)
156 goto out;
157
158 result = -EINVAL;
159 if (sysfs_type(sd) != SYSFS_KOBJ_LINK)
160 goto out;
161 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
162 goto out;
163
164 if (sysfs_ns_type(parent_sd))
165 new_ns = targ->ktype->namespace(targ);
166
167 result = sysfs_rename(sd, parent_sd, new_ns, new);
168
169out:
170 sysfs_put(sd);
171 return result;
124} 172}
125 173
126static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, 174static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
@@ -222,3 +270,4 @@ const struct inode_operations sysfs_symlink_inode_operations = {
222 270
223EXPORT_SYMBOL_GPL(sysfs_create_link); 271EXPORT_SYMBOL_GPL(sysfs_create_link);
224EXPORT_SYMBOL_GPL(sysfs_remove_link); 272EXPORT_SYMBOL_GPL(sysfs_remove_link);
273EXPORT_SYMBOL_GPL(sysfs_rename_link);
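An illustrative call site for the new helper (the driver core's device-rename path is the intended consumer in this series; the kobjects and names below are placeholders):

	/* rename the symlink "old0" -> "new0" under parent, keeping it
	 * pointed at target */
	error = sysfs_rename_link(&parent->kobj, &target->kobj,
				  "old0", "new0");
	if (error)	/* -ENOENT: no such link; -EINVAL: not a link to target */
		return error;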
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index cdd9377a6e06..93847d54c2e3 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -58,6 +58,7 @@ struct sysfs_dirent {
58 struct sysfs_dirent *s_sibling; 58 struct sysfs_dirent *s_sibling;
59 const char *s_name; 59 const char *s_name;
60 60
61 const void *s_ns;
61 union { 62 union {
62 struct sysfs_elem_dir s_dir; 63 struct sysfs_elem_dir s_dir;
63 struct sysfs_elem_symlink s_symlink; 64 struct sysfs_elem_symlink s_symlink;
@@ -66,8 +67,8 @@ struct sysfs_dirent {
66 }; 67 };
67 68
68 unsigned int s_flags; 69 unsigned int s_flags;
70 unsigned short s_mode;
69 ino_t s_ino; 71 ino_t s_ino;
70 umode_t s_mode;
71 struct sysfs_inode_attrs *s_iattr; 72 struct sysfs_inode_attrs *s_iattr;
72}; 73};
73 74
@@ -79,21 +80,33 @@ struct sysfs_dirent {
79#define SYSFS_KOBJ_BIN_ATTR 0x0004 80#define SYSFS_KOBJ_BIN_ATTR 0x0004
80#define SYSFS_KOBJ_LINK 0x0008 81#define SYSFS_KOBJ_LINK 0x0008
81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 82#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
83#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
82 84
83#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 85#define SYSFS_NS_TYPE_MASK 0xff00
84#define SYSFS_FLAG_REMOVED 0x0200 86#define SYSFS_NS_TYPE_SHIFT 8
87
88#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
89#define SYSFS_FLAG_REMOVED 0x020000
85 90
86static inline unsigned int sysfs_type(struct sysfs_dirent *sd) 91static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
87{ 92{
88 return sd->s_flags & SYSFS_TYPE_MASK; 93 return sd->s_flags & SYSFS_TYPE_MASK;
89} 94}
90 95
96static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
97{
98 return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
99}
100
91#ifdef CONFIG_DEBUG_LOCK_ALLOC 101#ifdef CONFIG_DEBUG_LOCK_ALLOC
92#define sysfs_dirent_init_lockdep(sd) \ 102#define sysfs_dirent_init_lockdep(sd) \
93do { \ 103do { \
94 static struct lock_class_key __key; \ 104 struct attribute *attr = sd->s_attr.attr; \
105 struct lock_class_key *key = attr->key; \
106 if (!key) \
107 key = &attr->skey; \
95 \ 108 \
96 lockdep_init_map(&sd->dep_map, "s_active", &__key, 0); \ 109 lockdep_init_map(&sd->dep_map, "s_active", key, 0); \
97} while(0) 110} while(0)
98#else 111#else
99#define sysfs_dirent_init_lockdep(sd) do {} while(0) 112#define sysfs_dirent_init_lockdep(sd) do {} while(0)
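Since the lock class now lives in the attribute rather than in a static at this macro's expansion site, dynamically allocated attributes need an explicit init so lockdep sees a valid key. A sketch of the companion macro from include/linux/sysfs.h in the same series (reproduced from memory, so treat it as approximate):

	#ifdef CONFIG_DEBUG_LOCK_ALLOC
	#define sysfs_attr_init(attr)				\
	do {							\
		static struct lock_class_key __key;		\
								\
		(attr)->key = &__key;				\
	} while(0)
	#else
	#define sysfs_attr_init(attr) do {} while(0)
	#endif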
@@ -110,8 +123,11 @@ struct sysfs_addrm_cxt {
110/* 123/*
111 * mount.c 124 * mount.c
112 */ 125 */
126struct sysfs_super_info {
127 const void *ns[KOBJ_NS_TYPES];
128};
129#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
113extern struct sysfs_dirent sysfs_root; 130extern struct sysfs_dirent sysfs_root;
114extern struct super_block *sysfs_sb;
115extern struct kmem_cache *sysfs_dir_cachep; 131extern struct kmem_cache *sysfs_dir_cachep;
116 132
117/* 133/*
@@ -124,8 +140,8 @@ extern const struct file_operations sysfs_dir_operations;
124extern const struct inode_operations sysfs_dir_inode_operations; 140extern const struct inode_operations sysfs_dir_inode_operations;
125 141
126struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); 142struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
127struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd); 143struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
128void sysfs_put_active_two(struct sysfs_dirent *sd); 144void sysfs_put_active(struct sysfs_dirent *sd);
129void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, 145void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
130 struct sysfs_dirent *parent_sd); 146 struct sysfs_dirent *parent_sd);
131int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); 147int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
@@ -134,8 +150,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
134void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); 150void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
135 151
136struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 152struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
153 const void *ns,
137 const unsigned char *name); 154 const unsigned char *name);
138struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 155struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
156 const void *ns,
139 const unsigned char *name); 157 const unsigned char *name);
140struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); 158struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
141 159
@@ -146,7 +164,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
146void sysfs_remove_subdir(struct sysfs_dirent *sd); 164void sysfs_remove_subdir(struct sysfs_dirent *sd);
147 165
148int sysfs_rename(struct sysfs_dirent *sd, 166int sysfs_rename(struct sysfs_dirent *sd,
149 struct sysfs_dirent *new_parent_sd, const char *new_name); 167 struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
150 168
151static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) 169static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
152{ 170{
@@ -168,7 +186,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
168/* 186/*
169 * inode.c 187 * inode.c
170 */ 188 */
171struct inode *sysfs_get_inode(struct sysfs_dirent *sd); 189struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
172void sysfs_delete_inode(struct inode *inode); 190void sysfs_delete_inode(struct inode *inode);
173int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); 191int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
174int sysfs_permission(struct inode *inode, int mask); 192int sysfs_permission(struct inode *inode, int mask);
@@ -176,7 +194,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
176int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 194int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
177int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 195int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
178 size_t size, int flags); 196 size_t size, int flags);
179int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 197int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
180int sysfs_inode_init(void); 198int sysfs_inode_init(void);
181 199
182/* 200/*
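The namespace type rides in bits 8..15 of s_flags; a minimal round-trip sketch of the packing implied by SYSFS_NS_TYPE_MASK/SHIFT above (the setter side is illustrative, not from this hunk):

	/* at creation time: */
	sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT) & SYSFS_NS_TYPE_MASK;

	/* and sysfs_ns_type() reads it back: */
	type = (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;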
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 4e50286a4cc3..1dabed286b4c 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -164,8 +164,8 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
164 name, de->name)) 164 name, de->name))
165 goto found; 165 goto found;
166 } 166 }
167 dir_put_page(page);
167 } 168 }
168 dir_put_page(page);
169 169
170 if (++n >= npages) 170 if (++n >= npages)
171 n = 0; 171 n = 0;
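The one-line move pairs dir_put_page() with a successful dir_get_page() only, instead of also running on the error path where page is an ERR_PTR. The corrected shape, condensed:

	page = dir_get_page(dir, n);
	if (!IS_ERR(page)) {
		/* ... scan directory entries; "goto found" keeps the ref ... */
		dir_put_page(page);	/* drop it only if we actually got it */
	}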
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 1bfc95ad5f71..b86ab8eff79a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -14,6 +14,7 @@
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
17#include <linux/list.h> 18#include <linux/list.h>
18#include <linux/spinlock.h> 19#include <linux/spinlock.h>
19#include <linux/time.h> 20#include <linux/time.h>
@@ -109,31 +110,14 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
109 struct timerfd_ctx *ctx = file->private_data; 110 struct timerfd_ctx *ctx = file->private_data;
110 ssize_t res; 111 ssize_t res;
111 u64 ticks = 0; 112 u64 ticks = 0;
112 DECLARE_WAITQUEUE(wait, current);
113 113
114 if (count < sizeof(ticks)) 114 if (count < sizeof(ticks))
115 return -EINVAL; 115 return -EINVAL;
116 spin_lock_irq(&ctx->wqh.lock); 116 spin_lock_irq(&ctx->wqh.lock);
117 res = -EAGAIN; 117 if (file->f_flags & O_NONBLOCK)
118 if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) { 118 res = -EAGAIN;
119 __add_wait_queue(&ctx->wqh, &wait); 119 else
120 for (res = 0;;) { 120 res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
121 set_current_state(TASK_INTERRUPTIBLE);
122 if (ctx->ticks) {
123 res = 0;
124 break;
125 }
126 if (signal_pending(current)) {
127 res = -ERESTARTSYS;
128 break;
129 }
130 spin_unlock_irq(&ctx->wqh.lock);
131 schedule();
132 spin_lock_irq(&ctx->wqh.lock);
133 }
134 __remove_wait_queue(&ctx->wqh, &wait);
135 __set_current_state(TASK_RUNNING);
136 }
137 if (ctx->ticks) { 121 if (ctx->ticks) {
138 ticks = ctx->ticks; 122 ticks = ctx->ticks;
139 if (ctx->expired && ctx->tintv.tv64) { 123 if (ctx->expired && ctx->tintv.tv64) {
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 4775af401167..37fa7ed062d8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -45,6 +45,7 @@
45 45
46#include <linux/freezer.h> 46#include <linux/freezer.h>
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/slab.h>
48#include "ubifs.h" 49#include "ubifs.h"
49 50
50/** 51/**
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 90492327b383..c2a68baa782f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,6 +34,7 @@
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h> 35#include <linux/debugfs.h>
36#include <linux/math64.h> 36#include <linux/math64.h>
37#include <linux/slab.h>
37 38
38#ifdef CONFIG_UBIFS_FS_DEBUG 39#ifdef CONFIG_UBIFS_FS_DEBUG
39 40
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e26c02ab6cd5..5692cf72b807 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -52,6 +52,7 @@
52#include "ubifs.h" 52#include "ubifs.h"
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/slab.h>
55 56
56static int read_block(struct inode *inode, void *addr, unsigned int block, 57static int read_block(struct inode *inode, void *addr, unsigned int block,
57 struct ubifs_data_node *dn) 58 struct ubifs_data_node *dn)
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index e5a3d8e96bb7..918d1582ca05 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -53,6 +53,7 @@
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
54 */ 54 */
55 55
56#include <linux/slab.h>
56#include <linux/pagemap.h> 57#include <linux/pagemap.h>
57#include <linux/list_sort.h> 58#include <linux/list_sort.h>
58#include "ubifs.h" 59#include "ubifs.h"
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e589fedaf1ef..bcf5a16f30bb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -51,6 +51,7 @@
51 */ 51 */
52 52
53#include <linux/crc32.h> 53#include <linux/crc32.h>
54#include <linux/slab.h>
54#include "ubifs.h" 55#include "ubifs.h"
55 56
56/** 57/**
@@ -63,6 +64,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
63 if (!c->ro_media) { 64 if (!c->ro_media) {
64 c->ro_media = 1; 65 c->ro_media = 1;
65 c->no_chk_data_crc = 0; 66 c->no_chk_data_crc = 0;
67 c->vfs_sb->s_flags |= MS_RDONLY;
66 ubifs_warn("switched to read-only mode, error %d", err); 68 ubifs_warn("switched to read-only mode, error %d", err);
67 dbg_dump_stack(); 69 dbg_dump_stack();
68 } 70 }
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index b2792e84d245..ad7f67b827ea 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -46,6 +46,7 @@
46#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h> 47#include <linux/crc16.h>
48#include <linux/math64.h> 48#include <linux/math64.h>
49#include <linux/slab.h>
49 50
50/** 51/**
51 * do_calc_lpt_geom - calculate sizes for the LPT area. 52 * do_calc_lpt_geom - calculate sizes for the LPT area.
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 8cbfb8248025..13cb7a4237bf 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -26,6 +26,7 @@
26 */ 26 */
27 27
28#include <linux/crc16.h> 28#include <linux/crc16.h>
29#include <linux/slab.h>
29#include "ubifs.h" 30#include "ubifs.h"
30 31
31/** 32/**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 868a55ee080f..109c6ea03bb5 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/** 37/**
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 57085e43320f..96cb62c8a9dd 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -27,6 +27,7 @@
27 */ 27 */
28 28
29#include "ubifs.h" 29#include "ubifs.h"
30#include <linux/slab.h>
30#include <linux/random.h> 31#include <linux/random.h>
31#include <linux/math64.h> 32#include <linux/math64.h>
32 33
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e5b1a7d00fa0..2194915220e5 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/* 37/*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2d976366a46..bd2542dad014 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -28,6 +28,7 @@
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/err.h> 29#include <linux/err.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/slab.h>
31#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
32#include <linux/spinlock.h> 33#include <linux/spinlock.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 195830f47569..c74400f88fe0 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
56 */ 56 */
57 57
58#include "ubifs.h" 58#include "ubifs.h"
59#include <linux/slab.h>
59#include <linux/xattr.h> 60#include <linux/xattr.h>
60#include <linux/posix_acl_xattr.h> 61#include <linux/posix_acl_xattr.h>
61 62
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index ccc3ad7242d4..9a9378b4eb5a 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -31,55 +31,8 @@
31#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr) 31#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
32#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) 32#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr)
33#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) 33#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr)
34#define udf_find_first_one_bit(addr, size) find_first_one_bit(addr, size)
35#define udf_find_next_one_bit(addr, size, offset) \ 34#define udf_find_next_one_bit(addr, size, offset) \
36 find_next_one_bit(addr, size, offset) 35 ext2_find_next_bit(addr, size, offset)
37
38#define leBPL_to_cpup(x) leNUM_to_cpup(BITS_PER_LONG, x)
39#define leNUM_to_cpup(x, y) xleNUM_to_cpup(x, y)
40#define xleNUM_to_cpup(x, y) (le ## x ## _to_cpup(y))
41#define uintBPL_t uint(BITS_PER_LONG)
42#define uint(x) xuint(x)
43#define xuint(x) __le ## x
44
45static inline int find_next_one_bit(void *addr, int size, int offset)
46{
47 uintBPL_t *p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG);
48 int result = offset & ~(BITS_PER_LONG - 1);
49 unsigned long tmp;
50
51 if (offset >= size)
52 return size;
53 size -= result;
54 offset &= (BITS_PER_LONG - 1);
55 if (offset) {
56 tmp = leBPL_to_cpup(p++);
57 tmp &= ~0UL << offset;
58 if (size < BITS_PER_LONG)
59 goto found_first;
60 if (tmp)
61 goto found_middle;
62 size -= BITS_PER_LONG;
63 result += BITS_PER_LONG;
64 }
65 while (size & ~(BITS_PER_LONG - 1)) {
66 tmp = leBPL_to_cpup(p++);
67 if (tmp)
68 goto found_middle;
69 result += BITS_PER_LONG;
70 size -= BITS_PER_LONG;
71 }
72 if (!size)
73 return result;
74 tmp = leBPL_to_cpup(p);
75found_first:
76 tmp &= ~0UL >> (BITS_PER_LONG - size);
77found_middle:
78 return result + ffz(~tmp);
79}
80
81#define find_first_one_bit(addr, size)\
82 find_next_one_bit((addr), (size), 0)
83 36
84static int read_block_bitmap(struct super_block *sb, 37static int read_block_bitmap(struct super_block *sb,
85 struct udf_bitmap *bitmap, unsigned int block, 38 struct udf_bitmap *bitmap, unsigned int block,
@@ -172,9 +125,8 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
172 125
173 mutex_lock(&sbi->s_alloc_mutex); 126 mutex_lock(&sbi->s_alloc_mutex);
174 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; 127 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
175 if (bloc->logicalBlockNum < 0 || 128 if (bloc->logicalBlockNum + count < count ||
176 (bloc->logicalBlockNum + count) > 129 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
177 partmap->s_partition_len) {
178 udf_debug("%d < %d || %d + %d > %d\n", 130 udf_debug("%d < %d || %d + %d > %d\n",
179 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, 131 bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
180 count, partmap->s_partition_len); 132 count, partmap->s_partition_len);
@@ -440,9 +392,8 @@ static void udf_table_free_blocks(struct super_block *sb,
440 392
441 mutex_lock(&sbi->s_alloc_mutex); 393 mutex_lock(&sbi->s_alloc_mutex);
442 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; 394 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
443 if (bloc->logicalBlockNum < 0 || 395 if (bloc->logicalBlockNum + count < count ||
444 (bloc->logicalBlockNum + count) > 396 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
445 partmap->s_partition_len) {
446 udf_debug("%d < %d || %d + %d > %d\n", 397 udf_debug("%d < %d || %d + %d > %d\n",
447 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, 398 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
448 partmap->s_partition_len); 399 partmap->s_partition_len);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1eb06774ed90..4b6a46ccbf46 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -218,7 +218,7 @@ const struct file_operations udf_file_operations = {
218 .llseek = generic_file_llseek, 218 .llseek = generic_file_llseek,
219}; 219};
220 220
221static int udf_setattr(struct dentry *dentry, struct iattr *iattr) 221int udf_setattr(struct dentry *dentry, struct iattr *iattr)
222{ 222{
223 struct inode *inode = dentry->d_inode; 223 struct inode *inode = dentry->d_inode;
224 int error; 224 int error;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index b57ab0402d89..8a3fbd177cab 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -106,7 +106,7 @@ void udf_clear_inode(struct inode *inode)
106 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 106 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
107 inode->i_size != iinfo->i_lenExtents) { 107 inode->i_size != iinfo->i_lenExtents) {
108 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " 108 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
109 "inode size %llu different from extent lenght %llu. " 109 "inode size %llu different from extent length %llu. "
110 "Filesystem need not be standards compliant.\n", 110 "Filesystem need not be standards compliant.\n",
111 inode->i_sb->s_id, inode->i_ino, inode->i_mode, 111 inode->i_sb->s_id, inode->i_ino, inode->i_mode,
112 (unsigned long long)inode->i_size, 112 (unsigned long long)inode->i_size,
@@ -1314,7 +1314,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1314 break; 1314 break;
1315 case ICBTAG_FILE_TYPE_SYMLINK: 1315 case ICBTAG_FILE_TYPE_SYMLINK:
1316 inode->i_data.a_ops = &udf_symlink_aops; 1316 inode->i_data.a_ops = &udf_symlink_aops;
1317 inode->i_op = &page_symlink_inode_operations; 1317 inode->i_op = &udf_symlink_inode_operations;
1318 inode->i_mode = S_IFLNK | S_IRWXUGO; 1318 inode->i_mode = S_IFLNK | S_IRWXUGO;
1319 break; 1319 break;
1320 case ICBTAG_FILE_TYPE_MAIN: 1320 case ICBTAG_FILE_TYPE_MAIN:
@@ -1408,20 +1408,19 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1408 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 1408 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
1409 struct udf_inode_info *iinfo = UDF_I(inode); 1409 struct udf_inode_info *iinfo = UDF_I(inode);
1410 1410
1411 bh = udf_tread(inode->i_sb, 1411 bh = udf_tgetblk(inode->i_sb,
1412 udf_get_lb_pblock(inode->i_sb, 1412 udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0));
1413 &iinfo->i_location, 0));
1414 if (!bh) { 1413 if (!bh) {
1415 udf_debug("bread failure\n"); 1414 udf_debug("getblk failure\n");
1416 return -EIO; 1415 return -ENOMEM;
1417 } 1416 }
1418 1417
1419 memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); 1418 lock_buffer(bh);
1420 1419 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1421 fe = (struct fileEntry *)bh->b_data; 1420 fe = (struct fileEntry *)bh->b_data;
1422 efe = (struct extendedFileEntry *)bh->b_data; 1421 efe = (struct extendedFileEntry *)bh->b_data;
1423 1422
1424 if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { 1423 if (iinfo->i_use) {
1425 struct unallocSpaceEntry *use = 1424 struct unallocSpaceEntry *use =
1426 (struct unallocSpaceEntry *)bh->b_data; 1425 (struct unallocSpaceEntry *)bh->b_data;
1427 1426
@@ -1429,20 +1428,18 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1429 memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), 1428 memcpy(bh->b_data + sizeof(struct unallocSpaceEntry),
1430 iinfo->i_ext.i_data, inode->i_sb->s_blocksize - 1429 iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
1431 sizeof(struct unallocSpaceEntry)); 1430 sizeof(struct unallocSpaceEntry));
1431 use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
1432 use->descTag.tagLocation =
1433 cpu_to_le32(iinfo->i_location.logicalBlockNum);
1432 crclen = sizeof(struct unallocSpaceEntry) + 1434 crclen = sizeof(struct unallocSpaceEntry) +
1433 iinfo->i_lenAlloc - sizeof(struct tag); 1435 iinfo->i_lenAlloc - sizeof(struct tag);
1434 use->descTag.tagLocation = cpu_to_le32(
1435 iinfo->i_location.
1436 logicalBlockNum);
1437 use->descTag.descCRCLength = cpu_to_le16(crclen); 1436 use->descTag.descCRCLength = cpu_to_le16(crclen);
1438 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + 1437 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
1439 sizeof(struct tag), 1438 sizeof(struct tag),
1440 crclen)); 1439 crclen));
1441 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); 1440 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
1442 1441
1443 mark_buffer_dirty(bh); 1442 goto out;
1444 brelse(bh);
1445 return err;
1446 } 1443 }
1447 1444
1448 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) 1445 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
@@ -1597,18 +1594,21 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1597 fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); 1594 fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number);
1598 fe->descTag.tagLocation = cpu_to_le32( 1595 fe->descTag.tagLocation = cpu_to_le32(
1599 iinfo->i_location.logicalBlockNum); 1596 iinfo->i_location.logicalBlockNum);
1600 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - 1597 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag);
1601 sizeof(struct tag);
1602 fe->descTag.descCRCLength = cpu_to_le16(crclen); 1598 fe->descTag.descCRCLength = cpu_to_le16(crclen);
1603 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), 1599 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
1604 crclen)); 1600 crclen));
1605 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); 1601 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
1606 1602
1603out:
1604 set_buffer_uptodate(bh);
1605 unlock_buffer(bh);
1606
1607 /* write the data blocks */ 1607 /* write the data blocks */
1608 mark_buffer_dirty(bh); 1608 mark_buffer_dirty(bh);
1609 if (do_sync) { 1609 if (do_sync) {
1610 sync_dirty_buffer(bh); 1610 sync_dirty_buffer(bh);
1611 if (buffer_req(bh) && !buffer_uptodate(bh)) { 1611 if (buffer_write_io_error(bh)) {
1612 printk(KERN_WARNING "IO error syncing udf inode " 1612 printk(KERN_WARNING "IO error syncing udf inode "
1613 "[%s:%08lx]\n", inode->i_sb->s_id, 1613 "[%s:%08lx]\n", inode->i_sb->s_id,
1614 inode->i_ino); 1614 inode->i_ino);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index db423ab078b1..75816025f95f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -925,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
925 iinfo = UDF_I(inode); 925 iinfo = UDF_I(inode);
926 inode->i_mode = S_IFLNK | S_IRWXUGO; 926 inode->i_mode = S_IFLNK | S_IRWXUGO;
927 inode->i_data.a_ops = &udf_symlink_aops; 927 inode->i_data.a_ops = &udf_symlink_aops;
928 inode->i_op = &page_symlink_inode_operations; 928 inode->i_op = &udf_symlink_inode_operations;
929 929
930 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 930 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
931 struct kernel_lb_addr eloc; 931 struct kernel_lb_addr eloc;
@@ -1393,6 +1393,7 @@ const struct export_operations udf_export_ops = {
1393const struct inode_operations udf_dir_inode_operations = { 1393const struct inode_operations udf_dir_inode_operations = {
1394 .lookup = udf_lookup, 1394 .lookup = udf_lookup,
1395 .create = udf_create, 1395 .create = udf_create,
1396 .setattr = udf_setattr,
1396 .link = udf_link, 1397 .link = udf_link,
1397 .unlink = udf_unlink, 1398 .unlink = udf_unlink,
1398 .symlink = udf_symlink, 1399 .symlink = udf_symlink,
@@ -1401,3 +1402,9 @@ const struct inode_operations udf_dir_inode_operations = {
1401 .mknod = udf_mknod, 1402 .mknod = udf_mknod,
1402 .rename = udf_rename, 1403 .rename = udf_rename,
1403}; 1404};
1405const struct inode_operations udf_symlink_inode_operations = {
1406 .readlink = generic_readlink,
1407 .follow_link = page_follow_link_light,
1408 .put_link = page_put_link,
1409 .setattr = udf_setattr,
1410};
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4b540ee632d5..745eb209be0c 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/slab.h>
28#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
29 28
30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 852e91845688..16064787d2b7 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -26,7 +26,6 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/slab.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 4223ac855da9..702a1148e702 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -76,6 +76,7 @@ extern const struct inode_operations udf_dir_inode_operations;
76extern const struct file_operations udf_dir_operations; 76extern const struct file_operations udf_dir_operations;
77extern const struct inode_operations udf_file_inode_operations; 77extern const struct inode_operations udf_file_inode_operations;
78extern const struct file_operations udf_file_operations; 78extern const struct file_operations udf_file_operations;
79extern const struct inode_operations udf_symlink_inode_operations;
79extern const struct address_space_operations udf_aops; 80extern const struct address_space_operations udf_aops;
80extern const struct address_space_operations udf_adinicb_aops; 81extern const struct address_space_operations udf_adinicb_aops;
81extern const struct address_space_operations udf_symlink_aops; 82extern const struct address_space_operations udf_symlink_aops;
@@ -131,7 +132,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
131/* file.c */ 132/* file.c */
132extern int udf_ioctl(struct inode *, struct file *, unsigned int, 133extern int udf_ioctl(struct inode *, struct file *, unsigned int,
133 unsigned long); 134 unsigned long);
134 135extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
135/* inode.c */ 136/* inode.c */
136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 137extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
137extern int udf_sync_inode(struct inode *); 138extern int udf_sync_inode(struct inode *);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index cefa8c8913e6..d03a90b6ad69 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> /* for memset */ 24#include <linux/string.h> /* for memset */
25#include <linux/nls.h> 25#include <linux/nls.h>
26#include <linux/crc-itu-t.h> 26#include <linux/crc-itu-t.h>
27#include <linux/slab.h>
27 28
28#include "udf_sb.h" 29#include "udf_sb.h"
29 30
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 66b63a751615..14743d935a93 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1016,6 +1016,9 @@ magic_found:
1016 case UFS_FSSTABLE: 1016 case UFS_FSSTABLE:
1017 UFSD("fs is stable\n"); 1017 UFSD("fs is stable\n");
1018 break; 1018 break;
1019 case UFS_FSLOG:
1020 UFSD("fs is logging fs\n");
1021 break;
1019 case UFS_FSOSF1: 1022 case UFS_FSOSF1:
1020 UFSD("fs is DEC OSF/1\n"); 1023 UFSD("fs is DEC OSF/1\n");
1021 break; 1024 break;
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 54bde1895a80..6943ec677c0b 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -138,6 +138,7 @@ typedef __u16 __bitwise __fs16;
138 138
139#define UFS_USEEFT ((__u16)65535) 139#define UFS_USEEFT ((__u16)65535)
140 140
141/* fs_clean values */
141#define UFS_FSOK 0x7c269d38 142#define UFS_FSOK 0x7c269d38
142#define UFS_FSACTIVE ((__s8)0x00) 143#define UFS_FSACTIVE ((__s8)0x00)
143#define UFS_FSCLEAN ((__s8)0x01) 144#define UFS_FSCLEAN ((__s8)0x01)
@@ -145,6 +146,11 @@ typedef __u16 __bitwise __fs16;
145#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */ 146#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */
146#define UFS_FSBAD ((__s8)0xff) 147#define UFS_FSBAD ((__s8)0xff)
147 148
149/* Solaris-specific fs_clean values */
150#define UFS_FSSUSPEND ((__s8)0xfe) /* temporarily suspended */
151#define UFS_FSLOG ((__s8)0xfd) /* logging fs */
152#define UFS_FSFIX ((__s8)0xfc) /* being repaired while mounted */
153
148/* From here to next blank line, s_flags for ufs_sb_info */ 154/* From here to next blank line, s_flags for ufs_sb_info */
149/* directory entry encoding */ 155/* directory entry encoding */
150#define UFS_DE_MASK 0x00000010 /* mask for the following */ 156#define UFS_DE_MASK 0x00000010 /* mask for the following */
@@ -227,11 +233,16 @@ typedef __u16 __bitwise __fs16;
227 */ 233 */
228#define ufs_cbtocylno(bno) \ 234#define ufs_cbtocylno(bno) \
229 ((bno) * uspi->s_nspf / uspi->s_spc) 235 ((bno) * uspi->s_nspf / uspi->s_spc)
230#define ufs_cbtorpos(bno) \ 236#define ufs_cbtorpos(bno) \
237 ((UFS_SB(sb)->s_flags & UFS_CG_SUN) ? \
238 (((((bno) * uspi->s_nspf % uspi->s_spc) % \
239 uspi->s_nsect) * \
240 uspi->s_nrpos) / uspi->s_nsect) \
241 : \
231 ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \ 242 ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \
232 * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \ 243 * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \
233 % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \ 244 % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \
234 * uspi->s_nrpos) / uspi->s_npsect) 245 * uspi->s_nrpos) / uspi->s_npsect))
235 246
236/* 247/*
237 * The following macros optimize certain frequently calculated 248 * The following macros optimize certain frequently calculated
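The Sun branch of ufs_cbtorpos added above, transcribed into a function for readability (the parameter types are assumptions; the macro operates on the uspi fields directly):

	static inline unsigned ufs_cbtorpos_sun(unsigned long bno,
			const struct ufs_sb_private_info *uspi)
	{
		/* sector offset of the block within its cylinder ... */
		unsigned sect = bno * uspi->s_nspf % uspi->s_spc % uspi->s_nsect;

		/* ... scaled to the number of rotational positions per track */
		return sect * uspi->s_nrpos / uspi->s_nsect;
	}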
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 05ac0fe9c4d3..8d5a506c82eb 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -6,9 +6,9 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/slab.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include <linux/posix_acl_xattr.h> 10#include <linux/posix_acl_xattr.h>
11#include <linux/gfp.h>
12 12
13 13
14/* 14/*
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index bc7405585def..666c9db48eb6 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/slab.h>
20#include <linux/swap.h> 21#include <linux/swap.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index bf85bbe4a9ae..a7bc925c4d60 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -22,6 +22,7 @@
22#include "xfs_inode.h" 22#include "xfs_inode.h"
23#include "xfs_vnodeops.h" 23#include "xfs_vnodeops.h"
24#include "xfs_trace.h" 24#include "xfs_trace.h"
25#include <linux/slab.h>
25#include <linux/xattr.h> 26#include <linux/xattr.h>
26#include <linux/posix_acl_xattr.h> 27#include <linux/posix_acl_xattr.h>
27 28
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 9083357f9e44..089eaca860b4 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -40,10 +40,20 @@
40#include "xfs_vnodeops.h" 40#include "xfs_vnodeops.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include <linux/gfp.h>
43#include <linux/mpage.h> 44#include <linux/mpage.h>
44#include <linux/pagevec.h> 45#include <linux/pagevec.h>
45#include <linux/writeback.h> 46#include <linux/writeback.h>
46 47
48/*
49 * Types of I/O for bmap clustering and I/O completion tracking.
50 */
51enum {
52 IO_READ, /* mapping for a read */
53 IO_DELAY, /* mapping covers delalloc region */
54 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
55 IO_NEW /* just allocated */
56};
47 57
48/* 58/*
49 * Prime number of hash buckets since address is used as the key. 59 * Prime number of hash buckets since address is used as the key.
@@ -102,8 +112,9 @@ xfs_count_page_state(
102 112
103STATIC struct block_device * 113STATIC struct block_device *
104xfs_find_bdev_for_inode( 114xfs_find_bdev_for_inode(
105 struct xfs_inode *ip) 115 struct inode *inode)
106{ 116{
117 struct xfs_inode *ip = XFS_I(inode);
107 struct xfs_mount *mp = ip->i_mount; 118 struct xfs_mount *mp = ip->i_mount;
108 119
109 if (XFS_IS_REALTIME_INODE(ip)) 120 if (XFS_IS_REALTIME_INODE(ip))
@@ -182,7 +193,7 @@ xfs_setfilesize(
182 xfs_fsize_t isize; 193 xfs_fsize_t isize;
183 194
184 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 195 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
185 ASSERT(ioend->io_type != IOMAP_READ); 196 ASSERT(ioend->io_type != IO_READ);
186 197
187 if (unlikely(ioend->io_error)) 198 if (unlikely(ioend->io_error))
188 return 0; 199 return 0;
@@ -213,7 +224,7 @@ xfs_finish_ioend(
213 if (atomic_dec_and_test(&ioend->io_remaining)) { 224 if (atomic_dec_and_test(&ioend->io_remaining)) {
214 struct workqueue_struct *wq; 225 struct workqueue_struct *wq;
215 226
216 wq = (ioend->io_type == IOMAP_UNWRITTEN) ? 227 wq = (ioend->io_type == IO_UNWRITTEN) ?
217 xfsconvertd_workqueue : xfsdatad_workqueue; 228 xfsconvertd_workqueue : xfsdatad_workqueue;
218 queue_work(wq, &ioend->io_work); 229 queue_work(wq, &ioend->io_work);
219 if (wait) 230 if (wait)
@@ -236,7 +247,7 @@ xfs_end_io(
 236 * For unwritten extents we need to issue transactions to convert a 247 * For unwritten extents we need to issue transactions to convert a
 237 * range to normal written extents after the data I/O has finished. 248 * range to normal written extents after the data I/O has finished.
238 */ 249 */
239 if (ioend->io_type == IOMAP_UNWRITTEN && 250 if (ioend->io_type == IO_UNWRITTEN &&
240 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { 251 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
241 252
242 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 253 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
@@ -249,7 +260,7 @@ xfs_end_io(
249 * We might have to update the on-disk file size after extending 260 * We might have to update the on-disk file size after extending
250 * writes. 261 * writes.
251 */ 262 */
252 if (ioend->io_type != IOMAP_READ) { 263 if (ioend->io_type != IO_READ) {
253 error = xfs_setfilesize(ioend); 264 error = xfs_setfilesize(ioend);
254 ASSERT(!error || error == EAGAIN); 265 ASSERT(!error || error == EAGAIN);
255 } 266 }
@@ -308,21 +319,25 @@ xfs_map_blocks(
308 struct inode *inode, 319 struct inode *inode,
309 loff_t offset, 320 loff_t offset,
310 ssize_t count, 321 ssize_t count,
311 xfs_iomap_t *mapp, 322 struct xfs_bmbt_irec *imap,
312 int flags) 323 int flags)
313{ 324{
314 int nmaps = 1; 325 int nmaps = 1;
326 int new = 0;
315 327
316 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); 328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
317} 329}
318 330
319STATIC int 331STATIC int
320xfs_iomap_valid( 332xfs_imap_valid(
321 xfs_iomap_t *iomapp, 333 struct inode *inode,
322 loff_t offset) 334 struct xfs_bmbt_irec *imap,
335 xfs_off_t offset)
323{ 336{
324 return offset >= iomapp->iomap_offset && 337 offset >>= inode->i_blkbits;
325 offset < iomapp->iomap_offset + iomapp->iomap_bsize; 338
339 return offset >= imap->br_startoff &&
340 offset < imap->br_startoff + imap->br_blockcount;
326} 341}
327 342
328/* 343/*
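The renamed xfs_imap_valid() now works in filesystem-block units: the byte offset is shifted down by i_blkbits and tested for containment in [br_startoff, br_startoff + br_blockcount), both of which the bmbt irec keeps in fs blocks. The check in isolation:

	static bool imap_contains(const struct xfs_bmbt_irec *imap,
				  xfs_off_t byte_offset, unsigned int blkbits)
	{
		xfs_fileoff_t fsb = byte_offset >> blkbits;

		return fsb >= imap->br_startoff &&
		       fsb <  imap->br_startoff + imap->br_blockcount;
	}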
@@ -553,19 +568,23 @@ xfs_add_to_ioend(
553 568
554STATIC void 569STATIC void
555xfs_map_buffer( 570xfs_map_buffer(
571 struct inode *inode,
556 struct buffer_head *bh, 572 struct buffer_head *bh,
557 xfs_iomap_t *mp, 573 struct xfs_bmbt_irec *imap,
558 xfs_off_t offset, 574 xfs_off_t offset)
559 uint block_bits)
560{ 575{
561 sector_t bn; 576 sector_t bn;
577 struct xfs_mount *m = XFS_I(inode)->i_mount;
578 xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
579 xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
562 580
563 ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); 581 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
582 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
564 583
565 bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + 584 bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
566 ((offset - mp->iomap_offset) >> block_bits); 585 ((offset - iomap_offset) >> inode->i_blkbits);
567 586
568 ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); 587 ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
569 588
570 bh->b_blocknr = bn; 589 bh->b_blocknr = bn;
571 set_buffer_mapped(bh); 590 set_buffer_mapped(bh);
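A worked example of the b_blocknr arithmetic above, assuming 4 KiB filesystem blocks (i_blkbits = 12) and 512-byte basic blocks (BBSHIFT = 9); the numbers are invented:

	/*
	 *   iomap_bn     = 80       daddr of the extent start (512 B units)
	 *   iomap_offset = 0x1000   byte offset of the extent start in the file
	 *   offset       = 0x3000   byte offset being mapped
	 *
	 *   bn = (80 >> (12 - 9)) + ((0x3000 - 0x1000) >> 12)
	 *      = 10 + 2
	 *      = 12                 bh->b_blocknr, in 4 KiB fs-block units
	 */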
@@ -573,17 +592,17 @@ xfs_map_buffer(
573 592
574STATIC void 593STATIC void
575xfs_map_at_offset( 594xfs_map_at_offset(
595 struct inode *inode,
576 struct buffer_head *bh, 596 struct buffer_head *bh,
577 loff_t offset, 597 struct xfs_bmbt_irec *imap,
578 int block_bits, 598 xfs_off_t offset)
579 xfs_iomap_t *iomapp)
580{ 599{
581 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); 600 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
582 ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); 601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
583 602
584 lock_buffer(bh); 603 lock_buffer(bh);
585 xfs_map_buffer(bh, iomapp, offset, block_bits); 604 xfs_map_buffer(inode, bh, imap, offset);
586 bh->b_bdev = iomapp->iomap_target->bt_bdev; 605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
587 set_buffer_mapped(bh); 606 set_buffer_mapped(bh);
588 clear_buffer_delay(bh); 607 clear_buffer_delay(bh);
589 clear_buffer_unwritten(bh); 608 clear_buffer_unwritten(bh);
@@ -712,11 +731,11 @@ xfs_is_delayed_page(
712 bh = head = page_buffers(page); 731 bh = head = page_buffers(page);
713 do { 732 do {
714 if (buffer_unwritten(bh)) 733 if (buffer_unwritten(bh))
715 acceptable = (type == IOMAP_UNWRITTEN); 734 acceptable = (type == IO_UNWRITTEN);
716 else if (buffer_delay(bh)) 735 else if (buffer_delay(bh))
717 acceptable = (type == IOMAP_DELAY); 736 acceptable = (type == IO_DELAY);
718 else if (buffer_dirty(bh) && buffer_mapped(bh)) 737 else if (buffer_dirty(bh) && buffer_mapped(bh))
719 acceptable = (type == IOMAP_NEW); 738 acceptable = (type == IO_NEW);
720 else 739 else
721 break; 740 break;
722 } while ((bh = bh->b_this_page) != head); 741 } while ((bh = bh->b_this_page) != head);
@@ -739,7 +758,7 @@ xfs_convert_page(
739 struct inode *inode, 758 struct inode *inode,
740 struct page *page, 759 struct page *page,
741 loff_t tindex, 760 loff_t tindex,
742 xfs_iomap_t *mp, 761 struct xfs_bmbt_irec *imap,
743 xfs_ioend_t **ioendp, 762 xfs_ioend_t **ioendp,
744 struct writeback_control *wbc, 763 struct writeback_control *wbc,
745 int startio, 764 int startio,
@@ -749,7 +768,6 @@ xfs_convert_page(
749 xfs_off_t end_offset; 768 xfs_off_t end_offset;
750 unsigned long p_offset; 769 unsigned long p_offset;
751 unsigned int type; 770 unsigned int type;
752 int bbits = inode->i_blkbits;
753 int len, page_dirty; 771 int len, page_dirty;
754 int count = 0, done = 0, uptodate = 1; 772 int count = 0, done = 0, uptodate = 1;
755 xfs_off_t offset = page_offset(page); 773 xfs_off_t offset = page_offset(page);
@@ -801,19 +819,19 @@ xfs_convert_page(
801 819
802 if (buffer_unwritten(bh) || buffer_delay(bh)) { 820 if (buffer_unwritten(bh) || buffer_delay(bh)) {
803 if (buffer_unwritten(bh)) 821 if (buffer_unwritten(bh))
804 type = IOMAP_UNWRITTEN; 822 type = IO_UNWRITTEN;
805 else 823 else
806 type = IOMAP_DELAY; 824 type = IO_DELAY;
807 825
808 if (!xfs_iomap_valid(mp, offset)) { 826 if (!xfs_imap_valid(inode, imap, offset)) {
809 done = 1; 827 done = 1;
810 continue; 828 continue;
811 } 829 }
812 830
813 ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); 831 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
814 ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); 832 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
815 833
816 xfs_map_at_offset(bh, offset, bbits, mp); 834 xfs_map_at_offset(inode, bh, imap, offset);
817 if (startio) { 835 if (startio) {
818 xfs_add_to_ioend(inode, bh, offset, 836 xfs_add_to_ioend(inode, bh, offset,
819 type, ioendp, done); 837 type, ioendp, done);
@@ -825,7 +843,7 @@ xfs_convert_page(
825 page_dirty--; 843 page_dirty--;
826 count++; 844 count++;
827 } else { 845 } else {
828 type = IOMAP_NEW; 846 type = IO_NEW;
829 if (buffer_mapped(bh) && all_bh && startio) { 847 if (buffer_mapped(bh) && all_bh && startio) {
830 lock_buffer(bh); 848 lock_buffer(bh);
831 xfs_add_to_ioend(inode, bh, offset, 849 xfs_add_to_ioend(inode, bh, offset,
@@ -865,7 +883,7 @@ STATIC void
865xfs_cluster_write( 883xfs_cluster_write(
866 struct inode *inode, 884 struct inode *inode,
867 pgoff_t tindex, 885 pgoff_t tindex,
868 xfs_iomap_t *iomapp, 886 struct xfs_bmbt_irec *imap,
869 xfs_ioend_t **ioendp, 887 xfs_ioend_t **ioendp,
870 struct writeback_control *wbc, 888 struct writeback_control *wbc,
871 int startio, 889 int startio,
@@ -884,7 +902,7 @@ xfs_cluster_write(
884 902
885 for (i = 0; i < pagevec_count(&pvec); i++) { 903 for (i = 0; i < pagevec_count(&pvec); i++) {
886 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 904 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
887 iomapp, ioendp, wbc, startio, all_bh); 905 imap, ioendp, wbc, startio, all_bh);
888 if (done) 906 if (done)
889 break; 907 break;
890 } 908 }
@@ -929,7 +947,10 @@ xfs_aops_discard_page(
929 loff_t offset = page_offset(page); 947 loff_t offset = page_offset(page);
930 ssize_t len = 1 << inode->i_blkbits; 948 ssize_t len = 1 << inode->i_blkbits;
931 949
932 if (!xfs_is_delayed_page(page, IOMAP_DELAY)) 950 if (!xfs_is_delayed_page(page, IO_DELAY))
951 goto out_invalidate;
952
953 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
933 goto out_invalidate; 954 goto out_invalidate;
934 955
935 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 956 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
@@ -964,8 +985,10 @@ xfs_aops_discard_page(
964 985
965 if (error) { 986 if (error) {
966 /* something screwed, just bail */ 987 /* something screwed, just bail */
967 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 988 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
968 "page discard failed delalloc mapping lookup."); 989 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
990 "page discard failed delalloc mapping lookup.");
991 }
969 break; 992 break;
970 } 993 }
971 if (!nimaps) { 994 if (!nimaps) {
@@ -991,8 +1014,10 @@ xfs_aops_discard_page(
991 ASSERT(!flist.xbf_count && !flist.xbf_first); 1014 ASSERT(!flist.xbf_count && !flist.xbf_first);
992 if (error) { 1015 if (error) {
993 /* something screwed, just bail */ 1016 /* something screwed, just bail */
994 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 1017 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1018 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
995 "page discard unable to remove delalloc mapping."); 1019 "page discard unable to remove delalloc mapping.");
1020 }
996 break; 1021 break;
997 } 1022 }
998next_buffer: 1023next_buffer:
@@ -1034,15 +1059,15 @@ xfs_page_state_convert(
1034 int unmapped) /* also implies page uptodate */ 1059 int unmapped) /* also implies page uptodate */
1035{ 1060{
1036 struct buffer_head *bh, *head; 1061 struct buffer_head *bh, *head;
1037 xfs_iomap_t iomap; 1062 struct xfs_bmbt_irec imap;
1038 xfs_ioend_t *ioend = NULL, *iohead = NULL; 1063 xfs_ioend_t *ioend = NULL, *iohead = NULL;
1039 loff_t offset; 1064 loff_t offset;
1040 unsigned long p_offset = 0; 1065 unsigned long p_offset = 0;
1041 unsigned int type; 1066 unsigned int type;
1042 __uint64_t end_offset; 1067 __uint64_t end_offset;
1043 pgoff_t end_index, last_index, tlast; 1068 pgoff_t end_index, last_index;
1044 ssize_t size, len; 1069 ssize_t size, len;
1045 int flags, err, iomap_valid = 0, uptodate = 1; 1070 int flags, err, imap_valid = 0, uptodate = 1;
1046 int page_dirty, count = 0; 1071 int page_dirty, count = 0;
1047 int trylock = 0; 1072 int trylock = 0;
1048 int all_bh = unmapped; 1073 int all_bh = unmapped;
@@ -1089,7 +1114,7 @@ xfs_page_state_convert(
1089 bh = head = page_buffers(page); 1114 bh = head = page_buffers(page);
1090 offset = page_offset(page); 1115 offset = page_offset(page);
1091 flags = BMAPI_READ; 1116 flags = BMAPI_READ;
1092 type = IOMAP_NEW; 1117 type = IO_NEW;
1093 1118
1094 /* TODO: cleanup count and page_dirty */ 1119 /* TODO: cleanup count and page_dirty */
1095 1120
@@ -1103,12 +1128,12 @@ xfs_page_state_convert(
1103 * the iomap is actually still valid, but the ioend 1128 * the iomap is actually still valid, but the ioend
1104 * isn't. shouldn't happen too often. 1129 * isn't. shouldn't happen too often.
1105 */ 1130 */
1106 iomap_valid = 0; 1131 imap_valid = 0;
1107 continue; 1132 continue;
1108 } 1133 }
1109 1134
1110 if (iomap_valid) 1135 if (imap_valid)
1111 iomap_valid = xfs_iomap_valid(&iomap, offset); 1136 imap_valid = xfs_imap_valid(inode, &imap, offset);
1112 1137
1113 /* 1138 /*
1114 * First case, map an unwritten extent and prepare for 1139 * First case, map an unwritten extent and prepare for
@@ -1129,20 +1154,20 @@ xfs_page_state_convert(
1129 * Make sure we don't use a read-only iomap 1154 * Make sure we don't use a read-only iomap
1130 */ 1155 */
1131 if (flags == BMAPI_READ) 1156 if (flags == BMAPI_READ)
1132 iomap_valid = 0; 1157 imap_valid = 0;
1133 1158
1134 if (buffer_unwritten(bh)) { 1159 if (buffer_unwritten(bh)) {
1135 type = IOMAP_UNWRITTEN; 1160 type = IO_UNWRITTEN;
1136 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 1161 flags = BMAPI_WRITE | BMAPI_IGNSTATE;
1137 } else if (buffer_delay(bh)) { 1162 } else if (buffer_delay(bh)) {
1138 type = IOMAP_DELAY; 1163 type = IO_DELAY;
1139 flags = BMAPI_ALLOCATE | trylock; 1164 flags = BMAPI_ALLOCATE | trylock;
1140 } else { 1165 } else {
1141 type = IOMAP_NEW; 1166 type = IO_NEW;
1142 flags = BMAPI_WRITE | BMAPI_MMAP; 1167 flags = BMAPI_WRITE | BMAPI_MMAP;
1143 } 1168 }
1144 1169
1145 if (!iomap_valid) { 1170 if (!imap_valid) {
1146 /* 1171 /*
1147 * if we didn't have a valid mapping then we 1172 * if we didn't have a valid mapping then we
1148 * need to ensure that we put the new mapping 1173 * need to ensure that we put the new mapping
@@ -1152,7 +1177,7 @@ xfs_page_state_convert(
1152 * for unwritten extent conversion. 1177 * for unwritten extent conversion.
1153 */ 1178 */
1154 new_ioend = 1; 1179 new_ioend = 1;
1155 if (type == IOMAP_NEW) { 1180 if (type == IO_NEW) {
1156 size = xfs_probe_cluster(inode, 1181 size = xfs_probe_cluster(inode,
1157 page, bh, head, 0); 1182 page, bh, head, 0);
1158 } else { 1183 } else {
@@ -1160,14 +1185,14 @@ xfs_page_state_convert(
1160 } 1185 }
1161 1186
1162 err = xfs_map_blocks(inode, offset, size, 1187 err = xfs_map_blocks(inode, offset, size,
1163 &iomap, flags); 1188 &imap, flags);
1164 if (err) 1189 if (err)
1165 goto error; 1190 goto error;
1166 iomap_valid = xfs_iomap_valid(&iomap, offset); 1191 imap_valid = xfs_imap_valid(inode, &imap,
1192 offset);
1167 } 1193 }
1168 if (iomap_valid) { 1194 if (imap_valid) {
1169 xfs_map_at_offset(bh, offset, 1195 xfs_map_at_offset(inode, bh, &imap, offset);
1170 inode->i_blkbits, &iomap);
1171 if (startio) { 1196 if (startio) {
1172 xfs_add_to_ioend(inode, bh, offset, 1197 xfs_add_to_ioend(inode, bh, offset,
1173 type, &ioend, 1198 type, &ioend,
@@ -1186,40 +1211,41 @@ xfs_page_state_convert(
1186 * That means it must already have extents allocated 1211 * That means it must already have extents allocated
1187 * underneath it. Map the extent by reading it. 1212 * underneath it. Map the extent by reading it.
1188 */ 1213 */
1189 if (!iomap_valid || flags != BMAPI_READ) { 1214 if (!imap_valid || flags != BMAPI_READ) {
1190 flags = BMAPI_READ; 1215 flags = BMAPI_READ;
1191 size = xfs_probe_cluster(inode, page, bh, 1216 size = xfs_probe_cluster(inode, page, bh,
1192 head, 1); 1217 head, 1);
1193 err = xfs_map_blocks(inode, offset, size, 1218 err = xfs_map_blocks(inode, offset, size,
1194 &iomap, flags); 1219 &imap, flags);
1195 if (err) 1220 if (err)
1196 goto error; 1221 goto error;
1197 iomap_valid = xfs_iomap_valid(&iomap, offset); 1222 imap_valid = xfs_imap_valid(inode, &imap,
1223 offset);
1198 } 1224 }
1199 1225
1200 /* 1226 /*
1201 * We set the type to IOMAP_NEW in case we are doing a 1227 * We set the type to IO_NEW in case we are doing a
1202 * small write at EOF that is extending the file but 1228 * small write at EOF that is extending the file but
1203 * without needing an allocation. We need to update the 1229 * without needing an allocation. We need to update the
1204 * file size on I/O completion in this case so it is 1230 * file size on I/O completion in this case so it is
1205 * the same case as having just allocated a new extent 1231 * the same case as having just allocated a new extent
1206 * that we are writing into for the first time. 1232 * that we are writing into for the first time.
1207 */ 1233 */
1208 type = IOMAP_NEW; 1234 type = IO_NEW;
1209 if (trylock_buffer(bh)) { 1235 if (trylock_buffer(bh)) {
1210 ASSERT(buffer_mapped(bh)); 1236 ASSERT(buffer_mapped(bh));
1211 if (iomap_valid) 1237 if (imap_valid)
1212 all_bh = 1; 1238 all_bh = 1;
1213 xfs_add_to_ioend(inode, bh, offset, type, 1239 xfs_add_to_ioend(inode, bh, offset, type,
1214 &ioend, !iomap_valid); 1240 &ioend, !imap_valid);
1215 page_dirty--; 1241 page_dirty--;
1216 count++; 1242 count++;
1217 } else { 1243 } else {
1218 iomap_valid = 0; 1244 imap_valid = 0;
1219 } 1245 }
1220 } else if ((buffer_uptodate(bh) || PageUptodate(page)) && 1246 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
1221 (unmapped || startio)) { 1247 (unmapped || startio)) {
1222 iomap_valid = 0; 1248 imap_valid = 0;
1223 } 1249 }
1224 1250
1225 if (!iohead) 1251 if (!iohead)
@@ -1233,12 +1259,23 @@ xfs_page_state_convert(
1233 if (startio) 1259 if (startio)
1234 xfs_start_page_writeback(page, 1, count); 1260 xfs_start_page_writeback(page, 1, count);
1235 1261
1236 if (ioend && iomap_valid) { 1262 if (ioend && imap_valid) {
1237 offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> 1263 xfs_off_t end_index;
1238 PAGE_CACHE_SHIFT; 1264
1239 tlast = min_t(pgoff_t, offset, last_index); 1265 end_index = imap.br_startoff + imap.br_blockcount;
1240 xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, 1266
1241 wbc, startio, all_bh, tlast); 1267 /* to bytes */
1268 end_index <<= inode->i_blkbits;
1269
1270 /* to pages */
1271 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1272
1273 /* check against file size */
1274 if (end_index > last_index)
1275 end_index = last_index;
1276
1277 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1278 wbc, startio, all_bh, end_index);
1242 } 1279 }
1243 1280
1244 if (iohead) 1281 if (iohead)
@@ -1440,10 +1477,11 @@ __xfs_get_blocks(
1440 int direct, 1477 int direct,
1441 bmapi_flags_t flags) 1478 bmapi_flags_t flags)
1442{ 1479{
1443 xfs_iomap_t iomap; 1480 struct xfs_bmbt_irec imap;
1444 xfs_off_t offset; 1481 xfs_off_t offset;
1445 ssize_t size; 1482 ssize_t size;
1446 int niomap = 1; 1483 int nimap = 1;
1484 int new = 0;
1447 int error; 1485 int error;
1448 1486
1449 offset = (xfs_off_t)iblock << inode->i_blkbits; 1487 offset = (xfs_off_t)iblock << inode->i_blkbits;
@@ -1454,22 +1492,21 @@ __xfs_get_blocks(
1454 return 0; 1492 return 0;
1455 1493
1456 error = xfs_iomap(XFS_I(inode), offset, size, 1494 error = xfs_iomap(XFS_I(inode), offset, size,
1457 create ? flags : BMAPI_READ, &iomap, &niomap); 1495 create ? flags : BMAPI_READ, &imap, &nimap, &new);
1458 if (error) 1496 if (error)
1459 return -error; 1497 return -error;
1460 if (niomap == 0) 1498 if (nimap == 0)
1461 return 0; 1499 return 0;
1462 1500
1463 if (iomap.iomap_bn != IOMAP_DADDR_NULL) { 1501 if (imap.br_startblock != HOLESTARTBLOCK &&
1502 imap.br_startblock != DELAYSTARTBLOCK) {
1464 /* 1503 /*
1465 * For unwritten extents do not report a disk address on 1504 * For unwritten extents do not report a disk address on
1466 * the read case (treat as if we're reading into a hole). 1505 * the read case (treat as if we're reading into a hole).
1467 */ 1506 */
1468 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { 1507 if (create || !ISUNWRITTEN(&imap))
1469 xfs_map_buffer(bh_result, &iomap, offset, 1508 xfs_map_buffer(inode, bh_result, &imap, offset);
1470 inode->i_blkbits); 1509 if (create && ISUNWRITTEN(&imap)) {
1471 }
1472 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
1473 if (direct) 1510 if (direct)
1474 bh_result->b_private = inode; 1511 bh_result->b_private = inode;
1475 set_buffer_unwritten(bh_result); 1512 set_buffer_unwritten(bh_result);
@@ -1480,7 +1517,7 @@ __xfs_get_blocks(
1480	 * If this is a realtime file, data may be on a different device 1517	 * If this is a realtime file, data may be on a different device
1481 * to that pointed to from the buffer_head b_bdev currently. 1518 * to that pointed to from the buffer_head b_bdev currently.
1482 */ 1519 */
1483 bh_result->b_bdev = iomap.iomap_target->bt_bdev; 1520 bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1484 1521
1485 /* 1522 /*
1486 * If we previously allocated a block out beyond eof and we are now 1523 * If we previously allocated a block out beyond eof and we are now
@@ -1494,10 +1531,10 @@ __xfs_get_blocks(
1494 if (create && 1531 if (create &&
1495 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || 1532 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1496 (offset >= i_size_read(inode)) || 1533 (offset >= i_size_read(inode)) ||
1497 (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) 1534 (new || ISUNWRITTEN(&imap))))
1498 set_buffer_new(bh_result); 1535 set_buffer_new(bh_result);
1499 1536
1500 if (iomap.iomap_flags & IOMAP_DELAY) { 1537 if (imap.br_startblock == DELAYSTARTBLOCK) {
1501 BUG_ON(direct); 1538 BUG_ON(direct);
1502 if (create) { 1539 if (create) {
1503 set_buffer_uptodate(bh_result); 1540 set_buffer_uptodate(bh_result);
@@ -1506,11 +1543,23 @@ __xfs_get_blocks(
1506 } 1543 }
1507 } 1544 }
1508 1545
1546 /*
1547	 * If this is O_DIRECT or the mpage code calling, tell them how large
1548 * the mapping is, so that we can avoid repeated get_blocks calls.
1549 */
1509 if (direct || size > (1 << inode->i_blkbits)) { 1550 if (direct || size > (1 << inode->i_blkbits)) {
1510 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); 1551 xfs_off_t mapping_size;
1511 offset = min_t(xfs_off_t, 1552
1512 iomap.iomap_bsize - iomap.iomap_delta, size); 1553 mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1513 bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); 1554 mapping_size <<= inode->i_blkbits;
1555
1556 ASSERT(mapping_size > 0);
1557 if (mapping_size > size)
1558 mapping_size = size;
1559 if (mapping_size > LONG_MAX)
1560 mapping_size = LONG_MAX;
1561
1562 bh_result->b_size = mapping_size;
1514 } 1563 }
1515 1564
1516 return 0; 1565 return 0;
@@ -1568,7 +1617,7 @@ xfs_end_io_direct(
1568 */ 1617 */
1569 ioend->io_offset = offset; 1618 ioend->io_offset = offset;
1570 ioend->io_size = size; 1619 ioend->io_size = size;
1571 if (ioend->io_type == IOMAP_READ) { 1620 if (ioend->io_type == IO_READ) {
1572 xfs_finish_ioend(ioend, 0); 1621 xfs_finish_ioend(ioend, 0);
1573 } else if (private && size > 0) { 1622 } else if (private && size > 0) {
1574 xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); 1623 xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
@@ -1579,7 +1628,7 @@ xfs_end_io_direct(
1579	 * didn't map an unwritten extent so switch its completion 1628	 * didn't map an unwritten extent so switch its completion
1580 * handler. 1629 * handler.
1581 */ 1630 */
1582 ioend->io_type = IOMAP_NEW; 1631 ioend->io_type = IO_NEW;
1583 xfs_finish_ioend(ioend, 0); 1632 xfs_finish_ioend(ioend, 0);
1584 } 1633 }
1585 1634
@@ -1604,10 +1653,10 @@ xfs_vm_direct_IO(
1604 struct block_device *bdev; 1653 struct block_device *bdev;
1605 ssize_t ret; 1654 ssize_t ret;
1606 1655
1607 bdev = xfs_find_bdev_for_inode(XFS_I(inode)); 1656 bdev = xfs_find_bdev_for_inode(inode);
1608 1657
1609 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? 1658 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1610 IOMAP_UNWRITTEN : IOMAP_READ); 1659 IO_UNWRITTEN : IO_READ);
1611 1660
1612 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, 1661 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1613 offset, nr_segs, 1662 offset, nr_segs,
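
The __xfs_get_blocks() hunks above drop the old iomap_bsize/iomap_delta
arithmetic in favour of a mapping size computed directly from the bmbt
record and clamped against the request and LONG_MAX. A standalone sketch
of that clamping (userspace C, with the kernel's xfs_off_t/sector_t
simplified to long long; an illustrative approximation, not the kernel
code itself):

#include <assert.h>
#include <limits.h>
#include <stdio.h>

static long long map_bh_size(long long br_startoff, long long br_blockcount,
			     long long iblock, unsigned int blkbits,
			     long long requested_size)
{
	/* blocks of the extent remaining at and beyond the requested block */
	long long mapping_size = br_startoff + br_blockcount - iblock;

	mapping_size <<= blkbits;		/* fs blocks -> bytes */

	assert(mapping_size > 0);
	if (mapping_size > requested_size)	/* never report past the request */
		mapping_size = requested_size;
	if (mapping_size > LONG_MAX)		/* b_size assignment must not wrap */
		mapping_size = LONG_MAX;
	return mapping_size;
}

int main(void)
{
	/* 8-block extent at block 100, request at block 102 for 64k: 24576 */
	printf("%lld\n", map_bh_size(100, 8, 102, 12, 65536));
	return 0;
}

Reporting the full remaining extent size, rather than one block, is what
lets O_DIRECT and the mpage path avoid repeated get_blocks calls.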
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 6f76ba85f193..f01de3c55c43 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -18,7 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/slab.h> 21#include <linux/gfp.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
@@ -168,75 +168,6 @@ test_page_region(
168} 168}
169 169
170/* 170/*
171 * Mapping of multi-page buffers into contiguous virtual space
172 */
173
174typedef struct a_list {
175 void *vm_addr;
176 struct a_list *next;
177} a_list_t;
178
179static a_list_t *as_free_head;
180static int as_list_len;
181static DEFINE_SPINLOCK(as_lock);
182
183/*
184 * Try to batch vunmaps because they are costly.
185 */
186STATIC void
187free_address(
188 void *addr)
189{
190 a_list_t *aentry;
191
192#ifdef CONFIG_XEN
193 /*
194 * Xen needs to be able to make sure it can get an exclusive
195 * RO mapping of pages it wants to turn into a pagetable. If
196 * a newly allocated page is also still being vmap()ed by xfs,
197 * it will cause pagetable construction to fail. This is a
198 * quick workaround to always eagerly unmap pages so that Xen
199 * is happy.
200 */
201 vunmap(addr);
202 return;
203#endif
204
205 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
206 if (likely(aentry)) {
207 spin_lock(&as_lock);
208 aentry->next = as_free_head;
209 aentry->vm_addr = addr;
210 as_free_head = aentry;
211 as_list_len++;
212 spin_unlock(&as_lock);
213 } else {
214 vunmap(addr);
215 }
216}
217
218STATIC void
219purge_addresses(void)
220{
221 a_list_t *aentry, *old;
222
223 if (as_free_head == NULL)
224 return;
225
226 spin_lock(&as_lock);
227 aentry = as_free_head;
228 as_free_head = NULL;
229 as_list_len = 0;
230 spin_unlock(&as_lock);
231
232 while ((old = aentry) != NULL) {
233 vunmap(aentry->vm_addr);
234 aentry = aentry->next;
235 kfree(old);
236 }
237}
238
239/*
240 * Internal xfs_buf_t object manipulation 171 * Internal xfs_buf_t object manipulation
241 */ 172 */
242 173
@@ -337,7 +268,8 @@ xfs_buf_free(
337 uint i; 268 uint i;
338 269
339 if (xfs_buf_is_vmapped(bp)) 270 if (xfs_buf_is_vmapped(bp))
340 free_address(bp->b_addr - bp->b_offset); 271 vm_unmap_ram(bp->b_addr - bp->b_offset,
272 bp->b_page_count);
341 273
342 for (i = 0; i < bp->b_page_count; i++) { 274 for (i = 0; i < bp->b_page_count; i++) {
343 struct page *page = bp->b_pages[i]; 275 struct page *page = bp->b_pages[i];
@@ -457,10 +389,8 @@ _xfs_buf_map_pages(
457 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 389 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
458 bp->b_flags |= XBF_MAPPED; 390 bp->b_flags |= XBF_MAPPED;
459 } else if (flags & XBF_MAPPED) { 391 } else if (flags & XBF_MAPPED) {
460 if (as_list_len > 64) 392 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
461 purge_addresses(); 393 -1, PAGE_KERNEL);
462 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
463 VM_MAP, PAGE_KERNEL);
464 if (unlikely(bp->b_addr == NULL)) 394 if (unlikely(bp->b_addr == NULL))
465 return -ENOMEM; 395 return -ENOMEM;
466 bp->b_addr += bp->b_offset; 396 bp->b_addr += bp->b_offset;
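
The two hunks above retire the hand-rolled vmap()/vunmap() batching
(free_address/purge_addresses) in favour of vm_map_ram()/vm_unmap_ram(),
which do the lazy-unmap batching inside mm/vmalloc.c. The caller's only
remaining obligation is to keep the page counts symmetric; a minimal
sketch of the pairing, assuming kernel context:

#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *map_buffer_pages(struct page **pages, unsigned int count)
{
	/* node == -1: no NUMA preference; PAGE_KERNEL: normal RW mapping */
	return vm_map_ram(pages, count, -1, PAGE_KERNEL);
}

static void unmap_buffer_pages(void *addr, unsigned int count)
{
	/* count must match what was passed to vm_map_ram() */
	vm_unmap_ram(addr, count);
}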
@@ -1077,25 +1007,20 @@ xfs_bwrite(
1077 struct xfs_mount *mp, 1007 struct xfs_mount *mp,
1078 struct xfs_buf *bp) 1008 struct xfs_buf *bp)
1079{ 1009{
1080 int iowait = (bp->b_flags & XBF_ASYNC) == 0; 1010 int error;
1081 int error = 0;
1082 1011
1083 bp->b_strat = xfs_bdstrat_cb; 1012 bp->b_strat = xfs_bdstrat_cb;
1084 bp->b_mount = mp; 1013 bp->b_mount = mp;
1085 bp->b_flags |= XBF_WRITE; 1014 bp->b_flags |= XBF_WRITE;
1086 if (!iowait) 1015 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
1087 bp->b_flags |= _XBF_RUN_QUEUES;
1088 1016
1089 xfs_buf_delwri_dequeue(bp); 1017 xfs_buf_delwri_dequeue(bp);
1090 xfs_buf_iostrategy(bp); 1018 xfs_buf_iostrategy(bp);
1091 1019
1092 if (iowait) { 1020 error = xfs_buf_iowait(bp);
1093 error = xfs_buf_iowait(bp); 1021 if (error)
1094 if (error) 1022 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1095 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1023 xfs_buf_relse(bp);
1096 xfs_buf_relse(bp);
1097 }
1098
1099 return error; 1024 return error;
1100} 1025}
1101 1026
@@ -1684,7 +1609,8 @@ xfs_mapping_buftarg(
1684 1609
1685STATIC int 1610STATIC int
1686xfs_alloc_delwrite_queue( 1611xfs_alloc_delwrite_queue(
1687 xfs_buftarg_t *btp) 1612 xfs_buftarg_t *btp,
1613 const char *fsname)
1688{ 1614{
1689 int error = 0; 1615 int error = 0;
1690 1616
@@ -1692,7 +1618,7 @@ xfs_alloc_delwrite_queue(
1692 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1618 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1693 spin_lock_init(&btp->bt_delwrite_lock); 1619 spin_lock_init(&btp->bt_delwrite_lock);
1694 btp->bt_flags = 0; 1620 btp->bt_flags = 0;
1695 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); 1621 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1696 if (IS_ERR(btp->bt_task)) { 1622 if (IS_ERR(btp->bt_task)) {
1697 error = PTR_ERR(btp->bt_task); 1623 error = PTR_ERR(btp->bt_task);
1698 goto out_error; 1624 goto out_error;
@@ -1705,7 +1631,8 @@ out_error:
1705xfs_buftarg_t * 1631xfs_buftarg_t *
1706xfs_alloc_buftarg( 1632xfs_alloc_buftarg(
1707 struct block_device *bdev, 1633 struct block_device *bdev,
1708 int external) 1634 int external,
1635 const char *fsname)
1709{ 1636{
1710 xfs_buftarg_t *btp; 1637 xfs_buftarg_t *btp;
1711 1638
@@ -1717,7 +1644,7 @@ xfs_alloc_buftarg(
1717 goto error; 1644 goto error;
1718 if (xfs_mapping_buftarg(btp, bdev)) 1645 if (xfs_mapping_buftarg(btp, bdev))
1719 goto error; 1646 goto error;
1720 if (xfs_alloc_delwrite_queue(btp)) 1647 if (xfs_alloc_delwrite_queue(btp, fsname))
1721 goto error; 1648 goto error;
1722 xfs_alloc_bufhash(btp, external); 1649 xfs_alloc_bufhash(btp, external);
1723 return btp; 1650 return btp;
@@ -1955,9 +1882,6 @@ xfsbufd(
1955 xfs_buf_iostrategy(bp); 1882 xfs_buf_iostrategy(bp);
1956 count++; 1883 count++;
1957 } 1884 }
1958
1959 if (as_list_len > 0)
1960 purge_addresses();
1961 if (count) 1885 if (count)
1962 blk_run_address_space(target->bt_mapping); 1886 blk_run_address_space(target->bt_mapping);
1963 1887
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 386e7361e50e..5fbecefa5dfd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -390,7 +390,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
390/* 390/*
391 * Handling of buftargs. 391 * Handling of buftargs.
392 */ 392 */
393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
395extern void xfs_wait_buftarg(xfs_buftarg_t *); 395extern void xfs_wait_buftarg(xfs_buftarg_t *);
396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 42dd3bcfba6b..d8fb1b5d6cb5 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -115,6 +115,8 @@ xfs_file_fsync(
115 115
116 xfs_iflags_clear(ip, XFS_ITRUNCATED); 116 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117 117
118 xfs_ioend_wait(ip);
119
118 /* 120 /*
119 * We always need to make sure that the required inode state is safe on 121 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the 122 * disk. The inode might be clean but we still might need to force the
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4ea1ee18aded..699b60cbab9c 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -58,6 +58,7 @@
58#include <linux/mount.h> 58#include <linux/mount.h>
59#include <linux/namei.h> 59#include <linux/namei.h>
60#include <linux/pagemap.h> 60#include <linux/pagemap.h>
61#include <linux/slab.h>
61#include <linux/exportfs.h> 62#include <linux/exportfs.h>
62 63
63/* 64/*
@@ -526,6 +527,10 @@ xfs_attrmulti_by_handle(
526 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) 527 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
527 return -XFS_ERROR(EFAULT); 528 return -XFS_ERROR(EFAULT);
528 529
530 /* overflow check */
531 if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
532 return -E2BIG;
533
529 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); 534 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
530 if (IS_ERR(dentry)) 535 if (IS_ERR(dentry))
531 return PTR_ERR(dentry); 536 return PTR_ERR(dentry);
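
The overflow check added above guards the multiplication that later sizes
the kernel copy of opcount operations (the compat path below gets the same
guard). The pattern, sketched on a hypothetical structure:

#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_op { u64 a, b; };	/* hypothetical stand-in for xfs_attr_multiop_t */

static void *alloc_ops(u32 opcount)
{
	/* opcount * sizeof() below would wrap without this check */
	if (opcount >= INT_MAX / sizeof(struct demo_op))
		return ERR_PTR(-E2BIG);
	return kmalloc(opcount * sizeof(struct demo_op), GFP_KERNEL);
}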
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0bf6d61f0528..9287135e9bfc 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -18,6 +18,7 @@
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/ioctl.h> 19#include <linux/ioctl.h>
20#include <linux/mount.h> 20#include <linux/mount.h>
21#include <linux/slab.h>
21#include <asm/uaccess.h> 22#include <asm/uaccess.h>
22#include "xfs.h" 23#include "xfs.h"
23#include "xfs_fs.h" 24#include "xfs_fs.h"
@@ -419,6 +420,10 @@ xfs_compat_attrmulti_by_handle(
419 sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) 420 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
420 return -XFS_ERROR(EFAULT); 421 return -XFS_ERROR(EFAULT);
421 422
423 /* overflow check */
424 if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
425 return -E2BIG;
426
422 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); 427 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
423 if (IS_ERR(dentry)) 428 if (IS_ERR(dentry))
424 return PTR_ERR(dentry); 429 return PTR_ERR(dentry);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 61a99608731e..9c8019c78c92 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -56,6 +56,7 @@
56#include <linux/security.h> 56#include <linux/security.h>
57#include <linux/falloc.h> 57#include <linux/falloc.h>
58#include <linux/fiemap.h> 58#include <linux/fiemap.h>
59#include <linux/slab.h>
59 60
60/* 61/*
61 * Bring the timestamps in the XFS inode uptodate. 62 * Bring the timestamps in the XFS inode uptodate.
@@ -672,7 +673,10 @@ xfs_vn_fiemap(
672 bm.bmv_length = BTOBB(length); 673 bm.bmv_length = BTOBB(length);
673 674
674 /* We add one because in getbmap world count includes the header */ 675 /* We add one because in getbmap world count includes the header */
675 bm.bmv_count = fieinfo->fi_extents_max + 1; 676 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
677 fieinfo->fi_extents_max + 1;
678 bm.bmv_count = min_t(__s32, bm.bmv_count,
679 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
676 bm.bmv_iflags = BMV_IF_PREALLOC; 680 bm.bmv_iflags = BMV_IF_PREALLOC;
677 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 681 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
678 bm.bmv_iflags |= BMV_IF_ATTRFORK; 682 bm.bmv_iflags |= BMV_IF_ATTRFORK;
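
The fiemap hunk above bounds bmv_count in two steps: fall back to a large
sentinel when the caller passed fi_extents_max == 0 (a count-only query),
then cap the result at sixteen pages worth of getbmapx records so the
later allocation stays sane. The arithmetic, as a userspace sketch where
DEMO_BMAPX_SIZE and DEMO_MAXEXTNUM are illustrative assumptions, not the
kernel's exact values:

#include <stdio.h>

#define DEMO_PAGE_SIZE	4096
#define DEMO_BMAPX_SIZE	48		/* illustrative sizeof(struct getbmapx) */
#define DEMO_MAXEXTNUM	0x7fffffff	/* illustrative "unbounded" sentinel */

static long bmv_count_for(unsigned long fi_extents_max)
{
	/* +1 because in getbmap world the count includes the header */
	long count = fi_extents_max ? (long)fi_extents_max + 1
				    : DEMO_MAXEXTNUM;
	long cap = DEMO_PAGE_SIZE * 16 / DEMO_BMAPX_SIZE;	/* 1365 entries */

	return count < cap ? count : cap;
}

int main(void)
{
	printf("%ld\n", bmv_count_for(0));	/* count-only query: capped at 1365 */
	printf("%ld\n", bmv_count_for(10));	/* small request: 11 */
	return 0;
}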
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 71345a370d9f..e9002513e08f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -61,6 +61,7 @@
61 61
62#include <linux/namei.h> 62#include <linux/namei.h>
63#include <linux/init.h> 63#include <linux/init.h>
64#include <linux/slab.h>
64#include <linux/mount.h> 65#include <linux/mount.h>
65#include <linux/mempool.h> 66#include <linux/mempool.h>
66#include <linux/writeback.h> 67#include <linux/writeback.h>
@@ -788,18 +789,18 @@ xfs_open_devices(
788 * Setup xfs_mount buffer target pointers 789 * Setup xfs_mount buffer target pointers
789 */ 790 */
790 error = ENOMEM; 791 error = ENOMEM;
791 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); 792 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
792 if (!mp->m_ddev_targp) 793 if (!mp->m_ddev_targp)
793 goto out_close_rtdev; 794 goto out_close_rtdev;
794 795
795 if (rtdev) { 796 if (rtdev) {
796 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); 797 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
797 if (!mp->m_rtdev_targp) 798 if (!mp->m_rtdev_targp)
798 goto out_free_ddev_targ; 799 goto out_free_ddev_targ;
799 } 800 }
800 801
801 if (logdev && logdev != ddev) { 802 if (logdev && logdev != ddev) {
802 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); 803 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
803 if (!mp->m_logdev_targp) 804 if (!mp->m_logdev_targp)
804 goto out_free_rtdev_targ; 805 goto out_free_rtdev_targ;
805 } else { 806 } else {
@@ -901,7 +902,8 @@ xfsaild_start(
901 struct xfs_ail *ailp) 902 struct xfs_ail *ailp)
902{ 903{
903 ailp->xa_target = 0; 904 ailp->xa_target = 0;
904 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); 905 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
906 ailp->xa_mount->m_fsname);
905 if (IS_ERR(ailp->xa_task)) 907 if (IS_ERR(ailp->xa_task))
906 return -PTR_ERR(ailp->xa_task); 908 return -PTR_ERR(ailp->xa_task);
907 return 0; 909 return 0;
@@ -1091,6 +1093,7 @@ xfs_fs_write_inode(
1091 * the code will only flush the inode if it isn't already 1093 * the code will only flush the inode if it isn't already
1092 * being flushed. 1094 * being flushed.
1093 */ 1095 */
1096 xfs_ioend_wait(ip);
1094 xfs_ilock(ip, XFS_ILOCK_SHARED); 1097 xfs_ilock(ip, XFS_ILOCK_SHARED);
1095 if (ip->i_update_core) { 1098 if (ip->i_update_core) {
1096 error = xfs_log_inode(ip); 1099 error = xfs_log_inode(ip);
@@ -1208,6 +1211,7 @@ xfs_fs_put_super(
1208 1211
1209 xfs_unmountfs(mp); 1212 xfs_unmountfs(mp);
1210 xfs_freesb(mp); 1213 xfs_freesb(mp);
1214 xfs_inode_shrinker_unregister(mp);
1211 xfs_icsb_destroy_counters(mp); 1215 xfs_icsb_destroy_counters(mp);
1212 xfs_close_devices(mp); 1216 xfs_close_devices(mp);
1213 xfs_dmops_put(mp); 1217 xfs_dmops_put(mp);
@@ -1621,6 +1625,8 @@ xfs_fs_fill_super(
1621 if (error) 1625 if (error)
1622 goto fail_vnrele; 1626 goto fail_vnrele;
1623 1627
1628 xfs_inode_shrinker_register(mp);
1629
1624 kfree(mtpt); 1630 kfree(mtpt);
1625 return 0; 1631 return 0;
1626 1632
@@ -1866,6 +1872,7 @@ init_xfs_fs(void)
1866 goto out_cleanup_procfs; 1872 goto out_cleanup_procfs;
1867 1873
1868 vfs_initquota(); 1874 vfs_initquota();
1875 xfs_inode_shrinker_init();
1869 1876
1870 error = register_filesystem(&xfs_fs_type); 1877 error = register_filesystem(&xfs_fs_type);
1871 if (error) 1878 if (error)
@@ -1893,6 +1900,7 @@ exit_xfs_fs(void)
1893{ 1900{
1894 vfs_exitquota(); 1901 vfs_exitquota();
1895 unregister_filesystem(&xfs_fs_type); 1902 unregister_filesystem(&xfs_fs_type);
1903 xfs_inode_shrinker_destroy();
1896 xfs_sysctl_unregister(); 1904 xfs_sysctl_unregister();
1897 xfs_cleanup_procfs(); 1905 xfs_cleanup_procfs();
1898 xfs_buf_terminate(); 1906 xfs_buf_terminate();
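
Several hunks in this file (and the xfsbufd/xfsaild/xfssyncd ones in the
rest of the series) switch kthread_run() to a "name/%s" format plus
m_fsname, so each per-mount daemon is identifiable in ps output.
kthread_run() forwards its format string and varargs into the task name;
a minimal sketch:

#include <linux/kthread.h>

static struct task_struct *start_named_daemon(int (*fn)(void *), void *data,
					      const char *fsname)
{
	/* the resulting name is truncated to TASK_COMM_LEN by the kernel */
	return kthread_run(fn, data, "xfsbufd/%s", fsname);
}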
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 05cd85317f6f..3884e20bc14e 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -95,7 +95,8 @@ xfs_inode_ag_walk(
95 struct xfs_perag *pag, int flags), 95 struct xfs_perag *pag, int flags),
96 int flags, 96 int flags,
97 int tag, 97 int tag,
98 int exclusive) 98 int exclusive,
99 int *nr_to_scan)
99{ 100{
100 uint32_t first_index; 101 uint32_t first_index;
101 int last_error = 0; 102 int last_error = 0;
@@ -134,7 +135,7 @@ restart:
134 if (error == EFSCORRUPTED) 135 if (error == EFSCORRUPTED)
135 break; 136 break;
136 137
137 } while (1); 138 } while ((*nr_to_scan)--);
138 139
139 if (skipped) { 140 if (skipped) {
140 delay(1); 141 delay(1);
@@ -150,12 +151,15 @@ xfs_inode_ag_iterator(
150 struct xfs_perag *pag, int flags), 151 struct xfs_perag *pag, int flags),
151 int flags, 152 int flags,
152 int tag, 153 int tag,
153 int exclusive) 154 int exclusive,
155 int *nr_to_scan)
154{ 156{
155 int error = 0; 157 int error = 0;
156 int last_error = 0; 158 int last_error = 0;
157 xfs_agnumber_t ag; 159 xfs_agnumber_t ag;
160 int nr;
158 161
162 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
159 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 163 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
160 struct xfs_perag *pag; 164 struct xfs_perag *pag;
161 165
@@ -165,14 +169,18 @@ xfs_inode_ag_iterator(
165 continue; 169 continue;
166 } 170 }
167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 171 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
168 exclusive); 172 exclusive, &nr);
169 xfs_perag_put(pag); 173 xfs_perag_put(pag);
170 if (error) { 174 if (error) {
171 last_error = error; 175 last_error = error;
172 if (error == EFSCORRUPTED) 176 if (error == EFSCORRUPTED)
173 break; 177 break;
174 } 178 }
179 if (nr <= 0)
180 break;
175 } 181 }
182 if (nr_to_scan)
183 *nr_to_scan = nr;
176 return XFS_ERROR(last_error); 184 return XFS_ERROR(last_error);
177} 185}
178 186
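
The hunks above thread an optional scan budget through the per-AG walk: a
NULL nr_to_scan means unlimited, otherwise the remaining count is
decremented as inodes are visited and the leftover is reported back so the
caller (the shrinker added below) can stop early across mounts. The shape
of that contract, as a simplified sketch:

#include <linux/kernel.h>

static int walk_with_budget(int nr_items, int *nr_to_scan)
{
	int nr = nr_to_scan ? *nr_to_scan : INT_MAX;	/* NULL: no limit */
	int i;

	for (i = 0; i < nr_items && nr > 0; i++, nr--)
		;	/* ... visit one tagged inode ... */

	if (nr_to_scan)
		*nr_to_scan = nr;	/* hand leftover budget back */
	return 0;
}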
@@ -291,7 +299,7 @@ xfs_sync_data(
291 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 299 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
292 300
293 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 301 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
294 XFS_ICI_NO_TAG, 0); 302 XFS_ICI_NO_TAG, 0, NULL);
295 if (error) 303 if (error)
296 return XFS_ERROR(error); 304 return XFS_ERROR(error);
297 305
@@ -310,7 +318,7 @@ xfs_sync_attr(
310 ASSERT((flags & ~SYNC_WAIT) == 0); 318 ASSERT((flags & ~SYNC_WAIT) == 0);
311 319
312 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 320 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
313 XFS_ICI_NO_TAG, 0); 321 XFS_ICI_NO_TAG, 0, NULL);
314} 322}
315 323
316STATIC int 324STATIC int
@@ -348,68 +356,23 @@ xfs_commit_dummy_trans(
348 356
349STATIC int 357STATIC int
350xfs_sync_fsdata( 358xfs_sync_fsdata(
351 struct xfs_mount *mp, 359 struct xfs_mount *mp)
352 int flags)
353{ 360{
354 struct xfs_buf *bp; 361 struct xfs_buf *bp;
355 struct xfs_buf_log_item *bip;
356 int error = 0;
357
358 /*
359 * If this is xfssyncd() then only sync the superblock if we can
360 * lock it without sleeping and it is not pinned.
361 */
362 if (flags & SYNC_TRYLOCK) {
363 ASSERT(!(flags & SYNC_WAIT));
364
365 bp = xfs_getsb(mp, XBF_TRYLOCK);
366 if (!bp)
367 goto out;
368
369 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
370 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
371 goto out_brelse;
372 } else {
373 bp = xfs_getsb(mp, 0);
374
375 /*
376 * If the buffer is pinned then push on the log so we won't
377 * get stuck waiting in the write for someone, maybe
378 * ourselves, to flush the log.
379 *
380 * Even though we just pushed the log above, we did not have
381 * the superblock buffer locked at that point so it can
382 * become pinned in between there and here.
383 */
384 if (XFS_BUF_ISPINNED(bp))
385 xfs_log_force(mp, 0);
386 }
387
388
389 if (flags & SYNC_WAIT)
390 XFS_BUF_UNASYNC(bp);
391 else
392 XFS_BUF_ASYNC(bp);
393
394 error = xfs_bwrite(mp, bp);
395 if (error)
396 return error;
397 362
398 /* 363 /*
399 * If this is a data integrity sync make sure all pending buffers 364 * If the buffer is pinned then push on the log so we won't get stuck
400 * are flushed out for the log coverage check below. 365 * waiting in the write for someone, maybe ourselves, to flush the log.
366 *
367 * Even though we just pushed the log above, we did not have the
368 * superblock buffer locked at that point so it can become pinned in
369 * between there and here.
401 */ 370 */
402 if (flags & SYNC_WAIT) 371 bp = xfs_getsb(mp, 0);
403 xfs_flush_buftarg(mp->m_ddev_targp, 1); 372 if (XFS_BUF_ISPINNED(bp))
404 373 xfs_log_force(mp, 0);
405 if (xfs_log_need_covered(mp))
406 error = xfs_commit_dummy_trans(mp, flags);
407 return error;
408 374
409 out_brelse: 375 return xfs_bwrite(mp, bp);
410 xfs_buf_relse(bp);
411 out:
412 return error;
413} 376}
414 377
415/* 378/*
@@ -433,7 +396,7 @@ int
433xfs_quiesce_data( 396xfs_quiesce_data(
434 struct xfs_mount *mp) 397 struct xfs_mount *mp)
435{ 398{
436 int error; 399 int error, error2 = 0;
437 400
438 /* push non-blocking */ 401 /* push non-blocking */
439 xfs_sync_data(mp, 0); 402 xfs_sync_data(mp, 0);
@@ -444,13 +407,20 @@ xfs_quiesce_data(
444 xfs_qm_sync(mp, SYNC_WAIT); 407 xfs_qm_sync(mp, SYNC_WAIT);
445 408
446 /* write superblock and hoover up shutdown errors */ 409 /* write superblock and hoover up shutdown errors */
447 error = xfs_sync_fsdata(mp, SYNC_WAIT); 410 error = xfs_sync_fsdata(mp);
411
412 /* make sure all delwri buffers are written out */
413 xfs_flush_buftarg(mp->m_ddev_targp, 1);
414
415 /* mark the log as covered if needed */
416 if (xfs_log_need_covered(mp))
417 error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
448 418
449 /* flush data-only devices */ 419 /* flush data-only devices */
450 if (mp->m_rtdev_targp) 420 if (mp->m_rtdev_targp)
451 XFS_bflush(mp->m_rtdev_targp); 421 XFS_bflush(mp->m_rtdev_targp);
452 422
453 return error; 423 return error ? error : error2;
454} 424}
455 425
456STATIC void 426STATIC void
@@ -573,9 +543,9 @@ xfs_flush_inodes(
573} 543}
574 544
575/* 545/*
576 * Every sync period we need to unpin all items, reclaim inodes, sync 546 * Every sync period we need to unpin all items, reclaim inodes and sync
577 * quota and write out the superblock. We might need to cover the log 547 * disk quotas. We might need to cover the log to indicate that the
578 * to indicate it is idle. 548 * filesystem is idle.
579 */ 549 */
580STATIC void 550STATIC void
581xfs_sync_worker( 551xfs_sync_worker(
@@ -589,7 +559,8 @@ xfs_sync_worker(
589 xfs_reclaim_inodes(mp, 0); 559 xfs_reclaim_inodes(mp, 0);
590 /* dgc: errors ignored here */ 560 /* dgc: errors ignored here */
591 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 561 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
592 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 562 if (xfs_log_need_covered(mp))
563 error = xfs_commit_dummy_trans(mp, 0);
593 } 564 }
594 mp->m_sync_seq++; 565 mp->m_sync_seq++;
595 wake_up(&mp->m_wait_single_sync_task); 566 wake_up(&mp->m_wait_single_sync_task);
@@ -652,7 +623,7 @@ xfs_syncd_init(
652 mp->m_sync_work.w_syncer = xfs_sync_worker; 623 mp->m_sync_work.w_syncer = xfs_sync_worker;
653 mp->m_sync_work.w_mount = mp; 624 mp->m_sync_work.w_mount = mp;
654 mp->m_sync_work.w_completion = NULL; 625 mp->m_sync_work.w_completion = NULL;
655 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); 626 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
656 if (IS_ERR(mp->m_sync_task)) 627 if (IS_ERR(mp->m_sync_task))
657 return -PTR_ERR(mp->m_sync_task); 628 return -PTR_ERR(mp->m_sync_task);
658 return 0; 629 return 0;
@@ -673,6 +644,7 @@ __xfs_inode_set_reclaim_tag(
673 radix_tree_tag_set(&pag->pag_ici_root, 644 radix_tree_tag_set(&pag->pag_ici_root,
674 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 645 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
675 XFS_ICI_RECLAIM_TAG); 646 XFS_ICI_RECLAIM_TAG);
647 pag->pag_ici_reclaimable++;
676} 648}
677 649
678/* 650/*
@@ -705,6 +677,7 @@ __xfs_inode_clear_reclaim_tag(
705{ 677{
706 radix_tree_tag_clear(&pag->pag_ici_root, 678 radix_tree_tag_clear(&pag->pag_ici_root,
707 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 679 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
680 pag->pag_ici_reclaimable--;
708} 681}
709 682
710/* 683/*
@@ -820,10 +793,10 @@ xfs_reclaim_inode(
820 * call into reclaim to find it in a clean state instead of waiting for 793 * call into reclaim to find it in a clean state instead of waiting for
821 * it now. We also don't return errors here - if the error is transient 794 * it now. We also don't return errors here - if the error is transient
822 * then the next reclaim pass will flush the inode, and if the error 795 * then the next reclaim pass will flush the inode, and if the error
823 * is permanent then the next sync reclaim will relcaim the inode and 796 * is permanent then the next sync reclaim will reclaim the inode and
824 * pass on the error. 797 * pass on the error.
825 */ 798 */
826 if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 799 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
827 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 800 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
828 "inode 0x%llx background reclaim flush failed with %d", 801 "inode 0x%llx background reclaim flush failed with %d",
829 (long long)ip->i_ino, error); 802 (long long)ip->i_ino, error);
@@ -854,5 +827,93 @@ xfs_reclaim_inodes(
854 int mode) 827 int mode)
855{ 828{
856 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, 829 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
857 XFS_ICI_RECLAIM_TAG, 1); 830 XFS_ICI_RECLAIM_TAG, 1, NULL);
831}
832
833/*
834 * Shrinker infrastructure.
835 *
836 * This is all far more complex than it needs to be. It adds a global list of
837 * mounts because the shrinker callbacks carry no per-mount context. We need to make
838 * the shrinkers pass a context to avoid the need for global state.
839 */
840static LIST_HEAD(xfs_mount_list);
841static struct rw_semaphore xfs_mount_list_lock;
842
843static int
844xfs_reclaim_inode_shrink(
845 int nr_to_scan,
846 gfp_t gfp_mask)
847{
848 struct xfs_mount *mp;
849 struct xfs_perag *pag;
850 xfs_agnumber_t ag;
851 int reclaimable = 0;
852
853 if (nr_to_scan) {
854 if (!(gfp_mask & __GFP_FS))
855 return -1;
856
857 down_read(&xfs_mount_list_lock);
858 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
859 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
860 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
861 if (nr_to_scan <= 0)
862 break;
863 }
864 up_read(&xfs_mount_list_lock);
865 }
866
867 down_read(&xfs_mount_list_lock);
868 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
869 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
870
871 pag = xfs_perag_get(mp, ag);
872 if (!pag->pag_ici_init) {
873 xfs_perag_put(pag);
874 continue;
875 }
876 reclaimable += pag->pag_ici_reclaimable;
877 xfs_perag_put(pag);
878 }
879 }
880 up_read(&xfs_mount_list_lock);
881 return reclaimable;
882}
883
884static struct shrinker xfs_inode_shrinker = {
885 .shrink = xfs_reclaim_inode_shrink,
886 .seeks = DEFAULT_SEEKS,
887};
888
889void __init
890xfs_inode_shrinker_init(void)
891{
892 init_rwsem(&xfs_mount_list_lock);
893 register_shrinker(&xfs_inode_shrinker);
894}
895
896void
897xfs_inode_shrinker_destroy(void)
898{
899 ASSERT(list_empty(&xfs_mount_list));
900 unregister_shrinker(&xfs_inode_shrinker);
901}
902
903void
904xfs_inode_shrinker_register(
905 struct xfs_mount *mp)
906{
907 down_write(&xfs_mount_list_lock);
908 list_add_tail(&mp->m_mplist, &xfs_mount_list);
909 up_write(&xfs_mount_list_lock);
910}
911
912void
913xfs_inode_shrinker_unregister(
914 struct xfs_mount *mp)
915{
916 down_write(&xfs_mount_list_lock);
917 list_del(&mp->m_mplist);
918 up_write(&xfs_mount_list_lock);
858} 919}
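
The new shrinker above follows the mm contract of this era: called with
nr_to_scan == 0 it only reports the pool size (here the summed
pag_ici_reclaimable counts), with a nonzero count it reclaims, and it must
return -1 for allocations that cannot recurse into the filesystem. A
minimal sketch of that contract:

#include <linux/mm.h>

static int demo_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	if (nr_to_scan) {
		if (!(gfp_mask & __GFP_FS))
			return -1;	/* may not re-enter the fs */
		/* ... reclaim up to nr_to_scan objects ... */
	}
	/* always report how many objects remain reclaimable */
	return 0;
}

static struct shrinker demo_shrinker = {
	.shrink	= demo_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* register_shrinker(&demo_shrinker) at init,
   unregister_shrinker(&demo_shrinker) at teardown */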
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index d480c346cabb..cdcbaaca9880 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -53,6 +53,11 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
53int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 53int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
54int xfs_inode_ag_iterator(struct xfs_mount *mp, 54int xfs_inode_ag_iterator(struct xfs_mount *mp,
55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
56 int flags, int tag, int write_lock); 56 int flags, int tag, int write_lock, int *nr_to_scan);
57
58void xfs_inode_shrinker_init(void);
59void xfs_inode_shrinker_destroy(void);
60void xfs_inode_shrinker_register(struct xfs_mount *mp);
61void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
57 62
58#endif 63#endif
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 5a107601e969..207fa77f63ae 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -41,7 +41,6 @@
41#include "xfs_alloc.h" 41#include "xfs_alloc.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include "xfs_attr.h" 43#include "xfs_attr.h"
44#include "xfs_attr_sf.h"
45#include "xfs_attr_leaf.h" 44#include "xfs_attr_leaf.h"
46#include "xfs_log_priv.h" 45#include "xfs_log_priv.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
@@ -50,6 +49,9 @@
50#include "xfs_aops.h" 49#include "xfs_aops.h"
51#include "quota/xfs_dquot_item.h" 50#include "quota/xfs_dquot_item.h"
52#include "quota/xfs_dquot.h" 51#include "quota/xfs_dquot.h"
52#include "xfs_log_recover.h"
53#include "xfs_buf_item.h"
54#include "xfs_inode_item.h"
53 55
54/* 56/*
55 * We include this last to have the helpers above available for the trace 57 * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index fcaa62f0799e..8a319cfd2901 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -32,6 +32,10 @@ struct xfs_da_node_entry;
32struct xfs_dquot; 32struct xfs_dquot;
33struct xlog_ticket; 33struct xlog_ticket;
34struct log; 34struct log;
35struct xlog_recover;
36struct xlog_recover_item;
37struct xfs_buf_log_format;
38struct xfs_inode_log_format;
35 39
36DECLARE_EVENT_CLASS(xfs_attr_list_class, 40DECLARE_EVENT_CLASS(xfs_attr_list_class,
37 TP_PROTO(struct xfs_attr_list_context *ctx), 41 TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -562,18 +566,21 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
562 __field(dev_t, dev) 566 __field(dev_t, dev)
563 __field(xfs_ino_t, ino) 567 __field(xfs_ino_t, ino)
564 __field(int, count) 568 __field(int, count)
569 __field(int, pincount)
565 __field(unsigned long, caller_ip) 570 __field(unsigned long, caller_ip)
566 ), 571 ),
567 TP_fast_assign( 572 TP_fast_assign(
568 __entry->dev = VFS_I(ip)->i_sb->s_dev; 573 __entry->dev = VFS_I(ip)->i_sb->s_dev;
569 __entry->ino = ip->i_ino; 574 __entry->ino = ip->i_ino;
570 __entry->count = atomic_read(&VFS_I(ip)->i_count); 575 __entry->count = atomic_read(&VFS_I(ip)->i_count);
576 __entry->pincount = atomic_read(&ip->i_pincount);
571 __entry->caller_ip = caller_ip; 577 __entry->caller_ip = caller_ip;
572 ), 578 ),
573 TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", 579 TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
574 MAJOR(__entry->dev), MINOR(__entry->dev), 580 MAJOR(__entry->dev), MINOR(__entry->dev),
575 __entry->ino, 581 __entry->ino,
576 __entry->count, 582 __entry->count,
583 __entry->pincount,
577 (char *)__entry->caller_ip) 584 (char *)__entry->caller_ip)
578) 585)
579 586
@@ -583,6 +590,10 @@ DEFINE_EVENT(xfs_inode_class, name, \
583 TP_ARGS(ip, caller_ip)) 590 TP_ARGS(ip, caller_ip))
584DEFINE_INODE_EVENT(xfs_ihold); 591DEFINE_INODE_EVENT(xfs_ihold);
585DEFINE_INODE_EVENT(xfs_irele); 592DEFINE_INODE_EVENT(xfs_irele);
593DEFINE_INODE_EVENT(xfs_inode_pin);
594DEFINE_INODE_EVENT(xfs_inode_unpin);
595DEFINE_INODE_EVENT(xfs_inode_unpin_nowait);
596
586/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ 597/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
587DEFINE_INODE_EVENT(xfs_inode); 598DEFINE_INODE_EVENT(xfs_inode);
588#define xfs_itrace_entry(ip) \ 599#define xfs_itrace_entry(ip) \
@@ -642,8 +653,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \
642 TP_PROTO(struct xfs_dquot *dqp), \ 653 TP_PROTO(struct xfs_dquot *dqp), \
643 TP_ARGS(dqp)) 654 TP_ARGS(dqp))
644DEFINE_DQUOT_EVENT(xfs_dqadjust); 655DEFINE_DQUOT_EVENT(xfs_dqadjust);
645DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
646DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
647DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); 656DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
648DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); 657DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
649DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); 658DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
@@ -658,7 +667,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail);
658DEFINE_DQUOT_EVENT(xfs_dqlookup_found); 667DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
659DEFINE_DQUOT_EVENT(xfs_dqlookup_want); 668DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
660DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); 669DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
661DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
662DEFINE_DQUOT_EVENT(xfs_dqlookup_done); 670DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
663DEFINE_DQUOT_EVENT(xfs_dqget_hit); 671DEFINE_DQUOT_EVENT(xfs_dqget_hit);
664DEFINE_DQUOT_EVENT(xfs_dqget_miss); 672DEFINE_DQUOT_EVENT(xfs_dqget_miss);
@@ -1495,6 +1503,140 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
1495DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); 1503DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1496DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); 1504DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1497 1505
1506DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
1507 TP_PROTO(struct log *log, struct xlog_recover *trans,
1508 struct xlog_recover_item *item, int pass),
1509 TP_ARGS(log, trans, item, pass),
1510 TP_STRUCT__entry(
1511 __field(dev_t, dev)
1512 __field(unsigned long, item)
1513 __field(xlog_tid_t, tid)
1514 __field(int, type)
1515 __field(int, pass)
1516 __field(int, count)
1517 __field(int, total)
1518 ),
1519 TP_fast_assign(
1520 __entry->dev = log->l_mp->m_super->s_dev;
1521 __entry->item = (unsigned long)item;
1522 __entry->tid = trans->r_log_tid;
1523 __entry->type = ITEM_TYPE(item);
1524 __entry->pass = pass;
1525 __entry->count = item->ri_cnt;
1526 __entry->total = item->ri_total;
1527 ),
1528 TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
1529 "item region count/total %d/%d",
1530 MAJOR(__entry->dev), MINOR(__entry->dev),
1531 __entry->tid,
1532 __entry->pass,
1533 (void *)__entry->item,
1534 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
1535 __entry->count,
1536 __entry->total)
1537)
1538
1539#define DEFINE_LOG_RECOVER_ITEM(name) \
1540DEFINE_EVENT(xfs_log_recover_item_class, name, \
1541 TP_PROTO(struct log *log, struct xlog_recover *trans, \
1542 struct xlog_recover_item *item, int pass), \
1543 TP_ARGS(log, trans, item, pass))
1544
1545DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
1546DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
1547DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
1548DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
1549DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
1550
1551DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
1552 TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
1553 TP_ARGS(log, buf_f),
1554 TP_STRUCT__entry(
1555 __field(dev_t, dev)
1556 __field(__int64_t, blkno)
1557 __field(unsigned short, len)
1558 __field(unsigned short, flags)
1559 __field(unsigned short, size)
1560 __field(unsigned int, map_size)
1561 ),
1562 TP_fast_assign(
1563 __entry->dev = log->l_mp->m_super->s_dev;
1564 __entry->blkno = buf_f->blf_blkno;
1565 __entry->len = buf_f->blf_len;
1566 __entry->flags = buf_f->blf_flags;
1567 __entry->size = buf_f->blf_size;
1568 __entry->map_size = buf_f->blf_map_size;
1569 ),
1570 TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
1571 "map_size %d",
1572 MAJOR(__entry->dev), MINOR(__entry->dev),
1573 __entry->blkno,
1574 __entry->len,
1575 __entry->flags,
1576 __entry->size,
1577 __entry->map_size)
1578)
1579
1580#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
1581DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
1582 TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
1583 TP_ARGS(log, buf_f))
1584
1585DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
1586DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
1587DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
1588DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
1589DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
1590DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
1591DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
1592DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
1593
1594DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
1595 TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
1596 TP_ARGS(log, in_f),
1597 TP_STRUCT__entry(
1598 __field(dev_t, dev)
1599 __field(xfs_ino_t, ino)
1600 __field(unsigned short, size)
1601 __field(int, fields)
1602 __field(unsigned short, asize)
1603 __field(unsigned short, dsize)
1604 __field(__int64_t, blkno)
1605 __field(int, len)
1606 __field(int, boffset)
1607 ),
1608 TP_fast_assign(
1609 __entry->dev = log->l_mp->m_super->s_dev;
1610 __entry->ino = in_f->ilf_ino;
1611 __entry->size = in_f->ilf_size;
1612 __entry->fields = in_f->ilf_fields;
1613 __entry->asize = in_f->ilf_asize;
1614 __entry->dsize = in_f->ilf_dsize;
1615 __entry->blkno = in_f->ilf_blkno;
1616 __entry->len = in_f->ilf_len;
1617 __entry->boffset = in_f->ilf_boffset;
1618 ),
1619 TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
1620 "dsize %d, blkno 0x%llx, len %d, boffset %d",
1621 MAJOR(__entry->dev), MINOR(__entry->dev),
1622 __entry->ino,
1623 __entry->size,
1624 __entry->fields,
1625 __entry->asize,
1626 __entry->dsize,
1627 __entry->blkno,
1628 __entry->len,
1629 __entry->boffset)
1630)
1631#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
1632DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
1633 TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
1634 TP_ARGS(log, in_f))
1635
1636DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1637DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1638DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1639
1498#endif /* _TRACE_XFS_H */ 1640#endif /* _TRACE_XFS_H */
1499 1641
1500#undef TRACE_INCLUDE_PATH 1642#undef TRACE_INCLUDE_PATH
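
The three event classes added above all follow the same pattern: one
DECLARE_EVENT_CLASS fixes the record layout, assignment and format string,
then each DEFINE_EVENT stamps out a named tracepoint sharing that layout
at near-zero cost. The skeleton, reduced to a hypothetical one-field event
(the TRACE_SYSTEM and include-guard scaffolding of a real trace header is
elided):

DECLARE_EVENT_CLASS(demo_class,
	TP_PROTO(int value),
	TP_ARGS(value),
	TP_STRUCT__entry(
		__field(int, value)
	),
	TP_fast_assign(
		__entry->value = value;
	),
	TP_printk("value %d", __entry->value)
)

#define DEFINE_DEMO_EVENT(name) \
DEFINE_EVENT(demo_class, name, TP_PROTO(int value), TP_ARGS(value))

DEFINE_DEMO_EVENT(demo_event_start);
DEFINE_DEMO_EVENT(demo_event_done);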
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 5f79dd78626b..b89ec5df0129 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
101 * No need to re-initialize these if this is a reclaimed dquot. 101 * No need to re-initialize these if this is a reclaimed dquot.
102 */ 102 */
103 if (brandnewdquot) { 103 if (brandnewdquot) {
104 dqp->dq_flnext = dqp->dq_flprev = dqp; 104 INIT_LIST_HEAD(&dqp->q_freelist);
105 mutex_init(&dqp->q_qlock); 105 mutex_init(&dqp->q_qlock);
106 init_waitqueue_head(&dqp->q_pinwait); 106 init_waitqueue_head(&dqp->q_pinwait);
107 107
@@ -119,20 +119,20 @@ xfs_qm_dqinit(
119 * Only the q_core portion was zeroed in dqreclaim_one(). 119 * Only the q_core portion was zeroed in dqreclaim_one().
120 * So, we need to reset others. 120 * So, we need to reset others.
121 */ 121 */
122 dqp->q_nrefs = 0; 122 dqp->q_nrefs = 0;
123 dqp->q_blkno = 0; 123 dqp->q_blkno = 0;
124 dqp->MPL_NEXT = dqp->HL_NEXT = NULL; 124 INIT_LIST_HEAD(&dqp->q_mplist);
125 dqp->HL_PREVP = dqp->MPL_PREVP = NULL; 125 INIT_LIST_HEAD(&dqp->q_hashlist);
126 dqp->q_bufoffset = 0; 126 dqp->q_bufoffset = 0;
127 dqp->q_fileoffset = 0; 127 dqp->q_fileoffset = 0;
128 dqp->q_transp = NULL; 128 dqp->q_transp = NULL;
129 dqp->q_gdquot = NULL; 129 dqp->q_gdquot = NULL;
130 dqp->q_res_bcount = 0; 130 dqp->q_res_bcount = 0;
131 dqp->q_res_icount = 0; 131 dqp->q_res_icount = 0;
132 dqp->q_res_rtbcount = 0; 132 dqp->q_res_rtbcount = 0;
133 atomic_set(&dqp->q_pincount, 0); 133 atomic_set(&dqp->q_pincount, 0);
134 dqp->q_hash = NULL; 134 dqp->q_hash = NULL;
135 ASSERT(dqp->dq_flnext == dqp->dq_flprev); 135 ASSERT(list_empty(&dqp->q_freelist));
136 136
137 trace_xfs_dqreuse(dqp); 137 trace_xfs_dqreuse(dqp);
138 } 138 }
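
This file's hunks replace the hand-rolled freelist/hashlist/mplist pointer
pairs (dq_flnext/dq_flprev, HL_NEXT/HL_PREVP, MPL_NEXT/MPL_PREVP) with
standard list_head members, so membership tests and removals become stock
list.h idioms. Sketched on a hypothetical struct:

#include <linux/list.h>

struct demo_dquot {
	struct list_head q_freelist;
	struct list_head q_hashlist;
};

static void demo(struct demo_dquot *dqp, struct list_head *hash_chain)
{
	INIT_LIST_HEAD(&dqp->q_freelist);	/* was dq_flnext = dq_flprev = dqp */

	/* XFS_DQ_IS_ON_FREELIST() becomes an emptiness test ... */
	if (!list_empty(&dqp->q_freelist))
		list_del_init(&dqp->q_freelist);	/* ... XQM_FREELIST_REMOVE a del */

	/* "move to front of hashchain" is a single primitive */
	list_move(&dqp->q_hashlist, hash_chain);
}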
@@ -158,7 +158,7 @@ void
158xfs_qm_dqdestroy( 158xfs_qm_dqdestroy(
159 xfs_dquot_t *dqp) 159 xfs_dquot_t *dqp)
160{ 160{
161 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); 161 ASSERT(list_empty(&dqp->q_freelist));
162 162
163 mutex_destroy(&dqp->q_qlock); 163 mutex_destroy(&dqp->q_qlock);
164 sv_destroy(&dqp->q_pinwait); 164 sv_destroy(&dqp->q_pinwait);
@@ -252,7 +252,7 @@ xfs_qm_adjust_dqtimers(
252 (be64_to_cpu(d->d_bcount) >= 252 (be64_to_cpu(d->d_bcount) >=
253 be64_to_cpu(d->d_blk_hardlimit)))) { 253 be64_to_cpu(d->d_blk_hardlimit)))) {
254 d->d_btimer = cpu_to_be32(get_seconds() + 254 d->d_btimer = cpu_to_be32(get_seconds() +
255 XFS_QI_BTIMELIMIT(mp)); 255 mp->m_quotainfo->qi_btimelimit);
256 } else { 256 } else {
257 d->d_bwarns = 0; 257 d->d_bwarns = 0;
258 } 258 }
@@ -275,7 +275,7 @@ xfs_qm_adjust_dqtimers(
275 (be64_to_cpu(d->d_icount) >= 275 (be64_to_cpu(d->d_icount) >=
276 be64_to_cpu(d->d_ino_hardlimit)))) { 276 be64_to_cpu(d->d_ino_hardlimit)))) {
277 d->d_itimer = cpu_to_be32(get_seconds() + 277 d->d_itimer = cpu_to_be32(get_seconds() +
278 XFS_QI_ITIMELIMIT(mp)); 278 mp->m_quotainfo->qi_itimelimit);
279 } else { 279 } else {
280 d->d_iwarns = 0; 280 d->d_iwarns = 0;
281 } 281 }
@@ -298,7 +298,7 @@ xfs_qm_adjust_dqtimers(
298 (be64_to_cpu(d->d_rtbcount) >= 298 (be64_to_cpu(d->d_rtbcount) >=
299 be64_to_cpu(d->d_rtb_hardlimit)))) { 299 be64_to_cpu(d->d_rtb_hardlimit)))) {
300 d->d_rtbtimer = cpu_to_be32(get_seconds() + 300 d->d_rtbtimer = cpu_to_be32(get_seconds() +
301 XFS_QI_RTBTIMELIMIT(mp)); 301 mp->m_quotainfo->qi_rtbtimelimit);
302 } else { 302 } else {
303 d->d_rtbwarns = 0; 303 d->d_rtbwarns = 0;
304 } 304 }
@@ -325,6 +325,7 @@ xfs_qm_init_dquot_blk(
325 uint type, 325 uint type,
326 xfs_buf_t *bp) 326 xfs_buf_t *bp)
327{ 327{
328 struct xfs_quotainfo *q = mp->m_quotainfo;
328 xfs_dqblk_t *d; 329 xfs_dqblk_t *d;
329 int curid, i; 330 int curid, i;
330 331
@@ -337,16 +338,16 @@ xfs_qm_init_dquot_blk(
337 /* 338 /*
338 * ID of the first dquot in the block - id's are zero based. 339 * ID of the first dquot in the block - id's are zero based.
339 */ 340 */
340 curid = id - (id % XFS_QM_DQPERBLK(mp)); 341 curid = id - (id % q->qi_dqperchunk);
341 ASSERT(curid >= 0); 342 ASSERT(curid >= 0);
342 memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); 343 memset(d, 0, BBTOB(q->qi_dqchunklen));
343 for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) 344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
344 xfs_qm_dqinit_core(curid, type, d); 345 xfs_qm_dqinit_core(curid, type, d);
345 xfs_trans_dquot_buf(tp, bp, 346 xfs_trans_dquot_buf(tp, bp,
346 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : 347 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF :
347 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : 348 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF :
348 XFS_BLI_GDQUOT_BUF))); 349 XFS_BLI_GDQUOT_BUF)));
349 xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); 350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
350} 351}
351 352
352 353
@@ -419,7 +420,7 @@ xfs_qm_dqalloc(
419 /* now we can just get the buffer (there's nothing to read yet) */ 420 /* now we can just get the buffer (there's nothing to read yet) */
420 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, 421 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
421 dqp->q_blkno, 422 dqp->q_blkno,
422 XFS_QI_DQCHUNKLEN(mp), 423 mp->m_quotainfo->qi_dqchunklen,
423 0); 424 0);
424 if (!bp || (error = XFS_BUF_GETERROR(bp))) 425 if (!bp || (error = XFS_BUF_GETERROR(bp)))
425 goto error1; 426 goto error1;
@@ -500,7 +501,8 @@ xfs_qm_dqtobp(
500 */ 501 */
501 if (dqp->q_blkno == (xfs_daddr_t) 0) { 502 if (dqp->q_blkno == (xfs_daddr_t) 0) {
502 /* We use the id as an index */ 503 /* We use the id as an index */
503 dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); 504 dqp->q_fileoffset = (xfs_fileoff_t)id /
505 mp->m_quotainfo->qi_dqperchunk;
504 nmaps = 1; 506 nmaps = 1;
505 quotip = XFS_DQ_TO_QIP(dqp); 507 quotip = XFS_DQ_TO_QIP(dqp);
506 xfs_ilock(quotip, XFS_ILOCK_SHARED); 508 xfs_ilock(quotip, XFS_ILOCK_SHARED);
@@ -529,7 +531,7 @@ xfs_qm_dqtobp(
529 /* 531 /*
530 * offset of dquot in the (fixed sized) dquot chunk. 532 * offset of dquot in the (fixed sized) dquot chunk.
531 */ 533 */
532 dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * 534 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
533 sizeof(xfs_dqblk_t); 535 sizeof(xfs_dqblk_t);
534 if (map.br_startblock == HOLESTARTBLOCK) { 536 if (map.br_startblock == HOLESTARTBLOCK) {
535 /* 537 /*
@@ -559,15 +561,13 @@ xfs_qm_dqtobp(
559 * Read in the buffer, unless we've just done the allocation 561 * Read in the buffer, unless we've just done the allocation
560 * (in which case we already have the buf). 562 * (in which case we already have the buf).
561 */ 563 */
562 if (! newdquot) { 564 if (!newdquot) {
563 trace_xfs_dqtobp_read(dqp); 565 trace_xfs_dqtobp_read(dqp);
564 566
565 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 567 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
566 dqp->q_blkno, 568 dqp->q_blkno,
567 XFS_QI_DQCHUNKLEN(mp), 569 mp->m_quotainfo->qi_dqchunklen,
568 0, &bp))) { 570 0, &bp);
569 return (error);
570 }
571 if (error || !bp) 571 if (error || !bp)
572 return XFS_ERROR(error); 572 return XFS_ERROR(error);
573 } 573 }
@@ -689,14 +689,14 @@ xfs_qm_idtodq(
689 tp = NULL; 689 tp = NULL;
690 if (flags & XFS_QMOPT_DQALLOC) { 690 if (flags & XFS_QMOPT_DQALLOC) {
691 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); 691 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
692 if ((error = xfs_trans_reserve(tp, 692 error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
693 XFS_QM_DQALLOC_SPACE_RES(mp), 693 XFS_WRITE_LOG_RES(mp) +
694 XFS_WRITE_LOG_RES(mp) + 694 BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
695 BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 + 695 128,
696 128, 696 0,
697 0, 697 XFS_TRANS_PERM_LOG_RES,
698 XFS_TRANS_PERM_LOG_RES, 698 XFS_WRITE_LOG_COUNT);
699 XFS_WRITE_LOG_COUNT))) { 699 if (error) {
700 cancelflags = 0; 700 cancelflags = 0;
701 goto error0; 701 goto error0;
702 } 702 }
@@ -751,7 +751,6 @@ xfs_qm_dqlookup(
751{ 751{
752 xfs_dquot_t *dqp; 752 xfs_dquot_t *dqp;
753 uint flist_locked; 753 uint flist_locked;
754 xfs_dquot_t *d;
755 754
756 ASSERT(mutex_is_locked(&qh->qh_lock)); 755 ASSERT(mutex_is_locked(&qh->qh_lock));
757 756
@@ -760,7 +759,7 @@ xfs_qm_dqlookup(
760 /* 759 /*
761 * Traverse the hashchain looking for a match 760 * Traverse the hashchain looking for a match
762 */ 761 */
763 for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) { 762 list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
764 /* 763 /*
765 * We already have the hashlock. We don't need the 764 * We already have the hashlock. We don't need the
766 * dqlock to look at the id field of the dquot, since the 765 * dqlock to look at the id field of the dquot, since the
@@ -772,12 +771,12 @@ xfs_qm_dqlookup(
772 /* 771 /*
773 * All in core dquots must be on the dqlist of mp 772 * All in core dquots must be on the dqlist of mp
774 */ 773 */
775 ASSERT(dqp->MPL_PREVP != NULL); 774 ASSERT(!list_empty(&dqp->q_mplist));
776 775
777 xfs_dqlock(dqp); 776 xfs_dqlock(dqp);
778 if (dqp->q_nrefs == 0) { 777 if (dqp->q_nrefs == 0) {
779 ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); 778 ASSERT(!list_empty(&dqp->q_freelist));
780 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 779 if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
781 trace_xfs_dqlookup_want(dqp); 780 trace_xfs_dqlookup_want(dqp);
782 781
783 /* 782 /*
@@ -787,7 +786,7 @@ xfs_qm_dqlookup(
787 */ 786 */
788 dqp->dq_flags |= XFS_DQ_WANT; 787 dqp->dq_flags |= XFS_DQ_WANT;
789 xfs_dqunlock(dqp); 788 xfs_dqunlock(dqp);
790 xfs_qm_freelist_lock(xfs_Gqm); 789 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
791 xfs_dqlock(dqp); 790 xfs_dqlock(dqp);
792 dqp->dq_flags &= ~(XFS_DQ_WANT); 791 dqp->dq_flags &= ~(XFS_DQ_WANT);
793 } 792 }
@@ -802,46 +801,28 @@ xfs_qm_dqlookup(
802 801
803 if (flist_locked) { 802 if (flist_locked) {
804 if (dqp->q_nrefs != 0) { 803 if (dqp->q_nrefs != 0) {
805 xfs_qm_freelist_unlock(xfs_Gqm); 804 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
806 flist_locked = B_FALSE; 805 flist_locked = B_FALSE;
807 } else { 806 } else {
808 /* 807 /* take it off the freelist */
809 * take it off the freelist
810 */
811 trace_xfs_dqlookup_freelist(dqp); 808 trace_xfs_dqlookup_freelist(dqp);
812 XQM_FREELIST_REMOVE(dqp); 809 list_del_init(&dqp->q_freelist);
813 /* xfs_qm_freelist_print(&(xfs_Gqm-> 810 xfs_Gqm->qm_dqfrlist_cnt--;
814 qm_dqfreelist),
815 "after removal"); */
816 } 811 }
817 } 812 }
818 813
819 /*
820 * grab a reference
821 */
822 XFS_DQHOLD(dqp); 814 XFS_DQHOLD(dqp);
823 815
824 if (flist_locked) 816 if (flist_locked)
825 xfs_qm_freelist_unlock(xfs_Gqm); 817 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
826 /* 818 /*
827 * move the dquot to the front of the hashchain 819 * move the dquot to the front of the hashchain
828 */ 820 */
829 ASSERT(mutex_is_locked(&qh->qh_lock)); 821 ASSERT(mutex_is_locked(&qh->qh_lock));
830 if (dqp->HL_PREVP != &qh->qh_next) { 822 list_move(&dqp->q_hashlist, &qh->qh_list);
831 trace_xfs_dqlookup_move(dqp);
832 if ((d = dqp->HL_NEXT))
833 d->HL_PREVP = dqp->HL_PREVP;
834 *(dqp->HL_PREVP) = d;
835 d = qh->qh_next;
836 d->HL_PREVP = &dqp->HL_NEXT;
837 dqp->HL_NEXT = d;
838 dqp->HL_PREVP = &qh->qh_next;
839 qh->qh_next = dqp;
840 }
841 trace_xfs_dqlookup_done(dqp); 823 trace_xfs_dqlookup_done(dqp);
842 *O_dqpp = dqp; 824 *O_dqpp = dqp;
843 ASSERT(mutex_is_locked(&qh->qh_lock)); 825 return 0;
844 return (0);
845 } 826 }
846 } 827 }
847 828
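
The dqlookup hunk replaces roughly nine lines of hand-rolled HL_NEXT/HL_PREVP pointer surgery with a single list_move(), which unlinks the entry and re-adds it at the head of the chain: a cheap move-to-front heuristic for hot hash chains. A self-contained userspace re-creation of the pattern (the tiny list implementation below mimics the kernel's include/linux/list.h but is not it):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *n, struct list_head *h)
{
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
}

static void list_del(struct list_head *e)
{
        e->prev->next = e->next; e->next->prev = e->prev;
}

/* list_move(): unlink from wherever it is, re-add at the head */
static void list_move(struct list_head *e, struct list_head *h)
{
        list_del(e); list_add(e, h);
}

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct dquot { int id; struct list_head q_hashlist; };

int main(void)
{
        struct list_head chain;
        struct dquot a = { 1, { 0, 0 } }, b = { 2, { 0, 0 } }, c = { 3, { 0, 0 } };
        struct list_head *p;

        INIT_LIST_HEAD(&chain);
        list_add(&c.q_hashlist, &chain);
        list_add(&b.q_hashlist, &chain);
        list_add(&a.q_hashlist, &chain);        /* chain: a b c */

        list_move(&c.q_hashlist, &chain);       /* hit on c: chain is now c a b */

        for (p = chain.next; p != &chain; p = p->next)
                printf("%d ", container_of(p, struct dquot, q_hashlist)->id);
        printf("\n");                           /* prints: 3 1 2 */
        return 0;
}
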
@@ -975,16 +956,17 @@ xfs_qm_dqget(
975 */ 956 */
976 if (ip) { 957 if (ip) {
977 xfs_ilock(ip, XFS_ILOCK_EXCL); 958 xfs_ilock(ip, XFS_ILOCK_EXCL);
978 if (! XFS_IS_DQTYPE_ON(mp, type)) { 959
979 /* inode stays locked on return */
980 xfs_qm_dqdestroy(dqp);
981 return XFS_ERROR(ESRCH);
982 }
983 /* 960 /*
984 * A dquot could be attached to this inode by now, since 961 * A dquot could be attached to this inode by now, since
985 * we had dropped the ilock. 962 * we had dropped the ilock.
986 */ 963 */
987 if (type == XFS_DQ_USER) { 964 if (type == XFS_DQ_USER) {
965 if (!XFS_IS_UQUOTA_ON(mp)) {
966 /* inode stays locked on return */
967 xfs_qm_dqdestroy(dqp);
968 return XFS_ERROR(ESRCH);
969 }
988 if (ip->i_udquot) { 970 if (ip->i_udquot) {
989 xfs_qm_dqdestroy(dqp); 971 xfs_qm_dqdestroy(dqp);
990 dqp = ip->i_udquot; 972 dqp = ip->i_udquot;
@@ -992,6 +974,11 @@ xfs_qm_dqget(
992 goto dqret; 974 goto dqret;
993 } 975 }
994 } else { 976 } else {
977 if (!XFS_IS_OQUOTA_ON(mp)) {
978 /* inode stays locked on return */
979 xfs_qm_dqdestroy(dqp);
980 return XFS_ERROR(ESRCH);
981 }
995 if (ip->i_gdquot) { 982 if (ip->i_gdquot) {
996 xfs_qm_dqdestroy(dqp); 983 xfs_qm_dqdestroy(dqp);
997 dqp = ip->i_gdquot; 984 dqp = ip->i_gdquot;
@@ -1033,13 +1020,14 @@ xfs_qm_dqget(
1033 */ 1020 */
1034 ASSERT(mutex_is_locked(&h->qh_lock)); 1021 ASSERT(mutex_is_locked(&h->qh_lock));
1035 dqp->q_hash = h; 1022 dqp->q_hash = h;
1036 XQM_HASHLIST_INSERT(h, dqp); 1023 list_add(&dqp->q_hashlist, &h->qh_list);
1024 h->qh_version++;
1037 1025
1038 /* 1026 /*
1039 * Attach this dquot to this filesystem's list of all dquots, 1027 * Attach this dquot to this filesystem's list of all dquots,
1040 * kept inside the mount structure in m_quotainfo field 1028 * kept inside the mount structure in m_quotainfo field
1041 */ 1029 */
1042 xfs_qm_mplist_lock(mp); 1030 mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
1043 1031
1044 /* 1032 /*
1045 * We return a locked dquot to the caller, with a reference taken 1033 * We return a locked dquot to the caller, with a reference taken
@@ -1047,9 +1035,9 @@ xfs_qm_dqget(
1047 xfs_dqlock(dqp); 1035 xfs_dqlock(dqp);
1048 dqp->q_nrefs = 1; 1036 dqp->q_nrefs = 1;
1049 1037
1050 XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); 1038 list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
1051 1039 mp->m_quotainfo->qi_dquots++;
1052 xfs_qm_mplist_unlock(mp); 1040 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1053 mutex_unlock(&h->qh_lock); 1041 mutex_unlock(&h->qh_lock);
1054 dqret: 1042 dqret:
1055 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1043 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -1086,10 +1074,10 @@ xfs_qm_dqput(
1086 * drop the dqlock and acquire the freelist and dqlock 1074 * drop the dqlock and acquire the freelist and dqlock
1087 * in the right order; but try to get it out-of-order first 1075 * in the right order; but try to get it out-of-order first
1088 */ 1076 */
1089 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 1077 if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
1090 trace_xfs_dqput_wait(dqp); 1078 trace_xfs_dqput_wait(dqp);
1091 xfs_dqunlock(dqp); 1079 xfs_dqunlock(dqp);
1092 xfs_qm_freelist_lock(xfs_Gqm); 1080 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1093 xfs_dqlock(dqp); 1081 xfs_dqlock(dqp);
1094 } 1082 }
1095 1083
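
The dqput hunk keeps the pre-existing trylock dance while swapping the wrappers for a bare mutex: the lock order is freelist lock before dquot lock, so with the dquot lock already held the code first tries the freelist lock out of order, and only on failure drops the dquot lock, takes both in order, and revalidates. A sketch of the same pattern with POSIX mutexes (build with -lpthread; names are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t freelist_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dq_lock = PTHREAD_MUTEX_INITIALIZER;

/* caller holds dq_lock; returns with both locks held, deadlock-free */
static void lock_freelist_with_dq_held(void)
{
        if (pthread_mutex_trylock(&freelist_lock) != 0) {
                /* out-of-order attempt failed: back off, retry in order */
                pthread_mutex_unlock(&dq_lock);
                pthread_mutex_lock(&freelist_lock);
                pthread_mutex_lock(&dq_lock);
                /*
                 * dq_lock was dropped for a moment; the caller must
                 * re-check any state it derived before the retry.
                 */
        }
}

int main(void)
{
        pthread_mutex_lock(&dq_lock);
        lock_freelist_with_dq_held();
        printf("holding both locks in lock-order-safe fashion\n");
        pthread_mutex_unlock(&dq_lock);
        pthread_mutex_unlock(&freelist_lock);
        return 0;
}
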
@@ -1100,10 +1088,8 @@ xfs_qm_dqput(
1100 if (--dqp->q_nrefs == 0) { 1088 if (--dqp->q_nrefs == 0) {
1101 trace_xfs_dqput_free(dqp); 1089 trace_xfs_dqput_free(dqp);
1102 1090
1103 /* 1091 list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
1104 * insert at end of the freelist. 1092 xfs_Gqm->qm_dqfrlist_cnt++;
1105 */
1106 XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
1107 1093
1108 /* 1094 /*
1109 * If we just added a udquot to the freelist, then 1095 * If we just added a udquot to the freelist, then
@@ -1118,10 +1104,6 @@ xfs_qm_dqput(
1118 xfs_dqlock(gdqp); 1104 xfs_dqlock(gdqp);
1119 dqp->q_gdquot = NULL; 1105 dqp->q_gdquot = NULL;
1120 } 1106 }
1121
1122 /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
1123 "@@@@@++ Free list (after append) @@@@@+");
1124 */
1125 } 1107 }
1126 xfs_dqunlock(dqp); 1108 xfs_dqunlock(dqp);
1127 1109
@@ -1133,7 +1115,7 @@ xfs_qm_dqput(
1133 break; 1115 break;
1134 dqp = gdqp; 1116 dqp = gdqp;
1135 } 1117 }
1136 xfs_qm_freelist_unlock(xfs_Gqm); 1118 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1137} 1119}
1138 1120
1139/* 1121/*
@@ -1386,10 +1368,10 @@ int
1386xfs_qm_dqpurge( 1368xfs_qm_dqpurge(
1387 xfs_dquot_t *dqp) 1369 xfs_dquot_t *dqp)
1388{ 1370{
1389 xfs_dqhash_t *thishash; 1371 xfs_dqhash_t *qh = dqp->q_hash;
1390 xfs_mount_t *mp = dqp->q_mount; 1372 xfs_mount_t *mp = dqp->q_mount;
1391 1373
1392 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 1374 ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
1393 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); 1375 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
1394 1376
1395 xfs_dqlock(dqp); 1377 xfs_dqlock(dqp);
@@ -1407,7 +1389,7 @@ xfs_qm_dqpurge(
1407 return (1); 1389 return (1);
1408 } 1390 }
1409 1391
1410 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); 1392 ASSERT(!list_empty(&dqp->q_freelist));
1411 1393
1412 /* 1394 /*
1413 * If we're turning off quotas, we have to make sure that, for 1395 * If we're turning off quotas, we have to make sure that, for
@@ -1452,14 +1434,16 @@ xfs_qm_dqpurge(
1452 ASSERT(XFS_FORCED_SHUTDOWN(mp) || 1434 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
1453 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); 1435 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
1454 1436
1455 thishash = dqp->q_hash; 1437 list_del_init(&dqp->q_hashlist);
1456 XQM_HASHLIST_REMOVE(thishash, dqp); 1438 qh->qh_version++;
1457 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp); 1439 list_del_init(&dqp->q_mplist);
1440 mp->m_quotainfo->qi_dqreclaims++;
1441 mp->m_quotainfo->qi_dquots--;
1458 /* 1442 /*
1459 * XXX Move this to the front of the freelist, if we can get the 1443 * XXX Move this to the front of the freelist, if we can get the
1460 * freelist lock. 1444 * freelist lock.
1461 */ 1445 */
1462 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); 1446 ASSERT(!list_empty(&dqp->q_freelist));
1463 1447
1464 dqp->q_mount = NULL; 1448 dqp->q_mount = NULL;
1465 dqp->q_hash = NULL; 1449 dqp->q_hash = NULL;
@@ -1467,7 +1451,7 @@ xfs_qm_dqpurge(
1467 memset(&dqp->q_core, 0, sizeof(dqp->q_core)); 1451 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
1468 xfs_dqfunlock(dqp); 1452 xfs_dqfunlock(dqp);
1469 xfs_dqunlock(dqp); 1453 xfs_dqunlock(dqp);
1470 mutex_unlock(&thishash->qh_lock); 1454 mutex_unlock(&qh->qh_lock);
1471 return (0); 1455 return (0);
1472} 1456}
1473 1457
@@ -1517,6 +1501,7 @@ void
1517xfs_qm_dqflock_pushbuf_wait( 1501xfs_qm_dqflock_pushbuf_wait(
1518 xfs_dquot_t *dqp) 1502 xfs_dquot_t *dqp)
1519{ 1503{
1504 xfs_mount_t *mp = dqp->q_mount;
1520 xfs_buf_t *bp; 1505 xfs_buf_t *bp;
1521 1506
1522 /* 1507 /*
@@ -1525,14 +1510,14 @@ xfs_qm_dqflock_pushbuf_wait(
1525 * out immediately. We'll be able to acquire 1510 * out immediately. We'll be able to acquire
1526 * the flush lock when the I/O completes. 1511 * the flush lock when the I/O completes.
1527 */ 1512 */
1528 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, 1513 bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
1529 XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK); 1514 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
1530 if (!bp) 1515 if (!bp)
1531 goto out_lock; 1516 goto out_lock;
1532 1517
1533 if (XFS_BUF_ISDELAYWRITE(bp)) { 1518 if (XFS_BUF_ISDELAYWRITE(bp)) {
1534 if (XFS_BUF_ISPINNED(bp)) 1519 if (XFS_BUF_ISPINNED(bp))
1535 xfs_log_force(dqp->q_mount, 0); 1520 xfs_log_force(mp, 0);
1536 xfs_buf_delwri_promote(bp); 1521 xfs_buf_delwri_promote(bp);
1537 wake_up_process(bp->b_target->bt_task); 1522 wake_up_process(bp->b_target->bt_task);
1538 } 1523 }
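
Throughout the file, accessor macros such as XFS_QI_DQCHUNKLEN(mp) give way to fetching mp->m_quotainfo once and using plain field access, which shortens the lines and makes the pointer chase explicit. A small sketch of the two styles (the struct layouts here are simplified stand-ins):

#include <stdio.h>

struct quotainfo { unsigned int qi_dqchunklen, qi_dqperchunk; };
struct mount     { struct quotainfo *m_quotainfo; };

/* old style: every use hides a pointer chase behind a macro */
#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)

static void report(struct mount *mp)
{
        /* new style: dereference once, then use plain field access */
        struct quotainfo *q = mp->m_quotainfo;

        printf("chunklen=%u perchunk=%u\n",
               q->qi_dqchunklen, q->qi_dqperchunk);
}

int main(void)
{
        struct quotainfo qi = { 4096, 30 };
        struct mount mp = { &qi };

        printf("via macro: %u\n", XFS_QI_DQCHUNKLEN(&mp));
        report(&mp);
        return 0;
}
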
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index a0f7da586d1b..5da3a23b820d 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -33,40 +33,23 @@
33 * The hash chain headers (hash buckets) 33 * The hash chain headers (hash buckets)
34 */ 34 */
35typedef struct xfs_dqhash { 35typedef struct xfs_dqhash {
36 struct xfs_dquot *qh_next; 36 struct list_head qh_list;
37 struct mutex qh_lock; 37 struct mutex qh_lock;
38 uint qh_version; /* ever increasing version */ 38 uint qh_version; /* ever increasing version */
39 uint qh_nelems; /* number of dquots on the list */ 39 uint qh_nelems; /* number of dquots on the list */
40} xfs_dqhash_t; 40} xfs_dqhash_t;
41 41
42typedef struct xfs_dqlink {
43 struct xfs_dquot *ql_next; /* forward link */
44 struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */
45} xfs_dqlink_t;
46
47struct xfs_mount; 42struct xfs_mount;
48struct xfs_trans; 43struct xfs_trans;
49 44
50/* 45/*
51 * This is the marker which is designed to occupy the first few
52 * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
53 * must come first.
54 * This serves as the marker ("sentinel") when we have to restart list
55 * iterations because of locking considerations.
56 */
57typedef struct xfs_dqmarker {
58 struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */
59 struct xfs_dquot*dqm_flprev;
60 xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */
61 xfs_dqlink_t dqm_hashlist; /* link to the hash chain */
62 uint dqm_flags; /* various flags (XFS_DQ_*) */
63} xfs_dqmarker_t;
64
65/*
66 * The incore dquot structure 46 * The incore dquot structure
67 */ 47 */
68typedef struct xfs_dquot { 48typedef struct xfs_dquot {
69 xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */ 49 uint dq_flags; /* various flags (XFS_DQ_*) */
50 struct list_head q_freelist; /* global free list of dquots */
51 struct list_head q_mplist; /* mount's list of dquots */
 52 struct list_head q_hashlist; /* global hash list of dquots */
70 xfs_dqhash_t *q_hash; /* the hashchain header */ 53 xfs_dqhash_t *q_hash; /* the hashchain header */
71 struct xfs_mount*q_mount; /* filesystem this relates to */ 54 struct xfs_mount*q_mount; /* filesystem this relates to */
72 struct xfs_trans*q_transp; /* trans this belongs to currently */ 55 struct xfs_trans*q_transp; /* trans this belongs to currently */
@@ -87,13 +70,6 @@ typedef struct xfs_dquot {
87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ 70 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
88} xfs_dquot_t; 71} xfs_dquot_t;
89 72
90
91#define dq_flnext q_lists.dqm_flnext
92#define dq_flprev q_lists.dqm_flprev
93#define dq_mplist q_lists.dqm_mplist
94#define dq_hashlist q_lists.dqm_hashlist
95#define dq_flags q_lists.dqm_flags
96
97/* 73/*
98 * Lock hierarchy for q_qlock: 74 * Lock hierarchy for q_qlock:
99 * XFS_QLOCK_NORMAL is the implicit default, 75 * XFS_QLOCK_NORMAL is the implicit default,
@@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
127} 103}
128 104
129#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) 105#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
130#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
131#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) 106#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
132#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 107#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
133#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) 108#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
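
The header change is the heart of the series: the xfs_dqmarker sentinel and xfs_dqlink_t pairs are replaced by three independently embedded struct list_head members, so one dquot can sit on the freelist, the per-mount list, and a hash chain at the same time, and list_empty() replaces the old HL_PREVP/MPL_PREVP NULL checks and the XFS_DQ_IS_ON_FREELIST macro. A compact userspace illustration (the list helpers mirror the kernel's but are re-implemented here):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static void list_add(struct list_head *n, struct list_head *h)
{
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
}
static int list_empty(const struct list_head *h) { return h->next == h; }

/*
 * One object, three possible memberships: each embedded node is an
 * independent link, so a dquot can be on the freelist, the per-mount
 * list and a hash chain all at once.
 */
struct dquot {
        unsigned int     dq_flags;
        struct list_head q_freelist;
        struct list_head q_mplist;
        struct list_head q_hashlist;
};

int main(void)
{
        struct list_head mplist, hashchain;
        struct dquot dq = { 0 };

        INIT_LIST_HEAD(&mplist);
        INIT_LIST_HEAD(&hashchain);
        INIT_LIST_HEAD(&dq.q_freelist);
        INIT_LIST_HEAD(&dq.q_mplist);
        INIT_LIST_HEAD(&dq.q_hashlist);

        list_add(&dq.q_mplist, &mplist);
        list_add(&dq.q_hashlist, &hashchain);

        /* list_empty() replaces the old prev-pointer NULL checks */
        printf("on mplist: %d, on hash: %d, on freelist: %d\n",
               !list_empty(&dq.q_mplist), !list_empty(&dq.q_hashlist),
               !list_empty(&dq.q_freelist));
        return 0;
}
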
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 4e4ee9a57194..8d89a24ae324 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -107,8 +107,7 @@ xfs_qm_dquot_logitem_pin(
107/* ARGSUSED */ 107/* ARGSUSED */
108STATIC void 108STATIC void
109xfs_qm_dquot_logitem_unpin( 109xfs_qm_dquot_logitem_unpin(
110 xfs_dq_logitem_t *logitem, 110 xfs_dq_logitem_t *logitem)
111 int stale)
112{ 111{
113 xfs_dquot_t *dqp = logitem->qli_dquot; 112 xfs_dquot_t *dqp = logitem->qli_dquot;
114 113
@@ -123,7 +122,7 @@ xfs_qm_dquot_logitem_unpin_remove(
123 xfs_dq_logitem_t *logitem, 122 xfs_dq_logitem_t *logitem,
124 xfs_trans_t *tp) 123 xfs_trans_t *tp)
125{ 124{
126 xfs_qm_dquot_logitem_unpin(logitem, 0); 125 xfs_qm_dquot_logitem_unpin(logitem);
127} 126}
128 127
129/* 128/*
@@ -228,7 +227,7 @@ xfs_qm_dquot_logitem_pushbuf(
228 } 227 }
229 mp = dqp->q_mount; 228 mp = dqp->q_mount;
230 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, 229 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
231 XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK); 230 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
232 xfs_dqunlock(dqp); 231 xfs_dqunlock(dqp);
233 if (!bp) 232 if (!bp)
234 return; 233 return;
@@ -329,8 +328,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
329 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 328 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
330 xfs_qm_dquot_logitem_format, 329 xfs_qm_dquot_logitem_format,
331 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, 330 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
332 .iop_unpin = (void(*)(xfs_log_item_t*, int)) 331 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
333 xfs_qm_dquot_logitem_unpin,
334 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 332 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
335 xfs_qm_dquot_logitem_unpin_remove, 333 xfs_qm_dquot_logitem_unpin_remove,
336 .iop_trylock = (uint(*)(xfs_log_item_t*)) 334 .iop_trylock = (uint(*)(xfs_log_item_t*))
@@ -357,9 +355,8 @@ xfs_qm_dquot_logitem_init(
357 xfs_dq_logitem_t *lp; 355 xfs_dq_logitem_t *lp;
358 lp = &dqp->q_logitem; 356 lp = &dqp->q_logitem;
359 357
360 lp->qli_item.li_type = XFS_LI_DQUOT; 358 xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
361 lp->qli_item.li_ops = &xfs_dquot_item_ops; 359 &xfs_dquot_item_ops);
362 lp->qli_item.li_mountp = dqp->q_mount;
363 lp->qli_dquot = dqp; 360 lp->qli_dquot = dqp;
364 lp->qli_format.qlf_type = XFS_LI_DQUOT; 361 lp->qli_format.qlf_type = XFS_LI_DQUOT;
365 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); 362 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
@@ -426,7 +423,7 @@ xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
426 */ 423 */
427/*ARGSUSED*/ 424/*ARGSUSED*/
428STATIC void 425STATIC void
429xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale) 426xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
430{ 427{
431 return; 428 return;
432} 429}
@@ -537,8 +534,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
537 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 534 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
538 xfs_qm_qoff_logitem_format, 535 xfs_qm_qoff_logitem_format,
539 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 536 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
540 .iop_unpin = (void(*)(xfs_log_item_t* ,int)) 537 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
541 xfs_qm_qoff_logitem_unpin,
542 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 538 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
543 xfs_qm_qoff_logitem_unpin_remove, 539 xfs_qm_qoff_logitem_unpin_remove,
544 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 540 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -559,8 +555,7 @@ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
559 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 555 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
560 xfs_qm_qoff_logitem_format, 556 xfs_qm_qoff_logitem_format,
561 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 557 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
562 .iop_unpin = (void(*)(xfs_log_item_t*, int)) 558 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
563 xfs_qm_qoff_logitem_unpin,
564 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 559 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
565 xfs_qm_qoff_logitem_unpin_remove, 560 xfs_qm_qoff_logitem_unpin_remove,
566 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 561 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -586,11 +581,8 @@ xfs_qm_qoff_logitem_init(
586 581
587 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); 582 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
588 583
589 qf->qql_item.li_type = XFS_LI_QUOTAOFF; 584 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
590 if (start) 585 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
591 qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
592 else
593 qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
594 qf->qql_item.li_mountp = mp; 586 qf->qql_item.li_mountp = mp;
595 qf->qql_format.qf_type = XFS_LI_QUOTAOFF; 587 qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
596 qf->qql_format.qf_flags = flags; 588 qf->qql_format.qf_flags = flags;
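
Both log-item diffs above collapse three open-coded assignments (li_type, li_ops, li_mountp) into one xfs_log_item_init() call, so a future field only needs initializing in one place. A sketch of that consolidation (the struct and function names below are simplified stand-ins, not the XFS API):

#include <stdio.h>

struct ops { const char *name; };
struct log_item {
        int              li_type;
        const struct ops *li_ops;
        void             *li_mountp;
};

/* one constructor instead of three assignments at every call site */
static void log_item_init(void *mp, struct log_item *lip, int type,
                          const struct ops *ops)
{
        lip->li_type = type;
        lip->li_ops = ops;
        lip->li_mountp = mp;
}

static const struct ops dquot_ops = { "dquot" };
static const struct ops qoff_ops  = { "quotaoff" };

int main(void)
{
        struct log_item item;
        int start = 1;

        /* callers can still pick ops conditionally, as the qoff code does */
        log_item_init(NULL, &item, 42, start ? &qoff_ops : &dquot_ops);
        printf("item type %d uses %s ops\n", item.li_type, item.li_ops->name);
        return 0;
}
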
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 417e61e3d9dd..38e764146644 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -67,9 +67,6 @@ static cred_t xfs_zerocr;
67STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 67STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
68STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 68STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
69 69
70STATIC void xfs_qm_freelist_init(xfs_frlist_t *);
71STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *);
72
73STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 70STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
74STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 71STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
75STATIC int xfs_qm_shake(int, gfp_t); 72STATIC int xfs_qm_shake(int, gfp_t);
@@ -84,21 +81,25 @@ extern struct mutex qcheck_lock;
84#endif 81#endif
85 82
86#ifdef QUOTADEBUG 83#ifdef QUOTADEBUG
87#define XQM_LIST_PRINT(l, NXT, title) \ 84static void
88{ \ 85xfs_qm_dquot_list_print(
89 xfs_dquot_t *dqp; int i = 0; \ 86 struct xfs_mount *mp)
90 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 87{
91 for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \ 88 xfs_dquot_t *dqp;
92 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \ 89 int i = 0;
93 "bcnt = %d, icnt = %d, refs = %d", \ 90
 94 ++i, (int) be32_to_cpu(dqp->q_core.d_id), \ 91 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist, q_mplist) {
95 DQFLAGTO_TYPESTR(dqp), \ 92 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" "
96 (int) be64_to_cpu(dqp->q_core.d_bcount), \ 93 "bcnt = %lld, icnt = %lld, refs = %d",
97 (int) be64_to_cpu(dqp->q_core.d_icount), \ 94 i++, be32_to_cpu(dqp->q_core.d_id),
98 (int) dqp->q_nrefs); } \ 95 DQFLAGTO_TYPESTR(dqp),
96 (long long)be64_to_cpu(dqp->q_core.d_bcount),
97 (long long)be64_to_cpu(dqp->q_core.d_icount),
98 dqp->q_nrefs);
99 }
99} 100}
100#else 101#else
101#define XQM_LIST_PRINT(l, NXT, title) do { } while (0) 102static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { }
102#endif 103#endif
103 104
104/* 105/*
@@ -144,7 +145,9 @@ xfs_Gqm_init(void)
144 /* 145 /*
145 * Freelist of all dquots of all file systems 146 * Freelist of all dquots of all file systems
146 */ 147 */
147 xfs_qm_freelist_init(&(xqm->qm_dqfreelist)); 148 INIT_LIST_HEAD(&xqm->qm_dqfrlist);
149 xqm->qm_dqfrlist_cnt = 0;
150 mutex_init(&xqm->qm_dqfrlist_lock);
148 151
149 /* 152 /*
150 * dquot zone. we register our own low-memory callback. 153 * dquot zone. we register our own low-memory callback.
@@ -189,6 +192,7 @@ STATIC void
189xfs_qm_destroy( 192xfs_qm_destroy(
190 struct xfs_qm *xqm) 193 struct xfs_qm *xqm)
191{ 194{
195 struct xfs_dquot *dqp, *n;
192 int hsize, i; 196 int hsize, i;
193 197
194 ASSERT(xqm != NULL); 198 ASSERT(xqm != NULL);
@@ -204,7 +208,21 @@ xfs_qm_destroy(
204 xqm->qm_usr_dqhtable = NULL; 208 xqm->qm_usr_dqhtable = NULL;
205 xqm->qm_grp_dqhtable = NULL; 209 xqm->qm_grp_dqhtable = NULL;
206 xqm->qm_dqhashmask = 0; 210 xqm->qm_dqhashmask = 0;
207 xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist)); 211
212 /* frlist cleanup */
213 mutex_lock(&xqm->qm_dqfrlist_lock);
214 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
215 xfs_dqlock(dqp);
216#ifdef QUOTADEBUG
217 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
218#endif
219 list_del_init(&dqp->q_freelist);
220 xfs_Gqm->qm_dqfrlist_cnt--;
221 xfs_dqunlock(dqp);
222 xfs_qm_dqdestroy(dqp);
223 }
224 mutex_unlock(&xqm->qm_dqfrlist_lock);
225 mutex_destroy(&xqm->qm_dqfrlist_lock);
208#ifdef DEBUG 226#ifdef DEBUG
209 mutex_destroy(&qcheck_lock); 227 mutex_destroy(&qcheck_lock);
210#endif 228#endif
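
The destroy path now walks the freelist with list_for_each_entry_safe(), which caches the next node before the loop body runs so the current dquot can be unlinked and freed mid-walk. A self-contained sketch of the safe-iteration macro (re-implemented here with an explicit type argument, since plain userspace C lacks the typeof trick the kernel macro uses):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct list_head { struct list_head *next, *prev; };
static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static void list_add(struct list_head *n, struct list_head *h)
{
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
}
static void list_del_init(struct list_head *e)
{
        e->prev->next = e->next; e->next->prev = e->prev;
        INIT_LIST_HEAD(e);
}

#define container_of(p, T, m) ((T *)((char *)(p) - offsetof(T, m)))

/* the _safe variant caches the next node so the current one may be freed */
#define list_for_each_entry_safe(pos, n, head, T, member)                 \
        for (pos = container_of((head)->next, T, member),                 \
             n = container_of(pos->member.next, T, member);               \
             &pos->member != (head);                                      \
             pos = n, n = container_of(n->member.next, T, member))

struct dquot { int id; struct list_head q_freelist; };

int main(void)
{
        struct list_head frlist;
        struct dquot *dqp, *n;
        int i;

        INIT_LIST_HEAD(&frlist);
        for (i = 0; i < 3; i++) {
                dqp = malloc(sizeof(*dqp));
                dqp->id = i;
                list_add(&dqp->q_freelist, &frlist);
        }
        /* destroy loop: unlink, then free, without touching freed memory */
        list_for_each_entry_safe(dqp, n, &frlist, struct dquot, q_freelist) {
                list_del_init(&dqp->q_freelist);
                printf("destroying dquot %d\n", dqp->id);
                free(dqp);
        }
        return 0;
}
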
@@ -256,7 +274,7 @@ STATIC void
256xfs_qm_rele_quotafs_ref( 274xfs_qm_rele_quotafs_ref(
257 struct xfs_mount *mp) 275 struct xfs_mount *mp)
258{ 276{
259 xfs_dquot_t *dqp, *nextdqp; 277 xfs_dquot_t *dqp, *n;
260 278
261 ASSERT(xfs_Gqm); 279 ASSERT(xfs_Gqm);
262 ASSERT(xfs_Gqm->qm_nrefs > 0); 280 ASSERT(xfs_Gqm->qm_nrefs > 0);
@@ -264,26 +282,24 @@ xfs_qm_rele_quotafs_ref(
264 /* 282 /*
265 * Go thru the freelist and destroy all inactive dquots. 283 * Go thru the freelist and destroy all inactive dquots.
266 */ 284 */
267 xfs_qm_freelist_lock(xfs_Gqm); 285 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
268 286
269 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; 287 list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
270 dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
271 xfs_dqlock(dqp); 288 xfs_dqlock(dqp);
272 nextdqp = dqp->dq_flnext;
273 if (dqp->dq_flags & XFS_DQ_INACTIVE) { 289 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
274 ASSERT(dqp->q_mount == NULL); 290 ASSERT(dqp->q_mount == NULL);
275 ASSERT(! XFS_DQ_IS_DIRTY(dqp)); 291 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
276 ASSERT(dqp->HL_PREVP == NULL); 292 ASSERT(list_empty(&dqp->q_hashlist));
277 ASSERT(dqp->MPL_PREVP == NULL); 293 ASSERT(list_empty(&dqp->q_mplist));
278 XQM_FREELIST_REMOVE(dqp); 294 list_del_init(&dqp->q_freelist);
295 xfs_Gqm->qm_dqfrlist_cnt--;
279 xfs_dqunlock(dqp); 296 xfs_dqunlock(dqp);
280 xfs_qm_dqdestroy(dqp); 297 xfs_qm_dqdestroy(dqp);
281 } else { 298 } else {
282 xfs_dqunlock(dqp); 299 xfs_dqunlock(dqp);
283 } 300 }
284 dqp = nextdqp;
285 } 301 }
286 xfs_qm_freelist_unlock(xfs_Gqm); 302 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
287 303
288 /* 304 /*
289 * Destroy the entire XQM. If somebody mounts with quotaon, this'll 305 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
@@ -305,7 +321,7 @@ xfs_qm_unmount(
305 struct xfs_mount *mp) 321 struct xfs_mount *mp)
306{ 322{
307 if (mp->m_quotainfo) { 323 if (mp->m_quotainfo) {
308 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); 324 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
309 xfs_qm_destroy_quotainfo(mp); 325 xfs_qm_destroy_quotainfo(mp);
310 } 326 }
311} 327}
@@ -449,20 +465,21 @@ xfs_qm_unmount_quotas(
449 */ 465 */
450STATIC int 466STATIC int
451xfs_qm_dqflush_all( 467xfs_qm_dqflush_all(
452 xfs_mount_t *mp, 468 struct xfs_mount *mp,
453 int sync_mode) 469 int sync_mode)
454{ 470{
455 int recl; 471 struct xfs_quotainfo *q = mp->m_quotainfo;
456 xfs_dquot_t *dqp; 472 int recl;
457 int niters; 473 struct xfs_dquot *dqp;
458 int error; 474 int niters;
475 int error;
459 476
460 if (mp->m_quotainfo == NULL) 477 if (!q)
461 return 0; 478 return 0;
462 niters = 0; 479 niters = 0;
463again: 480again:
464 xfs_qm_mplist_lock(mp); 481 mutex_lock(&q->qi_dqlist_lock);
465 FOREACH_DQUOT_IN_MP(dqp, mp) { 482 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
466 xfs_dqlock(dqp); 483 xfs_dqlock(dqp);
467 if (! XFS_DQ_IS_DIRTY(dqp)) { 484 if (! XFS_DQ_IS_DIRTY(dqp)) {
468 xfs_dqunlock(dqp); 485 xfs_dqunlock(dqp);
@@ -470,7 +487,7 @@ again:
470 } 487 }
471 488
472 /* XXX a sentinel would be better */ 489 /* XXX a sentinel would be better */
473 recl = XFS_QI_MPLRECLAIMS(mp); 490 recl = q->qi_dqreclaims;
474 if (!xfs_dqflock_nowait(dqp)) { 491 if (!xfs_dqflock_nowait(dqp)) {
475 /* 492 /*
476 * If we can't grab the flush lock then check 493 * If we can't grab the flush lock then check
@@ -485,21 +502,21 @@ again:
485 * Let go of the mplist lock. We don't want to hold it 502 * Let go of the mplist lock. We don't want to hold it
486 * across a disk write. 503 * across a disk write.
487 */ 504 */
488 xfs_qm_mplist_unlock(mp); 505 mutex_unlock(&q->qi_dqlist_lock);
489 error = xfs_qm_dqflush(dqp, sync_mode); 506 error = xfs_qm_dqflush(dqp, sync_mode);
490 xfs_dqunlock(dqp); 507 xfs_dqunlock(dqp);
491 if (error) 508 if (error)
492 return error; 509 return error;
493 510
494 xfs_qm_mplist_lock(mp); 511 mutex_lock(&q->qi_dqlist_lock);
495 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 512 if (recl != q->qi_dqreclaims) {
496 xfs_qm_mplist_unlock(mp); 513 mutex_unlock(&q->qi_dqlist_lock);
497 /* XXX restart limit */ 514 /* XXX restart limit */
498 goto again; 515 goto again;
499 } 516 }
500 } 517 }
501 518
502 xfs_qm_mplist_unlock(mp); 519 mutex_unlock(&q->qi_dqlist_lock);
503 /* return ! busy */ 520 /* return ! busy */
504 return 0; 521 return 0;
505} 522}
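
xfs_qm_dqflush_all() cannot hold qi_dqlist_lock across a disk write, so it snapshots qi_dqreclaims under the lock, drops the lock to flush, and restarts the whole walk if the counter moved while the lock was released. A minimal sketch of that generation-counter restart pattern (single-threaded, with the racing reclaim simulated by a direct call; the restart cap of 10 is illustrative):

#include <stdio.h>

static unsigned int dqreclaims;         /* bumped whenever the list shrinks */

static void concurrent_reclaim(void) { dqreclaims++; }

int main(void)
{
        int restarts = 0;
        unsigned int recl;

again:
        /* ... lock the list, find a dirty dquot ... */
        recl = dqreclaims;              /* snapshot under the lock */

        /* ... unlock the list and do the slow flush ... */
        if (restarts == 0)
                concurrent_reclaim();   /* simulate a racing reclaim */

        /* ... relock the list ... */
        if (recl != dqreclaims && ++restarts < 10)
                goto again;             /* list changed under us: start over */

        printf("walk finished after %d restart(s)\n", restarts);
        return 0;
}
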
@@ -509,15 +526,15 @@ again:
509 */ 526 */
510STATIC void 527STATIC void
511xfs_qm_detach_gdquots( 528xfs_qm_detach_gdquots(
512 xfs_mount_t *mp) 529 struct xfs_mount *mp)
513{ 530{
514 xfs_dquot_t *dqp, *gdqp; 531 struct xfs_quotainfo *q = mp->m_quotainfo;
515 int nrecl; 532 struct xfs_dquot *dqp, *gdqp;
533 int nrecl;
516 534
517 again: 535 again:
518 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 536 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
519 dqp = XFS_QI_MPLNEXT(mp); 537 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
520 while (dqp) {
521 xfs_dqlock(dqp); 538 xfs_dqlock(dqp);
522 if ((gdqp = dqp->q_gdquot)) { 539 if ((gdqp = dqp->q_gdquot)) {
523 xfs_dqlock(gdqp); 540 xfs_dqlock(gdqp);
@@ -530,15 +547,14 @@ xfs_qm_detach_gdquots(
530 * Can't hold the mplist lock across a dqput. 547 * Can't hold the mplist lock across a dqput.
531 * XXXmust convert to marker based iterations here. 548 * XXXmust convert to marker based iterations here.
532 */ 549 */
533 nrecl = XFS_QI_MPLRECLAIMS(mp); 550 nrecl = q->qi_dqreclaims;
534 xfs_qm_mplist_unlock(mp); 551 mutex_unlock(&q->qi_dqlist_lock);
535 xfs_qm_dqput(gdqp); 552 xfs_qm_dqput(gdqp);
536 553
537 xfs_qm_mplist_lock(mp); 554 mutex_lock(&q->qi_dqlist_lock);
538 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) 555 if (nrecl != q->qi_dqreclaims)
539 goto again; 556 goto again;
540 } 557 }
541 dqp = dqp->MPL_NEXT;
542 } 558 }
543} 559}
544 560
@@ -550,23 +566,23 @@ xfs_qm_detach_gdquots(
550 */ 566 */
551STATIC int 567STATIC int
552xfs_qm_dqpurge_int( 568xfs_qm_dqpurge_int(
553 xfs_mount_t *mp, 569 struct xfs_mount *mp,
554 uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */ 570 uint flags)
555{ 571{
556 xfs_dquot_t *dqp; 572 struct xfs_quotainfo *q = mp->m_quotainfo;
557 uint dqtype; 573 struct xfs_dquot *dqp, *n;
558 int nrecl; 574 uint dqtype;
559 xfs_dquot_t *nextdqp; 575 int nrecl;
560 int nmisses; 576 int nmisses;
561 577
562 if (mp->m_quotainfo == NULL) 578 if (!q)
563 return 0; 579 return 0;
564 580
565 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; 581 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
566 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; 582 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
567 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; 583 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
568 584
569 xfs_qm_mplist_lock(mp); 585 mutex_lock(&q->qi_dqlist_lock);
570 586
571 /* 587 /*
572 * In the first pass through all incore dquots of this filesystem, 588 * In the first pass through all incore dquots of this filesystem,
@@ -578,28 +594,25 @@ xfs_qm_dqpurge_int(
578 594
579 again: 595 again:
580 nmisses = 0; 596 nmisses = 0;
581 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 597 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
582 /* 598 /*
583 * Try to get rid of all of the unwanted dquots. The idea is to 599 * Try to get rid of all of the unwanted dquots. The idea is to
584 * get them off mplist and hashlist, but leave them on freelist. 600 * get them off mplist and hashlist, but leave them on freelist.
585 */ 601 */
586 dqp = XFS_QI_MPLNEXT(mp); 602 list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
587 while (dqp) {
588 /* 603 /*
589 * It's OK to look at the type without taking dqlock here. 604 * It's OK to look at the type without taking dqlock here.
590 * We're holding the mplist lock here, and that's needed for 605 * We're holding the mplist lock here, and that's needed for
591 * a dqreclaim. 606 * a dqreclaim.
592 */ 607 */
593 if ((dqp->dq_flags & dqtype) == 0) { 608 if ((dqp->dq_flags & dqtype) == 0)
594 dqp = dqp->MPL_NEXT;
595 continue; 609 continue;
596 }
597 610
598 if (!mutex_trylock(&dqp->q_hash->qh_lock)) { 611 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
599 nrecl = XFS_QI_MPLRECLAIMS(mp); 612 nrecl = q->qi_dqreclaims;
600 xfs_qm_mplist_unlock(mp); 613 mutex_unlock(&q->qi_dqlist_lock);
601 mutex_lock(&dqp->q_hash->qh_lock); 614 mutex_lock(&dqp->q_hash->qh_lock);
602 xfs_qm_mplist_lock(mp); 615 mutex_lock(&q->qi_dqlist_lock);
603 616
604 /* 617 /*
605 * XXXTheoretically, we can get into a very long 618 * XXXTheoretically, we can get into a very long
@@ -607,7 +620,7 @@ xfs_qm_dqpurge_int(
607 * No one can be adding dquots to the mplist at 620 * No one can be adding dquots to the mplist at
608 * this point, but somebody might be taking things off. 621 * this point, but somebody might be taking things off.
609 */ 622 */
610 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { 623 if (nrecl != q->qi_dqreclaims) {
611 mutex_unlock(&dqp->q_hash->qh_lock); 624 mutex_unlock(&dqp->q_hash->qh_lock);
612 goto again; 625 goto again;
613 } 626 }
@@ -617,11 +630,9 @@ xfs_qm_dqpurge_int(
617 * Take the dquot off the mplist and hashlist. It may remain on 630 * Take the dquot off the mplist and hashlist. It may remain on
618 * freelist in INACTIVE state. 631 * freelist in INACTIVE state.
619 */ 632 */
620 nextdqp = dqp->MPL_NEXT;
621 nmisses += xfs_qm_dqpurge(dqp); 633 nmisses += xfs_qm_dqpurge(dqp);
622 dqp = nextdqp;
623 } 634 }
624 xfs_qm_mplist_unlock(mp); 635 mutex_unlock(&q->qi_dqlist_lock);
625 return nmisses; 636 return nmisses;
626} 637}
627 638
@@ -921,12 +932,13 @@ xfs_qm_dqdetach(
921 932
922int 933int
923xfs_qm_sync( 934xfs_qm_sync(
924 xfs_mount_t *mp, 935 struct xfs_mount *mp,
925 int flags) 936 int flags)
926{ 937{
927 int recl, restarts; 938 struct xfs_quotainfo *q = mp->m_quotainfo;
928 xfs_dquot_t *dqp; 939 int recl, restarts;
929 int error; 940 struct xfs_dquot *dqp;
941 int error;
930 942
931 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 943 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
932 return 0; 944 return 0;
@@ -934,18 +946,19 @@ xfs_qm_sync(
934 restarts = 0; 946 restarts = 0;
935 947
936 again: 948 again:
937 xfs_qm_mplist_lock(mp); 949 mutex_lock(&q->qi_dqlist_lock);
938 /* 950 /*
 939 * dqpurge_all() also takes the mplist lock and iterates through all dquots 951 * dqpurge_all() also takes the mplist lock and iterates through all dquots
940 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared 952 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
941 * when we have the mplist lock, we know that dquots will be consistent 953 * when we have the mplist lock, we know that dquots will be consistent
942 * as long as we have it locked. 954 * as long as we have it locked.
943 */ 955 */
944 if (! XFS_IS_QUOTA_ON(mp)) { 956 if (!XFS_IS_QUOTA_ON(mp)) {
945 xfs_qm_mplist_unlock(mp); 957 mutex_unlock(&q->qi_dqlist_lock);
946 return 0; 958 return 0;
947 } 959 }
948 FOREACH_DQUOT_IN_MP(dqp, mp) { 960 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
961 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
949 /* 962 /*
950 * If this is vfs_sync calling, then skip the dquots that 963 * If this is vfs_sync calling, then skip the dquots that
951 * don't 'seem' to be dirty. ie. don't acquire dqlock. 964 * don't 'seem' to be dirty. ie. don't acquire dqlock.
@@ -969,7 +982,7 @@ xfs_qm_sync(
969 } 982 }
970 983
971 /* XXX a sentinel would be better */ 984 /* XXX a sentinel would be better */
972 recl = XFS_QI_MPLRECLAIMS(mp); 985 recl = q->qi_dqreclaims;
973 if (!xfs_dqflock_nowait(dqp)) { 986 if (!xfs_dqflock_nowait(dqp)) {
974 if (flags & SYNC_TRYLOCK) { 987 if (flags & SYNC_TRYLOCK) {
975 xfs_dqunlock(dqp); 988 xfs_dqunlock(dqp);
@@ -989,7 +1002,7 @@ xfs_qm_sync(
989 * Let go of the mplist lock. We don't want to hold it 1002 * Let go of the mplist lock. We don't want to hold it
990 * across a disk write 1003 * across a disk write
991 */ 1004 */
992 xfs_qm_mplist_unlock(mp); 1005 mutex_unlock(&q->qi_dqlist_lock);
993 error = xfs_qm_dqflush(dqp, flags); 1006 error = xfs_qm_dqflush(dqp, flags);
994 xfs_dqunlock(dqp); 1007 xfs_dqunlock(dqp);
995 if (error && XFS_FORCED_SHUTDOWN(mp)) 1008 if (error && XFS_FORCED_SHUTDOWN(mp))
@@ -997,17 +1010,17 @@ xfs_qm_sync(
997 else if (error) 1010 else if (error)
998 return error; 1011 return error;
999 1012
1000 xfs_qm_mplist_lock(mp); 1013 mutex_lock(&q->qi_dqlist_lock);
1001 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 1014 if (recl != q->qi_dqreclaims) {
1002 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) 1015 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
1003 break; 1016 break;
1004 1017
1005 xfs_qm_mplist_unlock(mp); 1018 mutex_unlock(&q->qi_dqlist_lock);
1006 goto again; 1019 goto again;
1007 } 1020 }
1008 } 1021 }
1009 1022
1010 xfs_qm_mplist_unlock(mp); 1023 mutex_unlock(&q->qi_dqlist_lock);
1011 return 0; 1024 return 0;
1012} 1025}
1013 1026
@@ -1052,8 +1065,9 @@ xfs_qm_init_quotainfo(
1052 return error; 1065 return error;
1053 } 1066 }
1054 1067
1055 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); 1068 INIT_LIST_HEAD(&qinf->qi_dqlist);
1056 lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class); 1069 mutex_init(&qinf->qi_dqlist_lock);
1070 lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
1057 1071
1058 qinf->qi_dqreclaims = 0; 1072 qinf->qi_dqreclaims = 0;
1059 1073
@@ -1150,7 +1164,8 @@ xfs_qm_destroy_quotainfo(
1150 */ 1164 */
1151 xfs_qm_rele_quotafs_ref(mp); 1165 xfs_qm_rele_quotafs_ref(mp);
1152 1166
1153 xfs_qm_list_destroy(&qi->qi_dqlist); 1167 ASSERT(list_empty(&qi->qi_dqlist));
1168 mutex_destroy(&qi->qi_dqlist_lock);
1154 1169
1155 if (qi->qi_uquotaip) { 1170 if (qi->qi_uquotaip) {
1156 IRELE(qi->qi_uquotaip); 1171 IRELE(qi->qi_uquotaip);
@@ -1177,7 +1192,7 @@ xfs_qm_list_init(
1177 int n) 1192 int n)
1178{ 1193{
1179 mutex_init(&list->qh_lock); 1194 mutex_init(&list->qh_lock);
1180 list->qh_next = NULL; 1195 INIT_LIST_HEAD(&list->qh_list);
1181 list->qh_version = 0; 1196 list->qh_version = 0;
1182 list->qh_nelems = 0; 1197 list->qh_nelems = 0;
1183} 1198}
@@ -1316,9 +1331,6 @@ xfs_qm_qino_alloc(
1316 */ 1331 */
1317 spin_lock(&mp->m_sb_lock); 1332 spin_lock(&mp->m_sb_lock);
1318 if (flags & XFS_QMOPT_SBVERSION) { 1333 if (flags & XFS_QMOPT_SBVERSION) {
1319#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1320 unsigned oldv = mp->m_sb.sb_versionnum;
1321#endif
1322 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 1334 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
1323 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 1335 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1324 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == 1336 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
@@ -1331,11 +1343,6 @@ xfs_qm_qino_alloc(
1331 1343
1332 /* qflags will get updated _after_ quotacheck */ 1344 /* qflags will get updated _after_ quotacheck */
1333 mp->m_sb.sb_qflags = 0; 1345 mp->m_sb.sb_qflags = 0;
1334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1335 cmn_err(CE_NOTE,
1336 "Old superblock version %x, converting to %x.",
1337 oldv, mp->m_sb.sb_versionnum);
1338#endif
1339 } 1346 }
1340 if (flags & XFS_QMOPT_UQUOTA) 1347 if (flags & XFS_QMOPT_UQUOTA)
1341 mp->m_sb.sb_uquotino = (*ip)->i_ino; 1348 mp->m_sb.sb_uquotino = (*ip)->i_ino;
@@ -1371,10 +1378,10 @@ xfs_qm_reset_dqcounts(
1371#ifdef DEBUG 1378#ifdef DEBUG
1372 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); 1379 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
1373 do_div(j, sizeof(xfs_dqblk_t)); 1380 do_div(j, sizeof(xfs_dqblk_t));
1374 ASSERT(XFS_QM_DQPERBLK(mp) == j); 1381 ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
1375#endif 1382#endif
1376 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); 1383 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
1377 for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) { 1384 for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
1378 /* 1385 /*
1379 * Do a sanity check, and if needed, repair the dqblk. Don't 1386 * Do a sanity check, and if needed, repair the dqblk. Don't
1380 * output any warnings because it's perfectly possible to 1387 * output any warnings because it's perfectly possible to
@@ -1429,7 +1436,7 @@ xfs_qm_dqiter_bufs(
1429 while (blkcnt--) { 1436 while (blkcnt--) {
1430 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 1437 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1431 XFS_FSB_TO_DADDR(mp, bno), 1438 XFS_FSB_TO_DADDR(mp, bno),
1432 (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp); 1439 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
1433 if (error) 1440 if (error)
1434 break; 1441 break;
1435 1442
@@ -1439,7 +1446,7 @@ xfs_qm_dqiter_bufs(
1439 * goto the next block. 1446 * goto the next block.
1440 */ 1447 */
1441 bno++; 1448 bno++;
1442 firstid += XFS_QM_DQPERBLK(mp); 1449 firstid += mp->m_quotainfo->qi_dqperchunk;
1443 } 1450 }
1444 return error; 1451 return error;
1445} 1452}
@@ -1505,7 +1512,7 @@ xfs_qm_dqiterate(
1505 continue; 1512 continue;
1506 1513
1507 firstid = (xfs_dqid_t) map[i].br_startoff * 1514 firstid = (xfs_dqid_t) map[i].br_startoff *
1508 XFS_QM_DQPERBLK(mp); 1515 mp->m_quotainfo->qi_dqperchunk;
1509 /* 1516 /*
1510 * Do a read-ahead on the next extent. 1517 * Do a read-ahead on the next extent.
1511 */ 1518 */
@@ -1516,7 +1523,7 @@ xfs_qm_dqiterate(
1516 while (rablkcnt--) { 1523 while (rablkcnt--) {
1517 xfs_baread(mp->m_ddev_targp, 1524 xfs_baread(mp->m_ddev_targp,
1518 XFS_FSB_TO_DADDR(mp, rablkno), 1525 XFS_FSB_TO_DADDR(mp, rablkno),
1519 (int)XFS_QI_DQCHUNKLEN(mp)); 1526 mp->m_quotainfo->qi_dqchunklen);
1520 rablkno++; 1527 rablkno++;
1521 } 1528 }
1522 } 1529 }
@@ -1576,8 +1583,10 @@ xfs_qm_quotacheck_dqadjust(
1576 1583
1577 /* 1584 /*
1578 * Set default limits, adjust timers (since we changed usages) 1585 * Set default limits, adjust timers (since we changed usages)
1586 *
1587 * There are no timers for the default values set in the root dquot.
1579 */ 1588 */
1580 if (! XFS_IS_SUSER_DQUOT(dqp)) { 1589 if (dqp->q_core.d_id) {
1581 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); 1590 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
1582 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); 1591 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
1583 } 1592 }
@@ -1747,14 +1756,14 @@ xfs_qm_quotacheck(
1747 lastino = 0; 1756 lastino = 0;
1748 flags = 0; 1757 flags = 0;
1749 1758
1750 ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp)); 1759 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
1751 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1760 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1752 1761
1753 /* 1762 /*
1754 * There should be no cached dquots. The (simplistic) quotacheck 1763 * There should be no cached dquots. The (simplistic) quotacheck
1755 * algorithm doesn't like that. 1764 * algorithm doesn't like that.
1756 */ 1765 */
1757 ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0); 1766 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1758 1767
1759 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1768 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
1760 1769
@@ -1763,15 +1772,19 @@ xfs_qm_quotacheck(
1763 * their counters to zero. We need a clean slate. 1772 * their counters to zero. We need a clean slate.
1764 * We don't log our changes till later. 1773 * We don't log our changes till later.
1765 */ 1774 */
1766 if ((uip = XFS_QI_UQIP(mp))) { 1775 uip = mp->m_quotainfo->qi_uquotaip;
1767 if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA))) 1776 if (uip) {
1777 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
1778 if (error)
1768 goto error_return; 1779 goto error_return;
1769 flags |= XFS_UQUOTA_CHKD; 1780 flags |= XFS_UQUOTA_CHKD;
1770 } 1781 }
1771 1782
1772 if ((gip = XFS_QI_GQIP(mp))) { 1783 gip = mp->m_quotainfo->qi_gquotaip;
1773 if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1784 if (gip) {
1774 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA))) 1785 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
1786 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
1787 if (error)
1775 goto error_return; 1788 goto error_return;
1776 flags |= XFS_OQUOTA_CHKD; 1789 flags |= XFS_OQUOTA_CHKD;
1777 } 1790 }
@@ -1804,7 +1817,7 @@ xfs_qm_quotacheck(
1804 * at this point (because we intentionally didn't in dqget_noattach). 1817 * at this point (because we intentionally didn't in dqget_noattach).
1805 */ 1818 */
1806 if (error) { 1819 if (error) {
1807 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); 1820 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
1808 goto error_return; 1821 goto error_return;
1809 } 1822 }
1810 1823
@@ -1825,7 +1838,7 @@ xfs_qm_quotacheck(
1825 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); 1838 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
1826 mp->m_qflags |= flags; 1839 mp->m_qflags |= flags;
1827 1840
1828 XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); 1841 xfs_qm_dquot_list_print(mp);
1829 1842
1830 error_return: 1843 error_return:
1831 if (error) { 1844 if (error) {
@@ -1920,59 +1933,53 @@ xfs_qm_init_quotainos(
1920 } 1933 }
1921 } 1934 }
1922 1935
1923 XFS_QI_UQIP(mp) = uip; 1936 mp->m_quotainfo->qi_uquotaip = uip;
1924 XFS_QI_GQIP(mp) = gip; 1937 mp->m_quotainfo->qi_gquotaip = gip;
1925 1938
1926 return 0; 1939 return 0;
1927} 1940}
1928 1941
1929 1942
1943
1930/* 1944/*
1931 * Traverse the freelist of dquots and attempt to reclaim a maximum of 1945 * Just pop the least recently used dquot off the freelist and
1932 * 'howmany' dquots. This operation races with dqlookup(), and attempts to 1946 * recycle it. The returned dquot is locked.
1933 * favor the lookup function ...
1934 * XXXsup merge this with qm_reclaim_one().
1935 */ 1947 */
1936STATIC int 1948STATIC xfs_dquot_t *
1937xfs_qm_shake_freelist( 1949xfs_qm_dqreclaim_one(void)
1938 int howmany)
1939{ 1950{
1940 int nreclaimed; 1951 xfs_dquot_t *dqpout;
1941 xfs_dqhash_t *hash; 1952 xfs_dquot_t *dqp;
1942 xfs_dquot_t *dqp, *nextdqp;
1943 int restarts; 1953 int restarts;
1944 int nflushes;
1945
1946 if (howmany <= 0)
1947 return 0;
1948 1954
1949 nreclaimed = 0;
1950 restarts = 0; 1955 restarts = 0;
1951 nflushes = 0; 1956 dqpout = NULL;
1952 1957
1953#ifdef QUOTADEBUG 1958 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1954 cmn_err(CE_DEBUG, "Shake free 0x%x", howmany); 1959startagain:
1955#endif 1960 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1956 /* lock order is : hashchainlock, freelistlock, mplistlock */
1957 tryagain:
1958 xfs_qm_freelist_lock(xfs_Gqm);
1959 1961
1960 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; 1962 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
1961 ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) && 1963 struct xfs_mount *mp = dqp->q_mount;
1962 nreclaimed < howmany); ) {
1963 xfs_dqlock(dqp); 1964 xfs_dqlock(dqp);
1964 1965
1965 /* 1966 /*
1966 * We are racing with dqlookup here. Naturally we don't 1967 * We are racing with dqlookup here. Naturally we don't
1967 * want to reclaim a dquot that lookup wants. 1968 * want to reclaim a dquot that lookup wants. We release the
1969 * freelist lock and start over, so that lookup will grab
1970 * both the dquot and the freelistlock.
1968 */ 1971 */
1969 if (dqp->dq_flags & XFS_DQ_WANT) { 1972 if (dqp->dq_flags & XFS_DQ_WANT) {
1973 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1974
1975 trace_xfs_dqreclaim_want(dqp);
1976
1970 xfs_dqunlock(dqp); 1977 xfs_dqunlock(dqp);
1971 xfs_qm_freelist_unlock(xfs_Gqm); 1978 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1972 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1979 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1973 return nreclaimed; 1980 return NULL;
1974 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1981 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1975 goto tryagain; 1982 goto startagain;
1976 } 1983 }
1977 1984
1978 /* 1985 /*
@@ -1981,23 +1988,27 @@ xfs_qm_shake_freelist(
1981 * life easier. 1988 * life easier.
1982 */ 1989 */
1983 if (dqp->dq_flags & XFS_DQ_INACTIVE) { 1990 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
1984 ASSERT(dqp->q_mount == NULL); 1991 ASSERT(mp == NULL);
1985 ASSERT(! XFS_DQ_IS_DIRTY(dqp)); 1992 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
1986 ASSERT(dqp->HL_PREVP == NULL); 1993 ASSERT(list_empty(&dqp->q_hashlist));
1987 ASSERT(dqp->MPL_PREVP == NULL); 1994 ASSERT(list_empty(&dqp->q_mplist));
1995 list_del_init(&dqp->q_freelist);
1996 xfs_Gqm->qm_dqfrlist_cnt--;
1997 xfs_dqunlock(dqp);
1998 dqpout = dqp;
1988 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1999 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1989 nextdqp = dqp->dq_flnext; 2000 break;
1990 goto off_freelist;
1991 } 2001 }
1992 2002
1993 ASSERT(dqp->MPL_PREVP); 2003 ASSERT(dqp->q_hash);
2004 ASSERT(!list_empty(&dqp->q_mplist));
2005
1994 /* 2006 /*
1995 * Try to grab the flush lock. If this dquot is in the process of 2007 * Try to grab the flush lock. If this dquot is in the process of
1996 * getting flushed to disk, we don't want to reclaim it. 2008 * getting flushed to disk, we don't want to reclaim it.
1997 */ 2009 */
1998 if (!xfs_dqflock_nowait(dqp)) { 2010 if (!xfs_dqflock_nowait(dqp)) {
1999 xfs_dqunlock(dqp); 2011 xfs_dqunlock(dqp);
2000 dqp = dqp->dq_flnext;
2001 continue; 2012 continue;
2002 } 2013 }
2003 2014
@@ -2010,21 +2021,21 @@ xfs_qm_shake_freelist(
2010 if (XFS_DQ_IS_DIRTY(dqp)) { 2021 if (XFS_DQ_IS_DIRTY(dqp)) {
2011 int error; 2022 int error;
2012 2023
2013 trace_xfs_dqshake_dirty(dqp); 2024 trace_xfs_dqreclaim_dirty(dqp);
2014 2025
2015 /* 2026 /*
2016 * We flush it delayed write, so don't bother 2027 * We flush it delayed write, so don't bother
2017 * releasing the mplock. 2028 * releasing the freelist lock.
2018 */ 2029 */
2019 error = xfs_qm_dqflush(dqp, 0); 2030 error = xfs_qm_dqflush(dqp, 0);
2020 if (error) { 2031 if (error) {
2021 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2032 xfs_fs_cmn_err(CE_WARN, mp,
2022 "xfs_qm_dqflush_all: dquot %p flush failed", dqp); 2033 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2023 } 2034 }
2024 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 2035 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2025 dqp = dqp->dq_flnext;
2026 continue; 2036 continue;
2027 } 2037 }
2038
2028 /* 2039 /*
2029 * We're trying to get the hashlock out of order. This races 2040 * We're trying to get the hashlock out of order. This races
 2030 * with dqlookup; so, we give up and go to the next dquot if 2041 * with dqlookup; so, we give up and go to the next dquot if
@@ -2033,56 +2044,74 @@ xfs_qm_shake_freelist(
2033 * waiting for the freelist lock. 2044 * waiting for the freelist lock.
2034 */ 2045 */
2035 if (!mutex_trylock(&dqp->q_hash->qh_lock)) { 2046 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
2036 xfs_dqfunlock(dqp); 2047 restarts++;
2037 xfs_dqunlock(dqp); 2048 goto dqfunlock;
2038 dqp = dqp->dq_flnext;
2039 continue;
2040 } 2049 }
2050
2041 /* 2051 /*
2042 * This races with dquot allocation code as well as dqflush_all 2052 * This races with dquot allocation code as well as dqflush_all
2043 * and reclaim code. So, if we failed to grab the mplist lock, 2053 * and reclaim code. So, if we failed to grab the mplist lock,
2044 * giveup everything and start over. 2054 * giveup everything and start over.
2045 */ 2055 */
2046 hash = dqp->q_hash; 2056 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
2047 ASSERT(hash); 2057 restarts++;
2048 if (! xfs_qm_mplist_nowait(dqp->q_mount)) { 2058 mutex_unlock(&dqp->q_hash->qh_lock);
2049 /* XXX put a sentinel so that we can come back here */
2050 xfs_dqfunlock(dqp); 2059 xfs_dqfunlock(dqp);
2051 xfs_dqunlock(dqp); 2060 xfs_dqunlock(dqp);
2052 mutex_unlock(&hash->qh_lock); 2061 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2053 xfs_qm_freelist_unlock(xfs_Gqm); 2062 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
2054 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2063 return NULL;
2055 return nreclaimed; 2064 goto startagain;
2056 goto tryagain;
2057 } 2065 }
2058 2066
2059 trace_xfs_dqshake_unlink(dqp);
2060
2061#ifdef QUOTADEBUG
2062 cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
2063 dqp, be32_to_cpu(dqp->q_core.d_id));
2064#endif
2065 ASSERT(dqp->q_nrefs == 0); 2067 ASSERT(dqp->q_nrefs == 0);
2066 nextdqp = dqp->dq_flnext; 2068 list_del_init(&dqp->q_mplist);
2067 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); 2069 mp->m_quotainfo->qi_dquots--;
2068 XQM_HASHLIST_REMOVE(hash, dqp); 2070 mp->m_quotainfo->qi_dqreclaims++;
2071 list_del_init(&dqp->q_hashlist);
2072 dqp->q_hash->qh_version++;
2073 list_del_init(&dqp->q_freelist);
2074 xfs_Gqm->qm_dqfrlist_cnt--;
2075 dqpout = dqp;
2076 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
2077 mutex_unlock(&dqp->q_hash->qh_lock);
2078dqfunlock:
2069 xfs_dqfunlock(dqp); 2079 xfs_dqfunlock(dqp);
2070 xfs_qm_mplist_unlock(dqp->q_mount);
2071 mutex_unlock(&hash->qh_lock);
2072
2073 off_freelist:
2074 XQM_FREELIST_REMOVE(dqp);
2075 xfs_dqunlock(dqp); 2080 xfs_dqunlock(dqp);
2076 nreclaimed++; 2081 if (dqpout)
2077 XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims); 2082 break;
2083 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2084 return NULL;
2085 }
2086 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2087 return dqpout;
2088}
2089
2090/*
2091 * Traverse the freelist of dquots and attempt to reclaim a maximum of
2092 * 'howmany' dquots. This operation races with dqlookup(), and attempts to
2093 * favor the lookup function ...
2094 */
2095STATIC int
2096xfs_qm_shake_freelist(
2097 int howmany)
2098{
2099 int nreclaimed = 0;
2100 xfs_dquot_t *dqp;
2101
2102 if (howmany <= 0)
2103 return 0;
2104
2105 while (nreclaimed < howmany) {
2106 dqp = xfs_qm_dqreclaim_one();
2107 if (!dqp)
2108 return nreclaimed;
2078 xfs_qm_dqdestroy(dqp); 2109 xfs_qm_dqdestroy(dqp);
2079 dqp = nextdqp; 2110 nreclaimed++;
2080 } 2111 }
2081 xfs_qm_freelist_unlock(xfs_Gqm);
2082 return nreclaimed; 2112 return nreclaimed;
2083} 2113}
2084 2114
2085
2086/* 2115/*
2087 * The kmem_shake interface is invoked when memory is running low. 2116 * The kmem_shake interface is invoked when memory is running low.
2088 */ 2117 */
@@ -2097,7 +2126,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2097 if (!xfs_Gqm) 2126 if (!xfs_Gqm)
2098 return 0; 2127 return 0;
2099 2128
2100 nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ 2129 nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
2101 /* incore dquots in all f/s's */ 2130 /* incore dquots in all f/s's */
2102 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; 2131 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
2103 2132
@@ -2113,131 +2142,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2113} 2142}
2114 2143
2115 2144
2116/*
2117 * Just pop the least recently used dquot off the freelist and
2118 * recycle it. The returned dquot is locked.
2119 */
2120STATIC xfs_dquot_t *
2121xfs_qm_dqreclaim_one(void)
2122{
2123 xfs_dquot_t *dqpout;
2124 xfs_dquot_t *dqp;
2125 int restarts;
2126 int nflushes;
2127
2128 restarts = 0;
2129 dqpout = NULL;
2130 nflushes = 0;
2131
2132 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
2133 startagain:
2134 xfs_qm_freelist_lock(xfs_Gqm);
2135
2136 FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
2137 xfs_dqlock(dqp);
2138
2139 /*
2140 * We are racing with dqlookup here. Naturally we don't
2141 * want to reclaim a dquot that lookup wants. We release the
2142 * freelist lock and start over, so that lookup will grab
2143 * both the dquot and the freelistlock.
2144 */
2145 if (dqp->dq_flags & XFS_DQ_WANT) {
2146 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
2147
2148 trace_xfs_dqreclaim_want(dqp);
2149
2150 xfs_dqunlock(dqp);
2151 xfs_qm_freelist_unlock(xfs_Gqm);
2152 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2153 return NULL;
2154 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
2155 goto startagain;
2156 }
2157
2158 /*
2159 * If the dquot is inactive, we are assured that it is
2160 * not on the mplist or the hashlist, and that makes our
2161 * life easier.
2162 */
2163 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
2164 ASSERT(dqp->q_mount == NULL);
2165 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
2166 ASSERT(dqp->HL_PREVP == NULL);
2167 ASSERT(dqp->MPL_PREVP == NULL);
2168 XQM_FREELIST_REMOVE(dqp);
2169 xfs_dqunlock(dqp);
2170 dqpout = dqp;
2171 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
2172 break;
2173 }
2174
2175 ASSERT(dqp->q_hash);
2176 ASSERT(dqp->MPL_PREVP);
2177
2178 /*
2179 * Try to grab the flush lock. If this dquot is in the process of
2180 * getting flushed to disk, we don't want to reclaim it.
2181 */
2182 if (!xfs_dqflock_nowait(dqp)) {
2183 xfs_dqunlock(dqp);
2184 continue;
2185 }
2186
2187 /*
2188 * We have the flush lock so we know that this is not in the
2189 * process of being flushed. So, if this is dirty, flush it
2190 * DELWRI so that we don't get a freelist infested with
2191 * dirty dquots.
2192 */
2193 if (XFS_DQ_IS_DIRTY(dqp)) {
2194 int error;
2195
2196 trace_xfs_dqreclaim_dirty(dqp);
2197
2198 /*
2199 * We flush it delayed write, so don't bother
2200 * releasing the freelist lock.
2201 */
2202 error = xfs_qm_dqflush(dqp, 0);
2203 if (error) {
2204 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2205 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2206 }
2207 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2208 continue;
2209 }
2210
2211 if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
2212 xfs_dqfunlock(dqp);
2213 xfs_dqunlock(dqp);
2214 continue;
2215 }
2216
2217 if (!mutex_trylock(&dqp->q_hash->qh_lock))
2218 goto mplistunlock;
2219
2220 trace_xfs_dqreclaim_unlink(dqp);
2221
2222 ASSERT(dqp->q_nrefs == 0);
2223 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
2224 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
2225 XQM_FREELIST_REMOVE(dqp);
2226 dqpout = dqp;
2227 mutex_unlock(&dqp->q_hash->qh_lock);
2228 mplistunlock:
2229 xfs_qm_mplist_unlock(dqp->q_mount);
2230 xfs_dqfunlock(dqp);
2231 xfs_dqunlock(dqp);
2232 if (dqpout)
2233 break;
2234 }
2235
2236 xfs_qm_freelist_unlock(xfs_Gqm);
2237 return dqpout;
2238}
2239
2240
2241/*------------------------------------------------------------------*/ 2145/*------------------------------------------------------------------*/
2242 2146
2243/* 2147/*
@@ -2662,66 +2566,3 @@ xfs_qm_vop_create_dqattach(
2662 } 2566 }
2663} 2567}
2664 2568
2665/* ------------- list stuff -----------------*/
2666STATIC void
2667xfs_qm_freelist_init(xfs_frlist_t *ql)
2668{
2669 ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
2670 mutex_init(&ql->qh_lock);
2671 ql->qh_version = 0;
2672 ql->qh_nelems = 0;
2673}
2674
2675STATIC void
2676xfs_qm_freelist_destroy(xfs_frlist_t *ql)
2677{
2678 xfs_dquot_t *dqp, *nextdqp;
2679
2680 mutex_lock(&ql->qh_lock);
2681 for (dqp = ql->qh_next;
2682 dqp != (xfs_dquot_t *)ql; ) {
2683 xfs_dqlock(dqp);
2684 nextdqp = dqp->dq_flnext;
2685#ifdef QUOTADEBUG
2686 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
2687#endif
2688 XQM_FREELIST_REMOVE(dqp);
2689 xfs_dqunlock(dqp);
2690 xfs_qm_dqdestroy(dqp);
2691 dqp = nextdqp;
2692 }
2693 mutex_unlock(&ql->qh_lock);
2694 mutex_destroy(&ql->qh_lock);
2695
2696 ASSERT(ql->qh_nelems == 0);
2697}
2698
2699STATIC void
2700xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
2701{
2702 dq->dq_flnext = ql->qh_next;
2703 dq->dq_flprev = (xfs_dquot_t *)ql;
2704 ql->qh_next = dq;
2705 dq->dq_flnext->dq_flprev = dq;
2706 xfs_Gqm->qm_dqfreelist.qh_nelems++;
2707 xfs_Gqm->qm_dqfreelist.qh_version++;
2708}
2709
2710void
2711xfs_qm_freelist_unlink(xfs_dquot_t *dq)
2712{
2713 xfs_dquot_t *next = dq->dq_flnext;
2714 xfs_dquot_t *prev = dq->dq_flprev;
2715
2716 next->dq_flprev = prev;
2717 prev->dq_flnext = next;
2718 dq->dq_flnext = dq->dq_flprev = dq;
2719 xfs_Gqm->qm_dqfreelist.qh_nelems--;
2720 xfs_Gqm->qm_dqfreelist.qh_version++;
2721}
2722
2723void
2724xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
2725{
2726 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
2727}
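
The hunk above deletes the hand-rolled circular freelist (xfs_qm_freelist_init/destroy/insert/unlink/append, with manual qh_nelems and qh_version bookkeeping) in favour of a standard struct list_head guarded by qm_dqfrlist_lock, plus the explicit qm_dqfrlist_cnt counter that xfs_qm_shake() now reads. A minimal sketch of what the insert and unlink paths reduce to, assuming struct xfs_dquot gains a list_head member (called q_freelist here; only the xfs_qm side of the change is visible in this diff):

/* sketch only, not the committed code */
static void dqfrlist_add(struct xfs_dquot *dqp)
{
        mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
        list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
        xfs_Gqm->qm_dqfrlist_cnt++;
        mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
}

static void dqfrlist_remove(struct xfs_dquot *dqp)
{
        /* caller holds qm_dqfrlist_lock, as in xfs_qm_dqreclaim_one() */
        list_del_init(&dqp->q_freelist);
        xfs_Gqm->qm_dqfrlist_cnt--;
}

Keeping the counter updates under the same mutex is what lets the shaker's nfree read above stay a plain load of qm_dqfrlist_cnt.
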
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 495564b8af38..c9446f1c726d 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone;
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 73
74typedef xfs_dqhash_t xfs_dqlist_t; 74typedef xfs_dqhash_t xfs_dqlist_t;
75/*
76 * The freelist head. The first two fields match the first two in the
77 * xfs_dquot_t structure (in xfs_dqmarker_t)
78 */
79typedef struct xfs_frlist {
80 struct xfs_dquot *qh_next;
81 struct xfs_dquot *qh_prev;
82 struct mutex qh_lock;
83 uint qh_version;
84 uint qh_nelems;
85} xfs_frlist_t;
86 75
87/* 76/*
88 * Quota Manager (global) structure. Lives only in core. 77 * Quota Manager (global) structure. Lives only in core.
@@ -91,7 +80,9 @@ typedef struct xfs_qm {
91 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ 80 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
92 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ 81 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
93 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ 82 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
94 xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ 83 struct list_head qm_dqfrlist; /* freelist of dquots */
84 struct mutex qm_dqfrlist_lock;
85 int qm_dqfrlist_cnt;
95 atomic_t qm_totaldquots; /* total incore dquots */ 86 atomic_t qm_totaldquots; /* total incore dquots */
96 uint qm_nrefs; /* file systems with quota on */ 87 uint qm_nrefs; /* file systems with quota on */
97 int qm_dqfree_ratio;/* ratio of free to inuse dquots */ 88 int qm_dqfree_ratio;/* ratio of free to inuse dquots */
@@ -106,7 +97,9 @@ typedef struct xfs_qm {
106typedef struct xfs_quotainfo { 97typedef struct xfs_quotainfo {
107 xfs_inode_t *qi_uquotaip; /* user quota inode */ 98 xfs_inode_t *qi_uquotaip; /* user quota inode */
108 xfs_inode_t *qi_gquotaip; /* group quota inode */ 99 xfs_inode_t *qi_gquotaip; /* group quota inode */
109 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ 100 struct list_head qi_dqlist; /* all dquots in filesys */
101 struct mutex qi_dqlist_lock;
102 int qi_dquots;
110 int qi_dqreclaims; /* a change here indicates 103 int qi_dqreclaims; /* a change here indicates
111 a removal in the dqlist */ 104 a removal in the dqlist */
112 time_t qi_btimelimit; /* limit for blks timer */ 105 time_t qi_btimelimit; /* limit for blks timer */
@@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
175extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 168extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
176extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 169extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
177 170
178/* list stuff */
179extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
180extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
181
182#ifdef DEBUG 171#ifdef DEBUG
183extern int xfs_qm_internalqcheck(xfs_mount_t *); 172extern int xfs_qm_internalqcheck(xfs_mount_t *);
184#else 173#else
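
In the quotainfo hunk above, the per-mount dquot list likewise changes from the hash-style xfs_dqlist_t to a bare struct list_head plus qi_dqlist_lock, and the element count becomes the plain integer qi_dquots that xfs_qm_scall_getqstat reads further down in place of XFS_QI_MPLNDQUOTS. A sketch of a hypothetical walker over the new list, assuming struct xfs_dquot carries a list_head chained onto qi_dqlist (called q_mplist here; the member name is not shown in this part of the diff):

/* hypothetical walker, for illustration only */
static int count_dirty_dquots(struct xfs_quotainfo *q)
{
        struct xfs_dquot *dqp;
        int ndirty = 0;

        mutex_lock(&q->qi_dqlist_lock);
        list_for_each_entry(dqp, &q->qi_dqlist, q_mplist)
                if (XFS_DQ_IS_DIRTY(dqp))
                        ndirty++;
        mutex_unlock(&q->qi_dqlist_lock);
        return ndirty;
}
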
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 83e7ea3e25fa..3d1fc79532e2 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -55,7 +55,7 @@ static int xqm_proc_show(struct seq_file *m, void *v)
55 ndquot, 55 ndquot,
56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, 56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, 57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
58 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); 58 xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
59 return 0; 59 return 0;
60} 60}
61 61
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 5d0ee8d492db..26fa43140f2e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -79,6 +79,7 @@ xfs_qm_scall_quotaoff(
79 xfs_mount_t *mp, 79 xfs_mount_t *mp,
80 uint flags) 80 uint flags)
81{ 81{
82 struct xfs_quotainfo *q = mp->m_quotainfo;
82 uint dqtype; 83 uint dqtype;
83 int error; 84 int error;
84 uint inactivate_flags; 85 uint inactivate_flags;
@@ -102,11 +103,8 @@ xfs_qm_scall_quotaoff(
102 * critical thing. 103 * critical thing.
103 * If quotaoff, then we must be dealing with the root filesystem. 104 * If quotaoff, then we must be dealing with the root filesystem.
104 */ 105 */
105 ASSERT(mp->m_quotainfo); 106 ASSERT(q);
106 if (mp->m_quotainfo) 107 mutex_lock(&q->qi_quotaofflock);
107 mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
108
109 ASSERT(mp->m_quotainfo);
110 108
111 /* 109 /*
112 * If we're just turning off quota enforcement, change mp and go. 110 * If we're just turning off quota enforcement, change mp and go.
@@ -117,7 +115,7 @@ xfs_qm_scall_quotaoff(
117 spin_lock(&mp->m_sb_lock); 115 spin_lock(&mp->m_sb_lock);
118 mp->m_sb.sb_qflags = mp->m_qflags; 116 mp->m_sb.sb_qflags = mp->m_qflags;
119 spin_unlock(&mp->m_sb_lock); 117 spin_unlock(&mp->m_sb_lock);
120 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 118 mutex_unlock(&q->qi_quotaofflock);
121 119
122 /* XXX what to do if error ? Revert back to old vals incore ? */ 120 /* XXX what to do if error ? Revert back to old vals incore ? */
123 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); 121 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
@@ -150,10 +148,8 @@ xfs_qm_scall_quotaoff(
150 * Nothing to do? Don't complain. This happens when we're just 148 * Nothing to do? Don't complain. This happens when we're just
151 * turning off quota enforcement. 149 * turning off quota enforcement.
152 */ 150 */
153 if ((mp->m_qflags & flags) == 0) { 151 if ((mp->m_qflags & flags) == 0)
154 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 152 goto out_unlock;
155 return (0);
156 }
157 153
158 /* 154 /*
159 * Write the LI_QUOTAOFF log record, and do SB changes atomically, 155 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
@@ -162,7 +158,7 @@ xfs_qm_scall_quotaoff(
162 */ 158 */
163 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); 159 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
164 if (error) 160 if (error)
165 goto out_error; 161 goto out_unlock;
166 162
167 /* 163 /*
168 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct 164 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -204,7 +200,7 @@ xfs_qm_scall_quotaoff(
204 * So, if we couldn't purge all the dquots from the filesystem, 200 * So, if we couldn't purge all the dquots from the filesystem,
205 * we can't get rid of the incore data structures. 201 * we can't get rid of the incore data structures.
206 */ 202 */
207 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) 203 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
208 delay(10 * nculprits); 204 delay(10 * nculprits);
209 205
210 /* 206 /*
@@ -222,7 +218,7 @@ xfs_qm_scall_quotaoff(
222 if (error) { 218 if (error) {
223 /* We're screwed now. Shutdown is the only option. */ 219 /* We're screwed now. Shutdown is the only option. */
224 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 220 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
225 goto out_error; 221 goto out_unlock;
226 } 222 }
227 223
228 /* 224 /*
@@ -230,27 +226,26 @@ xfs_qm_scall_quotaoff(
230 */ 226 */
231 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || 227 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
232 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { 228 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
233 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 229 mutex_unlock(&q->qi_quotaofflock);
234 xfs_qm_destroy_quotainfo(mp); 230 xfs_qm_destroy_quotainfo(mp);
235 return (0); 231 return (0);
236 } 232 }
237 233
238 /* 234 /*
239 * Release our quotainode references, and vn_purge them, 235 * Release our quotainode references if we don't need them anymore.
240 * if we don't need them anymore.
241 */ 236 */
242 if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { 237 if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
243 IRELE(XFS_QI_UQIP(mp)); 238 IRELE(q->qi_uquotaip);
244 XFS_QI_UQIP(mp) = NULL; 239 q->qi_uquotaip = NULL;
245 } 240 }
246 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { 241 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
247 IRELE(XFS_QI_GQIP(mp)); 242 IRELE(q->qi_gquotaip);
248 XFS_QI_GQIP(mp) = NULL; 243 q->qi_gquotaip = NULL;
249 } 244 }
250out_error:
251 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
252 245
253 return (error); 246out_unlock:
247 mutex_unlock(&q->qi_quotaofflock);
248 return error;
254} 249}
255 250
256int 251int
@@ -379,9 +374,9 @@ xfs_qm_scall_quotaon(
379 /* 374 /*
380 * Switch on quota enforcement in core. 375 * Switch on quota enforcement in core.
381 */ 376 */
382 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 377 mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
383 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); 378 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
384 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 379 mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
385 380
386 return (0); 381 return (0);
387} 382}
@@ -392,11 +387,12 @@ xfs_qm_scall_quotaon(
392 */ 387 */
393int 388int
394xfs_qm_scall_getqstat( 389xfs_qm_scall_getqstat(
395 xfs_mount_t *mp, 390 struct xfs_mount *mp,
396 fs_quota_stat_t *out) 391 struct fs_quota_stat *out)
397{ 392{
398 xfs_inode_t *uip, *gip; 393 struct xfs_quotainfo *q = mp->m_quotainfo;
399 boolean_t tempuqip, tempgqip; 394 struct xfs_inode *uip, *gip;
395 boolean_t tempuqip, tempgqip;
400 396
401 uip = gip = NULL; 397 uip = gip = NULL;
402 tempuqip = tempgqip = B_FALSE; 398 tempuqip = tempgqip = B_FALSE;
@@ -415,9 +411,9 @@ xfs_qm_scall_getqstat(
415 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; 411 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
416 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; 412 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
417 413
418 if (mp->m_quotainfo) { 414 if (q) {
419 uip = mp->m_quotainfo->qi_uquotaip; 415 uip = q->qi_uquotaip;
420 gip = mp->m_quotainfo->qi_gquotaip; 416 gip = q->qi_gquotaip;
421 } 417 }
422 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 418 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
423 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 419 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
@@ -441,15 +437,15 @@ xfs_qm_scall_getqstat(
441 if (tempgqip) 437 if (tempgqip)
442 IRELE(gip); 438 IRELE(gip);
443 } 439 }
444 if (mp->m_quotainfo) { 440 if (q) {
445 out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); 441 out->qs_incoredqs = q->qi_dquots;
446 out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); 442 out->qs_btimelimit = q->qi_btimelimit;
447 out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); 443 out->qs_itimelimit = q->qi_itimelimit;
448 out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); 444 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
449 out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); 445 out->qs_bwarnlimit = q->qi_bwarnlimit;
450 out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); 446 out->qs_iwarnlimit = q->qi_iwarnlimit;
451 } 447 }
452 return (0); 448 return 0;
453} 449}
454 450
455/* 451/*
@@ -462,6 +458,7 @@ xfs_qm_scall_setqlim(
462 uint type, 458 uint type,
463 fs_disk_quota_t *newlim) 459 fs_disk_quota_t *newlim)
464{ 460{
461 struct xfs_quotainfo *q = mp->m_quotainfo;
465 xfs_disk_dquot_t *ddq; 462 xfs_disk_dquot_t *ddq;
466 xfs_dquot_t *dqp; 463 xfs_dquot_t *dqp;
467 xfs_trans_t *tp; 464 xfs_trans_t *tp;
@@ -485,7 +482,7 @@ xfs_qm_scall_setqlim(
485 * a quotaoff from happening). (XXXThis doesn't currently happen 482 * a quotaoff from happening). (XXXThis doesn't currently happen
486 * because we take the vfslock before calling xfs_qm_sysent). 483 * because we take the vfslock before calling xfs_qm_sysent).
487 */ 484 */
488 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 485 mutex_lock(&q->qi_quotaofflock);
489 486
490 /* 487 /*
491 * Get the dquot (locked), and join it to the transaction. 488 * Get the dquot (locked), and join it to the transaction.
@@ -493,9 +490,8 @@ xfs_qm_scall_setqlim(
493 */ 490 */
494 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { 491 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
495 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 492 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
496 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
497 ASSERT(error != ENOENT); 493 ASSERT(error != ENOENT);
498 return (error); 494 goto out_unlock;
499 } 495 }
500 xfs_trans_dqjoin(tp, dqp); 496 xfs_trans_dqjoin(tp, dqp);
501 ddq = &dqp->q_core; 497 ddq = &dqp->q_core;
@@ -513,8 +509,8 @@ xfs_qm_scall_setqlim(
513 ddq->d_blk_hardlimit = cpu_to_be64(hard); 509 ddq->d_blk_hardlimit = cpu_to_be64(hard);
514 ddq->d_blk_softlimit = cpu_to_be64(soft); 510 ddq->d_blk_softlimit = cpu_to_be64(soft);
515 if (id == 0) { 511 if (id == 0) {
516 mp->m_quotainfo->qi_bhardlimit = hard; 512 q->qi_bhardlimit = hard;
517 mp->m_quotainfo->qi_bsoftlimit = soft; 513 q->qi_bsoftlimit = soft;
518 } 514 }
519 } else { 515 } else {
520 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 516 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
@@ -529,8 +525,8 @@ xfs_qm_scall_setqlim(
529 ddq->d_rtb_hardlimit = cpu_to_be64(hard); 525 ddq->d_rtb_hardlimit = cpu_to_be64(hard);
530 ddq->d_rtb_softlimit = cpu_to_be64(soft); 526 ddq->d_rtb_softlimit = cpu_to_be64(soft);
531 if (id == 0) { 527 if (id == 0) {
532 mp->m_quotainfo->qi_rtbhardlimit = hard; 528 q->qi_rtbhardlimit = hard;
533 mp->m_quotainfo->qi_rtbsoftlimit = soft; 529 q->qi_rtbsoftlimit = soft;
534 } 530 }
535 } else { 531 } else {
536 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 532 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
@@ -546,8 +542,8 @@ xfs_qm_scall_setqlim(
546 ddq->d_ino_hardlimit = cpu_to_be64(hard); 542 ddq->d_ino_hardlimit = cpu_to_be64(hard);
547 ddq->d_ino_softlimit = cpu_to_be64(soft); 543 ddq->d_ino_softlimit = cpu_to_be64(soft);
548 if (id == 0) { 544 if (id == 0) {
549 mp->m_quotainfo->qi_ihardlimit = hard; 545 q->qi_ihardlimit = hard;
550 mp->m_quotainfo->qi_isoftlimit = soft; 546 q->qi_isoftlimit = soft;
551 } 547 }
552 } else { 548 } else {
553 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 549 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
@@ -572,23 +568,23 @@ xfs_qm_scall_setqlim(
572 * for warnings. 568 * for warnings.
573 */ 569 */
574 if (newlim->d_fieldmask & FS_DQ_BTIMER) { 570 if (newlim->d_fieldmask & FS_DQ_BTIMER) {
575 mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; 571 q->qi_btimelimit = newlim->d_btimer;
576 ddq->d_btimer = cpu_to_be32(newlim->d_btimer); 572 ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
577 } 573 }
578 if (newlim->d_fieldmask & FS_DQ_ITIMER) { 574 if (newlim->d_fieldmask & FS_DQ_ITIMER) {
579 mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; 575 q->qi_itimelimit = newlim->d_itimer;
580 ddq->d_itimer = cpu_to_be32(newlim->d_itimer); 576 ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
581 } 577 }
582 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { 578 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
583 mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; 579 q->qi_rtbtimelimit = newlim->d_rtbtimer;
584 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); 580 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
585 } 581 }
586 if (newlim->d_fieldmask & FS_DQ_BWARNS) 582 if (newlim->d_fieldmask & FS_DQ_BWARNS)
587 mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; 583 q->qi_bwarnlimit = newlim->d_bwarns;
588 if (newlim->d_fieldmask & FS_DQ_IWARNS) 584 if (newlim->d_fieldmask & FS_DQ_IWARNS)
589 mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; 585 q->qi_iwarnlimit = newlim->d_iwarns;
590 if (newlim->d_fieldmask & FS_DQ_RTBWARNS) 586 if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
591 mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; 587 q->qi_rtbwarnlimit = newlim->d_rtbwarns;
592 } else { 588 } else {
593 /* 589 /*
594 * If the user is now over quota, start the timelimit. 590 * If the user is now over quota, start the timelimit.
@@ -605,8 +601,9 @@ xfs_qm_scall_setqlim(
605 error = xfs_trans_commit(tp, 0); 601 error = xfs_trans_commit(tp, 0);
606 xfs_qm_dqprint(dqp); 602 xfs_qm_dqprint(dqp);
607 xfs_qm_dqrele(dqp); 603 xfs_qm_dqrele(dqp);
608 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
609 604
605 out_unlock:
606 mutex_unlock(&q->qi_quotaofflock);
610 return error; 607 return error;
611} 608}
612 609
@@ -853,7 +850,8 @@ xfs_dqrele_inode(
853 int error; 850 int error;
854 851
855 /* skip quota inodes */ 852 /* skip quota inodes */
856 if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) { 853 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
854 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
857 ASSERT(ip->i_udquot == NULL); 855 ASSERT(ip->i_udquot == NULL);
858 ASSERT(ip->i_gdquot == NULL); 856 ASSERT(ip->i_gdquot == NULL);
859 read_unlock(&pag->pag_ici_lock); 857 read_unlock(&pag->pag_ici_lock);
@@ -891,7 +889,8 @@ xfs_qm_dqrele_all_inodes(
891 uint flags) 889 uint flags)
892{ 890{
893 ASSERT(mp->m_quotainfo); 891 ASSERT(mp->m_quotainfo);
894 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); 892 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
893 XFS_ICI_NO_TAG, 0, NULL);
895} 894}
896 895
897/*------------------------------------------------------------------------*/ 896/*------------------------------------------------------------------------*/
@@ -930,7 +929,8 @@ struct mutex qcheck_lock;
930} 929}
931 930
932typedef struct dqtest { 931typedef struct dqtest {
933 xfs_dqmarker_t q_lists; 932 uint dq_flags; /* various flags (XFS_DQ_*) */
933 struct list_head q_hashlist;
934 xfs_dqhash_t *q_hash; /* the hashchain header */ 934 xfs_dqhash_t *q_hash; /* the hashchain header */
935 xfs_mount_t *q_mount; /* filesystem this relates to */ 935 xfs_mount_t *q_mount; /* filesystem this relates to */
936 xfs_dqid_t d_id; /* user id or group id */ 936 xfs_dqid_t d_id; /* user id or group id */
@@ -941,14 +941,9 @@ typedef struct dqtest {
941STATIC void 941STATIC void
942xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) 942xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
943{ 943{
944 xfs_dquot_t *d; 944 list_add(&dqp->q_hashlist, &h->qh_list);
945 if (((d) = (h)->qh_next)) 945 h->qh_version++;
946 (d)->HL_PREVP = &((dqp)->HL_NEXT); 946 h->qh_nelems++;
947 (dqp)->HL_NEXT = d;
948 (dqp)->HL_PREVP = &((h)->qh_next);
949 (h)->qh_next = (xfs_dquot_t *)dqp;
950 (h)->qh_version++;
951 (h)->qh_nelems++;
952} 947}
953STATIC void 948STATIC void
954xfs_qm_dqtest_print( 949xfs_qm_dqtest_print(
@@ -1060,9 +1055,7 @@ xfs_qm_internalqcheck_dqget(
1060 xfs_dqhash_t *h; 1055 xfs_dqhash_t *h;
1061 1056
1062 h = DQTEST_HASH(mp, id, type); 1057 h = DQTEST_HASH(mp, id, type);
1063 for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; 1058 list_for_each_entry(d, &h->qh_list, q_hashlist) {
1064 d = (xfs_dqtest_t *) d->HL_NEXT) {
1065 /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
1066 if (d->d_id == id && mp == d->q_mount) { 1059 if (d->d_id == id && mp == d->q_mount) {
1067 *O_dq = d; 1060 *O_dq = d;
1068 return (0); 1061 return (0);
@@ -1073,6 +1066,7 @@ xfs_qm_internalqcheck_dqget(
1073 d->d_id = id; 1066 d->d_id = id;
1074 d->q_mount = mp; 1067 d->q_mount = mp;
1075 d->q_hash = h; 1068 d->q_hash = h;
1069 INIT_LIST_HEAD(&d->q_hashlist);
1076 xfs_qm_hashinsert(h, d); 1070 xfs_qm_hashinsert(h, d);
1077 *O_dq = d; 1071 *O_dq = d;
1078 return (0); 1072 return (0);
@@ -1179,8 +1173,6 @@ xfs_qm_internalqcheck(
1179 xfs_ino_t lastino; 1173 xfs_ino_t lastino;
1180 int done, count; 1174 int done, count;
1181 int i; 1175 int i;
1182 xfs_dqtest_t *d, *e;
1183 xfs_dqhash_t *h1;
1184 int error; 1176 int error;
1185 1177
1186 lastino = 0; 1178 lastino = 0;
@@ -1220,19 +1212,18 @@ xfs_qm_internalqcheck(
1220 } 1212 }
1221 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1213 cmn_err(CE_DEBUG, "Checking results against system dquots");
1222 for (i = 0; i < qmtest_hashmask; i++) { 1214 for (i = 0; i < qmtest_hashmask; i++) {
1223 h1 = &qmtest_udqtab[i]; 1215 xfs_dqtest_t *d, *n;
1224 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1216 xfs_dqhash_t *h;
1217
1218 h = &qmtest_udqtab[i];
1219 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1225 xfs_dqtest_cmp(d); 1220 xfs_dqtest_cmp(d);
1226 e = (xfs_dqtest_t *) d->HL_NEXT;
1227 kmem_free(d); 1221 kmem_free(d);
1228 d = e;
1229 } 1222 }
1230 h1 = &qmtest_gdqtab[i]; 1223 h = &qmtest_gdqtab[i];
1231 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1224 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1232 xfs_dqtest_cmp(d); 1225 xfs_dqtest_cmp(d);
1233 e = (xfs_dqtest_t *) d->HL_NEXT;
1234 kmem_free(d); 1226 kmem_free(d);
1235 d = e;
1236 } 1227 }
1237 } 1228 }
1238 1229
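
The internalqcheck teardown loop above is the interesting conversion: the open-coded walk that saved d->HL_NEXT into e before freeing d becomes list_for_each_entry_safe(), which captures the next node up front precisely so the body may free the current one. The general idiom, as a self-contained sketch:

#include <linux/list.h>
#include <linux/slab.h>

struct item {
        struct list_head link;
        /* payload ... */
};

static void free_all(struct list_head *head)
{
        struct item *it, *next;

        /* 'next' is fetched before the body runs, so kfree(it) is safe */
        list_for_each_entry_safe(it, next, head, link) {
                list_del(&it->link);
                kfree(it);
        }
}

The non-_safe list_for_each_entry() would dereference it->link.next after the kfree(), a use-after-free.
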
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 8286b2842b6b..94a3d927d716 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -24,43 +24,6 @@
24 */ 24 */
25#define XFS_DQITER_MAP_SIZE 10 25#define XFS_DQITER_MAP_SIZE 10
26 26
27/* Number of dquots that fit in to a dquot block */
28#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk)
29
30#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t))
31
32#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims)
33#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip)
34#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip)
35#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)
36#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit)
37#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
38#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit)
39#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit)
40#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit)
41#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit)
42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
43
44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
45#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
46#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
47
48#define xfs_qm_mplist_lock(mp) \
49 mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
50#define xfs_qm_mplist_nowait(mp) \
51 mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
52#define xfs_qm_mplist_unlock(mp) \
53 mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
54#define XFS_QM_IS_MPLIST_LOCKED(mp) \
55 mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
56
57#define xfs_qm_freelist_lock(qm) \
58 mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
59#define xfs_qm_freelist_lock_nowait(qm) \
60 mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
61#define xfs_qm_freelist_unlock(qm) \
62 mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
63
64/* 27/*
65 * Hash into a bucket in the dquot hash table, based on <mp, id>. 28 * Hash into a bucket in the dquot hash table, based on <mp, id>.
66 */ 29 */
@@ -72,9 +35,6 @@
72 XFS_DQ_HASHVAL(mp, id)) : \ 35 XFS_DQ_HASHVAL(mp, id)) : \
73 (xfs_Gqm->qm_grp_dqhtable + \ 36 (xfs_Gqm->qm_grp_dqhtable + \
74 XFS_DQ_HASHVAL(mp, id))) 37 XFS_DQ_HASHVAL(mp, id)))
75#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \
76 XFS_IS_UQUOTA_ON(mp) : \
77 XFS_IS_OQUOTA_ON(mp))
78#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ 38#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
79 !dqp->q_core.d_blk_hardlimit && \ 39 !dqp->q_core.d_blk_hardlimit && \
80 !dqp->q_core.d_blk_softlimit && \ 40 !dqp->q_core.d_blk_softlimit && \
@@ -86,68 +46,6 @@
86 !dqp->q_core.d_rtbcount && \ 46 !dqp->q_core.d_rtbcount && \
87 !dqp->q_core.d_icount) 47 !dqp->q_core.d_icount)
88 48
89#define HL_PREVP dq_hashlist.ql_prevp
90#define HL_NEXT dq_hashlist.ql_next
91#define MPL_PREVP dq_mplist.ql_prevp
92#define MPL_NEXT dq_mplist.ql_next
93
94
95#define _LIST_REMOVE(h, dqp, PVP, NXT) \
96 { \
97 xfs_dquot_t *d; \
98 if (((d) = (dqp)->NXT)) \
99 (d)->PVP = (dqp)->PVP; \
100 *((dqp)->PVP) = d; \
101 (dqp)->NXT = NULL; \
102 (dqp)->PVP = NULL; \
103 (h)->qh_version++; \
104 (h)->qh_nelems--; \
105 }
106
107#define _LIST_INSERT(h, dqp, PVP, NXT) \
108 { \
109 xfs_dquot_t *d; \
110 if (((d) = (h)->qh_next)) \
111 (d)->PVP = &((dqp)->NXT); \
112 (dqp)->NXT = d; \
113 (dqp)->PVP = &((h)->qh_next); \
114 (h)->qh_next = dqp; \
115 (h)->qh_version++; \
116 (h)->qh_nelems++; \
117 }
118
119#define FOREACH_DQUOT_IN_MP(dqp, mp) \
120 for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
121
122#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \
123for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
124 (dqp) = (dqp)->dq_flnext)
125
126#define XQM_HASHLIST_INSERT(h, dqp) \
127 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
128
129#define XQM_FREELIST_INSERT(h, dqp) \
130 xfs_qm_freelist_append(h, dqp)
131
132#define XQM_MPLIST_INSERT(h, dqp) \
133 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
134
135#define XQM_HASHLIST_REMOVE(h, dqp) \
136 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
137#define XQM_FREELIST_REMOVE(dqp) \
138 xfs_qm_freelist_unlink(dqp)
139#define XQM_MPLIST_REMOVE(h, dqp) \
140 { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
141 XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
142
143#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp))
144
145#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \
146 (tp)->t_dqinfo->dqa_usrdquots : \
147 (tp)->t_dqinfo->dqa_grpdquots)
148#define XFS_IS_SUSER_DQUOT(dqp) \
149 (!((dqp)->q_core.d_id))
150
151#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ 49#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
152 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ 50 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
153 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) 51 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index c3ab75cb1d9a..061d827da33c 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -59,12 +59,11 @@ xfs_trans_dqjoin(
59 xfs_trans_t *tp, 59 xfs_trans_t *tp,
60 xfs_dquot_t *dqp) 60 xfs_dquot_t *dqp)
61{ 61{
62 xfs_dq_logitem_t *lp; 62 xfs_dq_logitem_t *lp = &dqp->q_logitem;
63 63
64 ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 64 ASSERT(dqp->q_transp != tp);
65 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 65 ASSERT(XFS_DQ_IS_LOCKED(dqp));
66 ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); 66 ASSERT(lp->qli_dquot == dqp);
67 lp = &dqp->q_logitem;
68 67
69 /* 68 /*
70 * Get a log_item_desc to point at the new item. 69 * Get a log_item_desc to point at the new item.
@@ -96,7 +95,7 @@ xfs_trans_log_dquot(
96{ 95{
97 xfs_log_item_desc_t *lidp; 96 xfs_log_item_desc_t *lidp;
98 97
99 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 98 ASSERT(dqp->q_transp == tp);
100 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 99 ASSERT(XFS_DQ_IS_LOCKED(dqp));
101 100
102 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); 101 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
@@ -198,16 +197,16 @@ xfs_trans_get_dqtrx(
198 int i; 197 int i;
199 xfs_dqtrx_t *qa; 198 xfs_dqtrx_t *qa;
200 199
201 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 200 qa = XFS_QM_ISUDQ(dqp) ?
202 qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); 201 tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
203 202
203 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
204 if (qa[i].qt_dquot == NULL || 204 if (qa[i].qt_dquot == NULL ||
205 qa[i].qt_dquot == dqp) { 205 qa[i].qt_dquot == dqp)
206 return (&qa[i]); 206 return &qa[i];
207 }
208 } 207 }
209 208
210 return (NULL); 209 return NULL;
211} 210}
212 211
213/* 212/*
@@ -381,7 +380,7 @@ xfs_trans_apply_dquot_deltas(
381 break; 380 break;
382 381
383 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 382 ASSERT(XFS_DQ_IS_LOCKED(dqp));
384 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 383 ASSERT(dqp->q_transp == tp);
385 384
386 /* 385 /*
387 * adjust the actual number of blocks used 386 * adjust the actual number of blocks used
@@ -639,7 +638,7 @@ xfs_trans_dqresv(
639 softlimit = q->qi_bsoftlimit; 638 softlimit = q->qi_bsoftlimit;
640 timer = be32_to_cpu(dqp->q_core.d_btimer); 639 timer = be32_to_cpu(dqp->q_core.d_btimer);
641 warns = be16_to_cpu(dqp->q_core.d_bwarns); 640 warns = be16_to_cpu(dqp->q_core.d_bwarns);
642 warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); 641 warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
643 resbcountp = &dqp->q_res_bcount; 642 resbcountp = &dqp->q_res_bcount;
644 } else { 643 } else {
645 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); 644 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
@@ -651,7 +650,7 @@ xfs_trans_dqresv(
651 softlimit = q->qi_rtbsoftlimit; 650 softlimit = q->qi_rtbsoftlimit;
652 timer = be32_to_cpu(dqp->q_core.d_rtbtimer); 651 timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
653 warns = be16_to_cpu(dqp->q_core.d_rtbwarns); 652 warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
654 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); 653 warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
655 resbcountp = &dqp->q_res_rtbcount; 654 resbcountp = &dqp->q_res_rtbcount;
656 } 655 }
657 656
@@ -691,7 +690,7 @@ xfs_trans_dqresv(
691 count = be64_to_cpu(dqp->q_core.d_icount); 690 count = be64_to_cpu(dqp->q_core.d_icount);
692 timer = be32_to_cpu(dqp->q_core.d_itimer); 691 timer = be32_to_cpu(dqp->q_core.d_itimer);
693 warns = be16_to_cpu(dqp->q_core.d_iwarns); 692 warns = be16_to_cpu(dqp->q_core.d_iwarns);
694 warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); 693 warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
695 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); 694 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
696 if (!hardlimit) 695 if (!hardlimit)
697 hardlimit = q->qi_ihardlimit; 696 hardlimit = q->qi_ihardlimit;
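
The xfs_trans_get_dqtrx() rewrite above hoists the user-versus-group accounting array selection out of the slot scan, since it does not depend on i, replacing the XFS_QM_DQP_TO_DQACCT macro that the xfs_quota_priv.h hunk above deletes. A hedged sketch of the caller side, which is not part of this hunk (note_dquot_mod is a made-up name, and qt_bcount_delta is assumed from the xfs_dqtrx_t definition outside this diff):

static void note_dquot_mod(xfs_trans_t *tp, xfs_dquot_t *dqp, long delta)
{
        xfs_dqtrx_t *qtrx = xfs_trans_get_dqtrx(tp, dqp);

        if (qtrx == NULL)
                return;                 /* all XFS_QM_TRANS_MAXDQS slots taken */
        if (qtrx->qt_dquot == NULL)
                qtrx->qt_dquot = dqp;   /* first touch: claim the empty slot */
        qtrx->qt_bcount_delta += delta;
}
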
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index b1a5a1ff88ea..abb8222b88c9 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -223,6 +223,7 @@ typedef struct xfs_perag {
223 int pag_ici_init; /* incore inode cache initialised */ 223 int pag_ici_init; /* incore inode cache initialised */
224 rwlock_t pag_ici_lock; /* incore inode lock */ 224 rwlock_t pag_ici_lock; /* incore inode lock */
225 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 225 struct radix_tree_root pag_ici_root; /* incore inode cache root */
226 int pag_ici_reclaimable; /* reclaimable inodes */
226#endif 227#endif
227 int pagb_count; /* pagb slots in use */ 228 int pagb_count; /* pagb slots in use */
228 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */ 229 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5c11e4d17010..99587ded043f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3829,7 +3829,7 @@ xfs_bmap_add_attrfork(
3829 } 3829 }
3830 if ((error = xfs_bmap_finish(&tp, &flist, &committed))) 3830 if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
3831 goto error2; 3831 goto error2;
3832 error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES); 3832 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3833 ASSERT(ip->i_df.if_ext_max == 3833 ASSERT(ip->i_df.if_ext_max ==
3834 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); 3834 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3835 return error; 3835 return error;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f3c49e69eab9..240340a4727b 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -372,12 +372,12 @@ xfs_buf_item_pin(
372 */ 372 */
373STATIC void 373STATIC void
374xfs_buf_item_unpin( 374xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 375 xfs_buf_log_item_t *bip)
376 int stale)
377{ 376{
378 struct xfs_ail *ailp; 377 struct xfs_ail *ailp;
379 xfs_buf_t *bp; 378 xfs_buf_t *bp;
380 int freed; 379 int freed;
380 int stale = bip->bli_flags & XFS_BLI_STALE;
381 381
382 bp = bip->bli_buf; 382 bp = bip->bli_buf;
383 ASSERT(bp != NULL); 383 ASSERT(bp != NULL);
@@ -428,40 +428,34 @@ xfs_buf_item_unpin_remove(
428 xfs_buf_log_item_t *bip, 428 xfs_buf_log_item_t *bip,
429 xfs_trans_t *tp) 429 xfs_trans_t *tp)
430{ 430{
431 xfs_buf_t *bp; 431 /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */
432 xfs_log_item_desc_t *lidp;
433 int stale = 0;
434
435 bp = bip->bli_buf;
436 /*
437 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
438 */
439 if ((atomic_read(&bip->bli_refcount) == 1) && 432 if ((atomic_read(&bip->bli_refcount) == 1) &&
440 (bip->bli_flags & XFS_BLI_STALE)) { 433 (bip->bli_flags & XFS_BLI_STALE)) {
434 /*
435 * yes -- We can safely do some work here and then call
 436 * buf_item_unpin to do the rest because we are
 437 * holding the buffer locked so no one else will be
438 * able to bump up the refcount. We have to remove the
439 * log item from the transaction as we are about to release
440 * our reference to the buffer. If we don't, the unlock that
 441 * occurs later in xfs_trans_uncommit() will try to
442 * reference the buffer which we no longer have a hold on.
443 */
444 struct xfs_log_item_desc *lidp;
445
441 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); 446 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
442 trace_xfs_buf_item_unpin_stale(bip); 447 trace_xfs_buf_item_unpin_stale(bip);
443 448
444 /* 449 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip);
445 * yes -- clear the xaction descriptor in-use flag
446 * and free the chunk if required. We can safely
447 * do some work here and then call buf_item_unpin
448 * to do the rest because if the if is true, then
449 * we are holding the buffer locked so no one else
450 * will be able to bump up the refcount.
451 */
452 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
453 stale = lidp->lid_flags & XFS_LID_BUF_STALE;
454 xfs_trans_free_item(tp, lidp); 450 xfs_trans_free_item(tp, lidp);
451
455 /* 452 /*
456 * Since the transaction no longer refers to the buffer, 453 * Since the transaction no longer refers to the buffer, the
457 * the buffer should no longer refer to the transaction. 454 * buffer should no longer refer to the transaction.
458 */ 455 */
459 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 456 XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL);
460 } 457 }
461 458 xfs_buf_item_unpin(bip);
462 xfs_buf_item_unpin(bip, stale);
463
464 return;
465} 459}
466 460
467/* 461/*
@@ -675,7 +669,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
675 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 669 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
676 xfs_buf_item_format, 670 xfs_buf_item_format,
677 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, 671 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
678 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, 672 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
679 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 673 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
680 xfs_buf_item_unpin_remove, 674 xfs_buf_item_unpin_remove,
681 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, 675 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
@@ -733,10 +727,7 @@ xfs_buf_item_init(
733 727
734 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 728 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
735 KM_SLEEP); 729 KM_SLEEP);
736 bip->bli_item.li_type = XFS_LI_BUF; 730 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
737 bip->bli_item.li_ops = &xfs_buf_item_ops;
738 bip->bli_item.li_mountp = mp;
739 bip->bli_item.li_ailp = mp->m_ail;
740 bip->bli_buf = bp; 731 bip->bli_buf = bp;
741 xfs_buf_hold(bp); 732 xfs_buf_hold(bp);
742 bip->bli_format.blf_type = XFS_LI_BUF; 733 bip->bli_format.blf_type = XFS_LI_BUF;
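
Each of the open-coded li_* initialisation blocks this patch removes (here, and in xfs_extfree_item.c and xfs_inode_item.c below) is folded into the new xfs_log_item_init() helper. Its body follows directly from the four lines deleted above; a sketch (the real definition lives outside this diff):

void
xfs_log_item_init(
        struct xfs_mount        *mp,
        struct xfs_log_item     *item,
        int                     type,
        struct xfs_item_ops     *ops)
{
        item->li_type = type;
        item->li_ops = ops;
        item->li_mountp = mp;
        item->li_ailp = mp->m_ail;
}
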
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 217f34af00cb..df4454511f73 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone;
26 * have been logged. 26 * have been logged.
27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. 27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
28 */ 28 */
29typedef struct xfs_buf_log_format_t { 29typedef struct xfs_buf_log_format {
30 unsigned short blf_type; /* buf log item type indicator */ 30 unsigned short blf_type; /* buf log item type indicator */
31 unsigned short blf_size; /* size of this item */ 31 unsigned short blf_size; /* size of this item */
32 ushort blf_flags; /* misc state */ 32 ushort blf_flags; /* misc state */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index cd27c9d6c71f..5bba29a07812 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -177,16 +177,26 @@ xfs_swap_extents_check_format(
177 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) 177 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
178 return EINVAL; 178 return EINVAL;
179 179
180 /* Check root block of temp in btree form to max in target */ 180 /*
181 * If we are in a btree format, check that the temp root block will fit
182 * in the target and that it has enough extents to be in btree format
183 * in the target.
184 *
185 * Note that we have to be careful to allow btree->extent conversions
186 * (a common defrag case) which will occur when the temp inode is in
187 * extent format...
188 */
181 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && 189 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
182 XFS_IFORK_BOFF(ip) && 190 ((XFS_IFORK_BOFF(ip) &&
183 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) 191 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
192 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
184 return EINVAL; 193 return EINVAL;
185 194
186 /* Check root block of target in btree form to max in temp */ 195 /* Reciprocal target->temp btree format checks */
187 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && 196 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
188 XFS_IFORK_BOFF(tip) && 197 ((XFS_IFORK_BOFF(tip) &&
189 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) 198 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
199 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
190 return EINVAL; 200 return EINVAL;
191 201
192 return 0; 202 return 0;
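
The strengthened xfs_swap_extents_check_format() test reads more easily as a predicate; restating one direction of the new check (swap_rejected_tmp_btree is an illustrative name, the logic is copied from the hunk above):

static bool swap_rejected_tmp_btree(struct xfs_inode *ip, struct xfs_inode *tip)
{
        if (tip->i_d.di_format != XFS_DINODE_FMT_BTREE)
                return false;
        /* temp btree root would not fit in the target's inline fork area */
        if (XFS_IFORK_BOFF(ip) &&
            tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
                return true;
        /* too few extents to stay in btree format once in the target */
        return XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max;
}

The reciprocal target-to-temp check swaps ip and tip, exactly as the second hunk does.
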
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 92d5cd5bf4f2..ef96175c0744 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -186,18 +186,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
186 186
187void 187void
188xfs_error_report( 188xfs_error_report(
189 char *tag, 189 const char *tag,
190 int level, 190 int level,
191 xfs_mount_t *mp, 191 struct xfs_mount *mp,
192 char *fname, 192 const char *filename,
193 int linenum, 193 int linenum,
194 inst_t *ra) 194 inst_t *ra)
195{ 195{
196 if (level <= xfs_error_level) { 196 if (level <= xfs_error_level) {
197 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 197 xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
198 CE_ALERT, mp, 198 CE_ALERT, mp,
199 "XFS internal error %s at line %d of file %s. Caller 0x%p\n", 199 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
200 tag, linenum, fname, ra); 200 tag, linenum, filename, ra);
201 201
202 xfs_stack_trace(); 202 xfs_stack_trace();
203 } 203 }
@@ -205,15 +205,15 @@ xfs_error_report(
205 205
206void 206void
207xfs_corruption_error( 207xfs_corruption_error(
208 char *tag, 208 const char *tag,
209 int level, 209 int level,
210 xfs_mount_t *mp, 210 struct xfs_mount *mp,
211 void *p, 211 void *p,
212 char *fname, 212 const char *filename,
213 int linenum, 213 int linenum,
214 inst_t *ra) 214 inst_t *ra)
215{ 215{
216 if (level <= xfs_error_level) 216 if (level <= xfs_error_level)
217 xfs_hex_dump(p, 16); 217 xfs_hex_dump(p, 16);
218 xfs_error_report(tag, level, mp, fname, linenum, ra); 218 xfs_error_report(tag, level, mp, filename, linenum, ra);
219} 219}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0c93051c4651..c2c1a072bb82 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -29,10 +29,11 @@ extern int xfs_error_trap(int);
29 29
30struct xfs_mount; 30struct xfs_mount;
31 31
32extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp, 32extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
33 char *fname, int linenum, inst_t *ra); 33 const char *filename, int linenum, inst_t *ra);
34extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, 34extern void xfs_corruption_error(const char *tag, int level,
35 void *p, char *fname, int linenum, inst_t *ra); 35 struct xfs_mount *mp, void *p, const char *filename,
36 int linenum, inst_t *ra);
36 37
37#define XFS_ERROR_REPORT(e, lvl, mp) \ 38#define XFS_ERROR_REPORT(e, lvl, mp) \
38 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) 39 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
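
The tag and filename parameters become const char * because every caller hands in string literals: the XFS_ERROR_REPORT macro above passes __FILE__, and the tags are literals at the call sites. With the const-qualified prototypes a call such as the following (illustrative tag string; XFS_ERRLEVEL_LOW is assumed from xfs_error.h, outside this hunk) type-checks without warnings, and the callee can no longer write through the pointers:

xfs_error_report("example_tag", XFS_ERRLEVEL_LOW, mp,
                 __FILE__, __LINE__, __return_address);
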
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6f35ed1b39b9..409fe81585fd 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -106,7 +106,7 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
106 */ 106 */
107/*ARGSUSED*/ 107/*ARGSUSED*/
108STATIC void 108STATIC void
109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) 109xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
110{ 110{
111 struct xfs_ail *ailp = efip->efi_item.li_ailp; 111 struct xfs_ail *ailp = efip->efi_item.li_ailp;
112 112
@@ -224,7 +224,7 @@ static struct xfs_item_ops xfs_efi_item_ops = {
224 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 224 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
225 xfs_efi_item_format, 225 xfs_efi_item_format,
226 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, 226 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
227 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin, 227 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
228 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 228 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
229 xfs_efi_item_unpin_remove, 229 xfs_efi_item_unpin_remove,
230 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, 230 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
@@ -259,10 +259,7 @@ xfs_efi_init(xfs_mount_t *mp,
259 KM_SLEEP); 259 KM_SLEEP);
260 } 260 }
261 261
262 efip->efi_item.li_type = XFS_LI_EFI; 262 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
263 efip->efi_item.li_ops = &xfs_efi_item_ops;
264 efip->efi_item.li_mountp = mp;
265 efip->efi_item.li_ailp = mp->m_ail;
266 efip->efi_format.efi_nextents = nextents; 263 efip->efi_format.efi_nextents = nextents;
267 efip->efi_format.efi_id = (__psint_t)(void*)efip; 264 efip->efi_format.efi_id = (__psint_t)(void*)efip;
268 265
@@ -428,7 +425,7 @@ xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
428 */ 425 */
429/*ARGSUSED*/ 426/*ARGSUSED*/
430STATIC void 427STATIC void
431xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale) 428xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
432{ 429{
433 return; 430 return;
434} 431}
@@ -518,7 +515,7 @@ static struct xfs_item_ops xfs_efd_item_ops = {
518 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 515 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
519 xfs_efd_item_format, 516 xfs_efd_item_format,
520 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, 517 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
521 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin, 518 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
522 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 519 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
523 xfs_efd_item_unpin_remove, 520 xfs_efd_item_unpin_remove,
524 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, 521 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
@@ -554,10 +551,7 @@ xfs_efd_init(xfs_mount_t *mp,
554 KM_SLEEP); 551 KM_SLEEP);
555 } 552 }
556 553
557 efdp->efd_item.li_type = XFS_LI_EFD; 554 xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
558 efdp->efd_item.li_ops = &xfs_efd_item_ops;
559 efdp->efd_item.li_mountp = mp;
560 efdp->efd_item.li_ailp = mp->m_ail;
561 efdp->efd_efip = efip; 555 efdp->efd_efip = efip;
562 efdp->efd_format.efd_nextents = nextents; 556 efdp->efd_format.efd_nextents = nextents;
563 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; 557 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0ffd56447045..8cd6e8d8fe9c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2449,6 +2449,8 @@ xfs_iunpin_nowait(
2449{ 2449{
2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2451 2451
2452 trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2453
2452 /* Give the log a push to start the unpinning I/O */ 2454 /* Give the log a push to start the unpinning I/O */
2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2455 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2454 2456
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7bfea8540159..cf8249a60004 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -543,6 +543,7 @@ xfs_inode_item_pin(
543{ 543{
544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
545 545
546 trace_xfs_inode_pin(iip->ili_inode, _RET_IP_);
546 atomic_inc(&iip->ili_inode->i_pincount); 547 atomic_inc(&iip->ili_inode->i_pincount);
547} 548}
548 549
@@ -556,11 +557,11 @@ xfs_inode_item_pin(
556/* ARGSUSED */ 557/* ARGSUSED */
557STATIC void 558STATIC void
558xfs_inode_item_unpin( 559xfs_inode_item_unpin(
559 xfs_inode_log_item_t *iip, 560 xfs_inode_log_item_t *iip)
560 int stale)
561{ 561{
562 struct xfs_inode *ip = iip->ili_inode; 562 struct xfs_inode *ip = iip->ili_inode;
563 563
564 trace_xfs_inode_unpin(ip, _RET_IP_);
564 ASSERT(atomic_read(&ip->i_pincount) > 0); 565 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount)) 566 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait); 567 wake_up(&ip->i_ipin_wait);
@@ -572,7 +573,7 @@ xfs_inode_item_unpin_remove(
572 xfs_inode_log_item_t *iip, 573 xfs_inode_log_item_t *iip,
573 xfs_trans_t *tp) 574 xfs_trans_t *tp)
574{ 575{
575 xfs_inode_item_unpin(iip, 0); 576 xfs_inode_item_unpin(iip);
576} 577}
577 578
578/* 579/*
@@ -838,7 +839,7 @@ static struct xfs_item_ops xfs_inode_item_ops = {
838 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 839 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
839 xfs_inode_item_format, 840 xfs_inode_item_format,
840 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, 841 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
841 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, 842 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
842 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 843 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
843 xfs_inode_item_unpin_remove, 844 xfs_inode_item_unpin_remove,
844 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, 845 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
@@ -865,17 +866,9 @@ xfs_inode_item_init(
865 ASSERT(ip->i_itemp == NULL); 866 ASSERT(ip->i_itemp == NULL);
866 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 867 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
867 868
868 iip->ili_item.li_type = XFS_LI_INODE;
869 iip->ili_item.li_ops = &xfs_inode_item_ops;
870 iip->ili_item.li_mountp = mp;
871 iip->ili_item.li_ailp = mp->m_ail;
872 iip->ili_inode = ip; 869 iip->ili_inode = ip;
873 870 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
874 /* 871 &xfs_inode_item_ops);
875 We have zeroed memory. No need ...
876 iip->ili_extents_buf = NULL;
877 */
878
879 iip->ili_format.ilf_type = XFS_LI_INODE; 872 iip->ili_format.ilf_type = XFS_LI_INODE;
880 iip->ili_format.ilf_ino = ip->i_ino; 873 iip->ili_format.ilf_ino = ip->i_ino;
881 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; 874 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0b65039951a0..ef14943829da 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -55,71 +55,33 @@
55#define XFS_STRAT_WRITE_IMAPS 2 55#define XFS_STRAT_WRITE_IMAPS 2
56#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 56#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
57 57
58STATIC int 58STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
59xfs_imap_to_bmap( 59 int, struct xfs_bmbt_irec *, int *);
60 xfs_inode_t *ip, 60STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
61 xfs_off_t offset, 61 struct xfs_bmbt_irec *, int *);
62 xfs_bmbt_irec_t *imap, 62STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
63 xfs_iomap_t *iomapp, 63 struct xfs_bmbt_irec *, int *);
64 int imaps, /* Number of imap entries */
65 int iomaps, /* Number of iomap entries */
66 int flags)
67{
68 xfs_mount_t *mp = ip->i_mount;
69 int pbm;
70 xfs_fsblock_t start_block;
71
72
73 for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
74 iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
75 iomapp->iomap_delta = offset - iomapp->iomap_offset;
76 iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
77 iomapp->iomap_flags = flags;
78
79 if (XFS_IS_REALTIME_INODE(ip)) {
80 iomapp->iomap_flags |= IOMAP_REALTIME;
81 iomapp->iomap_target = mp->m_rtdev_targp;
82 } else {
83 iomapp->iomap_target = mp->m_ddev_targp;
84 }
85 start_block = imap->br_startblock;
86 if (start_block == HOLESTARTBLOCK) {
87 iomapp->iomap_bn = IOMAP_DADDR_NULL;
88 iomapp->iomap_flags |= IOMAP_HOLE;
89 } else if (start_block == DELAYSTARTBLOCK) {
90 iomapp->iomap_bn = IOMAP_DADDR_NULL;
91 iomapp->iomap_flags |= IOMAP_DELAY;
92 } else {
93 iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
94 if (ISUNWRITTEN(imap))
95 iomapp->iomap_flags |= IOMAP_UNWRITTEN;
96 }
97
98 offset += iomapp->iomap_bsize - iomapp->iomap_delta;
99 }
100 return pbm; /* Return the number filled */
101}
102 64
103int 65int
104xfs_iomap( 66xfs_iomap(
105 xfs_inode_t *ip, 67 struct xfs_inode *ip,
106 xfs_off_t offset, 68 xfs_off_t offset,
107 ssize_t count, 69 ssize_t count,
108 int flags, 70 int flags,
109 xfs_iomap_t *iomapp, 71 struct xfs_bmbt_irec *imap,
110 int *niomaps) 72 int *nimaps,
73 int *new)
111{ 74{
112 xfs_mount_t *mp = ip->i_mount; 75 struct xfs_mount *mp = ip->i_mount;
113 xfs_fileoff_t offset_fsb, end_fsb; 76 xfs_fileoff_t offset_fsb, end_fsb;
114 int error = 0; 77 int error = 0;
115 int lockmode = 0; 78 int lockmode = 0;
116 xfs_bmbt_irec_t imap; 79 int bmapi_flags = 0;
117 int nimaps = 1;
118 int bmapi_flags = 0;
119 int iomap_flags = 0;
120 80
121 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 81 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
122 82
83 *new = 0;
84
123 if (XFS_FORCED_SHUTDOWN(mp)) 85 if (XFS_FORCED_SHUTDOWN(mp))
124 return XFS_ERROR(EIO); 86 return XFS_ERROR(EIO);
125 87
@@ -160,8 +122,8 @@ xfs_iomap(
160 122
161 error = xfs_bmapi(NULL, ip, offset_fsb, 123 error = xfs_bmapi(NULL, ip, offset_fsb,
162 (xfs_filblks_t)(end_fsb - offset_fsb), 124 (xfs_filblks_t)(end_fsb - offset_fsb),
163 bmapi_flags, NULL, 0, &imap, 125 bmapi_flags, NULL, 0, imap,
164 &nimaps, NULL, NULL); 126 nimaps, NULL, NULL);
165 127
166 if (error) 128 if (error)
167 goto out; 129 goto out;
@@ -169,46 +131,41 @@ xfs_iomap(
169 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { 131 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
170 case BMAPI_WRITE: 132 case BMAPI_WRITE:
171 /* If we found an extent, return it */ 133 /* If we found an extent, return it */
172 if (nimaps && 134 if (*nimaps &&
173 (imap.br_startblock != HOLESTARTBLOCK) && 135 (imap->br_startblock != HOLESTARTBLOCK) &&
174 (imap.br_startblock != DELAYSTARTBLOCK)) { 136 (imap->br_startblock != DELAYSTARTBLOCK)) {
175 trace_xfs_iomap_found(ip, offset, count, flags, &imap); 137 trace_xfs_iomap_found(ip, offset, count, flags, imap);
176 break; 138 break;
177 } 139 }
178 140
179 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { 141 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
180 error = xfs_iomap_write_direct(ip, offset, count, flags, 142 error = xfs_iomap_write_direct(ip, offset, count, flags,
181 &imap, &nimaps, nimaps); 143 imap, nimaps);
182 } else { 144 } else {
183 error = xfs_iomap_write_delay(ip, offset, count, flags, 145 error = xfs_iomap_write_delay(ip, offset, count, flags,
184 &imap, &nimaps); 146 imap, nimaps);
185 } 147 }
186 if (!error) { 148 if (!error) {
187 trace_xfs_iomap_alloc(ip, offset, count, flags, &imap); 149 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
188 } 150 }
189 iomap_flags = IOMAP_NEW; 151 *new = 1;
190 break; 152 break;
191 case BMAPI_ALLOCATE: 153 case BMAPI_ALLOCATE:
192 /* If we found an extent, return it */ 154 /* If we found an extent, return it */
193 xfs_iunlock(ip, lockmode); 155 xfs_iunlock(ip, lockmode);
194 lockmode = 0; 156 lockmode = 0;
195 157
196 if (nimaps && !isnullstartblock(imap.br_startblock)) { 158 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
197 trace_xfs_iomap_found(ip, offset, count, flags, &imap); 159 trace_xfs_iomap_found(ip, offset, count, flags, imap);
198 break; 160 break;
199 } 161 }
200 162
201 error = xfs_iomap_write_allocate(ip, offset, count, 163 error = xfs_iomap_write_allocate(ip, offset, count,
202 &imap, &nimaps); 164 imap, nimaps);
203 break; 165 break;
204 } 166 }
205 167
206 if (nimaps) { 168 ASSERT(*nimaps <= 1);
207 *niomaps = xfs_imap_to_bmap(ip, offset, &imap,
208 iomapp, nimaps, *niomaps, iomap_flags);
209 } else if (niomaps) {
210 *niomaps = 0;
211 }
212 169
213out: 170out:
214 if (lockmode) 171 if (lockmode)
@@ -216,7 +173,6 @@ out:
216 return XFS_ERROR(error); 173 return XFS_ERROR(error);
217} 174}
218 175
219
220STATIC int 176STATIC int
221xfs_iomap_eof_align_last_fsb( 177xfs_iomap_eof_align_last_fsb(
222 xfs_mount_t *mp, 178 xfs_mount_t *mp,
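With this hunk applied, xfs_iomap() hands back the raw extent record and an explicit "new" flag rather than a translated xfs_iomap_t. A minimal sketch of the updated calling convention, based on the prototype in xfs_iomap.h below (caller-side variable names are illustrative):

    struct xfs_bmbt_irec imap;
    int nimaps = 1;
    int new = 0;
    int error;

    error = xfs_iomap(ip, offset, count, BMAPI_READ, &imap, &nimaps, &new);
    if (!error && nimaps) {
            /* imap.br_startoff, br_startblock and br_blockcount describe
             * the mapping directly; 'new' replaces the old IOMAP_NEW flag
             * and is set when the BMAPI_WRITE path allocated the extent. */
    }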
@@ -285,15 +241,14 @@ xfs_cmn_err_fsblock_zero(
285 return EFSCORRUPTED; 241 return EFSCORRUPTED;
286} 242}
287 243
288int 244STATIC int
289xfs_iomap_write_direct( 245xfs_iomap_write_direct(
290 xfs_inode_t *ip, 246 xfs_inode_t *ip,
291 xfs_off_t offset, 247 xfs_off_t offset,
292 size_t count, 248 size_t count,
293 int flags, 249 int flags,
294 xfs_bmbt_irec_t *ret_imap, 250 xfs_bmbt_irec_t *ret_imap,
295 int *nmaps, 251 int *nmaps)
296 int found)
297{ 252{
298 xfs_mount_t *mp = ip->i_mount; 253 xfs_mount_t *mp = ip->i_mount;
299 xfs_fileoff_t offset_fsb; 254 xfs_fileoff_t offset_fsb;
@@ -330,7 +285,7 @@ xfs_iomap_write_direct(
330 if (error) 285 if (error)
331 goto error_out; 286 goto error_out;
332 } else { 287 } else {
333 if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) 288 if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK))
334 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 289 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
335 ret_imap->br_blockcount + 290 ret_imap->br_blockcount +
336 ret_imap->br_startoff); 291 ret_imap->br_startoff);
@@ -485,7 +440,7 @@ xfs_iomap_eof_want_preallocate(
485 return 0; 440 return 0;
486} 441}
487 442
488int 443STATIC int
489xfs_iomap_write_delay( 444xfs_iomap_write_delay(
490 xfs_inode_t *ip, 445 xfs_inode_t *ip,
491 xfs_off_t offset, 446 xfs_off_t offset,
@@ -588,7 +543,7 @@ retry:
588 * We no longer bother to look at the incoming map - all we have to 543 * We no longer bother to look at the incoming map - all we have to
589 * guarantee is that whatever we allocate fills the required range. 544 * guarantee is that whatever we allocate fills the required range.
590 */ 545 */
591int 546STATIC int
592xfs_iomap_write_allocate( 547xfs_iomap_write_allocate(
593 xfs_inode_t *ip, 548 xfs_inode_t *ip,
594 xfs_off_t offset, 549 xfs_off_t offset,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 174f29990991..81ac4afd45b3 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,19 +18,6 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL))
22
23
24typedef enum { /* iomap_flags values */
25 IOMAP_READ = 0, /* mapping for a read */
26 IOMAP_HOLE = 0x02, /* mapping covers a hole */
27 IOMAP_DELAY = 0x04, /* mapping covers delalloc region */
28 IOMAP_REALTIME = 0x10, /* mapping on the realtime device */
29 IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */
30 /* but uninitialized file data */
31 IOMAP_NEW = 0x40 /* just allocate */
32} iomap_flags_t;
33
34typedef enum { 21typedef enum {
35 /* base extent manipulation calls */ 22 /* base extent manipulation calls */
36 BMAPI_READ = (1 << 0), /* read extents */ 23 BMAPI_READ = (1 << 0), /* read extents */
@@ -52,43 +39,11 @@ typedef enum {
52 { BMAPI_MMAP, "MMAP" }, \ 39 { BMAPI_MMAP, "MMAP" }, \
53 { BMAPI_TRYLOCK, "TRYLOCK" } 40 { BMAPI_TRYLOCK, "TRYLOCK" }
54 41
55/*
56 * xfs_iomap_t: File system I/O map
57 *
58 * The iomap_bn field is expressed in 512-byte blocks, and is where the
59 * mapping starts on disk.
60 *
61 * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes.
62 * iomap_offset is the offset of the mapping in the file itself.
63 * iomap_bsize is the size of the mapping, iomap_delta is the
64 * desired data's offset into the mapping, given the offset supplied
65 * to the file I/O map routine.
66 *
67 * When a request is made to read beyond the logical end of the object,
68 * iomap_size may be set to 0, but iomap_offset and iomap_length should be set
69 * to the actual amount of underlying storage that has been allocated, if any.
70 */
71
72typedef struct xfs_iomap {
73 xfs_daddr_t iomap_bn; /* first 512B blk of mapping */
74 xfs_buftarg_t *iomap_target;
75 xfs_off_t iomap_offset; /* offset of mapping, bytes */
76 xfs_off_t iomap_bsize; /* size of mapping, bytes */
77 xfs_off_t iomap_delta; /* offset into mapping, bytes */
78 iomap_flags_t iomap_flags;
79} xfs_iomap_t;
80
81struct xfs_inode; 42struct xfs_inode;
82struct xfs_bmbt_irec; 43struct xfs_bmbt_irec;
83 44
84extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 45extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
85 struct xfs_iomap *, int *); 46 struct xfs_bmbt_irec *, int *, int *);
86extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
87 int, struct xfs_bmbt_irec *, int *, int);
88extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
89 struct xfs_bmbt_irec *, int *);
90extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
91 struct xfs_bmbt_irec *, int *);
92extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 47extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
93 48
94#endif /* __XFS_IOMAP_H__*/ 49#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e8fba92d7cd9..3038dd52c72a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -44,13 +44,8 @@
44 44
45kmem_zone_t *xfs_log_ticket_zone; 45kmem_zone_t *xfs_log_ticket_zone;
46 46
47#define xlog_write_adv_cnt(ptr, len, off, bytes) \
48 { (ptr) += (bytes); \
49 (len) -= (bytes); \
50 (off) += (bytes);}
51
52/* Local miscellaneous function prototypes */ 47/* Local miscellaneous function prototypes */
53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, 48STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
54 xlog_in_core_t **, xfs_lsn_t *); 49 xlog_in_core_t **, xfs_lsn_t *);
55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, 50STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
56 xfs_buftarg_t *log_target, 51 xfs_buftarg_t *log_target,
@@ -59,11 +54,9 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
59STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
61STATIC void xlog_dealloc_log(xlog_t *log); 56STATIC void xlog_dealloc_log(xlog_t *log);
62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], 57STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
63 int nentries, struct xlog_ticket *tic, 58 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
64 xfs_lsn_t *start_lsn, 59 xlog_in_core_t **commit_iclog, uint flags);
65 xlog_in_core_t **commit_iclog,
66 uint flags);
67 60
68/* local state machine functions */ 61/* local state machine functions */
69STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); 62STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -102,7 +95,7 @@ STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
102 uint flags); 95 uint flags);
103 96
104#if defined(DEBUG) 97#if defined(DEBUG)
105STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); 98STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
106STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 99STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
107STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 100STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
108 int count, boolean_t syncing); 101 int count, boolean_t syncing);
@@ -258,7 +251,7 @@ xfs_log_done(
258 * If we get an error, just continue and give back the log ticket. 251 * If we get an error, just continue and give back the log ticket.
259 */ 252 */
260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 253 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
261 (xlog_commit_record(mp, ticket, iclog, &lsn)))) { 254 (xlog_commit_record(log, ticket, iclog, &lsn)))) {
262 lsn = (xfs_lsn_t) -1; 255 lsn = (xfs_lsn_t) -1;
263 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { 256 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
264 flags |= XFS_LOG_REL_PERM_RESERV; 257 flags |= XFS_LOG_REL_PERM_RESERV;
@@ -516,18 +509,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
516#ifdef DEBUG 509#ifdef DEBUG
517 xlog_in_core_t *first_iclog; 510 xlog_in_core_t *first_iclog;
518#endif 511#endif
519 xfs_log_iovec_t reg[1];
520 xlog_ticket_t *tic = NULL; 512 xlog_ticket_t *tic = NULL;
521 xfs_lsn_t lsn; 513 xfs_lsn_t lsn;
522 int error; 514 int error;
523 515
524 /* the data section must be 32 bit size aligned */
525 struct {
526 __uint16_t magic;
527 __uint16_t pad1;
528 __uint32_t pad2; /* may as well make it 64 bits */
529 } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
530
531 /* 516 /*
532 * Don't write out unmount record on read-only mounts. 517 * Don't write out unmount record on read-only mounts.
533 * Or, if we are doing a forced umount (typically because of IO errors). 518 * Or, if we are doing a forced umount (typically because of IO errors).
@@ -549,16 +534,30 @@ xfs_log_unmount_write(xfs_mount_t *mp)
549 } while (iclog != first_iclog); 534 } while (iclog != first_iclog);
550#endif 535#endif
551 if (! (XLOG_FORCED_SHUTDOWN(log))) { 536 if (! (XLOG_FORCED_SHUTDOWN(log))) {
552 reg[0].i_addr = (void*)&magic;
553 reg[0].i_len = sizeof(magic);
554 reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
555
556 error = xfs_log_reserve(mp, 600, 1, &tic, 537 error = xfs_log_reserve(mp, 600, 1, &tic,
557 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); 538 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
558 if (!error) { 539 if (!error) {
540 /* the data section must be 32 bit size aligned */
541 struct {
542 __uint16_t magic;
543 __uint16_t pad1;
544 __uint32_t pad2; /* may as well make it 64 bits */
545 } magic = {
546 .magic = XLOG_UNMOUNT_TYPE,
547 };
548 struct xfs_log_iovec reg = {
549 .i_addr = (void *)&magic,
550 .i_len = sizeof(magic),
551 .i_type = XLOG_REG_TYPE_UNMOUNT,
552 };
553 struct xfs_log_vec vec = {
554 .lv_niovecs = 1,
555 .lv_iovecp = &reg,
556 };
557
559 /* remove inited flag */ 558 /* remove inited flag */
560 ((xlog_ticket_t *)tic)->t_flags = 0; 559 tic->t_flags = 0;
561 error = xlog_write(mp, reg, 1, tic, &lsn, 560 error = xlog_write(log, &vec, tic, &lsn,
562 NULL, XLOG_UNMOUNT_TRANS); 561 NULL, XLOG_UNMOUNT_TRANS);
563 /* 562 /*
564 * At this point, we're umounting anyway, 563 * At this point, we're umounting anyway,
@@ -648,10 +647,26 @@ xfs_log_unmount(xfs_mount_t *mp)
648 xlog_dealloc_log(mp->m_log); 647 xlog_dealloc_log(mp->m_log);
649} 648}
650 649
650void
651xfs_log_item_init(
652 struct xfs_mount *mp,
653 struct xfs_log_item *item,
654 int type,
655 struct xfs_item_ops *ops)
656{
657 item->li_mountp = mp;
658 item->li_ailp = mp->m_ail;
659 item->li_type = type;
660 item->li_ops = ops;
661}
662
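xfs_log_item_init() centralises the boilerplate that each log item type previously open-coded. A hedged usage sketch; the item type, ops table and wrapper below are hypothetical stand-ins for whatever a real item type defines:

    /* hypothetical item embedding the generic log item */
    struct foo_log_item {
            struct xfs_log_item fli_item;
            /* type-specific state ... */
    };

    static struct xfs_item_ops foo_item_ops = {
            /* callbacks elided */
    };

    static void
    foo_item_init(struct xfs_mount *mp, struct foo_log_item *flip)
    {
            xfs_log_item_init(mp, &flip->fli_item,
                              XFS_LI_FOO /* hypothetical type */, &foo_item_ops);
    }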
651/* 663/*
652 * Write region vectors to log. The write happens using the space reservation 664 * Write region vectors to log. The write happens using the space reservation
653 * of the ticket (tic). It is not a requirement that all writes for a given 665 * of the ticket (tic). It is not a requirement that all writes for a given
654 * transaction occur with one call to xfs_log_write(). 666 * transaction occur with one call to xfs_log_write(). However, it is important
667 * to note that the transaction reservation code makes an assumption about the
668 * number of log headers a transaction requires that may be violated if you
669 * don't pass all the transaction vectors in one call....
655 */ 670 */
656int 671int
657xfs_log_write( 672xfs_log_write(
@@ -663,11 +678,15 @@ xfs_log_write(
663{ 678{
664 struct log *log = mp->m_log; 679 struct log *log = mp->m_log;
665 int error; 680 int error;
681 struct xfs_log_vec vec = {
682 .lv_niovecs = nentries,
683 .lv_iovecp = reg,
684 };
666 685
667 if (XLOG_FORCED_SHUTDOWN(log)) 686 if (XLOG_FORCED_SHUTDOWN(log))
668 return XFS_ERROR(EIO); 687 return XFS_ERROR(EIO);
669 688
670 error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0); 689 error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
671 if (error) 690 if (error)
672 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 691 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
673 return error; 692 return error;
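xfs_log_write() now wraps its legacy iovec array in a single on-stack xfs_log_vec before calling xlog_write(). Internal callers that build several vectors can chain them through lv_next and submit the whole chain in one call, which is exactly what the reservation comment above requires. A sketch with placeholder region contents and types:

    struct xfs_log_iovec reg0 = { .i_addr = hdr,  .i_len = hdr_len,  .i_type = type0 };
    struct xfs_log_iovec reg1 = { .i_addr = body, .i_len = body_len, .i_type = type1 };
    struct xfs_log_vec lv1 = { .lv_niovecs = 1, .lv_iovecp = &reg1 };
    struct xfs_log_vec lv0 = {
            .lv_next    = &lv1,
            .lv_niovecs = 1,
            .lv_iovecp  = &reg0,
    };

    error = xlog_write(log, &lv0, ticket, &start_lsn, NULL, 0);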
@@ -745,9 +764,16 @@ xfs_log_move_tail(xfs_mount_t *mp,
745 764
746/* 765/*
747 * Determine if we have a transaction that has gone to disk 766 * Determine if we have a transaction that has gone to disk
748 * that needs to be covered. Log activity needs to be idle (no AIL and 767 * that needs to be covered. To begin the transition to the idle state
749 * nothing in the iclogs). And, we need to be in the right state indicating 768 * firstly the log needs to be idle (no AIL and nothing in the iclogs).
750 * something has gone out. 769 * If we are then in a state where covering is needed, the caller is informed
770 * that dummy transactions are required to move the log into the idle state.
771 *
772 * Because this is called as part of the sync process, we should also indicate
773 * that dummy transactions should be issued in anything but the covered or
774 * idle states. This ensures that the log tail is accurately reflected in
 775 * the log at the end of the sync, so that if a crash occurs we avoid replay
776 * of transactions where the metadata is already on disk.
751 */ 777 */
752int 778int
753xfs_log_need_covered(xfs_mount_t *mp) 779xfs_log_need_covered(xfs_mount_t *mp)
@@ -759,17 +785,24 @@ xfs_log_need_covered(xfs_mount_t *mp)
759 return 0; 785 return 0;
760 786
761 spin_lock(&log->l_icloglock); 787 spin_lock(&log->l_icloglock);
762 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || 788 switch (log->l_covered_state) {
763 (log->l_covered_state == XLOG_STATE_COVER_NEED2)) 789 case XLOG_STATE_COVER_DONE:
764 && !xfs_trans_ail_tail(log->l_ailp) 790 case XLOG_STATE_COVER_DONE2:
765 && xlog_iclogs_empty(log)) { 791 case XLOG_STATE_COVER_IDLE:
766 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 792 break;
767 log->l_covered_state = XLOG_STATE_COVER_DONE; 793 case XLOG_STATE_COVER_NEED:
768 else { 794 case XLOG_STATE_COVER_NEED2:
769 ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2); 795 if (!xfs_trans_ail_tail(log->l_ailp) &&
770 log->l_covered_state = XLOG_STATE_COVER_DONE2; 796 xlog_iclogs_empty(log)) {
797 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
798 log->l_covered_state = XLOG_STATE_COVER_DONE;
799 else
800 log->l_covered_state = XLOG_STATE_COVER_DONE2;
771 } 801 }
802 /* FALLTHRU */
803 default:
772 needed = 1; 804 needed = 1;
805 break;
773 } 806 }
774 spin_unlock(&log->l_icloglock); 807 spin_unlock(&log->l_icloglock);
775 return needed; 808 return needed;
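The switch above encodes the covering state machine more explicitly than the old compound conditional; a summary table derived from the new code:

    /*
     * l_covered_state           AIL and iclogs empty?   transition         needed
     * COVER_DONE/DONE2/IDLE     not checked             none               0
     * COVER_NEED                yes                     -> COVER_DONE      1
     * COVER_NEED2               yes                     -> COVER_DONE2     1
     * COVER_NEED/NEED2          no                      none               1
     * any other state           not checked             none               1
     */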
@@ -1006,6 +1039,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1006 int i; 1039 int i;
1007 int iclogsize; 1040 int iclogsize;
1008 int error = ENOMEM; 1041 int error = ENOMEM;
1042 uint log2_size = 0;
1009 1043
1010 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1044 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1011 if (!log) { 1045 if (!log) {
@@ -1031,29 +1065,30 @@ xlog_alloc_log(xfs_mount_t *mp,
1031 1065
1032 error = EFSCORRUPTED; 1066 error = EFSCORRUPTED;
1033 if (xfs_sb_version_hassector(&mp->m_sb)) { 1067 if (xfs_sb_version_hassector(&mp->m_sb)) {
1034 log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; 1068 log2_size = mp->m_sb.sb_logsectlog;
1035 if (log->l_sectbb_log < 0 || 1069 if (log2_size < BBSHIFT) {
1036 log->l_sectbb_log > mp->m_sectbb_log) { 1070 xlog_warn("XFS: Log sector size too small "
1037 xlog_warn("XFS: Log sector size (0x%x) out of range.", 1071 "(0x%x < 0x%x)", log2_size, BBSHIFT);
1038 log->l_sectbb_log);
1039 goto out_free_log; 1072 goto out_free_log;
1040 } 1073 }
1041 1074
1042 /* for larger sector sizes, must have v2 or external log */ 1075 log2_size -= BBSHIFT;
1043 if (log->l_sectbb_log != 0 && 1076 if (log2_size > mp->m_sectbb_log) {
1044 (log->l_logBBstart != 0 && 1077 xlog_warn("XFS: Log sector size too large "
1045 !xfs_sb_version_haslogv2(&mp->m_sb))) { 1078 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log);
1046 xlog_warn("XFS: log sector size (0x%x) invalid "
1047 "for configuration.", log->l_sectbb_log);
1048 goto out_free_log; 1079 goto out_free_log;
1049 } 1080 }
1050 if (mp->m_sb.sb_logsectlog < BBSHIFT) { 1081
1051 xlog_warn("XFS: Log sector log (0x%x) too small.", 1082 /* for larger sector sizes, must have v2 or external log */
1052 mp->m_sb.sb_logsectlog); 1083 if (log2_size && log->l_logBBstart > 0 &&
1084 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1085
1086 xlog_warn("XFS: log sector size (0x%x) invalid "
1087 "for configuration.", log2_size);
1053 goto out_free_log; 1088 goto out_free_log;
1054 } 1089 }
1055 } 1090 }
1056 log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; 1091 log->l_sectBBsize = 1 << log2_size;
1057 1092
1058 xlog_get_iclog_buffer_size(mp, log); 1093 xlog_get_iclog_buffer_size(mp, log);
1059 1094
@@ -1160,26 +1195,31 @@ out:
1160 * ticket. Return the lsn of the commit record. 1195 * ticket. Return the lsn of the commit record.
1161 */ 1196 */
1162STATIC int 1197STATIC int
1163xlog_commit_record(xfs_mount_t *mp, 1198xlog_commit_record(
1164 xlog_ticket_t *ticket, 1199 struct log *log,
1165 xlog_in_core_t **iclog, 1200 struct xlog_ticket *ticket,
1166 xfs_lsn_t *commitlsnp) 1201 struct xlog_in_core **iclog,
1202 xfs_lsn_t *commitlsnp)
1167{ 1203{
1168 int error; 1204 struct xfs_mount *mp = log->l_mp;
1169 xfs_log_iovec_t reg[1]; 1205 int error;
1170 1206 struct xfs_log_iovec reg = {
1171 reg[0].i_addr = NULL; 1207 .i_addr = NULL,
1172 reg[0].i_len = 0; 1208 .i_len = 0,
1173 reg[0].i_type = XLOG_REG_TYPE_COMMIT; 1209 .i_type = XLOG_REG_TYPE_COMMIT,
1210 };
1211 struct xfs_log_vec vec = {
1212 .lv_niovecs = 1,
1213 .lv_iovecp = &reg,
1214 };
1174 1215
1175 ASSERT_ALWAYS(iclog); 1216 ASSERT_ALWAYS(iclog);
1176 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, 1217 error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
1177 iclog, XLOG_COMMIT_TRANS))) { 1218 XLOG_COMMIT_TRANS);
1219 if (error)
1178 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1220 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1179 }
1180 return error; 1221 return error;
1181} /* xlog_commit_record */ 1222}
1182
1183 1223
1184/* 1224/*
1185 * Push on the buffer cache code if we ever use more than 75% of the on-disk 1225 * Push on the buffer cache code if we ever use more than 75% of the on-disk
@@ -1600,6 +1640,192 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1600} 1640}
1601 1641
1602/* 1642/*
1643 * Calculate the potential space needed by the log vector. Each region gets
1644 * its own xlog_op_header_t and may need to be double word aligned.
1645 */
1646static int
1647xlog_write_calc_vec_length(
1648 struct xlog_ticket *ticket,
1649 struct xfs_log_vec *log_vector)
1650{
1651 struct xfs_log_vec *lv;
1652 int headers = 0;
1653 int len = 0;
1654 int i;
1655
1656 /* acct for start rec of xact */
1657 if (ticket->t_flags & XLOG_TIC_INITED)
1658 headers++;
1659
1660 for (lv = log_vector; lv; lv = lv->lv_next) {
1661 headers += lv->lv_niovecs;
1662
1663 for (i = 0; i < lv->lv_niovecs; i++) {
1664 struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
1665
1666 len += vecp->i_len;
1667 xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
1668 }
1669 }
1670
1671 ticket->t_res_num_ophdrs += headers;
1672 len += headers * sizeof(struct xlog_op_header);
1673
1674 return len;
1675}
1676
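A worked example of the calculation, with sizes assumed for illustration (xlog_op_header is taken as 12 bytes): a freshly reserved ticket (XLOG_TIC_INITED still set) writing one log vector with two regions of 28 and 100 bytes accounts for one start-record header plus one header per region:

    headers = 1 (start rec) + 2 (regions) = 3
    len     = 28 + 100 + 3 * 12 = 164 bytes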
1677/*
1678 * If first write for transaction, insert start record. We can't be trying to
1679 * commit if we are inited. We can't have any "partial_copy" if we are inited.
1680 */
1681static int
1682xlog_write_start_rec(
1683 struct xlog_op_header *ophdr,
1684 struct xlog_ticket *ticket)
1685{
1686 if (!(ticket->t_flags & XLOG_TIC_INITED))
1687 return 0;
1688
1689 ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1690 ophdr->oh_clientid = ticket->t_clientid;
1691 ophdr->oh_len = 0;
1692 ophdr->oh_flags = XLOG_START_TRANS;
1693 ophdr->oh_res2 = 0;
1694
1695 ticket->t_flags &= ~XLOG_TIC_INITED;
1696
1697 return sizeof(struct xlog_op_header);
1698}
1699
1700static xlog_op_header_t *
1701xlog_write_setup_ophdr(
1702 struct log *log,
1703 struct xlog_op_header *ophdr,
1704 struct xlog_ticket *ticket,
1705 uint flags)
1706{
1707 ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1708 ophdr->oh_clientid = ticket->t_clientid;
1709 ophdr->oh_res2 = 0;
1710
1711 /* are we copying a commit or unmount record? */
1712 ophdr->oh_flags = flags;
1713
1714 /*
1715 * We've seen logs corrupted with bad transaction client ids. This
1716 * makes sure that XFS doesn't generate them. Turn this into an EIO
1717 * and shut down the filesystem.
1718 */
1719 switch (ophdr->oh_clientid) {
1720 case XFS_TRANSACTION:
1721 case XFS_VOLUME:
1722 case XFS_LOG:
1723 break;
1724 default:
1725 xfs_fs_cmn_err(CE_WARN, log->l_mp,
1726 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1727 ophdr->oh_clientid, ticket);
1728 return NULL;
1729 }
1730
1731 return ophdr;
1732}
1733
1734/*
1735 * Set up the parameters of the region copy into the log. This has
1736 * to handle a region write split across multiple log buffers - this
1737 * state is kept external to this function so that this code can
1738 * be written in an obvious, self-documenting manner.
1739 */
1740static int
1741xlog_write_setup_copy(
1742 struct xlog_ticket *ticket,
1743 struct xlog_op_header *ophdr,
1744 int space_available,
1745 int space_required,
1746 int *copy_off,
1747 int *copy_len,
1748 int *last_was_partial_copy,
1749 int *bytes_consumed)
1750{
1751 int still_to_copy;
1752
1753 still_to_copy = space_required - *bytes_consumed;
1754 *copy_off = *bytes_consumed;
1755
1756 if (still_to_copy <= space_available) {
1757 /* write of region completes here */
1758 *copy_len = still_to_copy;
1759 ophdr->oh_len = cpu_to_be32(*copy_len);
1760 if (*last_was_partial_copy)
1761 ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1762 *last_was_partial_copy = 0;
1763 *bytes_consumed = 0;
1764 return 0;
1765 }
1766
1767 /* partial write of region, needs extra log op header reservation */
1768 *copy_len = space_available;
1769 ophdr->oh_len = cpu_to_be32(*copy_len);
1770 ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
1771 if (*last_was_partial_copy)
1772 ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
1773 *bytes_consumed += *copy_len;
1774 (*last_was_partial_copy)++;
1775
1776 /* account for new log op header */
1777 ticket->t_curr_res -= sizeof(struct xlog_op_header);
1778 ticket->t_res_num_ophdrs++;
1779
1780 return sizeof(struct xlog_op_header);
1781}
1782
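To make the split-region state concrete, here is a trace with invented sizes: a 3000-byte region copied into iclogs that each have 2048 bytes of payload space available per call:

    call 1: still_to_copy = 3000, space_available = 2048
            -> copy_off = 0, copy_len = 2048, oh_flags |= XLOG_CONTINUE_TRANS,
               *last_was_partial_copy = 1, *bytes_consumed = 2048,
               returns sizeof(xlog_op_header) for the extra split-rec header
    call 2: still_to_copy = 952, space_available = 2048
            -> copy_off = 2048, copy_len = 952,
               oh_flags |= XLOG_END_TRANS | XLOG_WAS_CONT_TRANS,
               state reset (*last_was_partial_copy = 0, *bytes_consumed = 0),
               returns 0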
1783static int
1784xlog_write_copy_finish(
1785 struct log *log,
1786 struct xlog_in_core *iclog,
1787 uint flags,
1788 int *record_cnt,
1789 int *data_cnt,
1790 int *partial_copy,
1791 int *partial_copy_len,
1792 int log_offset,
1793 struct xlog_in_core **commit_iclog)
1794{
1795 if (*partial_copy) {
1796 /*
1797 * This iclog has already been marked WANT_SYNC by
1798 * xlog_state_get_iclog_space.
1799 */
1800 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1801 *record_cnt = 0;
1802 *data_cnt = 0;
1803 return xlog_state_release_iclog(log, iclog);
1804 }
1805
1806 *partial_copy = 0;
1807 *partial_copy_len = 0;
1808
1809 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1810 /* no more space in this iclog - push it. */
1811 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1812 *record_cnt = 0;
1813 *data_cnt = 0;
1814
1815 spin_lock(&log->l_icloglock);
1816 xlog_state_want_sync(log, iclog);
1817 spin_unlock(&log->l_icloglock);
1818
1819 if (!commit_iclog)
1820 return xlog_state_release_iclog(log, iclog);
1821 ASSERT(flags & XLOG_COMMIT_TRANS);
1822 *commit_iclog = iclog;
1823 }
1824
1825 return 0;
1826}
1827
1828/*
1603 * Write some region out to in-core log 1829 * Write some region out to in-core log
1604 * 1830 *
1605 * This will be called when writing externally provided regions or when 1831 * This will be called when writing externally provided regions or when
@@ -1641,209 +1867,157 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1641 */ 1867 */
1642STATIC int 1868STATIC int
1643xlog_write( 1869xlog_write(
1644 struct xfs_mount *mp, 1870 struct log *log,
1645 struct xfs_log_iovec reg[], 1871 struct xfs_log_vec *log_vector,
1646 int nentries,
1647 struct xlog_ticket *ticket, 1872 struct xlog_ticket *ticket,
1648 xfs_lsn_t *start_lsn, 1873 xfs_lsn_t *start_lsn,
1649 struct xlog_in_core **commit_iclog, 1874 struct xlog_in_core **commit_iclog,
1650 uint flags) 1875 uint flags)
1651{ 1876{
1652 xlog_t *log = mp->m_log; 1877 struct xlog_in_core *iclog = NULL;
1653 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ 1878 struct xfs_log_iovec *vecp;
1654 xlog_op_header_t *logop_head; /* ptr to log operation header */ 1879 struct xfs_log_vec *lv;
1655 __psint_t ptr; /* copy address into data region */ 1880 int len;
1656 int len; /* # xlog_write() bytes 2 still copy */ 1881 int index;
1657 int index; /* region index currently copying */ 1882 int partial_copy = 0;
1658 int log_offset; /* offset (from 0) into data region */ 1883 int partial_copy_len = 0;
1659 int start_rec_copy; /* # bytes to copy for start record */ 1884 int contwr = 0;
1660 int partial_copy; /* did we split a region? */ 1885 int record_cnt = 0;
1661 int partial_copy_len;/* # bytes copied if split region */ 1886 int data_cnt = 0;
1662 int need_copy; /* # bytes need to memcpy this region */ 1887 int error;
1663 int copy_len; /* # bytes actually memcpy'ing */
1664 int copy_off; /* # bytes from entry start */
1665 int contwr; /* continued write of in-core log? */
1666 int error;
1667 int record_cnt = 0, data_cnt = 0;
1668
1669 partial_copy_len = partial_copy = 0;
1670
1671 /* Calculate potential maximum space. Each region gets its own
1672 * xlog_op_header_t and may need to be double word aligned.
1673 */
1674 len = 0;
1675 if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */
1676 len += sizeof(xlog_op_header_t);
1677 ticket->t_res_num_ophdrs++;
1678 }
1679 1888
1680 for (index = 0; index < nentries; index++) { 1889 *start_lsn = 0;
1681 len += sizeof(xlog_op_header_t); /* each region gets >= 1 */
1682 ticket->t_res_num_ophdrs++;
1683 len += reg[index].i_len;
1684 xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type);
1685 }
1686 contwr = *start_lsn = 0;
1687 1890
1688 if (ticket->t_curr_res < len) { 1891 len = xlog_write_calc_vec_length(ticket, log_vector);
1689 xlog_print_tic_res(mp, ticket); 1892 if (ticket->t_curr_res < len) {
1893 xlog_print_tic_res(log->l_mp, ticket);
1690#ifdef DEBUG 1894#ifdef DEBUG
1691 xlog_panic( 1895 xlog_panic(
1692 "xfs_log_write: reservation ran out. Need to up reservation"); 1896 "xfs_log_write: reservation ran out. Need to up reservation");
1693#else 1897#else
1694 /* Customer configurable panic */ 1898 /* Customer configurable panic */
1695 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, 1899 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp,
1696 "xfs_log_write: reservation ran out. Need to up reservation"); 1900 "xfs_log_write: reservation ran out. Need to up reservation");
1697 /* If we did not panic, shutdown the filesystem */ 1901
1698 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1902 /* If we did not panic, shutdown the filesystem */
1903 xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE);
1699#endif 1904#endif
1700 } else 1905 }
1906
1701 ticket->t_curr_res -= len; 1907 ticket->t_curr_res -= len;
1702 1908
1703 for (index = 0; index < nentries; ) { 1909 index = 0;
1704 if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, 1910 lv = log_vector;
1705 &contwr, &log_offset))) 1911 vecp = lv->lv_iovecp;
1706 return error; 1912 while (lv && index < lv->lv_niovecs) {
1913 void *ptr;
1914 int log_offset;
1707 1915
1708 ASSERT(log_offset <= iclog->ic_size - 1); 1916 error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
1709 ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); 1917 &contwr, &log_offset);
1918 if (error)
1919 return error;
1710 1920
1711 /* start_lsn is the first lsn written to. That's all we need. */ 1921 ASSERT(log_offset <= iclog->ic_size - 1);
1712 if (! *start_lsn) 1922 ptr = iclog->ic_datap + log_offset;
1713 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
1714 1923
1715 /* This loop writes out as many regions as can fit in the amount 1924 /* start_lsn is the first lsn written to. That's all we need. */
1716 * of space which was allocated by xlog_state_get_iclog_space(). 1925 if (!*start_lsn)
1717 */ 1926 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
1718 while (index < nentries) {
1719 ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
1720 ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
1721 start_rec_copy = 0;
1722
1723 /* If first write for transaction, insert start record.
1724 * We can't be trying to commit if we are inited. We can't
1725 * have any "partial_copy" if we are inited.
1726 */
1727 if (ticket->t_flags & XLOG_TIC_INITED) {
1728 logop_head = (xlog_op_header_t *)ptr;
1729 logop_head->oh_tid = cpu_to_be32(ticket->t_tid);
1730 logop_head->oh_clientid = ticket->t_clientid;
1731 logop_head->oh_len = 0;
1732 logop_head->oh_flags = XLOG_START_TRANS;
1733 logop_head->oh_res2 = 0;
1734 ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */
1735 record_cnt++;
1736
1737 start_rec_copy = sizeof(xlog_op_header_t);
1738 xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
1739 }
1740 1927
1741 /* Copy log operation header directly into data section */ 1928 /*
1742 logop_head = (xlog_op_header_t *)ptr; 1929 * This loop writes out as many regions as can fit in the amount
1743 logop_head->oh_tid = cpu_to_be32(ticket->t_tid); 1930 * of space which was allocated by xlog_state_get_iclog_space().
1744 logop_head->oh_clientid = ticket->t_clientid; 1931 */
1745 logop_head->oh_res2 = 0; 1932 while (lv && index < lv->lv_niovecs) {
1933 struct xfs_log_iovec *reg = &vecp[index];
1934 struct xlog_op_header *ophdr;
1935 int start_rec_copy;
1936 int copy_len;
1937 int copy_off;
1938
1939 ASSERT(reg->i_len % sizeof(__int32_t) == 0);
1940 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
1941
1942 start_rec_copy = xlog_write_start_rec(ptr, ticket);
1943 if (start_rec_copy) {
1944 record_cnt++;
1945 xlog_write_adv_cnt(&ptr, &len, &log_offset,
1946 start_rec_copy);
1947 }
1746 1948
1747 /* header copied directly */ 1949 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
1748 xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); 1950 if (!ophdr)
1951 return XFS_ERROR(EIO);
1749 1952
1750 /* are we copying a commit or unmount record? */ 1953 xlog_write_adv_cnt(&ptr, &len, &log_offset,
1751 logop_head->oh_flags = flags; 1954 sizeof(struct xlog_op_header));
1955
1956 len += xlog_write_setup_copy(ticket, ophdr,
1957 iclog->ic_size-log_offset,
1958 reg->i_len,
1959 &copy_off, &copy_len,
1960 &partial_copy,
1961 &partial_copy_len);
1962 xlog_verify_dest_ptr(log, ptr);
1963
1964 /* copy region */
1965 ASSERT(copy_len >= 0);
1966 memcpy(ptr, reg->i_addr + copy_off, copy_len);
1967 xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
1968
1969 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1970 record_cnt++;
1971 data_cnt += contwr ? copy_len : 0;
1972
1973 error = xlog_write_copy_finish(log, iclog, flags,
1974 &record_cnt, &data_cnt,
1975 &partial_copy,
1976 &partial_copy_len,
1977 log_offset,
1978 commit_iclog);
1979 if (error)
1980 return error;
1752 1981
1753 /* 1982 /*
1754 * We've seen logs corrupted with bad transaction client 1983 * if we had a partial copy, we need to get more iclog
1755 * ids. This makes sure that XFS doesn't generate them on. 1984 * space but we don't want to increment the region
 1756 * Turn this into an EIO and shut down the filesystem. 1985 * index because there is still more in this region to
1757 */ 1986 * write.
1758 switch (logop_head->oh_clientid) { 1987 *
1759 case XFS_TRANSACTION: 1988 * If we completed writing this region, and we flushed
1760 case XFS_VOLUME: 1989 * the iclog (indicated by resetting of the record
1761 case XFS_LOG: 1990 * count), then we also need to get more log space. If
1762 break; 1991 * this was the last record, though, we are done and
1763 default: 1992 * can just return.
1764 xfs_fs_cmn_err(CE_WARN, mp, 1993 */
1765 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1994 if (partial_copy)
1766 logop_head->oh_clientid, ticket); 1995 break;
1767 return XFS_ERROR(EIO);
1768 }
1769 1996
1770 /* Partial write last time? => (partial_copy != 0) 1997 if (++index == lv->lv_niovecs) {
1771 * need_copy is the amount we'd like to copy if everything could 1998 lv = lv->lv_next;
1772 * fit in the current memcpy. 1999 index = 0;
1773 */ 2000 if (lv)
1774 need_copy = reg[index].i_len - partial_copy_len; 2001 vecp = lv->lv_iovecp;
1775 2002 }
1776 copy_off = partial_copy_len; 2003 if (record_cnt == 0) {
1777 if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ 2004 if (!lv)
1778 copy_len = need_copy; 2005 return 0;
1779 logop_head->oh_len = cpu_to_be32(copy_len); 2006 break;
1780 if (partial_copy) 2007 }
1781 logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1782 partial_copy_len = partial_copy = 0;
1783 } else { /* partial write */
1784 copy_len = iclog->ic_size - log_offset;
1785 logop_head->oh_len = cpu_to_be32(copy_len);
1786 logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
1787 if (partial_copy)
1788 logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
1789 partial_copy_len += copy_len;
1790 partial_copy++;
1791 len += sizeof(xlog_op_header_t); /* from splitting of region */
1792 /* account for new log op header */
1793 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1794 ticket->t_res_num_ophdrs++;
1795 }
1796 xlog_verify_dest_ptr(log, ptr);
1797
1798 /* copy region */
1799 ASSERT(copy_len >= 0);
1800 memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
1801 xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
1802
1803 /* make copy_len total bytes copied, including headers */
1804 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1805 record_cnt++;
1806 data_cnt += contwr ? copy_len : 0;
1807 if (partial_copy) { /* copied partial region */
1808 /* already marked WANT_SYNC by xlog_state_get_iclog_space */
1809 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1810 record_cnt = data_cnt = 0;
1811 if ((error = xlog_state_release_iclog(log, iclog)))
1812 return error;
1813 break; /* don't increment index */
1814 } else { /* copied entire region */
1815 index++;
1816 partial_copy_len = partial_copy = 0;
1817
1818 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1819 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1820 record_cnt = data_cnt = 0;
1821 spin_lock(&log->l_icloglock);
1822 xlog_state_want_sync(log, iclog);
1823 spin_unlock(&log->l_icloglock);
1824 if (commit_iclog) {
1825 ASSERT(flags & XLOG_COMMIT_TRANS);
1826 *commit_iclog = iclog;
1827 } else if ((error = xlog_state_release_iclog(log, iclog)))
1828 return error;
1829 if (index == nentries)
1830 return 0; /* we are done */
1831 else
1832 break;
1833 } 2008 }
1834 } /* if (partial_copy) */ 2009 }
1835 } /* while (index < nentries) */ 2010
1836 } /* for (index = 0; index < nentries; ) */ 2011 ASSERT(len == 0);
1837 ASSERT(len == 0); 2012
2013 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
2014 if (!commit_iclog)
2015 return xlog_state_release_iclog(log, iclog);
1838 2016
1839 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1840 if (commit_iclog) {
1841 ASSERT(flags & XLOG_COMMIT_TRANS); 2017 ASSERT(flags & XLOG_COMMIT_TRANS);
1842 *commit_iclog = iclog; 2018 *commit_iclog = iclog;
1843 return 0; 2019 return 0;
1844 } 2020}
1845 return xlog_state_release_iclog(log, iclog);
1846} /* xlog_write */
1847 2021
1848 2022
1849/***************************************************************************** 2023/*****************************************************************************
@@ -3143,14 +3317,16 @@ xfs_log_ticket_get(
3143 * Allocate and initialise a new log ticket. 3317 * Allocate and initialise a new log ticket.
3144 */ 3318 */
3145STATIC xlog_ticket_t * 3319STATIC xlog_ticket_t *
3146xlog_ticket_alloc(xlog_t *log, 3320xlog_ticket_alloc(
3147 int unit_bytes, 3321 struct log *log,
3148 int cnt, 3322 int unit_bytes,
3149 char client, 3323 int cnt,
3150 uint xflags) 3324 char client,
3325 uint xflags)
3151{ 3326{
3152 xlog_ticket_t *tic; 3327 struct xlog_ticket *tic;
3153 uint num_headers; 3328 uint num_headers;
3329 int iclog_space;
3154 3330
3155 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); 3331 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
3156 if (!tic) 3332 if (!tic)
@@ -3194,16 +3370,40 @@ xlog_ticket_alloc(xlog_t *log,
3194 /* for start-rec */ 3370 /* for start-rec */
3195 unit_bytes += sizeof(xlog_op_header_t); 3371 unit_bytes += sizeof(xlog_op_header_t);
3196 3372
3197 /* for LR headers */ 3373 /*
3198 num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); 3374 * for LR headers - the space for data in an iclog is the size minus
3375 * the space used for the headers. If we use the iclog size, then we
3376 * undercalculate the number of headers required.
3377 *
3378 * Furthermore - the addition of op headers for split-recs might
3379 * increase the space required enough to require more log and op
3380 * headers, so take that into account too.
3381 *
3382 * IMPORTANT: This reservation makes the assumption that if this
3383 * transaction is the first in an iclog and hence has the LR headers
3384 * accounted to it, then the remaining space in the iclog is
3385 * exclusively for this transaction. i.e. if the transaction is larger
3386 * than the iclog, it will be the only thing in that iclog.
3387 * Fundamentally, this means we must pass the entire log vector to
3388 * xlog_write to guarantee this.
3389 */
3390 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
3391 num_headers = howmany(unit_bytes, iclog_space);
3392
3393 /* for split-recs - ophdrs added when data split over LRs */
3394 unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3395
3396 /* add extra header reservations if we overrun */
3397 while (!num_headers ||
3398 howmany(unit_bytes, iclog_space) > num_headers) {
3399 unit_bytes += sizeof(xlog_op_header_t);
3400 num_headers++;
3401 }
3199 unit_bytes += log->l_iclog_hsize * num_headers; 3402 unit_bytes += log->l_iclog_hsize * num_headers;
3200 3403
3201 /* for commit-rec LR header - note: padding will subsume the ophdr */ 3404 /* for commit-rec LR header - note: padding will subsume the ophdr */
3202 unit_bytes += log->l_iclog_hsize; 3405 unit_bytes += log->l_iclog_hsize;
3203 3406
3204 /* for split-recs - ophdrs added when data split over LRs */
3205 unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3206
3207 /* for roundoff padding for transaction data and one for commit record */ 3407 /* for roundoff padding for transaction data and one for commit record */
3208 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && 3408 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
3209 log->l_mp->m_sb.sb_logsunit > 1) { 3409 log->l_mp->m_sb.sb_logsunit > 1) {
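Worked numbers for the reworked header reservation (all values assumed for illustration): with a 32KB iclog and a 512-byte iclog header:

    iclog_space  = 32768 - 512 = 32256
    num_headers  = howmany(70000, 32256) = 3        /* for unit_bytes = 70000 */
    unit_bytes  += 3 * sizeof(xlog_op_header_t)     /* split-rec ophdrs, +36 */
    /* howmany(70036, 32256) is still 3, so the while loop adds nothing */
    unit_bytes  += 3 * 512                          /* LR header space */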
@@ -3219,13 +3419,13 @@ xlog_ticket_alloc(xlog_t *log,
3219 tic->t_curr_res = unit_bytes; 3419 tic->t_curr_res = unit_bytes;
3220 tic->t_cnt = cnt; 3420 tic->t_cnt = cnt;
3221 tic->t_ocnt = cnt; 3421 tic->t_ocnt = cnt;
3222 tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); 3422 tic->t_tid = random32();
3223 tic->t_clientid = client; 3423 tic->t_clientid = client;
3224 tic->t_flags = XLOG_TIC_INITED; 3424 tic->t_flags = XLOG_TIC_INITED;
3225 tic->t_trans_type = 0; 3425 tic->t_trans_type = 0;
3226 if (xflags & XFS_LOG_PERM_RESERV) 3426 if (xflags & XFS_LOG_PERM_RESERV)
3227 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3427 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3228 sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); 3428 sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
3229 3429
3230 xlog_tic_reset_res(tic); 3430 xlog_tic_reset_res(tic);
3231 3431
@@ -3246,20 +3446,22 @@ xlog_ticket_alloc(xlog_t *log,
3246 * part of the log in case we trash the log structure. 3446 * part of the log in case we trash the log structure.
3247 */ 3447 */
3248void 3448void
3249xlog_verify_dest_ptr(xlog_t *log, 3449xlog_verify_dest_ptr(
3250 __psint_t ptr) 3450 struct log *log,
3451 char *ptr)
3251{ 3452{
3252 int i; 3453 int i;
3253 int good_ptr = 0; 3454 int good_ptr = 0;
3254 3455
3255 for (i=0; i < log->l_iclog_bufs; i++) { 3456 for (i = 0; i < log->l_iclog_bufs; i++) {
3256 if (ptr >= (__psint_t)log->l_iclog_bak[i] && 3457 if (ptr >= log->l_iclog_bak[i] &&
3257 ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) 3458 ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
3258 good_ptr++; 3459 good_ptr++;
3259 } 3460 }
3260 if (! good_ptr) 3461
3462 if (!good_ptr)
3261 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3463 xlog_panic("xlog_verify_dest_ptr: invalid ptr");
3262} /* xlog_verify_dest_ptr */ 3464}
3263 3465
3264STATIC void 3466STATIC void
3265xlog_verify_grant_head(xlog_t *log, int equals) 3467xlog_verify_grant_head(xlog_t *log, int equals)
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 97a24c7795a4..229d1f36ba9a 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -110,6 +110,12 @@ typedef struct xfs_log_iovec {
110 uint i_type; /* type of region */ 110 uint i_type; /* type of region */
111} xfs_log_iovec_t; 111} xfs_log_iovec_t;
112 112
113struct xfs_log_vec {
114 struct xfs_log_vec *lv_next; /* next lv in build list */
115 int lv_niovecs; /* number of iovecs in lv */
116 struct xfs_log_iovec *lv_iovecp; /* iovec array */
117};
118
113/* 119/*
114 * Structure used to pass callback function and the function's argument 120 * Structure used to pass callback function and the function's argument
115 * to the log manager. 121 * to the log manager.
@@ -126,6 +132,13 @@ typedef struct xfs_log_callback {
126struct xfs_mount; 132struct xfs_mount;
127struct xlog_in_core; 133struct xlog_in_core;
128struct xlog_ticket; 134struct xlog_ticket;
135struct xfs_log_item;
136struct xfs_item_ops;
137
138void xfs_log_item_init(struct xfs_mount *mp,
139 struct xfs_log_item *item,
140 int type,
141 struct xfs_item_ops *ops);
129 142
130xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 143xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
131 struct xlog_ticket *ticket, 144 struct xlog_ticket *ticket,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index fd02a18facd5..9cf695154451 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -396,9 +396,7 @@ typedef struct log {
396 struct xfs_buf_cancel **l_buf_cancel_table; 396 struct xfs_buf_cancel **l_buf_cancel_table;
397 int l_iclog_hsize; /* size of iclog header */ 397 int l_iclog_hsize; /* size of iclog header */
398 int l_iclog_heads; /* # of iclog header sectors */ 398 int l_iclog_heads; /* # of iclog header sectors */
399 uint l_sectbb_log; /* log2 of sector size in BBs */ 399 uint l_sectBBsize; /* sector size in BBs (2^n) */
400 uint l_sectbb_mask; /* sector size (in BBs)
401 * alignment mask */
402 int l_iclog_size; /* size of log in bytes */ 400 int l_iclog_size; /* size of log in bytes */
403 int l_iclog_size_log; /* log power size of log */ 401 int l_iclog_size_log; /* log power size of log */
404 int l_iclog_bufs; /* number of iclog buffers */ 402 int l_iclog_bufs; /* number of iclog buffers */
@@ -449,6 +447,14 @@ extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
449 447
450extern kmem_zone_t *xfs_log_ticket_zone; 448extern kmem_zone_t *xfs_log_ticket_zone;
451 449
450static inline void
451xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
452{
453 *ptr += bytes;
454 *len -= bytes;
455 *off += bytes;
456}
457
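Unlike the macro it replaces, the inline helper takes the write cursor by address, so the arguments are type-checked; usage as seen in the reworked xlog_write():

    void *ptr = iclog->ic_datap + log_offset;

    xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);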
452/* 458/*
453 * Unmount record type is used as a pseudo transaction type for the ticket. 459 * Unmount record type is used as a pseudo transaction type for the ticket.
454 * It's value must be outside the range of XFS_TRANS_* values. 460 * It's value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 22e6efdc17ea..0de08e366315 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -56,33 +56,61 @@ STATIC void xlog_recover_check_summary(xlog_t *);
56#define xlog_recover_check_summary(log) 56#define xlog_recover_check_summary(log)
57#endif 57#endif
58 58
59
60/* 59/*
61 * Sector aligned buffer routines for buffer create/read/write/access 60 * Sector aligned buffer routines for buffer create/read/write/access
62 */ 61 */
63 62
64#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ 63/*
 65 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ 64 * Verify the given count of basic blocks is a valid number of blocks
66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 65 * to specify for an operation involving the given XFS log buffer.
67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 66 * Returns nonzero if the count is valid, 0 otherwise.
67 */
68 68
69static inline int
70xlog_buf_bbcount_valid(
71 xlog_t *log,
72 int bbcount)
73{
74 return bbcount > 0 && bbcount <= log->l_logBBsize;
75}
76
77/*
78 * Allocate a buffer to hold log data. The buffer needs to be able
79 * to map to a range of nbblks basic blocks at any valid (basic
80 * block) offset within the log.
81 */
69STATIC xfs_buf_t * 82STATIC xfs_buf_t *
70xlog_get_bp( 83xlog_get_bp(
71 xlog_t *log, 84 xlog_t *log,
72 int nbblks) 85 int nbblks)
73{ 86{
74 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 87 if (!xlog_buf_bbcount_valid(log, nbblks)) {
75 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 88 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
76 XFS_ERROR_REPORT("xlog_get_bp(1)", 89 nbblks);
77 XFS_ERRLEVEL_HIGH, log->l_mp); 90 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
78 return NULL; 91 return NULL;
79 } 92 }
80 93
81 if (log->l_sectbb_log) { 94 /*
82 if (nbblks > 1) 95 * We do log I/O in units of log sectors (a power-of-2
83 nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 96 * multiple of the basic block size), so we round up the
 84 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 97 * requested size to accommodate the basic blocks required
85 } 98 * for complete log sectors.
99 *
100 * In addition, the buffer may be used for a non-sector-
101 * aligned block offset, in which case an I/O of the
102 * requested size could extend beyond the end of the
103 * buffer. If the requested size is only 1 basic block it
104 * will never straddle a sector boundary, so this won't be
105 * an issue. Nor will this be a problem if the log I/O is
106 * done in basic blocks (sector size 1). But otherwise we
107 * extend the buffer by one extra log sector to ensure
 108 * there's space to accommodate this possibility.
109 */
110 if (nbblks > 1 && log->l_sectBBsize > 1)
111 nbblks += log->l_sectBBsize;
112 nbblks = round_up(nbblks, log->l_sectBBsize);
113
86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 114 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
87} 115}
88 116
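A worked example of the sizing (sector size assumed): with l_sectBBsize = 8, i.e. a 4KB log sector, and a request for nbblks = 5:

    nbblks = 5 + 8 = 13              /* multi-block read may be misaligned */
    nbblks = round_up(13, 8) = 16    /* whole log sectors */
    /* the buffer maps BBTOB(16) = 8KB, enough for any 5-block range
     * starting at any basic block offset within the log */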
@@ -93,6 +121,10 @@ xlog_put_bp(
93 xfs_buf_free(bp); 121 xfs_buf_free(bp);
94} 122}
95 123
124/*
125 * Return the address of the start of the given block number's data
126 * in a log buffer. The buffer covers a log sector-aligned region.
127 */
96STATIC xfs_caddr_t 128STATIC xfs_caddr_t
97xlog_align( 129xlog_align(
98 xlog_t *log, 130 xlog_t *log,
@@ -100,14 +132,14 @@ xlog_align(
100 int nbblks, 132 int nbblks,
101 xfs_buf_t *bp) 133 xfs_buf_t *bp)
102{ 134{
135 xfs_daddr_t offset;
103 xfs_caddr_t ptr; 136 xfs_caddr_t ptr;
104 137
105 if (!log->l_sectbb_log) 138 offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1);
106 return XFS_BUF_PTR(bp); 139 ptr = XFS_BUF_PTR(bp) + BBTOB(offset);
140
141 ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
107 142
108 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
109 ASSERT(XFS_BUF_SIZE(bp) >=
110 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
111 return ptr; 143 return ptr;
112} 144}
113 145
@@ -124,21 +156,18 @@ xlog_bread_noalign(
124{ 156{
125 int error; 157 int error;
126 158
127 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 159 if (!xlog_buf_bbcount_valid(log, nbblks)) {
128 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 160 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
129 XFS_ERROR_REPORT("xlog_bread(1)", 161 nbblks);
130 XFS_ERRLEVEL_HIGH, log->l_mp); 162 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
131 return EFSCORRUPTED; 163 return EFSCORRUPTED;
132 } 164 }
133 165
134 if (log->l_sectbb_log) { 166 blk_no = round_down(blk_no, log->l_sectBBsize);
135 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 167 nbblks = round_up(nbblks, log->l_sectBBsize);
136 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
137 }
138 168
139 ASSERT(nbblks > 0); 169 ASSERT(nbblks > 0);
140 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 170 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
141 ASSERT(bp);
142 171
143 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 172 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
144 XFS_BUF_READ(bp); 173 XFS_BUF_READ(bp);
@@ -186,17 +215,15 @@ xlog_bwrite(
186{ 215{
187 int error; 216 int error;
188 217
189 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 218 if (!xlog_buf_bbcount_valid(log, nbblks)) {
190 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 219 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
191 XFS_ERROR_REPORT("xlog_bwrite(1)", 220 nbblks);
192 XFS_ERRLEVEL_HIGH, log->l_mp); 221 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
193 return EFSCORRUPTED; 222 return EFSCORRUPTED;
194 } 223 }
195 224
196 if (log->l_sectbb_log) { 225 blk_no = round_down(blk_no, log->l_sectBBsize);
197 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 226 nbblks = round_up(nbblks, log->l_sectBBsize);
198 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
199 }
200 227
201 ASSERT(nbblks > 0); 228 ASSERT(nbblks > 0);
202 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 229 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
@@ -327,39 +354,38 @@ xlog_find_cycle_start(
327{ 354{
328 xfs_caddr_t offset; 355 xfs_caddr_t offset;
329 xfs_daddr_t mid_blk; 356 xfs_daddr_t mid_blk;
357 xfs_daddr_t end_blk;
330 uint mid_cycle; 358 uint mid_cycle;
331 int error; 359 int error;
332 360
333 mid_blk = BLK_AVG(first_blk, *last_blk); 361 end_blk = *last_blk;
334 while (mid_blk != first_blk && mid_blk != *last_blk) { 362 mid_blk = BLK_AVG(first_blk, end_blk);
363 while (mid_blk != first_blk && mid_blk != end_blk) {
335 error = xlog_bread(log, mid_blk, 1, bp, &offset); 364 error = xlog_bread(log, mid_blk, 1, bp, &offset);
336 if (error) 365 if (error)
337 return error; 366 return error;
338 mid_cycle = xlog_get_cycle(offset); 367 mid_cycle = xlog_get_cycle(offset);
339 if (mid_cycle == cycle) { 368 if (mid_cycle == cycle)
340 *last_blk = mid_blk; 369 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
341 /* last_half_cycle == mid_cycle */ 370 else
342 } else { 371 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
343 first_blk = mid_blk; 372 mid_blk = BLK_AVG(first_blk, end_blk);
344 /* first_half_cycle == mid_cycle */
345 }
346 mid_blk = BLK_AVG(first_blk, *last_blk);
347 } 373 }
348 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || 374 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
349 (mid_blk == *last_blk && mid_blk-1 == first_blk)); 375 (mid_blk == end_blk && mid_blk-1 == first_blk));
376
377 *last_blk = end_blk;
350 378
351 return 0; 379 return 0;
352} 380}
353 381
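A short trace of the reworked binary search, with an invented cycle layout: blocks 0..7 holding cycle numbers [9,9,9,9,8,8,8,8], first_blk = 0, *last_blk = 7, cycle = 8:

    mid_blk = 3: cycle 9 != 8  -> first_blk = 3
    mid_blk = 5: cycle 8 == 8  -> end_blk = 5
    mid_blk = 4: cycle 8 == 8  -> end_blk = 4
    mid_blk = 3 == first_blk   -> loop exits, *last_blk = 4
    /* block 4 is the first block carrying cycle 8, as intended */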
354/* 382/*
355 * Check that the range of blocks does not contain the cycle number 383 * Check that a range of blocks does not contain stop_on_cycle_no.
356 * given. The scan needs to occur from front to back and the ptr into the 384 * Fill in *new_blk with the block offset where such a block is
357 * region must be updated since a later routine will need to perform another 385 * found, or with -1 (an invalid block number) if there is no such
358 * test. If the region is completely good, we end up returning the same 386 * block in the range. The scan needs to occur from front to back
359 * last block number. 387 * and the pointer into the region must be updated since a later
360 * 388 * routine will need to perform another test.
361 * Set blkno to -1 if we encounter no errors. This is an invalid block number
362 * since we don't ever expect logs to get this large.
363 */ 389 */
364STATIC int 390STATIC int
365xlog_find_verify_cycle( 391xlog_find_verify_cycle(
@@ -376,12 +402,16 @@ xlog_find_verify_cycle(
376 xfs_caddr_t buf = NULL; 402 xfs_caddr_t buf = NULL;
377 int error = 0; 403 int error = 0;
378 404
405 /*
406 * Greedily allocate a buffer big enough to handle the full
407 * range of basic blocks we'll be examining. If that fails,
408 * try a smaller size. We need to be able to read at least
409 * a log sector, or we're out of luck.
410 */
379 bufblks = 1 << ffs(nbblks); 411 bufblks = 1 << ffs(nbblks);
380
381 while (!(bp = xlog_get_bp(log, bufblks))) { 412 while (!(bp = xlog_get_bp(log, bufblks))) {
382 /* can't get enough memory to do everything in one big buffer */
383 bufblks >>= 1; 413 bufblks >>= 1;
384 if (bufblks <= log->l_sectbb_log) 414 if (bufblks < log->l_sectBBsize)
385 return ENOMEM; 415 return ENOMEM;
386 } 416 }
387 417
@@ -629,7 +659,7 @@ xlog_find_head(
629 * In this case we want to find the first block with cycle 659 * In this case we want to find the first block with cycle
630 * number matching last_half_cycle. We expect the log to be 660 * number matching last_half_cycle. We expect the log to be
631 * some variation on 661 * some variation on
632 * x + 1 ... | x ... 662 * x + 1 ... | x ... | x
633 * The first block with cycle number x (last_half_cycle) will 663 * The first block with cycle number x (last_half_cycle) will
634 * be where the new head belongs. First we do a binary search 664 * be where the new head belongs. First we do a binary search
635 * for the first occurrence of last_half_cycle. The binary 665 * for the first occurrence of last_half_cycle. The binary
@@ -639,11 +669,13 @@ xlog_find_head(
639 * the log, then we look for occurrences of last_half_cycle - 1 669 * the log, then we look for occurrences of last_half_cycle - 1
640 * at the end of the log. The cases we're looking for look 670 * at the end of the log. The cases we're looking for look
641 * like 671 * like
642 * x + 1 ... | x | x + 1 | x ... 672 * v binary search stopped here
643 * ^ binary search stopped here 673 * x + 1 ... | x | x + 1 | x ... | x
674 * ^ but we want to locate this spot
644 * or 675 * or
645 * x + 1 ... | x ... | x - 1 | x
646 * <---------> less than scan distance 676 * <---------> less than scan distance
677 * x + 1 ... | x ... | x - 1 | x
678 * ^ we want to locate this spot
647 */ 679 */
648 stop_on_cycle = last_half_cycle; 680 stop_on_cycle = last_half_cycle;
649 if ((error = xlog_find_cycle_start(log, bp, first_blk, 681 if ((error = xlog_find_cycle_start(log, bp, first_blk,
@@ -699,16 +731,16 @@ xlog_find_head(
699 * certainly not the head of the log. By searching for 731 * certainly not the head of the log. By searching for
700 * last_half_cycle-1 we accomplish that. 732 * last_half_cycle-1 we accomplish that.
701 */ 733 */
702 start_blk = log_bbnum - num_scan_bblks + head_blk;
703 ASSERT(head_blk <= INT_MAX && 734 ASSERT(head_blk <= INT_MAX &&
704 (xfs_daddr_t) num_scan_bblks - head_blk >= 0); 735 (xfs_daddr_t) num_scan_bblks >= head_blk);
736 start_blk = log_bbnum - (num_scan_bblks - head_blk);
705 if ((error = xlog_find_verify_cycle(log, start_blk, 737 if ((error = xlog_find_verify_cycle(log, start_blk,
706 num_scan_bblks - (int)head_blk, 738 num_scan_bblks - (int)head_blk,
707 (stop_on_cycle - 1), &new_blk))) 739 (stop_on_cycle - 1), &new_blk)))
708 goto bp_err; 740 goto bp_err;
709 if (new_blk != -1) { 741 if (new_blk != -1) {
710 head_blk = new_blk; 742 head_blk = new_blk;
711 goto bad_blk; 743 goto validate_head;
712 } 744 }
713 745
714 /* 746 /*
@@ -726,7 +758,7 @@ xlog_find_head(
726 head_blk = new_blk; 758 head_blk = new_blk;
727 } 759 }
728 760
729 bad_blk: 761validate_head:
730 /* 762 /*
731 * Now we need to make sure head_blk is not pointing to a block in 763 * Now we need to make sure head_blk is not pointing to a block in
732 * the middle of a log record. 764 * the middle of a log record.
@@ -748,7 +780,7 @@ xlog_find_head(
748 if ((error = xlog_find_verify_log_record(log, start_blk, 780 if ((error = xlog_find_verify_log_record(log, start_blk,
749 &head_blk, 0)) == -1) { 781 &head_blk, 0)) == -1) {
750 /* We hit the beginning of the log during our search */ 782 /* We hit the beginning of the log during our search */
751 start_blk = log_bbnum - num_scan_bblks + head_blk; 783 start_blk = log_bbnum - (num_scan_bblks - head_blk);
752 new_blk = log_bbnum; 784 new_blk = log_bbnum;
753 ASSERT(start_blk <= INT_MAX && 785 ASSERT(start_blk <= INT_MAX &&
754 (xfs_daddr_t) log_bbnum-start_blk >= 0); 786 (xfs_daddr_t) log_bbnum-start_blk >= 0);
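
Both hunks above regroup log_bbnum - num_scan_bblks + head_blk as log_bbnum - (num_scan_bblks - head_blk). The results are algebraically identical; the regrouped form mirrors the adjacent ASSERT that num_scan_bblks >= head_blk, making it clear the inner subtraction stays non-negative when the scan window wraps past the physical start of the log. A stand-alone sketch of the wrap arithmetic, with a simplified type standing in for xfs_daddr_t:

typedef long long xdaddr_t;	/* stands in for xfs_daddr_t */

/*
 * When head_blk < num_scan_bblks, part of the scan window lies at
 * the physical end of the log and wraps around to block 0; the
 * window therefore starts this many blocks before the end.
 */
static xdaddr_t wrapped_scan_start(xdaddr_t log_bbnum,
				   xdaddr_t num_scan_bblks,
				   xdaddr_t head_blk)
{
	/* non-negative by the ASSERT: num_scan_bblks >= head_blk */
	return log_bbnum - (num_scan_bblks - head_blk);
}
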
@@ -833,12 +865,12 @@ xlog_find_tail(
833 if (*head_blk == 0) { /* special case */ 865 if (*head_blk == 0) { /* special case */
834 error = xlog_bread(log, 0, 1, bp, &offset); 866 error = xlog_bread(log, 0, 1, bp, &offset);
835 if (error) 867 if (error)
836 goto bread_err; 868 goto done;
837 869
838 if (xlog_get_cycle(offset) == 0) { 870 if (xlog_get_cycle(offset) == 0) {
839 *tail_blk = 0; 871 *tail_blk = 0;
840 /* leave all other log inited values alone */ 872 /* leave all other log inited values alone */
841 goto exit; 873 goto done;
842 } 874 }
843 } 875 }
844 876
@@ -849,7 +881,7 @@ xlog_find_tail(
849 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 881 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
850 error = xlog_bread(log, i, 1, bp, &offset); 882 error = xlog_bread(log, i, 1, bp, &offset);
851 if (error) 883 if (error)
852 goto bread_err; 884 goto done;
853 885
854 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 886 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
855 found = 1; 887 found = 1;
@@ -866,7 +898,7 @@ xlog_find_tail(
866 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 898 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
867 error = xlog_bread(log, i, 1, bp, &offset); 899 error = xlog_bread(log, i, 1, bp, &offset);
868 if (error) 900 if (error)
869 goto bread_err; 901 goto done;
870 902
871 if (XLOG_HEADER_MAGIC_NUM == 903 if (XLOG_HEADER_MAGIC_NUM ==
872 be32_to_cpu(*(__be32 *)offset)) { 904 be32_to_cpu(*(__be32 *)offset)) {
@@ -941,7 +973,7 @@ xlog_find_tail(
941 umount_data_blk = (i + hblks) % log->l_logBBsize; 973 umount_data_blk = (i + hblks) % log->l_logBBsize;
942 error = xlog_bread(log, umount_data_blk, 1, bp, &offset); 974 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
943 if (error) 975 if (error)
944 goto bread_err; 976 goto done;
945 977
946 op_head = (xlog_op_header_t *)offset; 978 op_head = (xlog_op_header_t *)offset;
947 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 979 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
@@ -987,12 +1019,10 @@ xlog_find_tail(
987 * But... if the -device- itself is readonly, just skip this. 1019 * But... if the -device- itself is readonly, just skip this.
988 * We can't recover this device anyway, so it won't matter. 1020 * We can't recover this device anyway, so it won't matter.
989 */ 1021 */
990 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { 1022 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
991 error = xlog_clear_stale_blocks(log, tail_lsn); 1023 error = xlog_clear_stale_blocks(log, tail_lsn);
992 }
993 1024
994bread_err: 1025done:
995exit:
996 xlog_put_bp(bp); 1026 xlog_put_bp(bp);
997 1027
998 if (error) 1028 if (error)
@@ -1152,16 +1182,22 @@ xlog_write_log_records(
1152 xfs_caddr_t offset; 1182 xfs_caddr_t offset;
1153 xfs_buf_t *bp; 1183 xfs_buf_t *bp;
1154 int balign, ealign; 1184 int balign, ealign;
1155 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 1185 int sectbb = log->l_sectBBsize;
1156 int end_block = start_block + blocks; 1186 int end_block = start_block + blocks;
1157 int bufblks; 1187 int bufblks;
1158 int error = 0; 1188 int error = 0;
1159 int i, j = 0; 1189 int i, j = 0;
1160 1190
1191 /*
1192 * Greedily allocate a buffer big enough to handle the full
1193 * range of basic blocks to be written. If that fails, try
1194 * a smaller size. We need to be able to write at least a
1195 * log sector, or we're out of luck.
1196 */
1161 bufblks = 1 << ffs(blocks); 1197 bufblks = 1 << ffs(blocks);
1162 while (!(bp = xlog_get_bp(log, bufblks))) { 1198 while (!(bp = xlog_get_bp(log, bufblks))) {
1163 bufblks >>= 1; 1199 bufblks >>= 1;
1164 if (bufblks <= log->l_sectbb_log) 1200 if (bufblks < sectbb)
1165 return ENOMEM; 1201 return ENOMEM;
1166 } 1202 }
1167 1203
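
The comment added above states the allocation strategy; here is a self-contained user-space sketch of the same pattern, with malloc standing in for xlog_get_bp() and 512 for the basic-block size (function name is hypothetical):

#include <errno.h>
#include <stdlib.h>
#include <strings.h>		/* ffs() */

/*
 * Greedy buffer sizing: start with a power of two covering the
 * whole range, halve on allocation failure, and give up once the
 * buffer can no longer hold a single log sector.
 */
static int alloc_log_buffer(int blocks, int sectbb, void **bpp)
{
	int	bufblks = 1 << ffs(blocks);
	void	*bp;

	while (!(bp = malloc((size_t)bufblks * 512))) {
		bufblks >>= 1;
		if (bufblks < sectbb)
			return ENOMEM;
	}
	*bpp = bp;
	return 0;
}
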
@@ -1169,7 +1205,7 @@ xlog_write_log_records(
1169 * the buffer in the starting sector not covered by the first 1205 * the buffer in the starting sector not covered by the first
1170 * write below. 1206 * write below.
1171 */ 1207 */
1172 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1208 balign = round_down(start_block, sectbb);
1173 if (balign != start_block) { 1209 if (balign != start_block) {
1174 error = xlog_bread_noalign(log, start_block, 1, bp); 1210 error = xlog_bread_noalign(log, start_block, 1, bp);
1175 if (error) 1211 if (error)
@@ -1188,7 +1224,7 @@ xlog_write_log_records(
1188 * the buffer in the final sector not covered by the write. 1224 * the buffer in the final sector not covered by the write.
1189 * If this is the same sector as the above read, skip it. 1225 * If this is the same sector as the above read, skip it.
1190 */ 1226 */
1191 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); 1227 ealign = round_down(end_block, sectbb);
1192 if (j == 0 && (start_block + endcount > ealign)) { 1228 if (j == 0 && (start_block + endcount > ealign)) {
1193 offset = XFS_BUF_PTR(bp); 1229 offset = XFS_BUF_PTR(bp);
1194 balign = BBTOB(ealign - start_block); 1230 balign = BBTOB(ealign - start_block);
@@ -1408,6 +1444,7 @@ xlog_recover_add_item(
1408 1444
1409STATIC int 1445STATIC int
1410xlog_recover_add_to_cont_trans( 1446xlog_recover_add_to_cont_trans(
1447 struct log *log,
1411 xlog_recover_t *trans, 1448 xlog_recover_t *trans,
1412 xfs_caddr_t dp, 1449 xfs_caddr_t dp,
1413 int len) 1450 int len)
@@ -1434,6 +1471,7 @@ xlog_recover_add_to_cont_trans(
1434 memcpy(&ptr[old_len], dp, len); /* d, s, l */ 1471 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1435 item->ri_buf[item->ri_cnt-1].i_len += len; 1472 item->ri_buf[item->ri_cnt-1].i_len += len;
1436 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 1473 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1474 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1437 return 0; 1475 return 0;
1438} 1476}
1439 1477
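
For context, the body of xlog_recover_add_to_cont_trans() (only its tail is visible in the hunk above) reallocates the last logged region and appends the continuation bytes onto it. A reduced sketch of that append step, with a plain struct standing in for the ri_buf entry:

#include <stdlib.h>
#include <string.h>

struct region {			/* stands in for item->ri_buf[n] */
	char	*addr;
	int	len;
};

/* Append continuation data from a split log record onto the tail
 * of the previously recovered region, as the memcpy above does. */
static int append_cont(struct region *r, const char *dp, int len)
{
	char *ptr = realloc(r->addr, r->len + len);

	if (!ptr)
		return -1;
	memcpy(ptr + r->len, dp, len);	/* old contents preserved */
	r->addr = ptr;
	r->len += len;
	return 0;
}
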
@@ -1452,6 +1490,7 @@ xlog_recover_add_to_cont_trans(
1452 */ 1490 */
1453STATIC int 1491STATIC int
1454xlog_recover_add_to_trans( 1492xlog_recover_add_to_trans(
1493 struct log *log,
1455 xlog_recover_t *trans, 1494 xlog_recover_t *trans,
1456 xfs_caddr_t dp, 1495 xfs_caddr_t dp,
1457 int len) 1496 int len)
@@ -1510,6 +1549,7 @@ xlog_recover_add_to_trans(
1510 item->ri_buf[item->ri_cnt].i_addr = ptr; 1549 item->ri_buf[item->ri_cnt].i_addr = ptr;
1511 item->ri_buf[item->ri_cnt].i_len = len; 1550 item->ri_buf[item->ri_cnt].i_len = len;
1512 item->ri_cnt++; 1551 item->ri_cnt++;
1552 trace_xfs_log_recover_item_add(log, trans, item, 0);
1513 return 0; 1553 return 0;
1514} 1554}
1515 1555
@@ -1521,7 +1561,9 @@ xlog_recover_add_to_trans(
1521 */ 1561 */
1522STATIC int 1562STATIC int
1523xlog_recover_reorder_trans( 1563xlog_recover_reorder_trans(
1524 xlog_recover_t *trans) 1564 struct log *log,
1565 xlog_recover_t *trans,
1566 int pass)
1525{ 1567{
1526 xlog_recover_item_t *item, *n; 1568 xlog_recover_item_t *item, *n;
1527 LIST_HEAD(sort_list); 1569 LIST_HEAD(sort_list);
@@ -1535,6 +1577,8 @@ xlog_recover_reorder_trans(
1535 switch (ITEM_TYPE(item)) { 1577 switch (ITEM_TYPE(item)) {
1536 case XFS_LI_BUF: 1578 case XFS_LI_BUF:
1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { 1579 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
1580 trace_xfs_log_recover_item_reorder_head(log,
1581 trans, item, pass);
1538 list_move(&item->ri_list, &trans->r_itemq); 1582 list_move(&item->ri_list, &trans->r_itemq);
1539 break; 1583 break;
1540 } 1584 }
@@ -1543,6 +1587,8 @@ xlog_recover_reorder_trans(
1543 case XFS_LI_QUOTAOFF: 1587 case XFS_LI_QUOTAOFF:
1544 case XFS_LI_EFD: 1588 case XFS_LI_EFD:
1545 case XFS_LI_EFI: 1589 case XFS_LI_EFI:
1590 trace_xfs_log_recover_item_reorder_tail(log,
1591 trans, item, pass);
1546 list_move_tail(&item->ri_list, &trans->r_itemq); 1592 list_move_tail(&item->ri_list, &trans->r_itemq);
1547 break; 1593 break;
1548 default: 1594 default:
@@ -1592,8 +1638,10 @@ xlog_recover_do_buffer_pass1(
1592 /* 1638 /*
1593 * If this isn't a cancel buffer item, then just return. 1639 * If this isn't a cancel buffer item, then just return.
1594 */ 1640 */
1595 if (!(flags & XFS_BLI_CANCEL)) 1641 if (!(flags & XFS_BLI_CANCEL)) {
1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1596 return; 1643 return;
1644 }
1597 1645
1598 /* 1646 /*
1599 * Insert an xfs_buf_cancel record into the hash table of 1647 * Insert an xfs_buf_cancel record into the hash table of
@@ -1627,6 +1675,7 @@ xlog_recover_do_buffer_pass1(
1627 while (nextp != NULL) { 1675 while (nextp != NULL) {
1628 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1676 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1629 nextp->bc_refcount++; 1677 nextp->bc_refcount++;
1678 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1630 return; 1679 return;
1631 } 1680 }
1632 prevp = nextp; 1681 prevp = nextp;
@@ -1640,6 +1689,7 @@ xlog_recover_do_buffer_pass1(
1640 bcp->bc_refcount = 1; 1689 bcp->bc_refcount = 1;
1641 bcp->bc_next = NULL; 1690 bcp->bc_next = NULL;
1642 prevp->bc_next = bcp; 1691 prevp->bc_next = bcp;
1692 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1643} 1693}
1644 1694
1645/* 1695/*
@@ -1779,6 +1829,8 @@ xlog_recover_do_inode_buffer(
1779 unsigned int *data_map = NULL; 1829 unsigned int *data_map = NULL;
1780 unsigned int map_size = 0; 1830 unsigned int map_size = 0;
1781 1831
1832 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1833
1782 switch (buf_f->blf_type) { 1834 switch (buf_f->blf_type) {
1783 case XFS_LI_BUF: 1835 case XFS_LI_BUF:
1784 data_map = buf_f->blf_data_map; 1836 data_map = buf_f->blf_data_map;
@@ -1874,6 +1926,7 @@ xlog_recover_do_inode_buffer(
1874/*ARGSUSED*/ 1926/*ARGSUSED*/
1875STATIC void 1927STATIC void
1876xlog_recover_do_reg_buffer( 1928xlog_recover_do_reg_buffer(
1929 struct xfs_mount *mp,
1877 xlog_recover_item_t *item, 1930 xlog_recover_item_t *item,
1878 xfs_buf_t *bp, 1931 xfs_buf_t *bp,
1879 xfs_buf_log_format_t *buf_f) 1932 xfs_buf_log_format_t *buf_f)
@@ -1885,6 +1938,8 @@ xlog_recover_do_reg_buffer(
1885 unsigned int map_size = 0; 1938 unsigned int map_size = 0;
1886 int error; 1939 int error;
1887 1940
1941 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1942
1888 switch (buf_f->blf_type) { 1943 switch (buf_f->blf_type) {
1889 case XFS_LI_BUF: 1944 case XFS_LI_BUF:
1890 data_map = buf_f->blf_data_map; 1945 data_map = buf_f->blf_data_map;
@@ -2083,6 +2138,8 @@ xlog_recover_do_dquot_buffer(
2083{ 2138{
2084 uint type; 2139 uint type;
2085 2140
2141 trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2142
2086 /* 2143 /*
2087 * Filesystems are required to send in quota flags at mount time. 2144 * Filesystems are required to send in quota flags at mount time.
2088 */ 2145 */
@@ -2103,7 +2160,7 @@ xlog_recover_do_dquot_buffer(
2103 if (log->l_quotaoffs_flag & type) 2160 if (log->l_quotaoffs_flag & type)
2104 return; 2161 return;
2105 2162
2106 xlog_recover_do_reg_buffer(item, bp, buf_f); 2163 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2107} 2164}
2108 2165
2109/* 2166/*
@@ -2164,9 +2221,11 @@ xlog_recover_do_buffer_trans(
2164 */ 2221 */
2165 cancel = xlog_recover_do_buffer_pass2(log, buf_f); 2222 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2166 if (cancel) { 2223 if (cancel) {
2224 trace_xfs_log_recover_buf_cancel(log, buf_f);
2167 return 0; 2225 return 0;
2168 } 2226 }
2169 } 2227 }
2228 trace_xfs_log_recover_buf_recover(log, buf_f);
2170 switch (buf_f->blf_type) { 2229 switch (buf_f->blf_type) {
2171 case XFS_LI_BUF: 2230 case XFS_LI_BUF:
2172 blkno = buf_f->blf_blkno; 2231 blkno = buf_f->blf_blkno;
@@ -2204,7 +2263,7 @@ xlog_recover_do_buffer_trans(
2204 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2263 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
2205 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2206 } else { 2265 } else {
2207 xlog_recover_do_reg_buffer(item, bp, buf_f); 2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2208 } 2267 }
2209 if (error) 2268 if (error)
2210 return XFS_ERROR(error); 2269 return XFS_ERROR(error);
@@ -2284,8 +2343,10 @@ xlog_recover_do_inode_trans(
2284 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2343 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2285 in_f->ilf_len, 0)) { 2344 in_f->ilf_len, 0)) {
2286 error = 0; 2345 error = 0;
2346 trace_xfs_log_recover_inode_cancel(log, in_f);
2287 goto error; 2347 goto error;
2288 } 2348 }
2349 trace_xfs_log_recover_inode_recover(log, in_f);
2289 2350
2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2351 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2291 XBF_LOCK); 2352 XBF_LOCK);
@@ -2337,6 +2398,7 @@ xlog_recover_do_inode_trans(
2337 /* do nothing */ 2398 /* do nothing */
2338 } else { 2399 } else {
2339 xfs_buf_relse(bp); 2400 xfs_buf_relse(bp);
2401 trace_xfs_log_recover_inode_skip(log, in_f);
2340 error = 0; 2402 error = 0;
2341 goto error; 2403 goto error;
2342 } 2404 }
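
A pattern running through these hunks: each recovery helper gains a struct log (or mount) argument purely so the new trace_xfs_log_recover_* hooks have device context. The hooks themselves are generated by the kernel tracepoint machinery in xfs_trace.h; the stand-in below only shows the call shape — a printf in place of the real, compile-to-nop-when-disabled tracepoint:

#include <stdio.h>

struct log;			/* opaque, as in the callers above */
struct xlog_recover;
struct xlog_recover_item;

/* Illustrative stand-in for a TRACE_EVENT-generated hook. */
static void trace_item_recover(struct log *log,
			       struct xlog_recover *trans,
			       struct xlog_recover_item *item,
			       int pass)
{
	(void)log; (void)trans; (void)item;
	printf("log_recover_item_recover: pass %d\n", pass);
}
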
@@ -2758,11 +2820,12 @@ xlog_recover_do_trans(
2758 int error = 0; 2820 int error = 0;
2759 xlog_recover_item_t *item; 2821 xlog_recover_item_t *item;
2760 2822
2761 error = xlog_recover_reorder_trans(trans); 2823 error = xlog_recover_reorder_trans(log, trans, pass);
2762 if (error) 2824 if (error)
2763 return error; 2825 return error;
2764 2826
2765 list_for_each_entry(item, &trans->r_itemq, ri_list) { 2827 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2828 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2766 switch (ITEM_TYPE(item)) { 2829 switch (ITEM_TYPE(item)) {
2767 case XFS_LI_BUF: 2830 case XFS_LI_BUF:
2768 error = xlog_recover_do_buffer_trans(log, item, pass); 2831 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2919,8 +2982,9 @@ xlog_recover_process_data(
2919 error = xlog_recover_unmount_trans(trans); 2982 error = xlog_recover_unmount_trans(trans);
2920 break; 2983 break;
2921 case XLOG_WAS_CONT_TRANS: 2984 case XLOG_WAS_CONT_TRANS:
2922 error = xlog_recover_add_to_cont_trans(trans, 2985 error = xlog_recover_add_to_cont_trans(log,
2923 dp, be32_to_cpu(ohead->oh_len)); 2986 trans, dp,
2987 be32_to_cpu(ohead->oh_len));
2924 break; 2988 break;
2925 case XLOG_START_TRANS: 2989 case XLOG_START_TRANS:
2926 xlog_warn( 2990 xlog_warn(
@@ -2930,7 +2994,7 @@ xlog_recover_process_data(
2930 break; 2994 break;
2931 case 0: 2995 case 0:
2932 case XLOG_CONTINUE_TRANS: 2996 case XLOG_CONTINUE_TRANS:
2933 error = xlog_recover_add_to_trans(trans, 2997 error = xlog_recover_add_to_trans(log, trans,
2934 dp, be32_to_cpu(ohead->oh_len)); 2998 dp, be32_to_cpu(ohead->oh_len));
2935 break; 2999 break;
2936 default: 3000 default:
@@ -3331,42 +3395,6 @@ xlog_pack_data(
3331 } 3395 }
3332} 3396}
3333 3397
3334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3335STATIC void
3336xlog_unpack_data_checksum(
3337 xlog_rec_header_t *rhead,
3338 xfs_caddr_t dp,
3339 xlog_t *log)
3340{
3341 __be32 *up = (__be32 *)dp;
3342 uint chksum = 0;
3343 int i;
3344
3345 /* divide length by 4 to get # words */
3346 for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
3347 chksum ^= be32_to_cpu(*up);
3348 up++;
3349 }
3350 if (chksum != be32_to_cpu(rhead->h_chksum)) {
3351 if (rhead->h_chksum ||
3352 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3353 cmn_err(CE_DEBUG,
3354 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
3355 be32_to_cpu(rhead->h_chksum), chksum);
3356 cmn_err(CE_DEBUG,
3357"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3358 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3359 cmn_err(CE_DEBUG,
3360 "XFS: LogR this is a LogV2 filesystem\n");
3361 }
3362 log->l_flags |= XLOG_CHKSUM_MISMATCH;
3363 }
3364 }
3365}
3366#else
3367#define xlog_unpack_data_checksum(rhead, dp, log)
3368#endif
3369
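
The debug-only helper deleted above folded the record payload into a 32-bit value by XOR-ing each word. An equivalent host-endian sketch of just the checksum math (the original applied be32_to_cpu to each word first):

#include <stdint.h>

/* XOR-fold a record payload, as the removed
 * xlog_unpack_data_checksum() did. */
static uint32_t xlog_xor_checksum(const uint32_t *words, int nwords)
{
	uint32_t chksum = 0;

	while (nwords-- > 0)
		chksum ^= *words++;
	return chksum;
}
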
3370STATIC void 3398STATIC void
3371xlog_unpack_data( 3399xlog_unpack_data(
3372 xlog_rec_header_t *rhead, 3400 xlog_rec_header_t *rhead,
@@ -3390,8 +3418,6 @@ xlog_unpack_data(
3390 dp += BBSIZE; 3418 dp += BBSIZE;
3391 } 3419 }
3392 } 3420 }
3393
3394 xlog_unpack_data_checksum(rhead, dp, log);
3395} 3421}
3396 3422
3397STATIC int 3423STATIC int
@@ -3490,7 +3516,7 @@ xlog_do_recovery_pass(
3490 hblks = 1; 3516 hblks = 1;
3491 } 3517 }
3492 } else { 3518 } else {
3493 ASSERT(log->l_sectbb_log == 0); 3519 ASSERT(log->l_sectBBsize == 1);
3494 hblks = 1; 3520 hblks = 1;
3495 hbp = xlog_get_bp(log, 1); 3521 hbp = xlog_get_bp(log, 1);
3496 h_size = XLOG_BIG_RECORD_BSIZE; 3522 h_size = XLOG_BIG_RECORD_BSIZE;
@@ -3946,10 +3972,6 @@ xlog_recover_check_summary(
3946 xfs_agf_t *agfp; 3972 xfs_agf_t *agfp;
3947 xfs_buf_t *agfbp; 3973 xfs_buf_t *agfbp;
3948 xfs_buf_t *agibp; 3974 xfs_buf_t *agibp;
3949 xfs_buf_t *sbbp;
3950#ifdef XFS_LOUD_RECOVERY
3951 xfs_sb_t *sbp;
3952#endif
3953 xfs_agnumber_t agno; 3975 xfs_agnumber_t agno;
3954 __uint64_t freeblks; 3976 __uint64_t freeblks;
3955 __uint64_t itotal; 3977 __uint64_t itotal;
@@ -3984,30 +4006,5 @@ xlog_recover_check_summary(
3984 xfs_buf_relse(agibp); 4006 xfs_buf_relse(agibp);
3985 } 4007 }
3986 } 4008 }
3987
3988 sbbp = xfs_getsb(mp, 0);
3989#ifdef XFS_LOUD_RECOVERY
3990 sbp = &mp->m_sb;
3991 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
3992 cmn_err(CE_NOTE,
3993 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
3994 sbp->sb_icount, itotal);
3995 cmn_err(CE_NOTE,
3996 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
3997 sbp->sb_ifree, ifree);
3998 cmn_err(CE_NOTE,
3999 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4000 sbp->sb_fdblocks, freeblks);
4001#if 0
4002 /*
4003 * This is turned off until I account for the allocation
4004 * btree blocks which live in free space.
4005 */
4006 ASSERT(sbp->sb_icount == itotal);
4007 ASSERT(sbp->sb_ifree == ifree);
4008 ASSERT(sbp->sb_fdblocks == freeblks);
4009#endif
4010#endif
4011 xfs_buf_relse(sbbp);
4012} 4009}
4013#endif /* DEBUG */ 4010#endif /* DEBUG */
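
One more note on the xlog_write_log_records() hunks earlier in this file: the private XLOG_SECTOR_ROUNDDOWN_BLKNO() macro is replaced by the kernel's generic round_down(), which for a power-of-two alignment is a single mask. A stand-alone equivalent:

/* round_down(x, y) for power-of-two y, matching the semantics of
 * the kernel helper used for sector alignment above. */
static inline long long round_down_pow2(long long x, long long y)
{
	return x & ~(y - 1);	/* e.g. round_down(19, 8) == 16 */
}
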
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e79b56b4bca6..d7bf38c8cd1c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1405,13 +1405,6 @@ xfs_mountfs(
1405 xfs_qm_mount_quotas(mp); 1405 xfs_qm_mount_quotas(mp);
1406 } 1406 }
1407 1407
1408#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1409 if (XFS_IS_QUOTA_ON(mp))
1410 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
1411 else
1412 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
1413#endif
1414
1415 /* 1408 /*
1416 * Now we are mounted, reserve a small amount of unused space for 1409 * Now we are mounted, reserve a small amount of unused space for
1417 * privileged transactions. This is needed so that transaction 1410 * privileged transactions. This is needed so that transaction
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4fa0bc7b983e..9ff48a16a7ee 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -259,6 +259,7 @@ typedef struct xfs_mount {
259 wait_queue_head_t m_wait_single_sync_task; 259 wait_queue_head_t m_wait_single_sync_task;
260 __int64_t m_update_flags; /* sb flags we need to update 260 __int64_t m_update_flags; /* sb flags we need to update
261 on the next remount,rw */ 261 on the next remount,rw */
262 struct list_head m_mplist; /* inode shrinker mount list */
262} xfs_mount_t; 263} xfs_mount_t;
263 264
264/* 265/*
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fdcab3f81dde..e0e64b113bd6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat {
201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ 201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ 202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ 203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
204#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */
205#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */
206#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */
207#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ 204#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
208#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ 205#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
209#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ 206#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f73e358bae8d..be578ecb4af2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -45,23 +45,12 @@
45#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
46#include "xfs_inode_item.h" 46#include "xfs_inode_item.h"
47 47
48
49STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
50STATIC uint xfs_trans_count_vecs(xfs_trans_t *);
51STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
52STATIC void xfs_trans_uncommit(xfs_trans_t *, uint);
53STATIC void xfs_trans_committed(xfs_trans_t *, int);
54STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
55STATIC void xfs_trans_free(xfs_trans_t *);
56
57kmem_zone_t *xfs_trans_zone; 48kmem_zone_t *xfs_trans_zone;
58 49
59
60/* 50/*
61 * Reservation functions here avoid a huge stack in xfs_trans_init 51 * Reservation functions here avoid a huge stack in xfs_trans_init
62 * due to register overflow from temporaries in the calculations. 52 * due to register overflow from temporaries in the calculations.
63 */ 53 */
64
65STATIC uint 54STATIC uint
66xfs_calc_write_reservation(xfs_mount_t *mp) 55xfs_calc_write_reservation(xfs_mount_t *mp)
67{ 56{
@@ -261,6 +250,19 @@ _xfs_trans_alloc(
261} 250}
262 251
263/* 252/*
253 * Free the transaction structure. If there is more clean up
254 * to do when the structure is freed, add it here.
255 */
256STATIC void
257xfs_trans_free(
258 xfs_trans_t *tp)
259{
260 atomic_dec(&tp->t_mountp->m_active_trans);
261 xfs_trans_free_dqinfo(tp);
262 kmem_zone_free(xfs_trans_zone, tp);
263}
264
265/*
264 * This is called to create a new transaction which will share the 266 * This is called to create a new transaction which will share the
265 * permanent log reservation of the given transaction. The remaining 267 * permanent log reservation of the given transaction. The remaining
266 * unused block and rt extent reservations are also inherited. This 268 * unused block and rt extent reservations are also inherited. This
@@ -764,94 +766,278 @@ xfs_trans_unreserve_and_mod_sb(
764 } 766 }
765} 767}
766 768
769/*
770 * Total up the number of log iovecs needed to commit this
771 * transaction. The transaction itself needs one for the
772 * transaction header. Ask each dirty item in turn how many
773 * it needs to get the total.
774 */
775static uint
776xfs_trans_count_vecs(
777 struct xfs_trans *tp)
778{
779 int nvecs;
780 xfs_log_item_desc_t *lidp;
781
782 nvecs = 1;
783 lidp = xfs_trans_first_item(tp);
784 ASSERT(lidp != NULL);
785
786 /* In the non-debug case we need to start bailing out if we
787 * didn't find a log_item here, return zero and let trans_commit
788 * deal with it.
789 */
790 if (lidp == NULL)
791 return 0;
792
793 while (lidp != NULL) {
794 /*
795 * Skip items which aren't dirty in this transaction.
796 */
797 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
798 lidp = xfs_trans_next_item(tp, lidp);
799 continue;
800 }
801 lidp->lid_size = IOP_SIZE(lidp->lid_item);
802 nvecs += lidp->lid_size;
803 lidp = xfs_trans_next_item(tp, lidp);
804 }
805
806 return nvecs;
807}
767 808
768/* 809/*
769 * xfs_trans_commit 810 * Fill in the vector with pointers to data to be logged
811 * by this transaction. The transaction header takes
812 * the first vector, and then each dirty item takes the
813 * number of vectors it indicated it needed in xfs_trans_count_vecs().
770 * 814 *
771 * Commit the given transaction to the log a/synchronously. 815 * As each item fills in the entries it needs, also pin the item
816 * so that it cannot be flushed out until the log write completes.
817 */
818static void
819xfs_trans_fill_vecs(
820 struct xfs_trans *tp,
821 struct xfs_log_iovec *log_vector)
822{
823 xfs_log_item_desc_t *lidp;
824 struct xfs_log_iovec *vecp;
825 uint nitems;
826
827 /*
828 * Skip over the entry for the transaction header, we'll
829 * fill that in at the end.
830 */
831 vecp = log_vector + 1;
832
833 nitems = 0;
834 lidp = xfs_trans_first_item(tp);
835 ASSERT(lidp);
836 while (lidp) {
837 /* Skip items which aren't dirty in this transaction. */
838 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
839 lidp = xfs_trans_next_item(tp, lidp);
840 continue;
841 }
842
843 /*
844 * The item may be marked dirty but not log anything. This can
845 * be used to get called when a transaction is committed.
846 */
847 if (lidp->lid_size)
848 nitems++;
849 IOP_FORMAT(lidp->lid_item, vecp);
850 vecp += lidp->lid_size;
851 IOP_PIN(lidp->lid_item);
852 lidp = xfs_trans_next_item(tp, lidp);
853 }
854
855 /*
856 * Now that we've counted the number of items in this transaction, fill
857 * in the transaction header. Note that the transaction header does not
858 * have a log item.
859 */
860 tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
861 tp->t_header.th_type = tp->t_type;
862 tp->t_header.th_num_items = nitems;
863 log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
864 log_vector->i_len = sizeof(xfs_trans_header_t);
865 log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
866}
867
868/*
869 * The committed item processing consists of calling the committed routine of
870 * each logged item, updating the item's position in the AIL if necessary, and
871 * unpinning each item. If the committed routine returns -1, then do nothing
872 * further with the item because it may have been freed.
772 * 873 *
773 * XFS disk error handling mechanism is not based on a typical 874 * Since items are unlocked when they are copied to the incore log, it is
774 * transaction abort mechanism. Logically after the filesystem 875 * possible for two transactions to be completing and manipulating the same
775 * gets marked 'SHUTDOWN', we can't let any new transactions 876 * item simultaneously. The AIL lock will protect the lsn field of each item.
776 * be durable - ie. committed to disk - because some metadata might 877 * The value of this field can never go backwards.
777 * be inconsistent. In such cases, this returns an error, and the 878 *
778 * caller may assume that all locked objects joined to the transaction 879 * We unpin the items after repositioning them in the AIL, because otherwise
779 * have already been unlocked as if the commit had succeeded. 880 * they could be immediately flushed and we'd have to race with the flusher
780 * Do not reference the transaction structure after this call. 881 * trying to pull the item from the AIL as we add it.
781 */ 882 */
782 /*ARGSUSED*/ 883static void
783int 884xfs_trans_item_committed(
784_xfs_trans_commit( 885 struct xfs_log_item *lip,
785 xfs_trans_t *tp, 886 xfs_lsn_t commit_lsn,
786 uint flags, 887 int aborted)
787 int *log_flushed)
788{ 888{
789 xfs_log_iovec_t *log_vector; 889 xfs_lsn_t item_lsn;
790 int nvec; 890 struct xfs_ail *ailp;
791 xfs_mount_t *mp;
792 xfs_lsn_t commit_lsn;
793 /* REFERENCED */
794 int error;
795 int log_flags;
796 int sync;
797#define XFS_TRANS_LOGVEC_COUNT 16
798 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
799 struct xlog_in_core *commit_iclog;
800 int shutdown;
801 891
802 commit_lsn = -1; 892 if (aborted)
893 lip->li_flags |= XFS_LI_ABORTED;
894 item_lsn = IOP_COMMITTED(lip, commit_lsn);
895
896 /* If the committed routine returns -1, item has been freed. */
897 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
898 return;
803 899
804 /* 900 /*
805 * Determine whether this commit is releasing a permanent 901 * If the returned lsn is greater than what it contained before, update
806 * log reservation or not. 902 * the location of the item in the AIL. If it is not, then do nothing.
903 * Items can never move backwards in the AIL.
904 *
905 * While the new lsn should usually be greater, it is possible that a
906 * later transaction completing simultaneously with an earlier one
907 * using the same item could complete first with a higher lsn. This
908 * would cause the earlier transaction to fail the test below.
807 */ 909 */
808 if (flags & XFS_TRANS_RELEASE_LOG_RES) { 910 ailp = lip->li_ailp;
809 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 911 spin_lock(&ailp->xa_lock);
810 log_flags = XFS_LOG_REL_PERM_RESERV; 912 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
913 /*
914 * This will set the item's lsn to item_lsn and update the
915 * position of the item in the AIL.
916 *
917 * xfs_trans_ail_update() drops the AIL lock.
918 */
919 xfs_trans_ail_update(ailp, lip, item_lsn);
811 } else { 920 } else {
812 log_flags = 0; 921 spin_unlock(&ailp->xa_lock);
813 } 922 }
814 mp = tp->t_mountp;
815 923
816 /* 924 /*
817 * If there is nothing to be logged by the transaction, 925 * Now that we've repositioned the item in the AIL, unpin it so it can
818 * then unlock all of the items associated with the 926 * be flushed. Pass information about buffer stale state down from the
819 * transaction and free the transaction structure. 927 * log item flags, if anyone else stales the buffer we do not want to
820 * Also make sure to return any reserved blocks to 928 * pay any attention to it.
821 * the free pool.
822 */ 929 */
823shut_us_down: 930 IOP_UNPIN(lip);
824 shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0; 931}
825 if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) { 932
826 xfs_trans_unreserve_and_mod_sb(tp); 933/* Clear all the per-AG busy list items listed in this transaction */
934static void
935xfs_trans_clear_busy_extents(
936 struct xfs_trans *tp)
937{
938 xfs_log_busy_chunk_t *lbcp;
939 xfs_log_busy_slot_t *lbsp;
940 int i;
941
942 for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) {
943 i = 0;
944 for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
945 if (XFS_LBC_ISFREE(lbcp, i))
946 continue;
947 xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx);
948 }
949 }
950 xfs_trans_free_busy(tp);
951}
952
953/*
954 * This is typically called by the LM when a transaction has been fully
955 * committed to disk. It needs to unpin the items which have
956 * been logged by the transaction and update their positions
957 * in the AIL if necessary.
958 *
959 * This also gets called when the transactions didn't get written out
960 * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
961 */
962STATIC void
963xfs_trans_committed(
964 struct xfs_trans *tp,
965 int abortflag)
966{
967 xfs_log_item_desc_t *lidp;
968 xfs_log_item_chunk_t *licp;
969 xfs_log_item_chunk_t *next_licp;
970
971 /* Call the transaction's completion callback if there is one. */
972 if (tp->t_callback != NULL)
973 tp->t_callback(tp, tp->t_callarg);
974
975 for (lidp = xfs_trans_first_item(tp);
976 lidp != NULL;
977 lidp = xfs_trans_next_item(tp, lidp)) {
978 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
979 }
980
981 /* free the item chunks, ignoring the embedded chunk */
982 for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) {
983 next_licp = licp->lic_next;
984 kmem_free(licp);
985 }
986
987 xfs_trans_clear_busy_extents(tp);
988 xfs_trans_free(tp);
989}
990
991/*
992 * Called from the trans_commit code when we notice that
993 * the filesystem is in the middle of a forced shutdown.
994 */
995STATIC void
996xfs_trans_uncommit(
997 struct xfs_trans *tp,
998 uint flags)
999{
1000 xfs_log_item_desc_t *lidp;
1001
1002 for (lidp = xfs_trans_first_item(tp);
1003 lidp != NULL;
1004 lidp = xfs_trans_next_item(tp, lidp)) {
827 /* 1005 /*
828 * It is indeed possible for the transaction to be 1006 * Unpin all but those that aren't dirty.
829 * not dirty but the dqinfo portion to be. All that
830 * means is that we have some (non-persistent) quota
831 * reservations that need to be unreserved.
832 */ 1007 */
833 xfs_trans_unreserve_and_mod_dquots(tp); 1008 if (lidp->lid_flags & XFS_LID_DIRTY)
834 if (tp->t_ticket) { 1009 IOP_UNPIN_REMOVE(lidp->lid_item, tp);
835 commit_lsn = xfs_log_done(mp, tp->t_ticket,
836 NULL, log_flags);
837 if (commit_lsn == -1 && !shutdown)
838 shutdown = XFS_ERROR(EIO);
839 }
840 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
841 xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
842 xfs_trans_free_busy(tp);
843 xfs_trans_free(tp);
844 XFS_STATS_INC(xs_trans_empty);
845 return (shutdown);
846 } 1010 }
847 ASSERT(tp->t_ticket != NULL);
848 1011
849 /* 1012 xfs_trans_unreserve_and_mod_sb(tp);
850 * If we need to update the superblock, then do it now. 1013 xfs_trans_unreserve_and_mod_dquots(tp);
851 */ 1014
852 if (tp->t_flags & XFS_TRANS_SB_DIRTY) 1015 xfs_trans_free_items(tp, flags);
853 xfs_trans_apply_sb_deltas(tp); 1016 xfs_trans_free_busy(tp);
854 xfs_trans_apply_dquot_deltas(tp); 1017 xfs_trans_free(tp);
1018}
1019
1020/*
1021 * Format the transaction direct to the iclog. This isolates the physical
1022 * transaction commit operation from the logical operation and hence allows
1023 * other methods to be introduced without affecting the existing commit path.
1024 */
1025static int
1026xfs_trans_commit_iclog(
1027 struct xfs_mount *mp,
1028 struct xfs_trans *tp,
1029 xfs_lsn_t *commit_lsn,
1030 int flags)
1031{
1032 int shutdown;
1033 int error;
1034 int log_flags = 0;
1035 struct xlog_in_core *commit_iclog;
1036#define XFS_TRANS_LOGVEC_COUNT 16
1037 struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
1038 struct xfs_log_iovec *log_vector;
1039 uint nvec;
1040
855 1041
856 /* 1042 /*
857 * Ask each log item how many log_vector entries it will 1043 * Ask each log item how many log_vector entries it will
@@ -861,8 +1047,7 @@ shut_us_down:
861 */ 1047 */
862 nvec = xfs_trans_count_vecs(tp); 1048 nvec = xfs_trans_count_vecs(tp);
863 if (nvec == 0) { 1049 if (nvec == 0) {
864 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1050 return ENOMEM; /* triggers a shutdown! */
865 goto shut_us_down;
866 } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { 1051 } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
867 log_vector = log_vector_fast; 1052 log_vector = log_vector_fast;
868 } else { 1053 } else {
@@ -877,6 +1062,9 @@ shut_us_down:
877 */ 1062 */
878 xfs_trans_fill_vecs(tp, log_vector); 1063 xfs_trans_fill_vecs(tp, log_vector);
879 1064
1065 if (flags & XFS_TRANS_RELEASE_LOG_RES)
1066 log_flags = XFS_LOG_REL_PERM_RESERV;
1067
880 error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); 1068 error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn));
881 1069
882 /* 1070 /*
@@ -884,18 +1072,17 @@ shut_us_down:
884 * at any time after this call. However, all the items associated 1072 * at any time after this call. However, all the items associated
885 * with the transaction are still locked and pinned in memory. 1073 * with the transaction are still locked and pinned in memory.
886 */ 1074 */
887 commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); 1075 *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
888 1076
889 tp->t_commit_lsn = commit_lsn; 1077 tp->t_commit_lsn = *commit_lsn;
890 if (nvec > XFS_TRANS_LOGVEC_COUNT) { 1078 if (nvec > XFS_TRANS_LOGVEC_COUNT)
891 kmem_free(log_vector); 1079 kmem_free(log_vector);
892 }
893 1080
894 /* 1081 /*
895 * If we got a log write error. Unpin the logitems that we 1082 * If we got a log write error. Unpin the logitems that we
896 * had pinned, clean up, free trans structure, and return error. 1083 * had pinned, clean up, free trans structure, and return error.
897 */ 1084 */
898 if (error || commit_lsn == -1) { 1085 if (error || *commit_lsn == -1) {
899 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1086 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
900 xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); 1087 xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
901 return XFS_ERROR(EIO); 1088 return XFS_ERROR(EIO);
@@ -909,8 +1096,6 @@ shut_us_down:
909 */ 1096 */
910 xfs_trans_unreserve_and_mod_sb(tp); 1097 xfs_trans_unreserve_and_mod_sb(tp);
911 1098
912 sync = tp->t_flags & XFS_TRANS_SYNC;
913
914 /* 1099 /*
915 * Tell the LM to call the transaction completion routine 1100 * Tell the LM to call the transaction completion routine
916 * when the log write with LSN commit_lsn completes (e.g. 1101 * when the log write with LSN commit_lsn completes (e.g.
@@ -953,7 +1138,7 @@ shut_us_down:
953 * the commit lsn of this transaction for dependency tracking 1138 * the commit lsn of this transaction for dependency tracking
954 * purposes. 1139 * purposes.
955 */ 1140 */
956 xfs_trans_unlock_items(tp, commit_lsn); 1141 xfs_trans_unlock_items(tp, *commit_lsn);
957 1142
958 /* 1143 /*
959 * If we detected a log error earlier, finish committing 1144 * If we detected a log error earlier, finish committing
@@ -973,156 +1158,114 @@ shut_us_down:
973 * and the items are released we can finally allow the iclog to 1158 * and the items are released we can finally allow the iclog to
974 * go to disk. 1159 * go to disk.
975 */ 1160 */
976 error = xfs_log_release_iclog(mp, commit_iclog); 1161 return xfs_log_release_iclog(mp, commit_iclog);
977
978 /*
979 * If the transaction needs to be synchronous, then force the
980 * log out now and wait for it.
981 */
982 if (sync) {
983 if (!error) {
984 error = _xfs_log_force_lsn(mp, commit_lsn,
985 XFS_LOG_SYNC, log_flushed);
986 }
987 XFS_STATS_INC(xs_trans_sync);
988 } else {
989 XFS_STATS_INC(xs_trans_async);
990 }
991
992 return (error);
993} 1162}
994 1163
995 1164
996/* 1165/*
997 * Total up the number of log iovecs needed to commit this 1166 * xfs_trans_commit
998 * transaction. The transaction itself needs one for the 1167 *
999 * transaction header. Ask each dirty item in turn how many 1168 * Commit the given transaction to the log a/synchronously.
1000 * it needs to get the total. 1169 *
1170 * XFS disk error handling mechanism is not based on a typical
1171 * transaction abort mechanism. Logically after the filesystem
1172 * gets marked 'SHUTDOWN', we can't let any new transactions
1173 * be durable - ie. committed to disk - because some metadata might
1174 * be inconsistent. In such cases, this returns an error, and the
1175 * caller may assume that all locked objects joined to the transaction
1176 * have already been unlocked as if the commit had succeeded.
1177 * Do not reference the transaction structure after this call.
1001 */ 1178 */
1002STATIC uint 1179int
1003xfs_trans_count_vecs( 1180_xfs_trans_commit(
1004 xfs_trans_t *tp) 1181 struct xfs_trans *tp,
1182 uint flags,
1183 int *log_flushed)
1005{ 1184{
1006 int nvecs; 1185 struct xfs_mount *mp = tp->t_mountp;
1007 xfs_log_item_desc_t *lidp; 1186 xfs_lsn_t commit_lsn = -1;
1187 int error = 0;
1188 int log_flags = 0;
1189 int sync = tp->t_flags & XFS_TRANS_SYNC;
1008 1190
1009 nvecs = 1; 1191 /*
1010 lidp = xfs_trans_first_item(tp); 1192 * Determine whether this commit is releasing a permanent
1011 ASSERT(lidp != NULL); 1193 * log reservation or not.
1012
1013 /* In the non-debug case we need to start bailing out if we
1014 * didn't find a log_item here, return zero and let trans_commit
1015 * deal with it.
1016 */ 1194 */
1017 if (lidp == NULL) 1195 if (flags & XFS_TRANS_RELEASE_LOG_RES) {
1018 return 0; 1196 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1019 1197 log_flags = XFS_LOG_REL_PERM_RESERV;
1020 while (lidp != NULL) {
1021 /*
1022 * Skip items which aren't dirty in this transaction.
1023 */
1024 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1025 lidp = xfs_trans_next_item(tp, lidp);
1026 continue;
1027 }
1028 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1029 nvecs += lidp->lid_size;
1030 lidp = xfs_trans_next_item(tp, lidp);
1031 } 1198 }
1032 1199
1033 return nvecs; 1200 /*
1034} 1201 * If there is nothing to be logged by the transaction,
1035 1202 * then unlock all of the items associated with the
1036/* 1203 * transaction and free the transaction structure.
1037 * Called from the trans_commit code when we notice that 1204 * Also make sure to return any reserved blocks to
1038 * the filesystem is in the middle of a forced shutdown. 1205 * the free pool.
1039 */ 1206 */
1040STATIC void 1207 if (!(tp->t_flags & XFS_TRANS_DIRTY))
1041xfs_trans_uncommit( 1208 goto out_unreserve;
1042 xfs_trans_t *tp,
1043 uint flags)
1044{
1045 xfs_log_item_desc_t *lidp;
1046 1209
1047 for (lidp = xfs_trans_first_item(tp); 1210 if (XFS_FORCED_SHUTDOWN(mp)) {
1048 lidp != NULL; 1211 error = XFS_ERROR(EIO);
1049 lidp = xfs_trans_next_item(tp, lidp)) { 1212 goto out_unreserve;
1050 /*
1051 * Unpin all but those that aren't dirty.
1052 */
1053 if (lidp->lid_flags & XFS_LID_DIRTY)
1054 IOP_UNPIN_REMOVE(lidp->lid_item, tp);
1055 } 1213 }
1056 1214
1057 xfs_trans_unreserve_and_mod_sb(tp); 1215 ASSERT(tp->t_ticket != NULL);
1058 xfs_trans_unreserve_and_mod_dquots(tp);
1059 1216
1060 xfs_trans_free_items(tp, flags); 1217 /*
1061 xfs_trans_free_busy(tp); 1218 * If we need to update the superblock, then do it now.
1062 xfs_trans_free(tp); 1219 */
1063} 1220 if (tp->t_flags & XFS_TRANS_SB_DIRTY)
1221 xfs_trans_apply_sb_deltas(tp);
1222 xfs_trans_apply_dquot_deltas(tp);
1064 1223
1065/* 1224 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
1066 * Fill in the vector with pointers to data to be logged 1225 if (error == ENOMEM) {
1067 * by this transaction. The transaction header takes 1226 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1068 * the first vector, and then each dirty item takes the 1227 error = XFS_ERROR(EIO);
1069 * number of vectors it indicated it needed in xfs_trans_count_vecs(). 1228 goto out_unreserve;
1070 * 1229 }
1071 * As each item fills in the entries it needs, also pin the item
1072 * so that it cannot be flushed out until the log write completes.
1073 */
1074STATIC void
1075xfs_trans_fill_vecs(
1076 xfs_trans_t *tp,
1077 xfs_log_iovec_t *log_vector)
1078{
1079 xfs_log_item_desc_t *lidp;
1080 xfs_log_iovec_t *vecp;
1081 uint nitems;
1082 1230
1083 /* 1231 /*
1084 * Skip over the entry for the transaction header, we'll 1232 * If the transaction needs to be synchronous, then force the
1085 * fill that in at the end. 1233 * log out now and wait for it.
1086 */ 1234 */
1087 vecp = log_vector + 1; /* pointer arithmetic */ 1235 if (sync) {
1088 1236 if (!error) {
1089 nitems = 0; 1237 error = _xfs_log_force_lsn(mp, commit_lsn,
1090 lidp = xfs_trans_first_item(tp); 1238 XFS_LOG_SYNC, log_flushed);
1091 ASSERT(lidp != NULL);
1092 while (lidp != NULL) {
1093 /*
1094 * Skip items which aren't dirty in this transaction.
1095 */
1096 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1097 lidp = xfs_trans_next_item(tp, lidp);
1098 continue;
1099 }
1100 /*
1101 * The item may be marked dirty but not log anything.
1102 * This can be used to get called when a transaction
1103 * is committed.
1104 */
1105 if (lidp->lid_size) {
1106 nitems++;
1107 } 1239 }
1108 IOP_FORMAT(lidp->lid_item, vecp); 1240 XFS_STATS_INC(xs_trans_sync);
1109 vecp += lidp->lid_size; /* pointer arithmetic */ 1241 } else {
1110 IOP_PIN(lidp->lid_item); 1242 XFS_STATS_INC(xs_trans_async);
1111 lidp = xfs_trans_next_item(tp, lidp);
1112 } 1243 }
1113 1244
1245 return error;
1246
1247out_unreserve:
1248 xfs_trans_unreserve_and_mod_sb(tp);
1249
1114 /* 1250 /*
1115 * Now that we've counted the number of items in this 1251 * It is indeed possible for the transaction to be not dirty but
1116 * transaction, fill in the transaction header. 1252 * the dqinfo portion to be. All that means is that we have some
1253 * (non-persistent) quota reservations that need to be unreserved.
1117 */ 1254 */
1118 tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; 1255 xfs_trans_unreserve_and_mod_dquots(tp);
1119 tp->t_header.th_type = tp->t_type; 1256 if (tp->t_ticket) {
1120 tp->t_header.th_num_items = nitems; 1257 commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
1121 log_vector->i_addr = (xfs_caddr_t)&tp->t_header; 1258 if (commit_lsn == -1 && !error)
1122 log_vector->i_len = sizeof(xfs_trans_header_t); 1259 error = XFS_ERROR(EIO);
1123 log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; 1260 }
1124} 1261 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1262 xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0);
1263 xfs_trans_free_busy(tp);
1264 xfs_trans_free(tp);
1125 1265
1266 XFS_STATS_INC(xs_trans_empty);
1267 return error;
1268}
1126 1269
1127/* 1270/*
1128 * Unlock all of the transaction's items and free the transaction. 1271 * Unlock all of the transaction's items and free the transaction.
@@ -1200,20 +1343,6 @@ xfs_trans_cancel(
1200 xfs_trans_free(tp); 1343 xfs_trans_free(tp);
1201} 1344}
1202 1345
1203
1204/*
1205 * Free the transaction structure. If there is more clean up
1206 * to do when the structure is freed, add it here.
1207 */
1208STATIC void
1209xfs_trans_free(
1210 xfs_trans_t *tp)
1211{
1212 atomic_dec(&tp->t_mountp->m_active_trans);
1213 xfs_trans_free_dqinfo(tp);
1214 kmem_zone_free(xfs_trans_zone, tp);
1215}
1216
1217/* 1346/*
1218 * Roll from one trans in the sequence of PERMANENT transactions to 1347 * Roll from one trans in the sequence of PERMANENT transactions to
1219 * the next: permanent transactions are only flushed out when 1348 * the next: permanent transactions are only flushed out when
@@ -1283,174 +1412,3 @@ xfs_trans_roll(
1283 xfs_trans_ihold(trans, dp); 1412 xfs_trans_ihold(trans, dp);
1284 return 0; 1413 return 0;
1285} 1414}
1286
1287/*
1288 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
1289 *
1290 * This is typically called by the LM when a transaction has been fully
1291 * committed to disk. It needs to unpin the items which have
1292 * been logged by the transaction and update their positions
1293 * in the AIL if necessary.
1294 * This also gets called when the transactions didn't get written out
1295 * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
1296 *
1297 * Call xfs_trans_chunk_committed() to process the items in
1298 * each chunk.
1299 */
1300STATIC void
1301xfs_trans_committed(
1302 xfs_trans_t *tp,
1303 int abortflag)
1304{
1305 xfs_log_item_chunk_t *licp;
1306 xfs_log_item_chunk_t *next_licp;
1307 xfs_log_busy_chunk_t *lbcp;
1308 xfs_log_busy_slot_t *lbsp;
1309 int i;
1310
1311 /*
1312 * Call the transaction's completion callback if there
1313 * is one.
1314 */
1315 if (tp->t_callback != NULL) {
1316 tp->t_callback(tp, tp->t_callarg);
1317 }
1318
1319 /*
1320 * Special case the chunk embedded in the transaction.
1321 */
1322 licp = &(tp->t_items);
1323 if (!(xfs_lic_are_all_free(licp))) {
1324 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1325 }
1326
1327 /*
1328 * Process the items in each chunk in turn.
1329 */
1330 licp = licp->lic_next;
1331 while (licp != NULL) {
1332 ASSERT(!xfs_lic_are_all_free(licp));
1333 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1334 next_licp = licp->lic_next;
1335 kmem_free(licp);
1336 licp = next_licp;
1337 }
1338
1339 /*
1340 * Clear all the per-AG busy list items listed in this transaction
1341 */
1342 lbcp = &tp->t_busy;
1343 while (lbcp != NULL) {
1344 for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
1345 if (!XFS_LBC_ISFREE(lbcp, i)) {
1346 xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
1347 lbsp->lbc_idx);
1348 }
1349 }
1350 lbcp = lbcp->lbc_next;
1351 }
1352 xfs_trans_free_busy(tp);
1353
1354 /*
1355 * That's it for the transaction structure. Free it.
1356 */
1357 xfs_trans_free(tp);
1358}
1359
1360/*
1361 * This is called to perform the commit processing for each
1362 * item described by the given chunk.
1363 *
1364 * The commit processing consists of unlocking items which were
1365 * held locked with the SYNC_UNLOCK attribute, calling the committed
1366 * routine of each logged item, updating the item's position in the AIL
1367 * if necessary, and unpinning each item. If the committed routine
1368 * returns -1, then do nothing further with the item because it
1369 * may have been freed.
1370 *
1371 * Since items are unlocked when they are copied to the incore
1372 * log, it is possible for two transactions to be completing
1373 * and manipulating the same item simultaneously. The AIL lock
1374 * will protect the lsn field of each item. The value of this
1375 * field can never go backwards.
1376 *
1377 * We unpin the items after repositioning them in the AIL, because
1378 * otherwise they could be immediately flushed and we'd have to race
1379 * with the flusher trying to pull the item from the AIL as we add it.
1380 */
1381STATIC void
1382xfs_trans_chunk_committed(
1383 xfs_log_item_chunk_t *licp,
1384 xfs_lsn_t lsn,
1385 int aborted)
1386{
1387 xfs_log_item_desc_t *lidp;
1388 xfs_log_item_t *lip;
1389 xfs_lsn_t item_lsn;
1390 int i;
1391
1392 lidp = licp->lic_descs;
1393 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1394 struct xfs_ail *ailp;
1395
1396 if (xfs_lic_isfree(licp, i)) {
1397 continue;
1398 }
1399
1400 lip = lidp->lid_item;
1401 if (aborted)
1402 lip->li_flags |= XFS_LI_ABORTED;
1403
1404 /*
1405 * Send in the ABORTED flag to the COMMITTED routine
1406 * so that it knows whether the transaction was aborted
1407 * or not.
1408 */
1409 item_lsn = IOP_COMMITTED(lip, lsn);
1410
1411 /*
1412 * If the committed routine returns -1, make
1413 * no more references to the item.
1414 */
1415 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
1416 continue;
1417 }
1418
1419 /*
1420 * If the returned lsn is greater than what it
1421 * contained before, update the location of the
1422 * item in the AIL. If it is not, then do nothing.
1423 * Items can never move backwards in the AIL.
1424 *
1425 * While the new lsn should usually be greater, it
1426 * is possible that a later transaction completing
1427 * simultaneously with an earlier one using the
1428 * same item could complete first with a higher lsn.
1429 * This would cause the earlier transaction to fail
1430 * the test below.
1431 */
1432 ailp = lip->li_ailp;
1433 spin_lock(&ailp->xa_lock);
1434 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1435 /*
1436 * This will set the item's lsn to item_lsn
1437 * and update the position of the item in
1438 * the AIL.
1439 *
1440 * xfs_trans_ail_update() drops the AIL lock.
1441 */
1442 xfs_trans_ail_update(ailp, lip, item_lsn);
1443 } else {
1444 spin_unlock(&ailp->xa_lock);
1445 }
1446
1447 /*
1448 * Now that we've repositioned the item in the AIL,
1449 * unpin it so it can be flushed. Pass information
1450 * about buffer stale state down from the log item
1451 * flags, if anyone else stales the buffer we do not
1452 * want to pay any attention to it.
1453 */
1454 IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE);
1455 }
1456}
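
The committed-item path rewritten above preserves one invariant worth restating: an item's LSN in the AIL only ever advances, because two commits can race on the same item and the later transaction may finish first. A reduced pthread sketch of that compare-under-lock rule — a model of the idea, not the real AIL code:

#include <pthread.h>

typedef long long lsn_t;

struct item {
	lsn_t		lsn;
	pthread_mutex_t	*ail_lock;	/* stands in for xa_lock */
};

/* Reposition only forward: if a racing, later commit already moved
 * the item to a higher LSN, leave it where it is. */
static void item_committed(struct item *ip, lsn_t item_lsn)
{
	pthread_mutex_lock(ip->ail_lock);
	if (item_lsn > ip->lsn)
		ip->lsn = item_lsn;	/* reposition in the AIL */
	pthread_mutex_unlock(ip->ail_lock);
}
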
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 79c8bab9dfff..c62beee0921e 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -49,6 +49,15 @@ typedef struct xfs_trans_header {
49#define XFS_LI_DQUOT 0x123d 49#define XFS_LI_DQUOT 0x123d
50#define XFS_LI_QUOTAOFF 0x123e 50#define XFS_LI_QUOTAOFF 0x123e
51 51
52#define XFS_LI_TYPE_DESC \
53 { XFS_LI_EFI, "XFS_LI_EFI" }, \
54 { XFS_LI_EFD, "XFS_LI_EFD" }, \
55 { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
56 { XFS_LI_INODE, "XFS_LI_INODE" }, \
57 { XFS_LI_BUF, "XFS_LI_BUF" }, \
58 { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
59 { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }
60
52/* 61/*
53 * Transaction types. Used to distinguish types of buffers. 62 * Transaction types. Used to distinguish types of buffers.
54 */ 63 */
@@ -159,7 +168,6 @@ typedef struct xfs_log_item_desc {
159 168
160#define XFS_LID_DIRTY 0x1 169#define XFS_LID_DIRTY 0x1
161#define XFS_LID_PINNED 0x2 170#define XFS_LID_PINNED 0x2
162#define XFS_LID_BUF_STALE 0x8
163 171
164/* 172/*
165 * This structure is used to maintain a chunk list of log_item_desc 173 * This structure is used to maintain a chunk list of log_item_desc
@@ -833,7 +841,7 @@ typedef struct xfs_item_ops {
833 uint (*iop_size)(xfs_log_item_t *); 841 uint (*iop_size)(xfs_log_item_t *);
834 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); 842 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
835 void (*iop_pin)(xfs_log_item_t *); 843 void (*iop_pin)(xfs_log_item_t *);
836 void (*iop_unpin)(xfs_log_item_t *, int); 844 void (*iop_unpin)(xfs_log_item_t *);
837 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *); 845 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
838 uint (*iop_trylock)(xfs_log_item_t *); 846 uint (*iop_trylock)(xfs_log_item_t *);
839 void (*iop_unlock)(xfs_log_item_t *); 847 void (*iop_unlock)(xfs_log_item_t *);
@@ -846,7 +854,7 @@ typedef struct xfs_item_ops {
846#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) 854#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
847#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) 855#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
848#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) 856#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
849#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags) 857#define IOP_UNPIN(ip) (*(ip)->li_ops->iop_unpin)(ip)
850#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp) 858#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
851#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) 859#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
852#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) 860#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index fb586360d1c9..9cd809025f3a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -40,11 +40,51 @@
40#include "xfs_rw.h" 40#include "xfs_rw.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42 42
43/*
44 * Check to see if a buffer matching the given parameters is already
45 * a part of the given transaction.
46 */
47STATIC struct xfs_buf *
48xfs_trans_buf_item_match(
49 struct xfs_trans *tp,
50 struct xfs_buftarg *target,
51 xfs_daddr_t blkno,
52 int len)
53{
54 xfs_log_item_chunk_t *licp;
55 xfs_log_item_desc_t *lidp;
56 xfs_buf_log_item_t *blip;
57 int i;
43 58
44STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, 59 len = BBTOB(len);
45 xfs_daddr_t, int); 60 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
46STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, 61 if (xfs_lic_are_all_free(licp)) {
47 xfs_daddr_t, int); 62 ASSERT(licp == &tp->t_items);
63 ASSERT(licp->lic_next == NULL);
64 return NULL;
65 }
66
67 for (i = 0; i < licp->lic_unused; i++) {
68 /*
69 * Skip unoccupied slots.
70 */
71 if (xfs_lic_isfree(licp, i))
72 continue;
73
74 lidp = xfs_lic_slot(licp, i);
75 blip = (xfs_buf_log_item_t *)lidp->lid_item;
76 if (blip->bli_item.li_type != XFS_LI_BUF)
77 continue;
78
79 if (XFS_BUF_TARGET(blip->bli_buf) == target &&
80 XFS_BUF_ADDR(blip->bli_buf) == blkno &&
81 XFS_BUF_COUNT(blip->bli_buf) == len)
82 return blip->bli_buf;
83 }
84 }
85
86 return NULL;
87}
48 88
49/* 89/*
50 * Add the locked buffer to the transaction. 90 * Add the locked buffer to the transaction.
@@ -112,14 +152,6 @@ xfs_trans_bjoin(
112 * within the transaction, just increment its lock recursion count 152 * within the transaction, just increment its lock recursion count
113 * and return a pointer to it. 153 * and return a pointer to it.
114 * 154 *
115 * Use the fast path function xfs_trans_buf_item_match() or the buffer
116 * cache routine incore_match() to find the buffer
117 * if it is already owned by this transaction.
118 *
119 * If we don't already own the buffer, use get_buf() to get it.
120 * If it doesn't yet have an associated xfs_buf_log_item structure,
121 * then allocate one and add the item to this transaction.
122 *
123 * If the transaction pointer is NULL, make this just a normal 155 * If the transaction pointer is NULL, make this just a normal
124 * get_buf() call. 156 * get_buf() call.
125 */ 157 */
@@ -149,11 +181,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
149 * have it locked. In this case we just increment the lock 181 * have it locked. In this case we just increment the lock
150 * recursion count and return the buffer to the caller. 182 * recursion count and return the buffer to the caller.
151 */ 183 */
152 if (tp->t_items.lic_next == NULL) { 184 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
153 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
154 } else {
155 bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
156 }
157 if (bp != NULL) { 185 if (bp != NULL) {
158 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 186 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
159 if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) 187 if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
@@ -259,14 +287,6 @@ int xfs_error_mod = 33;
  * within the transaction and already read in, just increment its
  * lock recursion count and return a pointer to it.
  *
- * Use the fast path function xfs_trans_buf_item_match() or the buffer
- * cache routine incore_match() to find the buffer
- * if it is already owned by this transaction.
- *
- * If we don't already own the buffer, use read_buf() to get it.
- * If it doesn't yet have an associated xfs_buf_log_item structure,
- * then allocate one and add the item to this transaction.
- *
  * If the transaction pointer is NULL, make this just a normal
  * read_buf() call.
  */
@@ -328,11 +348,7 @@ xfs_trans_read_buf(
 	 * If the buffer is not yet read in, then we read it in, increment
 	 * the lock recursion count, and return it to the caller.
 	 */
-	if (tp->t_items.lic_next == NULL) {
-		bp = xfs_trans_buf_item_match(tp, target, blkno, len);
-	} else {
-		bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
-	}
+	bp = xfs_trans_buf_item_match(tp, target, blkno, len);
 	if (bp != NULL) {
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 		ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -696,7 +712,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	lidp->lid_flags |= XFS_LID_DIRTY;
-	lidp->lid_flags &= ~XFS_LID_BUF_STALE;
 	bip->bli_flags |= XFS_BLI_LOGGED;
 	xfs_buf_item_log(bip, first, last);
 }
@@ -782,7 +797,7 @@ xfs_trans_binval(
 	bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
 	memset((char *)(bip->bli_format.blf_data_map), 0,
 	       (bip->bli_format.blf_map_size * sizeof(uint)));
-	lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
+	lidp->lid_flags |= XFS_LID_DIRTY;
 	tp->t_flags |= XFS_TRANS_DIRTY;
 }
 
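The two hunks above also retire the descriptor-level XFS_LID_BUF_STALE flag: xfs_trans_log_buf() no longer clears it and xfs_trans_binval() no longer sets it, leaving XFS_LID_DIRTY as the only descriptor flag these paths touch. The likely rationale (an inference; the rest of the patch is not visible in these hunks) is that staleness is already tracked on the buf log item itself via XFS_BLI_STALE, making the descriptor copy redundant. A toy sketch of the before/after flag handling, with made-up bit values:

#include <stdio.h>

/* Made-up bit values for illustration, not the kernel's definitions. */
#define LID_DIRTY	(1u << 0)
#define LID_BUF_STALE	(1u << 1)

int main(void)
{
	unsigned int lid_flags = 0;

	/* Old xfs_trans_binval(): dirty the descriptor and mark it stale. */
	lid_flags |= LID_DIRTY | LID_BUF_STALE;
	printf("old binval:  %#x\n", lid_flags);

	/* Old xfs_trans_log_buf(): re-logging cleared the stale bit again. */
	lid_flags |= LID_DIRTY;
	lid_flags &= ~LID_BUF_STALE;
	printf("old log_buf: %#x\n", lid_flags);

	/* New code: both paths simply set LID_DIRTY; staleness stays on the
	 * buf log item (XFS_BLI_STALE), not on the descriptor. */
	lid_flags = 0;
	lid_flags |= LID_DIRTY;
	printf("new:         %#x\n", lid_flags);
	return 0;
}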
@@ -902,111 +917,3 @@ xfs_trans_dquot_buf(
 
 	bip->bli_format.blf_flags |= type;
 }
-
-/*
- * Check to see if a buffer matching the given parameters is already
- * a part of the given transaction. Only check the first, embedded
- * chunk, since we don't want to spend all day scanning large transactions.
- */
-STATIC xfs_buf_t *
-xfs_trans_buf_item_match(
-	xfs_trans_t	*tp,
-	xfs_buftarg_t	*target,
-	xfs_daddr_t	blkno,
-	int		len)
-{
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_desc_t	*lidp;
-	xfs_buf_log_item_t	*blip;
-	xfs_buf_t		*bp;
-	int			i;
-
-	bp = NULL;
-	len = BBTOB(len);
-	licp = &tp->t_items;
-	if (!xfs_lic_are_all_free(licp)) {
-		for (i = 0; i < licp->lic_unused; i++) {
-			/*
-			 * Skip unoccupied slots.
-			 */
-			if (xfs_lic_isfree(licp, i)) {
-				continue;
-			}
-
-			lidp = xfs_lic_slot(licp, i);
-			blip = (xfs_buf_log_item_t *)lidp->lid_item;
-			if (blip->bli_item.li_type != XFS_LI_BUF) {
-				continue;
-			}
-
-			bp = blip->bli_buf;
-			if ((XFS_BUF_TARGET(bp) == target) &&
-			    (XFS_BUF_ADDR(bp) == blkno) &&
-			    (XFS_BUF_COUNT(bp) == len)) {
-				/*
-				 * We found it.  Break out and
-				 * return the pointer to the buffer.
-				 */
-				break;
-			} else {
-				bp = NULL;
-			}
-		}
-	}
-	return bp;
-}
-
-/*
- * Check to see if a buffer matching the given parameters is already
- * a part of the given transaction.  Check all the chunks, we
- * want to be thorough.
- */
-STATIC xfs_buf_t *
-xfs_trans_buf_item_match_all(
-	xfs_trans_t	*tp,
-	xfs_buftarg_t	*target,
-	xfs_daddr_t	blkno,
-	int		len)
-{
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_desc_t	*lidp;
-	xfs_buf_log_item_t	*blip;
-	xfs_buf_t		*bp;
-	int			i;
-
-	bp = NULL;
-	len = BBTOB(len);
-	for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
-		if (xfs_lic_are_all_free(licp)) {
-			ASSERT(licp == &tp->t_items);
-			ASSERT(licp->lic_next == NULL);
-			return NULL;
-		}
-		for (i = 0; i < licp->lic_unused; i++) {
-			/*
-			 * Skip unoccupied slots.
-			 */
-			if (xfs_lic_isfree(licp, i)) {
-				continue;
-			}
-
-			lidp = xfs_lic_slot(licp, i);
-			blip = (xfs_buf_log_item_t *)lidp->lid_item;
-			if (blip->bli_item.li_type != XFS_LI_BUF) {
-				continue;
-			}
-
-			bp = blip->bli_buf;
-			if ((XFS_BUF_TARGET(bp) == target) &&
-			    (XFS_BUF_ADDR(bp) == blkno) &&
-			    (XFS_BUF_COUNT(bp) == len)) {
-				/*
-				 * We found it.  Break out and
-				 * return the pointer to the buffer.
-				 */
-				return bp;
-			}
-		}
-	}
-	return NULL;
-}
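The closing hunk deletes both old matchers outright; per the header @@ -902,111 +917,3 @@ the file ends 108 lines shorter. Nothing functional is lost: for a transaction that never overflowed its embedded chunk, the consolidated routine inspects one chunk and stops, exactly as the old fast path did, and otherwise it behaves like the old match-all scan. The rewrite also drops the bp temporary that the old loop threaded through break/else; compare:

/* Old: carry a temporary, break on a hit, reset it on a miss. */
bp = blip->bli_buf;
if ((XFS_BUF_TARGET(bp) == target) &&
    (XFS_BUF_ADDR(bp) == blkno) &&
    (XFS_BUF_COUNT(bp) == len)) {
	break;
} else {
	bp = NULL;
}

/* New: return the matching buffer directly from the loop. */
if (XFS_BUF_TARGET(blip->bli_buf) == target &&
    XFS_BUF_ADDR(blip->bli_buf) == blkno &&
    XFS_BUF_COUNT(blip->bli_buf) == len)
	return blip->bli_buf;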